/* -----------------------------------------------------------------
 file      : /home/schuerer/prosite/src/pros2re.c

 author    : Schuerer <schuerer@pasteur.fr>
 creation  : <Wed May 16 19:56:02 2001>
 Time-stamp: <Tue Apr 30 23:21:02 2002>
 Dev-stage : under construction

 description : 


-------------------------------------------------------------------- */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <stdio.h>
#include <ctype.h>

#ifdef STDC_HEADERS
#include <stdlib.h>
#include <string.h>
#endif

#include <sys/types.h>
#include <regex.h>

#include "mytag.h"
#include "error.h"
#include "getpara.h"
#include "fasta.h"
#include "pat.h"
#include "translate.h"

#ifndef PROSITEDATA
#define PROSITEDATA "/usr/local/share/"PACKAGE
#endif

/* internal macros */

/* internal prototypes */

/* run the pattern source selected by pa->pformat against one sequence */
static int scanpat (fseq_t *s, para_t *pa);
/* compile one pattern, search the sequence with it and print the hits */
static int processpat (pat_t *p, fseq_t *s, para_t *pa);

/* upper-case a string in place */
static int string_upcase (char *s);
//static int next_match(regex_t *re, regmatch_t *match, char *bufi, int dseq,
//		      pat_t *p, fseq_t *s, para_t *pa);
/* print one hit; start and end are 0-based, inclusive sequence offsets */
static void print_match (int start, int end, pat_t *p, fseq_t *s, para_t *pa);

/* function definitions */

/*
 * Program entry point.
 *
 * Fills a para_t with built-in defaults, points pa.pattern at
 * "<datadir>/prosite.dat" (datadir is $PROSITEDATA or, when that is unset,
 * the compile-time PROSITEDATA), hands the command line to parsepara() and
 * then treats every remaining argument as a sequence file: each sequence
 * read by getfseq() is upper-cased (unless pa.case_sens) and passed to
 * scanpat().
 *
 * Returns OK; fatal conditions are reported through error_fatal().
 */
int main (int argc, char *argv[]) {

  para_t pa;
  fseq_t s;
  FILE *SIN;
  char *sfile, *datadir;
  int sfindex, i, ret, from_stdin;

  /* built-in defaults, set before parsepara() is handed &pa */
  pa.pformat = PROSITE;
  pa.t_exp = PROSITE;
  pa.match = SHORT;
  pa.skip = TRUE;
  pa.doc = FALSE;
  pa.verif = FALSE;
  pa.is_nucl = FALSE;
  pa.case_sens = FALSE;

  /* default pattern file: <datadir>/prosite.dat */
  datadir = getenv("PROSITEDATA");
  if (datadir == NULL) {
    datadir = PROSITEDATA;
  }

  /* sizeof the suffix literal counts its terminating NUL as well */
  pa.pattern = (char *) malloc((strlen(datadir) + sizeof("/prosite.dat")) * sizeof(char));
  if (pa.pattern == NULL) {
    error_fatal("memory", NULL);
  }
  (void) sprintf(pa.pattern, "%s/prosite.dat", datadir);

  /* command line parsing; the return value indexes the first sequence file */
  sfindex = parsepara(&pa, argc, argv);

  for (i = sfindex; i < argc; i++) {
    sfile = argv[i];
    /* the name "-" gets its own error message and is never fclose()d here */
    from_stdin = (strcmp(sfile, "-") == 0);

    /* open file */
    SIN = fopen_fasta(sfile);

    /* sequence treatment: a negative getfseq() result ends the file */
    for (;;) {
      ret = getfseq(&s, SIN);
      if (ret < 0) {
        break;
      }

      if (ret == NOSEQ) {
        error_warn(s.id, "missing sequence");
      }
      else {
        if (!pa.case_sens) { (void) string_upcase(s.seq); }
        (void) scanpat (&s, &pa);
      }
      fseq_free(&s);
    }

    if (ret == TRUNC) {
      if (from_stdin) {
        error_fatal("", "broken pipe");
      }
      else {
        error_fatal(sfile, "truncated file");
      }
    }

    /* close file */
    if (!from_stdin && fclose(SIN) != 0) {
      error_fatal (sfile, NULL);
    }
  }

  return OK;
}

/*
 * Run the pattern source selected by pa->pformat against sequence s.
 *
 *   PATTERN : pa->pattern itself is the one pattern to search
 *   LIST    : pa->pattern names a file that is read with getplist()
 *   default : (PROSITE) pa->pattern names a file that is read with
 *             getprosite(); only entries of type PAT are searched, and
 *             entries flagged skip are left out unless pa->skip is off
 *
 * For the two file based formats a TRUNC result from the reader is
 * reported as "truncated file" through error_fatal(), as is a failing
 * fclose().
 *
 * Always returns OK.
 */
static int scanpat (fseq_t *s, para_t *pa) {

  int ret = OK;
  pat_t p;
  FILE *PIN = NULL;

  switch(pa->pformat) {
  case PATTERN:
    pat_init (&p);
    p.type = PONLY;
    p.pat = pa->pattern;
    (void) processpat(&p, s, pa);
    break;
  case LIST:
    if ((PIN = fopen(pa->pattern, "r")) == NULL)
      error_fatal (pa->pattern, NULL);
    while ((ret = getplist (&p, PIN)) == OK) {
      (void) processpat(&p, s, pa);
      free (p.pat);
    }
    break;
  default: /* case PROSITE: */
    PIN = fopen_prosite (pa->pattern);
    while ((ret = getprosite(&p, PIN)) != EOF) {
      /* Stop here so that the truncation check below still sees TRUNC;
	 reading on would replace it with the final EOF.
	 NOTE(review): assumes getprosite() uses TRUNC the way getfseq()
	 does in main() -- confirm against pat.h. */
      if (ret == TRUNC) {
	break;
      }
      if ( ret == OK ) {
	if ((!p.skip || !pa->skip) && p.type == PAT) {
	  (void) processpat(&p, s, pa);
	}
	pat_free(&p);
      }
    }
    break;
  }

  /* PATTERN opens no file, so there is nothing to check or close */
  if (pa->pformat != PATTERN) {
    if (ret == TRUNC)
      error_fatal(pa->pattern, "truncated file");
    if (fclose(PIN) != 0)
      error_fatal(pa->pattern, NULL);
  }

  return OK;
}


/*
 * Search sequence s for pattern p and print the hits requested by
 * pa->match.
 *
 * p->pat is upper-cased in place unless pa->case_sens is set, translated
 * with pros2re() when pa->t_exp is PROSITE, and compiled as a POSIX
 * extended regular expression.  After every hit the sequence is searched
 * again from one position past the start of that hit, so overlapping
 * hits are seen too.
 *
 *   LONG  : print each hit as reported by regexec(), except when it ends
 *           where the previous hit ended
 *   ALL   : print each hit and every shorter hit found by repeatedly
 *           cutting the last character off and matching again
 *   SHORT : shorten the hit at its end, then move its start forward,
 *           and print what remains
 *
 * Zero-length matches are never reported.
 *
 * Returns the number of successful regexec() matches that were counted;
 * 0 when the pattern was skipped because verification or compilation
 * failed.
 */
static int processpat (pat_t *p, fseq_t *s, para_t *pa) {

  regex_t re;
  regmatch_t match;
  int start, end, oend;
  int hasmatch, err;
  size_t bufsize;
  char estr[100];
  char *seq, *seqi, *buf, *bufi;
  char *regpat;
  char joker;

  hasmatch = 0;

  /* wildcard symbol handed to the PROSITE syntax check: 'N' for
     nucleotide sequences, 'x' otherwise -- preliminary */
  joker = 'x';
  if (pa->is_nucl) joker = 'N';

  if (pa->verif && pa->t_exp == PROSITE && (err = prosite_verif(p->pat, joker))) {
    error_warn(p->pat, error_verif_prosite(err));
    if (err == INCORRECT_PROSITE) {
      error_warn(p->pat, "skipped");
      return hasmatch;
    }
  }

  if (!pa->case_sens) { (void) string_upcase(p->pat); }

  /* compile pattern; a translated PROSITE pattern is a separate
     allocation that has to be released on every path */
  regpat = (pa->t_exp == PROSITE) ? pros2re(&re, p->pat, pa) : p->pat;
  err = regcomp(&re, regpat, REG_EXTENDED);
  if (err != 0) {
    if (regerror(err, &re, estr, sizeof estr)) error_warn(regpat, estr);
    else error_warn(regpat, "error while compiling regular expression");
    error_warn(p->pat, "skipped");
    if (pa->t_exp == PROSITE) { free(regpat); }
    return hasmatch;
  }
  if (pa->t_exp == PROSITE) { free(regpat); }

  /* search pattern */
  seqi = seq = s->seq;

  /* scratch copy of the current hit; it never holds more than the whole
     sequence plus the terminating NUL */
  bufsize = strlen(seq)+1;
  if ((buf = (char *) malloc(bufsize*sizeof(char))) == NULL)
    error_fatal("memory", NULL);

  /* no reported hit can end at -1, so the first hit is always printed
     in LONG mode */
  oend = end = -1;

  /* only a 0 return from regexec() fills in `match'; any other value
     (REG_NOMATCH or an error) ends the search */
  while (regexec(&re, seqi, 1, &match, 0) == 0) {

    if (match.rm_eo == match.rm_so) {
      /* zero-length match: nothing to report; step over this position,
	 or stop when it is the end of the sequence */
      if (*(seqi+match.rm_so) == '\0') { break; }
      seqi = seqi + match.rm_so + 1;
      continue;
    }

    hasmatch++;
    oend = end;
    start = (int) (seqi-seq) + (int) match.rm_so;
    end = (int) (seqi-seq) + (int) match.rm_eo - 1;

    if (pa->match == LONG) {
      /* a hit ending where the previous one ended is not printed again */
      if (oend != end) { print_match(start, end, p, s, pa); }
    }
    else {
      if (pa->match == ALL) { print_match(start, end, p, s, pa); }

      /* copy the hit without its last character */
      (void) memcpy(buf, seq+start, (size_t) (end-start));
      *(buf+end-start) = '\0';

      /* shrink at the end: while the shortened text still matches, pull
	 `end' back and cut the text again.
	 NOTE(review): `end' is derived from rm_eo alone, i.e. the
	 re-match is taken to begin at buf[0]. */
      while (regexec(&re, buf, 1, &match, 0) == 0 && match.rm_eo > match.rm_so) {
	hasmatch++;
	end = start + (int) match.rm_eo - 1;
	if (pa->match == ALL) { print_match(start, end, p, s, pa); }
	*(buf+match.rm_eo-1) = '\0';
      }

      /* shrink at the start: put the character at `end' back, then move
	 `start' forward for as long as a later start still matches */
      if (pa->match == SHORT) {
	*(buf+end-start) = *(seq+end);
	*(buf+end-start+1) = '\0';
	bufi = buf + 1;
	while (regexec(&re, bufi, 1, &match, 0) == 0 && match.rm_eo > match.rm_so) {
	  hasmatch++;
	  start = start + (int) match.rm_so + 1;
	  bufi = bufi + match.rm_so + 1;
	}
	print_match(start, end, p, s, pa);
      }
    }

    /* continue one position past the start of this hit */
    seqi = seq + start + 1;
  }

  regfree(&re);
  free(buf);

  return hasmatch;
}
  
/*
 * Write one hit to stdout as a single line.
 *
 * start and end are 0-based, inclusive offsets into s->seq; they are
 * printed 1-based and followed by the matched residues.  An empty s->id
 * is printed as "unknown".
 *
 *   pa->doc set   : pattern id and acdoc are printed ("unknown" for a
 *                   NULL field)
 *   pa->doc unset : pattern ac is printed, or the pattern text itself
 *                   when ac is NULL
 */
static void print_match(int start, int end, pat_t *p, fseq_t *s, para_t *pa) {

  const char *seqname = (*(s->id) == '\0') ? "unknown" : s->id;
  int first = start + 1;
  int last = end + 1;
  int len = end - start + 1;

  if ( !pa->doc ) {
    const char *patname = (p->ac == NULL) ? p->pat : p->ac;

    (void) printf("%s %d - %d %s %.*s\n", seqname, first, last, patname,
                  len, s->seq+start);
    return;
  }

  {
    const char *patid = (p->id == NULL) ? "unknown" : p->id;
    const char *patdoc = (p->acdoc == NULL) ? "unknown" : p->acdoc;

    (void) printf("%s %d - %d %s %s %.*s\n", seqname, first, last, patid,
                  patdoc, len, s->seq+start);
  }
}


/*
 * Convert the NUL-terminated string s to upper case in place.
 *
 * Every byte is handed to toupper() as an unsigned char: passing it a
 * negative plain char (any byte >= 0x80 where char is signed) is
 * undefined behaviour.  A NULL s is left alone.
 *
 * Always returns 0.
 */
static int string_upcase (char *s) {

  char *sintern;

  if (s == NULL) {
    return 0;
  }

  for (sintern = s; *sintern != '\0'; sintern++) {
    *sintern = (char) toupper ((unsigned char) *sintern);
  }

  return 0;
}







