########################################################################################
#                                                                                      #
#   Author: Bertrand Neron,                                                            #
#   Organization:'Biological Software and Databases' Group, Institut Pasteur, Paris.   #  
#   Distributed under GPLv2 Licence. Please refer to the COPYING.LIB document.        #
#                                                                                      #
########################################################################################


"""
Utilities to detect the format of sequence alignment files and to convert them between formats.
"""

import os , os.path , re
import sys
from subprocess import Popen , PIPE


import logging
# Logger used by this module to report conversion problems.
s_log = logging.getLogger('mobyle.converter')

import Mobyle.ConfigManager
# Provides MobyleError; UserValueError, raised below, presumably comes from here too.
from Mobyle.MobyleError import *

# Configuration object: its seqconverter() method is queried below both for
# the enabled converters and for the path of a given converter.
__cfg = Mobyle.ConfigManager.Config()

# Declares the custom epydoc field "@call" ("Called by") used in the docstrings of this module.
__extra_epydoc_fields__ = [('call', 'Called by','Called by')]


# Mobyle format code -> Mobyle format name.
myCode2Format = {
    8: 'FASTA',
    11: 'PHYLIPS',
    12: 'PHYLIPI',
    15: 'MSF',
    17: 'NEXUS',
    22: 'CLUSTAL',
    100: 'STOCKHOLM',
    101: 'MEGA',
}


# Mobyle format name -> Mobyle format code.
myFormat2Code = {
    'FASTA': 8,
    'PHYLIPS': 11,
    'PHYLIPI': 12,
    # temporary alias to ensure compatibility between squizz versions
    'PHYLIP': 12,
    'MSF': 15,
    'NEXUS': 17,
    'CLUSTAL': 22,
    'STOCKHOLM': 100,
    'MEGA': 101,
}



# Format name as reported by squizz -> Mobyle format code.
squizz2code = { 'CLUSTAL'  :  22 ,
                'FASTA'    :   8 ,
                'MSF'      :  15 ,
                'NEXUS'    :  17 ,
                'PHYLIPS'  :  11 ,
                'PHYLIPI'  :  12 ,
                # temporary alias to ensure compatibility between squizz versions;
                # it must carry the same code as 'PHYLIPI', otherwise the detected
                # code is unknown to every other table of this module.
                'PHYLIP'   :  12 ,
                'STOCKHOLM': 100 ,
                'MEGA'     : 101 ,
                }


# Mobyle format code -> format name handed to squizz.
code2squizz = {
    8: 'FASTA',
    11: 'PHYLIPS',
    12: 'PHYLIPI',
    15: 'MSF',
    17: 'NEXUS',
    22: 'CLUSTAL',
    100: 'STOCKHOLM',
    101: 'MEGA',
}



# Mobyle format code -> format name used by readseq.
code2readseq = {
    # we can't differentiate fasta sequences from fasta alignment
    8: 'Pearson|Fasta|fa',
    11: 'Phylip3.2',
    12: 'Phylip|Phylip4',
    15: 'MSF',
    17: 'PAUP|NEXUS',
    22: 'Clustal',
}


# Format name as reported by readseq -> Mobyle format code (inverse of code2readseq).
readseq2code = {'Pearson|Fasta|fa' : 8  ,  # we can't differentiate fasta sequences from fasta alignment
                # readseq detects every phylip format as 3.2: it cannot tell the
                # interleaved format from the sequential one. Furthermore it does not
                # read correctly sequential files where a sequence spans several lines.
                'Phylip3.2'        : 11 ,
                'Phylip|Phylip4'   : 12 ,
                'MSF'              : 15 ,
                'PAUP|NEXUS'       : 17 ,
                'Clustal'          : 22
                }



# Mobyle format code -> file extension (leading dot included) given to converted files.
code2suffixe = {
    8: '.fasta',
    11: '.phylips',
    12: '.phylipi',
    15: '.msf',
    17: '.nexus',
    22: '.aln',
    100: '.stockholm',
    101: '.mega',
}




class UnsupportedFormatError( MobyleError ):
    """
    Raised when a converter does not support the requested input or output format.
    """


def supportedFormat():
    """
    List the formats handled by the converters enabled in the configuration.

    @return: the Mobyle names of the formats known to at least one enabled converter
    @rtype: list of string
    """
    enabled = __cfg.seqconverter()

    squizz_codes = []
    if 'SQUIZZ' in enabled:
        squizz_codes = code2squizz.keys()

    readseq_codes = []
    if 'READSEQ' in enabled:
        readseq_codes = code2readseq.keys()

    # a format handled by both converters must be reported only once
    names = []
    for code in list( set( squizz_codes ) | set( readseq_codes ) ):
        names.append( myCode2Format[ code ] )
    return names


        
def format2code( prg ,format ):
    """
    Translate a format name reported by a detector into the Mobyle format code.

    @param prg: the program which has detect the format sequence
    @type prg: string 'squizz' , 'readseq'
    @param format: the format sequence return by detect
    @type format: string
    @return: the code corresponding to a sequence format
    @rtype: int
    @raise MobyleError: raise a L{MobyleError} if the format is not supported by Mobyle
      or if prg is neither 'squizz' nor 'readseq'
    """
    if prg == 'squizz':
        table = squizz2code
    elif prg == 'readseq':
        table = readseq2code
    else:
        # without this branch an unknown prg ended in an UnboundLocalError
        raise MobyleError( "this converter; " + str( prg ) + ", is not used in Mobyle" )

    try:
        return table[ format ]
    except KeyError:
        raise MobyleError( "this format ( " + str( format ) + " ) is not supported by Mobyle" )



def acceptCodes( converter ):
    """
    Give the format codes a converter is able to recognise.

    @param converter: the name of the sequence checker/converter (case insensitive)
    @type converter: string
    @return: a list of codes accepted by the sequence checker/converter
    @rtype: list of int
    @raise MobyleError: if the converter is neither squizz nor readseq
    """
    name = converter.upper()
    if name == 'SQUIZZ':
        table = squizz2code
    elif name == 'READSEQ':
        table = readseq2code
    else:
        raise MobyleError( "this converter is not supported by Mobyle" )
    return table.values()




def detect( fileName , prg = None):
    """
    detect the Sequence format of a sequence file (use external sequence checker program: squizz ,readseq ). you should install at least one of this software.
    @param fileName: the path of the file.
    @type fileName: string
    @return: a tuple of 3 elements
       - The name of the program which has detected the format
       - the code of this format
       - the string corresponding to this format.
    If the format could not be determined by any programs return (None, None, None)
    @rtype: tuple ( string prg, int inCode, string inFormat).
    @call: L{SequenceParameter.convert}
    @raise MobyleError: if the fileName doesn't exist a MobyleError is raised
    """
    _error = None
    fileName = str( fileName )
    if not os.path.exists( fileName ):
        raise MobyleError , "AlignmentConverter.detect " + str( fileName ) + " this file doesn't exist"

    if prg and __cfg.seqconverter( prg ) is not None:
        if prg == 'SQUIZZ' :
            return  squizzDetect( fileName )
        elif prg == 'READSEQ' :
            return readseqDetect( fileName )
        else:
            raise MobyleError , "this converter; " + prg + ", is not used in Mobyle"
       
    else:
        for prg in  __cfg.seqconverter():
            try:
                detected = eval( prg.lower() + 'Detect( "' + fileName + '" )' )

            except MobyleError , err:
                _error = err
                continue
            if detected and detected[1] is not None :
                return detected
        if _error is None:
            return ( None , None , None , None )
        else:
            raise MobyleError, _error


def squizzDetect( fileName ):
    """
    @param fileName: the path to the sequence file
    @type fileName: string
    @return: a tuple of 3 elements
        - the name of the sequence detector used 'squizz'
        - the format Number or None
        - the Format sequence detected or None 
    @rtype: ('squizz', int , string , int )
    """


    squizz_path =  __cfg.seqconverter( 'SQUIZZ' )
    if squizz_path is not None :
        try:
            squizz_pipe = Popen( [ squizz_path , "-An" , fileName ] ,
                                 shell = False ,
                                 stdout = None ,
                                 stdin = None ,
                                 stderr = PIPE
                                 )
            squizz_nb = True
            
        except OSError:
            raise MobyleError, "squizz exit abnormaly: " + err
       
        squizz_pipe.wait()
        if squizz_pipe.returncode != 0:
            err = ''.join( squizz_pipe.stderr.readlines() )
            #logger l'ereur
            # on a pas le nom du job ni sa cle ??
            match = re.search( "squizz: invalid option -- n" , err )
            if match:
                try:
                    squizz_pipe = Popen( [ squizz_path , "-A" , fileName ] ,
                                         shell = False ,
                                         stdout = None ,
                                         stdin = None ,
                                         stderr = PIPE
                                         )
                    squizz_nb = False
                except OSError , err :
                    raise MobyleError , err
           
                squizz_pipe.wait()
                if squizz_pipe.returncode != 0:
                    
                    err = ''.join( squizz_pipe.stderr.readlines() )
                    #logger l'ereur
                    # on a pas le nom du job ni sa cle ??
                    raise UserValueError( msg =  "squizz exit abnormaly: " + err  )
           
            else:
                raise UserValueError( msg =  "squizz exit abnormaly: " + err  )

        for line in squizz_pipe.stderr :
            if squizz_nb:
                match = re.search( ": (.+) format, (\d+) entries\.$" ,  line)
            else:
                match = re.search( ": (.+) format.$" ,  line)
            if match :
                format = match.group(1)
                if squizz_nb:
                    al_nb = int( match.group(2))
                else:
                    al_nb = 0
                break
                                  
        if match and format != "UNKNOWN":
            fmt_nb = format2code( 'squizz' , format )
            return ('squizz', fmt_nb , format , al_nb )
        else:
            return ('squizz', None , None , None )


def readseqDetect( fileName ):
    """
    @param fileName: the path to the sequence file
    @type fileName: string
    @return: a tuple of 3 elements
        - the name of the sequence detector used 'readseq'
        - the format Number or None
        - the Format sequence detected or None 
    @rtype: ('readseq', int , string )
    """
    
    readseq_path = __cfg.seqconverter( 'READSEQ' )
    if readseq_path is not None:
        # readseq generate automatically a genbank version of the input file
        # I don't care of this file thus I redirect it to /dev/null
        
        cmde = "%s -v -o /dev/null %s" %( readseq_path ,
                                             fileName
                                             )

        readseq_pipe = Popen( [ readseq_path , "-v" , "-o" , "/dev/null" , fileName ] ,
                              shell = False ,
                              stdout = None ,
                              stdin = None ,
                              stderr = PIPE
                              )
        readseq_pipe.wait()
        if readseq_pipe.returncode != 0:
            err = ''.join( readseq_pipe.stderr.readlines() )
            # logger l'ereur
            # on a pas le nom du job ni sa cle ??
            raise MobyleError, "readseq exit abnormaly: " + err

            
        pattern = re.compile("^Sequence.*format=\s+(\d+)\.\s+(\S+),")

        for line in readseq_pipe.stderr:
            match = re.search(pattern,  line )

            if match :
                #fmt_nb = match.group(1) 
                format = match.group(2)
                break

        if match and format.find( 'unknown' ) == -1 and format.find( 'Plain' ) == -1 :
            try:
                fmt_nb = format2code( 'readseq' , format )
            except MobyleError: 
                #readseq could not recognize separetly sequence and alignment
                #if a valid alignment is submitt readseq recognize the format but this 
                #format will be not a alignment format and  format2code will raised a mobyleError
                return ( 'readseq' , None , None , None)
            
            #we can't determine nb of alignment with readseq
            return ( 'readseq' , fmt_nb , format , None)

        else:
            #readseq doesn't support the raw format correctly
            #it recognize about every thing as raw
            
            return ('readseq', None , None , None)



def convert( fileName , fmtList , force = False):
    """
    convert a sequence file in a format among the fmtList. the sequence converted
     is write in a new file.

    Before a conversion the input file is renamed fileName + ".ori". It keeps
    this name when the conversion succeeds and gets its original name back when
    every attempt with a converter failed.

    @param fileName: the name of the Sequence file to convert
    @type fileName: string
    @param fmtList: a list of the format sequence ( see mobyle.dtd <ELEMENT AcceptetdFormat> )
    @type fmtList: list of string
    @param force: if force is True, do the conversion even if the detected format is in the accepted formats.
    @return: a tuple of 6 elements
      - the name of the converter used
      - the format in
      - the name of the file containing the input sequence
      - the format produce by the converter ( None if no conversion was needed )
      - the file name containing the sequence generated by the converter
      - the number of alignments
    or 6 None if the conversion isn't possible.
    @rtype: tuple (string, string , string , string , string , int)
    @raise MobyleError: if the file does not exist, if none of the formats of fmtList
      is known, or if something goes wrong during the conversion
      ( permission denied to write a file, to read the sequence file ...)
    @call: L{SequenceParameter.convert}
    """
    if not os.path.exists( fileName ):
        raise MobyleError( "convert: no such file " + str( fileName ) )

    # formats unknown to Mobyle are silently skipped, only an empty result is an error
    codeList = []
    for fmt in fmtList:
        code = myFormat2Code.get( fmt.upper() )
        if code is not None:
            codeList.append( code )
    if not codeList:
        if fmtList:
            msg = "the formats ( specified in the xml ) %s are not supported by Mobyle" % fmtList
        else:
            msg = "there is no dataFormats in acceptedDataFormats"
        raise MobyleError( msg )

    converters = { 'SQUIZZ'  : squizzConvert ,
                   'READSEQ' : readseqConvert ,
                   }

    for converter in __cfg.seqconverter():
        prg , inCode , inFormat , al_nb = detect( fileName , prg = converter )

        if inCode is None:
            continue #try whith the next converter

        if inCode in codeList:
            if not force:
                #      fmtPrg , fmtIn ,  inFileName ,fmtOut , outFileName , al_nb
                return ( prg , inFormat , fileName , None , fileName , al_nb)
            commonCodes = [ inCode ]
        else:
            # a list comprehension rather than a set intersection to keep the fmtList order
            accepted = acceptCodes( converter )
            commonCodes = [ code for code in codeList if code in accepted ]

        if not commonCodes:
            continue
        if converter not in converters:
            raise MobyleError( "this converter; " + converter + ", is not used in Mobyle" )
        convertWith = converters[ converter ]

        # Rename the input only once per converter: doing it for each output
        # format failed on the second attempt since fileName was already gone.
        oriFileName = fileName + ".ori"
        os.rename( fileName , oriFileName )
        converted = False
        try:
            for outCode in commonCodes :
                outFileName = os.path.splitext( fileName )[0] + code2suffixe[ outCode ]
                try:
                    prg , inFormat , outFormat , al_nb = convertWith( oriFileName ,
                                                                      outCode ,
                                                                      inCode ,
                                                                      outFileName
                                                                      )
                except UnsupportedFormatError:
                    continue #try the next outFormat

                if outFormat is not None:
                    converted = True
                    return ( prg , inFormat , oriFileName , outFormat , outFileName , al_nb)
        finally:
            if not converted:
                # give the input its name back so that the next converter, or
                # the caller, still finds it
                os.rename( oriFileName , fileName )

    #      fmtPrg , fmtIn ,  inFileName ,fmtOut , outFileName , al_nb
    return ( None , None ,     None ,    None   ,   None    ,   None  )


def squizzConvert( fileName , outCode , inCode = None , outFileName = None ):
    """
    @param fileName: the name of the file containing the sequence to convert
    @type fileName: string
    @param outCode: the format number in wich we want to convert the sequence
    @type outCode: int
    @type inCode: the format number detected by squizzdetect .
      if it keep at None a detection pass will be done again.
    @type inCode: int
    @type outFileName: the name of the file where the converter must write the
     sequence converted. if it None a name will be generate from the filename
     with changing the extension. the extensions used are defined in  code2suffixe
    @raise UnsupportedFormatError: if the inCode or the outCode are not supported
    @raise MobyleError: if something goes wrong during squizz convertion
      ( permission denied to write a file, to read the sequence file ...)
    """
    squizz_path = __cfg.seqconverter( 'SQUIZZ' )
    outFormat = code2squizz[ outCode ]

    if outFileName is None:
        outFileName = os.path.splitext( fileName )[0] + "." + code2suffixe[ outCode ]
        

    cmde =  [ squizz_path ,
              "-A",
              "-n",
              "-c", outFormat ,
              fileName
              ]

    if inCode :
        try:
            inFormat = squizz2code[ inCode ]

            cmde = [ squizz_path ,
                     "-A",
                     "-n",
                     "-f" , inFormat ,
                     "-c", outFormat ,
                     fileName
                     ]
            
        except KeyError , err :
            pass
    
    squizz_nb = True
    
    try:
        outFile = open( outFileName , 'w' )
    except IOError ,err :
        # pb on ne connait pas l'id du job
        # il faut paut etre cree une erreur convert error
        # la trapper au niveau supeieur (core.py)
        # et seulement a ce niveau logger l'erreur
        s_log.error( "can't write outFile:" + str( err ) )
        raise MobyleError , "Sequence Convertion Error: "+ str( err )

    
    try:
        squizz_pipe = Popen( cmde ,
                             shell  = False ,
                             stdout = outFile ,
                             stdin  = None ,
                             stderr = PIPE
                             )
    except OSError, err:
         raise MobyleError , err 
        
    squizz_pipe.wait()
        
    if squizz_pipe.returncode != 0:
            err = ''.join( squizz_pipe.stderr.readlines() )
            
            match = re.search( "squizz: invalid option -- n" , err )
            if match:
                try:
                    cmde = [ squizz_path ,
                            "-A" ,
                            "-c" , outFormat ,
                            fileName
                            ]               
                    if inCode :
                        try:
                            inFormat = squizz2code[ inCode ]
                            cmde = [ squizz_path ,
                                    "-S" ,
                                    "-c" , outFormat ,                    
                                    "-f" , inFormat ,
                                    fileName
                                    ]
            
                        except KeyError , err :
                            pass                    
                    
                    squizz_pipe = Popen( cmde ,
                                         shell  = False ,
                                         stdout = outFile ,
                                         stdin  = None ,
                                         stderr = PIPE
                                         )
                    squizz_nb = False
 
                except OSError , err :
                    raise MobyleError , err
           
                squizz_pipe.wait()
                if squizz_pipe.returncode != 0:
                    err = ''.join( squizz_pipe.stderr.readlines() )
                    #logger l'ereur
                    # on a pas le nom du job ni sa cle ??
                    raise UserValueError( msg = "squizz exit abnormaly: " + err )
           
            else: #the error doesn't come from -n option
                raise UserValueError( msg = "squizz exit abnormaly: " + err )
            
            
                
    outFile.close()
    err = ''.join( squizz_pipe.stderr.readlines() )
    if squizz_nb:
        match = re.search(  "(: \w+)?: (.+) format, (\d+) entries\.$",  err )
    else:
        match = re.search( "(: \w+)?: (.+) format\.$" , err )    
    
    if match:
        detectFormat = match.group(2)
        if squizz_nb:
            al_nb = int( match.group(3) )
        else :
            al_nb = 0        
    else:
        raise MobyleError , str( err )
        
    if squizz_pipe.returncode == 0:
        if match and detectFormat != "UNKNOWN":
            return ( 'squizz' , detectFormat , outFormat , al_nb) 
        else:
            # the inFormat is not recognize  
            return ( 'squizz' , None , None , None )
    else:
        if match and detectFormat == "unsupported" :
            #if the specified format ( -f format ) is not supported 
            raise UnSupportedFormatError , err
        else:
            raise MobyleError , str( err )
            






def readseqConvert( fileName , outCode , inCode = None , outFileName = None ):
    """
    @param fileName: the name of the file containing the sequence to convert
    @type fileName: string
    @param outCode: the format number in wich we want to convert the sequence
    @type outCode: int
    @type inCode: the format number detected by squizzdtect 
    @type inCode: int
    @type outFileName: the name of the file where the converter must write the
     sequence converted. if it None a name will be generate from the filename
     with changing the extension. the extensions used are defined in  code2suffixe
    """
    readseq_path = __cfg.seqconverter( 'READSEQ' )
    outFormat = myCode2Format[ outCode ]

    if outFileName is None:
        outFileName = os.path.splitext( fileName )[0] + "." + code2suffixe[ outCode ]
    
    # if the result sequence file is not specified with the
    # -o readseq option. readseq write the output sequence
    # in file named infile + suffixe_correspondint to the -f option
    # but doesn't used the stdout 
    cmde = [ readseq_path , "-a" , "-v" , "-f" , str( outCode ) ,"-o" , outFileName , fileName ]

    try:
        readseq_pipe = Popen( cmde ,
                              shell  = False ,
                              stdout = None,
                              stdin  = None ,
                              stderr = PIPE
                              )

    except OSError, err:
        raise MobyleError , str( err )

    readseq_pipe.wait()

    if readseq_pipe.returncode != 0:
        raise MobyleError

    format_pattern = re.compile("^Sequence.*format=\s+(\d+)\.\s+(\S+),")
   
    for line in readseq_pipe.stderr:
        match_fmt = re.search( format_pattern ,  line )

        if match_fmt :
            format_nb = int( match_fmt.group(1) )
            format = match_fmt.group(2)
            break
        elif line.find("No BioseqWriter for this format") != -1:
            #the format provide to -f option is not supported 
            raise UnSupportedFormatError , line

    if match_fmt and format.find( 'unknown' ) == -1 and format.find( 'Plain'):
          # I cant determine the nuber of alignment with readseq
          return ( 'readseq' , myCode2Format[ format_nb ]   , outFormat , 0 )
    else:
        try:
            # if readseq don't know the format (ex gde) it recognise it as Plain/raw
            # do a convertion but the final result is False !
            # thus the outFile must be erased
            os.unlink( outFileName )
        except IOError :
            pass

        return ( 'readseq' , None , None , None )

        






    
if __name__ == '__main__':
    # Command line use: the last argument is the file to convert, the
    # arguments before it are the names of the accepted formats.
    accepted_formats = sys.argv[1:-1]
    alignment_file = sys.argv[-1]
    print( convert( alignment_file , accepted_formats ) )

##     outCode = map(int ,sys.argv[2:])
##     seqFile = sys.argv[1]
##     print convert( seqFile , outCode ) 

## problems


