#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
circseq_cup -- circular RNA analysis toolkits.

Usage: get_candidate_site.py [options]

Options:
    -h --help                      Show this screen.
    -f FUSION --fusion=FUSION      TopHat-Fusion fusion BAM file. (used in \
TopHat-Fusion mapping)
    -j JUNC --junc=JUNC            The fusion info file.
    -g GENOME --genome=GENOME      Genome FASTA file.
    -o PREFIX --output=PREFIX      Output prefix [default: circ].
    -l LENGHT --length=LENGHT      The maximal length between candidate start and end [default: 5000].
"""


from docopt import docopt
import sys
import pysam
from collections import defaultdict
from interval import Interval
import tempfile
import os


def convert_fusion(fusion_bam, genome_fa, output_double_seq,
                   max_length=None, min_length=20):
    """
    Extract fusion junction reads from the BAM file and write every
    candidate region as a doubled sequence in FASTA format.

    Fragments are taken from parse_bam(), which yields them in consecutive
    pairs. A pair becomes a candidate when its two fragments do not overlap
    and the second yielded fragment carries rr_tag == 1. The candidate
    region runs from the smaller start to the larger end of the pair, and
    each distinct (chrom, start, end) region is written only once.

    fusion_bam        -- iterable of alignments, handed to parse_bam().
    genome_fa         -- object with fetch(chrom, start, end) returning the
                         sequence as a string (a pysam Fastafile in this
                         script).
    output_double_seq -- path of the FASTA file to write (overwritten).
    max_length        -- largest accepted value of end - start. When None,
                         the global options['--length'] set by the
                         command-line entry point is used.
    min_length        -- smallest accepted value of end - start.
    """
    if max_length is None:
        # Resolve the command-line limit once instead of once per region.
        max_length = int(options['--length'])

    fusions = defaultdict(int)
    first_span = None
    for i, read in enumerate(parse_bam(fusion_bam)):
        chrom, _strand, start, end, rr_tag = read
        if i % 2 == 0:  # first fragment of the fusion junction read
            first_span = (start, end)
            continue
        # second fragment of the fusion junction read
        sta1, end1 = first_span
        no_overlap = end1 < start or end < sta1
        if no_overlap and rr_tag == 1:
            region = '%s\t%d\t%d' % (chrom, min(sta1, start), max(end1, end))
            fusions[region] = 1

    total = 0
    # The context manager closes the output even if fetch() or write() fails.
    with open(output_double_seq, 'w') as f_double_seq:
        for pos in fusions:
            chrom, sta_text, end_text = pos.split('\t')
            sta = int(sta_text)
            end = int(end_text)
            if min_length <= end - sta <= max_length:
                total += fusions[pos]
                seq = genome_fa.fetch(chrom, sta, end)
                pos_name = chrom + '_' + sta_text + '_' + end_text
                # Double the sequence to make the circular reference.
                f_double_seq.write('>%s\n%s%s\n' % (pos_name, seq, seq))
    print('Converted %d fusions' % total)
 

def convert_junction(input_f, genome_fa, output_double_seq,
                     max_length=None, min_length=20):
    """
    Read candidate junctions from a text file and write every accepted
    region as a doubled sequence in FASTA format.

    Each non-blank line of input_f must start with three
    whitespace-separated fields: chromosome, start and end. Extra fields
    are ignored. A region is accepted when
    min_length <= end - start <= max_length.

    input_f           -- path of the junction/fusion info file.
    genome_fa         -- object with fetch(chrom, start, end) returning the
                         sequence as a string (a pysam Fastafile in this
                         script).
    output_double_seq -- path of the FASTA file to write (overwritten).
    max_length        -- largest accepted value of end - start. When None,
                         the global options['--length'] set by the
                         command-line entry point is used.
    min_length        -- smallest accepted value of end - start.
    """
    if max_length is None:
        # Resolve the command-line limit once instead of once per line.
        max_length = int(options['--length'])

    total = 0
    # Open the input first so an unreadable input does not truncate an
    # existing output file; both handles are closed even on error.
    with open(input_f, 'r') as f, \
            open(output_double_seq, 'w') as f_double_seq:
        for line in f:
            fields = line.split()
            if not fields:  # tolerate blank lines (e.g. a trailing newline)
                continue
            chrom, start, end = fields[:3]
            sta = int(start)
            end = int(end)
            if min_length <= end - sta <= max_length:
                total += 1
                seq = genome_fa.fetch(chrom, sta, end)
                pos_name = chrom + '_' + str(sta) + '_' + str(end)
                # Double the sequence to make the circular reference.
                f_double_seq.write('>%s\n%s%s\n' % (pos_name, seq, seq))
    print('Converted %d junctions' % total)


def parse_bam(bam):
    """
    Yield the fragments of fusion junction reads, two at a time.

    Alignments are skipped when they are secondary, carry no 'XF' tag, or
    when the two chromosomes named in the second field of the XF tag
    differ. The first remaining alignment seen for a read name is
    remembered. Every later alignment with that name which matches the
    remembered chromosome and strand yields itself followed by the
    remembered fragment.

    Each fragment is a list [chrom, strand, pos, aend, rr_tag]. rr_tag is 0
    when the fourth field of the XF tag contains both 'M' and 'm' (the
    fr and rf types of fusion, which the caller discards) and 1 otherwise.
    """
    first_seen = {}
    for aln in bam:
        if aln.is_secondary:  # not the primary alignment
            continue
        tag_map = dict(aln.tags)
        if 'XF' not in tag_map:  # not fusion junctions
            continue
        xf_fields = tag_map['XF'].split()
        chr1, chr2 = xf_fields[1].split('-')
        if chr1 != chr2:  # not on the same chromosome
            continue
        s_position = xf_fields[3]
        # fr and rf type fusions mix 'M' and 'm' in the position field
        if 'M' in s_position and 'm' in s_position:
            rr_tag = 0
        else:
            rr_tag = 1
        strand = '-' if aln.is_reverse else '+'
        mate = first_seen.get(aln.qname)
        if mate is None:  # first fragment
            first_seen[aln.qname] = [chr1, strand, aln.pos, aln.aend, rr_tag]
        elif (chr1, strand) == (mate[0], mate[1]):  # matching second fragment
            yield [chr1, strand, aln.pos, aln.aend, rr_tag]
            yield mate



if __name__ == '__main__':
    # With no arguments at all, show the usage text and stop.
    if len(sys.argv) == 1:
        sys.exit(__doc__)
    # `options` stays a module-level name: convert_fusion() and
    # convert_junction() read options['--length'] from it.
    options = docopt(__doc__)

    # Exactly one of --fusion / --junc must be given.
    if options['--junc'] and options['--fusion']:
        sys.exit('Could not use --fusion and --junc simultaneously!')
    if not options['--junc'] and not options['--fusion']:
        sys.exit('--fusion or --junc should be used!')

    fusion_bam = None
    if options['--fusion']:
        # `except Exception` (not a bare except) so Ctrl-C and SystemExit
        # are not reported as a bad BAM file.
        try:
            fusion_bam = pysam.Samfile(options['--fusion'], 'rb')
        except Exception:
            sys.exit('Please make sure %s is BAM file!' % options['--fusion'])
    try:
        genome_fa = pysam.Fastafile(options['--genome'])
    except Exception:
        sys.exit('Please make sure %s is a Fasta file and indexed!'
                 % options['--genome'])

    output_prefix = options['--output']
    # Circular reference FASTA: <cwd>/<prefix>_output/circ_index/<prefix>_circ_ref.fa
    # The circ_index directory is not created here; it must already exist.
    circ_ref_fa = (os.getcwd() + "/" + output_prefix + "_output/circ_index/"
                   + output_prefix + "_circ_ref.fa")

    # Parenthesised print matches the functions above and parses on both
    # Python 2 and Python 3.
    print("get_candidate_site.py Start .. ")
    if options['--junc']:
        convert_junction(options['--junc'], genome_fa, circ_ref_fa)
    elif options['--fusion']:
        convert_fusion(fusion_bam, genome_fa, circ_ref_fa)
    print("get_candidate_site Done! ")
