use strict;
# Command-line arguments: path of the tab-separated input table and path of
# the report file to write.
my $input = $ARGV[0];
my $output = $ARGV[1];
if (!defined($input) || !defined($output)) {
    die "Usage: $0 <input> <output>\n";
}

# Three-argument open with an explicit failure check, so a missing or
# unwritable file stops the script instead of silently producing an empty
# report.  The bareword handles IN and OUT are kept because the rest of the
# script reads from, prints to and closes them by name.
open(IN, '<', $input) or die "Cannot open input file '$input': $!\n";
open(OUT, '>', $output) or die "Cannot open output file '$output': $!\n";

my %fa;            # union-find parent table: id => parent id, "None" marks a root
my $cont=0;        # number of records counted
my $spl_site=0;    # records whose 7th column is "AG GT" or "AC CT"
my %h_pair_num;    # 6th column value => record count (reported as support pair reads)
my %h_len_num;     # 5th column value => record count (reported as length)
my $right=0;       # records whose 8th and 9th columns are both 0
my %type;          # 4th column value => record count (reported as circle type)

while (my $line = <IN>) {
    chomp($line);

    my @cc = split(/\t/, $line);
    next if $#cc < 2;    # skip lines with fewer than three columns

    $cont++;
    $spl_site++ if $cc[6] eq "AG GT" || $cc[6] eq "AC CT";
    $h_pair_num{$cc[5]}++;
    $h_len_num{$cc[4]}++;
    $right++ if $cc[7] == 0 && $cc[8] == 0;
    $type{$cc[3]}++;

    # The first column is the record id; it is split on "_" into a name
    # (presumably the chromosome/contig -- TODO confirm) followed by the
    # start and end coordinates.
    my $ss = $cc[0];
    my @c = split(/_/, $ss);

    # Register the id as its own set, but only the first time it is seen.
    # Resetting the entry for a repeated id would drop a parent link created
    # by an earlier combine() and split an already merged cluster.
    $fa{$ss} = "None" unless exists $fa{$ss};

    # Merge this id with every known id of the same name whose interval
    # strictly contains this start, or whose start lies strictly inside this
    # interval.
    # NOTE(review): because the comparisons are strict, two intervals that
    # share the same start coordinate are not linked by this test -- confirm
    # that this is intended.
    for my $k (keys %fa) {
        my @c_fa = split(/_/, $k);
        next if $c_fa[0] ne $c[0];

        my $start_inside_known = ($c[1] > $c_fa[1] && $c[1] < $c_fa[2]);
        my $known_start_inside = ($c_fa[1] > $c[1] && $c_fa[1] < $c[2]);
        if ($start_inside_known || $known_start_inside) {
            combine($k, $ss);
        }
    }
}

# Union-find is used to group ids into clusters of alternative
# circularization.
#
# combine($s1, $s2): union step.  Looks up the root of each id with
# find_father() and, when the roots differ, makes the root of $s1 a child of
# the root of $s2 in %fa.
sub combine
{
    my ($s1,$s2)=@_;
    my $root_of_first=find_father($s1);
    my $root_of_second=find_father($s2);
    if ($root_of_first ne $root_of_second) {
        $fa{$root_of_first}=$root_of_second;
    }
}

# find_father($s): find step of the union-find structure stored in %fa.
# Returns the root of the set containing $s (the id whose %fa entry is
# "None") and re-points every id visited on the way directly at that root.
sub find_father
{
    my ($s)=@_;

    # First pass: follow parent links until the root is reached.
    my $root=$s;
    while ($fa{$root} ne "None") {
        $root=$fa{$root};
    }

    # Second pass: path compression, every id on the path now points at the root.
    my $node=$s;
    while ($node ne $root) {
        my $parent=$fa{$node};
        $fa{$node}=$root;
        $node=$parent;
    }

    return $root;
}

# Collect the members of every union-find set: %h_out maps the root id of a
# set to a tab-separated string of all ids belonging to that set.
my %h_out;
for my $id (keys %fa)
{
    my $root=find_father($id);
    $h_out{$root} = exists($h_out{$root}) ? $h_out{$root}."\t".$id : $id;
}

# Summary counters gathered while reading the input.
print OUT "predict circle number:".$cont."\n";
print OUT "splicing site(AG/GT AC/CT) number:".$spl_site."\n";
print OUT "predict sequnce same as reference sequnce number:".$right."\n";

# A type that never occurred has no entry in %type; report it as 0 instead
# of leaving the field blank.
print OUT "circle type:\n",
    "exon:", ($type{"exon"} || 0),
    "\tintron:", ($type{"intron"} || 0),
    "\tintergenic:", ($type{"inter"} || 0),
    "\n\n";
print OUT "Length distribution:\n";

my $l_maxn=0;    # largest length seen
# $other is not used by this section, but the later distribution sections
# assign to it without declaring it, so the declaration has to stay here.
my $other=0;
my %h_len;       # bin index (int(length / $mul)) => number of records
my $mul=30;      # bin width

for my $len (keys %h_len_num)
{
    $h_len{int($len/$mul)} += $h_len_num{$len};
    $l_maxn = $len if $l_maxn < $len;
}

# Only bins 1..20 are printed; lengths below $mul or at/above 21*$mul are
# binned above but do not show up in the table (they still count for the
# maximum).
for my $i (1..20)
{
    my $count = exists($h_len{$i}) ? $h_len{$i} : 0;
    print OUT join("", $i*$mul, "-", $i*$mul+$mul, "\t", $count, "\n");
}

print OUT "max length:\t".$l_maxn."\n\n";
print OUT "Support pair reads number distribution:\n";

# Bucket counters; each bucket is named after its upper bound.
my $p_1=0;
my $p_5=0;
my $p_10=0;
my $p_20=0;
my $p_50=0;
my $p_100=0;
my $p_200=0;
my $p_maxn=0;    # largest support value seen
$other=0;        # records with a support value above 200

for my $k (keys %h_pair_num)
{
    my $n=$h_pair_num{$k};
    $p_maxn = $k if $p_maxn < $k;

    if    ($k == 1)   { $p_1   += $n; }
    elsif ($k <= 5)   { $p_5   += $n; }
    elsif ($k <= 10)  { $p_10  += $n; }
    elsif ($k <= 20)  { $p_20  += $n; }
    elsif ($k <= 50)  { $p_50  += $n; }
    elsif ($k <= 100) { $p_100 += $n; }
    elsif ($k <= 200) { $p_200 += $n; }
    else              { $other += $n; }
}

# The "1" bucket is counted above, so it is reported as well; without it
# the buckets do not add up to the total number of records.
print OUT "1:\t".$p_1."\n";
print OUT "2-5:\t".$p_5."\n";
print OUT "6-10:\t".$p_10."\n";
print OUT "11-20:\t".$p_20."\n";
print OUT "21-50:\t".$p_50."\n";
print OUT "51-100:\t".$p_100."\n";
print OUT "101-200:\t".$p_200."\n";
print OUT "other:\t".$other."\n";
print OUT "max support pair reads:\t".$p_maxn."\n\n";

print OUT "Alternative cluster distribution:\n";

# Counters for the number of clusters of a given size; $c_5 .. $c_30 are
# named after the upper bound of their size range.
my $c_1=0;
my $c_2=0;
my $c_3=0;
my $c_4=0;
my $c_5=0;
my $c_10=0;
my $c_15=0;
my $c_30=0;
my $c_maxn=0;     # size of the largest cluster
$other=0;         # clusters with more than 30 members
my %h_out_num;    # cluster root => number of members

for my $root (keys %h_out)
{
    my @members=split(/\t/,$h_out{$root});
    my $size=scalar(@members);

    $c_maxn=$size if $c_maxn<$size;
    $h_out_num{$root}=$size;

    if    ($size==1)  {$c_1++;}
    elsif ($size==2)  {$c_2++;}
    elsif ($size==3)  {$c_3++;}
    elsif ($size==4)  {$c_4++;}
    elsif ($size<=5)  {$c_5++;}
    elsif ($size<=10) {$c_10++;}
    elsif ($size<=15) {$c_15++;}
    elsif ($size<=30) {$c_30++;}
    else              {$other++;}
}

print OUT "1:\t", $c_1, "\n",
    "2:\t", $c_2, "\n",
    "3:\t", $c_3, "\n",
    "4:\t", $c_4, "\n",
    "5:\t", $c_5, "\n",
    "6-10:\t", $c_10, "\n",
    "11-15:\t", $c_15, "\n",
    "16-30:\t", $c_30, "\n",
    "other:\t", $other, "\n",
    "max alternative cluster:\t", $c_maxn, "\n\n";

# List every cluster with at least two members, largest cluster first.
# Within a cluster the ids are written in order of start coordinate, then
# end coordinate, each followed by a tab; clusters are separated by a blank
# line.
foreach my $root ( sort { $h_out_num{$b} <=> $h_out_num{$a} } keys %h_out ) {
     next if $h_out_num{$root} < 2;

     # id => [start, end], taken from the 2nd and 3rd "_"-separated fields.
     my %pos_of;
     for my $id (split(/\t/, $h_out{$root})) {
          my @fields = split(/_/, $id);
          $pos_of{$id} = [ $fields[1], $fields[2] ];
     }

     # Compare start and end separately.  Folding both into one number as
     # start*100000000+end puts ids in the wrong order as soon as an end
     # coordinate reaches 100000000.
     my @ordered = sort {
          $pos_of{$a}[0] <=> $pos_of{$b}[0]
               or
          $pos_of{$a}[1] <=> $pos_of{$b}[1]
     } keys %pos_of;

     for my $id (@ordered) {
          print OUT $id."\t";
     }
     print OUT "\n\n";
}

close IN;
# Buffered write errors (for example a full disk) only surface when the
# handle is closed, so the result of closing the report must be checked
# before announcing success.
close(OUT) or die "Error closing output file '$output': $!\n";
print "circ_seq_statistics.pl Done!\n";
print "\n";
