#!/usr/bin/perl -w

=head
#This program, for given enqury sequence in FASTA format, returns a list of FIDPD subdomains (0) 
to whom enqury sequence might be matched using HMMSCAN program. And for each matched
subdomains, the FDPD module active sites will be mapped back to the enqury aimono-acids as 
the candidates of prediction sites.

Such information (1) as residue-ligand interaction patterns (VDW/HYDROGEN-BONDS/PI-PI/ELECTRO/etc.) 
as well as the FREQUENCY of active sites illustrated by subdomain membrane proteins are 
recorded for each candidate sites. Also recorded is the conservation value (2) of these sites derived 
from the multiple sequence alignment belonging to this FDPD subdomain module. Finally, the program
recorded the E-value (3) calculated by HMMSCAN that reflect the consistency between equenry sequence 
and the FIDPD module.

This program calls the external program hmmscan, which is the searching engine of HMMMER (http://hmmer.janelia.org) 


=cut

use strict;

my  ($fastafile,$outputf,$myjob,$myseq,
     $database_dir,$fidpd_db,$fidpd_hmmf,$fidpd_annotationf,$evalue_cutoff,$site_annotation_type,
     $very_conserv,$media_conserv,$little_conserv,
     $pred_method,
     $flag_use_bestmatch,$match_number,$flag_check)=&parse_arg();

my ($fastaname,$predictions,$enqury_fastaseq)=&_predict_active_site_with_fidpd($fastafile,$site_annotation_type,$evalue_cutoff);
exit if(! $predictions);
&_write_intermediate_predictions($fastaname,$outputf,$predictions) if ($flag_check);

my ($nsite,$predsites)=&_prediction_postprocessing($fastaname,$predictions,$very_conserv,$media_conserv,$little_conserv);
if($nsite<=0){
    print "_FIDPD_$fastaname"."_WARNING> NO_PREDICTION\n";
    exit;}
else {
    print "_FIDPD_$fastaname> $nsite FDPD-MODULE-SITES FOUND (WITHOUT FILTERING)\n";
}

if($flag_check) {
    my $allpredf=$fastaname.".allsites";
    open(ALLDATA,">$allpredf");
    print ALLDATA "_FIDPD_$fastaname> $nsite FDPD-MODULE-SITES FOUND (WITHOUT FILTERING)\n";
    foreach my $resid (sort {$$predsites{$b}{FRE}<=>$$predsites{$a}{FRE}} keys %$predsites){
	printf ALLDATA "_FIDPD_ALLLIST> %6d ",$resid;	
	printf ALLDATA  "%12d  %12.3f",  $$predsites{$resid}{FRE}, $$predsites{$resid}{FPT};
	printf ALLDATA  " ||  %10d %10d %10d %10d %10d %10d %10d %10d %10d %10d %10d\n", 
	$$predsites{$resid}{FRE},$$predsites{$resid}{COV},$$predsites{$resid}{COO},$$predsites{$resid}{ELE},
	$$predsites{$resid}{HBD},$$predsites{$resid}{HBA},$$predsites{$resid}{HBN},$$predsites{$resid}{PAI},
	$$predsites{$resid}{VDW},$$predsites{$resid}{CAT},$$predsites{$resid}{PHO};
    }
    close(ALLDATA);
    print "_FIDPD_$fastaname> CHECKING DETAILS IN $outputf, xxx.scan, ALL_FDPD_HITS: $allpredf\n";
}

my ($method,$npred,$predcut)=&_making_prediction_w_topscores($predsites,$pred_method);

print "_FIDPD_PINFO> $npred SITES FOUND\n";

#printf "\#FIDPD_PLIST> %6s %4s %12s  %12s","RESID","NAME","METHOD_SCORE","FREQ_PERCENT";
#    printf " ||  %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n", 
#    "FRE","COV","COO","ELE","HBD","HBA","HBN","PAI","VDW","CAT","PHO";

printf "\#FIDPD_PLIST> %6s %4s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s  %15s\n","RESID","NAME",
    "COV","COO","ELE","HBD","HBA","HBN","PAI","VDW","CAT","PHO","FRE";


my @enquery_seq= split '', $enqury_fastaseq;

#print ">>>$enqury_fastaseq\n";
#print ">>>@enquery_seq\n";
#exit;


my $list=0;
foreach my $resid (sort {$$predsites{$b}{$method}<=>$$predsites{$a}{$method}}  keys %$predsites){
    #last if $$predsites{$resid}{$method} <= $predcut;
    $list++;
    my $tot=$$predsites{$resid}{FRE}; my $percnt=0;

=head
    printf "_FIDPD_PLIST> %6d %4s ",$resid,$enquery_seq[$resid-1];	
    printf "%6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %20d\n", 
    $$predsites{$resid}{COV}/$tot,$$predsites{$resid}{COO}/$tot,$$predsites{$resid}{ELE}/$tot,
    $$predsites{$resid}{HBD}/$tot,$$predsites{$resid}{HBA}/$tot,$$predsites{$resid}{HBN}/$tot,$$predsites{$resid}{PAI}/$tot,
    $$predsites{$resid}{VDW}/$tot,$$predsites{$resid}{CAT}/$tot,$$predsites{$resid}{PHO}/$tot,$tot;
=cut

    printf "_FIDPD_PLIST> %6d %4s ",$resid,$enquery_seq[$resid-1];	
    $percnt = $$predsites{$resid}{COV}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{COO}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{ELE}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{HBD}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{HBA}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{HBN}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{PAI}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{VDW}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{CAT}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    $percnt = $$predsites{$resid}{PHO}/$tot;
    if($percnt < 0.001) {printf "     0 ";} else {printf "%6.3f ",$percnt} 
    printf " %15d\n",$tot;

=head
    printf "_FIDPD_PLIST> %6d %4s ",$resid,$enquery_seq[$resid-1];	
    printf "%12d  %12.3f", $$predsites{$resid}{$method}, $$predsites{$resid}{FPT};
    printf "_FIDPD_PLIST> %6d %4s ",$resid,$enquery_seq[$resid-1];
    printf " %12d %10d %10d %10d %10d %10d %10d %10d %10d %10d %10d\n",
    $$predsites{$resid}{FRE},$$predsites{$resid}{COV},$$predsites{$resid}{COO},$$predsites{$resid}{ELE},
    $$predsites{$resid}{HBD},$$predsites{$resid}{HBA},$$predsites{$resid}{HBN},$$predsites{$resid}{PAI},
    $$predsites{$resid}{VDW},$$predsites{$resid}{CAT},$$predsites{$resid}{PHO};
=cut

    #print "===============================================\n" if $$predsites{$resid}{$method} >= $predcut;
    last if $list >= $npred;
}

exit;

sub _making_prediction_w_topscores {
    my ($predsites,$pred_method0)=@_; 

    my $pred_method=$pred_method0;
    if(! $pred_method) {
	print "_FIDPD_ERR> UNDEFINED PREDICTION_METHOD, NO PREDICTION MADE. (USING OPTION: -pm OR NONE).\n"; 
	return;
    }

    my $usetopp=0; my $usentop=0;
    my($myuse,$toppercent,$tailcut,$ntop,$npred,$predcut);
    
    $pred_method =~ s/^\s|\s$//g; $pred_method = uc $pred_method;
    if($pred_method =~ /^TOPPERCENT\s(.*)/i){
	$usetopp=1; $pred_method=$1;}
    elsif($pred_method =~ /^TOPP\s(.*)/i){
	$usetopp=1; $pred_method=$1;}
    elsif($pred_method =~ /^TOPP$/i){
	$usetopp=1; $pred_method="FREQ 45 35";}
    elsif($pred_method =~ /^TOPPERCENT/i){
	$usetopp=1; $pred_method="FREQ 45 35";}
    
    elsif($pred_method =~ /^NTOP\s(.*)/i) {
	$usentop=1; $pred_method=$1}
    elsif($pred_method =~ /^NTOP$/i) {
	$usentop=1; $pred_method='FREQ 3'}
    else {
	print "_FIDPD_ERR> UNRECOGNIZABLE PREDICTION_METHOD: $pred_method\n"; 
	return;
    }
    $pred_method =~ s/^\s|\s$//g;

    if($pred_method =~ /^(\S*)\s+(.*)/) {
	$myuse=$1; $pred_method=$2}
    elsif ($pred_method =~ /^(\S*)/){
	$myuse=$1; $pred_method='';}
    else {
	print "_FIDPD_ERR> 2nd PARAMETER ERROR IN PREDICTION_METHOD: \"$pred_method\"  :: \"$pred_method0\"\n"; 
	return;	
    }

    if($usetopp) {
	if($pred_method =~ /(\S+)\s+(\S+)/) {
	    $toppercent=$1/100; $tailcut=$2/100;}
	elsif ($pred_method =~ /(\S+)/){
	    $toppercent=$1/100; $tailcut=0.35;}
	else {
	    $toppercent=0.45; $tailcut=0.35;
	}
    }
    elsif($usentop) {
	if ($pred_method =~ /(\S+)/){
	    $ntop=$1;}	
	else {
	    $ntop=3;
	}
    }
    else {
	print "_FIDPD_$fastaname"."_ERR> UNRECOGNIZABLE PREDICTION/FILTER-METHOD: $pred_method0, PLEASE USE ONE OF: TOPPERCENT/NTOP\n";
	return;
    }
       
    print "_FIDPD_$fastaname> NTOP_PREDICTION USING:  \"$myuse\", CUTs = \"$ntop\" CUTs\n" if($usentop);
    print "_FIDPD_$fastaname> TOPP_PREDICTION USING:  \"$myuse\", TOPP = \"$toppercent\", TAILCUT = \"$tailcut\"\n" if($usetopp);    
    my $method='';
    $method='FRE' if $myuse =~ /^FRE$/ or $myuse =~/^FREQUENCY$/ or $myuse =~/^FREQ$/;
    $method='COO' if $myuse =~ /^COO$/ or $myuse =~ /^COORDINATE$/;
    $method='COV' if $myuse =~ /^COV$/ or $myuse =~ /^COVALENCE$/;
    $method='ELE' if $myuse =~ /^ELE$/ or $myuse =~ /^ELECTROSTATIC$/;
    $method='HBA' if $myuse =~ /^HBA$/ or $myuse =~ /^HB_A$/;
    $method='HBD' if $myuse =~ /^HBD$/ or $myuse =~ /^HB_D$/;
    $method='HBN' if $myuse =~ /^HBN$/ or $myuse =~ /^HB_N$/;
    $method='PAI' if $myuse =~ /^PI$/ or $myuse =~ /^PI_PI$/;
    $method='VDW' if $myuse =~ /^VDW$/;
    $method='CAT' if $myuse =~ /^CAT$/  or $myuse =~ /^CATALYTIC$/;
    $method='PHO' if $myuse =~ /^PHO$/  or $myuse =~ /^PHOSPHORYLATION$/;
    if (!$method) {
	print "_FIDPD_$fastaname"."_ERR> UNRECOGNIZABLE PREDICTION METHOD. PLEASE USE: FRE,COO,COV,ELE,HBA/B/N,PAI,VDW,CAT or PHO\n";
	exit;
    }
    my $nnpred=keys %$predsites; 
    if($usentop && $ntop >= $nnpred) {
	print "_FIDPD_$fastaname"."_WARNING> REQUIRED NTOP ($ntop) >= NNPRED(UN-TRIMMED PREDICTIONS: $nnpred), NNPRED USED INSTEAD OF NTOP\n";
	$npred=$nnpred;
	$predcut = -1;
	return $method,$npred,$predcut 
    }

    my $maximum='';  $npred=0;
    foreach my $resid (sort {$$predsites{$b}{$method}<=>$$predsites{$a}{$method}} keys %$predsites){	
	$maximum=$$predsites{$resid}{$method} if ! $maximum;
	$npred++;
	#printf "_FIDPD_SELECT> %6d %16.3f\n",$resid,$$predsites{$resid}{$method};
	if($usentop && $npred > $ntop) {
	    $predcut=$$predsites{$resid}{$method};
	    $npred--;
	    last;
	}
    }

    if($usetopp) {
	my $tocut=$maximum*$tailcut; my $nn=0;
	foreach my $resid (sort {$$predsites{$b}{$method}<=>$$predsites{$a}{$method}} keys %$predsites){
	    $nn++ if($$predsites{$resid}{$method} > $tocut);
	}
	$ntop=$nn*$toppercent; $ntop=1 if($ntop < 1);
	if($ntop >= $nnpred) {
	    print "_FIDPD_$fastaname"."_WARNING> REQUIRED NTOPP >= NNPRED(UN-TRIMMED PREDICTIONS), NNPRED USED INSTEAD OF NTOP\n";
	    $npred=$nnpred;
	    $predcut = -1;
	    return $method,$npred,$predcut;
	}

	$npred=0;
	foreach my $resid (sort {$$predsites{$b}{$method}<=>$$predsites{$a}{$method}} keys %$predsites){
	    $npred++;
	    if($npred > $ntop) {
		$predcut=$$predsites{$resid}{$method};
		$npred--;
		last;
	    }
	}
    }

    return $method,$npred,$predcut;
}

sub _predict_active_site_with_fidpd{
    my ($fastafile,$site_annotation_type,$evalue_cutoff)=@_; 
    #
    #-- INPUT SEQUENCE
    #
    my ($fastaname,$fastaseq,$slen,$fastafile2);
    ($fastaname,$fastaseq)=&_get_enqury_seq($fastafile) if ($fastafile);
    if(! $fastaseq and $myseq) {
	$fastaseq=$myseq;
	$fastaname=$myjob;
	$fastafile2=$fastaname.".fasta";
	open(WFASTA,">$fastafile2") ||die "_FIDPD_$fastaname> CANNOT OPEN-WRITE FASTA WITH SEQUENCE TO $fastafile\n";
	print WFASTA "> $fastaname | INPUT SEQUENCE\n";
	print WFASTA $fastaseq;
	close WFASTA;	    
    }
    if(! $fastaseq) {
	print "_FIDPD_$fastaname"."_ERR> NO SEQUENCE INPUT, PREDICTION SKIPPED\n";
	return; 
    }
    else {
	$slen=length $fastaseq;
	print "_FIDPD_$fastaname> QUERY SEQUENCE LENGTH = $slen\n";
    }
    
    #
    #-- PREDICTED FDPD-SUBDOMAIN/MODULE USING HMMSCAN 
    #
    # firstseq       :: matched FDPD-subdomain sequence, with first_start/first_end
    # secondseq      :: aligned enqury sequence, with second_start
    # conservation   :: conservation symbol return by hmmscan/alignment 
    #
    # all above quantities are depends on two matching indices: 
    #                   1 -- subdomain ID, or gf_id as listed in .sto file
    #                   2 -- domain list, exclusively used in hmmscan output
    #                        i.e. given one SUBDOMAIN-ID, 
    #                        there might be >=2 subsubdomain matches
    #
    my $fasta_file;
    $fasta_file=$fastafile if $fastafile; #INPUT FASTA FILE
    $fasta_file=$fastafile2 if $fastafile2; #INPUT FASTA FILE
    
    my ($predictions,$firstseq,$secondseq,$conservation,$first_start,$second_start,$first_end)=
	&_scan_fidpd_subdomain_with_hmmscan($fastaname,$fasta_file,$fidpd_hmmf);
    system ("rm -f $fastafile2") if $fastafile2 and ! $flag_check; #INPUT FASTA FILE
    if (! $predictions){
	print "_FIDPD_$fastaname> NO PREDICTION FROM FDPD!\n";
	return;
    }    
    my $nnpredict=keys %$predictions;
    if($match_number) {
	if($match_number > $nnpredict) {
	    print "_FIDPD_$fastaname"."_WARNING> REQUIRED PREDICTION NUMBER $match_number OUT OF RANGE $nnpredict, IT IS RESET TO 1\n";	
	    $match_number=1;
	}
	else {
	    print "_FIDPD_$fastaname"."_WARNING> USING NO. $match_number FROM $nnpredict PREDICTIONS\n";	
	}
    }
    #
    #-- FDP-MODULE-FUNCTION_SITES
    #
    # read active-site information from identified subdomain entries (recorded in "$predictions")
    #
    # fidpd_asites  :: {$gf_id}{$site}{$gap}{$type}{'FREQ'|'COV'|'COORD'|'ELECT'|'HB_D'|'HB_A'|'PI'|'VDW'|'CATALYTIC'|'PHOSPHORYLATION'}
    # gf_info      :: {$gf_id}{SQ|CL|CF|SF|FA|DM|SP}
    #
    #
    my ($fidpd_asites,$gf_info)=&_get_fidpd_module_sites($fastaname,$fidpd_annotationf,$predictions);
#=head
    foreach my $key (sort {$$predictions{$a}<=>$$predictions{$b}}keys %$predictions){
	my ($gf_id,$domain_id)=split/_/,$key;
	my $e_value=$$predictions{$key};
	#print "PREDICT>>> $gf_id : $domain_id :: $e_value >>> $$first_start{$gf_id}{$domain_id} : $$first_end{$gf_id}{$domain_id}  >> $$second_start{$gf_id}{$domain_id} \n";
    } 
#=cut 

    my %all_predict_sites=(); undef %all_predict_sites;


    my $npredict=0;           #NUMBER OF HMMSCAN HITS FROM FDPD
    my $npred_w_sites=0;      #NUMBER OF HMMSCAN HITS FROM FDPD WITH FUNCTION SITE RECORDS
    foreach my $fidpd_entry (sort {$$predictions{$a}<=>$$predictions{$b}} keys %$predictions){  #FOR EACH FDPD-SUBDOMAIN-PREDICTION
	$npredict++; 

	my ($gf_id,$domain_id)=split/_/,$fidpd_entry;
	my $e_value=$$predictions{$fidpd_entry};

	next if($match_number && $npredict != $match_number);

	next if($e_value > $evalue_cutoff);

	#print ">>>> EEEEEE>>> $e_value :: $evalue_cutoff\n";

	$all_predict_sites{$fidpd_entry}{INFO}{NPREDICT}=$npredict;
	$all_predict_sites{$fidpd_entry}{INFO}{GFID}=$gf_id;
	$all_predict_sites{$fidpd_entry}{INFO}{E}=$e_value;
	foreach my $txt (keys %{$$gf_info{$gf_id}}) {
	    $all_predict_sites{$fidpd_entry}{INFO}{$txt}=$$gf_info{$gf_id}{$txt};
	}
	$all_predict_sites{$fidpd_entry}{INFO}{ANNOTATIONTYPE}=$site_annotation_type; 

	#print "GF|domain|E-value $gf_id|$domain_id|$e_value\n";

	my $nseq=$$gf_info{$gf_id}{SQ}+0;    #NUMBER OF SEQUENCES IN THIS FDPD SUBDOMAIN ENTRY
	
	my $first_seq=$$firstseq{$gf_id}{$domain_id};  
	my $start_first_seq=$$first_start{$gf_id}{$domain_id};
	my $end_first_seq=$$first_end{$gf_id}{$domain_id};
	my $second_seq=$$secondseq{$gf_id}{$domain_id}; 
	my $start_second_seq=$$second_start{$gf_id}{$domain_id};
	my $seq_conservation=$$conservation{$gf_id}{$domain_id};	
	my ($corr,$before_gap)=&_hmmscan_seqalign_position_corr($fastaname,$first_seq,$second_seq,$start_first_seq,$start_second_seq);
	$all_predict_sites{$fidpd_entry}{INFO}{BG}=$$corr{$start_first_seq}{0}[0];
	$all_predict_sites{$fidpd_entry}{INFO}{ED}=$$corr{$end_first_seq}{0}[0];
	
	my ($nsite_this_subdomain,$prediction_sites)=&_get_prediction_sites_from_fidpd($fidpd_asites,$gf_id,$nseq,$start_first_seq,$end_first_seq,$site_annotation_type,
						  $corr,$before_gap,$seq_conservation);

	foreach my $resid (sort {$a<=>$b} keys %$prediction_sites) {
	    foreach my $txt (keys %{$$prediction_sites{$resid}}){
		$all_predict_sites{$fidpd_entry}{SITE}{$resid}{$txt} = $$prediction_sites{$resid}{$txt};
	    }
	}
	$npred_w_sites++ if $nsite_this_subdomain and $nsite_this_subdomain >=1;

	if ($flag_use_bestmatch) { #ONLY USE THE BEST FIT
	    print "_FIDPD_$fastaname> BESTMATCH USED E-VALUE= $$predictions{$fidpd_entry}\n";
	    last;
	}
	if($match_number && $match_number == $npredict){
	    print "_FIDPD_$fastaname> NO. $match_number MATCH E-VALUE= $$predictions{$fidpd_entry}\n";
	    last;
	}


    }
#    print "_FIDPD_$fastaname> FDPD = \"$fidpd_db\", ANNOTATION = \"$site_annotation_type\", EVALUE CUTOFF = $evalue_cutoff\n";
    print "_FIDPD_$fastaname> FDPD = \"FIDPD\", ANNOTATION = \"$site_annotation_type\", EVALUE CUTOFF = $evalue_cutoff\n";
    print "_FIDPD_$fastaname> $npred_w_sites SITE-CONTAINING ENTRIES FROM $npredict FDPD-SUBDOMAIN PREDICTIONS\n";

    return $fastaname,\%all_predict_sites,$fastaseq;
}


###########################################################
###########################################################
##############   sub of corrlation   ######################
sub _hmmscan_seqalign_position_corr{
    my ($fastaname,$firstseq,$secondseq,$first_start_id,$second_start_id)=@_;

    my $firstlen=length($firstseq); my $secondlen=length($secondseq); 
    if($firstlen != $secondlen) {
	print "_FIDPD_$fastaname> HMMSCAN OUTPUT ERROR SEQUENCE ALIGNMENT,",
	" CHECK HMMSCAN OUTPUT: ?$fastaname.scan\n";
	print "_FIDPD_$fastaname> SEQ1: ",$firstseq,"\n";
	print "_FIDPD_$fastaname> SEQ2: ",$secondseq,"\n";
	
    }
    my $firstaa=substr($firstseq,0,1); 
    if($firstaa eq '.' || $firstaa eq "-") {
	print "_FIDPD_$fastaname> HMMSCAN 1st SEUQENCE ALIGNMENT WARNING: FIRST AA IS \".\"\n";
    }
    $firstaa=substr($secondseq,0,1); 
    if($firstaa eq '.' || $firstaa eq "-") {
	print "_FIDPD_$fastaname> HMMSCAN 2nd SEUQENCE ALIGNMENT WARNING: FIRST AA IS \".\"\n";
    }

    my %ngaps_before_position1=(); undef %ngaps_before_position1;
    my $position=$first_start_id; my $ngap=0;

    for(1..length($firstseq)) {	
	my $i=$_;
	my $aa=substr($firstseq,$i-1,1);
	$ngap++ if($aa eq "." || $aa eq "-"); 

	$ngaps_before_position1{$position}=$ngap;   #TOTAL NUMBER OF GAPS BEFORE THIS (NON-GAP/REAL AA) POSITION
	$position++ if($aa ne "." && $aa ne "-");   #update position by +1, pointing to the position of next NON-GAP a.a.
    }

    my %ngaps_before_position2=(); undef %ngaps_before_position2;
    $position=$second_start_id; $ngap=0;
    for(1..length($secondseq)) {	
	my $i=$_;
	my $aa=substr($secondseq,$i-1,1);
	$ngap++ if($aa eq "." || $aa eq "-"); 

	$ngaps_before_position2{$position}=$ngap;   #TOTAL NUMBER OF GAPS BEFORE THIS (NON-GAP/REAL AA) POSITION
	$position++ if($aa ne "." && $aa ne "-");   #update position by +1, pointing to the position of next NON-GAP a.a.
    }

    my $ii=0;  #PHYSICAL POSITION IN THE HMMSCAN ALIGNMENT, NO DISTINGUISH BETWEEN REAL AA AND GAP
    my $nngaps=0;        # NUMBER OF GAPS BEFORE THE "PREVIOUS" POSITION
    my $insert=0;       # NUMBER OF GAPS INSERTED AT THIS POSITION 
    my %seqindex1=(); undef %seqindex1;
    foreach my $i (sort {$a<=>$b} keys %ngaps_before_position1){    # $i -- NON-GAP/REAL AA POSITION
	$insert=$ngaps_before_position1{$i}-$nngaps;
	if($insert == 0) { 
	    ++$ii;
	    $seqindex1{$ii}{SITE}=$i; 
	    $seqindex1{$ii}{GAP}=0;
	}
	else {
	    for my $j (1..$insert){      #ADDING GAPS TO PREVIOUS NON-GAP POSITION
		++$ii;
		$seqindex1{$ii}{SITE}=$i-1; 
		$seqindex1{$ii}{GAP}=$j; 
	    }
	    ++$ii;
	    $seqindex1{$ii}{SITE}=$i; 
	    $seqindex1{$ii}{GAP}=0;
	}
	$nngaps=$ngaps_before_position1{$i};
    }

    $ii=0; $nngaps=0; $insert=0;
    my %seqindex2=(); undef %seqindex2;
    foreach my $i (sort {$a<=>$b} keys %ngaps_before_position2){    # $i -- NON-GAP/REAL AA POSITION
	$insert=$ngaps_before_position2{$i}-$nngaps;
	if($insert == 0) { 
	    ++$ii;
	    $seqindex2{$ii}{SITE}=$i; 
	    $seqindex2{$ii}{GAP}=0;
	}
	else {
	    for my $j (1..$insert){      #ADDING GAPS TO PREVIOUS NON-GAP POSITION
		++$ii;
		$seqindex2{$ii}{SITE}=$i-1; 
		$seqindex2{$ii}{GAP}=$j; 
	    }
	    ++$ii;
	    $seqindex2{$ii}{SITE}=$i; 
	    $seqindex2{$ii}{GAP}=0;
	}
	$nngaps=$ngaps_before_position2{$i};
    }
    if($ii ne $secondlen) {
	print "_FIDPD_$fastaname> LEN NON_MATCH IN GAP PROCESSING: $ii --- $secondlen\n";
    }

    my %corr=(); undef %corr;
    
    foreach my $i (sort {$a<=>$b} keys %seqindex1){
	my $real=$seqindex1{$i}{SITE}; my $gap=$seqindex1{$i}{GAP};
	my $match_real=$seqindex2{$i}{SITE}; my $match_gap=$seqindex2{$i}{GAP};
	push (@{$corr{$real}{$gap}},$match_real,$match_gap);
    }
=head

    foreach my $i (sort {$a<=>$b} keys %ngaps_before_position1){
	print "_FIDPD_$fastaname> GAP1: $i == $ngaps_before_position1{$i}\n";
    }
    print "\n";
    foreach my $i (sort {$a<=>$b} keys %ngaps_before_position2){
	print "_FIDPD_$fastaname> GAP2: $i == $ngaps_before_position2{$i}\n";
    }
    foreach my $i (sort {$a<=>$b} keys %corr){
	foreach my $j (sort {$a<=>$b} keys %{$corr{$i}}){
	    print "$i:$j ====  $corr{$i}{$j}[0]:$corr{$i}{$j}[1]\n";
	    print "=================\n";
	}
    }
    foreach my $i (sort {$a<=>$b} keys %seqindex2){
	print ">>>>>222>>>>>> $i ::  $seqindex1{$i}{SITE} :: $seqindex1{$i}{GAP} ------ $seqindex2{$i}{SITE} :: $seqindex2{$i}{GAP}\n";
    }

   if($ii ne $firstlen) {
	print ">>>>> LEN---NON_MATCH IN GAP PROCESSING: $ii --- $firstlen\n";
    }
    foreach my $i (sort {$a<=>$b} keys %seqindex1){
	print ">>>>>>>>>>> $i ------ $seqindex1{$i}{SITE} :: $seqindex1{$i}{GAP}\n";
    }
=cut

    return \%corr,\%ngaps_before_position1;
}
    
#
#
#===========================================================================
sub parse_arg{
    my ($fastafile,$outputf,$fidpd_db,$fidpd_hmmf,$fidpd_annotationf,$evalue_cutoff,$site_annotation_type,
	$database_dir);
    my ($myjob,$myseq);
    my ($very_conserv,$media_conserv,$little_conserv);  #CONSERVATION VALUES FOR MATCHING
    my $pred_method;
    my ($flag_use_bestmatch,$match_number,$flag_check);

    my $numArgs = $#ARGV + 1;
    my $index=-1;
    while (++$index<$numArgs){
	my $para=$ARGV[$index];
	if($para eq "-in" || $para eq "-fasta" || $para eq "-f"){
	    $fastafile=$ARGV[++$index];}
	elsif($para eq "-seq" || $para eq "-sequence"){  #input sequence from command line
	    $myseq=$ARGV[++$index];}

	elsif($para eq "-job" || $para eq "-jobname"){
	    $myjob=$ARGV[++$index];}
	elsif($para eq "-fidpd" || $para eq "-database" || $para eq "-db"){
	    $fidpd_db=$ARGV[++$index];}

	elsif($para eq "-hmm"){
	    $fidpd_hmmf=$ARGV[++$index];}
	elsif($para eq "-ecut" || $para eq "-evalue_cutoff"){
	    $evalue_cutoff=$ARGV[++$index];}
	elsif($para eq "-annotation" || $para eq "-mod"){
	    $fidpd_annotationf=$ARGV[++$index];}
	elsif($para eq "-sitetype" || $para eq "-type"){
	    $site_annotation_type=$ARGV[++$index];}
	elsif($para eq "-output" || $para eq "-o"){
	    $outputf=$ARGV[++$index];}
	elsif($para eq "-datdir" || $para eq "-database_dir"){
	    $database_dir=$ARGV[++$index];}

	elsif($para eq "-bigconserv" || $para eq "-c3"){
	    $very_conserv=$ARGV[++$index];}
	elsif($para eq "-mediaconserv" || $para eq "-c2"){
	    $media_conserv=$ARGV[++$index];}
	elsif($para eq "-littleconserv" || $para eq "-c1"){
	    $little_conserv=$ARGV[++$index];}

	elsif($para eq "-prediction_method" || $para eq "-pm"){
	    $pred_method=$ARGV[++$index];}	

                                             #IN MANY CASES, HMMSCAN FOUND MANY SUBDOMAINS THAT MATCH ENQUERY SEQUENCES 
	elsif($para eq "-bestmatch" or $para eq "-best"){        #USING THE BEST-MATCH (HMMSCAN LOWEST E-VALUE) AS PREDICTION-OUTPUT
	    $flag_use_bestmatch='true';}
	elsif($para eq "-match"){            #USING THE $match_number MATCH AS PREDICTION-OUTPUT  
	    $match_number=$ARGV[++$index];}

	elsif($para eq "-h"){
	    &Show_Usage();}	
	elsif($para eq "-c"){
	    $flag_check='true';}
	else {  	    
	    print "\nunrecognized option: \"$para\"!\n";
	    &Show_Usage();}
    }
    if(! $fastafile && ! $myseq) {
	print "_FIDPD_ERR> PLEASE INPUT ENQURY SEQUENCE FASTA FILE WITH \"-in\"\n";
	print "_FIDPD_ERR> OR INPUT ENQURY SEQUENCE FROM COMMAND-LINE USING \"-seq\"\n";
	&Show_Usage;}
    if($fastafile) {
	if($fastafile =~ /(\S+)\./){
	    $outputf= $1.".psite" if ! $outputf;}
	else {
	    $outputf= $fastafile.".psite" if ! $outputf;}
    }
    else {
	$outputf='job1.psite' if ! $outputf and ! $myjob;
	$outputf=$myjob.".psite" if ! $outputf and $myjob;
    }

    $database_dir="/databases/fidpd" if ! $database_dir;  
    $fidpd_db="fidpd" if ! $fidpd_db;
    $fidpd_hmmf=$fidpd_db.".hmm" if ! $fidpd_hmmf;
    $fidpd_hmmf=$fidpd_hmmf.".hmm" if $fidpd_hmmf !~ /\.hmm$/;
    $fidpd_annotationf=$fidpd_db.".mod";
    $database_dir=$database_dir."/".$fidpd_db;
    $fidpd_hmmf=$database_dir."/".$fidpd_hmmf;
    $fidpd_annotationf=$database_dir."/".$fidpd_annotationf;


    $site_annotation_type='SITE' if ! $site_annotation_type;
    $evalue_cutoff=3 if ! $evalue_cutoff;
    $evalue_cutoff=1/10**$evalue_cutoff;
    $evalue_cutoff=1000 if $match_number or $flag_use_bestmatch; #IGNORE EVALUE-CUTOFF WHEN USING PREDICTION-MATCH-SELECTION
    
    $very_conserv=5 if ! $very_conserv;
    $media_conserv=3 if ! $media_conserv; 
    $little_conserv=2 if ! $little_conserv; 

    $pred_method='TOPPERCENT FREQ  45 35' if !$pred_method;
    $myjob='JOB1' if ! $myjob; 
    #$myseq='' if ! $myseq; 

    return $fastafile,$outputf,$myjob,$myseq,
    $database_dir,$fidpd_db,$fidpd_hmmf,$fidpd_annotationf,$evalue_cutoff,$site_annotation_type,
    $very_conserv,$media_conserv,$little_conserv,
    $pred_method,
    $flag_use_bestmatch,$match_number,$flag_check;
}

sub Show_Usage{
    print "\n";
    print "Usage: run.pl [-in fastafile |-seq sequence] [-hmm fidpd_hmm_file] [-ecut evalue_cutoff]\n";
    print "              [-mod fidpd_site_annotation_file] [ -datdir database_dir] \n";
    print "              [-o outputf] [-type site_annotation_type] [-pm prediction_method]\n";
    print "              [-check | -c]\n";
    print "     -h  show this message\n";
    print "    -in  input enqury sequence file in FASTA format (-fasta/-f)\n";
    print "   -seq  input enqury sequence\n";
    print "     -o  output file of primary prediction (-output)\n\n";

    print "-datdir  directory for database/annotation\n";
    print "   -hmm  FDPD subdomain database file in hmm format (-database)\n";
    print "   -mod  FDPD subdomain asite annotation file\n";
    print "  -fidpd  FDPD database name (NOW we support: s104, s107 (default), s304, s307), (it surpresses -hmm/-mod)\n";
    print "         eg, \"-in a.fasta -fidpd s107\" equals to \"-in a.fasta -hmm s107.hmm -mod s107.mod\"\n"; 
    print "  -ecut  E-value cutoff in selecting hmmscan output (-evalue_cutoff 0.001)\n";
    print "  -type  resource for domain active sites (SITE/DPA/LG, SITE)\n";
    print "           SITE--derived from PDB site records,\n";
    print "           DPA---derived from DPA predictions,\n\n";
    print "    -c3  concervation value: -bigconserv (5)\n";
    print "    -c2  concervation value: -neduaconserv (3)\n";
    print "    -c1  concervation value: -littleconserv (2)\n";
    print "    -pm  input prediction method, one of following two forms:\n";
    print "         1) TOPPERCENT  USE TOP [TAIL_CUT]: [delete TAIL_CUT% sites], then give TOP% sites as prediction; using USE\n";
    print "                                default: \"TOPPERCENT FREQ 45 35\"\n";
    print "         2)       NTOP  USE   N       [ST]: give [exactly] top N sites as prediction; using USE\n";
    print "  -best  using the best (1st) hmm-match as prediction (-bestmatch)\n";
    print " -match  match_number: n, using the nth hmmscan subdomain for site-prediction [-match n]\n";
    print "     -c  print intermediate data such as .psite .allsitesfiles\n";
    print "\n";
    exit;
}

#======================================================
sub _get_fidpd_module_sites{
    my ($fastaname,$fidpd_annotation_file,$predictions)=@_;

    my %gf_info=(); undef %gf_info;
    my %fidpd_asites=(); undef %fidpd_asites;

    foreach my $subid (sort{$$predictions{$a}<=>$$predictions{$b}} keys %$predictions){
	my ($subdomain,$did)=split/_/,$subid;
	my $subdir=substr($subdomain,1,3); my $modf= $database_dir."/".$subdir."/".$subdomain.".mod";
#	print ">>>>$subdomain >>> $did >>$$predictions{$subid} >>>> $modf\n";
	
	open(DATABASE,"< $modf")
	    or die "_FIDPD_$fastaname"."_ERR> CANNOT OPEN-READ FDPD-ANNOTATED MODULE FILE $modf\n";
	while(<DATABASE>){
	    my $line=$_;  chomp($line);
	    if($line=~/#=GF SQ (\S+)/){$gf_info{$subdomain}{SQ}=$1;}
	    if($line=~/#=GF CL (.*)/){ $gf_info{$subdomain}{CL}=$1;}
	    if($line=~/#=GF CF (.*)/){ $gf_info{$subdomain}{CF}=$1;}
	    if($line=~/#=GF SF (.*)/){ $gf_info{$subdomain}{SF}=$1;}
	    if($line=~/#=GF FA (.*)/){ $gf_info{$subdomain}{FA}=$1;} 
	    if($line=~/#=GF DM (.*)/){ $gf_info{$subdomain}{DM}=$1;}   
	    if($line=~/#=GF SP (.*)/){ $gf_info{$subdomain}{SP}=$1." (or so)";}   
	    
	    #print "$line\n";
	    #SITE : 1 : 0 : 3 >>> 0 : 0 : 0 : 3 : 0 : 0 : 21 : 0 : 0
	    #SITE : 2 : 0 : 2 >>> 0 : 0 : 0 : 0 : 0 : 0 : 7 : 0 : 0
#     if($line=~/_(\S+)\s:\s(\d+)\s:\s(\d+)\s:\s(\d+)/){
	    if($line=~/(\w+)\s:\s(\d+)\s:\s(\d+)\s:\s(\d+) >>>\s(.*)/){
		my $type=$1; my $site=$2; my $gap=$3; my $freq=$4; 
		my $pattern=$5; 
		my ($cov,$coord,$elect,$hbd,$hba,$pi,$vdw,$catalytic,$phosphorylation)=split/\s\:\s/,$pattern;
		
#	  print "_FIDPD_$fastaname>"." SITE MATCH :: $gf_id> $type - $site - $gap - $freq :: $pattern\n";
#	  print ">>>>$cov,$coord,$elect,$hbd,$hba,$pi,$vdw,$catalytic,$phosphorylation<<<\n"; exit;
		
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'FREQ'}=$freq;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'COV'}=$cov;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'COORD'}=$coord;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'ELECT'}=$elect;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'HB_D'}=$hbd;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'HB_A'}=$hba;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'PI'}=$pi;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'VDW'}=$vdw;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'CATALYTIC'}=$catalytic;
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'PHOSPHORYLATION'}=$phosphorylation;
	    }
	}
	close DATABASE;
#	if ($flag_use_bestmatch) { #ONLY USE THE BEST FIT
#	    print "_FIDPD_$fastaname> BESTMATCH USED E-VALUE= $$predictions{$subid}\n";
#	    last;
#	}
    }
=head   #####check module_active################## 
    my $type=$site_annotation_type;
    foreach my $subdomain(sort keys %fidpd_asites){
	foreach my $site(sort keys %{$fidpd_asites{$subdomain}}){
	    foreach my $gap(sort keys %{$fidpd_asites{$subdomain}{$site}}){
		print "_FIDPD_$fastaname> MATCH-MODULE-SITES ==>  $subdomain : $site : $gap >>> ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'FREQ'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'COV'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'COORD'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'ELECT'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'HB_D'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'HB_A'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'PI'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'VDW'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'CATALYTIC'},"  ",
		$fidpd_asites{$subdomain}{$site}{$gap}{$type}{'PHOSPHORYLATION'},"\n";
	    }
	}
    }
    exit;
=cut

    return \%fidpd_asites,\%gf_info;
}


sub _get_enqury_seq{
    my ($fasta_file)=@_;    
#------find fastaseq
    my $fastaname=$fasta_file; 
    if($fastaname =~ /\./ and ! $fastaname =~ /\.\// ){
	$fastaname='UNDEF'; 
	print "_FIDPD_$fastaname"."_WARNING> UNREASONABLE FASTA INPUT FILENAME: $fasta_file\n"; }
    elsif($fastaname =~ /(\S+)\./){
	$fastaname=$1
    }
    if(! -e "$fasta_file") {
	print "_FIDPD_$fastaname"."_ERR> CANNOT OPEN-READ THE FASTA/SEQUENCE FILE: $fasta_file, PREDICTION SKIPPED\n";
	return $fastaname;
    }
    my $fastaseq=''; 
    open(FASTAFILE,"$fasta_file");  
    while(<FASTAFILE>){
	my $line=$_; chomp($line);
	if($line=~/^>(\S+)/){                           #first FASTA line
	    $fastaname=$1;
	    if($fastaname =~ /(.*)\|/) {$fastaname=$1;}
	    $fastaname=~s/\s+//g;
	    next;}
	elsif($line=~/\s*(\S+)/) {
	    $fastaseq=$fastaseq.$1; $fastaseq=~s/-//g;	
	}
	else {
	    last;
	}
    }
    close (FASTAFILE);

    if(! $fastaseq) {
	print "_FIDPD_$fastaname> NO SEQUENCE READ IN FASTA FILE: $fasta_file\n";
	return $fastaname;
    }
    else {
#	print "_FIDPD_$fastaname> ENQUERY SEQUENCE: $fastaseq\n";
    }

    return $fastaname,$fastaseq;
}


sub _scan_fidpd_subdomain_with_hmmscan{
    my ($fastaname,$fastafile,$fidpd_hmm)=@_;
    my $hmmscan_outputf="_8591_".$fastaname.".scan"; $hmmscan_outputf =~ s/\s//g;
    
    if(! -e $fidpd_hmmf) {
	print "_FIDPD_$fastaname> CANNOT FIND HMMFILE: $fidpd_hmmf\n";
	exit;}
    system "/usr/bin/hmmscan $fidpd_hmmf $fastafile > $hmmscan_outputf\n";

    if(! -e $hmmscan_outputf) {
	print "_FIDPD_$fastaname> NO OUTPUT WITH HMMSCAN FROM $fidpd_hmm\n";
	return ;
    }
#    my %start=();my %over=();    #start{gf}{domain}
#-------build align between $hmmseq and $seq
    #!!!!!! notice: different from build_dpa_scop.pl the hmmseq is forhead

#>> 190299001
#   #    score  bias  c-Evalue  i-Evalue hmmfrom  hmm to    alifrom  ali to    envfrom  env to     acc
# ---   ------ ----- --------- --------- ------- -------    ------- -------    ------- -------    ----
#   1 !  218.9   2.3     4e-69   1.6e-65       1     128 [.       1     128 [.       1     129 [] 0.99
#
#  Alignments for each domain:
#  == domain 1    score: 218.9 bits;  conditional E-value: 4e-69
#  190299001   1 kvfekcelarelkrlgldgyrGvslaewvclakfesgyntqainenadestdyGilqinsklwckdgktpksrnicniscskllddditddvacakkivk 100
#                kv+++cela+++krlgld+yrG+sl++wvc+akfes++nt+a+n+n+d+stdyGilqins++wc+dg+tp+s+n+cni+cs+ll++dit++v+cakki +
#       1jef   1 KVYGRCELAAAMKRLGLDNYRGYSLGNWVCAAKFESNFNTHATNRNTDGSTDYGILQINSRWWCNDGRTPGSKNLCNIPCSALLSSDITASVNCAKKIAS 100
#                799************************************************************************************************* PP
#
#  190299001 101 dkkGinawvawkarckekdleqylkgce 128
#                  +G+nawvaw++rck++d++++++gc+
#       1jef 101 GGNGMNAWVAWRNRCKGTDVHAWIRGCR 128
#                ***************************8 PP

    my $subdomain='';  my $domain_id;
    my %predictions=();
    my %firstseq=();
    my %secondseq=();
    my %conservation=();

    my %first_start=(); my $define_firststart='';     my %first_end=();
    my %second_start=(); my $define_secondstart='';
    my $is_conservation_line=0; my $conservation_startcut=0; my $conservation_cutnum=0;

    open(FDPDSCAN,"$hmmscan_outputf")
	or die "_FIDPD_$fastaname> CANNOT OPEN-READ HMMSCAN OUTPUT: $hmmscan_outputf\n";
    while(<FDPDSCAN>){
	my $alnline=$_; chomp($alnline);

	if($alnline=~/^>> (\d+)/){  #MATCHED SCOP_SUB_DOMAIN IDs BY HMMSCAN
	    $subdomain=$1;          #START A HIT/SCOP_SUB_DOMAIN ENTRY
	}
	if($alnline=~/^  == domain (\d+)\s.*E-value: (.*)/){    #E-values 
	    $domain_id=$1;                                      #sometime given sequence was divided into a few domains by hmmscan
	    $predictions{$subdomain.'_'.$domain_id}=$2;

	    $firstseq{$subdomain}{$domain_id}='';               #A NEW FIRST  SEQUENCE
	    $conservation{$subdomain}{$domain_id}='';           #A NEW AA CONSERVATION SEQUENCE
	    $secondseq{$subdomain}{$domain_id}='';              #A NEW SECOND SEQUENCE
	    next;
	}
#
#MACTHING FIRST ALIGNMENT SEQUENCE SUCH AS THE FOLLOWING
#	  190299001   1 kvfekcelarelkrlgldgyrGvslaewvclakfesgyntqainenadestdyGilqinsklwckdgktpksrnicniscskllddditddvacakkivk 100
#         190299001 101 dkkGinawvawkarckekdleqylkgce 128
#         159439000   6 ivigggpsglmaaigaaeeganvllldkgnklgrkla.isgggrcnvtnrlpldeivkhipgngrf....lysafs..ifnnediitffenlgvklkeed 98
# 
	if($subdomain && $alnline=~/^\s+($subdomain)\s+(\d+)\s+(\S+)\s+(\d+)/){
	    my $subseq=$3;
	    $firstseq{$subdomain}{$domain_id}=$firstseq{$subdomain}{$domain_id}.$subseq;
	    if(! exists $first_start{$subdomain}{$domain_id}) {
		$first_start{$subdomain}{$domain_id}=$2;  #ALIGNMENT BEGINING SITE
	    }
	    $first_end{$subdomain}{$domain_id}=$4;        #ALIGNMENT ENDING SITE REPLACED IN EACH MATCH
	    #
	    #PREPARING FOR THE NEXT LINE WHICH IS CONSERVATION LINE AND
	    #  FIND THE POSITIONS IN CONSERVATION LINE THAT MATCH HEAD AND TAIL OF THE ALIGNMENT
	    #
	    $is_conservation_line=1;    
	    $alnline=~/^(.*)$subseq/;
	    $conservation_startcut=length($1);
	    $conservation_cutnum=length($subseq);
	    next;
	}	
#
#READING CONSERVATIONS
#       kv+++cela+++krlgld+yrG+sl++wvc+akfes++nt+a+n+n+d+stdyGilqins++wc+dg+tp+s+n+cni+cs+ll++dit++v+cakki +
#       ivig gp gl+a +  a+ g n +++++g+ +  +     g  r       p  ++     g g f    lys +    f    +it f + g
#
	if($is_conservation_line==1){
	    $conservation{$subdomain}{$domain_id}=
		$conservation{$subdomain}{$domain_id}.substr($alnline,$conservation_startcut,$conservation_cutnum);
	    $is_conservation_line=0;
	    next;
	}
#
#MACTHING SECOND ALIGNMENT SEQUENCE SUCH AS THE FOLLOWING
#         1jef   1 KVYGRCELAAAMKRLGLDNYRGYSLGNWVCAAKFESNFNTHATNRNTDGSTDYGILQINSRWWCNDGRTPGSKNLCNIPCSALLSSDITASVNCAKKIAS 100
# 	  1jef 101 GGNGMNAWVAWRNRCKGTDVHAWIRGCR 128
#        T0604 111 IVIGFGPCGLFAGLVLAQMGFNPIIVERGKEVRERTKdTFGFWRKRTL--NPESNVQFGEGGAGTFsdgkLYSQVKdpNFYGRKVITEFVEAGAPEEILY 208
#
	if($alnline=~/^\s+($fastaname)\s+(\d+)\s+(\S+)\s+(\d+)/){
	    $secondseq{$subdomain}{$domain_id}=$secondseq{$subdomain}{$domain_id}.$3;
	    if(!  exists $second_start{$subdomain}{$domain_id}) {
		$second_start{$subdomain}{$domain_id}=$2;  #ALIGNMENT BEGINING SITE
	    }
	    $is_conservation_line=0;
	    next;
	}
    }


    if(! %predictions){
	print "_FIDPD_$fastaname> NO HIT WITH HMMSCAN FROM $fidpd_hmm\n";
	system "rm -f $hmmscan_outputf" if(! $flag_check);
	return;
    }  
  
=head      ########check matched_fidpd_subdomains###########
    my $nhit=0;
    foreach my $subdomain (sort {$predictions{$a}<=>$predictions{$b}} keys %predictions) {
	$nhit++;
	print "_FIDPD_$fastaname>  $nhit HIT :: $subdomain\n";
    }
=cut
#
#-- HERE WE REPLACE "BLANK" POSITION IN CONSERVATION BY "+", IF THE MATCH AT THE POSITION IS NOT "GAP"!
#
    foreach my $dd (keys %firstseq) {
	foreach my $di (keys %{$firstseq{$dd}}){
	    my $fs=$firstseq{$dd}{$di};
	    my $ss=$secondseq{$dd}{$di};
	    my $cs0=$conservation{$dd}{$di};
	    my $cs='';
	    my $l1=length $fs;
	    for (my $jj=0; $jj < $l1; $jj++) {
		my $c=substr($cs0,$jj,1);
		my $f=substr($fs,$jj,1);
		my $s=substr($ss,$jj,1);
		
		if($c =~/[a-z]|[A-Z]|[+]/) {
		    $cs=$cs.$c;	}
		elsif($f =~ /[a-z]|[A-Z]/ &&  $s =~ /[a-z]|[A-Z]/){
		    $cs=$cs."+";
		}
		else {
		    $cs=$cs." ";
		}
	    }
	    $conservation{$dd}{$di}=$cs;
	}
    }

    system "rm -f $hmmscan_outputf" if(! $flag_check);

    return \%predictions,\%firstseq,\%secondseq,\%conservation,
    \%first_start,\%second_start,\%first_end;
}


#---------find predict active site from database
sub _get_prediction_sites_from_fidpd{
    my ($fidpd_asites,$gf_id,$nseq,$start_id,$end_id,$type,  #gf_id is subdomain entry
	$corr,$before_gap,$conservation)=@_;

    my %prediction_sites=(); my $nsite=0;
    
    foreach my $site (sort {$a<=>$b} keys %{$$fidpd_asites{$gf_id}}){	
	next if($site<$start_id || $site>$end_id);                    #prediction residue-range matching to the FDPD module

	my $scop_seq_locate=$site-($start_id)+$$before_gap{$site}+1;  #the position in HMMSCAN ALIGNMENT

	my $res_conser=substr($conservation,$scop_seq_locate-1,1);
	if($res_conser=~/[a-z]|[A-Z]|[+]/){

	    foreach my $gap (sort keys %{$$fidpd_asites{$gf_id}{$site}}){
		my $predict_active_residue=$$corr{$site}{$gap}[0];
		next if(! defined $predict_active_residue);
		my $percent=0; 
		my $freq=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'FREQ'};
		my $cov=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'COV'};
		my $coord=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'COORD'};
		my $elect=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'ELECT'};
		my $hbd=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'HB_D'}; 
		my $hba=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'HB_A'};
		my $pi=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'PI'};
		my $vdw=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'VDW'};
		my $catalytic=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'CATALYTIC'}; 
		my $phosphorylation=$$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'PHOSPHORYLATION'};

		$percent=$freq/$nseq if $nseq;
		
		$prediction_sites{$predict_active_residue}{'MODULE_SITE'}=[$site,$gap];
		$prediction_sites{$predict_active_residue}{'CONSERVATION'}=$res_conser;
		$prediction_sites{$predict_active_residue}{'FREQ'}=$freq;
		$prediction_sites{$predict_active_residue}{'FREQ-PERCENT'}=$percent;
		$prediction_sites{$predict_active_residue}{'COV'}=$cov;
		$prediction_sites{$predict_active_residue}{'COORD'}=$coord;
		$prediction_sites{$predict_active_residue}{'ELECT'}=$elect;
		$prediction_sites{$predict_active_residue}{'HB_D'}=$hbd;
		$prediction_sites{$predict_active_residue}{'HB_A'}=$hba;
		$prediction_sites{$predict_active_residue}{'PI'}=$pi;
		$prediction_sites{$predict_active_residue}{'VDW'}=$vdw;
		$prediction_sites{$predict_active_residue}{'CATALYTIC'}=$catalytic;
		$prediction_sites{$predict_active_residue}{'PHOSPHORYLATION'}=$phosphorylation;
		$nsite++;
 	    }
	}
    }

    return $nsite,\%prediction_sites;
}


sub _write_intermediate_predictions{
    my ($fastaname,$outputf,$predictions)=@_;
    print "..... *****  ----> $outputf\n"; exit;

    open(OUTPUT,"> $outputf") or die "_FIDPD_$fastaname> CANNOT OPEN-WRITE OUTPUT PREDICTION$!\n";
    foreach my $fentry (sort {$$predictions{$a}{INFO}{E} <=> $$predictions{$b}{INFO}{E}}  keys %$predictions){
	print OUTPUT "PREDICT $$predictions{$fentry}{INFO}{NPREDICT}\n";
	print OUTPUT "ID      $$predictions{$fentry}{INFO}{GFID}\n";
	print OUTPUT "CL      $$predictions{$fentry}{INFO}{CL}\n"; 
	print OUTPUT "CF      $$predictions{$fentry}{INFO}{CF}\n";
	print OUTPUT "SF      $$predictions{$fentry}{INFO}{SF}\n";
	print OUTPUT "FA      $$predictions{$fentry}{INFO}{FA}\n";
	print OUTPUT "DM      $$predictions{$fentry}{INFO}{DM}\n";
	print OUTPUT "SP      $$predictions{$fentry}{INFO}{SP}\n";
	print OUTPUT "SQ      $$predictions{$fentry}{INFO}{SQ}\n";
	print OUTPUT " E      $$predictions{$fentry}{INFO}{E}\n";
	print OUTPUT "WT      $$predictions{$fentry}{INFO}{ANNOTATIONTYPE}\n";
	print OUTPUT "BG      $$predictions{$fentry}{INFO}{BG}\n";
	print OUTPUT "ED      $$predictions{$fentry}{INFO}{ED}\n";
	printf OUTPUT "#####   - %4s   %1s %-4s %s ][ %4s %6s ",
	'RESI','ALI','MOLI','MGAP','FREQ','FRATIO';
	printf OUTPUT " || %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
	'FREQ','COV','COORD','ELECT','HB_D','HB_A','PI','VDW','CATAL','PHOSPH';
	foreach my $resid (sort {$a<=>$b} keys %{$$predictions{$fentry}{SITE}}){	
	    printf OUTPUT "RESID   - %4s %3s %-4s %s    %4d %6.3f ",
	    $resid,$$predictions{$fentry}{SITE}{$resid}{'CONSERVATION'},
	    $$predictions{$fentry}{SITE}{$resid}{'MODULE_SITE'}[0],
	    $$predictions{$fentry}{SITE}{$resid}{'MODULE_SITE'}[1],
	    $$predictions{$fentry}{SITE}{$resid}{'FREQ'},
	    $$predictions{$fentry}{SITE}{$resid}{'FREQ-PERCENT'};
	    printf OUTPUT " || %6d %6d %6d %6d %6d %6d %6d %6d %6d %6d\n",
	    $$predictions{$fentry}{SITE}{$resid}{'FREQ'},$$predictions{$fentry}{SITE}{$resid}{'COV'},
	    $$predictions{$fentry}{SITE}{$resid}{'COORD'},$$predictions{$fentry}{SITE}{$resid}{'ELECT'},
	    $$predictions{$fentry}{SITE}{$resid}{'HB_D'},$$predictions{$fentry}{SITE}{$resid}{'HB_A'},
	    $$predictions{$fentry}{SITE}{$resid}{'PI'},$$predictions{$fentry}{SITE}{$resid}{'VDW'},
	    $$predictions{$fentry}{SITE}{$resid}{'CATALYTIC'},$$predictions{$fentry}{SITE}{$resid}{'PHOSPHORYLATION'};
	}
	print OUTPUT "ENDPRET $$predictions{$fentry}{INFO}{NPREDICT}\n";
    }
}

#============================================================================
#
#  PREDICTED SITES FILTERING ACCORDING INPUT.....
#
#============================================================================
sub _prediction_postprocessing{
    my ($fastaname,$predictions,$very_conserv,$media_conserv,$little_conserv)=@_;
    
    my %asites=(); undef %asites;   #PREDICTIONS CONDENSED INTO EACH RESIDUE FROM (MULTI-) PREDICTED FDPD MODULES
    my $nsites=0;
    foreach my $fentry (sort {$$predictions{$a}{INFO}{E} <=> $$predictions{$b}{INFO}{E}}  keys %$predictions){
	my $evalue=$$predictions{$fentry}{INFO}{E};

#	if($evalue=~/e-(\d+)/){ $evalue=$1; $evalue=sprintf"%d",$evalue;}
#	else { $evalue=1}      
#	print ">>>>>>>>>>>>$evalue\n";

	$evalue = -log($evalue) if ($evalue ne '0');
	$evalue = 9999 if ($evalue eq '0');
	$evalue=0 if $evalue <= 0;
#	$evalue=1;

	my $nseq=$$predictions{$fentry}{INFO}{SQ};
	foreach my $resid (sort {$a<=>$b} keys %{$$predictions{$fentry}{SITE}}){
	    my $conserv=$$predictions{$fentry}{SITE}{$resid}{'CONSERVATION'};
	
	    if($conserv=~/[A-Z]/){ $conserv=$very_conserv;}
	    elsif($conserv=~/[a-z]/){ $conserv=$media_conserv;}
	    else{
		$conserv=$little_conserv;
	    }
	    #my $fscore0=$nseq*$conserv*$evalue;	
	    my $fscore0=$nseq*$conserv*$evalue;      #THERE SHOULD BE "NO" $nseq, 
	                                       #SINCE EVERY VALUE HAD BE ACCUMULATED 
                                               #AND RECORDED IN .mod FILE.
#	    print ">>>>$$predictions{$fentry}{SITE}{$resid}{'CONSERVATION'} :: $conserv || $nseq ||$$predictions{$fentry}{INFO}{E} >>  $evalue || $fscore0\n"; 		
#	my $score=$$predictions{$fentry}{SITE}{$resid}{'FREQ'};
	    if(! exists $asites{$resid}) {
		$asites{$resid}{FRE}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'FREQ'};
		$asites{$resid}{FPT}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'FREQ-PERCENT'};
		$asites{$resid}{VDW}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'VDW'};
		$asites{$resid}{PAI}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'PI'};
		$asites{$resid}{HBA}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_A'};
		$asites{$resid}{HBD}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_D'};
		$asites{$resid}{HBN}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_A'}+
		    $fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_D'};
		$asites{$resid}{ELE}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'ELECT'};
		$asites{$resid}{COO}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'COORD'};
		$asites{$resid}{COV}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'COV'};
		$asites{$resid}{CAT}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'CATALYTIC'};
		$asites{$resid}{PHO}=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'PHOSPHORYLATION'};}	
	    else {
		$asites{$resid}{FRE}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'FREQ'};
		$asites{$resid}{FPT}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'FREQ-PERCENT'};
		$asites{$resid}{VDW}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'VDW'};
		$asites{$resid}{PAI}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'PI'};
		$asites{$resid}{HBA}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_A'};
		$asites{$resid}{HBD}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_D'};
		$asites{$resid}{HBN}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_A'}+
		    $fscore0*$$predictions{$fentry}{SITE}{$resid}{'HB_D'};
		$asites{$resid}{ELE}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'ELECT'};
		$asites{$resid}{COO}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'COORD'};
		$asites{$resid}{COV}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'COV'};
		$asites{$resid}{CAT}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'CATALYTIC'};
		$asites{$resid}{PHO}+=$fscore0*$$predictions{$fentry}{SITE}{$resid}{'PHOSPHORYLATION'};
	    }
	    $nsites++;
	}
    }
    
=head
    foreach my $resid (sort {$a<=>$b} keys %asites){
	printf "_FIDPD_SELECT> %6d ",$resid;	
	printf "%12d  %12.3f", $asites{$resid}{FRE}, $asites{$resid}{FPT};
	printf " ||  %10d %10d %10d %10d %10d %10d %10d %10d %10d %10d %10d\n", 
	$asites{$resid}{FRE},$asites{$resid}{COV},$asites{$resid}{COO},$asites{$resid}{ELE},
	$asites{$resid}{HBD},$asites{$resid}{HBA},$asites{$resid}{HBN},$asites{$resid}{PAI},
	$asites{$resid}{VDW},$asites{$resid}{CAT},$asites{$resid}{PHO};	
    }
=cut

    return $nsites,\%asites;
}







#======================================================
#
#  READ OLD .MOD FILE 
#
#


sub _get_fidpd_module_sites2{
    my ($fastaname,$fidpd_annotation_file,$predictions)=@_;

    my $iffind=0;
    my $gf_id='';
    my %gf_info=();

    my %fidpd_asites=(); undef %fidpd_asites;
    open(DATABASE,"< $fidpd_annotation_file")
	or die "_FIDPD_$fastaname> CANNOT OPEN-READ FDPD-ANNOTATED MODULE FILE $fidpd_annotationf\n";

  LAB1:while(<DATABASE>){
      my $line=$_;  chomp($line);
      foreach my $subid (sort keys %$predictions){
	  my ($subdomain,$did)=split/_/,$subid;

	  if($line=~/#=GF ID $subdomain/){
	      $iffind=1; $gf_id=$subdomain; 
	      next LAB1;
	  }

      }
      next if($iffind==0);

      if($line=~/^END OF SUBDOMAIN/){
	  $iffind=0;
	  next;
      }
      if($line=~/#=GF SQ (\S+)/){$gf_info{$gf_id}{SQ}=$1;}
      if($line=~/#=GF CL (.*)/){ $gf_info{$gf_id}{CL}=$1;}
      if($line=~/#=GF CF (.*)/){ $gf_info{$gf_id}{CF}=$1;}
      if($line=~/#=GF SF (.*)/){ $gf_info{$gf_id}{SF}=$1;}
      if($line=~/#=GF FA (.*)/){ $gf_info{$gf_id}{FA}=$1;} 
      if($line=~/#=GF DM (.*)/){ $gf_info{$gf_id}{DM}=$1;}   
      if($line=~/#=GF SP (.*)/){ $gf_info{$gf_id}{SP}=$1." (or so)";}      
          
     
      #print "$line\n";
      #SITE : 1 : 0 : 3 >>> 0 : 0 : 0 : 3 : 0 : 0 : 21 : 0 : 0
      #SITE : 2 : 0 : 2 >>> 0 : 0 : 0 : 0 : 0 : 0 : 7 : 0 : 0
#     if($line=~/_(\S+)\s:\s(\d+)\s:\s(\d+)\s:\s(\d+)/){
      if($line=~/(\w+)\s:\s(\d+)\s:\s(\d+)\s:\s(\d+) >>>\s(.*)/){
	  my $type=$1; my $site=$2; my $gap=$3; my $freq=$4; 
	  my $pattern=$5; 
	  my ($cov,$coord,$elect,$hbd,$hba,$pi,$vdw,$catalytic,$phosphorylation)=split/\s\:\s/,$pattern;

#	  print "_FIDPD_$fastaname>"." SITE MATCH :: $gf_id> $type - $site - $gap - $freq :: $pattern\n";
#	  print ">>>>$cov,$coord,$elect,$hbd,$hba,$pi,$vdw,$catalytic,$phosphorylation<<<\n"; exit;

	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'FREQ'}=$freq;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'COV'}=$cov;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'COORD'}=$coord;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'ELECT'}=$elect;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'HB_D'}=$hbd;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'HB_A'}=$hba;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'PI'}=$pi;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'VDW'}=$vdw;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'CATALYTIC'}=$catalytic;
	  $fidpd_asites{$gf_id}{$site}{$gap}{$type}{'PHOSPHORYLATION'}=$phosphorylation;
      }
  }
=head   #####check module_active################## 
    my $type=$site_annotation_type;
    foreach my $gf_id(sort keys %fidpd_asites){
	foreach my $site(sort keys %{$fidpd_asites{$gf_id}}){
	    foreach my $gap(sort keys %{$fidpd_asites{$gf_id}{$site}}){
		print "_FIDPD_$fastaname> MATCH-MODULE-SITES ==>  $gf_id : $site : $gap >>> ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'FREQ'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'COV'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'COORD'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'ELECT'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'HB_D'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'HB_A'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'PI'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'VDW'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'CATALYTIC'},"  ",
		$fidpd_asites{$gf_id}{$site}{$gap}{$type}{'PHOSPHORYLATION'},"\n";
	    }
	}
    }
    exit;
=cut

    return \%fidpd_asites,\%gf_info;
}
