#!/usr/bin/perl -w
use strict;
use Data::Dumper;
use Getopt::Long;


if (-e '/tmp/ramdisk1') {
#	print "exists\n";
} else {
	system('hdid -nomount ram://52528');
	system('newfs_hfs /dev/disk1');
	system('mkdir /tmp/ramdisk1');
	system('mount -t hfs /dev/disk1 /tmp/ramdisk1');
#	print "not exists\n";
}

my ($if1, $if2, $s1, $s2, $e1, $e2, $of) = ('', '', 1, 1, 10000, 10000, 'gs_out');

my $tempFolder = randFolder(); 
	


my $option = GetOptions(
		"i1=s" => \$if1,
		"i2=s" => \$if2,
		"s1=i" => \$s1,
		"s2=i" => \$s2,
		"e1=i" => \$e1,
		"e2=i" => \$e2,
#		"t1=s" => \$t1,
#		"t2=s" => \$t2,
		"o=s" => \$of,
	);
if (! $if1 || ! $if2) {
	print "Usage: gs.pl [option]\n";
	print "\nOptions\n\n";
	print "    -i1 input filename of the sequences 1 in either FASTA or ClustalX alignment format, required\n";
	print "    -i2 input filename of the sequences 2 in either FASTA or ClustalX alignment format, required\n";
	print "    -s1 start position of sequences alignment file 1 for analysis, default 1\n";
	print "    -s2 start position of sequences alignment file 2 for analysis, default 1\n";
	print "    -e1 end position of sequences alignment file 1 , default last position\n";
	print "    -e2 end position of sequences alignment file 2 , default last position\n";
	print "    -o output file name, default gs_out\n";
	system("rm -rf $tempFolder");
	exit;
}

$s1 = ($s1 < 1) ? 1 : $s1;
$s2 = ($s2 < 1) ? 1 : $s2;

if ($e1 < $s1 || $e2 < $s2) {
	print "end position small then start position\n";
	system("rm -rf $tempFolder");
	exit;
}


my ($seqA1, $seqH1) = readFaa($if1);
my ($seqA2, $seqH2) = readFaa($if2);
open (OR, ">$of");

foreach my $name1 (@$seqA1) {
	my $seq1 = substr($seqH1->{$name1}, $s1 - 1, $e1 - $s1);
	$seq1 =~ s/-//g;
	if (!$seq1 || (length $seq1) < 30) {next;}
	my $outName1 = "$tempFolder/seq". (int rand(100000)) . ".faaa";
	my $outName1f = "$outName1-f";
	open (OF1, ">$outName1");
	print OF1 ">$name1\n$seq1\n";
	close OF1;

	my $sfl1 = Shuffle($name1, $seq1, 200);
	open (OF1f, ">$outName1f");
	foreach my $key (keys %$sfl1) {
		print OF1f ">$key\n$sfl1->{$key}\n";
	}
	close OF1f;

	foreach my $name2 (@$seqA2) {
		my $outName2 = "$tempFolder/sfl". (int rand(100000)) . ".faaa";
		my $seq2 = substr($seqH2->{$name2}, $s2 - 1, $e2 - $s2);
		$seq2 =~ s/-//g;
		if (!$seq2 || (length $seq2) < 30) {next;}
		open (OF2, ">$outName2");
		print OF2 ">$name2\n$seq2\n";
		close OF2;
		my $out = [];
		my $outs = [];
		my $cmd = "./ssearch35_t -p -q -w 80 -m 0 -k 500 -f -8 -g -2 -s BL62 -B $outName2 $outName1";
		@$outs = `$cmd`;
		my ($bits, $position) = sout($outs);
		my $bit = (@$bits) ? shift @$bits : 0;

		if ($bit > 60) {
			@$outs = `./ssearch35_t -p -q -w 80 -m 0 -k 500 -f -8 -g -2 -E 1000 -s BL62 -B $outName2 $outName1f`;
			my ($bits, $position2) = sout($outs);
			my $sca = scalar @$bits;
			my ($ave, $std) = Std2($bits);
			my $score = sprintf("%.2f",($bit - $ave) / $std);

			my $po1 = realPosition($seqH1->{$name1}, $s1, $e1, $seq1, $position->[2], $position->[3]);
			my $po2 = realPosition($seqH2->{$name2}, $s2, $e2, $seq2, $position->[0], $position->[1]);
			print "$name1\t$name2\t$bit\t$score\n";
		#	print OR "$name1\t($position->[2]-$position->[3])\t$name2\t($position->[0]-$position->[1])\t$bit\t$score\n";
			print OR "$name1\t($po1)\t$name2\t($po2)\t$bit\t$score\n";
		} else {
			print "$name1\t$name2\t$bit\t0.0\n";
		#	print OR "$name1\t($position->[2]-$position->[3])\t$name2\t($position->[0]-$position->[1])\t$bit\t0.0\n";
		}
			

		close OF2;
		unlink ($outName2);
	}
	unlink ($outName1);
	unlink ("$outName1f");
}
print "result in $of file\n";
system("rm -rf $tempFolder");


sub realPosition {
	my ($seqOri, $oriS, $oriE, $seqSelect, $aliS, $aliE) = @_;
	$oriE = ($oriE > length $seqOri) ? length $seqOri : $oriE;
	my ($finalS, $finalE) = ($oriS, $oriE);
	my $count = $aliS;
	for (my $i = $oriS - 1; $i < $oriE; $i++) {
		if (substr($seqOri, $i, 1) ne '-' ) {
			$count--;
		}
		if ($count == 0) {
			$finalS = $i + 1;
			last;
		}
	}

	$count = (length $seqSelect) - $aliE + 1;
	for (my $i = $oriE - 1; $i > $oriS; $i--) {
		if (substr($seqOri, $i, 1) ne '-' ) {
			$count--;
		}
		if ($count == 0) {
			$finalE = $i + 1;
			last;
		}
	}
	my $label = substr($seqSelect, $aliS - 1, 4) ."-". substr($seqSelect, $aliE - 4 , 4);
	return "$finalS $label $finalE";	

}


sub sout{
	my ($arr) = @_;
	my $score = [];
	my @position;
	my $currentQuery = '';
	foreach (@$arr) {
#		if ($_ =~ />>(\S+)\s/) {
#			$currentQuery = $1;
#			next;
#		}
		if ($_ =~ /Smith-Waterman score: (\d+);\s+(.+)% identity \((.+)% similar\) in (\d+) aa overlap \((\d+)-(\d+):(\d+)-(\d+)\)/) {
			my $smith = $1;
			my $identity = $2;
			my $similarity = $3;
			my $overlap = $4;
			my $q_s = $5;
			my $q_e = $6;
			my $d_s = $7;
			my $d_e = $8;

			push @$score, $smith;
			unless (@position) {				
				@position = ($q_s, $q_e, $d_s, $d_e);
			}
			next;
		}
	}
#	print Dumper($scoreHash);
#	print @$arr;
	return ($score, \@position);

}

sub Std {
	my ($out, $num) = @_;
	my $numResult = 0;
	my $total = 0;
	my @score;
	my $currName = '';
#	print Dumper($out);
	foreach (@$out) {
		chomp;
		my @aa = split (/\t/, $_);
		if ($aa[1] ne $currName) {
			push (@score, $aa[11]);
			$currName = $aa[1];
			$total += $aa[11];
			$numResult ++;
		}
	}
	my $average = $total / $numResult; 
	my $sqtotal = 0;
	foreach (@score) {
		$sqtotal += ($average - $_) ** 2;
	}
	my $std = ($sqtotal / $numResult) ** 0.5;	
	return ($average, $std);
#	print "$numResult\t$average\t$std\n";

}


sub Std2 {
        my ($bits) = @_;
        my $total = 0;
	my $numResult = scalar @$bits;
	foreach (@$bits) {
		$total += $_;
        }
        my $average = $total / $numResult;
        my $sqtotal = 0;
        foreach (@$bits) {
                $sqtotal += ($average - $_) ** 2;
        }
        my $std = ($sqtotal / $numResult) ** 0.5;
        return ($average, $std);
#       print "$numResult\t$average\t$std\n";

}




sub Seq {
	my ($infile) = @_;
	open (IF, $infile);
	my $title = '';
	my $seqs = {};
	while (<IF>) {
		chomp;
		if ($_ =~ /\>(\S+)/) {
			$title = $1;
			$seqs->{$title} = '';
			next;
		}
		$seqs->{$title} .= $_;
	}
	return $seqs;

}

sub readAln {
	my ($temp) = @_;
	my $array = [];
	my $hash = {};
	my $title = '';
	foreach (@$temp) {
		chomp;
		if ($_ =~ /^CLUSTAL.+alignment/ || $_ =~ /^\s/ || ! $_) {
			next;
		}
		my @arr = split (/\s+/,$_);
		if (! exists $hash->{$arr[0]}) {
			push (@$array, $arr[0]);
		}
		$hash->{$arr[0]} .= $arr[1];
	}
#	print Dumper ($hash);
	return ($array, $hash);
}

sub readFaa {
        my ($inFile) = @_;
        open (IS, $inFile) or die "No input sequence $inFile!!\n";
        my $array = [];
        my $hash = {};
	my $temp = [];
        my $title = '';
	while (<IS>) {
		chomp;
		push (@$temp, $_);
	}

	if ($temp->[0] =~ /^CLUSTAL.+alignment/) {
		return readAln($temp);
	} else {
		foreach (@$temp) {
	                if ($_ =~ />(\S+)/) {
				$title = $1;
				if (! exists $hash->{$title}) {
					push (@$array, $title);
                        	} else {
                                	print "sequence $title duplicated, please remove redundent sequence!\n";
                        	}
                        	next;
			}
                	$hash->{$title} .= $_;
        	}
        	return ($array, $hash);
	}
}

sub Shuffle {
	my ($name, $seq, $num) = @_;
	my $hash = {}; 
	for (my $k = 0; $k < $num; $k++) {
		my $size = length $seq;
		my @array = 0...($size - 1);
		my $newSeq = '';
		for (my $i = 0; $i < $size; $i++) {
			my $j = int rand ($i + 1); 
			@array[$i, $j] = @array[$j, $i];
		}

		for (my $i = 0; $i < $size; $i++) {
			$newSeq .= substr($seq, $array[$i], 1);
		}
		$hash->{"$name-$k"} = $newSeq;
	}
	return $hash;
}

sub randFolder {

	my $name = "/tmp/ramdisk1/sss". (int rand(1000000));
	if (-e $name) {
		$name = readFolder();
	} else {
		system("mkdir $name");
	}

	return $name;

}
