#!/usr/bin/php
_________________________________________________________
|  ___  ___  ____  __________  _________  __     ___     |
|  / _ \/ _ \/ __ \/_  __/ __ \/ ___/ __ \/ /    <  /    |
| / ___/ , _/ /_/ / / / / /_/ / /__/ /_/ / /__   / /     |
|/_/  /_/|_|\____/ /_/  \____/\___/\____/____/  /_/ V(1.3)
|========================================================|
| This tool will automate the first protocol procedure.  |
| Input an Accession, GI, or Full Sequence to begin. The |
| Sequence MUST be pasted onto ONE line to work properly.|
| This tool will produce your GI list, TinyXML files &   |
| will run MakeTable5.                                   |
|========================================================|
|    Contact: Vamsee Reddy (Symphony.Dev@Gmail.com)      |
----------------------------------------------------------

<?php 
/**
 * Tell whether the command line asked for debug mode.
 *
 * Debug mode is requested only when the first script argument is exactly
 * the string "debug".
 *
 * @param mixed $args Argument vector in $argv layout (index 0 = script name).
 * @return bool TRUE when $args[1] is the literal string "debug".
 */
function ncbi_debug_requested($args)
{
	return is_array($args) && isset($args[1]) && $args[1]==="debug";
}

// The previous test `!@$argv[1]=="debug"` negated the argument before comparing
// it (`!` binds tighter than `==`), so any non-empty argument switched debug
// mode on. Compare the argument itself instead.
if(!ncbi_debug_requested(isset($argv)?$argv:array()))
{
	// Normal runs hide PHP notices/warnings so they do not clutter the prompts.
	error_reporting(0);
} else
{
	echo "\n**DEBUG MODE ON**\n";
}
/**
 * Interactive driver for the first protocol procedure.
 *
 * Constructing the object runs the whole workflow: it prompts for a query,
 * submits a PSI-BLAST request to NCBI, collects the checked GI numbers from
 * the result page, optionally runs one more iteration, uploads the GI list to
 * Batch Entrez, downloads the sequences as TinyXML and finally runs
 * make_table5 on the downloaded file.
 */
class NCBI
{
	/** Accession, GI, or full protein sequence typed by the user. */
	public $seq;
	/** Name of the session directory the output files are written to. */
	public $dir;
	/** E-value threshold sent as I_THRESH. */
	public $thresh_hold;
	/** Cookie jar file shared by every request. */
	public $cookies='ncbi.txt';
	/** URL polled for the results of the initial search. */
	public $blast_url;
	/** URL polled for the results of the iteration. */
	public $iterate_url;
	/** Results URL of the iteration, opened for viewing on request. */
	public $iterate_url_lite;
	/** Raw HTML of the most recent result page. */
	public $blast_results;
	/** Job title (md5 of the query) sent with the requests. */
	public $job;
	/** List of GI numbers extracted from the result page. */
	public $GIs;
	/** Hidden form fields (name => value) scraped from the result page. */
	public $session;
	/** Base directory under which the session directory is created. */
	public $Desktop='/Users/saierlab/Desktop/';
	/** Record count reported by Batch Entrez, or NULL when it was not found. */
	public $true_count;
	/** Set to FALSE once a request has failed permanently. */
	public $status=TRUE;
	
	/**
	 * Run the complete interactive workflow.
	 */
	public function __construct()
	{
		$this->seq=$this->getInput("% Enter the Accession, GI, or Full protein sequence to investigate");
		$this->dir=$this->getInput("% Directory Name (session files will be stored here)");
		// The typed value used to be thrown away; keep it when it is usable.
		$this->thresh_hold=$this->resolve_threshold($this->getInput("% E Value Threshold (Default: 0.005)"));
		if(!$this->dir)
		{
			die("Please pick a directory");
		}
		$this->enter_session_directory();
		if($this->NCBI_BLAST()===FALSE)
		{
			// Without a results URL every later step would work on empty data.
			die("\n[*]The BLAST request could not be submitted.\n");
		}
		$this->fetch_GI_numbers();
		$this->write_GI_numbers();
		if($this->getInput("\n\nView BLAST Results? (Leave blank to skip)"))
		{
			system("open ".escapeshellarg($this->blast_url));
		}
		$this->load_session();
		$i=$this->getInput("% Max iteration (leave blank to ignore)");
		if($i && $this->iterate($i))
		{
			$this->fetch_GI_numbers();
			$this->write_GI_numbers();
			if($this->getInput("\n\nView New BLAST Results? (Leave blank to skip)"))
			{
				// Strip the per-GI selections so the URL stays short enough to open.
				$replace='/(&good_GI=[0-9]{1,15})|(&checked_GI=[0-9]{1,15})/';
				$this->iterate_url_lite=preg_replace($replace,'',$this->iterate_url_lite);
				system("open ".escapeshellarg($this->iterate_url_lite));
			}
		}
		$this->upload_entrez();
		$this->save_tinyxml();
		$this->maketable5();
		system("open ./");
	}
	
	/**
	 * Choose the E-value threshold from what the user typed.
	 *
	 * @param string $input Raw answer to the threshold prompt.
	 * @return string The input when it is a positive number, otherwise "0.005".
	 */
	public function resolve_threshold($input)
	{
		if(is_numeric($input) && (float)$input>0)
		{
			return $input;
		}
		return "0.005";
	}
	
	/**
	 * Create the session directory if needed and make it the working directory.
	 *
	 * Stops the script when the session directory cannot be created or entered,
	 * because every output file is written relative to it.
	 */
	private function enter_session_directory()
	{
		if(!chdir($this->Desktop))
		{
			echo "\n[*]Could not enter {$this->Desktop} - using the current directory instead.\n";
		}
		if(!file_exists($this->dir) && !mkdir($this->dir))
		{
			die("Could not create directory {$this->dir}");
		}
		if(!chdir($this->dir))
		{
			die("Could not enter directory {$this->dir}");
		}
	}
	
	/**
	 * Prompt on STDOUT and read one trimmed line from STDIN.
	 *
	 * @param string $msg Prompt text; ": " is appended.
	 * @return string The trimmed answer ("" when nothing was entered).
	 */
	public function getInput($msg)
	{
		fwrite(STDOUT, "$msg: ");
		return trim((string)fgets(STDIN));
	}
	
	/**
	 * Build a cURL handle with the referer, user agent and cookie settings
	 * shared by the plain GET and POST helpers.
	 *
	 * @param string $url Address to request.
	 * @return mixed The configured cURL handle.
	 */
	private function open_handle($url)
	{
		$ch = curl_init();
		curl_setopt($ch, CURLOPT_REFERER, 'http://blast.ncbi.nlm.nih.gov/Blast.cgi');
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6');
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);
		curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
		return $ch;
	}
	
	/**
	 * Perform a single GET request.
	 *
	 * @param string $url Address to fetch.
	 * @return string|false Response body, or FALSE when cURL fails.
	 */
	private function bbrowse($url)
	{
		$ch = $this->open_handle($url);
		$result = curl_exec($ch);
		curl_close($ch);
		return $result;
	}
	
	/**
	 * GET a URL, retrying up to five times with a five second pause.
	 *
	 * An empty response counts as a failure. After the fifth failure
	 * $this->status is set to FALSE.
	 *
	 * @param string $url Address to fetch.
	 * @return string|false Response body, or FALSE when $url is empty or every attempt failed.
	 */
	public function browse($url)
	{
		if(!$url){ return false; }
		$attempts=0;
		do{
			$c=$this->bbrowse($url);
			if($c){
				return $c;
			}
			$attempts++;
			echo "\n[*]Socket Failed - Attempt : $attempts";
			sleep(5);
			echo "\n[*]Trying Again...";
		}
		while($attempts<5);
		$this->status=FALSE;
		return FALSE;
	}

	/**
	 * Submit the PSI-BLAST search form and remember the results URL.
	 *
	 * Sets $this->job to the md5 of the query and $this->blast_url to the URL
	 * extracted from the response (or "" when none was found).
	 *
	 * @return string|false The results URL, or FALSE when the response held none.
	 */
	private function NCBI_PRIMARY_BLAST()
	{
		$ch = curl_init();
		$job=md5($this->seq);
		$this->job=$job;
		$ref='http://blast.ncbi.nlm.nih.gov/Blast.cgi';
		// Encode the query so characters such as '&', '>' or spaces in a pasted
		// sequence cannot corrupt the form body.
		$query=urlencode($this->seq);
		$data="QUERY={$query}&db=protein&QUERY_FROM=&QUERY_TO=&QUERYFILE=&GENETIC_CODE=1&JOB_TITLE=$job&SUBJECTS=&stype=&SUBJECTS_FROM=&SUBJECTS_TO=&SUBJECTFILE=&DATABASE=nr&EQ_MENU=&NUM_ORG=1&EQ_TEXT=&BLAST_PROGRAMS=psiBlast&PHI_PATTERN=&MAX_NUM_SEQ=500&SHORT_QUERY_ADJUST=on&EXPECT=10&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&MATCH_SCORES=1,-2&GAPCOSTS=11 1&COMPOSITION_BASED_STATISTICS=2&REPEATS=repeat_9606&TEMPLATE_LENGTH=0&TEMPLATE_TYPE=0&PSSM=&I_THRESH={$this->thresh_hold}&PSI_PSEUDOCOUNT=0&SHOW_OVERVIEW=true&SHOW_LINKOUT=true&GET_SEQUENCE=true&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&DESCRIPTIONS=500&ALIGNMENTS=250&NEW_VIEW=true&OLD_BLAST=false&NCBI_GI=true&SHOW_CDS_FEATURE=false&NUM_OVERVIEW=100&FORMAT_EQ_TEXT=&FORMAT_ORGANISM=&EXPECT_LOW=&EXPECT_HIGH=&QUERY_INDEX=&CLIENT=web&SERVICE=plain&CMD=request&PAGE=Proteins&PROGRAM=blastp&MEGABLAST=&RUN_PSIBLAST=on&TWO_HITS=&CDD_SEARCH=on&ID_FOR_PSSM=&SAVED_PSSM=&SELECTED_PROG_TYPE=psiBlast&SAVED_SEARCH=true&BLAST_SPEC=&QUERY_BELIEVE_DEFLINE=&DB_DIR_PREFIX=&USER_DATABASE=&USER_WORD_SIZE=&USER_MATCH_SCORES=&USER_FORMAT_DEFAULTS=&NO_COMMON=&NUM_DIFFS=0&NUM_OPTS_DIFFS=0&UNIQ_DEFAULTS_NAME=A_SearchDefaults_1OYrOj_BT4_DLAtezdw3Hx_23ttSh_S6jfo&PAGE_TYPE=BlastSearch&USER_DEFAULT_PROG_TYPE=psiBlast&USER_DEFAULT_MATRIX=3";
		curl_setopt($ch, CURLOPT_URL, "http://blast.ncbi.nlm.nih.gov/Blast.cgi");
		curl_setopt($ch, CURLOPT_USERAGENT, 'Opera/9.23 (Windows NT 5.1; U; en)');
		curl_setopt($ch, CURLOPT_REFERER, $ref);
		curl_setopt($ch, CURLOPT_POST, 1);
		curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);
		curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
		$store = curl_exec($ch);
		curl_close($ch);
		$found=$this->extract_URL($store);
		if($found==='')
		{
			$this->blast_url='';
			return FALSE;
		}
		return $this->blast_url=$found."&NCBI_GI=on&CMD=Get";
	}
	
	/**
	 * POST a pre-encoded form body.
	 *
	 * @param string $url  Address to post to.
	 * @param string $data URL-encoded request body.
	 * @return string|false Response body, or FALSE when cURL fails.
	 */
	public function post_data($url,$data)
	{
		$ch = $this->open_handle($url);
		curl_setopt($ch, CURLOPT_POST, 1);
		curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
		$result = curl_exec($ch);
		curl_close($ch);
		return $result;
	}
	
	/**
	 * Pull the URL out of the page's `var myncbi_cu = unescape('...');` line.
	 *
	 * @param string $c HTML of a BLAST page.
	 * @return string The URL-decoded address, or "" when the line is absent.
	 */
	public function extract_URL($c)
	{
		$pattern='/var myncbi_cu = unescape\(\'(.{1,})\'\);/iUs';
		if(!preg_match($pattern,(string)$c,$out))
		{
			return '';
		}
		return urldecode(trim($out[1]));
	}
	
	/**
	 * Poll the results URL of the initial search once.
	 *
	 * @return bool FALSE while the page still shows the auto-refresh notice;
	 *              TRUE once the response has been stored in $this->blast_results.
	 */
	private function NCBI_SECONDARY_BLAST()
	{
		$res=$this->browse($this->blast_url);
		$loading='/This page will be automatically updated in/i';
		if(preg_match($loading,(string)$res))
		{
			return FALSE;
		}
		$this->blast_results=$res;
		return TRUE;
	}
	
	/**
	 * Submit the search and wait until the result page is available.
	 *
	 * @return bool TRUE once results were stored, FALSE when the submission
	 *              did not yield a results URL.
	 */
	public function NCBI_BLAST()
	{
		echo "\n >> Blasting NCBI Database...";
		// Visit the search form first so the cookie jar holds a session.
		$this->browse('http://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&BLAST_PROGRAMS=blastp&PAGE_TYPE=BlastSearch&SHOW_DEFAULTS=on&LINK_LOC=blasthome');
		if(!$this->NCBI_PRIMARY_BLAST())
		{
			echo "\n[*]NCBI did not return a results URL.";
			$this->status=FALSE;
			return FALSE;
		}
		echo "Done.";
		echo "\n >> Fetching Results.";
		echo "\n >> Waiting for NCBI...";
		do
		{
			$res=$this->NCBI_SECONDARY_BLAST();
			echo ".";
			if(!$res)
			{
				// Only pause when another poll is actually needed.
				sleep(5);
			}
		} 
		while($res==FALSE);
		echo "Done.\n >> Results Recieved.";
		return TRUE;
	}
	
	/**
	 * Collect the GI numbers of all pre-checked hits on the result page.
	 *
	 * Stores the de-duplicated list in $this->GIs and prints how many were found.
	 *
	 * @return array List of unique GI numbers (strings), in page order.
	 */
	public function fetch_GI_numbers()
	{
		$pattern='/<input type="checkbox" name="checked_GI" value="([0-9]{1,15})" checked="checked" \/>/U';
		preg_match_all($pattern,(string)$this->blast_results,$out,PREG_PATTERN_ORDER);
		// The reported count is the unique count, so store the unique list too.
		$GIs=array_values(array_unique($out[1]));
		$count=count($GIs);
		echo "\n ++ Found $count Unique GI Numbers.";
		return $this->GIs=$GIs;
	}
	
	/**
	 * Write the GI list, one number per line, to GI_NUMBERS.TXT in the
	 * current directory.
	 */
	public function write_GI_numbers()
	{
		$content=trim(implode("\n",(array)$this->GIs));
		if(file_put_contents("GI_NUMBERS.TXT",$content)===FALSE)
		{
			echo "\n[*]Could not write GI_NUMBERS.TXT in {$this->dir}\n";
			return;
		}
		echo "\n ++ GI List Written to {$this->dir}\n";
	}
	
	/**
	 * Scrape the hidden form fields from the result page.
	 *
	 * Only fields whose name and value are 1-15 characters long are picked up,
	 * and fields with an empty value are dropped.
	 *
	 * @return array Map of field name to value (also stored in $this->session).
	 */
	private function load_session()
	{
		$session=array();
		$pattern='/<INPUT .{1,200}TYPE="hidden".{1,200}(\/)?>/siU';
		preg_match_all($pattern,(string)$this->blast_results,$out,PREG_PATTERN_ORDER);
		foreach(array_unique($out[0]) as $value)
		{
			if(!preg_match('/name( )?=( )?"(.{1,15})"/iU',$value,$name))
			{
				// A field without a usable name cannot be addressed later.
				continue;
			}
			$val=preg_match('/value( )?=( )?"(.{1,15})"/iU',$value,$found)?$found[3]:'';
			$session[$name[3]]=$val;
		}
		$session=array_filter($session);
		return $this->session=$session;
	}
	
	/**
	 * Request one further PSI-BLAST iteration using every collected GI.
	 *
	 * @param int|string|null $i Value sent as DESCRIPTIONS, HITLIST_SIZE and MAX_NUM_SEQ.
	 * @return bool TRUE once the new result page is stored in $this->blast_results;
	 *              FALSE when $i is not a positive number or NCBI returned no
	 *              results URL (the previous results are then left untouched).
	 */
	public function iterate($i=NULL)
	{
		$i=(int)$i;
		if($i<1)
		{
			return FALSE;
		} 
		$string=array();
		foreach((array)$this->GIs as $GI)
		{
			$string[]="checked_GI=$GI&good_GI=$GI";
		}
		$string=implode("&",$string);
		$rid=isset($this->session['RID'])?$this->session['RID']:'';
		$oldrid=isset($this->session['PREV_RID'])?$this->session['PREV_RID']:'';
		$repeats=isset($this->session['REPEATS'])?$this->session['REPEATS']:'';
		echo "\n >> Iterating Results with MAX: $i";
		$url="http://blast.ncbi.nlm.nih.gov/Blast.cgi";
		$data="ALIGNMENTS=250&ALIGNMENT_VIEW=Pairwise&BLAST_PROGRAMS=psiBlast&CDD_SEARCH_STATE=0&CLIENT=web&CMD=request&COMPOSITION_BASED_STATISTICS=2&DATABASE=nr&DESCRIPTIONS=$i&ENTREZ_QUERY=&EQ_OP=AND&EQ_OP=AND&EXPECT=10&FILTER=F&FILTER=F&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&FULL_DBNAME=nr&FULL_DBNAME=nr&GAPCOSTS=11+1&GET_SEQUENCE=true&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&IMG_ALT=CD+search+result+summary&I_THRESH={$this->thresh_hold}&JOB_TITLE={$this->job}&LAYOUT=OneWindow&LAYOUT=OneWindow&MASK_CHAR=2&MASK_COLOR=1&MATRIX_NAME=BLOSUM62&MAX_NUM_SEQ=$i&NCBI_GI=true&NCBI_GI=on&NEW_VIEW=true&NEXT_I=Go&NEXT_I=Go&NUM_DIFFS=0&NUM_OPTS_DIFFS=0&NUM_ORG=1&NUM_OVERVIEW=100&OLD_BLAST=false&PAGE=Proteins&PAGE_TYPE=BlastSearch&PREV_RID=$oldrid&PROGRAM=blastp&PSI_PSEUDOCOUNT=0&QUERY_INDEX=0&REPEATS=$repeats&RID=$rid&RTOE=13&RTOE=11&RTOE=12&RTOE=14&RUN_PSIBLAST=on&RUN_PSIBLAST=on&RUN_PSIBLAST=on&SAVED_SEARCH=true&SEARCH_DB_STATUS=43&SELECTED_PROG_TYPE=psiBlast&SERVICE=plain&SHORT_QUERY_ADJUST=on&SHOW_CDS_FEATURE=false&SHOW_LINKOUT=true&SHOW_OVERVIEW=true&STEP_NUMBER=2&USER_DEFAULT_MATRIX=3&USER_DEFAULT_PROG_TYPE=psiBlast&USER_FORMAT_DEFAULTS=on&USER_TYPE=2&USER_TYPE=2&WORD_SIZE=3&_PGR=11&queryList=0&queryList=0&";
		$page=$this->post_data($url,$data.$string);
		$found=$this->extract_URL($page);
		if($found==='')
		{
			// Polling without a URL would replace the good results with an
			// unrelated page, so give up here instead.
			echo "\n[*]NCBI did not return a results URL for the iteration.";
			return FALSE;
		}
		$this->iterate_url=$found."&CMD=Get&NCBI_GI=on";
		echo "\n >> Waiting for NCBI...";
		do
		{
			$res=$this->iterate_primary();
			echo ".";
			if(!$res)
			{
				sleep(5);
			}
		} 
		while($res==FALSE);
		$this->iterate_url_lite=$this->extract_URL($this->blast_results)."&CMD=Get&NCBI_GI=on";
		echo "Done.\n >> Results Recieved.";
		return TRUE;
	}
	
	/**
	 * Poll the iteration results once by POSTing the query part of
	 * $this->iterate_url back to Blast.cgi.
	 *
	 * @return bool FALSE while the page still shows the auto-refresh notice;
	 *              TRUE once the response has been stored in $this->blast_results.
	 */
	private function iterate_primary()
	{
		$parts=explode("Blast.cgi?",(string)$this->iterate_url);
		$data=isset($parts[1])?$parts[1]:'';
		$url='http://blast.ncbi.nlm.nih.gov/Blast.cgi';
		$res=$this->post_data($url,$data);
		$loading='/This page will be automatically updated in/i';
		if(preg_match($loading,(string)$res))
		{
			return FALSE;
		}
		$this->blast_results=$res;
		return TRUE;
	}
	
	/**
	 * Upload GI_NUMBERS.TXT to Batch Entrez and record how many entries the
	 * response says were passed to Entrez ($this->true_count).
	 */
	public function upload_entrez()
	{
		echo "\n >> Uploading GI File to Entrez...";
		$uploadfile=getcwd().'/GI_NUMBERS.TXT';
		// The "@path" upload syntax is gone from current PHP; CURLFile replaces it.
		// Keep the old form only for interpreters that predate CURLFile.
		$file_field=class_exists('CURLFile')?new CURLFile($uploadfile):"@$uploadfile";
		$ch = curl_init();
		curl_setopt($ch, CURLOPT_URL, "http://www.ncbi.nlm.nih.gov/portal/utils/batchentrez_p.cgi"); // you can modify this URL based on your environment
		curl_setopt($ch, CURLOPT_POSTFIELDS, array('file' => $file_field,'db'=>'protein','cmd'=>'Retrieve'));
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_ENCODING, "");
		curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);
		curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
		$store = curl_exec($ch);
		curl_close($ch);
		if(preg_match('/Passed to Entrez: ([0-9]{1,5})/',(string)$store,$count))
		{
			$this->true_count=$count[1];
		} else
		{
			$this->true_count=NULL;
			echo "\n[*]Could not read the Entrez record count from the response.\n";
		}
		echo 'Done!';
	}
	
	/**
	 * Download the uploaded sequences as fasta_xml into sequences_fasta.xml
	 * using the curl command line tool and the shared cookie jar.
	 */
	public function save_tinyxml()
	{
		echo "\n >> Downloading TinyXML Sequence...\n\n";
		$url="http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?tool=portal&sendto=on&log$=seqview&db=protein&dopt=fasta_xml&query_key=1&qty={$this->true_count}&filter=all";
		system("curl -b ".escapeshellarg($this->cookies)." ".escapeshellarg($url)." > sequences_fasta.xml");
		echo "\n\n ## Done!\n ++ sequences_fasta.xml has been written to {$this->dir}\n";
	}
	
	/**
	 * Save the text of the hit description table of the current result page.
	 *
	 * @param string $file Destination path; the legacy name 'wtf' is used when empty.
	 * @return bool TRUE when the table was found and written, FALSE otherwise.
	 */
	public function download_blast($file)
	{
		// The destination argument used to be ignored in favour of a fixed name.
		$target=$file?$file:'wtf';
		$pattern='/<table class="defln hideCap">.{1,}/is';
		if(!preg_match($pattern,(string)$this->blast_results,$out))
		{
			return FALSE;
		}
		$res=explode('</table>',$out[0]);
		return file_put_contents($target,strip_tags($res[0]))!==FALSE;
	}
	
	/**
	 * Ask for the sequence identity threshold and run make_table5 on
	 * sequences_fasta.xml.
	 */
	public function maketable5()
	{
		echo "\n\n#### RUNNING MAKE_TABLE5 ####\n";
		$c=$this->getInput("Enter Sequence Identity Threshold (1 - 0.7)");
		// Quote the typed value so it reaches make_table5 as one argument and is
		// never interpreted by the shell. A blank answer keeps the old command line.
		$arg=($c==='')?'':escapeshellarg($c);
		$cmd="make_table5 -i sequences_fasta.xml -c $arg";
		system($cmd);
		echo "\n\n>> Done. All files have been created.\n";
	}
}
// Entry point: constructing the object runs the whole interactive workflow.
$NCBI = new NCBI();
?>