<?php
/**
 * Drives a PSI-BLAST search on the NCBI BLAST web site for a single query and
 * post-processes the hits.
 *
 * Constructing an instance runs the whole pipeline inside
 * <LOCALDIR>/accs/<query>/:
 *   1. submit the query to Blast.cgi and poll until the results page is ready;
 *   2. scrape the checked GI numbers and write them to GI_NUMBERS.TXT;
 *   3. if fewer than 500 unique GIs were found, run one more PSI-BLAST
 *      iteration and scrape again;
 *   4. upload the GI list to batchentrez_p.cgi and download the sequences as
 *      sequences_fasta.xml;
 *   5. run the external `make_table5` program and write a `status` file.
 *
 * All HTTP calls share one cookie jar file ($cookies) in the working directory.
 */
class NCBI extends GlobalD
{
	/** Query sent as the QUERY form field (the constructor argument). */
	public $seq;
	/** Name of the working sub-directory under accs/ (same value as $seq). */
	public $dir;
	/** Value sent as the I_THRESH form field. */
	public $thresh_hold;
	/** Cookie jar file used by every cURL request and by the curl CLI call. */
	public $cookies='ncbi.txt';
	/** URL polled for the results of the initial search. */
	public $blast_url;
	/** URL extracted from the response to the iteration request. */
	public $iterate_url;
	/** URL extracted from the final page of the iterated search. */
	public $iterate_url_lite;
	/** HTML of the most recently received results page. */
	public $blast_results;
	/** md5 of the query; sent as JOB_TITLE. */
	public $job;
	/** Unique GI numbers scraped from $blast_results. */
	public $GIs;
	/** Hidden form fields of the results page as name => value. */
	public $session;
	/** Not referenced anywhere in this class. */
	public $Desktop='/Users/saierlab/Desktop/';
	/** Number reported after "Passed to Entrez:" by the batch upload, or null. */
	public $true_count;
	/** TRUE until some step fails; written to the `status` file. */
	public $status=TRUE;
	/** Number of unique GI numbers found by fetch_GI_numbers(). */
	public $GI_Count;
	/** Value passed to `make_table5 -c`. */
	public $c;

	/**
	 * Runs the complete pipeline for one query.
	 *
	 * @param string $acc Query for BLAST; also used as the working directory name.
	 */
	public function __construct($acc)
	{
		chdir(parent::LOCALDIR);
		$this->seq = $acc;
		$this->dir = $acc;
		$this->thresh_hold = 0.005;

		// Build accs/<query>/ (and accs/packed/) and work from inside it.
		if (!is_dir('accs')) {
			mkdir('accs');
		}
		chdir('accs');
		if (!is_dir('packed')) {
			mkdir('packed');
		}
		if (!is_dir($this->dir)) {
			mkdir($this->dir);
		}
		chdir($this->dir);

		$this->NCBI_BLAST();
		$this->fetch_GI_numbers();
		$this->write_GI_numbers();
		$this->load_session();

		if ($this->GI_Count < 500) {
			// Fewer than 500 hits: run one more PSI-BLAST iteration.
			$this->iterate(500);
			$this->fetch_GI_numbers();
			$this->write_GI_numbers();
		}

		// The -c value handed to make_table5 depends on how many hits we ended up with.
		if ($this->GI_Count >= 500) {
			$this->c = 0.7;
		} elseif ($this->GI_Count <= 200) {
			$this->c = 1;
		} else {
			$this->c = 0.9;
		}

		$this->upload_entrez();
		$this->save_tinyxml();
		$this->maketable5();
		$this->write_status();
	}

	/**
	 * Prompts on STDOUT and reads one trimmed line from STDIN.
	 *
	 * @param string $msg Prompt text (a ": " suffix is appended).
	 * @return string The line typed by the user, without surrounding whitespace.
	 */
	public function getInput($msg)
	{
		fwrite(STDOUT, "$msg: ");
		return trim(fgets(STDIN));
	}

	/**
	 * Performs a single GET request using the shared cookie jar.
	 *
	 * @param string $url Address to fetch.
	 * @return string|false Response body, or FALSE when cURL fails.
	 */
	private function bbrowse($url)
	{
		$ch = curl_init();
		curl_setopt_array($ch, array(
			CURLOPT_REFERER        => 'http://blast.ncbi.nlm.nih.gov/Blast.cgi',
			CURLOPT_URL            => $url,
			CURLOPT_USERAGENT      => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6',
			CURLOPT_RETURNTRANSFER => 1,
			CURLOPT_FOLLOWLOCATION => 1,
			CURLOPT_COOKIEJAR      => $this->cookies,
			CURLOPT_COOKIEFILE     => $this->cookies,
		));
		$result = curl_exec($ch);
		curl_close($ch);
		return $result;
	}

	/**
	 * GETs a URL, retrying up to five times with a 20 second pause between tries.
	 *
	 * @param string $url Address to fetch.
	 * @return string|false Response body; FALSE for an empty $url or when every
	 *                      attempt fails (in the latter case $status is set to FALSE).
	 */
	public function browse($url)
	{
		if (!$url) {
			return false;
		}
		$maxAttempts = 5;
		for ($attempts = 1; $attempts <= $maxAttempts; $attempts++) {
			$page = $this->bbrowse($url);
			if ($page) {
				return $page;
			}
			echo "\n[*]Socket Failed - Attempt : $attempts";
			// Only wait and announce a retry when another attempt will really follow.
			if ($attempts < $maxAttempts) {
				sleep(20);
				echo "\n[*]Trying Again...";
			}
		}
		$this->status = FALSE;
		return FALSE;
	}

	/**
	 * Submits the PSI-BLAST request form and records where to poll for results.
	 *
	 * @return int|string 0 when the response contains "Error" ($status is set to
	 *                    FALSE), 1 when the response is empty, otherwise the
	 *                    polling URL (also stored in $blast_url).
	 */
	private function NCBI_PRIMARY_BLAST()
	{
		$ch = curl_init();
		$job = md5($this->seq);
		$this->job = $job;
		$ref = 'http://blast.ncbi.nlm.nih.gov/Blast.cgi';
		$data="QUERY={$this->seq}&db=protein&QUERY_FROM=&QUERY_TO=&QUERYFILE=&GENETIC_CODE=1&JOB_TITLE=$job&SUBJECTS=&stype=&SUBJECTS_FROM=&SUBJECTS_TO=&SUBJECTFILE=&DATABASE=nr&EQ_MENU=&NUM_ORG=1&EQ_TEXT=&BLAST_PROGRAMS=psiBlast&PHI_PATTERN=&MAX_NUM_SEQ=500&SHORT_QUERY_ADJUST=on&EXPECT=10&WORD_SIZE=3&MATRIX_NAME=BLOSUM62&MATCH_SCORES=1,-2&GAPCOSTS=11 1&COMPOSITION_BASED_STATISTICS=2&REPEATS=repeat_9606&TEMPLATE_LENGTH=0&TEMPLATE_TYPE=0&PSSM=&I_THRESH={$this->thresh_hold}&PSI_PSEUDOCOUNT=0&SHOW_OVERVIEW=true&SHOW_LINKOUT=true&GET_SEQUENCE=true&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&ALIGNMENT_VIEW=Pairwise&MASK_CHAR=2&MASK_COLOR=1&DESCRIPTIONS=500&ALIGNMENTS=250&NEW_VIEW=true&OLD_BLAST=false&NCBI_GI=true&SHOW_CDS_FEATURE=false&NUM_OVERVIEW=100&FORMAT_EQ_TEXT=&FORMAT_ORGANISM=&EXPECT_LOW=&EXPECT_HIGH=&QUERY_INDEX=&CLIENT=web&SERVICE=plain&CMD=request&PAGE=Proteins&PROGRAM=blastp&MEGABLAST=&RUN_PSIBLAST=on&TWO_HITS=&CDD_SEARCH=on&ID_FOR_PSSM=&SAVED_PSSM=&SELECTED_PROG_TYPE=psiBlast&SAVED_SEARCH=true&BLAST_SPEC=&QUERY_BELIEVE_DEFLINE=&DB_DIR_PREFIX=&USER_DATABASE=&USER_WORD_SIZE=&USER_MATCH_SCORES=&USER_FORMAT_DEFAULTS=&NO_COMMON=&NUM_DIFFS=0&NUM_OPTS_DIFFS=0&UNIQ_DEFAULTS_NAME=A_SearchDefaults_1OYrOj_BT4_DLAtezdw3Hx_23ttSh_S6jfo&PAGE_TYPE=BlastSearch&USER_DEFAULT_PROG_TYPE=psiBlast&USER_DEFAULT_MATRIX=3";
		curl_setopt($ch, CURLOPT_URL, "http://blast.ncbi.nlm.nih.gov/Blast.cgi");
		curl_setopt($ch, CURLOPT_USERAGENT, 'Opera/9.23 (Windows NT 5.1; U; en)');
		curl_setopt($ch, CURLOPT_REFERER, $ref);
		curl_setopt($ch, CURLOPT_POST, 1);
		curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);
		curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
		$store = curl_exec($ch);
		curl_close($ch);

		if (!$store) {
			return 1;
		}
		// Case-insensitive search for "Error" (stripos replaces eregi(), which
		// no longer exists in PHP 7+).
		if (stripos($store, 'Error') !== false) {
			$this->status = FALSE;
			return 0;
		}
		$data = $this->extract_URL($store)."&NCBI_GI=on&CMD=Get";
		return $this->blast_url = $data;
	}

	/**
	 * POSTs a pre-encoded body using the shared cookie jar.
	 *
	 * @param string $url  Target address.
	 * @param string $data Request body (already URL-encoded by the caller).
	 * @return string|false Response body, or FALSE when cURL fails.
	 */
	public function post_data($url,$data)
	{
		$ch = curl_init();
		curl_setopt_array($ch, array(
			CURLOPT_REFERER        => 'http://blast.ncbi.nlm.nih.gov/Blast.cgi',
			CURLOPT_URL            => $url,
			CURLOPT_POST           => 1,
			CURLOPT_POSTFIELDS     => $data,
			CURLOPT_USERAGENT      => 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6',
			CURLOPT_RETURNTRANSFER => 1,
			CURLOPT_FOLLOWLOCATION => 1,
			CURLOPT_COOKIEJAR      => $this->cookies,
			CURLOPT_COOKIEFILE     => $this->cookies,
		));
		$result = curl_exec($ch);
		curl_close($ch);
		return $result;
	}

	/**
	 * Pulls the URL out of the page's `var myncbi_cu = unescape('...');` statement.
	 *
	 * @param string $c HTML/JavaScript of a BLAST page.
	 * @return string URL-decoded address, or '' when the statement is not present.
	 */
	public function extract_URL($c)
	{
		$pattern = '/var myncbi_cu = unescape\(\'.{1,}\'\);/iUs';
		if (!preg_match($pattern, (string) $c, $match)) {
			return '';
		}
		// Strip the JavaScript wrapper so only the escaped URL remains.
		$wrapper = '/(var myncbi_cu = unescape\(\')|(\'\);)/';
		return urldecode(trim(preg_replace($wrapper, '', $match[0])));
	}

	/**
	 * Polls $blast_url once.
	 *
	 * @return bool FALSE while the page still says it will be updated
	 *              automatically; TRUE once any other response was stored in
	 *              $blast_results (including a failed fetch).
	 */
	private function NCBI_SECONDARY_BLAST()
	{
		$res = $this->browse($this->blast_url);
		if (preg_match('/This page will be automatically updated in/i', (string) $res)) {
			return FALSE;
		}
		$this->blast_results = $res;
		return TRUE;
	}

	/**
	 * Records failure in the status file and terminates the script.
	 */
	private function exit_fml()
	{
		$this->status = FALSE;
		$this->write_status();
		exit;
	}

	/**
	 * Submits the search and blocks until the results page has been received.
	 *
	 * The submission is retried while the server returns an empty response
	 * (NCBI_PRIMARY_BLAST() === 1), for at most six submissions in total. An
	 * error page (=== 0) or exhausting the retries ends the script through
	 * exit_fml().
	 */
	public function NCBI_BLAST()
	{
		echo "\n >> Blasting NCBI Database...";
		// Visit the search form first; only the cookies it sets are of interest.
		$this->browse('http://blast.ncbi.nlm.nih.gov/Blast.cgi?PROGRAM=blastp&BLAST_PROGRAMS=blastp&PAGE_TYPE=BlastSearch&SHOW_DEFAULTS=on&LINK_LOC=blasthome');

		$i = 0;
		while (true) {
			$pb = $this->NCBI_PRIMARY_BLAST();
			// Strict comparisons: the success value is a URL string, and in
			// PHP < 8 a non-numeric string loosely equals 0.
			if ($pb === 0) {
				$this->exit_fml();
			}
			if ($pb !== 1) {
				break; // Polling URL obtained.
			}
			if ($i == 5) {
				$this->exit_fml();
			}
			$i++;
		}
		echo "Done.";
		echo "\n >> Fetching Results.";
		echo "\n >> Waiting for NCBI...";
		do {
			$res = $this->NCBI_SECONDARY_BLAST();
			echo ".";
			sleep(5);
		} while ($res == FALSE);
		echo "Done.\n >> Results Recieved.";
	}

	/**
	 * Scrapes the GI numbers of all pre-checked hits from $blast_results.
	 *
	 * Sets $GI_Count and $GIs to the de-duplicated list so that the reported
	 * count, the written file and the iteration request all agree.
	 *
	 * @return array List of unique GI numbers (strings), in page order.
	 */
	public function fetch_GI_numbers()
	{
		$pattern = '/<input type="checkbox" name="checked_GI" value="([0-9]{1,15})" checked="checked" \/>/U';
		preg_match_all($pattern, (string) $this->blast_results, $out, PREG_PATTERN_ORDER);
		$unique = array_values(array_unique($out[1]));
		$count = count($unique);
		$this->GI_Count = $count;
		echo "\n ++ Found $count Unique GI Numbers.";
		return $this->GIs = $unique;
	}

	/**
	 * Writes $GIs, one per line, to GI_NUMBERS.TXT in the current directory.
	 *
	 * Sets $status to FALSE when the file cannot be written.
	 */
	public function write_GI_numbers()
	{
		$content = trim(implode("\n", (array) $this->GIs));
		if (file_put_contents("GI_NUMBERS.TXT", $content) === false) {
			$this->status = FALSE;
			echo "\n !! Could not write GI_NUMBERS.TXT in {$this->dir}\n";
			return;
		}
		echo "\n ++ GI List Written to {$this->dir}\n";
	}

	/**
	 * Collects the hidden <INPUT> fields of $blast_results into $session.
	 *
	 * Only names and values of at most 15 characters are recognised by the
	 * patterns; entries whose value is empty (or otherwise falsy) are dropped.
	 * The unfiltered array is dumped with print_r() first.
	 *
	 * @return array name => value map that was stored in $session.
	 */
	private function load_session()
	{
		$session = array();
		$pattern = '/<INPUT .{1,200}TYPE="hidden".{1,200}(\/)?>/siU';
		preg_match_all($pattern, (string) $this->blast_results, $out, PREG_PATTERN_ORDER);
		foreach (array_unique($out[0]) as $tag) {
			preg_match_all('/name( )?=( )?".{1,15}"/iU', $tag, $nameMatch, PREG_PATTERN_ORDER);
			preg_match_all('/value( )?=( )?".{1,15}"/iU', $tag, $valMatch, PREG_PATTERN_ORDER);
			// An attribute that did not match yields '' (same result the
			// suppressed undefined-offset access used to give).
			$name = isset($nameMatch[0][0])
				? preg_replace('/(name( )?=( )?)|(")/i', '', $nameMatch[0][0])
				: '';
			$val = isset($valMatch[0][0])
				? preg_replace('/(value( )?=( )?)|(")/i', '', $valMatch[0][0])
				: '';
			$session[$name] = $val;
		}
		print_r($session);
		$session = array_filter($session);
		return $this->session = $session;
	}

	/**
	 * Runs one further PSI-BLAST iteration using every GI in $GIs as a checked
	 * hit, then waits for the new results page (stored in $blast_results).
	 *
	 * @param int|null $i Maximum number of hits to request.
	 * @return bool FALSE when $i is empty, TRUE once results were received.
	 */
	public function iterate($i=NULL)
	{
		if (!$i) {
			return FALSE;
		}
		$string = array();
		foreach ((array) $this->GIs as $GI) {
			$string[] = "checked_GI=$GI&good_GI=$GI";
		}
		$string = implode("&", $string);
		// Missing session fields are sent as empty values.
		$rid = isset($this->session['RID']) ? $this->session['RID'] : '';
		$oldrid = isset($this->session['PREV_RID']) ? $this->session['PREV_RID'] : '';
		$repeats = isset($this->session['REPEATS']) ? $this->session['REPEATS'] : '';
		echo "\n >> Iterating Results with MAX: $i";
		$url = "http://blast.ncbi.nlm.nih.gov/Blast.cgi";
		$data="ALIGNMENTS=250&ALIGNMENT_VIEW=Pairwise&BLAST_PROGRAMS=psiBlast&CDD_SEARCH_STATE=0&CLIENT=web&CMD=request&COMPOSITION_BASED_STATISTICS=2&DATABASE=nr&DESCRIPTIONS=$i&ENTREZ_QUERY=&EQ_OP=AND&EQ_OP=AND&EXPECT=10&FILTER=F&FILTER=F&FORMAT_OBJECT=Alignment&FORMAT_TYPE=HTML&FULL_DBNAME=nr&FULL_DBNAME=nr&GAPCOSTS=11+1&GET_SEQUENCE=true&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&HITLIST_SIZE=$i&IMG_ALT=CD+search+result+summary&I_THRESH={$this->thresh_hold}&JOB_TITLE={$this->job}&LAYOUT=OneWindow&LAYOUT=OneWindow&MASK_CHAR=2&MASK_COLOR=1&MATRIX_NAME=BLOSUM62&MAX_NUM_SEQ=$i&NCBI_GI=true&NCBI_GI=on&NEW_VIEW=true&NEXT_I=Go&NEXT_I=Go&NUM_DIFFS=0&NUM_OPTS_DIFFS=0&NUM_ORG=1&NUM_OVERVIEW=100&OLD_BLAST=false&PAGE=Proteins&PAGE_TYPE=BlastSearch&PREV_RID=$oldrid&PROGRAM=blastp&PSI_PSEUDOCOUNT=0&QUERY_INDEX=0&REPEATS=$repeats&RID=$rid&RTOE=13&RTOE=11&RTOE=12&RTOE=14&RUN_PSIBLAST=on&RUN_PSIBLAST=on&RUN_PSIBLAST=on&SAVED_SEARCH=true&SEARCH_DB_STATUS=43&SELECTED_PROG_TYPE=psiBlast&SERVICE=plain&SHORT_QUERY_ADJUST=on&SHOW_CDS_FEATURE=false&SHOW_LINKOUT=true&SHOW_OVERVIEW=true&STEP_NUMBER=2&USER_DEFAULT_MATRIX=3&USER_DEFAULT_PROG_TYPE=psiBlast&USER_FORMAT_DEFAULTS=on&USER_TYPE=2&USER_TYPE=2&WORD_SIZE=3&_PGR=11&queryList=0&queryList=0&";
		$data = $data.$string;
		$page = $this->post_data($url, $data);
		$this->iterate_url = $this->extract_URL($page)."&CMD=Get&NCBI_GI=on";
		echo "\n >> Waiting for NCBI...";
		do {
			$res = $this->iterate_primary();
			echo ".";
			sleep(5);
		} while ($res == FALSE);
		$this->iterate_url_lite = $this->extract_URL($this->blast_results)."&CMD=Get&NCBI_GI=on";
		echo "Done.\n >> Results Recieved.";
		return TRUE;
	}

	/**
	 * Polls the iterated search once by POSTing the query string of
	 * $iterate_url back to Blast.cgi.
	 *
	 * @return bool FALSE while the page still says it will be updated
	 *              automatically; TRUE once any other response was stored in
	 *              $blast_results.
	 */
	private function iterate_primary()
	{
		$parts = explode("Blast.cgi?", (string) $this->iterate_url);
		// No "Blast.cgi?" in the URL means there is no query string to post.
		$data = isset($parts[1]) ? $parts[1] : '';
		$res = $this->post_data('http://blast.ncbi.nlm.nih.gov/Blast.cgi', $data);
		if (preg_match('/This page will be automatically updated in/i', (string) $res)) {
			return FALSE;
		}
		$this->blast_results = $res;
		return TRUE;
	}

	/**
	 * Uploads GI_NUMBERS.TXT to batchentrez_p.cgi and stores the number that
	 * follows "Passed to Entrez:" in $true_count (null when it is not found).
	 */
	public function upload_entrez()
	{
		echo "\n >> Uploading GI File to Entrez...";
		$uploadfile = getcwd().'/GI_NUMBERS.TXT';
		// The "@path" upload syntax is ignored by default since PHP 5.6 and was
		// removed in PHP 7; use CURLFile whenever the running PHP provides it.
		if (class_exists('CURLFile')) {
			$file = new CURLFile($uploadfile);
		} else {
			$file = "@$uploadfile";
		}
		$ch = curl_init();
		curl_setopt($ch, CURLOPT_URL, "http://www.ncbi.nlm.nih.gov/portal/utils/batchentrez_p.cgi"); // you can modify this URL based on your environment
		curl_setopt($ch, CURLOPT_POSTFIELDS, array('file' => $file,'db'=>'protein','cmd'=>'Retrieve'));
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_ENCODING, "");
		curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookies);
		curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookies);
		$store = curl_exec($ch);
		curl_close($ch);

		$this->true_count = null;
		if (is_string($store) && preg_match('/Passed to Entrez: ([0-9]{1,5})/', $store, $match)) {
			$this->true_count = $match[1];
		}
		echo 'Done!';
	}

	/**
	 * Downloads the uploaded sequences as FASTA XML into sequences_fasta.xml
	 * by shelling out to the `curl` command with the shared cookie jar.
	 */
	public function save_tinyxml()
	{
		echo "\n >> Downloading TinyXML Sequence...\n\n";
		$url="http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?tool=portal&sendto=on&log$=seqview&db=protein&dopt=fasta_xml&query_key=1&qty={$this->true_count}&filter=all";
		// escapeshellarg() quotes both arguments so nothing in them can be
		// interpreted by the shell.
		$cmd = 'curl -b '.escapeshellarg($this->cookies).' '.escapeshellarg($url).' > sequences_fasta.xml';
		system($cmd);
		echo "\n\n ## Done!\n ++ sequences_fasta.xml has been written to {$this->dir}\n";
	}

	/**
	 * Runs `make_table5` on sequences_fasta.xml with `-c $c` and sets $status
	 * to FALSE when sequences_fasta.xml.faa is missing or empty afterwards.
	 */
	public function maketable5()
	{
		echo "\n\n#### RUNNING MAKE_TABLE5 ####\n";
		$c = $this->c;
		$cmd = "make_table5 -i sequences_fasta.xml -c $c";
		system($cmd);

		$faa = 'sequences_fasta.xml.faa';
		// Check existence first so filesize() is never asked about a missing file.
		if (!file_exists($faa)) {
			$this->status = FALSE;
		} else {
			$size = filesize($faa);
			if ($size === false || $size === 0) {
				$this->status = FALSE;
			}
		}
		echo "\n\n>> Done. All files have been created.\n";
	}

	/**
	 * Writes $status to the file `status` in the current directory.
	 *
	 * The boolean is written through PHP's string conversion, so TRUE yields
	 * "1" and FALSE yields an empty file.
	 *
	 * @return bool TRUE when the file was written, FALSE otherwise.
	 */
	public function write_status()
	{
		return file_put_contents("status", (string) $this->status) !== false;
	}
}
?>