/* Program From Progressive Alignment of Amino Acid Sequences and Construction of Phylogenetic Trees from Them, by D. F. Feng and R. F. Doolittle, Methods of Enzymology, v266, p368 (1996) 2/7/1996 Read "readme.doc" before using. */ #include #include FILE *Protein, *ProtLib; char linebuf[BUFSIZ]; char *prot_title, *lib_title; char *prot_sequence, *lib_sequence; int size_prot = 0, size_lib = 0; int exi, exj; int window_size, threshold; int search_count; int matchscore; int number_of_hits; int flag; /* This is inconvenient, but it's easier than setting up a dynamic array: */ #define MAX_PEPTIDE 45 main( argc, argv) int argc; char **argv; { if(argc<3) { fprintf(stderr,"\nPurpose: Find Matching Boundaries.\n"); fprintf(stderr,"Usage: %s rf1",argv[0]); fprintf(stderr," rf2 [Ln Wn Td MS] [>wf]\n"); fprintf(stderr,"\trf1: Query sequence.\n"); fprintf(stderr,"\trf2: Database.\n"); fprintf(stderr,"\t****Parameters below MUST BE "); fprintf(stderr,"ALL OR NONE specified!!****\n"); fprintf(stderr,"\tLn: Print only the first Ln scoring "); fprintf(stderr,"segments(default=10).\n"); fprintf(stderr,"\tWn: Window size(default=30).\n"); fprintf(stderr,"\tTd: Match Threshold per Wn(default=8).\n"); fprintf(stderr,"\tMS: Matching Score(default=360).\n"); fprintf(stderr,"Note: Query sequence and database must be in"); fprintf(stderr," \"Old Atlas\" format.\n\n"); exit(); } if( ( Protein = fopen( argv[1], "r")) == NULL) { fprintf( stderr, "%s: cannot read Protein file \"%s\"\n", argv[0], argv[1]); exit(1); } if( ( ProtLib = fopen( argv[2], "r")) == NULL) { fprintf( stderr,"%s: cannot read Protein Library file \"%s\"\n", argv[0], argv[2]); exit(1); } if(argv[3]==NULL) { flag=10; window_size=30; threshold=8; matchscore=360; } else { flag = atoi(argv[3]); window_size = atoi(argv[4]); threshold = atoi(argv[5]); matchscore = atoi(argv[6]); } printf( "Peptide Window Size is %d; Match Threshold is %d;", window_size, threshold); printf( " Matchscore is %d;\n", matchscore); if( (window_size < 2) || (threshold < 2) || (threshold > window_size)) { fprintf( stderr, "%s: for reasonable results, window size\n", argv[0]); fprintf( stderr, "and threshold must be greater than 2 and\n"); fprintf( stderr, "threshold must be less than window size\n"); fprintf( stderr, "threshold: %d window_size: %d\n", threshold, window_size); exit(1); } if( window_size > MAX_PEPTIDE) { fprintf( stderr, "window size (%d) exceeds MAX_PEPTIDE (%d)\n", window_size, MAX_PEPTIDE); fprintf( stderr, "This can be fixed by recompiling.\n"); exit(1); } if( ! read_protein( Protein, &prot_title, &prot_sequence, &size_prot)) { fprintf( stderr, "%s: Trouble reading Protein file\n", argv[0]); exit(1); } if( size_prot <= window_size) { fprintf(stderr,"Too few residues in Protein file\n"); fprintf( stderr, "found %d, must have more than %d ", size_prot, window_size); fprintf( stderr, "(window_size)\n"); exit(1); } printf( "Name of search protein:\n\n%s\n\n", prot_title); (void) fflush( stdout); search_count = 0; while( read_protein( ProtLib, &lib_title, &lib_sequence, &size_lib)) { if( size_lib <= window_size) continue; ++search_count; if( compare_protein() > 0) { report_results(); } } printf( "\nnumber of proteins searched = %d\n", search_count); printf( "number of proteins with matches = %d\n", number_of_hits); exit(0); } #define TITLE_SIZE 72 #define TCODE_OFFSET 1 /* beginning of line has codings to skip */ #define PCODE_OFFSET 11 #define RES_BLOCK_FACTOR 30 /* number of residues per line */ #define MAX_RESIDUES 20000 int read_protein( File, titlep, seqp, sizep) FILE *File; char **titlep; char **seqp; int *sizep; { register char *s, *d; extern char * malloc(); extern char * index(); /* locate protein title line--signified by `T' in first column */ do { if( fgets( linebuf, sizeof linebuf, File) == NULL) { printf("EOF on input\n"); return 0; } } while( *linebuf != 'T'); if( *titlep == NULL) *titlep = malloc( TITLE_SIZE); strncpy( *titlep, &linebuf[TCODE_OFFSET], TITLE_SIZE-1); /*DBG**fprintf( stderr, "%s", *titlep);*/ if( *sizep == 0) *seqp = malloc(MAX_RESIDUES + RES_BLOCK_FACTOR); d = *seqp; for(;;) { if( fgets( linebuf, sizeof linebuf, File) == NULL) { fprintf( stderr, "Unexpected EOF on input\n"); return 0; } if( (*linebuf != 'P') && (*linebuf != 'F')) { if( (*linebuf == 'T') && (d == *seqp)) { /* some proteins have extra title lines */ continue; } fprintf( stderr,"bad protein format: %-30s\n", *titlep); fprintf( stderr, "\"%-30s...\"\n", linebuf); printf( "bad protein format: %-30s\n", *titlep); printf( "\"%-30s...\"\n", linebuf); return 0; } for( s = &linebuf[PCODE_OFFSET]; s < linebuf + sizeof linebuf;) { if( *s == '*') { *d = '\0'; *sizep = d - *seqp; /*DBG**fprintf(stderr," %d residues\n",*sizep);*/ return 1; } if( ! isupper( *s)) { fprintf( stderr, "bad residue: %c in %s\n", *s, *titlep); printf( "bad residue: %c in %s\n", *s, *titlep); *(++s) = '\0'; fprintf( stderr, "\"%s\"\n", linebuf); printf( "\"%s\"\n", linebuf); return 0; } *d++ = *s - 'A'; ++s; if( d > *seqp + MAX_RESIDUES) { fprintf( stderr, "%-40s exceeds MAX_RESIDUES", *titlep); fprintf( stderr, " (%d)\n", MAX_RESIDUES); printf( "%-40s exceeds MAX_RESIDUES", *titlep); printf( " (%d)\n", MAX_RESIDUES); return 0; } if( ( *s == '\n' ) || ( *s == '\0')) break; /* skip over character separating residues */ ++s; /* to be extra careful... */ if( ( *s == '\n' ) || ( *s == '\0')) break; } } /* normal return from this routine occurs when '*' is found */ /*NOTREACHED*/ return 0; } /* * Method: * The goal of this program is to find regions of homology between protein * sequences. Homology is not easy to quantify. * * The approach taken in this program is to take subsequences of length * `window_size' from each parent and count the number of matching * residues between subsequences. If the number of matches exceeds * `threshold', then those two subsequences are considered to be * homologous regions. [If there are at least two matching * regions (sensibly ordered), then the library protein is selected for * printing (note that the two matching regions may overlap).] * * Details: * Subsequences are defined by their length and the index of their starting * position. For two subsequences, the Failure function Fn(a,b) is the * number of mismatches between the n-length subseqences starting at a and b. * The Failure function can be generated by induction: * * F1(a,b) := ( A(a) == B(b)) * Fi+1(a,b) := Fi(a,b) + F1(a+i,b+i) * * It will be efficient to store the Failure function as a list of its * values which are less than the rejection_threshold. This list is * referred to as `Match_list'. * * Lists of candidate matching regions are produced by finding zeroes * of F1(a,b) * for which ( a < size(A) - window_size and b < size(B) - window_size). * [candidate regions near the ends of the peptides are handled differently.] * The list contains triples (a,b,f), with f = 0. The generation of * this list is covered in a later discussion. * * Entries on the candidate lists are tested by doing the above recursion: * * For Fi(a,b) for i =2 ; i < rejection_threshold, simply update the candidate * list by adding the value of F1(a+i,b+i) to the f value in the triple. * * For Fi(a,b) for i = rejection_threshold; i <= window_size, update the * list by adding the value of F1(a+i,b+i) to the f value in the triple. * If the f value is less than the reject_threshold, copy the triple to * the new candidate list (list space gets re-used here). * * Surviving candidates are moved to Match_list for later reporting of * results. * */ struct list_ent { int a_pos; int b_pos; int f_val; }; struct list_head { struct list_ent **next; struct list_ent **first; int size; struct list_ent *listspace; }; struct list_head Match_list; struct list_head Cand_list; #define MAX_LIST (5*MAX_RESIDUES) int compare_protein() { int Apos; make_B_lists(); init_list( &Match_list, size_lib + size_prot); /* extend sequences with distinct tails for handling end effects */ extend_seq( prot_sequence, size_prot, 'P'); extend_seq( lib_sequence, size_lib, 'L'); for( Apos = 0; Apos < size_prot; ) { get_Cand( &Apos); test_Cand(); move_Matches(); } return (Match_list.next - Match_list.first); } /* * test candidate list by recursing */ #define F1( p) ( prot_offset[(p)->a_pos] != lib_offset[(p)->b_pos]) test_Cand() { register int rejection_threshold = window_size - threshold; register char *prot_offset, *lib_offset; register struct list_ent *elmp; int i; /*DBG**fprintf( stderr, "test_Cand: %d candidates\n",*/ /*DBG** Cand_list.next - Cand_list.first);*/ for( i = 1; i < rejection_threshold; ++i) { register struct list_ent *Cfirst, *Clast; Cfirst = *(Cand_list.first); Clast = *(Cand_list.next); prot_offset = prot_sequence + i; lib_offset = lib_sequence + i; for(elmp = Cfirst; elmp < Clast; ++elmp) { if( F1(elmp)) ++(elmp->f_val); } } for( i = rejection_threshold; i < window_size; ++i) { register struct list_ent **curpp, **newp; prot_offset = prot_sequence + i; lib_offset = lib_sequence + i; newp = Cand_list.first; for( curpp = Cand_list.first; curpp < Cand_list.next;) { elmp = *curpp++; if( F1(elmp)) { if( ++(elmp->f_val) > rejection_threshold) continue; } *newp++ = elmp; } Cand_list.next = newp; } } /* * Initialize a Cand_ or Match_list. */ init_list( headp, siz) struct list_head *headp; int siz; { register unsigned int size = siz; extern char *malloc(); if( headp->size < size) { if( headp->size) { free( (char *) headp->listspace); free( (char *) headp->first); } /*DBG**fprintf(stderr,"requesting %d slots for list entries\n",*/ /*DBG** size);*/ headp->first = (struct list_ent **) malloc( size * sizeof( struct list_ent *)); headp->listspace = (struct list_ent *) malloc( size * sizeof( struct list_ent)); headp->size = size; } headp->next = headp->first; *(headp->next) = headp->listspace; } /* * Move successful candidates to the Match_list. * This allows the search for successful candidates to resume. */ move_Matches() { register struct list_ent **srcp; register struct list_ent *newp; /*DBG**fprintf( stderr, "move_Matches: %d successful candidates\n",*/ /*DBG** Cand_list.next - Cand_list.first);*/ newp = *(Match_list.next); for( srcp = Cand_list.first; srcp < Cand_list.next; ++srcp) { if( Match_list.next - Match_list.first > Match_list.size) { printf( "\nlost some matches for this protein\n"); break; } newp->a_pos = (*srcp)->a_pos; newp->b_pos = (*srcp)->b_pos; newp->f_val = (*srcp)->f_val; *(Match_list.next++) = newp++; } *(Match_list.next) = newp; } /* * The "B_lists" are lists of positions for each residue in the protein. */ struct res_list_ent { struct res_list_ent *next; int b_pos; }; struct res_list_ent *B_lists[26]; struct res_list_ent *Blistspace = 0; struct res_list_ent *Blistnext; unsigned int sizeBlistspace = 0; make_B_lists() { register char *Bp; register int i; char *malloc(); /* ensure space for B-lists */ if(sizeBlistspace < size_lib) { sizeBlistspace = size_lib; if( Blistspace != NULL) free((char *) Blistspace); /*DBG**fprintf( stderr, "requesting space for %d B-lists\n",*/ /*DBG** sizeBlistspace);*/ Blistspace = (struct res_list_ent *) malloc( sizeBlistspace * sizeof( struct res_list_ent)); } /* initialize B_lists */ for( i = 0; i < 26; i++) B_lists[i] = NULL; Blistnext = Blistspace; for( Bp = lib_sequence + size_lib; Bp-- > lib_sequence; ) { Blistnext->b_pos = Bp - lib_sequence; Blistnext->next = B_lists[*Bp]; B_lists[*Bp] = Blistnext; ++Blistnext; } /*DBG print B-lists */ /*DBG**for( i = 0; i < 26; i++)*/ /*DBG** {register struct res_list_ent *Blp;*/ /*DBG** if( B_lists[i] == NULL)continue;*/ /*DBG** fprintf( stderr, "%c ", 'A'+i);*/ /*DBG** for( Blp=B_lists[i]; Blp ; Blp = Blp->next)*/ /*DBG** fprintf( stderr, "%d,", Blp->b_pos);*/ /*DBG** fprintf( stderr, "\n");*/ /*DBG** }*/ } /* * make candidate lists. * * The number of subsequences which must be examined can be greatly * reduced (and without loss of generality) by only trying pairs of * subsequences whose first elements are matches. * Thus the candidates are zeroes of F1(a,b). * * This approach is still O(MxN) in the worst case of all residues in both * sequences being the same residue. * * The B-lists record the positions in the lib_sequence of each residue. * For each residue in A, the appropriate B-list is used to extend the * list of candidates. */ get_Cand( apos) int *apos; { register int Apos; register int *flistp; register struct res_list_ent *Blp; register struct list_ent **nextp; init_list( &Cand_list, MAX_LIST ); flistp = (int *) Cand_list.listspace; nextp = Cand_list.next; for( Apos = *apos; Apos < size_prot; ++Apos) { /*DBG**fprintf( stderr, "Apos = %d, residue=%c, ", Apos,*/ /*DBG** 'A'+prot_sequence[Apos]);*/ Blp = B_lists[prot_sequence[Apos]]; if( Blp)do { *nextp++ = (struct list_ent *) flistp; *flistp++ = Apos; *flistp++ = Blp->b_pos; *flistp++ = 0; } while( Blp = Blp->next); /*DBG**fprintf( stderr, "list size = %d\n",*/ /*DBG** nextp - Cand_list.first);*/ if( nextp + size_lib - Cand_list.first > Cand_list.size) break; } *nextp = (struct list_ent *) flistp; Cand_list.next = nextp; *apos = Apos; } /* * in order to catch matching sequences within window_size of the end, * make sure that residue positions past the end have appropriate values. * The value supplied in 'ch' should be something outside the range * 0-26--those are residues--and should be different for the two sequences. * This ensures the correct value for F1(a,b) when a or b is off the end. */ extend_seq( seq, siz, ch) char *seq; int siz; char ch; { char *p; int i; p = seq + siz; for( i = 0; i < window_size; ++i) *p++ = ch; } /* * Results reporting. * The number of matches, length of library protein and its title are * written to the summary file. * The standard output gets the first few pairs of matches, * a few copies of the peptides that were matched, the same line as * on the summary file, and then the library protein sequence with * the matched regions specially marked. */ #define SEQ 40 /*Compared segment length (residues)*/ #define Mu 5 #define Mt 25 #define Ms 20 #define Mr 18 #define Mq 17 #define Mp 15 #define Mo 14 #define Mn 13 #define Mm 12 #define Ml 11 #define Mk 10 #define Mj 9 #define Mi 8 #define Mh 7 #define Mg 6 #define Mf 5 #define Me 4 #define Md 3 #define Mc 2 #define Mb 1 #define Ma 0 #define XB 0 int mptable[22][22] = { {Ms,Mi,Mg,Mf,Mg,Mf,Me,Md,Md,Md,Mf,Me,Md,Md,Mg,Mc,Mg,Me,Mi,Ma,XB,Mu}, {Mi,Mk,Mj,Mj,Mj,Mj,Mj,Mi,Mi,Mh,Mh,Mi,Mi,Mg,Mh,Mf,Mh,Mf,Mf,Mg,XB,Mu}, {Mg,Mj,Ml,Mi,Mj,Mi,Mi,Mi,Mi,Mh,Mh,Mh,Mi,Mh,Mi,Mg,Mi,Mf,Mf,Md,XB,Mu}, {Mf,Mj,Mi,Mo,Mj,Mh,Mh,Mh,Mh,Mi,Mi,Mi,Mh,Mg,Mg,Mf,Mh,Md,Md,Mc,XB,Mu}, {Mg,Mj,Mj,Mj,Mk,Mj,Mi,Mi,Mi,Mi,Mh,Mg,Mh,Mh,Mh,Mg,Mi,Me,Mf,Mc,XB,Mu}, {Mf,Mj,Mi,Mh,Mj,Mn,Mi,Mj,Mi,Mh,Mg,Mf,Mg,Mf,Mf,Me,Mh,Md,Md,Mb,XB,Mu}, {Me,Mj,Mi,Mh,Mi,Mi,Mk,Mk,Mj,Mj,Mk,Mi,Mj,Mg,Mg,Mf,Mg,Me,Mg,Me,XB,Mu}, {Md,Mi,Mi,Mh,Mi,Mj,Mk,Mm,Ml,Mk,Mj,Mh,Mi,Mf,Mg,Me,Mg,Mc,Me,Mb,XB,Mu}, {Md,Mi,Mi,Mh,Mi,Mi,Mj,Ml,Mm,Mk,Mj,Mh,Mi,Mg,Mg,Mf,Mg,Md,Me,Mb,XB,Mu}, {Md,Mh,Mh,Mi,Mi,Mh,Mj,Mk,Mk,Mm,Ml,Mj,Mj,Mh,Mg,Mg,Mg,Md,Me,Md,XB,Mu}, {Mf,Mh,Mh,Mi,Mh,Mg,Mk,Mj,Mj,Ml,Mo,Mk,Mi,Mg,Mg,Mg,Mg,Mg,Mi,Mf,XB,Mu}, {Me,Mi,Mh,Mi,Mg,Mf,Mi,Mh,Mh,Mj,Mk,Mo,Ml,Mi,Mg,Mf,Mg,Me,Me,Mk,XB,Mu}, {Md,Mi,Mi,Mh,Mh,Mg,Mj,Mi,Mi,Mj,Mi,Ml,Mn,Mi,Mg,Mf,Mg,Md,Me,Mf,XB,Mu}, {Md,Mg,Mh,Mg,Mh,Mf,Mg,Mf,Mg,Mh,Mg,Mi,Mi,Mo,Mk,Mm,Mk,Mi,Mg,Me,XB,Mu}, {Mg,Mh,Mi,Mg,Mh,Mf,Mg,Mg,Mg,Mg,Mg,Mg,Mg,Mk,Mn,Mk,Mm,Mj,Mh,Md,XB,Mu}, {Mc,Mf,Mg,Mf,Mg,Me,Mf,Me,Mf,Mg,Mg,Mf,Mf,Mm,Mk,Mo,Mk,Mk,Mh,Mg,XB,Mu}, {Mg,Mh,Mi,Mh,Mi,Mh,Mg,Mg,Mg,Mg,Mg,Mg,Mg,Mk,Mm,Mk,Mm,Mh,Mg,Mc,XB,Mu}, {Me,Mf,Mf,Md,Me,Md,Me,Mc,Md,Md,Mg,Me,Md,Mi,Mj,Mk,Mh,Mq,Mp,Mi,XB,Mu}, {Mi,Mf,Mf,Md,Mf,Md,Mg,Me,Me,Me,Mi,Me,Me,Mg,Mh,Mh,Mg,Mp,Mr,Mi,XB,Mu}, {Ma,Mg,Md,Mc,Mc,Mb,Me,Mb,Mb,Md,Mf,Mk,Mf,Me,Md,Mg,Mc,Mi,Mi,Mt,XB,Mu}, {XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,XB,Mu}, {Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu,Mu}, }; int indx[128]={ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 4, 7, 0, 7, 8, 17, 5, 10, 14, 21, 12, 15, 13, 6, 22, 3, 9, 11, 1, 2, 22, 16, 19, 20, 18, 8, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, }; report_results() { register struct list_ent **curp; char Apept[MAX_PEPTIDE+1]; char Bpept[MAX_PEPTIDE+1]; int i; int ei,ej; int msj,msj1,enter,enter1; int counter; int score; int qq,q,Z,X; enter1=0;msj=msj1=0; counter=0;enter=0; for( curp = Match_list.first; curp < Match_list.next; ++curp) { if(counter<=flag) { ei=(*curp)->a_pos+1; ej=(*curp)->b_pos+1; if(ei-10<1 || ej-10<1)X=0; else X=10; Z= -X; score=0; for(qq=0;qqa_pos+1)+Z+qq]&077)]] [indx['A'+(lib_sequence[((*curp)->b_pos+1)+Z+qq]&077)]]; } } if(score>matchscore) { msj=3700; if(enter==0) { printf( "\n matches A pos B pos A - B\n"); number_of_hits++; } enter=1; for( i = 0; i < window_size; ++i) { if( ( (*curp)->a_pos + i > size_prot) || ((*curp)->b_pos + i > size_lib)) break; Apept[i] = prot_sequence[(*curp)->a_pos+i]; Bpept[i] = ((lib_sequence[(*curp)->b_pos+i])&077); if( Apept[i] == Bpept[i]) { Apept[i] += 'a'; Bpept[i] += 'a'; } else { Apept[i] += 'A'; Bpept[i] += 'A'; } /* mark positions in lib_sequence */ lib_sequence[(*curp)->b_pos+i] |= 0100; } Apept[i] = '\0'; Bpept[i] = '\0'; if(counter<=flag) { counter++; printf( "%5d %4d %4d %4d\t%s\n\t", window_size - (*curp)->f_val, (*curp)->a_pos+1, (*curp)->b_pos+1, ((*curp)->a_pos+1) - ((*curp)->b_pos+1), Apept); if(score>=370)printf(" \t\t\t%s **%d**\n",Bpept,score); else printf(" \t\t\t%s %d\n",Bpept,score); } else if(msj1==0) { msj1=3700; printf("\n ** only first %d scoring segments printed!\n",flag); } } } if(msj>0) printf( "\n%3d %3d %s\n", Match_list.next - Match_list.first, size_lib, lib_title); /* * print formatted protein sequence with matching regions * highlighted. */ if(msj>0) { for( i = 0; i < size_lib; ++i) { if( (i%RES_BLOCK_FACTOR) == 0) printf( "\n%10d ", i+1); if( lib_sequence[i]&0100) printf( "_%c", (lib_sequence[i]&077) + 'a'); else printf( " %c", (lib_sequence[i]&077) + 'A'); } printf( " *\n\n"); } (void) fflush( stdout); }