#!/usr/bin/perl -w # ---------------------------------------------------------------------------- # BlastReport: # Written by: Jeanette McClintick/Howard Edenberg, # Dept. Medical & Molecular Genetics/Biochemistry and Molecular Biology, # Indiana University School of Medicine # Reads the output of a blastcl3 query and reformats it for ease of use. # ---------------------------------------------------------------------------- # BlastReport SETUP: # # $MyDisk and $Folder MUST BE MODIFIED before using BlastReport. # # $MyDisk and $Folder define the location of the input and output files used by # BlastReport. Customize the variables by replacing the placeholder names (e.g. # DISKNAME) with the names on your computer system. You must include ':' for # MacOS, '/' for Unix or '\' for Windows. REMEMBER that the FULL FOLDERNAME must # be used: if the folder/subdirectory is within another, this will be a # concatenated name (e.g. "blastfolder:blastresults:"). # Changes to other variables are optional. # # See the Readme document for BlastReport before making any changes. # ------------------------------------------------------------------------------ # C H A N G E F I L E A N D F O L D E R N A M E S H E R E : # ------------------------------------------------------------------------------ $MyDisk = "DISKNAME"; #the EXACT name of your disk $Folder = "FOLDERNAME"; #the EXACT folder/subdirectory location of BLASTCL3 output # Optional parameters to change report/file names: $Link = "_"; #value used to link parts of file name $ReportPre = "BRept "; #literal to prefix all reports written by this program. $BlastPre = "blast "; #prefix of BLASTCL3 output used as input to program. $Full = "Full"; #literal to suffix name of full report. $Sum = "Sum"; #literal to suffix name of summary report. $Tab = "Tab"; #literal to suffix name of tab delimited report. # ------------------------------------------------------------------------------ # C H A N G E P R I N T C O N T R O L P A R A M E T E R S H E R E: # ------------------------------------------------------------------------------ $RegionConst = 4; #number of alignment regions printed per line. # number of database sequences for which alignment regions will be printed. # Modify $cDNADBseq for 'cDNA' and $MarkerDBseq for 'marker' $DBseqPrint = 0; # DO NOT MODIFY $MarkerDBseq = 4; $cDNADBseq = 10; # number of alignment regions to give for each DB sequence # Modify $MarkerRegion for 'marker' and $cDNARegion for 'cDNA' $RegionPrint = 0; # DO NOT MODIFY $MarkerRegion = 4; $cDNARegion = 55; # number of alignment regions to give under analysis # Modify $MarkerAlign for 'marker' and $cDNAAlign for 'cDNA' $MaxAlign = 0; # DO NOT MODIFY $MarkerAlign = 12; $cDNAAlign = 55; # number of full alignments to be printed. # Modify $MarkerMaxFull for 'marker' and $cDNAMaxFull for 'cDNA' $MaxFull = 0; # DO NOT MODIFY $MarkerMaxFull = 4; $cDNAMaxFull = 5; # ---------------------------------------------------------------------------- # E N D O F P A R A M E T E R S # Variables below this line should be modified only at your own risk. # ---------------------------------------------------------------------------- ### --------------------------------------------------------------------- ### ### Define subroutines to be used by this program. ### ### --------------------------------------------------------------------- sub Align; # predeclaration of subroutine. ### Subroutine Align: Handles query with alignments: sub Align { $CountDesc = 0; #count of description lines found. while () { print REPORTOUT $line; if (length($line) > 10) { if (substr ($line, 0, 10) eq 'Sequences ') {last} } # end if if (! defined($line = )) {die "\n\nERROR1: unexpected end of input!!" }; } # end while while () { if (! defined($line = )) {die "\n\nERROR2: unexpected end of input!!" }; if (length($line) > 10) { if (substr ($line, 0, 1) eq '>') {last} else { ++$CountDesc; print REPORTOUT $line; } } else { print REPORTOUT $line; } } # end while # print REPORTOUT " The number of descriptions for this query: $CountDesc\n\n"; # Reached first alignment: $CountLine = 0; $SaveLine[$CountLine] = $line; $CountDBseq = 0; @DBlist = (); #list of DB sequences that align to query @DBptr = (); #number of Alignment that is 1st for this DBseq. @Clone = (); #clone associated with the DB seq in @DBlist. $CountAlign = -1; @DBseqForAlign = (); @AlignLo = (); @AlignHi = (); @SbjctLo = (); @SbjctHi = (); @AlignLoSort = (); @Temp = (); @Ident = (); @Expect = (); @items = split(/\s+/,$line); # Seq: Return here when action is 3: '>' found - new database sequence Seq: # extract name of sequence ($T1,$DBseq) = split (/:|\|/,$items[0]); # need | + : $DBlist[$CountDBseq] = $DBseq; $DBptr[$CountDBseq] = $CountAlign +1; # get clone number if available. $clone = 'None'; if ($line =~ /\bclone\b/) { ($T1, $clone) = split (/\s+|\,/,$'); } $Clone[$CountDBseq] = $clone; $action = &ReadTest(); # expecting action to be 7 found length if ($action == 7) { $Length[$CountDBseq] = $items[3]; } else { die "Expected action 7. Action = $action. Analysis terminated.\n\n" } ++$CountDBseq; $action = &ReadTest(); # Score: Return here when action is 1: ' Score including Expect Score: ++$CountAlign; if ($CountAlign == $MaxFull) {$AlignFullCount = $CountLine}; if ($action != 1) { die "Expected action 1. Action = $action. Analysis terminated.\n\n" } $DBseqForAlign [$CountAlign] = $DBlist [$CountDBseq-1]; $Expect[$CountAlign] = $items[8]; $action = &ReadTest(); if ($action !=5) { die "Expected action 5. Action = $action. Analysis terminated.\n\n"; } $Ident [$CountAlign] = $items[3]; $action = &ReadTest(); if ($action != 2) { die "Expected action 2. Action = $action. Analysis terminated.\n\n" } $AlignLo[$CountAlign] = $items[1]; $AlignHi[$CountAlign] = $items[3]; $action = &ReadTest(); if ($action != 6) { die "Expected action 6. Action = $action. Analysis terminated.\n\n" } $SbjctLo[$CountAlign] = $items[1]; $SbjctHi[$CountAlign] = $items[3]; $action = &ReadTest(); until ($action != 2) { $AlignHi[$CountAlign] = $items[3]; $action = &ReadTest(); if ($action != 6) { die "Expected action 6. Action = $action. Analysis terminated.\n\n" }; $SbjctHi[$CountAlign] = $items[3]; $action = &ReadTest(); } #end until # end of alignment expect, action 1,3, or 4 if ($action == 0) { die "End of input reached before expected. Analysis terminated.\n\n"; } elsif ($action == 1) { goto Score } elsif ($action == 3) { goto Seq } elsif ($action == 4) { ++$CountAlign; print REPORTOUT "--- $QueryName alignment analysis ---\n"; print REPORTOUT " Number of alignments: $CountAlign\n"; # New stuff if ($cDNA and $#AlignLo > 0) { $Asize = scalar(@AlignLo); for ($ii = 0; $ii < $Asize; ++$ii) { $TempDB = 'ND'; for ($jj = 0; $jj < $CountDBseq; ++$jj) { if ($ii > $DBptr[$jj] ) { next } elsif ($ii == $DBptr [$jj]) { $TempDB = $jj; last } elsif ($ii < $DBptr [$jj]) { $TempDB = $jj-1; last }; } #end for if ($TempDB eq 'ND') { $TempDB = $CountDBseq-1}; $Temp[$ii] = sprintf ("%2d %6d $ii",$TempDB,$AlignLo[$ii]); # New stuff } #end for @AlignLoSort = sort @Temp; for ($ii = 0; $ii < $Asize; ++$ii) { @items = split (/\s+/,$AlignLoSort[$ii]); $AlignLoSort[$ii] = $items [3]; } #end for } #end if if ($CountAlign > $MaxAlign) { $CountAlign = $MaxAlign; }; $Good = 'ok'; print REPORTOUT " Region(s) of alignment for Query: $QueryName (first $MaxAlign only)\n"; for ($i = 0; $i < $CountAlign; ++$i) { if ($i > 1) { $diff = abs($AlignLo[$i -1] - $AlignLo[$i]); if ($diff < 20) { $Good = ''; }; }; printf REPORTOUT (" From: %4d To: %4d Expect: %5s Match: %9s %12s\n", $AlignLo[$i], $AlignHi[$i]), $Expect[$i], $Ident[$i], $DBseqForAlign[$i]; } #end for print REPORTOUT "\n"; unless ($Good) { print REPORTOUT "***This query $QueryName, probably contains a region that is repeated often.\n"; print REPORTOUT "You may want to mask the region or note in query for future searches.\n\n"; if ($Expect[0] eq '0.0') { $Good = 'ok' } elsif ($Expect[1] =~ /\./ or $Expect[2] =~ /\./){ $Good = 'ok'} else { ($T1,$ExpSep0) = split(/\-/,$Expect[0]); ($T1,$ExpSep1) = split(/\-/,$Expect[1]); ($T1,$ExpSep2) = split(/\-/,$Expect[2]); $difE1 = $ExpSep0 - $ExpSep1; $difE2 = $ExpSep0 - $ExpSep2; if ($Ident[0] eq '(100%)') {$Good = 'ok'} elsif (($difE1 > 20) or ($difE2 > 20)) {$Good = 'ok'} elsif ($ExpSep0 > 50) {$Good = 'ok'}; } #end if } # end unless if ($Good) { ++$CountS; print REPORTOUT " >>>>>>> ---------------------------------------- <<<<<<<<\n"; print REPORTOUT " $QueryName may have a significant alignment.\n\n"; # print "INFO: Query sequence $QueryName may have a significant alignment.\n"; # print summary of alignments print REPORTSUM "$QueryLine\n"; $DBindex = $CountDBseq; if ($DBindex > $DBseqPrint) { $DBindex = $DBseqPrint; }; for ($i = 0; $i < $DBindex; ++$i) { $A1 = $DBptr[$i]; if ($i < ($CountDBseq -1)) { $A2 = $DBptr[$i+1]; } else { $A2 = $CountAlign; }; if (($A2 - $A1) > $RegionPrint) { $A2 = $A1 + $RegionPrint; }; $Qprint = $QueryName.' '; $Qprint = substr($Qprint,0,11); # print table entry print REPORTTAB "$QueryName\t$DBlist[$i]\t$Clone[$i]\t$Length[$i]\t$Expect[$A1]\t$AlignLo[$A1]\t$AlignHi[$A1]\t$SbjctLo[$A1]\t$SbjctHi[$A1]\n"; # print query lines while ($A1 < $A2) { $A2P = $A1 + $RegionConst; if ($A2P > $A2) { $A2P = $A2 } print REPORTOUT $Qprint; print REPORTSUM $Qprint; for ($k = $A1; $k < $A2P; ++$k) { if ($cDNA and $#AlignLo > 0) { $j = $AlignLoSort[$k] } else { $j = $k } printf REPORTOUT (" %7d - %7d",$AlignLo[$j],$AlignHi[$j]); printf REPORTSUM (" %7d - %7d",$AlignLo[$j],$AlignHi[$j]) } print REPORTOUT "\n"; print REPORTSUM "\n"; # print subject line $Sprint = $DBlist[$i].' '; $Sprint = substr($Sprint,0,11); print REPORTOUT $Sprint; print REPORTSUM $Sprint; for ($k = $A1; $k < $A2P; ++$k) { if ($cDNA and $#AlignLo > 0) { $j = $AlignLoSort[$k] } else { $j = $k } printf REPORTOUT (" %7d - %7d",$SbjctLo[$j],$SbjctHi[$j]); printf REPORTSUM (" %7d - %7d",$SbjctLo[$j],$SbjctHi[$j]); } print REPORTOUT "\n\n"; print REPORTSUM "\n\n"; $A1 = $A2P; } #end while print REPORTOUT "\n"; print REPORTSUM "\n"; } # end for # Print requested number of full alignments if ($CountAlign > $MaxFull) { $CountLine = $AlignFullCount; print REPORTOUT " Only first $MaxFull alignments will be printed.\n\n" }; for ($i = 0; $i < $CountLine; ++$i) { print REPORTOUT $SaveLine[$i] } # end for } # end if } # end if } #end of Align Subroutine ### Subroutine ReadTest: Read a line of blast report and determine content. ### Return value accordingly: ### String found Value Description ### ' Score' 1 Beginning of an alignment, Score & Expect ### 'Query:' 2 Query line of the alignment (1 or more per alignment) ### '>' 3 New subject sequence for alignments ### 'Lambda' 4 End of alignments for this query sequence ### ' Identities' 5 Identites line, gives % match ### 'Sbjct:' 6 Subject line of the alignment ### 'Length' 7 length of matching sequence ### EOF 0 Reached end of file, should not happen sub ReadTest { while () { $line = ; ++$CountLine; $SaveLine[$CountLine] = $line; if (length($line) < 11) { next } elsif (substr($line,1,5) eq 'Score') { $test = 1 } elsif (substr($line,0,6) eq 'Query:') { $test = 2 } elsif (substr($line,0,1) eq '>') { $test = 3 } elsif (substr($line,0,6) eq 'Lambda') { $test = 4 } elsif (substr($line,1,10) eq 'Identities') { $test = 5 } elsif (substr($line,0,6) eq 'Sbjct:') { $test = 6 } elsif ($line =~ /\bLength\b/) { $test = 7 } else {next}; @items = split(/\s+/,$line); return $test } #end while # reached end of input before expected, return 0 (error) return 0; } #end of subroutine ReadTest # ---------------------------------------------------------------------------- # ### ------------------ end of Subroutine definitions --------------------- # # ---------------------------------------------------------------------------- # ### ------------------ Begin main procedure code -------------------------- # # ---------------------------------------------------------------------------- $line = ''; $QueryLine = ''; $Directory = $MyDisk.$Folder; #directory/folder to be searched for input. # ($sec,$min,$hour,$mday,$mon,$year) = localtime(time); # Y2K bug 2000 gives year = 100. if ($year >= 100) { $year = substr($year,1,2) }; print "\n==== Begin BlastReport Parameter Selection ====\n\n"; if ($MyDisk eq "DISKNAME" or Folder eq "FOLDERNAME") { print "BlastReport must be customized before running.\n"; print "\$MyDisk and \$Folder must be modified for your computer.\n"; print "See ReadMe for customization instructions.\n"; print "\n==== BlastReport Execution Terminated ====\n\n"; exit; }; opendir DIRECTORY, $Directory or die "!!!! Can't open directory, $Directory -->$!\n"; @allfiles = readdir DIRECTORY; # print "directory: $Directory\n\n"; # foreach $file (@allfiles) { # print "$file\n"; # } # end foreach StartAgain: $BadIn = 'x'; while ($BadIn) { print "\n>Enter chromosome number of blast report to use or Q to quit. \n"; chomp ($input = ); $ChrIn = $input; #$ChrIn used in calculations, $input for error messages. if ($ChrIn =~ /^q/i) { #Does input start with a 'q' ? print "-----Blast Reporting terminated-----\n"; exit; } elsif (length($ChrIn) > 2) { print "chromosome must be 1 or 2 digits\n"; } elsif ($ChrIn !~ m/^\d{1,2}$/) { print "input not numeric: $input\n"; } else { $BadIn = ''; } #end if } #end while if ($ChrIn < 10) { $ChrIn = '0'.$ChrIn; } $BlastPat = $BlastPre.$ChrIn; print "\n$BlastPat files:\n\n"; $CountFile = 0; foreach $file (@allfiles) { if ($file =~ /^$BlastPat/){ printf ("%2d $file\n", $CountFile + 1); $FileSet [$CountFile] = $file; ++$CountFile; } } # end foreach if ($CountFile == 0) { print "No blast output for Chromosome $ChrIn\n"; goto StartAgain; } $BadIn = 'x'; while ($BadIn) { print "\n>Enter number of file to use or Q to quit: "; chomp($input = ); $Sel = $input; if (uc($Sel) eq 'Q') { print "-----Blast Reporting terminated-----\n"; exit; }; if (not ($Sel =~ m/^\d{1,}$/)) { print "input is not numeric: $input\n"; } elsif ($Sel > $CountFile or $Sel < 1) { print "input is not valid\n" } else { $BadIn = ''; } } # end while --$Sel; $BlastFile = $FileSet[$Sel]; print "\nUsing file: $BlastFile\n"; $Strt = length($BlastPat); $Len = length($BlastFile) - $Strt + 1; $ReportFile = $ReportPre.$ChrIn.substr($BlastFile,$Strt,$Len); # replace all white space characters with $Link variable (default _) $ReportFile =~ s/\s/$Link/g; # --------------------------------------------------------------------------- # File name initialization: # --------------------------------------------------------------------------- $BlastIn = $MyDisk.$Folder.$BlastFile; $ReportOut = $MyDisk.$Folder.$ReportFile; $ReportFull = $ReportOut.$Link.$Full; $ReportSum = $ReportOut.$Link.$Sum; $ReportTab = $ReportOut.$Link.$Tab; print "\nSelect type of Input:\n\n"; print " 1 Marker\n"; print " 2 cDNA\n\n"; $BadIn = 'x'; while ($BadIn) { print ">Enter number for type of file or Q to quit: "; chomp($input = ); $Sel = $input; if (uc($Sel) eq 'Q') { print "-----Blast Reporting terminated-----\n"; exit; }; if (not ($Sel =~ m/^\d{1,}$/)) { print "input is not numeric: $input\n"; } elsif ($Sel > 2 or $Sel < 1) { print "input is not valid\n" } else { $BadIn = ''; } } # end while #Set parameters that differ for marker and cDNA type data. if ($Sel == 1) { $cDNA = ''; $DBseqPrint = $MarkerDBseq; $RegionPrint = $MarkerRegion; $MaxAlign = $MarkerAlign; $MaxFull = $MarkerMaxFull; print "\nType of file selected: Marker\n\n"; } else { $cDNA ='T'; $DBseqPrint = $cDNADBseq; $RegionPrint = $cDNARegion; $MaxAlign = $cDNAAlign; $MaxFull = $cDNAMaxFull; print "\nType of file selected: cDNA\n\n"; } print "\n==== Begin BlastReport Execution ====\n\n"; # Open the Blast query output file, keep reading lines until 'Query=' is found open (BLASTIN, "$BlastIn") or die "\n!!!! Can't open BLASTIN file: $BlastIn\n-->$!\n"; print "INFO: BLASTIN opened successfully:\n"; print " $BlastIn\n"; # Open ReportOut file . open (REPORTOUT, ">$ReportFull") or die "\n\n!!!! Can't open output file for REPORTOUT: $ReportFull\n-->$!\n"; # Open ReportSum file . open (REPORTSUM, ">$ReportSum") or die "\n\n!!!! Can't open output file for REPORTSUM: $ReportSum\n-->$!\n"; # Open ReportTab file . open (REPORTTAB, ">$ReportTab") or die "\n\n!!!! Can't open output file for REPORTTAB: $ReportTab\n-->$!\n"; $CountQ = 0; #number of query sequences for this report. $CountA = 0; #number of queries that have alignments. $CountS = 0; #number of alignments that may be significant $CountNoA = 0; #count of queries without alignments, index to array @SaveNoA. @SaveNoA = (); #Array to hold queries that have no alignments. # Look for a line that starts with 'Query=' while (defined($line = )) { if (length($line) < 6) {next}; if (substr ($line,0,6) eq 'Query=') { print REPORTOUT " ------------------ Blast Report Full -----------------\n"; printf REPORTOUT (" Date: %2.2d/%2.2d/%2.2d ------------------------ Time: %2d:%2.2d:%2.2d\n\n",$mon +1,$mday,$year,$hour,$min,$sec); print REPORTOUT "Input Report => $BlastIn\n"; print REPORTOUT "Name of this Report => $ReportFull\n\n"; print REPORTOUT "The following query sequences had alignments, analysis of alignments follows each query.\n\n"; print REPORTSUM " -------------------- Blast Report Summary --------------------\n"; printf REPORTSUM (" Date: %2.2d/%2.2d/%2.2d ------------------------ Time: %2d:%2.2d:%2.2d\n\n",$mon +1,$mday,$year,$hour,$min,$sec); print REPORTSUM "Input Report => $BlastIn\n"; print REPORTSUM "Name of this Report => $ReportSum\n\n"; print REPORTSUM "The following alignments may be significant.\n\n"; print REPORTTAB "Locus\tSequence\tClone\tLength\tExpect\tLalign1\tLalign2\tSalign1\tSalign2\n"; last; } # end if } # end while # process the rest of the report. while () { if (substr ($line,0,6) eq 'Query=') { ++$CountQ; $QueryLine = $line; # save line with query information ($T1, $QueryName) = split(/:|\||\s+/, $line); if (! defined($line = )) { last;} # Save the next 2 lines to be diplayed with the query, one of these contains # length information for the query. $QueryLine2 = $line; # save line with length information. if (! defined($line = )) { last;} $QueryLine3 = $line; # save line with length information. } #end if if (! defined($line = )) { last;} if (length($line) < 13) {next}; if (substr ($line,0,9) eq 'Searching') { # Check to see if there are any alignments for this query. # Process the alignments by calling Align. if (! defined($line = )) { last;} #skip line if (! defined($line = )) { last;} if (length($line) < 15 || $line =~ /Score/) { print REPORTOUT " ----------------------=== $QueryName ===-------------------\n\n"; print REPORTOUT $QueryLine; print REPORTOUT $QueryLine2; if (length($QueryLine3) > 1) { print REPORTOUT $QueryLine3; }; ++$CountA; Align; print REPORTOUT "\n"; } else { $SaveNoA [$CountNoA] = $QueryLine; ++$CountNoA; } } } # end while close BLASTIN; #print list of queries with no alignments. print REPORTOUT "\nThe following query sequences had no alignments:\n\n"; for ($i = 0;$i< $CountNoA; ++$i){ print REPORTOUT $SaveNoA[$i]; } #end for print REPORTOUT "\n ------------------- End of Report ---------------------\n"; close REPORTOUT; print REPORTSUM " Number of Query Sequences in the run: $CountQ\n"; print REPORTSUM " Number of Query Sequences with alignments: $CountA\n"; print REPORTSUM " Sequences that may have significant alignments: $CountS\n"; print REPORTSUM "\n ------------------- End of Report ---------------------\n"; close REPORTSUM; print "INFO: Number of Query Sequences in the run: $CountQ\n"; print "INFO: Number of Query Sequences with alignments: $CountA\n"; print "INFO: Sequences that may have significant alignments: $CountS\n"; print "INFO: Full Report: $ReportFull\n"; print "INFO: Summary: $ReportSum\n"; print "INFO: Excel Table: $ReportTab\n\n"; print "\n==== BlastReport Execution Completed ====\n\n";