#!/usr/bin/perl #compare_library.pl #version 1.0 #Written by Paul Stothard, Canadian Bioinformatics Help Desk. #compare_library.pl - this script accepts two files (i and j) containing multiple DNA sequences #in FASTA format. Each sequence in file i is compared using local BLAST (bl2seq) to each #sequence in file j, and an HTML table is generated to display a summary of the findings. #The table contains one row for each sequence in file i that has at least one hit #and lists all the sequences in file j that yielded a significant match. Links in the summary #table allow the complete BLAST results to be accessed from the the summary page. The results #are viewed by opening the index.html file that is generated into a web browser. # #There are three command line options: #-i input file one #-j input file two #-o output directory # #example usage: #perl compare_library.pl -i sample_seqs_1.txt -j sample_seqs_2.txt -o results_directory # #Note that this script requires the BLAST program bl2seq to be installed locally. #The variable $BLAST2SEQ_PATH will need to be set to the bl2seq path on your system. # #Note that the effective database size used for calculating statistics (bitscore and expect value) #is set to 10,351,892,538 (The size of the nr database on 2004-04-05). To change this value, #edit the variable $EFFECTIVE_SEARCH_SPACE. # #Note that this script is suitable for comparing small sequence collections (around 100 sequences #per collection). For larger collections use a script that converts one file to a sequence #database using formatdb. # #Note that if the sequence titles of two sequences match, they are assumed to represent #the same sequence and are not compared. use warnings; use strict; #Edit this path to point to bl2seq program my $BLAST2SEQ_PATH = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; #these temp files will be written in the output directory my $TEMP_FILE_ONE = "temp_one"; my $TEMP_FILE_TWO = "temp_two"; #BLAST settings my $PROGRAM = "blastn"; my $EFFECTIVE_SEARCH_SPACE = 10351892538; my $MIN_LENGTH = 20; #Command line processing. use Getopt::Long; my $now = gmtime; my $inputFileOne; my $inputFileTwo; my $outputDirectory; Getopt::Long::Configure ('bundling'); GetOptions ('i|input_file_one=s' => \$inputFileOne, 'j|input_file_two=s' => \$inputFileTwo, 'o|output_directory=s' => \$outputDirectory); if(!defined($inputFileOne)) { die ("Usage: compare_library.pl -i -j -o \n"); } if(!defined($inputFileTwo)) { die ("Usage: compare_library.pl -i -j -o \n"); } if(!defined($outputDirectory)) { die ("Usage: compare_library.pl -i -j -o \n"); } #create directory if it doesn't exist if (!(-d $outputDirectory)) { mkdir ($outputDirectory, 0777); } #check for index.html file if (-e $outputDirectory . "/" . "index" . ".html") { die ("Please empty the directory $outputDirectory, or use a new output directory."); } my $htmlHeader = "\n\n\nBLAST results for sequences in $inputFileOne vs sequences in $inputFileTwo\n\n\n\n"; my $htmlFooter = "\n\n"; open (INDEXFILE, ">>" . $outputDirectory . "/" . "index" . ".html") or die ("Cannot open file for output: $!"); print (INDEXFILE $htmlHeader); print (INDEXFILE "Results for BLAST comparison of sequences performed on $now
\n"); print (INDEXFILE "Sequence file one: $inputFileOne
\n"); print (INDEXFILE "Sequence file two: $inputFileTwo

\n"); print (INDEXFILE "Each sequence in sequence file one was BLASTed against each sequence in sequence file two.
\n"); print (INDEXFILE "Click on the sequence hit titles to view the BLAST results.
\n"); print (INDEXFILE "\n"); print (INDEXFILE "\n"); print (INDEXFILE "\n"); close (INDEXFILE); #check for 1.html file if (-e $outputDirectory . "/" . "1.html") { die ("Please empty the directory $outputDirectory, or use a new output directory."); } #read each record from the input file open (INFILE_ONE, $inputFileOne) or die( "Cannot open file : $!" ); $/ = ">"; my $seqCountOne = 1; my $sequenceCountTwo = 1; #read each sequence. while (my $sequenceRecordOne = ) { if ($sequenceRecordOne eq ">"){ next; } my $sequenceTitleOne = ""; if ($sequenceRecordOne =~ m/^([^\n]+)/){ $sequenceTitleOne = $1; } else { die ("No sequence title was found for sequence $seqCountOne in file $inputFileOne"); } $sequenceRecordOne =~ s/^[^\n]+//; $sequenceRecordOne =~ s/[^A-Za-z]//g; print "searching for hits to $sequenceTitleOne.\n"; #if sequence is too short, skip my $sequenceOneLength = length($sequenceRecordOne); if ($sequenceOneLength < $MIN_LENGTH) { $seqCountOne = $seqCountOne + 1; next; } #write the first sequence and title to $TEMP_FILE_ONE open (TEMP_FILE_ONE, ">" . $outputDirectory . "/" . $TEMP_FILE_ONE) or die ("Cannot open file for output: $!"); print (TEMP_FILE_ONE ">" . $sequenceTitleOne . "\n" . $sequenceRecordOne . "\n"); close (TEMP_FILE_ONE); #now go through sequences in second file my $seqCountTwo = 1; my $firstMatch = 1; my $sepChar = ""; my $hitFound = 0; my $noHitsForSequenceOne = 1; my $hitList = ""; open (INFILE_TWO, $inputFileTwo) or die( "Cannot open file : $!" ); while (my $sequenceRecordTwo = ) { if ($sequenceRecordTwo eq ">"){ next; } my $sequenceTitleTwo = ""; if ($sequenceRecordTwo =~ m/^([^\n]+)/){ $sequenceTitleTwo = $1; } else { die ("No sequence title was found for sequence $seqCountTwo in file $inputFileTwo"); } $sequenceRecordTwo =~ s/^[^\n]+//; $sequenceRecordTwo =~ s/[^A-Za-z]//g; #skip sequence if it is too short my $sequenceTwoLength = length($sequenceRecordTwo); if ($sequenceTwoLength < $MIN_LENGTH) { $seqCountTwo = $seqCountTwo + 1; next; } #skip sequence if it has same name if ($sequenceTitleTwo eq $sequenceTitleOne) { $seqCountTwo = $seqCountTwo + 1; next; } #write the second sequence and title to $TEMP_FILE_TWO open (TEMP_FILE_TWO, ">" . $outputDirectory . "/" . $TEMP_FILE_TWO) or die ("Cannot open file for output: $!"); print (TEMP_FILE_TWO ">" . $sequenceTitleTwo . "\n" . $sequenceRecordTwo . "\n"); close (TEMP_FILE_TWO); #submit both to blast #could redirect STDERR using "2>/dev/null" my $blast_command = $BLAST2SEQ_PATH . "bl2seq" . " -p " . $PROGRAM . " -i " . $outputDirectory . "/" . $TEMP_FILE_ONE . " -j " . $outputDirectory . "/" . $TEMP_FILE_TWO . " -Y " . $EFFECTIVE_SEARCH_SPACE . " -d " . $EFFECTIVE_SEARCH_SPACE; my $result = `$blast_command` or die ("Could not execute BLAST. Do you need to edit the \$BLAST2SEQ_PATH variable?"); if ($result =~ m/^\sScore\s=\s\s/m) { $hitFound = 1; $noHitsForSequenceOne = 0; } elsif ($result =~ m/^Number of Hits to DB:/m) { $hitFound = 0; } else { #the result cannot be parsed. print "The BLAST results could not be parsed properly for sequence $sequenceTitleOne from $inputFileOne vs sequence $sequenceTitleTwo from $inputFileTwo. Skipping this sequence.\n"; $seqCountTwo = $seqCountTwo + 1; next; } #if there is a match if ($hitFound) { $hitFound = 0; my $resultHtmlHeader = "\n\n\nBLAST results for $sequenceTitleOne from $inputFileOne vs $sequenceTitleTwo from $inputFileTwo\n\n\n\n"; $result = $resultHtmlHeader . "BLAST results for $sequenceTitleOne from $inputFileOne vs $sequenceTitleTwo from $inputFileTwo
\n" . "
\n" . $result . "
\n" . $htmlFooter; #get result if (-e $outputDirectory . "/" . $seqCountOne . "_" . $seqCountTwo . ".html") { die ("Please empty the directory $outputDirectory, or use a new output directory."); } $hitList = $hitList . "$sepChar" . $sequenceTitleTwo . ""; #now write result to $outputDirectory open(OUTFILE, ">" . $outputDirectory . "/" . $seqCountOne . "_" . $seqCountTwo . ".html") or die ("Cannot open file for output: $!"); print (OUTFILE $result); close (OUTFILE); $sepChar = ", "; } $seqCountTwo = $seqCountTwo + 1; $sequenceCountTwo = $seqCountTwo; }#end of sequenceTwo while loop #if there were one or more hits, add entry to index.html if (!($noHitsForSequenceOne)) { #write a column in the output table giving the $sequenceTitleOne and the $hitList open (INDEXFILE, ">>" . $outputDirectory . "/" . "index" . ".html") or die ("Cannot open file for output: $!"); print (INDEXFILE "
"); print (INDEXFILE "\n"); close (INDEXFILE); } $seqCountOne = $seqCountOne + 1; }#end of sequenceOne while loop close (INFILE_ONE) or die( "Cannot close file : $!"); #complete index.html open (INDEXFILE, ">>" . $outputDirectory . "/" . "index" . ".html") or die ("Cannot open file for output: $!"); print (INDEXFILE "\n
BLAST query from $inputFileOneBLAST hits from $inputFileTwo
$sequenceTitleOne$hitList.
\n"); print (INDEXFILE "$htmlFooter"); print (INDEXFILE "
\n"); print (INDEXFILE "A total of $seqCountOne sequences from $inputFileOne were compared with $sequenceCountTwo sequences from $inputFileTwo\n"); close (INDEXFILE); #remove temp files unlink($outputDirectory . "/" . $TEMP_FILE_ONE); unlink($outputDirectory . "/" . $TEMP_FILE_TWO);