#!/usr/bin/perl
#remote_blast_client.pl
#Version 4.6
#
#Written by Paul Stothard, University of Alberta.
#stothard@ualberta.ca
#
#COMMAND OPTIONS:
#This script performs BLAST searches against NCBI's sequence
#databases. It prompts the user for a BLAST search type and an input
#file of FASTA formatted sequences. An optional 'limit by Entrez query'
#value can be supplied to restrict the search. The script then submits
#each sequence to BLAST and retrieves the results. For each of the hits
#the script retrieves a detailed title by performing a separate query
#of NCBI's databases. Each BLAST hit and its descriptive title are
#written to a single tab-delimited output file.
#
#To run this script, make sure Perl is installed on your system, and
#enter:
#
#perl remote_blast_client.pl
#
#or use command line parameters to specify the type of search you would
#like to perform.
#
#There are four required parameters (the script prompts for any that
#are missing):
#
#-i - Input file containing multiple fasta sequences. [File].
#
#-o - Output file to create. [File].
#
#-d - Database to search. [String].
#
#-b - BLAST program (blastn, blastp, blastx, tblastn,
#tblastx). [String].
#
#Optional parameters:
#
#-e - Entrez query to limit results. [String].
#
#-h - Number of hsps to keep. [Integer]. Default is to keep all hsps.
#
#-l - Whether to filter the query sequence. [F/L/R/LR]. Default is
#F. Acceptable values are 'F' for no filter, 'L' for low complexity,
#'R' for human repeats, or 'LR' for both.
#
#-a - Minimum alignment length to keep. [Integer]. Default is to keep
#all alignments.
#
#-p - Minimum alignment length to keep, expressed as a proportion of
#the query sequence length. [Real]. Overrides -a. Default is to keep
#all alignments.
#
#-s - Minimum alignment score to keep. [Integer]. Default is to keep
#all alignments.
#
#-n - Minimum alignment identity to keep. [Real]. Default is to keep
#all alignments.
#
#-x - Expect value to supply to the BLAST program. [Real]. Default is
#10.0.
#
#-t - Number of hits to keep. [Integer]. Default is 5.
#
#-f - Whether to fetch sequence descriptions using Entrez. [T/F].
#Default is T.
#
#-Q - The genetic code to use for translated BLAST searches. [Integer].
#Default is 1.
#
#-accession - Whether to attempt to replace the hit ID (the gi number)
#with an accession number. Requires that "-f" be set to
#"T". [T/F]. Default is F.
#
#example usage:
#
#perl remote_blast_client.pl -i my_seqs.fasta -o blast_results.txt -b blastn -e bacteria[Organism] -d nr
#
#TO ENSURE THAT ACCESSION NUMBERS ARE ALSO DOWNLOADED, set the -f and -accession flags to "T"

use warnings;
use strict;
use Getopt::Long;
use LWP::UserAgent;
use HTTP::Request::Common;

#Run-time configuration. Entries left undef are filled in from the
#command line, from interactive prompts, or by _setDefaults().
my %settings = (
    PROGRAM                      => undef,
    DATABASE                     => undef,
    EXPECT                       => 10,
    WORD_SIZE                    => undef,
    HITLIST_SIZE                 => 5,
    FILTER                       => "F",
    OUTPUTFILE                   => undef,
    INPUTFILE                    => undef,
    INPUTTYPE                    => undef,
    ENTREZ_DB                    => undef,
    ENTREZ_QUERY                 => undef,
    ALIGN_TYPE                   => undef,
    BLAST_URL                    => "http://www.ncbi.nlm.nih.gov/blast/Blast.cgi",
    ENTREZ_URL                   => "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?",
    HSP_MAX                      => undef,
    ERROR_RETRY                  => 5,
    MIN_ALIGN_LENGTH             => undef,
    MIN_ALIGN_PROP               => undef,
    MIN_SCORE                    => undef,
    MIN_IDENTITY                 => undef,
    FETCH_DESC                   => 'T',
    QUERY_GENETIC_CODE           => 1,
    BROWSER                      => undef,
    MAX_BYTES_RESPONSE           => 5000000,
    COMPOSITION_BASED_STATISTICS => undef,
    ACCESSION                    => 'F'
);

my $blastType = undef;

GetOptions(
    'i|input_file=s'         => \$settings{INPUTFILE},
    'o|output_file=s'        => \$settings{OUTPUTFILE},
    'b|blast_program=s'      => \$blastType,
    'e|entrez_query=s'       => \$settings{ENTREZ_QUERY},
    'd|database=s'           => \$settings{DATABASE},
    'h|hsps=i'               => \$settings{HSP_MAX},
    'l|filter=s'             => \$settings{FILTER},
    'a|min_align_length=i'   => \$settings{MIN_ALIGN_LENGTH},
    'p|min_align_prop=f'     => \$settings{MIN_ALIGN_PROP},
    's|min_score=i'          => \$settings{MIN_SCORE},
    'n|min_identity=f'       => \$settings{MIN_IDENTITY},
    'x|expect=f'             => \$settings{EXPECT},
    't|hit_list_size=i'      => \$settings{HITLIST_SIZE},
    'f|fetch_description=s'  => \$settings{FETCH_DESC},
    'Q|query_genetic_code=i' => \$settings{QUERY_GENETIC_CODE},
    'accession=s'            => \$settings{ACCESSION}
);

my $promptRule = "------------------------------------------------------------\n";

#Interactive fallback: ask for the BLAST type when -b was not given.
if ( !( defined($blastType) ) ) {
    print $promptRule;
    print "Please enter a number to indicated the type of BLAST search\n";
    print "you want to perform:.\n";
    print "1 - Nucleotide-nucleotide BLAST (blastn).\n";
    print "2 - Protein-protein BLAST (blastp).\n";
    print "3 - Translated query vs protein database (blastx).\n";
    print "4 - Protein query vs translated database (tblastn).\n";
    print "5 - Translated query vs. translated database (tblastx).\n";
    print $promptRule;
    $blastType = _promptLine();

    #Only the first digit of the answer is used.
    if ( $blastType =~ m/(\d)/ ) {
        $blastType = $1;
    }
    else {
        die("Please enter a digit between 1 and 5.\n");
    }
    if ( ( $blastType < 1 ) || ( $blastType > 5 ) ) {
        die("Please enter a digit between 1 and 5.\n");
    }
}

if ( !( defined( $settings{DATABASE} ) ) ) {
    print $promptRule;
    print "Please enter the name of the database you wish to search.\n";
    print $promptRule;
    $settings{DATABASE} = _promptLine();
}

#The Entrez prompt is only shown in fully interactive runs (no -i).
if (   ( !( defined( $settings{INPUTFILE} ) ) )
    && ( !( defined( $settings{ENTREZ_QUERY} ) ) ) )
{
    print $promptRule;
    print "Enter an entrez query or press enter to search all sequences.\n";
    print "For example, the term 'bacteria[Organism]' restricts the\n";
    print "search to bacteria sequences only.\n";
    print $promptRule;
    $settings{ENTREZ_QUERY} = _promptLine();
}

#Fills in PROGRAM, WORD_SIZE, INPUTTYPE, ENTREZ_DB and ALIGN_TYPE, or dies
#when the BLAST type is not recognized.
_setDefaults( $blastType, \%settings );

if ( !( defined( $settings{INPUTFILE} ) ) ) {
    print $promptRule;
    print "Enter the name of the FASTA format "
        . $settings{INPUTTYPE}
        . " sequence file\n";
    print "that contains your query sequences.\n";
    print $promptRule;
    $settings{INPUTFILE} = _promptLine();
}

#Default output name: the input name up to its first '.', plus
#"_results.tab". A name that starts with '.' is used whole.
my $inputLessExtentions = $settings{INPUTFILE};
if ( $settings{INPUTFILE} =~ m/(^[^\.]+)/ ) {
    $inputLessExtentions = $1;
}

if ( !( defined( $settings{OUTPUTFILE} ) ) ) {
    $settings{OUTPUTFILE} = $inputLessExtentions . "_" . "results.tab";
    print $promptRule;
    print "The results of this "
        . $settings{PROGRAM}
        . " search will be written to\n";
    print "a file called " . $settings{OUTPUTFILE} . ".\n";
    print "Start the search? (y or n) y\n";
    print $promptRule;
    my $continue = _promptLine();

    #Any answer containing 'n' cancels; everything else continues.
    if ( $continue =~ m/n/i ) {
        exit(0);
    }
}

#Normalise numeric options; values without digits become undef
#(meaning "no limit").
$settings{HITLIST_SIZE}     = _get_integer( $settings{HITLIST_SIZE} );
$settings{HSP_MAX}          = _get_integer( $settings{HSP_MAX} );
$settings{MIN_ALIGN_LENGTH} = _get_integer( $settings{MIN_ALIGN_LENGTH} );
$settings{MIN_ALIGN_PROP}   = _get_real( $settings{MIN_ALIGN_PROP} );
$settings{MIN_SCORE}        = _get_integer( $settings{MIN_SCORE} );
$settings{MIN_IDENTITY}     = _get_real( $settings{MIN_IDENTITY} );
$settings{EXPECT}           = _get_real( $settings{EXPECT} );

$settings{BROWSER} = LWP::UserAgent->new();
$settings{BROWSER}->timeout(30);
$settings{BROWSER}->max_size( $settings{MAX_BYTES_RESPONSE} );

#Column order of the tabular BLAST rows for each alignment type. The
#same lists drive row parsing and the order of the output columns.
my %fieldsFor = (
    nucleotide => [
        qw(query_id match_id identity alignment_length mismatches
            gap_opens q_start q_end s_start s_end evalue bit_score)
    ],
    protein => [
        qw(query_id match_id identity positives alignment_length mismatches
            gap_opens q_start q_end s_start s_end evalue bit_score)
    ],
    translated => [
        qw(query_id match_id identity positives query_sbjct_frames
            alignment_length mismatches gap_opens q_start q_end s_start
            s_end evalue bit_score)
    ]
);

#Column heading line written to the output file for each alignment type.
my %headerFor = (
    nucleotide =>
        "query_id\tmatch_id\tmatch_description\t%_identity\talignment_length\tmismatches\tgap_openings\tq_start\tq_end\ts_start\ts_end\tevalue\tbit_score\n",
    protein =>
        "query_id\tmatch_id\tmatch_description\t%_identity\t%_positives\talignment_length\tmismatches\tgap_opens\tq_start\tq_end\ts_start\ts_end\tevalue\tbit_score\n",
    translated =>
        "query_id\tmatch_id\tmatch_description\t%_identity\t%_positives\tquery/sbjct_frames\talignment_length\tmismatches\tgap_opens\tq_start\tq_end\ts_start\ts_end\tevalue\tbit_score\n"
);

my @hspFields  = @{ $fieldsFor{ $settings{ALIGN_TYPE} } };
my $rowPattern = _buildRowPattern( scalar(@hspFields) );

#Create (truncate) the output file and write the commented header.
my $outputRule =
    "#-------------------------------------------------------------------------------------------------------------------------------------------------\n";

open( my $headerFh, ">", $settings{OUTPUTFILE} )
    or die("Cannot open file : $!");
print( $headerFh $outputRule );
print( $headerFh "#Results of automated BLAST query of performed on "
        . _getTime()
        . ".\n" );
print( $headerFh
        "#Searches performed using remote_blast_client.pl, written by Paul Stothard, stothard\@ualberta.ca.\n"
);
print( $headerFh "#The following settings were specified:\n" );
foreach my $key ( keys(%settings) ) {
    if ( defined( $settings{$key} ) ) {
        print( $headerFh "#" . $key . "=" . $settings{$key} . "\n" );
    }
}
print( $headerFh "#The following attributes are separated by tabs\n" );
print( $headerFh $outputRule );
if ( exists( $headerFor{ $settings{ALIGN_TYPE} } ) ) {
    print( $headerFh $headerFor{ $settings{ALIGN_TYPE} } );
}
close($headerFh) or die("Cannot close file : $!");

my $seqCount = 0;

#Read the FASTA file one record at a time: each read ends at the next
#'>'. Set only after all STDIN prompts so chomp() above still strips
#newlines.
local $/ = ">";

#Three-argument open so that characters in a user-supplied file name
#are never interpreted as an open mode or a pipe.
open( my $seqFh, "<", $settings{INPUTFILE} )
    or die("Cannot open file : $!");

while ( my $sequenceEntry = <$seqFh> ) {
    $seqCount++;

    #The text before the first '>' is just the separator itself.
    if ( $sequenceEntry eq ">" ) {
        next;
    }

    my %query = ();

    #The title is the first line of the record.
    my $title = "No title available";
    if ( $sequenceEntry =~ m/^([^\n\cM]+)/ ) {
        $title = $1;
    }

    #The sequence is everything after the title line, letters only.
    my $sequence = $sequenceEntry;
    $sequence =~ s/^[^\n\cM]+//;
    $sequence =~ s/[^A-Z]//ig;
    if ( !( $sequence =~ m/[A-Z]/i ) ) {
        next;
    }

    $query{title}    = $title;
    $query{sequence} = $sequence;

    #-p overrides -a: derive the minimum alignment length from the
    #length of this query.
    if ( defined( $settings{MIN_ALIGN_PROP} ) ) {
        my $queryLength = length( $query{sequence} );
        my $minLength   = $settings{MIN_ALIGN_PROP} * $queryLength;

        #Translated searches report alignment length in residues. Only
        #DNA queries (blastx, tblastx) need the nucleotide length
        #converted; a tblastn query is already protein.
        if (   ( $settings{ALIGN_TYPE} eq "translated" )
            && ( $settings{INPUTTYPE} eq "DNA" ) )
        {
            $minLength = $minLength / 3;
        }
        $settings{MIN_ALIGN_LENGTH} = sprintf( "%.0f", $minLength );
    }

    #Step 1: submit the query and obtain a request ID (RID) and the
    #estimated time to completion in seconds (RTOE).
    my $RIDsuccess  = 0;
    my $RIDattempts = 0;
    my $RID         = undef;
    my $RTOE        = undef;
    my $RIDresponse = undef;

    while (( !($RIDsuccess) )
        && ( $RIDattempts < $settings{ERROR_RETRY} ) )
    {
        $RIDresponse = _getRID( \%settings, \%query );
        my $success = $RIDresponse->is_success();
        if ($success) {
            my $result = $RIDresponse->as_string();
            if ( $result
                =~ m/QBlastInfoBegin\s*RID\s=\s([^\s]+)\s*RTOE\s=\s(\d+)\s*QBlastInfoEnd/
                )
            {
                $RID  = $1;
                $RTOE = $2;
            }
            else {
                print
                    "Error: Could not parse RID and RTOE from response $result.\n";
            }
        }
        else {
            print
                "Error: Could not get response when requesting RID and RTOE for sequence "
                . $query{title} . "\n";
        }

        if ( ( defined($RID) ) && ( defined($RTOE) ) ) {
            $RIDsuccess = 1;
            print "RID received for "
                . $query{title}
                . ". The RTOE for this sequence is $RTOE seconds.\n";
        }
        else {
            $RIDattempts++;
            sleep(60);
        }
    }

    if ( !($RIDsuccess) ) {
        print "Error1: "
            . $query{title}
            . " failed at RID and RTOE requst.\n";
        _appendToOutput( $settings{OUTPUTFILE},
            "Error: " . $query{title} . " failed at RID and RTOE request.\n" );
        next;
    }

    print "Sleeping for $RTOE seconds.\n";
    sleep($RTOE);

    #Step 2: poll for the results. "Not ready" answers do not count as
    #failed attempts; only transport errors do.
    my $BLASTsuccess  = 0;
    my $BLASTattempts = 0;
    my $BLASTresponse = undef;
    my $BLASTresult   = undef;

    while (( !($BLASTsuccess) )
        && ( $BLASTattempts < $settings{ERROR_RETRY} ) )
    {
        $BLASTresponse = _getBLAST( \%settings, $RID );
        my $success = $BLASTresponse->is_success();
        if ($success) {
            my $result = $BLASTresponse->as_string();

            #Accepts the status cells with or without an opening <td>
            #tag between them.
            if ( $result
                =~ m/status<\/td>\s*(?:<td[^>]*>)?\s*searching<\/td>/i )
            {
                print "The results are not ready for sequence "
                    . $query{title} . ".\n";
                print "Pausing before requesting again.\n";
                sleep(20);
                next;
            }
            elsif ( $result
                =~ m/QBlastInfoBegin\s*Status=WAITING\s*QBlastInfoEnd/ )
            {
                print "The results are not ready for sequence "
                    . $query{title} . ".\n";
                print "Pausing before requesting again.\n";
                sleep(20);
                next;
            }
            else {
                print "The results were received for sequence "
                    . $query{title} . ".\n";
                $BLASTresult = $result;
            }
        }
        else {
            print
                "Error: Could not get response when requesting status for sequence "
                . $query{title} . "\n";
        }

        if ( defined($BLASTresult) ) {
            $BLASTsuccess = 1;
        }
        else {
            $BLASTattempts++;
            sleep(60);
        }
    }

    if ( !($BLASTsuccess) ) {
        print "Error: "
            . $query{title}
            . " failed at BLAST results request.\n";
        _appendToOutput( $settings{OUTPUTFILE},
            "Error: "
                . $query{title}
                . " failed at BLAST results request.\n" );
        next;
    }

    #Step 3: parse the tabular rows, filter them, and write each kept
    #HSP to the output file.
    my $hitFound      = 0;
    my $hspCount      = 0;
    my $hitCount      = 0;
    my $hitMaxReached = 0;
    my $lastHitTitle  = undef;

    foreach my $line ( split( /\n/, $BLASTresult ) ) {
        if ( $line =~ m/^\#\sFields:/ ) {
            next;
        }

        #Only lines with exactly the expected number of tab-separated
        #fields are treated as result rows.
        my @captures = ( $line =~ $rowPattern );
        if ( !(@captures) ) {
            next;
        }

        my %hsp = (
            query_id           => undef,
            match_id           => undef,
            match_description  => undef,
            identity           => undef,
            positives          => undef,
            query_sbjct_frames => undef,
            alignment_length   => undef,
            mismatches         => undef,
            gap_opens          => undef,
            q_start            => undef,
            q_end              => undef,
            s_start            => undef,
            s_end              => undef,
            evalue             => undef,
            bit_score          => undef,
            uid                => undef,
            accession          => undef,
            id_to_return       => undef
        );
        @hsp{@hspFields} = @captures;

        #Consecutive rows with the same subject ID are HSPs of one hit.
        if (   ( defined($lastHitTitle) )
            && ( $lastHitTitle eq $hsp{match_id} ) )
        {
            $hspCount++;
        }
        else {
            $hspCount = 1;
            $hitCount++;
            $lastHitTitle = $hsp{match_id};
        }

        if (   ( defined( $settings{HSP_MAX} ) )
            && ( $hspCount > $settings{HSP_MAX} ) )
        {
            print "Skipping HSP because more than "
                . $settings{HSP_MAX}
                . " HSPs have already been obtained.\n";
            next;
        }

        #NOTE(review): the flag is raised on the first HSP of the last
        #allowed hit, so parsing stops after the next HSP that is
        #written. Kept as in the original.
        if (   ( defined( $settings{HITLIST_SIZE} ) )
            && ( $hitCount == $settings{HITLIST_SIZE} ) )
        {
            $hitMaxReached = 1;
        }

        if ( defined( $settings{MIN_ALIGN_LENGTH} ) ) {
            if ( $hsp{alignment_length} < $settings{MIN_ALIGN_LENGTH} ) {
                print "Skipping hit because alignment length is less than "
                    . $settings{MIN_ALIGN_LENGTH} . ".\n";
                next;
            }
        }

        if ( defined( $settings{MIN_SCORE} ) ) {
            if ( $hsp{bit_score} < $settings{MIN_SCORE} ) {
                print "Skipping hit because score is less than "
                    . $settings{MIN_SCORE} . ".\n";
                next;
            }
        }

        if ( defined( $settings{MIN_IDENTITY} ) ) {
            if ( $hsp{identity} < $settings{MIN_IDENTITY} ) {
                print "Skipping hit because identity is less than "
                    . $settings{MIN_IDENTITY} . ".\n";
                next;
            }
        }

        #this is to return a single gi number
        if ( $hsp{match_id} =~ m/(ref|gi)\|(\d+)/ ) {
            $hsp{uid}      = $2;
            $hsp{match_id} = $1 . "|" . $2;
        }

        $hsp{match_description} = "-";

        if ( $settings{FETCH_DESC} =~ m/t/i ) {
            if ( !( defined( $hsp{uid} ) ) ) {
                $hsp{match_description} = "No identifier available";
            }
            else {
                my $ENTREZsuccess  = 0;
                my $ENTREZattempts = 0;
                my $ENTREZresponse = undef;
                my $ENTREZresult   = undef;

                while (( !($ENTREZsuccess) )
                    && ( $ENTREZattempts < $settings{ERROR_RETRY} ) )
                {
                    $ENTREZresponse = _getENTREZ( \%settings, $hsp{uid} );
                    my $success = $ENTREZresponse->is_success();
                    if ($success) {
                        my $result = $ENTREZresponse->as_string();

                        #The description is the "Title" item of the
                        #ESummary document.
                        if ( $result
                            =~ m/<Item\s+Name="Title"[^>]*>(.*?)<\/Item>/i )
                        {
                            $ENTREZresult = $1;
                        }
                        else {
                            print
                                "Error: Could not parse Entrez information from response $result.\n";
                        }

                        if ( $settings{ACCESSION} =~ m/t/i ) {

                            #Candidate items in the ESummary document:
                            #<Item Name="Caption" Type="String">AM434294</Item>
                            #or
                            #<Item Name="Extra" Type="String">gi|123673855|emb|AM434294.1|[123673855]</Item>
                            #The "Caption" item (bare accession) is used.
                            if ( $result
                                =~ m/<Item\s+Name="Caption"[^>]*>(.*?)<\/Item>/i
                                )
                            {
                                $hsp{accession} = $1;
                            }
                            else {
                                print
                                    "Error: Could not parse accession information from response $result.\n";
                            }
                        }
                    }
                    else {
                        print
                            "Error: Could not get response when requesting Entrez information for gi "
                            . $hsp{uid} . "\n";
                    }

                    #Success depends on the description only; a missing
                    #accession is reported but not retried.
                    if ( defined($ENTREZresult) ) {
                        $ENTREZsuccess = 1;
                        print "ENTREZ results received for "
                            . $hsp{uid} . "\n";
                    }
                    else {
                        $ENTREZattempts++;
                        sleep(60);
                    }
                }

                if ( !($ENTREZsuccess) ) {
                    $hsp{match_description}
                        = "Unable to obtain description from ENTREZ";
                }
                else {
                    $hsp{match_description} = $ENTREZresult;
                }
            }
        }

        print "Writing HSP to file.\n";

        #The match_id column holds the accession when one was requested
        #and found, otherwise the hit ID from the BLAST row.
        if (   ( $settings{ACCESSION} =~ m/t/i )
            && ( defined( $hsp{accession} ) ) )
        {
            $hsp{id_to_return} = $hsp{accession};
        }
        else {
            $hsp{id_to_return} = $hsp{match_id};
        }

        #Output columns: query, ID, description, then the remaining
        #BLAST fields in their original order.
        my @columns = (
            $hsp{query_id}, $hsp{id_to_return}, $hsp{match_description},
            @hsp{ @hspFields[ 2 .. $#hspFields ] }
        );
        _appendToOutput( $settings{OUTPUTFILE},
            join( "\t", @columns ) . "\n" );
        $hitFound = 1;

        if ($hitMaxReached) {
            print "Skipping remaining hits because "
                . $settings{HITLIST_SIZE}
                . " hits have already been obtained.\n";
            last;
        }
    }

    if ( !($hitFound) ) {
        _appendToOutput( $settings{OUTPUTFILE},
            $query{title} . "\t" . "no acceptable hits returned\n" );
    }
}

close($seqFh) or die("Cannot close file : $!");

print "Open " . $settings{OUTPUTFILE} . " to view the BLAST results.\n";

#Reads one line from STDIN and returns it without the trailing newline.
#Returns an empty string at end of input, so callers never handle undef.
sub _promptLine {
    my $line = <STDIN>;
    if ( !( defined($line) ) ) {
        return "";
    }
    chomp($line);
    return $line;
}

#Appends $text to the file named $file, opening and closing the file
#on every call so results are on disk after each write.
#Dies if the file cannot be opened or closed.
sub _appendToOutput {
    my $file = shift;
    my $text = shift;

    open( my $fh, ">>", $file ) or die("Cannot open file : $!");
    print( $fh $text );
    close($fh) or die("Cannot close file : $!");
    return;
}

#Returns a compiled pattern matching a line made of exactly $fieldCount
#tab-separated, non-empty fields, capturing each field.
sub _buildRowPattern {
    my $fieldCount = shift;

    my $pattern
        = '^'
        . join( '[\t]+', ('([^\t]+)') x $fieldCount )
        . '[\t\s]*$';
    return qr/$pattern/;
}

#Submits one query sequence to BLAST (CMD=Put).
#  $settings - reference to the settings hash.
#  $query    - reference to a hash with 'title' and 'sequence'.
#Returns the HTTP response object; the caller extracts RID and RTOE.
#Dies when the -l filter value is not one of F, L, R, LR/RL.
sub _getRID {
    my $settings = shift;
    my $query    = shift;

    my $fasta = ">" . $query->{title} . "\n" . $query->{sequence};

    #-filter - the sequence filter to use on the query. Default is
    #'none'. Acceptable values are 'L' for low complexity, and 'R' for
    #human repeats, or 'LR' for both.
    #This script uses the 'm' filter option with all filters.
    #see http://www.ncbi.nlm.nih.gov/blast/blastcgihelp.shtml#filter
    #For more information about the BLAST URL API, see:
    #http://www.ncbi.nlm.nih.gov/blast/Doc/urlapi.html
    #2007-06-21; Added COMPOSITION_BASED_STATISTICS parameter
    my $filter       = $settings->{FILTER};
    my @filterParams = ();
    if ( ( !( defined($filter) ) ) || ( $filter eq 'F' ) ) {

        #No filtering: no FILTER parameters are sent.
    }
    elsif ( ( $filter eq 'L' ) || ( $filter eq 'R' ) ) {
        @filterParams = ( FILTER => $filter, FILTER => "m" );
    }
    elsif ( $filter =~ m/(RL)|(LR)/i ) {
        @filterParams = ( FILTER => "R", FILTER => "L", FILTER => "m" );
    }
    else {
        die("Do not understand the specified -filter value: $settings->{FILTER}.\n"
        );
    }

    my $RIDresponse = $settings->{BROWSER}->request(
        POST(
            $settings->{BLAST_URL},
            [   DATABASE     => $settings->{DATABASE},
                HITLIST_SIZE => $settings->{HITLIST_SIZE},
                ENTREZ_QUERY => $settings->{ENTREZ_QUERY},
                EXPECT       => $settings->{EXPECT},
                WORD_SIZE    => $settings->{WORD_SIZE},
                @filterParams,
                GENETIC_CODE => $settings->{QUERY_GENETIC_CODE},
                PROGRAM      => $settings->{PROGRAM},
                COMPOSITION_BASED_STATISTICS =>
                    $settings->{COMPOSITION_BASED_STATISTICS},
                QUERY => $fasta,
                CMD   => "Put"
            ]
        )
    );

    return $RIDresponse;
}

#Requests the tabular text results for request ID $RID (CMD=Get).
#Returns the HTTP response object.
sub _getBLAST {
    my $settings = shift;
    my $RID      = shift;

    my $BLASTresponse = $settings->{BROWSER}->request(
        POST(
            $settings->{BLAST_URL},
            [   RID            => $RID,
                FORMAT_TYPE    => "Text",
                ALIGNMENT_VIEW => "Tabular",
                ALIGNMENTS     => $settings->{HITLIST_SIZE},
                CMD            => "Get"
            ]
        )
    );
    return $BLASTresponse;
}

#Requests the Entrez summary (XML) for identifier $uid from the
#database named in ENTREZ_DB. Returns the HTTP response object.
sub _getENTREZ {
    my $settings = shift;
    my $uid      = shift;

    my $ENTREZresponse = $settings->{BROWSER}->request(
        GET(      $settings->{ENTREZ_URL} . "db="
                . $settings->{ENTREZ_DB}
                . "&id=$uid&tool=remote_blast_client&retmode=xml"
        )
    );
    return $ENTREZresponse;
}

#Returns the local time as "Weekday Month DD HH:MM:SS YYYY".
sub _getTime {
    my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst )
        = localtime(time);
    $year += 1900;

    my @days = (
        'Sunday',   'Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday'
    );
    my @months = (
        'January',   'February', 'March',    'April',
        'May',       'June',     'July',     'August',
        'September', 'October',  'November', 'December'
    );

    my $time
        = $days[$wday] . " "
        . $months[$mon] . " "
        . sprintf( "%02d", $mday ) . " "
        . sprintf( "%02d", $hour ) . ":"
        . sprintf( "%02d", $min ) . ":"
        . sprintf( "%02d", $sec ) . " "
        . sprintf( "%04d", $year );
    return $time;
}

#Fills in the program-specific settings for the chosen BLAST type.
#  $blastType - program name (any case) or its menu number 1-5.
#  $settings  - reference to the settings hash; modified in place.
#Dies when the type is not recognized.
sub _setDefaults {
    my $blastType = shift;
    my $settings  = shift;

    #Columns: program, menu number, word size, input type, Entrez
    #database, alignment type.
    my @profiles = (

        #1 - Nucleotide-nucleotide BLAST (blastn)
        [ "blastn", "1", "11", "DNA", "nucleotide", "nucleotide" ],

        #2 - Protein-protein BLAST (blastp)
        [ "blastp", "2", "3", "protein", "protein", "protein" ],

        #3 - Translated query vs protein database (blastx)
        [ "blastx", "3", "3", "DNA", "protein", "translated" ],

        #4 - Protein query vs translated database (tblastn)
        [ "tblastn", "4", "3", "protein", "nucleotide", "translated" ],

        #5 - Translated query vs. translated database (tblastx)
        [ "tblastx", "5", "3", "DNA", "nucleotide", "translated" ]
    );

    foreach my $profile (@profiles) {
        my ( $program, $number, $wordSize, $inputType, $entrezDb,
            $alignType )
            = @{$profile};

        if ( !( ( $blastType =~ /^\Q$program\E$/i )
                || ( $blastType eq $number ) ) )
        {
            next;
        }

        $settings->{PROGRAM}                      = $program;
        $settings->{WORD_SIZE}                    = $wordSize;
        $settings->{INPUTTYPE}                    = $inputType;
        $settings->{ENTREZ_DB}                    = $entrezDb;
        $settings->{ALIGN_TYPE}                   = $alignType;
        $settings->{COMPOSITION_BASED_STATISTICS} = 'yes';
        return;
    }

    die("BLAST type $blastType is not recognized.");
}

#Returns the first run of digits found in $value, or undef when $value
#is undefined or contains no digits.
sub _get_integer {
    my $value = shift;
    my $int   = undef;
    if ( ( defined($value) ) && ( $value =~ m/(\d+)/ ) ) {
        $int = $1;
    }
    return $int;
}

#Returns the first run of non-whitespace characters in $value, or undef
#when $value is undefined or blank. The text is not checked for being
#numeric.
sub _get_real {
    my $value = shift;
    my $real  = undef;
    if ( ( defined($value) ) && ( $value =~ m/(\S+)/ ) ) {
        $real = $1;
    }
    return $real;
}