#!/usr/bin/perl
# NCBI_condense_names.pl 2006 NUF
# Replaces NCBI FASTA entry names with a simpler name.
#
# Usage:
#   NCBI_condense_names.pl <input_fasta> <output_fasta>
#
# Example:
#   >gi|68551093|ref|ZP_00590521.1| Rhodanese-like [Pelodictyon phaeoclathratiforme BU-1]
# is condensed to
#   >ZP_00590521_Pelodictyon_phaeoclathratiforme_BU1
# and, because some genus names are then abbreviated (see abbreviate_genus
# below), is finally written as
#   >ZP_00590521_Pld_phaeoclathratiforme_BU1
#
# Modified by S.R. Santos 05 Dec 2007
#
# NOTE: this script is good, but cannot take into account every possible
# variation in a descriptor line, so your mileage may vary.

use strict;
use warnings;

# Condense one FASTA header line.
#
#   $header - a chomped line that starts with ">"
#
# Returns the condensed header, ">" included, without a trailing newline.
sub condense_header {
    my ($header) = @_;

    # Work on the text after the ">" marker; the marker is re-attached below.
    my $name = substr $header, 1;

    # Drop the three leading pipe-delimited fields ("gi|<number>|<db>|").
    # All three are removed together or not at all, so a header that does not
    # use this layout (e.g. ">WP_012345678.1 protein [Genus species]") keeps
    # its accession instead of being truncated.
    $name =~ s/\A (?: [^|]* [|] ){3} //x;

    # The accession ends at the first "." (version suffix), "|" or whitespace.
    # If none is present the whole remaining text is the accession.
    my $accession_end = length $name;
    if ( $name =~ /[|.\s]/ ) {
        $accession_end = $-[0];
    }

    # Replace everything from the end of the accession up to and including
    # the first "[" with "_". The first "[" is assumed to open the organism
    # name; a bracketed term inside the protein description (for example
    # "[acyl-carrier-protein]") is a known limitation.
    my $bracket = index $name, '[', $accession_end;
    if ( $bracket >= 0 ) {
        substr $name, $accession_end, $bracket - $accession_end + 1, '_';
    }
    else {
        # No organism given: keep only the accession.
        substr $name, $accession_end, length($name) - $accession_end, '';
    }

    # Remove everything starting with "]" or ":" or "/" to the end.
    $name =~ s{[\]:/].*}{}s;

    # Remove filler words and punctuation. "sp." is matched literally and
    # only when it is not the tail of a longer word, so names such as
    # "Rhodospirillum" are left intact.
    $name =~ s/(?<![A-Za-z])sp[.]//g;
    $name =~ s/strain|isolate|chromosome|[().]//g;

    # Collapse runs of spaces, turn spaces into "_" and remove "-".
    $name =~ tr/ //s;
    $name =~ tr/ /_/;
    $name =~ tr/-//d;

    # Remove any terminal "_".
    $name =~ s/_+\z//;

    return '>' . abbreviate_genus($name);
}

# Replace the first occurrence of each listed genus name with its
# abbreviation. Add further [ genus => abbreviation ] pairs here as needed.
sub abbreviate_genus {
    my ($name) = @_;

    my @abbreviations = (
        [ Chlorobium       => 'Chl' ],
        [ Pelodictyon      => 'Pld' ],
        [ Prosthecochloris => 'Ptc' ],
    );

    for my $pair (@abbreviations) {
        my ( $genus, $short ) = @{$pair};
        $name =~ s/\Q$genus\E/$short/;
    }

    return $name;
}

# Read the input file, condense every header line and write all lines to
# the output file. Dies with a message when arguments are missing or a file
# cannot be opened or closed.
sub main {
    my ( $input_file, $output_file ) = @_;

    if ( !defined $input_file || !defined $output_file ) {
        die "Usage: $0 <input_fasta> <output_fasta>\n";
    }

    # The whole input is read before the output is opened, so the same path
    # may be given for both files.
    open my $in, '<', $input_file
        or die "Cannot open $input_file: $!\n";
    my @lines = <$in>;
    chomp @lines;
    close $in;

    open my $out, '>', $output_file
        or die "Cannot open $output_file: $!\n";

    for my $line (@lines) {
        # Only lines that start with ">" are FASTA headers; sequence lines
        # are copied through unchanged.
        my $converted = $line =~ /\A>/ ? condense_header($line) : $line;
        print {$out} "$converted\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close $out
        or die "Cannot close $output_file: $!\n";

    return;
}

# Run only when executed as a program, not when loaded from another file.
main(@ARGV) unless caller;