!/usr/bin/perl -w #Author: Thomas Thiel (thiel@ipk-gatersleben.de) #v 0.6 # #use LWP::Simple; #require 'dumpvar.pl'; ####################### ##### DECLARATION ##### ####################### # INTERNATIONAL UNION OF BIOCHEMISTRY AND MOLECULAR BIOLOGY (IUBMB) # Nomenclature for Incompletely Specified Bases in Nucleic Acid Sequences # http://www.chem.qmul.ac.uk/iubmb/misc/naseq.html my %bas_amb = ( 'A' => 'A-*', 'C' => 'C-*', 'G' => 'G-*', 'T' => 'T-*', 'm' => '[ac]-*', # A or C 'r' => '[ag]-*', # A or G 'w' => '[at]-*', # A or T 's' => '[cg]-*', # C or G 'y' => '[ct]-*', # C or T 'k' => '[gt]-*', # G or T 'v' => '[acg]-*', # A or C or G (not T) 'h' => '[act]-*', # A or C or T (not G) 'd' => '[agt]-*', # A or G or T (not C) 'b' => '[cgt]-*', # C or G or T (not A) 'x' => '[acgtn]-*', # G or A or T or C - not defined 'n' => '[acgtn]-*' # G or A or T or C ); my %bas_amb_fuzzy = ('A'=>'[AN]-*','C'=>'[CN]-*','G'=>'[GN]-*','T'=>'[TN]-*', 'm'=>'[acN]-*','r'=>'[agN]-*','w'=>'[atN]-*','s'=>'[cgN]-*','y'=>'[ctN]-*', 'k'=>'[gtN]-*','v'=>'[acgN]-*','h'=>'[actN]-*','d'=>'[agtN]-*','b'=>'[cgtN]-*', 'n' => '[acgtn]-*'); my $rebase_file; my %enzyme; # stores REBASE restriction enzyme data my %rebase_customer; # stores REBASE customer info my %marker; # stores sequence data to each marker my @candidate; # stores CAPS marker candidates ####################### ##### HELP/SYNTAX ##### ####################### @ARGV == 3 || @ARGV > 0 && $ARGV[0] eq '-gui' || die ( "\n_______________________________________________________________________________ SNP2CAPS ======== A SNP and INDEL analysis tool for CAPS marker development AUTHOR: Thomas Thiel SYNTAX: (1) GUI version: SNP2CAPS.pl -gui (2) Command line version: SNP2CAPS.pl > file containing the multiple alignments file containing the REBASE database in GCG format comma separated list of enzymes (e.g. EcoRV,BamHI) save the results in the output file (optional) see http://pgrc.ipk-gatersleben.de/SNP2CAPS/ for updates _______________________________________________________________________________\n\n"); ################### ##### M A I N ##### ################### if ($ARGV[0] eq '-gui') { use Tk; $mw = MainWindow->new(); $frameMenu = $mw->Frame()->pack(-anchor => 'w'); $frameEnz = $mw->Frame(-relief=>'raised', -borderwidth=>1)->pack(-fill => 'both', -ipady=>5); $frameChk = $mw->Frame(-relief=>'raised', -borderwidth=>1); $frameTxt = $mw->Frame(); $frameSel = $mw->Frame(-relief=>'raised', -borderwidth=>1); $framePic = $mw->Frame(); $frameInfo = $mw->Frame()->pack(-fill => 'both'); #~~ frameMenu ~~# $frameMenu->Menubutton(-text => 'File', -tearoff => 0, -underline => 0, -menuitems => [['command'=>"Open Sequence alignment", -command=> \&GetSequences3], ['command'=>"Open REBASE data", -command=> \&OpenREBASEdata], #['command' => "Get REBASE data via www", -command => [ \&GetRebaseData, 'w' ]], # doesn't work accurately: check '-', ['command' => "Open enzyme selection", -command => \&OpenEnzSel], ['command' => "Save enzyme selection", -command => \&SaveEnzSel], #'-', #['command' => "Save results", -command => sub {}], # not yet implemented '-', ['command' => "Help ...", -command => sub { $mw->messageBox(-title=>"Help", -message=> "SNP2CAPS: A SNP and INDEL analysis tool for CAPS marker development\n\nUsage:\n\n1. Load the file containing the multiple alignments (3 formata are supported)\n (a) modified FASTA formatted file that may contain data of more than\n\tone multiple alignment.\n\tStructure of the header line: '>id_seqname'\n\twith 'id' = name of the multiple alignement file and 'seqname' = sequence name\n (b) GCG's MSF format\n (c) Clustal's ALN format\n2. Load the REBASE database file in GCG format (http://rebase.neb.com)\n3. Select individual enzymes or select all\n4. Run the analysis\n5. Select the output format and press the 'Refresh' button\n6. Results may be exported by copy and paste\n\n\nSee http://pgrc.ipk-gatersleben.de/SNP2CAPS/ for updates\n\nFor comments and questions please contact Thomas Thiel (thiel\@ipk-gatersleben.de)\n\n"); }], '-', ['command' => "Exit", -command => sub {exit}] ])->pack(-side => 'left', -anchor => 'n'); $frameMenu->Menubutton(-text => 'Analyse', -tearoff => 0, -menuitems => [['command' => "Mine for CAPS markers", -command => \&Mine], # '-', # ['command'=>"Overview (text version)", -command=> \&DisplayResultsTxt], # ['command'=>"Take a closer look (graphical version)", -command=> \&DisplayResultsPic], ### graphical output not yet implemented (next release) ])->pack(-side => 'left', -anchor => 'n'); #~~ frameEnz ~~# $numselEnz = 0; # number of enzymes selected $lbAllEnz = $frameEnz->Scrolled('Listbox', -scrollbars=>'e', -label=>"\nAll enzymes", -selectmode=>'extended', -background=>'white') ->pack(-side=>'left', -anchor=>'n', padx=>10); $chooseEnz = $frameEnz->Frame()->pack(-side=>'left'); $chooseEnz->Button(-text=>'Select', -command=> \&SelectEnz) ->pack(-anchor=>'c', -fill=>'x', -pady=>5); $chooseEnz->Button(-text=>'Deselect', -command=> \&DeselectEnz) ->pack(-anchor=>'c', -fill=>'x'); $chooseEnz->Button(-text=>'Delete all', -command=> \&DeselectAllEnz) ->pack(-anchor=>'c', -fill=>'x', -pady=>5); $chooseEnz->Label(-text=>'Selected:')->pack(-anchor=>'c'); $chooseEnz->Label(-textvariable=>\$numselEnz)->pack(-anchor=>'c'); $lbSelEnz = $frameEnz->Scrolled('Listbox', -scrollbars=>'e', -label=>"Selected enzymes\n(Click for more details)", -selectmode=>'simple', -background=>'white') ->pack(-side=>'left', -anchor=>'n', -padx=>10); $lbSelEnz->bind('', sub {return if $lbSelEnz->size() == 0; my $sel = $lbSelEnz->get($lbSelEnz->curselection()); &EnzymeData($sel)}); $radbtnEnz = 'exclude'; foreach (0..3) {$chkbtnEnz[$_] = 1}; $chkbtnEnz[4] = 0; $addEnz = $frameEnz->Frame()->pack(-side=>'left'); $addEnz1 = $addEnz->Frame()->pack(-anchor=>'w'); $addEnz1->Radiobutton(-text=>'Without isoschizomers', -padx=>10, -value=>'exclude', -variable=>\$radbtnEnz) ->pack(-anchor => 'w'); $addEnz1->Radiobutton(-text=>'Include isoschizomers (not recommended)', -padx=>10, -value=>'include', -variable=>\$radbtnEnz) ->pack(-anchor => 'w'); $addEnz1->Checkbutton(-text=>'only commercially available enzymes', -padx=>10, -variable=> \$chkbtnEnz[0]) ->pack(-anchor => 'w'); $addEnz2 = $addEnz->Frame()->pack(-side=>'left', -anchor=>'w'); $addEnz2->Checkbutton(-text=>'4-base cutters', -padx=>10, -variable=> \$chkbtnEnz[1]) ->pack(-anchor => 'w'); $addEnz2->Checkbutton(-text=>'5-base cutters', -padx=>10, -variable=> \$chkbtnEnz[2]) ->pack(-anchor => 'w'); $addEnz2->Button(-text=>'Add', -command=> \&AddEnz) ->pack(-anchor => 'w', -padx=>10, -pady=>5, ipadx => 30); $addEnz3 = $addEnz->Frame()->pack(-side=>'left', -anchor=>'nw'); $addEnz3->Checkbutton(-text=>'6-base cutters', -padx=>10, -variable=> \$chkbtnEnz[3]) ->pack(-anchor => 'w'); $addEnz3->Checkbutton(-text=>'recognition sites >6', -padx=>10, -variable=> \$chkbtnEnz[4]) ->pack(-anchor => 'w'); $addEnz3->Button(-text=>'Run analysis', -command=> \&Mine) ->pack(-anchor => 'w', -padx=>10, -pady=>5, ipadx => 30); $labelEnz = $frameEnz->Label(-text=>'Enzyme properties:'); $infoEnz1 = $frameEnz->Frame(-relief=>'groove', -borderwidth=>1); $infoEnz1->Menubutton(-text=>'Enzyme', -anchor=>'w')->pack(-fill=>'x'); $infoEnz1->Menubutton(-text=>'Recogn. sequence', -anchor=>'w')->pack(-fill=>'x'); $infoEnz1->Menubutton(-text=>'Cutting site', -anchor=>'w')->pack(-fill=>'x'); $infoEnz1->Menubutton(-text=>'Overhang', -anchor=>'w')->pack(-fill=>'x'); $infoEnz1->Menubutton(-text=>'Isoschizomers', -anchor=>'w')->pack(-fill=>'x'); $infoEnz1->Menubutton(-text=>'Companies', -anchor=>'w')->pack(-fill=>'x'); $infoEnz2 = $frameEnz->Frame(-relief=>'groove', -borderwidth=>1); #~~ frameChk ~~# $chkbtnChk[0] = 0; $chkbtnChk[1] = 1; $chkbtnChk[2] = 1; $chkbtnChk[3] = 0; $showChk1 = $frameChk->Frame()->pack(-side=>'left'); $showChk1->Label(-text=>'Show results of:', -padx=>10)->pack(-anchor=>'w'); $showChk1->Checkbutton(-text=>'CAPS candidates', -padx=>10, -variable=>\$chkbtnChk[1]) ->pack(-anchor=>'w'); $showChk1->Checkbutton(-text=>'CAPS candidates containing N', -padx=>10, -variable=>\$chkbtnChk[2]) ->pack(-anchor=>'w'); $showChk2 = $frameChk->Frame()->pack(-side=>'left'); $showChk2->Label(-text=>'', -padx=>10)->pack(-anchor=>'w'); $showChk2->Checkbutton(-text=>'false positives', -padx=>10, -variable=>\$chkbtnChk[3]) ->pack(-anchor=>'w'); $showChk2->Checkbutton(-text=>'not converted SNP markers', -padx=>10, -variable=>\$chkbtnChk[0]) ->pack(-anchor=>'w'); $radbtnChk = 'group_native'; $showChk3 = $frameChk->Frame()->pack(-side=>'left', -anchor=>'n'); $showChk3->Label(-text=>'Display:', -padx=>10)->pack(-anchor=>'w'); $showChk3->Radiobutton(-text=>'predicted cutting sites & fragment lengths', -padx=>10, -value=>'group_native', -variable=>\$radbtnChk) ->pack(-anchor=>'w'); $showChk3->Radiobutton(-text=>'sequences of recognition sites', -padx=>10, -value=>'group_align', -variable=>\$radbtnChk) ->pack(-anchor=>'w'); $frameChk->Button(-text=>"Refresh", -command=>\&ShowResultsTxt) ->pack(-side=>'left', -padx=>10, -ipadx=>15, ipady=>15); #~~ frameTxt Field ~~# $header = $frameTxt->Text(-height=>1)->pack(-fill=>'x'); $text = $frameTxt->Scrolled('Text', -scrollbars=>'soe', -width=>100, -height=>10, -wrap=>'none') ->pack(-side=>'bottom', -fill=>'both', -expand=>1); #~~ frameSel ~~# $lbAllSel = $frameSel->Scrolled('Listbox', -scrollbars=>'e', -label=>'Choose one marker', -background=>'white') ->pack(-side=>'left', padx=>10); $lbAllSel->bind('', \&ShowEnzymes); $lbEnzSel = $frameSel->Scrolled('Listbox', -scrollbars=>'e', -label=>'Choose enzyme(s)', -background=>'white') ->pack(-side=>'left', padx=>10); #~~ framePic ~~# ### graphical output not yet implemented (next release) #~~ frameInfo Field ~~# $info = $frameInfo->Scrolled('Text', -height=>3, -scrollbars=>'e', -background=>'lightgrey', relief=>'ridge') ->pack(-fill => 'x', -expand => 1); $info->insert('end','Welcome...'); &DisplayResultsTxt; ### graphical output not yet implemented (next release) MainLoop; } else # command line version { GetSequences3(); GetRebaseData('f'); GetCandidates(split ',', $ARGV[2]); Results(); }; ########################## ##### TK SUBROUTINES ##### ########################## #>>> frameMen <<<# sub OpenREBASEdata { GetRebaseData('f'); my @enzall = sort keys %enzyme; $lbAllEnz->delete(0, 'end'); # delete existing entries $lbAllEnz->insert('end', @enzall) }; sub OpenEnzSel { my $file = $mw->getOpenFile() || return; open (IN, "<$file"); my @selection = ; chomp @selection; close (IN); $lbSelEnz->delete(0, 'end'); # delete existing entries $lbSelEnz->insert('end', @selection); $numselEnz = scalar @selection }; sub SaveEnzSel { my $file = $mw->getSaveFile() || return; open (OUT, ">$file") || die ("\nError: File doesn't exist !\n\n"); print OUT join "\n", $lbSelEnz->get(0, 'end'); close (OUT) }; sub Mine { my @enz; push @enz, $lbSelEnz->get(0, 'end'); GetCandidates(@enz) }; #>>> frameEnz <<<# sub SelectEnz { my %enzsel; foreach ($lbSelEnz->get(0, 'end')) {$enzsel{$_} = ''}; # get already selected enzymes foreach ($lbAllEnz->curselection()) {$enzsel{$lbAllEnz->get($_)} = ''}; # add enzymes $lbSelEnz->delete(0, 'end'); # delete existing entries $lbSelEnz->insert('end', sort keys %enzsel); $numselEnz = scalar keys %enzsel; }; sub DeselectEnz { my %enzdesel; foreach ($lbSelEnz->get(0, 'end')) {$enzdesel{$_} = ''}; foreach ($lbSelEnz->curselection()) {delete $enzdesel{$lbSelEnz->get($_)}}; $lbSelEnz->delete(0, 'end'); # delete existing entries $lbSelEnz->insert('end', sort keys %enzdesel); $numselEnz = scalar keys %enzdesel; }; sub DeselectAllEnz {$lbSelEnz->delete(0, 'end'); $numselEnz = 0}; sub AddEnz { my %addenz; @addenz{keys %enzyme} = (); foreach (keys %addenz) { if ($radbtnEnz eq 'exclude' && $enzyme{$_}[2] == 1) {delete $addenz{$_}; next}; if ($chkbtnEnz[0] == 1 && $enzyme{$_}[7] eq '') {delete $addenz{$_}; next}; if ($chkbtnEnz[1] == 0 && $enzyme{$_}[3] == 4) {delete $addenz{$_}; next}; if ($chkbtnEnz[2] == 0 && $enzyme{$_}[3] == 5) {delete $addenz{$_}; next}; if ($chkbtnEnz[3] == 0 && $enzyme{$_}[3] == 6) {delete $addenz{$_}; next}; if ($chkbtnEnz[4] == 0 && $enzyme{$_}[3] > 6) {delete $addenz{$_}; next}; }; foreach ($lbSelEnz->get(0, 'end')) {$addenz{$_} = ''}; # add current state $lbSelEnz->delete(0, 'end'); # delete existing entries $lbSelEnz->insert('end', sort keys %addenz); $numselEnz = scalar keys %addenz; }; sub EnzymeData {# called with enzyme name my @info; $info[0] = shift; foreach (1,4,5,6,7) {push @info, $enzyme{$info[0]}[$_]}; $labelEnz->pack(-side=>'top', -pady=>10, -anchor=>'w', -ipadx=>6); $infoEnz1->pack(-side=>'left', -anchor=>'n'); $infoEnz2->pack(-side=>'left', -anchor=>'n', -padx=>5); foreach ($infoEnz2->children()) {$_->destroy}; foreach (0..3) {$infoEnz2->Menubutton(-text=>"$info[$_]", -anchor=>'w', -width=>15)->pack(-fill=>'x')}; if ($info[4]) { my $menu_iso = $infoEnz2->Menubutton(-text=>'Click...', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x'); foreach (split ',',$info[4]) {$menu_iso->command(-label=>$_, -command=> [\&EnzymeData, $_] )} } else {$infoEnz2->Menubutton(-text=>'', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x')} if ($info[5]) { my $menu_com = $infoEnz2->Menubutton(-text=>'Click...', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x'); foreach (split '', $info[5]) {$menu_com->command(-label=>$rebase_customer{$_})} } else {$infoEnz2->Menubutton(-text=>'', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x')} }; #>>> frameChk/Txt <<<# sub DisplayResultsTxt { return if $frameTxt->ismapped; # frame is already existing $frameSel->packForget; $framePic->packForget; $frameChk->pack(-fill => 'both', -after=>$frameEnz); $frameTxt ->pack(-fill => 'both', -expand => 1, -after=>$frameChk) }; sub ShowResultsTxt { my %info = ( 1 => 'Predicted CAPS candidates', 2 => 'Predicted CAPS candidates (need further inspection: at least one N found within target)', 3 => 'Probably false positive candidates (need further inspection: at least one N found within target)', 0 => 'Following markers could not be converted into CAPS' ); $text->delete("1.0",'end'); # delete text window $header->delete("1.0",'end'); # delete header &ShowNativeTxt(\%info) if $radbtnChk eq 'group_native'; &ShowAlignTxt(\%info) if $radbtnChk eq 'group_align'; }; sub ShowNativeTxt { my %info = %{$_[0]}; $header->insert('end', sprintf ("%-10.10s %-9.9s %-5.5s %25.25s %25.25s %-25.25s\n", 'Marker', 'Enzyme', 'Total', 'Restriction Sites', 'Expected Fragments', 'Members')); for (my $i = 1; $i < 4; $i++) # iterate Test Groups { next if $chkbtnChk[$i] == 0 || scalar keys %{$candidate[$i]} == 0; $text->insert('end', "$info{$i}\n"); $text->insert('end', '-' x length $info{$i}); $text->insert('end', "\n\n"); foreach my $marker (sort keys %{$candidate[$i]}) { foreach my $enzyme (sort keys %{$candidate[$i]{$marker}}) { for (my $j = 0; $j < scalar @{$candidate[$i]{$marker}{$enzyme}{'group_native'}}; $j++) { my @cutpos; # store cut positions for (my $x = 1; $x < scalar @{$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}}; $x += 4) { my $cutpos = $candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[$x]; push @cutpos, $cutpos if $cutpos > 0; }; @cutpos = sort {$a <=> $b} @cutpos; my @fragmentsizes; # stores fragment sizes my $length_align = length $marker{$marker}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[0]; my $lengthseq = $length_align - PrecedingGaps($marker, $candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0], $length_align); if (@cutpos) { push @fragmentsizes, $cutpos[0], $lengthseq - $cutpos[$#cutpos]; for (my $x = 1; $x < scalar @cutpos; $x++) {push @fragmentsizes, $cutpos[$x] - $cutpos[$x - 1]}; } else {$fragmentsizes[0] = $lengthseq}; @fragmentsizes = sort {$b<=>$a} @fragmentsizes; $text->insert('end', sprintf ("%-10.100s %-9.100s %5.100s ", $marker, $enzyme, $lengthseq)); $text->insert('end', sprintf ("%25.100s ", join ",",@cutpos)); $text->insert('end', sprintf ("%25.100s ", join ",",@fragmentsizes)); $text->insert('end', join ",",@{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j]}); $text->insert('end', "\n"); }; $text->insert('end', "\n"); } $text->insert('end', "\n"); } }; &ShowNotConverted($info{0}) if $chkbtnChk[0] == 1 && $candidate[0] && scalar @{$candidate[0]} > 0 # conversion failed }; sub ShowAlignTxt { my %info = %{$_[0]}; $header->insert('end', sprintf ("%-10.10s %-9.9s %-50.50s %-20.20s %-20.20s %-20.20s %-5.5s\n", 'Marker', 'Enzyme', 'Members', 'Cutting Site 1', 'Cutting Site 2', 'Cutting Site 3', '...')); for (my $i = 1; $i < 4; $i++) # iterate Test Groups { next if $chkbtnChk[$i] == 0 || scalar keys %{$candidate[$i]} == 0; $text->insert('end', "$info{$i}\n"); $text->insert('end', '-' x length $info{$i}); $text->insert('end', "\n\n"); foreach my $marker (sort keys %{$candidate[$i]}) { foreach my $enzyme (sort keys %{$candidate[$i]{$marker}}) { for (my $j = 0; $j < scalar @{$candidate[$i]{$marker}{$enzyme}{'group_align'}}; $j++) { $text->insert('end', sprintf ("%-10.100s %-9.100s ", $marker, $enzyme)); $text->insert('end', sprintf ("%-50.100s ", join (",",@{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j]}))); for (my $x = 0; $x < scalar @{$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[0][0]}}; $x += 4) { my $recseq = $candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j][0]}[$x + 3]; $recseq =~ s/-+/\~/g; $text->insert('end', sprintf ("%-20.20s ", "$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j][0]}[$x + 2] $recseq ($candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j][0]}[$x])")); }; $text->insert('end', "\n"); }; $text->insert('end', "\n"); } $text->insert('end', "\n"); } }; &ShowNotConverted($info{0}) if $chkbtnChk[0] == 1 && $candidate[0] && scalar @{$candidate[0]} > 0 # conversion failed }; sub ShowNotConverted { my $info = shift; $text->insert('end', "$info\n"); $text->insert('end', '-' x length $info); $text->insert('end', "\n\n"); $text->insert('end', join "\n", @{$candidate[0]}); }; #>>> frameSel/Pic <<<# sub DisplayResultsPic ### graphical output not yet implemented (next release) { return if $framePic->ismapped; $frameTxt->packForget; $frameChk->packForget; $frameSel->pack(-fill => 'both', -after=>$frameEnz, -ipady=>5); $framePic->pack(-fill => 'both', -expand => 1, -after=>$frameSel) }; sub ShowEnzymes # Show all relevant enzymes { return if $lbAllSel->size() == 0; my $marker = $lbAllSel->get($lbAllSel->curselection()); $lbEnzSel->delete(0,'end'); # delete all entries for (my $i = 1; $i < 4; $i++) { if (exists $candidate[$i]{$marker}) { $lbEnzSel->insert('end', sort (keys %{$candidate[$i]{$marker}})); } } }; ####################### ##### SUBROUTINES ##### ####################### #>>> READ DNA sequences from file <<<# sub GetSequences { my $file = $mw->getOpenFile() || return; my ($count_marker,$count_sequences); %marker = (); open (IN, "<$file"); $/ = ">>"; while () { my ($marker_name, $fasta); next unless (($marker_name, $fasta) = /(.*?)\n(.*)/s); $count_marker++; while ($fasta =~ />(.*?)\n(.*?)(?=\n)/gs) # $1->cultivar $2->sequence { (my $cultivar, my $seq) = ($1, $2); $marker{$marker_name}{$cultivar} = [$seq]; $count_sequences++; while ($seq =~ /(-+)/g) # find out positions of gaps { my $gap_pos = pos ($seq) + 1 - length $1; push @{$marker{$marker_name}{$cultivar}}, $gap_pos, length $1; } } }; $/ = "\n"; close (IN); $info -> insert("insert","\nLoaded data of $count_marker marker(s) and $count_sequences sequence(s)."); $info -> yviewMoveto(1); }; sub GetSequences2 { if ($ARGV[0] eq "-gui") {$file = $mw->getOpenFile() || return} else {$file = $ARGV[0]} %marker = (); open (IN, "<$file") || die ("Could not open $file\n"); my $msa_type; # type of alignment file while () { if (/^>[^_]+_[^_]+/) {$msa_type = 'S2C';last} elsif (/^MSF/) {$msa_type = 'MSF';last} elsif (/^CLUSTAL W/) {$msa_type = 'ALN';last} }; my ($count_sequences, $count_marker); seek (IN, 0, 0); if ($msa_type eq 'S2C') # SNP2CAPS format { $/ = ">"; while () { my ($id, $seq); next unless (($id, $seq) = /(.*?)\n(.*)/s); my ($marker_name, $cultivar) = $id =~ /(.*?)_(.*)/; $seq =~ s/[\s>]//g; # remove whitespace $count_sequences++; $marker{$marker_name}{$cultivar} = [$seq]; push @{$marker{$marker_name}{$cultivar}}, FindGaps(\$seq); }; $count_marker = scalar keys %marker; } elsif ($msa_type eq 'MSF') { $/ = "\n"; my @parts = split '//', join '', ; # $parts[2] contains sequence alignment foreach (split "\n", $parts[2]) { my ($cultivar, $seq); next unless (($cultivar, $seq) = /^(\S+)\s+(\S+)/); $marker{$file}{$cultivar}[0] .= $seq; }; foreach my $cultivar (keys %{$marker{$file}}) { push @{$marker{$file}{$cultivar}}, FindGaps(\$marker{$file}{$cultivar}[0]); }; $count_marker = 1; $count_sequences = scalar keys %{$marker{$file}}; } elsif ($msa_type eq 'ALN') # CLUSTAL W format { $/ = "\n"; while () { chomp; next if /^CLUSTAL W/; my ($cultivar, $seq); next unless (($cultivar, $seq) = /^(\S+)\s+(\S+)/); $marker{$file}{$cultivar}[0] .= $seq; }; foreach my $cultivar (keys %{$marker{$file}}) { push @{$marker{$file}{$cultivar}}, FindGaps(\$marker{$file}{$cultivar}[0]); }; $count_marker = 1; $count_sequences = scalar keys %{$marker{$file}}; } else {return}; $/ = "\n"; close (IN); if ($ARGV[0] eq "-gui") { $info -> insert("insert","\nLoaded data of $count_marker marker(s) and $count_sequences sequence(s)."); $info -> yviewMoveto(1); }; }; sub GetSequences3 { use Bio::AlignIO; my ($count_sequences, $count_marker); if ($ARGV[0] eq "-gui") {$file = $mw->getOpenFile() || return} else {$file = $ARGV[0]} %marker = (); my $msaformat = Bio::AlignIO->_guess_format($file); $msaformat || ($msaformat = 'S2C'); print $msaformat; if ($msaformat eq 'S2C') # SNP2CAPS format { open (IN, "<$file") || die ("Could not open $file\n"); #seek (IN, 0, 0); $/ = ">"; while () { my ($id, $seq, $marker_name, $cultivar); next unless (($id, $seq) = /(.*?)\n(.*)/s); next unless ($marker_name, $cultivar) = $id =~ /(.*?)_(.*)/; $seq =~ s/[\s>]//g; # remove whitespace $count_sequences++; $marker{$marker_name}{$cultivar} = [$seq]; push @{$marker{$marker_name}{$cultivar}}, FindGaps(\$seq); }; close (IN); $count_marker = scalar keys %marker; } else { my $msafile = Bio::AlignIO->new(-file=>$file, -format=>$msaformat); while (my $msa = $msafile->next_aln()) { $count_marker++; foreach my $seqobj ($msa->each_seq) { $count_sequences++; my $cultivar = $seqobj->display_id(); my $seq = $seqobj->seq(); my $name = "$file".'_'."$count_marker"; $marker{$name}{$cultivar}[0] = $seq; push @{$marker{$name}{$cultivar}}, FindGaps(\$marker{$name}{$cultivar}[0]); }; }; }; if ($ARGV[0] eq "-gui") { $info -> insert("insert","\nLoaded data of $count_marker marker(s) and $count_sequences sequence(s)."); $info -> yviewMoveto(1); }; }; sub FindGaps { my ($seqref) = @_; my @gaps; # stores gap position and type while ($$seqref =~ /([-_\.]+)/g) # find out positions of gaps { my $gap_pos = pos ($$seqref) + 1 - length $1; push @gaps, $gap_pos, length $1; }; return @gaps }; #>>> READ REBASE enzyme data <<<# sub GetRebaseData { # Option 'w' -> obtain via WWW, 'f' -> Read from File my ($rebase_file, $rebase_version, $rebase_reldate, $count); if ($_[0] eq 'w') {while (!($rebase_file = get("http://rebase.neb.com/rebase/link_gcg"))) {}} else { if ($ARGV[0] eq '-gui') {$file = $mw->getOpenFile() || return} else {$file = $ARGV[1]}; open (IN, "<$file"); $rebase_file = join '',; close (IN) }; %enzyme = (); # EXTRACT items from the REBASE data file # (my $rebase_file1, my $rebase_file2) = split /\.\.\n/,$rebase_file; # ".." separates intro and enzyme list ($rebase_version) = $rebase_file1 =~ /\nREBASE version (\d+)/s; ($rebase_reldate) = $rebase_file1 =~ /\nRich Roberts\s+(.*?)\n/s; (%rebase_customer) = $rebase_file1 =~ /\n\s+([A-Z])\s+(.*?)(?=\n)/gs; while ($rebase_file2 =~ /(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(?: \? !| !)\s+(\S+|)\s+>(\S+|)/gs) # $1->name $2->cutting pos. $3->recognition seq. $4->overhang # $5->isoschizomers $6->commercial sources { my $isoschizomer = 0; # isoschizomer flag (0->false, 1-true) my $recseq = $3; my $enzname = $1; { $recseq =~ s/[_']//g; # remove cut position markers $isoschizomer = 1 if $enzname =~ s/;//; # preceding ; marks isoschizomers $recseq =~ s/N+$//i; }; # curly brackets keep matched variables (my $recseq_regex, my $recseq_fuzzy) = ($recseq, $recseq); foreach (keys %bas_amb) {$recseq_regex =~ s/$_/$bas_amb{$_}/g}; # translate to regex like style foreach (keys %bas_amb_fuzzy) {$recseq_fuzzy =~ s/$_/$bas_amb_fuzzy{$_}/g}; # for enzymes which are listed twice, e.g. AloI if (exists $enzyme{$enzname}) {$enzname .= ' !'} else {$count++}; $enzyme{$enzname} = [$recseq_regex, $recseq, $isoschizomer, length ($recseq), $2, $4, $5, $6, $recseq_fuzzy]; }; if ($ARGV[0] eq '-gui') { $info->insert("insert","\nLoaded REBASE data base (version $rebase_version, $rebase_reldate) comprising $count enzymes."); $info->yviewMoveto(1); }; }; #>>> SEARCH for CAPS candidates <<<# sub GetCandidates { # call by enzymes to be used for screening if (scalar keys %enzyme == 0) {$info->insert("insert","\nError: Load REBASE data base first"); $info->yviewMoveto(1); return}; if (scalar keys %marker == 0) {$info->insert("insert","\nError: Load Sequence Alignment first"); $info->yviewMoveto(1); return}; if ($ARGV[0] eq '-gui') {$info->insert("insert","\nAnalyse..."); $info->yviewMoveto(1); $info->update}; @candidate = (); foreach my $enzyme_name(@_) { my ($not_palindromic, $rc_recseq, $rc_recseq_regex, $rc_recseq_fuzzy); study $enzyme{$enzyme_name}[0]; # of any benefit ? if (! Palindromic($enzyme{$enzyme_name}[1]) && ! (exists $enzyme{$enzyme_name.' !'} || $enzyme_name =~ /!/)) # test if palindromic or of type as AloI { $not_palindromic = 1; $rc_recseq = reverse DNAComplement($enzyme{$enzyme_name}[1]); ($rc_recseq_regex, $rc_recseq_fuzzy) = ($rc_recseq, $rc_recseq); foreach (keys %bas_amb) {$rc_recseq_regex =~ s/$_/$bas_amb{$_}/g}; # translate to regex like style foreach (keys %bas_amb_fuzzy) {$rc_recseq_fuzzy =~ s/$_/$bas_amb_fuzzy{$_}/g} } foreach my $marker_name (keys %marker) { my %cut; # stores enzyme binding positions temporarily foreach my $cultivar (keys %{$marker{$marker_name}}) { while ($marker{$marker_name}{$cultivar}[0] =~ /($enzyme{$enzyme_name}[0])/ig) { pos ($marker{$marker_name}{$cultivar}[0]) += 1 - length $1; my $start = pos ($marker{$marker_name}{$cultivar}[0]); $cut{$start}{$cultivar} = ['f',$1]; }; if ($not_palindromic) # rec. seq. not palindromic => check the rc as well { while ($marker{$marker_name}{$cultivar}[0] =~ /($rc_recseq_regex)/ig) { pos ($marker{$marker_name}{$cultivar}[0]) += 1 - length $1; my $start = pos ($marker{$marker_name}{$cultivar}[0]); $cut{$start}{$cultivar} = ['r',$1]; }; }; }; my %test; # test for CAPS candidates # (1) true CAPS candidate (alternative restriction due to SNP) # (2) Not sure: probably true CAPS candidate (N within DNA sequence, but SNP at different position) # (3) Not sure: probably false positive CAPS candidate at at least one site (N within DNA sequence) # cases (2) and (3) need further inspection by the experimenter foreach my $site (keys %cut) { my $difference = scalar (keys %{$marker{$marker_name}}) - scalar (keys %{$cut{$site}}); if ($difference > 0) # test if all sequences were cut { my %intersection; # intersection between 2 hashes @intersection{ keys %{$marker{$marker_name}} } = (); delete @intersection{ grep (exists $intersection{$_}, keys %{$cut{$site}}) }; foreach my $cultivar (keys %intersection) # for all that were not cut { my $test_seq = substr($marker{$marker_name}{$cultivar}[0], $site - 1); $test_seq =~ s/((?:[^-]-*){$enzyme{$enzyme_name}[3]}).*/$1/; $cut{$site}{$cultivar} = ['n',$test_seq]; if ($test_seq !~ /N/ig) {$test{1} = ""} # see above elsif ($test_seq !~ /$enzyme{$enzyme_name}[8]/i) { if ($not_palindromic) {if ($test_seq !~ /$rc_recseq_fuzzy/i) {$test{2} = ""}} else {$test{2} = ""} } else {$test{3} = ""} } } }; if (%test) # true if alternative restriction was found { my $test_id = Test(\%test); foreach my $site (sort {$a <=> $b} keys %cut) { foreach my $cultivar (keys %{$cut{$site}}) { my $cutsite = 0; # calculate the cut site position if ($cut{$site}{$cultivar}[0] eq "f") {$cutsite = $site - PrecedingGaps($marker_name, $cultivar, $site) + $enzyme{$enzyme_name}[4] - 1} elsif ($cut{$site}{$cultivar}[0] eq "r") {$cutsite = $site - PrecedingGaps($marker_name, $cultivar, $site) + $enzyme{$enzyme_name}[3] - $enzyme{$enzyme_name}[4] - 1}; ## gaps within rec.seq???? TEST push @{$candidate[$test_id]{$marker_name}{$enzyme_name}{$cultivar}}, $cut{$site}{$cultivar}[0], $cutsite, $site, $cut{$site}{$cultivar}[1]; }; }; $candidate[$test_id]{$marker_name}{$enzyme_name} = group($marker_name,\%{$candidate[$test_id]{$marker_name}{$enzyme_name}}); } } }; my (%not_converted, @converted); @not_converted{keys %marker} = (); for (my $i = 1; $i < 4; $i++) { delete @not_converted{keys %{$candidate[$i]}}; push @converted, keys %{$candidate[$i]} }; @{$candidate[0]} = sort keys %not_converted; @converted = sort @converted; if ($ARGV[0] eq '-gui') { $lbAllSel->delete(0,'end'); $lbAllSel->insert('end', @converted); $info -> insert("insert","DONE"); }; #dumpValue(\@candidate); }; #>>> SORTING ROUTINE <<<# sub group { # two keys are appended during this routine containing sorting matrizes # according to the presence / absence of restriction sites (group_align) # and according to the resulting fragment sizes (group_native) my $marker = shift; my %group = %{$_[0]}; my (%sort_align, %sort_native); # contain the sorting keys my @cultivar = sort keys %group; for (my $i = 0; $i < scalar @cultivar; $i++) { for (my $j = 0; $j < scalar @{$group{$cultivar[$i]}}; $j += 4) { $sort_align{$cultivar[$i]} .= $group{$cultivar[$i]}[$j + 3]; # rec.seq. $sort_native{$cultivar[$i]} .= $group{$cultivar[$i]}[$j + 1] # cut.pos. }; $sort_native{$cultivar[$i]} .= length $marker{$marker}{$cultivar[$i]}[0]; #considers also total fragment length }; $group{'group_align'}[0][0] = $cultivar[0]; # initialize sorting matrix with first element $group{'group_native'}[0][0] = $cultivar[0]; for (my $i = 1; $i < scalar @cultivar; $i++) # check all cultivars { my $found_group_align = 0; my $found_group_native = 0; for (my $j = 0; $j < scalar @{$group{'group_align'}}; $j++) # check in all existing groups { if ($sort_align{$cultivar[$i]} eq $sort_align{$group{'group_align'}[$j][0]}) # compare with the first element of each group { push @{$group{'group_align'}[$j]}, $cultivar[$i]; $found_group_align = 1; last } }; for (my $j = 0; $j < scalar @{$group{'group_native'}}; $j++) { if ($sort_native{$cultivar[$i]} eq $sort_native{$group{'group_native'}[$j][0]}) { push @{$group{'group_native'}[$j]}, $cultivar[$i]; $found_group_native = 1; last } }; if (not $found_group_align) {$group{'group_align'}[scalar @{$group{'group_align'}}][0] = $cultivar[$i]}; # form a new group if (not $found_group_native) {$group{'group_native'}[scalar @{$group{'group_native'}}][0] = $cultivar[$i]} }; return \%group }; #>>> Get the number of preceding gaps <<<# sub PrecedingGaps { # invoke with (1) marker name, (2) cultivar name, (3) position to be tested my $gaps = 0; my @array = @{$marker{$_[0]}{$_[1]}}; my $position = $_[2]; shift @array; while (@array) { my $pos = shift @array; my $length = shift @array; if ($pos <= $position) {$gaps += $length} else {last}; }; return $gaps }; #>>> Generate the complement sequence <<<# sub DNAComplement { # invoke by DNA sequence my $seq = shift; $seq =~ tr/ACTGactgMRVHmrvhKYBDkybd/TGACtgacKYBDkybdMRVHmrvh/; return $seq }; #>>> Check if sequence is complementary <<<# sub Palindromic { # invoke by DNA sequence my $seq = shift; return 1 if ($seq eq reverse DNAComplement($seq)); return 0 }; #>>> Does the quality grouping <<<# sub Test { my @test = keys %{$_[0]}; return $test[0] if scalar @test == 1; return 2; }; sub Results { my %info = ( 1 => 'Predicted CAPS candidates', 2 => 'Predicted CAPS candidates (need further inspection: at least one N found within target)', 3 => 'Probably false positive candidates (need further inspection: at least one N found within target)', 0 => 'Following markers could not be converted into CAPS' ); print "#Format:\n" ,join "\t", '#Marker', 'Enzyme', 'Total size', 'Restriction Sites', 'Expected Fragments', 'Members', "\n\n"; for (my $i = 1; $i < 4; $i++) # iterate Test Groups { next if scalar keys %{$candidate[$i]} == 0; print "$info{$i}\n"; print '-' x length $info{$i}, "\n\n"; foreach my $marker (sort keys %{$candidate[$i]}) { foreach my $enzyme (sort keys %{$candidate[$i]{$marker}}) { for (my $j = 0; $j < scalar @{$candidate[$i]{$marker}{$enzyme}{'group_native'}}; $j++) { my @cutpos; # store cut positions for (my $x = 1; $x < scalar @{$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}}; $x += 4) { my $cutpos = $candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[$x]; push @cutpos, $cutpos if $cutpos > 0; }; @cutpos = sort {$a <=> $b} @cutpos; my @fragmentsizes; # stores fragment sizes my $length_align = length $marker{$marker}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[0]; my $lengthseq = $length_align - PrecedingGaps($marker, $candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0], $length_align); if (@cutpos) { push @fragmentsizes, $cutpos[0], $lengthseq - $cutpos[$#cutpos]; for (my $x = 1; $x < scalar @cutpos; $x++) {push @fragmentsizes, $cutpos[$x] - $cutpos[$x - 1]}; } else {$fragmentsizes[0] = $lengthseq}; @fragmentsizes = sort {$b<=>$a} @fragmentsizes; print join "\t", $marker, $enzyme, $lengthseq; print "\t", join ",", @cutpos; print "\t", join ",",@fragmentsizes; print "\t", join ",",@{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j]}; print "\n"; }; print "\n"; } print "\n"; } }; print "$info{0}\n"; print '-' x length $info{0}; print "\n\n"; print join "\n", @{$candidate[0]}; };