!/usr/bin/perl -w
#Author: Thomas Thiel (thiel@ipk-gatersleben.de)
#v 0.6
#
#use LWP::Simple;
#require 'dumpvar.pl';

#######################
##### DECLARATION #####
#######################

# INTERNATIONAL UNION OF BIOCHEMISTRY AND MOLECULAR BIOLOGY (IUBMB)
# Nomenclature for Incompletely Specified Bases in Nucleic Acid Sequences
# http://www.chem.qmul.ac.uk/iubmb/misc/naseq.html

my %bas_amb = (
    'A' => 'A-*',
    'C' => 'C-*',
    'G' => 'G-*',
    'T' => 'T-*',
    'm' => '[ac]-*',    # A or C
    'r' => '[ag]-*',    # A or G
    'w' => '[at]-*',    # A or T
    's' => '[cg]-*',    # C or G
    'y' => '[ct]-*',    # C or T
    'k' => '[gt]-*',    # G or T
    'v' => '[acg]-*',   # A or C or G (not T)
    'h' => '[act]-*',   # A or C or T (not G)
    'd' => '[agt]-*',   # A or G or T (not C)
    'b' => '[cgt]-*',   # C or G or T (not A)
    'x' => '[acgtn]-*', # G or A or T or C - not defined
    'n' => '[acgtn]-*'  # G or A or T or C
     );

my %bas_amb_fuzzy = ('A'=>'[AN]-*','C'=>'[CN]-*','G'=>'[GN]-*','T'=>'[TN]-*',
    'm'=>'[acN]-*','r'=>'[agN]-*','w'=>'[atN]-*','s'=>'[cgN]-*','y'=>'[ctN]-*',  
    'k'=>'[gtN]-*','v'=>'[acgN]-*','h'=>'[actN]-*','d'=>'[agtN]-*','b'=>'[cgtN]-*',
    'n' => '[acgtn]-*');

my $rebase_file;
my %enzyme; # stores REBASE restriction enzyme data
my %rebase_customer; # stores REBASE customer info
my %marker; # stores sequence data to each marker
my @candidate; # stores CAPS marker candidates


#######################
##### HELP/SYNTAX #####
#######################

@ARGV == 3 || @ARGV > 0 && $ARGV[0] eq '-gui' || die (
"\n_______________________________________________________________________________

SNP2CAPS
========

A SNP and INDEL analysis tool for CAPS marker development

AUTHOR:  Thomas Thiel

SYNTAX:  (1) GUI version: SNP2CAPS.pl -gui

         (2) Command line version:

             SNP2CAPS.pl <msa_file> <rebase_db> <enzymes> > <output>
             
             <msa_file>  file containing the multiple alignments
             <rebase_db> file containing the REBASE database in GCG format
             <enzymes>   comma separated list of enzymes (e.g. EcoRV,BamHI)
             <output>    save the results in the output file (optional)

see http://pgrc.ipk-gatersleben.de/SNP2CAPS/ for updates

_______________________________________________________________________________\n\n");


###################
##### M A I N #####
###################

if ($ARGV[0] eq '-gui')
  {
  use Tk;
  $mw = MainWindow->new();
  $frameMenu = $mw->Frame()->pack(-anchor => 'w');
  $frameEnz = $mw->Frame(-relief=>'raised', -borderwidth=>1)->pack(-fill => 'both', -ipady=>5);
  $frameChk = $mw->Frame(-relief=>'raised', -borderwidth=>1);
  $frameTxt = $mw->Frame();
  $frameSel = $mw->Frame(-relief=>'raised', -borderwidth=>1);
  $framePic = $mw->Frame();
  $frameInfo = $mw->Frame()->pack(-fill => 'both');
  
  #~~ frameMenu ~~#
  
  $frameMenu->Menubutton(-text => 'File', -tearoff => 0, -underline => 0,
             -menuitems => [['command'=>"Open Sequence alignment", -command=> \&GetSequences3],
                            ['command'=>"Open REBASE data", -command=> \&OpenREBASEdata],
                            #['command' => "Get REBASE data via www", -command  => [ \&GetRebaseData, 'w' ]], # doesn't work accurately: check
                            '-',
                            ['command' => "Open enzyme selection", -command  => \&OpenEnzSel],
                            ['command' => "Save enzyme selection", -command  => \&SaveEnzSel],
                            #'-',
                            #['command' => "Save results", -command  => sub {}], # not yet implemented
                            '-',
                            ['command' => "Help ...", -command  => sub {  $mw->messageBox(-title=>"Help", -message=>
                            "SNP2CAPS:  A SNP and INDEL analysis tool for CAPS marker development\n\nUsage:\n\n1. Load the file containing the multiple alignments (3 formata are supported)\n   (a) modified FASTA formatted file that may contain data of more than\n\tone multiple alignment.\n\tStructure of the header line: '>id_seqname'\n\twith 'id' = name of the multiple alignement file and 'seqname' = sequence name\n   (b) GCG's MSF format\n   (c) Clustal's ALN format\n2. Load the REBASE database file in GCG format (http://rebase.neb.com)\n3. Select individual enzymes or select all\n4. Run the analysis\n5. Select the output format and press the 'Refresh' button\n6. Results may be exported by copy and paste\n\n\nSee http://pgrc.ipk-gatersleben.de/SNP2CAPS/ for updates\n\nFor comments and questions please contact Thomas Thiel (thiel\@ipk-gatersleben.de)\n\n");
                            }],
                            '-',
                            ['command' => "Exit", -command  => sub {exit}]
                           ])->pack(-side => 'left', -anchor => 'n');
                                                   
  $frameMenu->Menubutton(-text => 'Analyse', -tearoff => 0,
             -menuitems => [['command' => "Mine for CAPS markers", -command  => \&Mine],
                           # '-',
                           # ['command'=>"Overview (text version)", -command=> \&DisplayResultsTxt],
                           # ['command'=>"Take a closer look (graphical version)", -command=> \&DisplayResultsPic],   ### graphical output not yet implemented (next release)
                           ])->pack(-side => 'left', -anchor => 'n');
  
  #~~ frameEnz ~~#
  
  $numselEnz = 0; # number of enzymes selected
  $lbAllEnz = $frameEnz->Scrolled('Listbox', -scrollbars=>'e', -label=>"\nAll enzymes", -selectmode=>'extended', -background=>'white')
                       ->pack(-side=>'left', -anchor=>'n', padx=>10);
  
  $chooseEnz = $frameEnz->Frame()->pack(-side=>'left');
  $chooseEnz->Button(-text=>'Select', -command=> \&SelectEnz)
            ->pack(-anchor=>'c', -fill=>'x', -pady=>5);
  $chooseEnz->Button(-text=>'Deselect', -command=> \&DeselectEnz)
            ->pack(-anchor=>'c', -fill=>'x');
  $chooseEnz->Button(-text=>'Delete all', -command=> \&DeselectAllEnz)
            ->pack(-anchor=>'c', -fill=>'x', -pady=>5);
  $chooseEnz->Label(-text=>'Selected:')->pack(-anchor=>'c');
  $chooseEnz->Label(-textvariable=>\$numselEnz)->pack(-anchor=>'c');
  
  $lbSelEnz = $frameEnz->Scrolled('Listbox', -scrollbars=>'e', -label=>"Selected enzymes\n(Click for more details)", -selectmode=>'simple', -background=>'white')
                       ->pack(-side=>'left', -anchor=>'n', -padx=>10);
  $lbSelEnz->bind('<Button-1>', sub {return if $lbSelEnz->size() == 0;
                                     my $sel = $lbSelEnz->get($lbSelEnz->curselection());
                                     &EnzymeData($sel)});
  
  $radbtnEnz = 'exclude';
  foreach (0..3) {$chkbtnEnz[$_] = 1}; $chkbtnEnz[4] = 0;
  $addEnz = $frameEnz->Frame()->pack(-side=>'left');
  $addEnz1 = $addEnz->Frame()->pack(-anchor=>'w');
  $addEnz1->Radiobutton(-text=>'Without isoschizomers', -padx=>10, -value=>'exclude', -variable=>\$radbtnEnz)
          ->pack(-anchor => 'w');
  $addEnz1->Radiobutton(-text=>'Include isoschizomers (not recommended)', -padx=>10, -value=>'include', -variable=>\$radbtnEnz)
          ->pack(-anchor => 'w');
  $addEnz1->Checkbutton(-text=>'only commercially available enzymes', -padx=>10, -variable=> \$chkbtnEnz[0])
          ->pack(-anchor => 'w');
  $addEnz2 = $addEnz->Frame()->pack(-side=>'left', -anchor=>'w');
  $addEnz2->Checkbutton(-text=>'4-base cutters', -padx=>10, -variable=> \$chkbtnEnz[1])
          ->pack(-anchor => 'w');
  $addEnz2->Checkbutton(-text=>'5-base cutters', -padx=>10, -variable=> \$chkbtnEnz[2])
          ->pack(-anchor => 'w');
  $addEnz2->Button(-text=>'Add', -command=>  \&AddEnz)
          ->pack(-anchor => 'w', -padx=>10, -pady=>5, ipadx => 30);
  $addEnz3 = $addEnz->Frame()->pack(-side=>'left', -anchor=>'nw');
  $addEnz3->Checkbutton(-text=>'6-base cutters', -padx=>10, -variable=> \$chkbtnEnz[3])
          ->pack(-anchor => 'w');
  $addEnz3->Checkbutton(-text=>'recognition sites >6', -padx=>10, -variable=> \$chkbtnEnz[4])
          ->pack(-anchor => 'w');
  $addEnz3->Button(-text=>'Run analysis', -command=>  \&Mine)
          ->pack(-anchor => 'w', -padx=>10, -pady=>5, ipadx => 30);
          
  $labelEnz = $frameEnz->Label(-text=>'Enzyme properties:');
  $infoEnz1 = $frameEnz->Frame(-relief=>'groove', -borderwidth=>1);
  $infoEnz1->Menubutton(-text=>'Enzyme', -anchor=>'w')->pack(-fill=>'x');
  $infoEnz1->Menubutton(-text=>'Recogn. sequence', -anchor=>'w')->pack(-fill=>'x');
  $infoEnz1->Menubutton(-text=>'Cutting site', -anchor=>'w')->pack(-fill=>'x');
  $infoEnz1->Menubutton(-text=>'Overhang', -anchor=>'w')->pack(-fill=>'x');
  $infoEnz1->Menubutton(-text=>'Isoschizomers', -anchor=>'w')->pack(-fill=>'x');
  $infoEnz1->Menubutton(-text=>'Companies', -anchor=>'w')->pack(-fill=>'x');
  $infoEnz2 = $frameEnz->Frame(-relief=>'groove', -borderwidth=>1);
  
  #~~ frameChk ~~#
  
  $chkbtnChk[0] = 0; $chkbtnChk[1] = 1; $chkbtnChk[2] = 1; $chkbtnChk[3] = 0; 
  $showChk1 = $frameChk->Frame()->pack(-side=>'left');
  $showChk1->Label(-text=>'Show results of:', -padx=>10)->pack(-anchor=>'w');
  $showChk1->Checkbutton(-text=>'CAPS candidates', -padx=>10, -variable=>\$chkbtnChk[1])
           ->pack(-anchor=>'w');
  $showChk1->Checkbutton(-text=>'CAPS candidates containing N', -padx=>10, -variable=>\$chkbtnChk[2])
           ->pack(-anchor=>'w');
  $showChk2 = $frameChk->Frame()->pack(-side=>'left');
  $showChk2->Label(-text=>'', -padx=>10)->pack(-anchor=>'w');
  $showChk2->Checkbutton(-text=>'false positives', -padx=>10, -variable=>\$chkbtnChk[3])
           ->pack(-anchor=>'w');
  $showChk2->Checkbutton(-text=>'not converted SNP markers', -padx=>10, -variable=>\$chkbtnChk[0])
           ->pack(-anchor=>'w');
  $radbtnChk = 'group_native';
  $showChk3 = $frameChk->Frame()->pack(-side=>'left', -anchor=>'n');
  $showChk3->Label(-text=>'Display:', -padx=>10)->pack(-anchor=>'w');
  $showChk3->Radiobutton(-text=>'predicted cutting sites & fragment lengths', -padx=>10, -value=>'group_native', -variable=>\$radbtnChk)
           ->pack(-anchor=>'w');
  $showChk3->Radiobutton(-text=>'sequences of recognition sites', -padx=>10, -value=>'group_align', -variable=>\$radbtnChk)
           ->pack(-anchor=>'w');
  
  $frameChk->Button(-text=>"Refresh", -command=>\&ShowResultsTxt)
                   ->pack(-side=>'left', -padx=>10, -ipadx=>15, ipady=>15);
  
  #~~ frameTxt Field ~~#
  
  $header = $frameTxt->Text(-height=>1)->pack(-fill=>'x');
  $text = $frameTxt->Scrolled('Text', -scrollbars=>'soe', -width=>100, -height=>10, -wrap=>'none')
                   ->pack(-side=>'bottom', -fill=>'both', -expand=>1);
  
  #~~ frameSel ~~#
  
  $lbAllSel = $frameSel->Scrolled('Listbox', -scrollbars=>'e', -label=>'Choose one marker', -background=>'white')
                       ->pack(-side=>'left', padx=>10);
  $lbAllSel->bind('<Button-1>', \&ShowEnzymes);
  
  $lbEnzSel = $frameSel->Scrolled('Listbox', -scrollbars=>'e', -label=>'Choose enzyme(s)', -background=>'white')
                       ->pack(-side=>'left', padx=>10);
  
  
  #~~ framePic ~~#   ### graphical output not yet implemented (next release)
  
  
  #~~ frameInfo Field ~~#
  
  $info = $frameInfo->Scrolled('Text', -height=>3, -scrollbars=>'e', -background=>'lightgrey', relief=>'ridge')
                    ->pack(-fill => 'x', -expand => 1);
  
  $info->insert('end','Welcome...');
  
  
  &DisplayResultsTxt;  ### graphical output not yet implemented (next release)
  MainLoop;
  }
else # command line version
  {
  GetSequences3();
  GetRebaseData('f');
  GetCandidates(split ',', $ARGV[2]);
  Results();
  };

##########################
##### TK SUBROUTINES #####
##########################

#>>> frameMen <<<#

sub OpenREBASEdata
  {
  GetRebaseData('f');
  my @enzall = sort keys %enzyme;
  $lbAllEnz->delete(0, 'end'); # delete existing entries
  $lbAllEnz->insert('end', @enzall)
  };

sub OpenEnzSel
  {
  my $file = $mw->getOpenFile() || return;
  open (IN, "<$file");
  my @selection = <IN>; chomp @selection;
  close (IN);
  $lbSelEnz->delete(0, 'end'); # delete existing entries
  $lbSelEnz->insert('end', @selection);
  $numselEnz = scalar @selection
  };

sub SaveEnzSel
  {
  my $file = $mw->getSaveFile() || return;
  open (OUT, ">$file") || die ("\nError: File doesn't exist !\n\n");
  print OUT join "\n", $lbSelEnz->get(0, 'end');
  close (OUT)
  };

sub Mine
  {
  my @enz;
  push @enz, $lbSelEnz->get(0, 'end');
  GetCandidates(@enz)
  };

#>>> frameEnz <<<#

sub SelectEnz
  {
  my %enzsel;
  foreach ($lbSelEnz->get(0, 'end')) {$enzsel{$_} = ''}; # get already selected enzymes
  foreach ($lbAllEnz->curselection()) {$enzsel{$lbAllEnz->get($_)} = ''}; # add enzymes
  $lbSelEnz->delete(0, 'end'); # delete existing entries
  $lbSelEnz->insert('end', sort keys %enzsel);
  $numselEnz = scalar keys %enzsel;
  };

sub DeselectEnz
  {
  my %enzdesel;
  foreach ($lbSelEnz->get(0, 'end')) {$enzdesel{$_} = ''};
  foreach ($lbSelEnz->curselection()) {delete $enzdesel{$lbSelEnz->get($_)}};
  $lbSelEnz->delete(0, 'end'); # delete existing entries
  $lbSelEnz->insert('end', sort keys %enzdesel);
  $numselEnz = scalar keys %enzdesel;
  };

sub DeselectAllEnz {$lbSelEnz->delete(0, 'end'); $numselEnz = 0};

sub AddEnz
  {
  my %addenz;
  @addenz{keys %enzyme} = ();
  foreach (keys %addenz)
    {
    if ($radbtnEnz eq 'exclude' && $enzyme{$_}[2] == 1) {delete $addenz{$_}; next};
    if ($chkbtnEnz[0] == 1 &&  $enzyme{$_}[7] eq '') {delete $addenz{$_}; next};
    if ($chkbtnEnz[1] == 0 && $enzyme{$_}[3] == 4) {delete $addenz{$_}; next};
    if ($chkbtnEnz[2] == 0 && $enzyme{$_}[3] == 5) {delete $addenz{$_}; next};
    if ($chkbtnEnz[3] == 0 && $enzyme{$_}[3] == 6) {delete $addenz{$_}; next};
    if ($chkbtnEnz[4] == 0 && $enzyme{$_}[3] > 6) {delete $addenz{$_}; next};
    };
  foreach ($lbSelEnz->get(0, 'end')) {$addenz{$_} = ''}; # add current state
  $lbSelEnz->delete(0, 'end'); # delete existing entries
  $lbSelEnz->insert('end', sort keys %addenz);
  $numselEnz = scalar keys %addenz;
  };

sub EnzymeData
  {# called with enzyme name
  my @info;
  $info[0] = shift;
  foreach (1,4,5,6,7) {push @info, $enzyme{$info[0]}[$_]};
  $labelEnz->pack(-side=>'top', -pady=>10, -anchor=>'w', -ipadx=>6);
  $infoEnz1->pack(-side=>'left', -anchor=>'n');
  $infoEnz2->pack(-side=>'left', -anchor=>'n', -padx=>5);
  foreach ($infoEnz2->children()) {$_->destroy};
  foreach (0..3) {$infoEnz2->Menubutton(-text=>"$info[$_]", -anchor=>'w', -width=>15)->pack(-fill=>'x')};
  if ($info[4])
    {
    my $menu_iso = $infoEnz2->Menubutton(-text=>'Click...', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x');
    foreach (split ',',$info[4]) {$menu_iso->command(-label=>$_, -command=> [\&EnzymeData, $_] )}
    }
  else {$infoEnz2->Menubutton(-text=>'', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x')}
  if ($info[5])
    {
    my $menu_com = $infoEnz2->Menubutton(-text=>'Click...', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x');
    foreach (split '', $info[5]) {$menu_com->command(-label=>$rebase_customer{$_})}
    }
  else {$infoEnz2->Menubutton(-text=>'', -tearoff=>0, -anchor=>'w')->pack(-fill=>'x')}
  };

#>>> frameChk/Txt <<<#

sub DisplayResultsTxt
  {
  return if $frameTxt->ismapped; # frame is already existing
  $frameSel->packForget;
  $framePic->packForget;
  $frameChk->pack(-fill => 'both', -after=>$frameEnz);
  $frameTxt ->pack(-fill => 'both', -expand => 1, -after=>$frameChk)
  };

sub ShowResultsTxt
  {
  my %info = (
    1 => 'Predicted CAPS candidates',
    2 => 'Predicted CAPS candidates (need further inspection: at least one N found within target)',
    3 => 'Probably false positive candidates (need further inspection: at least one N found within target)',
    0 => 'Following markers could not be converted into CAPS'
    );
  $text->delete("1.0",'end'); # delete text window
  $header->delete("1.0",'end'); # delete header
  &ShowNativeTxt(\%info) if $radbtnChk eq 'group_native';
  &ShowAlignTxt(\%info) if $radbtnChk eq 'group_align';
  };

sub ShowNativeTxt
  {
  my %info = %{$_[0]};
  $header->insert('end', sprintf ("%-10.10s %-9.9s %-5.5s %25.25s %25.25s   %-25.25s\n", 'Marker', 'Enzyme', 'Total', 'Restriction Sites', 'Expected Fragments', 'Members'));
  for (my $i = 1; $i < 4; $i++) # iterate Test Groups
    {
    next if $chkbtnChk[$i] == 0 || scalar keys %{$candidate[$i]} == 0;
    $text->insert('end', "$info{$i}\n");
    $text->insert('end', '-' x length $info{$i});
    $text->insert('end', "\n\n");
    foreach my $marker (sort keys %{$candidate[$i]})
      {
      foreach my $enzyme (sort keys %{$candidate[$i]{$marker}})
        {
        for (my $j = 0; $j < scalar @{$candidate[$i]{$marker}{$enzyme}{'group_native'}}; $j++)
          {
          my @cutpos; # store cut positions
          for (my $x = 1; $x < scalar @{$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}}; $x += 4)
            {
            my $cutpos = $candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[$x];
            push @cutpos, $cutpos if $cutpos > 0;
            };
          @cutpos = sort {$a <=> $b} @cutpos;
          my @fragmentsizes; # stores fragment sizes
          my $length_align = length $marker{$marker}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[0];
          my $lengthseq = $length_align - PrecedingGaps($marker, $candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0], $length_align);
          if (@cutpos)
            {
            push @fragmentsizes, $cutpos[0], $lengthseq - $cutpos[$#cutpos];
            for (my $x = 1; $x < scalar @cutpos; $x++) {push @fragmentsizes, $cutpos[$x] - $cutpos[$x - 1]};
            }
          else {$fragmentsizes[0] = $lengthseq};
          @fragmentsizes = sort {$b<=>$a} @fragmentsizes;
          $text->insert('end', sprintf ("%-10.100s %-9.100s %5.100s ", $marker, $enzyme, $lengthseq));
          $text->insert('end', sprintf ("%25.100s ", join ",",@cutpos));
          $text->insert('end', sprintf ("%25.100s   ", join ",",@fragmentsizes));
          $text->insert('end', join ",",@{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j]});
          $text->insert('end', "\n");
          };
        $text->insert('end', "\n");
        }
      $text->insert('end', "\n");
      }
    };
  &ShowNotConverted($info{0}) if $chkbtnChk[0] == 1 && $candidate[0] && scalar @{$candidate[0]} > 0 # conversion failed
  };

sub ShowAlignTxt
  {
  my %info = %{$_[0]};
  $header->insert('end', sprintf ("%-10.10s %-9.9s %-50.50s %-20.20s %-20.20s %-20.20s %-5.5s\n", 'Marker', 'Enzyme', 'Members', 'Cutting Site 1', 'Cutting Site 2', 'Cutting Site 3', '...'));
  for (my $i = 1; $i < 4; $i++) # iterate Test Groups
    {
    next if $chkbtnChk[$i] == 0 || scalar keys %{$candidate[$i]} == 0;
    $text->insert('end', "$info{$i}\n");
    $text->insert('end', '-' x length $info{$i});
    $text->insert('end', "\n\n");
    foreach my $marker (sort keys %{$candidate[$i]})
      {
      foreach my $enzyme (sort keys %{$candidate[$i]{$marker}})
        {
        for (my $j = 0; $j < scalar @{$candidate[$i]{$marker}{$enzyme}{'group_align'}}; $j++)
          {
          $text->insert('end', sprintf ("%-10.100s %-9.100s ", $marker, $enzyme));
          $text->insert('end', sprintf ("%-50.100s ", join (",",@{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j]})));
          for (my $x = 0; $x < scalar @{$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[0][0]}}; $x += 4)
	    {
	    my $recseq = $candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j][0]}[$x + 3];
	    $recseq =~ s/-+/\~/g;
	    $text->insert('end', sprintf ("%-20.20s ", "$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j][0]}[$x + 2] $recseq ($candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_align'}[$j][0]}[$x])"));
            };
          $text->insert('end', "\n");
          };
        $text->insert('end', "\n");
        }
      $text->insert('end', "\n");  
      }
    };
  &ShowNotConverted($info{0}) if $chkbtnChk[0] == 1 && $candidate[0] && scalar @{$candidate[0]} > 0  # conversion failed
  };

sub ShowNotConverted
  {
  my $info = shift;
  $text->insert('end', "$info\n");
  $text->insert('end', '-' x length $info);
  $text->insert('end', "\n\n");
  $text->insert('end', join "\n", @{$candidate[0]});
  };


#>>> frameSel/Pic <<<#

sub DisplayResultsPic  ### graphical output not yet implemented (next release)
  {
  return if $framePic->ismapped;
  $frameTxt->packForget;
  $frameChk->packForget;
  $frameSel->pack(-fill => 'both', -after=>$frameEnz, -ipady=>5);
  $framePic->pack(-fill => 'both', -expand => 1, -after=>$frameSel)
  };

sub ShowEnzymes # Show all relevant enzymes
  {
  return if $lbAllSel->size() == 0;
  my $marker = $lbAllSel->get($lbAllSel->curselection());
  $lbEnzSel->delete(0,'end'); # delete all entries
  for (my $i = 1; $i < 4; $i++)
    {
    if (exists $candidate[$i]{$marker})
      {
      $lbEnzSel->insert('end', sort (keys %{$candidate[$i]{$marker}}));
      }
    }
  };

#######################
##### SUBROUTINES #####
#######################

#>>> READ DNA sequences from file <<<#

sub GetSequences
  {
  my $file = $mw->getOpenFile() || return;
  my ($count_marker,$count_sequences);
  %marker = ();
  open (IN, "<$file");
  $/ = ">>";
  while (<IN>)
    {
    my ($marker_name, $fasta);
    next unless (($marker_name, $fasta) = /(.*?)\n(.*)/s);
    $count_marker++;
    while ($fasta =~ />(.*?)\n(.*?)(?=\n)/gs)  # $1->cultivar  $2->sequence
      {
      (my $cultivar, my $seq) = ($1, $2);
      $marker{$marker_name}{$cultivar} = [$seq];
      $count_sequences++;
      while ($seq =~ /(-+)/g) # find out positions of gaps
        {
        my $gap_pos = pos ($seq) + 1 - length $1;
        push @{$marker{$marker_name}{$cultivar}}, $gap_pos, length $1;
        }
      }
    };
  $/ = "\n";
  close (IN);
  $info -> insert("insert","\nLoaded data of $count_marker marker(s) and $count_sequences sequence(s).");
  $info -> yviewMoveto(1);
  };

sub GetSequences2
  {
  if ($ARGV[0] eq "-gui")
    {$file = $mw->getOpenFile() || return}
  else {$file = $ARGV[0]}
  %marker = ();
  open (IN, "<$file") || die ("Could not open $file\n");
  my $msa_type; # type of alignment file
  while (<IN>)
    {
    if (/^>[^_]+_[^_]+/) {$msa_type = 'S2C';last}
    elsif (/^MSF/)  {$msa_type = 'MSF';last}
    elsif (/^CLUSTAL W/)  {$msa_type = 'ALN';last}
    };
  my ($count_sequences, $count_marker);
  seek (IN, 0, 0);
  if ($msa_type eq 'S2C') # SNP2CAPS format
    {
    $/ = ">";
    while (<IN>)
      {
      my ($id, $seq);
      next unless (($id, $seq) = /(.*?)\n(.*)/s);
      my ($marker_name, $cultivar) = $id =~ /(.*?)_(.*)/;
      $seq =~ s/[\s>]//g; # remove whitespace
      $count_sequences++;
      $marker{$marker_name}{$cultivar} = [$seq];
      push @{$marker{$marker_name}{$cultivar}}, FindGaps(\$seq);
      };
    $count_marker = scalar keys %marker;
    }
  elsif ($msa_type eq 'MSF')
    {
    $/ = "\n";
    my @parts = split '//', join '', <IN>; # $parts[2] contains sequence alignment
    foreach (split "\n", $parts[2])
      {
      my ($cultivar, $seq);
      next unless (($cultivar, $seq) = /^(\S+)\s+(\S+)/);
      $marker{$file}{$cultivar}[0] .= $seq;
      };
    foreach my $cultivar (keys %{$marker{$file}})
      {
      push @{$marker{$file}{$cultivar}}, FindGaps(\$marker{$file}{$cultivar}[0]);
      };
    $count_marker = 1;
    $count_sequences = scalar keys %{$marker{$file}};
    }
  elsif ($msa_type eq 'ALN') # CLUSTAL W format
    {
    $/ = "\n";
    while (<IN>)
      {
      chomp;
      next if /^CLUSTAL W/;
      my ($cultivar, $seq);
      next unless (($cultivar, $seq) = /^(\S+)\s+(\S+)/);
      $marker{$file}{$cultivar}[0] .= $seq;
      };
    foreach my $cultivar (keys %{$marker{$file}})
      {
      push @{$marker{$file}{$cultivar}}, FindGaps(\$marker{$file}{$cultivar}[0]);
      };
    $count_marker = 1;
    $count_sequences = scalar keys %{$marker{$file}};
    }
  else {return};
  $/ = "\n";
  close (IN);
  if ($ARGV[0] eq "-gui")
    {
    $info -> insert("insert","\nLoaded data of $count_marker marker(s) and $count_sequences sequence(s).");
    $info -> yviewMoveto(1);
    };
  };
  
sub GetSequences3
  {
  use Bio::AlignIO;
  my ($count_sequences, $count_marker);
  if ($ARGV[0] eq "-gui") {$file = $mw->getOpenFile() || return}
  else {$file = $ARGV[0]}
  %marker = ();
  my $msaformat = Bio::AlignIO->_guess_format($file);
  $msaformat || ($msaformat = 'S2C');
  print $msaformat;
  if ($msaformat eq 'S2C') # SNP2CAPS format
    {
    open (IN, "<$file") || die ("Could not open $file\n");
    #seek (IN, 0, 0);
    $/ = ">";
    while (<IN>)
      {
      my ($id, $seq, $marker_name, $cultivar);
      next unless (($id, $seq) = /(.*?)\n(.*)/s);
      next unless ($marker_name, $cultivar) = $id =~ /(.*?)_(.*)/;
      $seq =~ s/[\s>]//g; # remove whitespace
      $count_sequences++;
      $marker{$marker_name}{$cultivar} = [$seq];
      push @{$marker{$marker_name}{$cultivar}}, FindGaps(\$seq);
      };
    close (IN);
    $count_marker = scalar keys %marker;
    }
  else
    {
    my $msafile = Bio::AlignIO->new(-file=>$file, -format=>$msaformat);
    while (my $msa = $msafile->next_aln())
      {
      $count_marker++;
      foreach my $seqobj ($msa->each_seq)
        {
        $count_sequences++;
        my $cultivar = $seqobj->display_id();
        my $seq = $seqobj->seq();
        my $name = "$file".'_'."$count_marker";
        $marker{$name}{$cultivar}[0] = $seq;
        push @{$marker{$name}{$cultivar}}, FindGaps(\$marker{$name}{$cultivar}[0]);
        };
      };
    };
  if ($ARGV[0] eq "-gui")
    {
    $info -> insert("insert","\nLoaded data of $count_marker marker(s) and $count_sequences sequence(s).");
    $info -> yviewMoveto(1);
    };
  };


sub FindGaps
  {
  my ($seqref) = @_;
  my @gaps; # stores gap position and type
  while ($$seqref =~ /([-_\.]+)/g) # find out positions of gaps
    {
    my $gap_pos = pos ($$seqref) + 1 - length $1;
    push @gaps, $gap_pos, length $1;
    };
  return @gaps
  };

#>>> READ REBASE enzyme data <<<#

sub GetRebaseData
  { # Option 'w' -> obtain via WWW, 'f' -> Read from File
  my ($rebase_file, $rebase_version, $rebase_reldate, $count);
  if ($_[0] eq 'w') {while (!($rebase_file = get("http://rebase.neb.com/rebase/link_gcg"))) {}}
  else
    {
    if ($ARGV[0] eq '-gui') {$file = $mw->getOpenFile() || return}
    else  {$file = $ARGV[1]};
    open (IN, "<$file");
    $rebase_file = join '',<IN>;
    close (IN)
    };
  %enzyme = ();
  # EXTRACT items from the REBASE data file #
  (my $rebase_file1, my $rebase_file2) = split /\.\.\n/,$rebase_file;  # ".." separates intro and enzyme list
  ($rebase_version) = $rebase_file1 =~ /\nREBASE version (\d+)/s;
  ($rebase_reldate) = $rebase_file1 =~ /\nRich Roberts\s+(.*?)\n/s;
  (%rebase_customer) = $rebase_file1 =~ /\n\s+([A-Z])\s+(.*?)(?=\n)/gs;

  while ($rebase_file2 =~ /(\S+)\s+(\S+)\s+(\S+)\s+(\S+)(?: \? !| !)\s+(\S+|)\s+>(\S+|)/gs)
    # $1->name  $2->cutting pos.  $3->recognition seq.  $4->overhang
    # $5->isoschizomers  $6->commercial sources
    {
    my $isoschizomer = 0; # isoschizomer flag (0->false, 1-true)
    my $recseq = $3;
    my $enzname = $1;
    {
      $recseq =~ s/[_']//g; # remove cut position markers
      $isoschizomer = 1 if $enzname =~ s/;//; # preceding ; marks isoschizomers
      $recseq =~ s/N+$//i;
    };   # curly brackets keep matched variables
    (my $recseq_regex, my $recseq_fuzzy) = ($recseq, $recseq);
    foreach (keys %bas_amb) {$recseq_regex =~ s/$_/$bas_amb{$_}/g}; # translate to regex like style
    foreach (keys %bas_amb_fuzzy) {$recseq_fuzzy =~ s/$_/$bas_amb_fuzzy{$_}/g}; # for enzymes which are listed twice, e.g. AloI
    if (exists $enzyme{$enzname}) {$enzname .= ' !'} else {$count++};
    $enzyme{$enzname} = [$recseq_regex, $recseq, $isoschizomer, length ($recseq), $2, $4, $5, $6, $recseq_fuzzy];
    };
  if ($ARGV[0] eq '-gui')
    {
    $info->insert("insert","\nLoaded REBASE data base (version $rebase_version, $rebase_reldate) comprising $count enzymes.");
    $info->yviewMoveto(1);
    };
  };


#>>> SEARCH for CAPS candidates <<<#

sub GetCandidates
  { # call by enzymes to be used for screening
  if (scalar keys %enzyme == 0) {$info->insert("insert","\nError: Load REBASE data base first"); $info->yviewMoveto(1); return};
  if (scalar keys %marker == 0) {$info->insert("insert","\nError: Load Sequence Alignment first"); $info->yviewMoveto(1); return};
  if ($ARGV[0] eq '-gui') {$info->insert("insert","\nAnalyse..."); $info->yviewMoveto(1); $info->update};
  @candidate = ();
  foreach my $enzyme_name(@_)
    {
    my ($not_palindromic, $rc_recseq, $rc_recseq_regex, $rc_recseq_fuzzy);
    study $enzyme{$enzyme_name}[0];   # of any benefit ?
    if (! Palindromic($enzyme{$enzyme_name}[1]) && ! (exists $enzyme{$enzyme_name.' !'} || $enzyme_name =~ /!/)) # test if palindromic or of type as AloI
      {
      $not_palindromic = 1;
      $rc_recseq = reverse DNAComplement($enzyme{$enzyme_name}[1]);
      ($rc_recseq_regex, $rc_recseq_fuzzy) = ($rc_recseq, $rc_recseq);
      foreach (keys %bas_amb) {$rc_recseq_regex =~ s/$_/$bas_amb{$_}/g}; # translate to regex like style
      foreach (keys %bas_amb_fuzzy) {$rc_recseq_fuzzy =~ s/$_/$bas_amb_fuzzy{$_}/g}
      }
    foreach my $marker_name (keys %marker)
      {
      my %cut; # stores enzyme binding positions temporarily
      foreach my $cultivar (keys %{$marker{$marker_name}})
        {
        while ($marker{$marker_name}{$cultivar}[0] =~ /($enzyme{$enzyme_name}[0])/ig)
          {
          pos ($marker{$marker_name}{$cultivar}[0]) += 1 - length $1;
          my $start = pos ($marker{$marker_name}{$cultivar}[0]);
          $cut{$start}{$cultivar} = ['f',$1];
          };
        if ($not_palindromic) # rec. seq. not palindromic => check the rc as well
          {
          while ($marker{$marker_name}{$cultivar}[0] =~ /($rc_recseq_regex)/ig)
            {
            pos ($marker{$marker_name}{$cultivar}[0]) += 1 - length $1;
            my $start = pos ($marker{$marker_name}{$cultivar}[0]);
            $cut{$start}{$cultivar} = ['r',$1];
            };
          };
        };
      my %test; # test for CAPS candidates
      # (1) true CAPS candidate (alternative restriction due to SNP)
      # (2) Not sure: probably true CAPS candidate (N within DNA sequence, but SNP at different position)
      # (3) Not sure: probably false positive CAPS candidate at at least one site (N within DNA sequence)
      # cases (2) and (3) need further inspection by the experimenter
      foreach my $site (keys %cut)
        {
        my $difference = scalar (keys %{$marker{$marker_name}}) - scalar (keys %{$cut{$site}});
        if ($difference > 0) # test if all sequences were cut
          {
          my %intersection; # intersection between 2 hashes
          @intersection{ keys %{$marker{$marker_name}} } = ();
          delete @intersection{ grep (exists $intersection{$_}, keys %{$cut{$site}}) };
          foreach my $cultivar (keys %intersection) # for all that were not cut
            {
            my $test_seq = substr($marker{$marker_name}{$cultivar}[0], $site - 1);
            $test_seq =~ s/((?:[^-]-*){$enzyme{$enzyme_name}[3]}).*/$1/;
            $cut{$site}{$cultivar} = ['n',$test_seq];
            if ($test_seq !~ /N/ig) {$test{1} = ""} # see above
            elsif ($test_seq !~ /$enzyme{$enzyme_name}[8]/i)
              {
              if ($not_palindromic) {if ($test_seq !~ /$rc_recseq_fuzzy/i) {$test{2} = ""}}
              else {$test{2} = ""}
              }
            else {$test{3} = ""}
            }
          }
        };
      if (%test) # true if alternative restriction was found
        {
        my $test_id = Test(\%test);
        foreach my $site (sort {$a <=> $b} keys %cut)
          {
          foreach my $cultivar (keys %{$cut{$site}})
            {
            my $cutsite = 0; # calculate the cut site position
            if ($cut{$site}{$cultivar}[0] eq "f") {$cutsite = $site - PrecedingGaps($marker_name, $cultivar, $site) + $enzyme{$enzyme_name}[4] - 1}
            elsif ($cut{$site}{$cultivar}[0] eq "r") {$cutsite = $site - PrecedingGaps($marker_name, $cultivar, $site) + $enzyme{$enzyme_name}[3] - $enzyme{$enzyme_name}[4] - 1}; ## gaps within rec.seq???? TEST
            push @{$candidate[$test_id]{$marker_name}{$enzyme_name}{$cultivar}}, $cut{$site}{$cultivar}[0], $cutsite, $site, $cut{$site}{$cultivar}[1];
            };
          };
        $candidate[$test_id]{$marker_name}{$enzyme_name} = group($marker_name,\%{$candidate[$test_id]{$marker_name}{$enzyme_name}});
        }
      }
    };
  my (%not_converted, @converted);
  @not_converted{keys %marker} = ();
  for (my $i = 1; $i < 4; $i++)
    {
    delete @not_converted{keys %{$candidate[$i]}};
    push @converted, keys %{$candidate[$i]}
    };
  @{$candidate[0]} = sort keys %not_converted;
  @converted = sort @converted;
  if ($ARGV[0] eq '-gui')
    {
    $lbAllSel->delete(0,'end');
    $lbAllSel->insert('end', @converted);
    $info -> insert("insert","DONE");
    };
  #dumpValue(\@candidate);
  };


#>>> SORTING ROUTINE <<<#

sub group
  { # two keys are appended during this routine containing sorting matrizes
    # according to the presence / absence of restriction sites (group_align)
    # and according to the resulting fragment sizes (group_native)
  my $marker = shift;
  my %group = %{$_[0]};
  my (%sort_align, %sort_native); # contain the sorting keys
  my @cultivar = sort keys %group;
  for (my $i = 0; $i < scalar @cultivar; $i++)
    {
    for (my $j = 0; $j < scalar @{$group{$cultivar[$i]}}; $j += 4)
      {
      $sort_align{$cultivar[$i]} .= $group{$cultivar[$i]}[$j + 3]; # rec.seq.
      $sort_native{$cultivar[$i]} .= $group{$cultivar[$i]}[$j + 1] # cut.pos.
      };
    $sort_native{$cultivar[$i]} .= length $marker{$marker}{$cultivar[$i]}[0]; #considers also total fragment length
    };
  $group{'group_align'}[0][0] = $cultivar[0]; # initialize sorting matrix with first element
  $group{'group_native'}[0][0] = $cultivar[0];
  for (my $i = 1; $i < scalar @cultivar; $i++) # check all cultivars
    {
    my $found_group_align = 0;
    my $found_group_native = 0;
    for (my $j = 0; $j < scalar @{$group{'group_align'}}; $j++) # check in all existing groups
      {
      if ($sort_align{$cultivar[$i]} eq $sort_align{$group{'group_align'}[$j][0]}) # compare with the first element of each group
        {
        push @{$group{'group_align'}[$j]}, $cultivar[$i];
        $found_group_align = 1;
        last
        }
      };
    for (my $j = 0; $j < scalar @{$group{'group_native'}}; $j++)
      {
      if ($sort_native{$cultivar[$i]} eq $sort_native{$group{'group_native'}[$j][0]})
        {
        push @{$group{'group_native'}[$j]}, $cultivar[$i];
        $found_group_native = 1;
        last
        }
      };
    if (not $found_group_align) {$group{'group_align'}[scalar @{$group{'group_align'}}][0] = $cultivar[$i]}; # form a new group
    if (not $found_group_native) {$group{'group_native'}[scalar @{$group{'group_native'}}][0] = $cultivar[$i]}
    };
  return \%group
  };

#>>> Get the number of preceding gaps <<<#

sub PrecedingGaps
  { # invoke with (1) marker name, (2) cultivar name, (3) position to be tested
  my $gaps = 0;
  my @array = @{$marker{$_[0]}{$_[1]}};
  my $position = $_[2];
  shift @array;
  while (@array)
    {
    my $pos = shift @array;
    my $length = shift @array;
    if ($pos <= $position) {$gaps += $length}
    else {last};
    };
  return $gaps
  };

#>>> Generate the complement sequence <<<#

sub DNAComplement
  { # invoke by DNA sequence
  my $seq = shift;
  $seq =~ tr/ACTGactgMRVHmrvhKYBDkybd/TGACtgacKYBDkybdMRVHmrvh/;
  return $seq
  };

#>>> Check if sequence is complementary <<<#

sub Palindromic
  { # invoke by DNA sequence
  my $seq = shift;
  return 1 if ($seq eq reverse DNAComplement($seq));
  return 0
  };

#>>> Does the quality grouping <<<#

sub Test
  {
  my @test = keys %{$_[0]};
  return $test[0] if scalar @test == 1;
  return 2;
  };

sub Results
  {
  my %info = (
    1 => 'Predicted CAPS candidates',
    2 => 'Predicted CAPS candidates (need further inspection: at least one N found within target)',
    3 => 'Probably false positive candidates (need further inspection: at least one N found within target)',
    0 => 'Following markers could not be converted into CAPS'
    );
  print "#Format:\n" ,join "\t", '#Marker', 'Enzyme', 'Total size', 'Restriction Sites', 'Expected Fragments', 'Members', "\n\n";

  for (my $i = 1; $i < 4; $i++) # iterate Test Groups
    {
    next if scalar keys %{$candidate[$i]} == 0;
    print  "$info{$i}\n";
    print '-' x length $info{$i}, "\n\n";
    foreach my $marker (sort keys %{$candidate[$i]})
      {
      foreach my $enzyme (sort keys %{$candidate[$i]{$marker}})
        {
        for (my $j = 0; $j < scalar @{$candidate[$i]{$marker}{$enzyme}{'group_native'}}; $j++)
          {
          my @cutpos; # store cut positions
          for (my $x = 1; $x < scalar @{$candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}}; $x += 4)
            {
            my $cutpos = $candidate[$i]{$marker}{$enzyme}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[$x];
            push @cutpos, $cutpos if $cutpos > 0;
            };
          @cutpos = sort {$a <=> $b} @cutpos;
          my @fragmentsizes; # stores fragment sizes
          my $length_align = length $marker{$marker}{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0]}[0];
          my $lengthseq = $length_align - PrecedingGaps($marker, $candidate[$i]{$marker}{$enzyme}{'group_native'}[$j][0], $length_align);
          if (@cutpos)
            {
            push @fragmentsizes, $cutpos[0], $lengthseq - $cutpos[$#cutpos];
            for (my $x = 1; $x < scalar @cutpos; $x++) {push @fragmentsizes, $cutpos[$x] - $cutpos[$x - 1]};
            }
          else {$fragmentsizes[0] = $lengthseq};
          @fragmentsizes = sort {$b<=>$a} @fragmentsizes;
          print join "\t", $marker, $enzyme, $lengthseq;
          print "\t", join ",", @cutpos;
          print "\t", join ",",@fragmentsizes;
          print "\t", join ",",@{$candidate[$i]{$marker}{$enzyme}{'group_native'}[$j]};
          print "\n";
          };
        print "\n";
        }
      print "\n";
      }
    };
  print "$info{0}\n";
  print '-' x length $info{0};
  print "\n\n";
  print join "\n", @{$candidate[0]};
  };
