#!/usr/bin/perl

#perl script to reformat the output of phase 2.1 into the imput format for
#treemap.
#Usage is PH2TL.pl <PHASE.out> <Recomb><Phenotypes.txt> <TreeLD.in> 
#where$ <PHASE.out>$ designates the name of the output file that is
#generated by PHASE, <Recomb> stands for the name of the _recom file 
#generated by phase, $<TreeLD.in>$ is the name of the input file for
#TreeLD and <Phenotypes.txt> is a text file that contains the phenotype
#information,comprised of one phenotype per line in the same order as
#the input file for PHASE.
# Version2 also reads in the output of the -recomb file and adjusts 
# the map accordingly

use strict; 
{ 

  if (@ARGV<3 || @ARGV >4) {
    usage();
  }
   
  my$pofn=0;
  my$rfn=1;
  my$phn=2;
  my$tln=3;

  if (@ARGV==3)			# no file to adjust recombination rates
    {
      $phn=1;
      $tln=2;
    }

print "\nphenotype file \t\t$ARGV[$phn]\n";

  open(DAT,$ARGV[$phn])or die "Can't open $ARGV[2]\n";
  my@ph=<DAT>;
  close(DAT);
  chomp @ph;
  my@pht;
  for (@ph) {

    if ($_=~/^(\d+\.\d)/) {
      push @pht,$1;
    } elsif ($_=~/^(\d+)/) {
      push @pht, "$1.0";
    }
  }

  my$i;
 my$nol;
 my@out;

  #map info
  if (@ARGV==4) {
print "genetic distances\t$ARGV[$rfn]\n";
    open(DAT,$ARGV[$rfn]) or die "can't open $ARGV[$rfn]\n";
    my@map=<DAT>;
    close (DAT);
    chomp @map;

    my@me=split /\s+/,@map[0];
    $nol=@me;
    my@dists;
    for ($i=1;$i<$nol;++$i) {
      my@ests;
      my$j;
      for ($j=1;$j<@map;++$j) {
	my@line=split/\s+/,@map[$j];
	push @ests,$line[$i];
      }
      $dists[$i]=int(median(@ests)*($me[$i]-$me[$i-1]));
    }
    for ($i=1;$i<$nol;++$i) {
      $me[$i]=$me[$i-1]+$dists[$i];
    }

    $out[0]="P";
    for (@me) {
      $out[0]=$out[0]." $_";
    }
  }

  print "PHASE output file\t$ARGV[$pofn]\n\n";

  open(DAT,$ARGV[$pofn]) or die "Can't open $ARGV[$pofn]\n";
  my@in=<DAT>;
  close(DAT);
  chomp @in;

  my$noi;
$i=0;
  while ($in[$i] ne "END INPUT_SUMMARY") {
    if ($in[$i]=~/^Number of Individuals:\s+(\d+)/) {
      $noi=$1;

      if ($noi!=@pht) {
	$i=@pht;
	print"Error: different number of individuals in $ARGV[$pofn] and $ARGV[$phn] ($i!=$noi.\n";
	print "Maybe you have extra linebreaks in $ARGV[$phn]\n";
	exit(8);
      }
    }

    if (@ARGV==4)
      {

	if ($in[$i]=~/^Number of Loci:\s+(\d+)/) {	
	  if ($nol!=$1) {
	    print"Error: different number of markers in $ARGV[$pofn] ($1) than in $ARGV[$rfn] ($nol)\n";
	    exit(8);
	  }
	}

      } else {
      if ($in[$i]=~/^Number of Loci:\s+(\d+)/) {
	$nol=$1;
      }
      if ($in[$i]=~/^Positions of loci: (.+)/) {
	push @out, makemap($1);
      }
    }
    ++$i;
    
    if($i>@in){print"File Format of $ARGV[$pofn] wrong\n";exit(8);}
  }

  while ($in[$i] ne "BEGIN BESTPAIRS1") {
    ++$i;
    if($i>@in){print"File Format of $ARGV[$pofn] wrong\n";exit(8);}
  }
 
  my$k;
  my$j;
  for ($k=0;$k<$noi;++$k) {
    ++$i;
    $in[$i]=~/^(\d)/;
    push @out,"$pht[$j] 2";
    ++$j;
    ++$i;
    push @out,getseq($in[$i]);

    ++$i;
    push @out,getseq($in[$i]);
  }





  open(DAT,">$ARGV[$tln]");
  print DAT"#Converted file, generated by PH2TM.pl\n";
  print DAT"#original filename $ARGV[$pofn], ";
    if (@ARGV==4) {
      print DAT"map information $ARGV[$rfn]\n";
    } else {
      print DAT"no finescale recombination map used\n";
    }

  print DAT"# phenotype information in $ARGV[$phn]\n";
  print DAT"#$noi individuals $nol markers\n";
  for (@out) {
    print DAT"$_\n";
  }
  close(DAT);

print "$ARGV[$tln] generated\n\n";
}
########################################################
sub makemap
  {

    my@map=split " ",$_[0];
    my$out="P";
    for (@map) {
      my$in=int($_);
      $out="$out$in ";
    }

    return($out);
  }
########################################################
sub getseq
  {
    my@seq=split " ",$_[0];
    my$out;
    for (@seq) {
      if ($_=~/(\d)/) {
	$out.=$1;
      }
    }
    return $out;
  }
############################
sub median
  {
    my$ret;
    my@ap=sort {$a<=>$b}@_;
    if (@ap/2==int(@ap/2)) {
      $ret=($ap[@ap/2-1]+$ap[@ap/2])/2;
    } else {
      $ret=$ap[int(@ap/2)];
    }

    return ($ret);
  }
###########################################
sub usage
  {
    print"\nUsage is PH2TL.pl <PHASE.out>
<Recomb><Phenotypes.txt> <TreeLD.in>, where <PHASE.out> designates the
name of the output file that is generated by PHASE, <Recomb> stands
for the name of the _recom file generated by phase, <TreeLD.in> is the
name of the input file for TreeLD and <Phenotypes.txt> is a text file
that contains the phenotype information,comprised of one phenotype per
line in the same order as the input file for PHASE. The <Recomb> file
is optional.\n
Please remember that it is suggested to increase the number of iterations 
of the final run of the PHASE algorithm (the -X option) to generate good 
estimates of fine-scale recombination.\n\n"; 
    exit(8); 
  }
