#! /usr/bin/perl
#
# mergeConds [OPTIONS] -out <outfile> -data <mergefile 1> [<mergefile 2> ...] 
#
# Trey Ideker 6/2000
#
################################################################################

# global variables and hashes
my(%gene, %info);
my($NUMCONDS, $NUMGENES);

# parse command line

$USAGE="  mergeConds [OPTIONS] -out <matrixFile> -conds <mergefile 1> [<mergefile 2> ...]\n".
       "        -lam <num>\n" .
       "        -rat <num>\n" .
       "        -std <num>\n" .
       "        -n <num>\n";
$NUMCONDS = -1; 
$LAMBDA = $RATIO = $SAMPLES = 0;
$STD = 1000;     # init STD to a very large number
for ($i=0; $i<=$#ARGV; $i++) {
    if ($ARGV[$i] =~ /-lam/)   { $LAMBDA     = $ARGV[++$i]; }
    elsif ($ARGV[$i] =~ /-rat/)   { $RATIO      = $ARGV[++$i]; }
    elsif ($ARGV[$i] =~ /-std/)   { $STD        = $ARGV[++$i]; }
    elsif ($ARGV[$i] =~ /-n/)     { $SAMPLES    = $ARGV[++$i]; }
    elsif ($ARGV[$i] =~ /-out/)   { $outFile    = $ARGV[++$i]; }
    elsif ($ARGV[$i] =~ /-conds/) { $NUMCONDS    = 0; }
    elsif ($NUMCONDS >= 0)         { $condFiles[$NUMCONDS++] = $ARGV[$i]; } 
}
if ( $NUMCONDS == -1 )    { die "\nCannot parse command line.\n$USAGE\n\n"; }
if ( !defined($outFile)) { die "\nCannot parse command line.\n$USAGE\n\n"; }

# define significant genes in hash structure
print("#DETERMINING SIGNIFICANT GENES OVER ALL $NUMCONDS CONDITIONS...\n");
$filenum = 0;
foreach $file (@condFiles) {
    $sigNum[$filenum] = getSigGenes($file, $filenum);
    $filenum++;
}

# read back through files, this time getting all of the ratios for sig genes
@keys = keys(%gene);     $NUMGENES = $#keys + 1;
print("\n#FOUND $NUMGENES SIGNIFICANT GENES, READING RATIOS...\n");
$filenum = 0;
foreach $file (@condFiles) {
    getRatios($file, $filenum++);
}

# print out matrix of ratios for each sig. gene over all conditions
# first print header information
die "Cannot access file $file\n" if (open(OUTFILE, ">$outFile") == 0 );
print OUTFILE ("\t\tRATIOS");
foreach $file (@condFiles) { print OUTFILE ("\t"); }
print OUTFILE ("LAMBDAS\n");
print OUTFILE ("GENE\tDESCRIPT\t");
# print headings for ratios
foreach $file (@condFiles) { 
    @parts = split (/\//, $file);          # split on /
    printf OUTFILE ("%s\t", pop(@parts) );
}
# print headings for lambdas
foreach $file (@condFiles) { 
    @parts = split (/\//, $file);          # split on /
    printf OUTFILE ("%s\t", pop(@parts) );
}
print OUTFILE ("NumSigConds\n");

# now print actual gene ratios (and lambdas)
foreach $ORF (keys(%gene)) {
    printRatios(OUTFILE, $ORF);
}

# print summary information
print OUTFILE ("NumSigGenes:\t");
foreach $num (@sigNum) {
    printf OUTFILE ("\t%6d", $num);
}

# finish up
close(OUTFILE);
print ("#     Done\n\n");

# print bit vector info for Phylip tree
# printBitVector();

################################################################################
# this routine just defines the significant genes for a file, it does not get 
# info for them.  Argument [0] is a filename.  Argument [1] is the number of this file.
# Returns the number of significant genes in the file.

sub getSigGenes {
    
    my (@entry, $i, $sigcount, $condNum, @sig, @ratios, @lambdas);
    $condNum = $_[1];
    die "Cannot access file $file\n" if (open(FILE, "$_[0]") == 0 );
    print("#     $file\n");

    # determine if header contains lambda info.
    $_ = <FILE>;
    chomp;
    @entry = split;
    $LAMCOL = -1;
    for ($i=0; $i<$#entry; $i++) {
	$LAMCOL = $i if ($entry[$i] =~ /lambda/i);
    }

    while (<FILE>) {
	chomp;
	next if /^\#/;		# ignore line if a comment
	next if !/\w/;		# ignore line if no words in it
	@entry = split;
	if ($entry[1] eq "") {die "Blank name for $ORF, file $_[0]\n" ; }
	$ORF = $entry[0]; $n = $entry[2]; $std = $entry[5]; 
	# if present in file, take lambda and ratio from end
	if ($LAMCOL >= 0)  { $lam = $entry[$LAMCOL];  $rat = $entry[$LAMCOL+1]; }
	else               { $lam = 0;           $rat = $entry[4];  }
	# based on input params, decide whether to reject gene or to record it
	next if ($lam < $LAMBDA);  # skip if lambda does not make cutoff value
	next if (abs($rat) < $RATIO);  # skip if ratio does not make cutoff value
	next if ($n < $SAMPLES);   # skip if too few samples
	next if ($std > $STD);     # skip if stdev in ratio is too large
	
	# made it to here- must be significant gene!!!
	if ( !defined($gene{$ORF}) ) {
	    $gene{$ORF}->{"name"} = $entry[1];
	    # initialize ratios array to all zeros;
	    for ($i=0; $i<$NUMCONDS; $i++) { 
		$ratios[$i] = 0.0; 
		$lambdas[$i] = 0.0; 
		$sig[$i] = 0.0;
	    }
	    $sig[$condNum] = 1;
	    $gene{$ORF}->{"ratios"}   = [ @ratios ]; # ptr to array
	    $gene{$ORF}->{"lambdas"}   = [ @lambdas ]; # ptr to array
	    $gene{$ORF}->{"sig"}   = [ @sig ]; # ptr to array
	    $gene{$ORF}->{"sigConds"} = 1;
	}
	else { 
	    $gene{$ORF}->{"sigConds"}++; 
	    $gene{$ORF}->{"sig"}[$condNum] = 1; 
	}
	$sigcount++;
    }
    close(FILE);
    return $sigcount;
}

################################################################################
# argument [0] is a filename, arg [1] is a filenumber

sub getRatios {

    my(@entry);
    die "Cannot access file $file\n" if (open(FILE, "$_[0]") == 0 );
    print("#     $file\n");

    # determine if header contains lambda info.
    $_ = <FILE>;
    chomp;
    @entry = split;
    $LAMCOL = -1;
    for ($i=0; $i<$#entry; $i++) {
	$LAMCOL = $i if ($entry[$i] =~ /lambda/i);
    }

    while (<FILE>) {
	chomp;
	next if /^\#/;		# ignore line if a comment
	next if !/\w/;		# ignore line if no words in it
	@entry = split;
	$ORF = $entry[0];
	if ($LAMCOL >= 0)  { $l = $entry[$LAMCOL];   $rat = $entry[$LAMCOL+1]; }
	else               { $l = 0;            $rat = $entry[4];  }
	# if in sig. list, record information for this ORF
	if (defined($gene{$ORF})) {
	    @{ $gene{$ORF}->{"ratios"}}[ $_[1] ] = $rat;
	    @{ $gene{$ORF}->{"lambdas"}}[ $_[1] ] = $l;
	}
    }
    close(FILE);
}

################################################################################
# first arg [0] is a file handle, second arg [1] is an ORF name

sub printRatios {

    my($FILE, $ORF);
    $FILE = $_[0]; $ORF = $_[1];

    printf $FILE ("%s\t%s", $ORF, $gene{$ORF}->{"name"});

    @ratios =  @{ $gene{$ORF}->{"ratios"}  };
    @lambdas = @{ $gene{$ORF}->{"lambdas"} };

    for ($i=0; $i<$NUMCONDS; $i++) {
	printf $FILE ("\t%6.3f", $ratios[$i]);
    }

    for ($i=0; $i<$NUMCONDS; $i++) {
	printf $FILE ("\t%6.3f", $lambdas[$i]);
    }


    printf $FILE ("\t%d\n", $gene{$ORF}->{"sigConds"});
}

################################################################################

sub printBitVector {

    my($cond, $gene, $i, @ratios);

    $thisFile = $outFile . ".bv";

    open(OUTFILE, ">$thisFile");
    
    printf OUTFILE ("     $NUMCONDS      $NUMGENES\n");

    for ($cond = 0; $cond < $NUMCONDS; $cond++) {
	@line = split(/\./, $condFiles[$cond]);
	$condName = $line[0] . (" " x 10);
	$condName = substr($condName, 0, 9);
	printf OUTFILE ("$condName");
	foreach $ORF (keys(%gene)) {
	    @sigs =  @{ $gene{$ORF}->{"sig"}  };
	    printf OUTFILE (" $sigs[$cond]");
	}
	printf OUTFILE ("\n");
    }

    close(OUTFILE);
}


######################################################################
# arg [0] is the key file name- constructs a global hash called info

sub readKeyFile {

    my ($name, $orf, @entry, $count);

    if ( open(KEYFILE, "$_[0]") == 0 ) { die "\nCannot open ann file\n\n" ; }
    while (<KEYFILE>) {
	if (/\d/) {
	    # if line not blank, read it in and parse it
	    @entry = split(/\t/);         # split on tabs
	    $name = $entry[0];
	    $orf = $entry[2];
	    # get only some columns of YPD file
	    $info{"$orf"}->{"majorLoc"} = ($entry[23] || "?"); 
	    $info{"$orf"}->{"minorLoc"} = ($entry[24] || "?");
	    $info{"$orf"}->{"environ"} = ($entry[25] || "?"); 
	    $info{"$orf"}->{"function"} = ($entry[26] || "?");
	    $info{"$orf"}->{"descript"} = ($entry[67] || "?");
	    $count++;
	}
    }
    close(KEYFILE);
    print("#     Processed $count gene entries\n");
}
