#!/usr/bin/perl 

# mergeReps

# A script to MERGE REPlicate microarray files. Multiple measurements of a gene
# ratio, present across multiple ratio files, are combined into one averaged
# measurement. Ratio files are specified in a fileTable. In the table, a
# reversed flag -r can be set to indicate a reverse labeling relative to the
# other files. The -err option sorts by std. error, not by ratio, the default
# The [-opt #] option only outputs unsaturated and above-background genes having
# number of samples >= #
# ######################################################################################

use POSIX;
$USAGE = "  mergeReps [OPTIONS] <fileTable> <mergedOutput>\n" .
         "        -opt <num>\n" . 
         "        -filter {on,off}\n" .
         "        -exclude <gene file>\n";
if ($#ARGV < 0) { die "\nCannot parse command line.\n$USAGE\n\n"; }

# define global variables
$filter = "on";
$opt = 0;
$MAXSAMPLES = 1;
$EXCLUDE_OUTLIER = 1;
$GENENUM = 0;
$THRESH = 800;
$GBGND = $RBGND = $GBGNDSTD = $RBGNDSTD = 0.0;

# parse command line
for ($i=0; $i<=$#ARGV; $i++) {
    if ($ARGV[$i] eq "-filter")    { $filter = $ARGV[++$i];  }
    elsif ($ARGV[$i] eq "-opt")    { $opt = $ARGV[++$i];     }   # get min number of genes
    elsif ($ARGV[$i] eq "-exclude"){ $exFile = $ARGV[++$i];  }
    elsif ($ARGV[$i] =~ /^-/)      { die ("Unrecognized parameter $ARGV[$i]\n$USAGE\n\n"); }
    elsif (!defined($filetable))   { $filetable = $ARGV[$i]; }   # read file table with ratio files
    elsif (!defined($outfile))     { $outfile   = $ARGV[$i]; }   # read output file name
}       
if (!defined($filetable) || !defined($outfile)) {
    die ("Must specify both <fileTable> and <mergedOutput> files\n$USAGE\n\n"); }
if ($filter ne "on" && $filter ne "off") {
    die ("Don't understand value for -filter option: $filter\n"); }

# first read filetable, which in turn reads the files listed in it
# NOTE: samples which have N flag are not read in!
readFileTable($filetable);
readBadGenes($exFile) if ($exFile);
print("#$GENENUM total genes read\n");
printf("#Average background:        green = %6.1f, red = %6.1f\n", $GBGND, $RBGND);
printf("#Average stddev background: green = %6.1f, red = %6.1f\n", $GBGNDSTD, $RBGNDSTD);
if ($filter eq "on" && $samples > 30) { 
    $EXCLUDE_OUTLIER = 0 ;
    print("#NOT EXCLUDING OUTLIERS: can only exclude if less than 30 samples\n"); 
}

# reject outliers, determine mean and standard error of ratios, and compare to reference
foreach $ORF (keys (%gene) ) {
    if ($filter eq "on") {
	if ($exFile && isBadGene( $gene{$ORF} )) {
	    delete ( $gene{$ORF} );
	    $excludeNum++;
	    next;
	}
	excludeOutliers( $gene{$ORF} ) if ($EXCLUDE_OUTLIER);
    }
    statisticsOnValues ( $gene{$ORF} );
    getNumSlides ( $gene{$ORF} ); 
    # calculate max samples
    $MAXSAMPLES = $gene{$ORF}->{"samples"} if ( $MAXSAMPLES < $gene{$ORF}->{"samples"} );
    # compute coefficient of variation
    if ($gene{$ORF}->{"green"} > $THRESH) {
	$gCV += $gene{$ORF}->{"stdGreen"} / $gene{$ORF}->{"green"}; 
	$gCVcount++;
    }
    if ($gene{$ORF}->{"red"} > $THRESH) {
	$rCV += $gene{$ORF}->{"stdRed"} / $gene{$ORF}->{"red"}; 
	$rCVcount++;
    }
}
$gCV /= $gCVcount; $rCV /= $rCVcount;          # take average coefficient of variation
printf ("#Max samples per gene: %d\n", $MAXSAMPLES);
printf ("#Estimate of Coeff. of Var: green = %6.2f, red = %6.2f\n", $gCV, $rCV);
printf ("#Excluded $excludeNum genes as bad\n") if ($exFile);

# step back through (sorted) genes and print out info
die "Cannot access file $outfile\n" if (open(OUT, ">$outfile") == 0 );
print("#Writing merged results to file $outfile\n");
@sorted = sort by_mean keys(%gene);
printOutput(OUT, @sorted);
close(OUT);

######################################################################
#
# SUBROUTINES
#
######################################################################

# first and only argument is the filename of the table of ratio files

sub readFileTable {
    my ($file, $dir, $slide, $count);
    die "Cannot access file $_[0]\n" if (open(FT, "$_[0]") == 0 );
    while(<FT>) {
	chop;
	next if /^\#/;		# ignore line if a comment
	next if !/\w/;		# ignore line if no words in it
	($file, $dir, $slide) = split;
	print("#Reading file $file\n");
	readRatioFile($file, $dir, $slide);
	$count++;
    }
    $GBGND /= $count; $RBGND /= $count;
    $GBGNDSTD /= $count; $RBGNDSTD /= $count;
    close(FT);
}

######################################################################
# first and only argument is the filename of the bad genes file

sub readBadGenes {
    my ($file);
    die "Cannot access file $_[0]\n" if (open(FT, "$_[0]") == 0 );
    while (<FT>) {
	chop;
	next if /^\#/;          # ignore line if a comment
	next if !/\w/;		# ignore line if no words in it
	if ( /^(\S+)/ ) { $badGenes{"$1"} = 1; }
    }
    close(FT);
}

######################################################################
# arg [0] is a filename to read, arg [1] the direction (f or r)
# and arg [2] is the slide identifier, usually 1, 2, 3, etc.

sub readRatioFile {
    my ($dir, $slide);
    die "Cannot access file $_[0]\n" if (open(RFILE, "$_[0]") == 0 );
    $dir = $_[1];   $slide = $_[2];

    while (<RFILE>) {
	chop;
	# look for background information in ratio file header
	if    ( /GREEN BACKGROUND.+ (\d+\.\d+).+ (\d+\.\d+)/ ) { 
	    $GBGND += $1; $GBGNDSTD += $2; 
	}
	elsif ( /RED\s+BACKGROUND.+ (\d+\.\d+).+ (\d+\.\d+)/ ) { 
	    $RBGND += $1; $RBGNDSTD += $2; 
	}
	@entry = split;
	next if /^\#/;		    # ignore line if a comment
	next if !/\d/;		    # ignore line if no numbers in it
	$ORF = $entry[0];
	$flag = $entry[6];
	$flag =~ s/-//;             # get rid of "null" flag

	# exclude SOME genes right away
	next if ( ($opt || $filter eq "on") && $flag =~ /N/); 
	next if ($opt && $flag =~ /S/);
	if ($entry[2] <= 0) {  
	    print("Trouble processing line:\n  $_\n");  
	    next;
	}
	

	# get green and red ints, reversing if reverse labeling flag indicated
	if ($dir eq "f") {
	    $ratio = $entry[2];
	    $green = $entry[4]; 
	    $red = $entry[5];
	}
	elsif ($dir eq "r") {   

	    $ratio = 1 / $entry[2]; 
	    $green = $entry[5];
	    $red = $entry[4];
	    $flag =~ tr/XYAB/YXBA/;          # reverse flags as well
	}
	else { die ("ERROR in file table : Labeling direction must be f or r\n");  }
	$ratio = .0001 if ($ratio == 0);     # if ratio zero at this level of precision, up it.

	# if no spot found at this location
	if ($flag =~ /N/) {
	    $ratio = 1; 
	    $green = 0; 
	    $red = 0; 
	}

	if ( !defined($gene{$ORF}) ) {   # if gene not been read in before
	    $samples = 0;
	    $gene{$ORF}->{"name"} = $entry[1];
	    $gene{$ORF}->{"values"} = [ () ]; # ptr to array
	    $GENENUM++;
	}
	else {                      # if gene been read in before add to prev. record
	    $samples = $gene{$ORF}->{"samples"};
	}
	@values = @ {$gene{$ORF}->{"values"} };
	$values[$samples]{"dir"} = $dir;
	$values[$samples]{"slide"} = $slide;
	$values[$samples]{"green"} = $green;  
	$values[$samples]{"red"} = $red; 
	$values[$samples]{"ratio"} = $ratio;
	$values[$samples]{"log"} = log($ratio) / log(10);
	$values[$samples]{"flag"} = $flag;
	$gene{$ORF}->{"values"} = [ @values ];
	$gene{$ORF}->{"samples"} = $samples + 1;
	$gene{$ORF}->{"gFlag"}++ if ($entry[7] =~ /X/);
	$gene{$ORF}->{"rFlag"}++ if ($entry[7] =~ /Y/);
	$gene{$ORF}->{"orf"} = $ORF;
    }   # end while loop stepping through this file
    close(RFILE);
}

#################################################################################
# first arg [0] is the output filehandle, second arg [1] is a list of keys to print

sub printOutput {
    $FILE = $_[0];
    if ($header == 0) {
	$row1 = "#GENE_NAME DESCRIPT.   N S  RATIO    STD     ";
	$row2 = "#--------- ---------   - - ------- -------   ";
	
	for ($i=0; $i<$MAXSAMPLES; $i++) {
	    $row1 .= "    X$i " . "    Y$i " . "  F$i  ";
	    $row2 .= "------ "  . "------ "  . "----  ";
	}
	print $FILE ("$row1\n$row2\n");
	$header = 1;
    }
    else { 
	print $FILE ("\n" . "#" x 110 . "\n\n");  # print a bunch of #'s as a separator
    }
    
    foreach $ORF (@_[1..$#_])  {
	$name = $gene{$ORF}->{"name"};
	$samples = $gene{$ORF}->{"samples"};
	$slides = $gene{$ORF}->{"slides"};
	$mean = $gene{$ORF}->{"mean"};
	$stdRatio = $gene{$ORF}->{"stdRatio"};
	$green = $gene{$ORF}->{"green"};
	$stdGreen = $gene{$ORF}->{"stdGreen"};
	$red = $gene{$ORF}->{"red"};
	$stdRed = $gene{$ORF}->{"stdRed"};
	@values = @ {$gene{$ORF}->{"values"} };
	@outliers = @ {$gene{$ORF}->{"outliers"} } if (defined ($gene{$ORF}->{"outliers"} )) ;

	next if ($opt && $samples < $opt); # skip if too few samples for optimization

	printf $FILE ("%-10s\ %10s   %1d %1d %7.4f %7.4f   ",
		      $ORF, $name, $samples, $slides, $mean, $stdRatio); 
	for ($i = 0; $i < $samples; $i++) { 
	    $values[$i]{"flag"} = "-" if ($values[$i]{"flag"} eq "");
	    printf $FILE ("%6d %6d %4s  ", $values[$i]{"green"}, 
			  $values[$i]{"red"}, $values[$i]{"flag"});
	}
	for ($i = 0; $i <= $#outliers; $i++) {
	    printf $FILE ("%6d %6d %4s  ", $outliers[$i]{"green"}, 
			  $outliers[$i]{"red"}, $outliers[$i]{"flag"} . "O");
	}
	for ($i = 0; $i < $MAXSAMPLES - $samples - $#outliers - 1; $i++) {
	    printf $FILE ("     -      -    -  ");
	}
	print $FILE ("\n");
    }  # end foreach loop
}

####################################################################################
# only arg [0] is one element of the gene array.  From this, extract array of values

sub statisticsOnValues {
    
    @v = @ { $_[0]->{"values"} };
    # first calculate mean of log ratio and of green and red ints.
    $logRatio = $green = $red = 0.0;
    $N = $#v + 1 ;
    for ($i=0; $i<$N; $i++) {
	$logRatio += $v[$i]{"log"};
	$green += $v[$i]{"green"};
	$red += $v[$i]{"red"};
    }
    $logRatio /= $N; $green /= $N; $red /= $N;

    # then calculate standard error and correlation coefficient
    $stdRatio = $stdGreen = $stdRed = $corr = 0.0;
    $errRatio = $errGreen = $errRed = 0.0;
    if ($N > 1) {
	for ($i=0; $i<$N; $i++) {
	    $stdRatio += ($v[$i]{"log"} - $logRatio)**2;
	    $stdGreen += ($v[$i]{"green"} - $green)**2;
	    $stdRed   += ($v[$i]{"red"} - $red)**2;
	    $corr     += ($v[$i]{"green"} - $green) * ($v[$i]{"red"} - $red);
	}
	$corr = $corr / sqrt($stdGreen * $stdRed) if ($stdGreen > 0 && $stdRed > 0);
	$stdRatio /= ($N-1);  $stdGreen /= ($N-1);  $stdRed /= ($N-1);
	$stdRatio = sqrt($stdRatio);   $errRatio = $stdRatio / sqrt($N);
	$stdGreen = sqrt($stdGreen);   $errGreen = $stdGreen / sqrt($N);
	$stdRed   = sqrt($stdRed);     $errRed =   $stdRed   / sqrt($N);
    }
    
    # load these values back into gene array
    $_[0]->{"mean"}     = $logRatio;
    $_[0]->{"stdRatio"} = $stdRatio;
    $_[0]->{"errRatio"} = $errRatio;
    $_[0]->{"green"}    = $green;
    $_[0]->{"stdGreen"} = $stdGreen;
    $_[0]->{"errGreen"} = $errGreen;
    $_[0]->{"red"}      = $red;
    $_[0]->{"stdRed"}   = $stdRed;
    $_[0]->{"errRed"}   = $errRed;
    $_[0]->{"corr"}     = $corr;
}

#########################################################################
# the argument is a gene from the gene array, returns the number of unique
# microarray slides which are represented by the values for that gene

sub getNumSlides {
    
    my (@values, $value, $N);
    my ($id, $count, %slidehash);

    @values = @ { $_[0]->{"values"} };
    $N = $#values+1;
    foreach $value (@values) {
	$id = $value->{"slide"};
	if ( !defined($slidehash{$id}) ) {
	    $count++;
	    $slidehash{$id} = 1; 
	}
    }
    $_[0]->{"slides"} = $count;
}

###############################################################################
# first arg [0] is one element of the gene array.  Possibly removes values from
# the values list, and pushes them on an outliers list.

sub excludeOutliers {
    
    # use Dixon's Test (ref. Dunn & Clark 1987)
    # samples must be from a normal distribution
    my @outliers = ();
    my @values = @ { $_[0]->{"values"} };
    my $samples = $_[0]->{"samples"} ;
    my $N;
    # distribution taken from Dixon (1951) and given in table A.6. of Dunn & Clark (1987)
    # Assuming alpha = .1 (which is standard for this test), .05, or .01,  the dist array below
    # lists the value of the distribution for sample size n, used as the index to the array
    # The values provided can accomodate from 3 to 30 samples
    if (!defined @rdist10) {
	@rdist10 = (0, 0, 0, .886, .679, .557, .482, .434, .399, .370, .349,
		    .332, .318, .305, .294, .285, .277, .269, .263, .258, .252,
		    .247, .242, .238, .234, .230, .227, .224, .220, .218, .215);
	@rdist05 = (0, 0, 0, .941, .765, .642, .560, .507, .468, .437, .412, 
		    .392, .376, .631, .349, .338, .329, .320, .313, .306, .300, 
		    .295, .290, .285, .281, .277, .273, .269, .266, .263, .260);
	@rdist01 = (0, 0, 0, .988, .889, .780, .698, .637, .590, .555, .527,
		    .502, .482, .465, .450, .438, .426, .416, .407, .398, .391,
		    .384, .378, .372, .367, .362, .357, .353, .349, .345, .341);	}

    foreach $color ("green", "red") {
	if ($samples >= 3) {  # can only do outlier rejection if 3 or more samples
	    if ($color eq "green") { @Y = sort by_green @values; }
	    else                   { @Y = sort by_red   @values; }
	    $N = $samples;
	    # compute statistics r10 and r01
	    next if (($Y[$N-1]{$color} - $Y[0]{$color}) == 0);  # check for divByZero error
	    $r10 = ($Y[1]{$color} - $Y[0]{$color}) / ($Y[$N-1]{$color} - $Y[0]{$color}) ;
	    $r01 = ($Y[$N-1]{$color} - $Y[$N-2]{$color}) / ($Y[$N-1]{$color} - $Y[0]{$color}) ;
	    if ( $r01 > $rdist05[$N] ) {
		$temp = pop(@Y);           # remove value from the regular values array
		push( @outliers, $temp );  # and push it on outliers array
		$samples -= 1;                   # update number of samples
	    }
	    if ( $r10 > $rdist05[$N] ) {
		$temp = shift(@Y);         # remove value from the regular values array
		unshift( @outliers, $temp );
		$samples -= 1;                   # update number of samples
	    } 
	    @values = @Y;
	}
    }
    $_[0]->{"values"} = [ @values ];
    $_[0]->{"outliers"} = [ @outliers ];
    $_[0]->{"samples"} = $samples;
}

######################################################################
# first arg [0] is gene array.  Returns True [1] if gene is BAD

sub isBadGene {
    foreach $badG (keys (%badGenes) ) {
	return 1 if ( $_[0]->{"name"} eq $badG) ;
	return 1 if ( $_[0]->{"orf"} eq $badG) ;
    }
    return 0;
}

######################################################################

sub by_mean {
    $aratio = $gene{$a}->{"mean"};
    $bratio = $gene{$b}->{"mean"};
    #print("a = $a, b = $b, ARATIO = $aratio, BRATIO = $bratio\n");
    return $aratio <=> $bratio;	
}

######################################################################

sub by_sig_then_mean {
    $asig = $gene{$a}->{"sigFlag"};   
    $bsig = $gene{$b}->{"sigFlag"};
    return $asig <=> $bsig if ($asig != $bsig) ;

    $aratio = $gene{$a}->{"mean"};
    $bratio = $gene{$b}->{"mean"};
    #print("a = $a, b = $b, ARATIO = $aratio, BRATIO = $bratio\n");
    return $aratio <=> $bratio;	
}

######################################################################

sub by_err {
    $aerr = $gene{$a}->{"errRatio"};
    $berr = $gene{$b}->{"errRatio"};
    return $aerr <=> $berr;	
}

######################################################################

sub by_ratio {
    return $$a{"ratio"} <=> $$b{"ratio"}; 
}

######################################################################

sub by_green {
    return $$a{"green"} <=> $$b{"green"}; 
}

######################################################################

sub by_red {
    return $$a{"red"} <=> $$b{"red"}; 
}
