#!/usr/bin/perl
#
# (c) 2001 by Trey Ideker and Ger van den Engh
#
# preprocess <dapple file> <genekey> <output file>
# 
# An array of hashes, @gene, is shared across subroutines.  
# Each gene is a hash with the following fields:
# $gene[]{orf}        $gene[]{gene}
# $gene[]{row}        $gene[]{col}
# $gene[]{green}      $gene[]{red}
# $gene[]{gBgnd}      $gene[]{rBgnd}
# $gene[]{gBgndLocal} $gene[]{rBgndLocal}
# $gene[]{gBgndErr}   $gene[]{rBgndErr}
# $gene[]{ratio}      $gene[]{log}
# $gene[]{normFactor} $gene[]{notNormalRatio}
# $gene[]{flag}
#
# file names and handles are also shared among subroutines:
# $infile  INFILE
# $keyfile KEYFILE
# $outfile OUTFILE
#
##########################################################################


# load libraries and modules
use FileHandle;
#use FindBin;
#use lib "$FindBin::Bin/../lib";
use lib "/net/arrays/Pipeline/microarray/tools/lib";
use Statistics::OLS;
require 'sortfunctions.pl';
require 'statistics.pl';
require 'psplot.pl';


# global variables which subroutines share
$MAXSLIDEROW = 96;       #variables to hold row and col info
$MAXCOLROW = 0;          #""
$MAXGRIDROW = 0;         #""
$MAXGRIDCOL = 0;         #""
$MAXRELROW = 0;          #""
$MAXRELCOL = 0;          #""
$TOTALGENES = 0;         #total genes
$DEBUG = 0;              #debug flag
$BASE = 10;              #var to hold logarithm base info (default is base 10)
$NORM = "median";        #default normalization is median normalization
$SAT = 99000;            #default saturation intensity

# parse command line
$USAGE = "  preprocess [OPTIONS] <dappleFile> <geneKey> <processedOutput>\n" .
         "        -base <num>\n" . 
         "        -norm {median, none}\n" . 
         "        -sat <num>\n" .
         "        -scale <num>\n" .
         "        -ps\n" ;
for ($i = 0; $i <= $#ARGV; $i++) {
    $_ = $ARGV[$i];
    if    (/-ps/)       { $DEBUG = 1;      }
    elsif (/-base/)        { $BASE = $ARGV[++$i]; }
    elsif (/-norm/)        { $NORM = $ARGV[++$i]; }
    elsif (/-sat/)         { $SAT =  $ARGV[++$i]; }
    elsif (/-scale/)       { $SCALE = $ARGV[++$i]; }
    elsif ( !defined($infile)   ) { $infile = $_ ; }
    elsif ( !defined($keyfile)  ) { $keyfile = $_ ; }
    elsif ( !defined($outfile)  ) { $outfile = $_ ; }
    else { die "\nCannot parse command line.\n$USAGE\n\n"; }
}
if ( !defined($infile) || !defined($keyfile) || !defined($outfile) ) {
    die "\nCannot parse command line.\n$USAGE\n\n";
}

# load info from gene key file
lookupGenes();

# read through dapple file once to get max col and row information
readMaxRowCol();

# read dapple file again, this time extracting all information
readDapple();

# open output files
open(OUTFILE, ">$outfile");
STDOUT->autoflush;

# go through genes yet again, this time computing and subtracting bkgnd.
estimateBgnd();

# normalize intensities
if ($NORM eq "median")    { normRatioMedian(); }
#elsif ($NORM eq "mean")   { normRatioMean();   }
#elsif ($NORM eq "window") { normRatioWindow(); }
elsif ($NORM eq "none")   {                    }
else                      { die "\nBad normalization option\n$USAGE\n\n"; }

# take ratios
computeRatio();

# sort by ratios and print out all information to output file
printOut();

# close output files
close(OUTFILE);

######################################################################
#
# Subroutines 
#
######################################################################

# row is passed in as first arg, col is second arg, max rows is third arg.
sub rowcol2index {
    return $_[0] + ($_[1] * $_[2]);
}

######################################################################

# index is first arg, max rows is second arg.
sub index2rowcol {
    return( ($_[0] % $_[1]) , (int ($_[0] / $_[1]) )  );
}

######################################################################

sub lookupGenes {
    print("\n#Extracting gene info from key\n");
    if ( open(KEYFILE, "$keyfile") == 0 ) { die "\nCannot open keyfile\n\n" ; }

    #### Skip possible header and parse the num_rows_per_slide directive
    my $line;
    while ($line = <KEYFILE>) {
      next if ($line =~ /^\s*\#/);
      if ($line =~ /num_rows_per_slide\s+(\d+)/) {
        $MAXSLIDEROW = $1;
        last;
      }
    }

    $index = -1;
    print("#Max slide rows in keyfile = $MAXSLIDEROW\n");
    while (<KEYFILE>) {
	if (/\d/) {
	    # if line not blank, read it in and parse it
	    @entry = split;
	    $row = $entry[0];
	    $col = $entry[1];
	    $orf = $entry[2];
	    $geneName = $entry[3];

	    # check to ensure reasonable values have been read
	    if ($row !~ /\d/ || $col !~ /\d/ || 
		$orf !~ /\w/ || $geneName !~ /\w/) {
		die("Error reading key file\n");
	    }
	    
	    # construct index to 1D array and store info
	    $index = rowcol2index($row, $col, $MAXSLIDEROW);
	    $gene[$index]{orf} = $orf;
	    $gene[$index]{gene} = $geneName;
	    $gene[$index]{row} = $row;
	    $gene[$index]{col} = $col;
	}
    }
    close(KEYFILE);
}

######################################################################

sub readMaxRowCol {
    print("\n#Getting grid configuration from dapple file:\n");
    if ( open(INFILE, "$infile") == 0 ) { die "\nCannot open dapple file\n\n" ; }
    <INFILE>; # get rid of first line of file
    $MAXRELROW = $MAXRELCOL = $MAXGRIDROW = $MAXGRIDCOL = 0;
    while(<INFILE>) {
	@entry = split;
	$MAXGRIDCOL = $entry[0] if ($entry[0] > $MAXGRIDCOL);
	$MAXGRIDROW = $entry[1] if ($entry[1] > $MAXGRIDROW);
	$MAXRELCOL = $entry[2] if ($entry[2] > $MAXRELCOL);
	$MAXRELROW = $entry[3] if ($entry[3] > $MAXRELROW);
    }
    $MAXGRIDCOL++;
    $MAXGRIDROW++;
    $MAXRELCOL++;
    $MAXRELROW++;
    $MAXSLIDEROW = $MAXGRIDROW * $MAXRELROW;
    $MAXSLIDECOL = $MAXGRIDCOL * $MAXRELCOL;
    print("#Found $MAXGRIDROW x $MAXGRIDCOL grids of $MAXRELROW x $MAXRELCOL spots each,\n");
    print("#accounting for $MAXSLIDEROW rows by $MAXSLIDECOL columns total.\n");
    close(INFILE);
}

######################################################################

sub readDapple {

    print("\n#Reading dapple file and computing ratios\n");
    open(INFILE, "$infile");
    <INFILE>;   # get rid of first line of file
    $TOTALGENES = $gCount = $rCount = 0;
    $maxRow = $maxCol = 0;
    $greenSum = $redSum = 0;
    
    while (<INFILE>) {
	# convert dapple coordinates to slide row and column
	@entry = split;
	$gridCol = $entry[0];
	$gridRow = $entry[1];
	$relCol = $entry[2];
	$relRow = $entry[3];
	$sliderow = $relRow + ( $MAXRELROW * $gridRow) ;
	$slidecol = $relCol + ( $MAXRELCOL * $gridCol) ;
	$maxRow = $row if ($maxRow < $row);
	$maxCol = $col if ($maxCol < $col);
	
	# use index to make 1D gene array
	$index = rowcol2index($sliderow, $slidecol, $MAXSLIDEROW);

	# get foreground, background, and dapple classification
	$gene[$index]{green} = $entry[5];
	$gene[$index]{gBgnd} = $entry[6];
	$gene[$index]{red}   = $entry[9];
	$gene[$index]{rBgnd} = $entry[10];
	# flag spot if not found by dapple (N) or saturated (S)
	$gene[$index]{flag} =  "N" if ($entry[4] =~ /[0R]/ && $entry[8] =~ /[0R]/) ;
	$gene[$index]{flag} .= "S" if ($entry[5] >= $SAT || $entry[9] >= $SAT);
	$TOTALGENES++;               # count total number of genes examined in file
    }
    close(INFILE);
    $maxRow++;
    $maxCol++;
}

######################################################################
# the many symbols used to refer to background descriptive statistics
# are listed here.  Definitions for the green channel are given, but the 
# red channel is analogous:
#
# gBgnd:         Dapple's estimate of background from a single vignette
# gbLocalAvg:    avg. of gBgnd over local neighborhood of 9 vignettes
# gbLocalMed:    median of gBgnd over local neighborhood
# gbLocalStd:    stddev of gBgnd over local neighborhood
# gbLocalStdAvg: avg. of gBgnd stddev over local neighborhood of 9 vignettes
# gbLocalStdMed: median of gBgnd stddev over local neighborhood
# gbGlobalAvg:   avg. of gbLocalAvg over entire array
# gbGlobalStd:   avg. of gbLocalStd over entire array

sub estimateBgnd {

    my ($i, $gWarn, $rWarn);
    my (@localGenes, @gSorted, @rSorted);
    $gWarn = 0; $rWarn = 0;

    # compute local median and MAD
    print("\n#Smoothing Local Background");
    for ($i=0; $i<$TOTALGENES; $i++) {
	# retrieve list of local genes then take median and MAD of them- 
	# multiply MAD by 1.3 to estimate std deviation
	@localGenes = getLocalGenes($i);       
	@gSorted = sort by_gBgnd @localGenes;
	@rSorted = sort by_rBgnd @localGenes;
	$gene[$i]{gbLocalMed} = $gSorted[ int( ($#gSorted+1) / 2) ]{gBgnd} ;
	$gene[$i]{rbLocalMed} = $rSorted[ int( ($#rSorted+1) / 2) ]{rBgnd} ;
	$gene[$i]{gbLocalStd} = 1.3 * MAD($gene[$i]{gbLocalMed}, "gBgnd", @localGenes );
	$gene[$i]{rbLocalStd} = 1.3 * MAD($gene[$i]{rbLocalMed}, "rBgnd", @localGenes );
	print (".") if ($i % 100 == 0) ;       
	if ($gWarn == 0 && $gene[$i]{gbLocalStd} <= 0) {
	    print ("\nWARNING: green backgnd MAD of zero\n") ;
	    $gWarn = 1;
	}
	if ($rWarn == 0 && $gene[$i]{rbLocalStd} <= 0) {
	    print ("\nWARNING: red backgnd MAD of zero\n") ;
	    $rWarn = 1;
	}
    }

    # compute global averages over entire array
    $gbGlobalAvg = mean( "gbLocalMed", @gene );   # the average local background
    $rbGlobalAvg = mean( "rbLocalMed", @gene );   
    $gbGlobalStd = mean( "gbLocalStd", @gene );   # the average stddev of backgnd
    $rbGlobalStd = mean( "rbLocalStd", @gene );       

    # set green and red low value thresholds at 2 * global bkgnd stdev
    $gThresh = $gbGlobalStd * 2;
    $rThresh = $rbGlobalStd * 2;
    
    # step back through genes, subtracting background and / or raising flags...
    print("\n#Subtracting Local Background\n");
    for ($i=0; $i<$TOTALGENES; $i++) {
	next if ($gene[$i]{flag} =~ /N/);        # don't subtract bgnd if no spot found
	# subtract background and compute unnormalized ratio
	$gene[$i]{green} = $gene[$i]{green} - $gene[$i]{gbLocalMed};
	$gene[$i]{red}   = $gene[$i]{red}   - $gene[$i]{rbLocalMed};
	# if subtracted fgnd intens. is less than 2 stddev of background set flag
	$gene[$i]{flag} .= "X" if ($gene[$i]{green} <= $gThresh);
	$gene[$i]{flag} .= "Y"	if ($gene[$i]{red}   <= $rThresh);
	# if bgnd intens. was more than median background + 2 global stddev, set flag
	if ($gene[$i]{gBgnd} >= $gene[$i]{gbLocalMed} + $gThresh) {
	    $gene[$i]{flag} .= "A" ; }
	if ($gene[$i]{rBgnd} >= $gene[$i]{rbLocalMed} + $rThresh) {
	    $gene[$i]{flag} .= "B" ; }
	# count as good gene unless S or X or Y were raised (N is already
	# excluded at this point. Only these good genes will be used for normalization.
	if ($gene[$i]{flag} !~ /S|X|Y/) {
	    $geneGood[$goodCount] = $gene[$i];
	    $gGood[$goodCount]   = $gene[$i]{green};
	    $rGood[$goodCount++] = $gene[$i]{red};
	    $greenSum += $gene[$i]{green}; 
	    $redSum   += $gene[$i]{red}; 
	}
    } # end for loop stepping through each gene
    print("\n");
    # calculate summary statistics
    $avgGreen = $greenSum / $goodCount;
    $avgRed = $redSum / $goodCount;
    @gSorted = sort by_value @gGood;           @rSorted = sort by_value @rGood;
    $medianGreen = $gSorted[$goodCount / 2];   $medianRed = $rSorted[$goodCount / 2];
    # find minimum value across background subtracted green and red data
    $minGreen = $gSorted[0];   $minRed = $rSorted[0];
}

######################################################################
# only arg [0] is the gene index.  Returns an array of local genes.
# for now, use 7 x 7 window

sub getLocalGenes {

    my ($row, $col, $centerRow, $centerCol, @localIndices);

    # define local vignettes to include in background estimate...
    # ...for now, look at current vignette and all 8 adjacent ones
    ($centerRow, $centerCol) = index2rowcol($_[0], $MAXSLIDEROW);

    # only include those vignette indices which are in range
    @localIndices = ();          # reset array
    for ($col = $centerCol - 3; $col <= $centerCol + 3; $col++) {
	for ($row = $centerRow - 3; $row <= $centerRow + 3; $row++) {
	    if ($row >= 0 && $row < $MAXSLIDEROW && $col >= 0 && $col < $MAXSLIDECOL) {
		push (@localIndices, rowcol2index($row, $col, $MAXSLIDEROW) ) ;
	    }
	}
    }
    return @gene[@localIndices];
}

######################################################################

sub normRatioMedian {

    print("\n#Normalizing");    

    # gGood and rGood have already been sorted in backgnd subroutine
    # only take upper 1/2 of these as good genes
    @geneGood = sort by_green_and_red @geneGood;
    @geneGood = @geneGood[int($goodCount/2)..($goodCount-1)];
    $goodCount = int($goodCount/2);
    print("\n#Using median\n");
    my $ls = Statistics::OLS->new();
    $ls->setData(\@gGood, \@rGood) or die( $ls->error() );
    $ls->regress() or die ($ls->error() );
    ($gVar, $rVar, $cov) = $ls->var();
    ($gMean, $rMean)       = $ls->av();
    $gStd = sqrt($gVar); $rStd = sqrt($rVar);
    
    # compute median
    @gSorted = sort by_green @geneGood;
    @rSorted = sort by_red @geneGood;
    $gMedian = $gSorted[ int( ($#gSorted+1) / 2) ]{"green"} ;   #use median to estimate mean
    $rMedian = $rSorted[ int( ($#rSorted+1) / 2) ]{"red"} ;
    $avgMedian = ($gMedian + $rMedian) / 2;
    $avgMedian = $SCALE if ($SCALE);         # if specified, use externally-provided scale
    # scale minimums to their future values, then find smallest
    $minGreen = $avgMedian * $minGreen / $gMedian;
    $minRed   = $avgMedian * $minRed   / $rMedian;
    if ($minGreen < $minRed) { $minValue = $minGreen; }
    else                     { $minValue = $minRed;   }
    $gThresh = $avgMedian * $gThresh / $gMedian;
    $rThresh = $avgMedian * $rThresh / $rMedian;
    # loop over genes, scaling green and red values and taking ratio, etc.
    # The genes should end up along the line y=x, with smallest value at (1,1)
    for ($i=0; $i<$TOTALGENES; $i++) {
	# only modify green and red if spot found.  
	if ($gene[$i]{flag} !~ /N/) {
	    $gene[$i]{green} = $avgMedian * $gene[$i]{green} / $gMedian;
	    $gene[$i]{red} = $avgMedian * $gene[$i]{red} / $rMedian;
	}
    }
}

######################################################################

sub computeRatio {

    my $count = 0;
    # can only compute ratios if normalization hasn't drastically affected the data offsets
    if ($gThresh <= 1) {
	print("WARNING: readjusting non-positive green threshold\n");
	$gThresh = 1;
    }
    if ($rThresh <= 1) {
	print("WARNING: readjusting non-positive red threshold\n");
	$rThresh = 1;
    }
    for ($i=0; $i<$TOTALGENES; $i++) {    
	# compute ratio
	if ( $gene[$i]{flag} =~ /XY|N/) {  # if both values low or no spot found, return 1
	    $gene[$i]{ratio} = 1;
	}
	elsif ( $gene[$i]{flag} =~ /X/ ) { # if just green low, use conservative limit
	    $gene[$i]{ratio} = $gThresh / $gene[$i]{red};
	}
	elsif ( $gene[$i]{flag} =~ /Y/ ) { # same for red
	    $gene[$i]{ratio} = $gene[$i]{green} / $rThresh ;
	}
	else {  # no low values to worry about
	    $gene[$i]{ratio} = $gene[$i]{green} / $gene[$i]{red} ;
	}
	
	# take log ratio
	if ($BASE =~ /e/) { $gene[$i]{log} = log ( $gene[$i]{ratio} ); }	 
	else  { 
	    $gene[$i]{log} = log ( $gene[$i]{ratio} ) / log ($BASE);  
	}	
	$avgRatio += $gene[$i]{ratio};
	$avgLog += $gene[$i]{log};
	$green[$count] = $gene[$i]{green};
	$red[$count++] = $gene[$i]{red};
	$avgGreenAfter += $gene[$i]{green};
	$avgRedAfter += $gene[$i]{red};
    }
    
    print("\n\n");
    
    $avgRatio /= $TOTALGENES;
    $avgLog /= $TOTALGENES;
    $avgGreenAfter /= $TOTALGENES; 
    $avgRedAfter /= $TOTALGENES; 
    @sorted = sort by_value @green;
    $medianGreenAfter = @sorted[$count / 2];
    @sorted = sort by_value @red;
    $medianRedAfter = @sorted[$count / 2];
}

######################################################################

sub printOut {
    @sorted = sort by_ratio @gene;
    printf OUTFILE ("#Using base $BASE for log ratios\n");
    printf OUTFILE ("#TOTALS:  genes = %d, slide rows = %d, slide cols = %d \n", 
		    $TOTALGENES, $maxRow, $maxCol);
    printf OUTFILE ("#GREEN FOREGROUND:      mean = %6.1f, median = %6.1f\n", 
		    $avgGreen, $medianGreen);
    printf OUTFILE ("#RED   FOREGROUND:      mean = %6.1f, median = %6.1f\n", 
		    $avgRed, $medianRed);
    printf OUTFILE ("#GREEN BACKGROUND:      mean = %6.1f, stddev = %6.2f\n",
		    $gbGlobalAvg, $gbGlobalStd);
    printf OUTFILE ("#RED   BACKGROUND:      mean = %6.1f, stddev = %6.2f\n",
		    $rbGlobalAvg, $rbGlobalStd);
    printf OUTFILE ("#BACKGROUND THRESHOLDS: green = %6.1f, red = %6.2f\n",
		    $gThresh, $rThresh);
    if ($NORM eq "median") {	
	my $ratio1 = $ratio2 = $ratio3 = $ratio4 = 0;
	$ratio1 = $avgGreen/$avgRed if ($avgRed > 0);
	$ratio2 = $medianGreen/$medianRed if ($medianRed > 0);
	$ratio3 = $avgGreenAfter/$avgRedAfter if ($avgRedAfter > 0);
	$ratio4 = $medianGreenAfter/$medianRedAfter if ($medianRedAfter > 0);
	printf OUTFILE ("\n#Using Median Normalization\n");
	if ($SCALE) { print OUTFILE ("#Scale specified at $SCALE\n"); }
	else        { print OUTFILE ("#Scale set to average median $avgMedian\n"); }
	printf OUTFILE ("#%d genes used for normalization \n", $goodCount);
	printf OUTFILE ("#BEFORE NORMALIZATION:\n");
	printf OUTFILE ("#   (green avg / red avg) = %5.2f\n", $ratio1);
	printf OUTFILE ("#   (green med / red med) = %5.2f\n", $ratio2);
	printf OUTFILE ("#AFTER NORMALIZATION:\n");
	printf OUTFILE ("#   (green avg / red avg) = %5.2f\n", $ratio3);
	printf OUTFILE ("#   (green med / red med) = %5.2f\n", $ratio4);
    }
    histogram(OUTFILE);     # print histogram of log ratios
    
    # finally ready to print ratios for each gene
    print OUTFILE ("#GENE        GENE       RATIO   LOG     ------  ------  -----  -SLIDE-\n");
    print OUTFILE ("#NAME      DESCRIPT      X/Y    RATIO   X_INT   Y_INT   FLAG   ROW COL\n");
    print OUTFILE ("#--------  ---------  --------  ------  ------  ------  -----  --- ---\n");
    for ($i=0; $i<$TOTALGENES; $i++) {
	$sorted[$i]{flag} = "-" if ($sorted[$i]{flag} eq "");
	printf OUTFILE ("%-9s  %9s  %8.4f  %6.3f  %6.0f  %6.0f  %5s  %3d %3d\n", 
			$sorted[$i]{orf}, $sorted[$i]{gene}, 
			$sorted[$i]{ratio}, $sorted[$i]{log},
			$sorted[$i]{green}, $sorted[$i]{red},
			$sorted[$i]{flag},
			$sorted[$i]{row}, $sorted[$i]{col} );

    }
    
    my @x, @y;
	for ($i=0; $i<$TOTALGENES; $i++) {
	    $x[$i] = $sorted[$i]{green};
	    $y[$i] = $sorted[$i]{red};
	}
#    for ($i=0; $i<$goodCount; $i++) {
#	$x[$i] = $geneGood[$i]{green};
#	$y[$i] = $geneGood[$i]{red};
#    }
    psplot(\@x, \@y, "$outfile.ps", -1000, 50000, -1000, 50000);
    
}

#############################################################################

#THE FOLLOWING CODE IS NOT CURRENTLY SUPPORTED BUT MAY BE USEFUL IN THE FUTURE

#############################################################################

sub normRatioWindow {
    $WINDOWSIZE = 100; 
    $MID = $WINDOWSIZE / 2;
    print("\n#Normalizing by window of $WINDOWSIZE");

    # compute preNormalized ratio
    for ($i=0; $i<$TOTALGENES; $i++) {
	if ($gene[$i]{flag} !~ /N|S|GR/) { 
	    $gene[$i]{notNormalRatio} = $gene[$i]{green} / $gene[$i]{red}; 
	} else { $gene[$i]{notNormalRatio} = 1; }
    }

    # compute window normalized ratio
    @gene = sort by_green_and_red @gene;
    for ($i=0; $i<$TOTALGENES; $i++) {	
	# set window boundaries
	$windowMin = $i - ($WINDOWSIZE/2);
	$windowMax = $i + ($WINDOWSIZE/2);
	if ($windowMin < 0) {
	    $windowMin = 0;
	    $windowMax = $WINDOWSIZE;
	}
	elsif ($windowMax >= $TOTALGENES) {
	    $windowMax = $TOTALGENES-1;
	    $windowMin = $TOTALGENES-1 - $WINDOWSIZE;
	}
	
	# get median ratio for this region: norm factor is median ratio
	@win = sort by_notNormalRatio @gene[$windowMin .. $windowMax];
	if ($win[$MID]{green} > 0 && $win[$MID]{red} > 0) {
	    $gene[$i]{normFactor} = $win[$MID]{green}/$win[$MID]{red};
	}
	else { $gene[$i]{normFactor} = 1; }

	# print out normFactor curve on log scale
	normCurve($i);
	print (".") if ($i % 100 == 0) ;
	
	# adjust red value to green one.
	$gene[$i]{red} = $gene[$i]{red} * $gene[$i]{normFactor};
    }
}

######################################################################

sub normCurve {
    
    $geneindex = $_[0];

    # first time print out header info and set defaults
    if (!defined ($init) ) {
	$init = 1;
	$XSTEP = $TOTALGENES / 20; # will print out a data pt every $XSTEP genes
	$YZERO = 20;   # this is the ASCII text column that will by Y = 0;
	$YSTEP = .2;  # each ASCII column corresponds to a normFactor delta of YSTEP 
	print OUTFILE ("\n#NORMALIZATION FACTOR CURVE (window length = $WINDOWSIZE)\n");
	printf OUTFILE ("#          Log10 (norm. factor)\n#     ");
	for ($col = $YZERO-15; $col < $YZERO+15; $col += 5) { 
	    printf OUTFILE ("%4.1f ", $YSTEP * ($col - $YZERO) );
	}
	print OUTFILE ("\n#    ------------------------------\n");
	print OUTFILE ("#gene|");
	for ($col = $YZERO-15; $col < $YZERO+15; $col += 5) { 
	    printf OUTFILE ("   | ");
	}
	print OUTFILE ("\n");
    }
    
    # first and every time, print out data point
    if ( $geneindex % $XSTEP == 0) {
	$logFactor = log ($gene[$geneindex]{normFactor}) / log (10);
	$yposition = $YZERO + ($logFactor / $YSTEP);
	printf OUTFILE ("#%4d|", $i);
	for ($col = 0; $col < $yposition; $col++) { printf OUTFILE (" "); }
	print OUTFILE ("x\n");
    }
}

######################################################################

sub normRatioRegress {

    $avgRatio = $avgLogRatio = 0;
    print("\n#Normalizing");    

    # regress gGood and rGood, which have already been sorted in backgnd subroutine
    print("\n#Performing linear regression");
    my $ls = Statistics::OLS->new();
    $ls->setData(\@gGood, \@rGood) or die( $ls->error() );
    $ls->regress() or die ($ls->error() );
    ($intercept, $slope) = $ls->coefficients();
    # scale minimums to their future values, then find smallest
    $minGreen = $minGreen * $slope;        
    $minRed = $minRed - $intercept;      
    if ($minGreen < $minRed) { $minValue = $minGreen; }
    else                     { $minValue = $minRed;   }
    # loop over genes, scaling green and red values and taking ratio, etc.
    # The genes should end up along the line y=x, with smallest value at (1,1)
    for ($i=0; $i<$TOTALGENES; $i++) {
	# only modify green and red if spot found.  
	if ($gene[$i]{flag} !~ /N/) {
	    $gene[$i]{green} = $gene[$i]{green} * $slope;
	    $gene[$i]{red}   = $gene[$i]{red}   - $intercept;
	}
    }
}

######################################################################

sub normRatioMean {

    my $M = 3000;            # global multiplicative factor
    my $B = 3800;
    print("\n#Normalizing");    

    # regress gGood and rGood, which have already been sorted in backgnd subroutine
    print("\n#Using mean and std. deviation\n");
    my $ls = Statistics::OLS->new();
    $ls->setData(\@gGood, \@rGood) or die( $ls->error() );
    $ls->regress() or die ($ls->error() );
    ($gVar, $rVar, $cov) = $ls->var();
    ($gMean, $rMean)       = $ls->av();
    $gStd = sqrt($gVar); $rStd = sqrt($rVar);
    # compute median and MAD
    @gSorted = sort by_green @geneGood;
    @rSorted = sort by_red @geneGood;
    $gMedian = $gSorted[ int( ($#gSorted+1) / 2) ]{"green"} ;   #use median to estimate mean
    $rMedian = $rSorted[ int( ($#rSorted+1) / 2) ]{"red"} ;
    $avgMedian = ($gMedian + $rMedian) / 2;
    $gMAD = 1.3 * MAD($gMean, "green", @geneGood);
    $rMAD = 1.3 * MAD($rMean, "red", @geneGood);
    # scale minimums and thresholds to their future values
    $minGreen = $M * ($minGreen - $gMean) / $gMAD + $B;
    $minRed   = $M * ($minRed   - $rMean) / $rMAD + $B;      
    if ($minGreen < $minRed) { $minValue = $minGreen; }
    else                     { $minValue = $minRed;   }
    $gThresh = $M * ($gThresh - $gMean) / $gMAD + $B;
    $rThresh = $M * ($rThresh - $rMean) / $rMAD + $B;
    # loop over genes, scaling green and red values and taking ratio, etc.
    # The genes should end up along the line y=x, with smallest value at (1,1)
    for ($i=0; $i<$TOTALGENES; $i++) {
	# only modify green and red if spot found.  
	if ($gene[$i]{flag} !~ /N/) {
	    $gene[$i]{green} = ($M * ($gene[$i]{green} - $gMean) / $gMAD) + $B;
	    $gene[$i]{red}   = ($M * ($gene[$i]{red}   - $rMean) / $rMAD) + $B;
	}
    }
}

