#!/usr/bin/perl # # (c) 2001 by Trey Ideker and Ger van den Engh # # preprocess # # An array of hashes, @gene, is shared across subroutines. # Each gene is a hash with the following fields: # $gene[]{orf} $gene[]{gene} # $gene[]{row} $gene[]{col} # $gene[]{green} $gene[]{red} # $gene[]{gBgnd} $gene[]{rBgnd} # $gene[]{gBgndLocal} $gene[]{rBgndLocal} # $gene[]{gBgndErr} $gene[]{rBgndErr} # $gene[]{ratio} $gene[]{log} # $gene[]{normFactor} $gene[]{notNormalRatio} # $gene[]{flag} # # file names and handles are also shared among subroutines: # $infile INFILE # $keyfile KEYFILE # $outfile OUTFILE # ########################################################################## # load libraries and modules use FileHandle; #use FindBin; #use lib "$FindBin::Bin/../lib"; use lib "/net/arrays/Pipeline/microarray/tools/lib"; use Statistics::OLS; require 'sortfunctions.pl'; require 'statistics.pl'; require 'psplot.pl'; # global variables which subroutines share $MAXSLIDEROW = 96; #variables to hold row and col info $MAXCOLROW = 0; #"" $MAXGRIDROW = 0; #"" $MAXGRIDCOL = 0; #"" $MAXRELROW = 0; #"" $MAXRELCOL = 0; #"" $TOTALGENES = 0; #total genes $DEBUG = 0; #debug flag $BASE = 10; #var to hold logarithm base info (default is base 10) $NORM = "median"; #default normalization is median normalization $SAT = 99000; #default saturation intensity # parse command line $USAGE = " preprocess [OPTIONS] \n" . " -base \n" . " -norm {median, none}\n" . " -sat \n" . " -scale \n" . " -ps\n" ; for ($i = 0; $i <= $#ARGV; $i++) { $_ = $ARGV[$i]; if (/-ps/) { $DEBUG = 1; } elsif (/-base/) { $BASE = $ARGV[++$i]; } elsif (/-norm/) { $NORM = $ARGV[++$i]; } elsif (/-sat/) { $SAT = $ARGV[++$i]; } elsif (/-scale/) { $SCALE = $ARGV[++$i]; } elsif ( !defined($infile) ) { $infile = $_ ; } elsif ( !defined($keyfile) ) { $keyfile = $_ ; } elsif ( !defined($outfile) ) { $outfile = $_ ; } else { die "\nCannot parse command line.\n$USAGE\n\n"; } } if ( !defined($infile) || !defined($keyfile) || !defined($outfile) ) { die "\nCannot parse command line.\n$USAGE\n\n"; } # load info from gene key file lookupGenes(); # read through dapple file once to get max col and row information readMaxRowCol(); # read dapple file again, this time extracting all information readDapple(); # open output files open(OUTFILE, ">$outfile"); STDOUT->autoflush; # go through genes yet again, this time computing and subtracting bkgnd. estimateBgnd(); # normalize intensities if ($NORM eq "median") { normRatioMedian(); } #elsif ($NORM eq "mean") { normRatioMean(); } #elsif ($NORM eq "window") { normRatioWindow(); } elsif ($NORM eq "none") { } else { die "\nBad normalization option\n$USAGE\n\n"; } # take ratios computeRatio(); # sort by ratios and print out all information to output file printOut(); # close output files close(OUTFILE); ###################################################################### # # Subroutines # ###################################################################### # row is passed in as first arg, col is second arg, max rows is third arg. sub rowcol2index { return $_[0] + ($_[1] * $_[2]); } ###################################################################### # index is first arg, max rows is second arg. sub index2rowcol { return( ($_[0] % $_[1]) , (int ($_[0] / $_[1]) ) ); } ###################################################################### sub lookupGenes { print("\n#Extracting gene info from key\n"); if ( open(KEYFILE, "$keyfile") == 0 ) { die "\nCannot open keyfile\n\n" ; } #### Skip possible header and parse the num_rows_per_slide directive my $line; while ($line = ) { next if ($line =~ /^\s*\#/); if ($line =~ /num_rows_per_slide\s+(\d+)/) { $MAXSLIDEROW = $1; last; } } $index = -1; print("#Max slide rows in keyfile = $MAXSLIDEROW\n"); while () { if (/\d/) { # if line not blank, read it in and parse it @entry = split; $row = $entry[0]; $col = $entry[1]; $orf = $entry[2]; $geneName = $entry[3]; # check to ensure reasonable values have been read if ($row !~ /\d/ || $col !~ /\d/ || $orf !~ /\w/ || $geneName !~ /\w/) { die("Error reading key file\n"); } # construct index to 1D array and store info $index = rowcol2index($row, $col, $MAXSLIDEROW); $gene[$index]{orf} = $orf; $gene[$index]{gene} = $geneName; $gene[$index]{row} = $row; $gene[$index]{col} = $col; } } close(KEYFILE); } ###################################################################### sub readMaxRowCol { print("\n#Getting grid configuration from dapple file:\n"); if ( open(INFILE, "$infile") == 0 ) { die "\nCannot open dapple file\n\n" ; } ; # get rid of first line of file $MAXRELROW = $MAXRELCOL = $MAXGRIDROW = $MAXGRIDCOL = 0; while() { @entry = split; $MAXGRIDCOL = $entry[0] if ($entry[0] > $MAXGRIDCOL); $MAXGRIDROW = $entry[1] if ($entry[1] > $MAXGRIDROW); $MAXRELCOL = $entry[2] if ($entry[2] > $MAXRELCOL); $MAXRELROW = $entry[3] if ($entry[3] > $MAXRELROW); } $MAXGRIDCOL++; $MAXGRIDROW++; $MAXRELCOL++; $MAXRELROW++; $MAXSLIDEROW = $MAXGRIDROW * $MAXRELROW; $MAXSLIDECOL = $MAXGRIDCOL * $MAXRELCOL; print("#Found $MAXGRIDROW x $MAXGRIDCOL grids of $MAXRELROW x $MAXRELCOL spots each,\n"); print("#accounting for $MAXSLIDEROW rows by $MAXSLIDECOL columns total.\n"); close(INFILE); } ###################################################################### sub readDapple { print("\n#Reading dapple file and computing ratios\n"); open(INFILE, "$infile"); ; # get rid of first line of file $TOTALGENES = $gCount = $rCount = 0; $maxRow = $maxCol = 0; $greenSum = $redSum = 0; while () { # convert dapple coordinates to slide row and column @entry = split; $gridCol = $entry[0]; $gridRow = $entry[1]; $relCol = $entry[2]; $relRow = $entry[3]; $sliderow = $relRow + ( $MAXRELROW * $gridRow) ; $slidecol = $relCol + ( $MAXRELCOL * $gridCol) ; $maxRow = $row if ($maxRow < $row); $maxCol = $col if ($maxCol < $col); # use index to make 1D gene array $index = rowcol2index($sliderow, $slidecol, $MAXSLIDEROW); # get foreground, background, and dapple classification $gene[$index]{green} = $entry[5]; $gene[$index]{gBgnd} = $entry[6]; $gene[$index]{red} = $entry[9]; $gene[$index]{rBgnd} = $entry[10]; # flag spot if not found by dapple (N) or saturated (S) $gene[$index]{flag} = "N" if ($entry[4] =~ /[0R]/ && $entry[8] =~ /[0R]/) ; $gene[$index]{flag} .= "S" if ($entry[5] >= $SAT || $entry[9] >= $SAT); $TOTALGENES++; # count total number of genes examined in file } close(INFILE); $maxRow++; $maxCol++; } ###################################################################### # the many symbols used to refer to background descriptive statistics # are listed here. Definitions for the green channel are given, but the # red channel is analogous: # # gBgnd: Dapple's estimate of background from a single vignette # gbLocalAvg: avg. of gBgnd over local neighborhood of 9 vignettes # gbLocalMed: median of gBgnd over local neighborhood # gbLocalStd: stddev of gBgnd over local neighborhood # gbLocalStdAvg: avg. of gBgnd stddev over local neighborhood of 9 vignettes # gbLocalStdMed: median of gBgnd stddev over local neighborhood # gbGlobalAvg: avg. of gbLocalAvg over entire array # gbGlobalStd: avg. of gbLocalStd over entire array sub estimateBgnd { my ($i, $gWarn, $rWarn); my (@localGenes, @gSorted, @rSorted); $gWarn = 0; $rWarn = 0; # compute local median and MAD print("\n#Smoothing Local Background"); for ($i=0; $i<$TOTALGENES; $i++) { # retrieve list of local genes then take median and MAD of them- # multiply MAD by 1.3 to estimate std deviation @localGenes = getLocalGenes($i); @gSorted = sort by_gBgnd @localGenes; @rSorted = sort by_rBgnd @localGenes; $gene[$i]{gbLocalMed} = $gSorted[ int( ($#gSorted+1) / 2) ]{gBgnd} ; $gene[$i]{rbLocalMed} = $rSorted[ int( ($#rSorted+1) / 2) ]{rBgnd} ; $gene[$i]{gbLocalStd} = 1.3 * MAD($gene[$i]{gbLocalMed}, "gBgnd", @localGenes ); $gene[$i]{rbLocalStd} = 1.3 * MAD($gene[$i]{rbLocalMed}, "rBgnd", @localGenes ); print (".") if ($i % 100 == 0) ; if ($gWarn == 0 && $gene[$i]{gbLocalStd} <= 0) { print ("\nWARNING: green backgnd MAD of zero\n") ; $gWarn = 1; } if ($rWarn == 0 && $gene[$i]{rbLocalStd} <= 0) { print ("\nWARNING: red backgnd MAD of zero\n") ; $rWarn = 1; } } # compute global averages over entire array $gbGlobalAvg = mean( "gbLocalMed", @gene ); # the average local background $rbGlobalAvg = mean( "rbLocalMed", @gene ); $gbGlobalStd = mean( "gbLocalStd", @gene ); # the average stddev of backgnd $rbGlobalStd = mean( "rbLocalStd", @gene ); # set green and red low value thresholds at 2 * global bkgnd stdev $gThresh = $gbGlobalStd * 2; $rThresh = $rbGlobalStd * 2; # step back through genes, subtracting background and / or raising flags... print("\n#Subtracting Local Background\n"); for ($i=0; $i<$TOTALGENES; $i++) { next if ($gene[$i]{flag} =~ /N/); # don't subtract bgnd if no spot found # subtract background and compute unnormalized ratio $gene[$i]{green} = $gene[$i]{green} - $gene[$i]{gbLocalMed}; $gene[$i]{red} = $gene[$i]{red} - $gene[$i]{rbLocalMed}; # if subtracted fgnd intens. is less than 2 stddev of background set flag $gene[$i]{flag} .= "X" if ($gene[$i]{green} <= $gThresh); $gene[$i]{flag} .= "Y" if ($gene[$i]{red} <= $rThresh); # if bgnd intens. was more than median background + 2 global stddev, set flag if ($gene[$i]{gBgnd} >= $gene[$i]{gbLocalMed} + $gThresh) { $gene[$i]{flag} .= "A" ; } if ($gene[$i]{rBgnd} >= $gene[$i]{rbLocalMed} + $rThresh) { $gene[$i]{flag} .= "B" ; } # count as good gene unless S or X or Y were raised (N is already # excluded at this point. Only these good genes will be used for normalization. if ($gene[$i]{flag} !~ /S|X|Y/) { $geneGood[$goodCount] = $gene[$i]; $gGood[$goodCount] = $gene[$i]{green}; $rGood[$goodCount++] = $gene[$i]{red}; $greenSum += $gene[$i]{green}; $redSum += $gene[$i]{red}; } } # end for loop stepping through each gene print("\n"); # calculate summary statistics $avgGreen = $greenSum / $goodCount; $avgRed = $redSum / $goodCount; @gSorted = sort by_value @gGood; @rSorted = sort by_value @rGood; $medianGreen = $gSorted[$goodCount / 2]; $medianRed = $rSorted[$goodCount / 2]; # find minimum value across background subtracted green and red data $minGreen = $gSorted[0]; $minRed = $rSorted[0]; } ###################################################################### # only arg [0] is the gene index. Returns an array of local genes. # for now, use 7 x 7 window sub getLocalGenes { my ($row, $col, $centerRow, $centerCol, @localIndices); # define local vignettes to include in background estimate... # ...for now, look at current vignette and all 8 adjacent ones ($centerRow, $centerCol) = index2rowcol($_[0], $MAXSLIDEROW); # only include those vignette indices which are in range @localIndices = (); # reset array for ($col = $centerCol - 3; $col <= $centerCol + 3; $col++) { for ($row = $centerRow - 3; $row <= $centerRow + 3; $row++) { if ($row >= 0 && $row < $MAXSLIDEROW && $col >= 0 && $col < $MAXSLIDECOL) { push (@localIndices, rowcol2index($row, $col, $MAXSLIDEROW) ) ; } } } return @gene[@localIndices]; } ###################################################################### sub normRatioMedian { print("\n#Normalizing"); # gGood and rGood have already been sorted in backgnd subroutine # only take upper 1/2 of these as good genes @geneGood = sort by_green_and_red @geneGood; @geneGood = @geneGood[int($goodCount/2)..($goodCount-1)]; $goodCount = int($goodCount/2); print("\n#Using median\n"); my $ls = Statistics::OLS->new(); $ls->setData(\@gGood, \@rGood) or die( $ls->error() ); $ls->regress() or die ($ls->error() ); ($gVar, $rVar, $cov) = $ls->var(); ($gMean, $rMean) = $ls->av(); $gStd = sqrt($gVar); $rStd = sqrt($rVar); # compute median @gSorted = sort by_green @geneGood; @rSorted = sort by_red @geneGood; $gMedian = $gSorted[ int( ($#gSorted+1) / 2) ]{"green"} ; #use median to estimate mean $rMedian = $rSorted[ int( ($#rSorted+1) / 2) ]{"red"} ; $avgMedian = ($gMedian + $rMedian) / 2; $avgMedian = $SCALE if ($SCALE); # if specified, use externally-provided scale # scale minimums to their future values, then find smallest $minGreen = $avgMedian * $minGreen / $gMedian; $minRed = $avgMedian * $minRed / $rMedian; if ($minGreen < $minRed) { $minValue = $minGreen; } else { $minValue = $minRed; } $gThresh = $avgMedian * $gThresh / $gMedian; $rThresh = $avgMedian * $rThresh / $rMedian; # loop over genes, scaling green and red values and taking ratio, etc. # The genes should end up along the line y=x, with smallest value at (1,1) for ($i=0; $i<$TOTALGENES; $i++) { # only modify green and red if spot found. if ($gene[$i]{flag} !~ /N/) { $gene[$i]{green} = $avgMedian * $gene[$i]{green} / $gMedian; $gene[$i]{red} = $avgMedian * $gene[$i]{red} / $rMedian; } } } ###################################################################### sub computeRatio { my $count = 0; # can only compute ratios if normalization hasn't drastically affected the data offsets if ($gThresh <= 1) { print("WARNING: readjusting non-positive green threshold\n"); $gThresh = 1; } if ($rThresh <= 1) { print("WARNING: readjusting non-positive red threshold\n"); $rThresh = 1; } for ($i=0; $i<$TOTALGENES; $i++) { # compute ratio if ( $gene[$i]{flag} =~ /XY|N/) { # if both values low or no spot found, return 1 $gene[$i]{ratio} = 1; } elsif ( $gene[$i]{flag} =~ /X/ ) { # if just green low, use conservative limit $gene[$i]{ratio} = $gThresh / $gene[$i]{red}; } elsif ( $gene[$i]{flag} =~ /Y/ ) { # same for red $gene[$i]{ratio} = $gene[$i]{green} / $rThresh ; } else { # no low values to worry about $gene[$i]{ratio} = $gene[$i]{green} / $gene[$i]{red} ; } # take log ratio if ($BASE =~ /e/) { $gene[$i]{log} = log ( $gene[$i]{ratio} ); } else { $gene[$i]{log} = log ( $gene[$i]{ratio} ) / log ($BASE); } $avgRatio += $gene[$i]{ratio}; $avgLog += $gene[$i]{log}; $green[$count] = $gene[$i]{green}; $red[$count++] = $gene[$i]{red}; $avgGreenAfter += $gene[$i]{green}; $avgRedAfter += $gene[$i]{red}; } print("\n\n"); $avgRatio /= $TOTALGENES; $avgLog /= $TOTALGENES; $avgGreenAfter /= $TOTALGENES; $avgRedAfter /= $TOTALGENES; @sorted = sort by_value @green; $medianGreenAfter = @sorted[$count / 2]; @sorted = sort by_value @red; $medianRedAfter = @sorted[$count / 2]; } ###################################################################### sub printOut { @sorted = sort by_ratio @gene; printf OUTFILE ("#Using base $BASE for log ratios\n"); printf OUTFILE ("#TOTALS: genes = %d, slide rows = %d, slide cols = %d \n", $TOTALGENES, $maxRow, $maxCol); printf OUTFILE ("#GREEN FOREGROUND: mean = %6.1f, median = %6.1f\n", $avgGreen, $medianGreen); printf OUTFILE ("#RED FOREGROUND: mean = %6.1f, median = %6.1f\n", $avgRed, $medianRed); printf OUTFILE ("#GREEN BACKGROUND: mean = %6.1f, stddev = %6.2f\n", $gbGlobalAvg, $gbGlobalStd); printf OUTFILE ("#RED BACKGROUND: mean = %6.1f, stddev = %6.2f\n", $rbGlobalAvg, $rbGlobalStd); printf OUTFILE ("#BACKGROUND THRESHOLDS: green = %6.1f, red = %6.2f\n", $gThresh, $rThresh); if ($NORM eq "median") { my $ratio1 = $ratio2 = $ratio3 = $ratio4 = 0; $ratio1 = $avgGreen/$avgRed if ($avgRed > 0); $ratio2 = $medianGreen/$medianRed if ($medianRed > 0); $ratio3 = $avgGreenAfter/$avgRedAfter if ($avgRedAfter > 0); $ratio4 = $medianGreenAfter/$medianRedAfter if ($medianRedAfter > 0); printf OUTFILE ("\n#Using Median Normalization\n"); if ($SCALE) { print OUTFILE ("#Scale specified at $SCALE\n"); } else { print OUTFILE ("#Scale set to average median $avgMedian\n"); } printf OUTFILE ("#%d genes used for normalization \n", $goodCount); printf OUTFILE ("#BEFORE NORMALIZATION:\n"); printf OUTFILE ("# (green avg / red avg) = %5.2f\n", $ratio1); printf OUTFILE ("# (green med / red med) = %5.2f\n", $ratio2); printf OUTFILE ("#AFTER NORMALIZATION:\n"); printf OUTFILE ("# (green avg / red avg) = %5.2f\n", $ratio3); printf OUTFILE ("# (green med / red med) = %5.2f\n", $ratio4); } histogram(OUTFILE); # print histogram of log ratios # finally ready to print ratios for each gene print OUTFILE ("#GENE GENE RATIO LOG ------ ------ ----- -SLIDE-\n"); print OUTFILE ("#NAME DESCRIPT X/Y RATIO X_INT Y_INT FLAG ROW COL\n"); print OUTFILE ("#-------- --------- -------- ------ ------ ------ ----- --- ---\n"); for ($i=0; $i<$TOTALGENES; $i++) { $sorted[$i]{flag} = "-" if ($sorted[$i]{flag} eq ""); printf OUTFILE ("%-9s %9s %8.4f %6.3f %6.0f %6.0f %5s %3d %3d\n", $sorted[$i]{orf}, $sorted[$i]{gene}, $sorted[$i]{ratio}, $sorted[$i]{log}, $sorted[$i]{green}, $sorted[$i]{red}, $sorted[$i]{flag}, $sorted[$i]{row}, $sorted[$i]{col} ); } my @x, @y; for ($i=0; $i<$TOTALGENES; $i++) { $x[$i] = $sorted[$i]{green}; $y[$i] = $sorted[$i]{red}; } # for ($i=0; $i<$goodCount; $i++) { # $x[$i] = $geneGood[$i]{green}; # $y[$i] = $geneGood[$i]{red}; # } psplot(\@x, \@y, "$outfile.ps", -1000, 50000, -1000, 50000); } ############################################################################# #THE FOLLOWING CODE IS NOT CURRENTLY SUPPORTED BUT MAY BE USEFUL IN THE FUTURE ############################################################################# sub normRatioWindow { $WINDOWSIZE = 100; $MID = $WINDOWSIZE / 2; print("\n#Normalizing by window of $WINDOWSIZE"); # compute preNormalized ratio for ($i=0; $i<$TOTALGENES; $i++) { if ($gene[$i]{flag} !~ /N|S|GR/) { $gene[$i]{notNormalRatio} = $gene[$i]{green} / $gene[$i]{red}; } else { $gene[$i]{notNormalRatio} = 1; } } # compute window normalized ratio @gene = sort by_green_and_red @gene; for ($i=0; $i<$TOTALGENES; $i++) { # set window boundaries $windowMin = $i - ($WINDOWSIZE/2); $windowMax = $i + ($WINDOWSIZE/2); if ($windowMin < 0) { $windowMin = 0; $windowMax = $WINDOWSIZE; } elsif ($windowMax >= $TOTALGENES) { $windowMax = $TOTALGENES-1; $windowMin = $TOTALGENES-1 - $WINDOWSIZE; } # get median ratio for this region: norm factor is median ratio @win = sort by_notNormalRatio @gene[$windowMin .. $windowMax]; if ($win[$MID]{green} > 0 && $win[$MID]{red} > 0) { $gene[$i]{normFactor} = $win[$MID]{green}/$win[$MID]{red}; } else { $gene[$i]{normFactor} = 1; } # print out normFactor curve on log scale normCurve($i); print (".") if ($i % 100 == 0) ; # adjust red value to green one. $gene[$i]{red} = $gene[$i]{red} * $gene[$i]{normFactor}; } } ###################################################################### sub normCurve { $geneindex = $_[0]; # first time print out header info and set defaults if (!defined ($init) ) { $init = 1; $XSTEP = $TOTALGENES / 20; # will print out a data pt every $XSTEP genes $YZERO = 20; # this is the ASCII text column that will by Y = 0; $YSTEP = .2; # each ASCII column corresponds to a normFactor delta of YSTEP print OUTFILE ("\n#NORMALIZATION FACTOR CURVE (window length = $WINDOWSIZE)\n"); printf OUTFILE ("# Log10 (norm. factor)\n# "); for ($col = $YZERO-15; $col < $YZERO+15; $col += 5) { printf OUTFILE ("%4.1f ", $YSTEP * ($col - $YZERO) ); } print OUTFILE ("\n# ------------------------------\n"); print OUTFILE ("#gene|"); for ($col = $YZERO-15; $col < $YZERO+15; $col += 5) { printf OUTFILE (" | "); } print OUTFILE ("\n"); } # first and every time, print out data point if ( $geneindex % $XSTEP == 0) { $logFactor = log ($gene[$geneindex]{normFactor}) / log (10); $yposition = $YZERO + ($logFactor / $YSTEP); printf OUTFILE ("#%4d|", $i); for ($col = 0; $col < $yposition; $col++) { printf OUTFILE (" "); } print OUTFILE ("x\n"); } } ###################################################################### sub normRatioRegress { $avgRatio = $avgLogRatio = 0; print("\n#Normalizing"); # regress gGood and rGood, which have already been sorted in backgnd subroutine print("\n#Performing linear regression"); my $ls = Statistics::OLS->new(); $ls->setData(\@gGood, \@rGood) or die( $ls->error() ); $ls->regress() or die ($ls->error() ); ($intercept, $slope) = $ls->coefficients(); # scale minimums to their future values, then find smallest $minGreen = $minGreen * $slope; $minRed = $minRed - $intercept; if ($minGreen < $minRed) { $minValue = $minGreen; } else { $minValue = $minRed; } # loop over genes, scaling green and red values and taking ratio, etc. # The genes should end up along the line y=x, with smallest value at (1,1) for ($i=0; $i<$TOTALGENES; $i++) { # only modify green and red if spot found. if ($gene[$i]{flag} !~ /N/) { $gene[$i]{green} = $gene[$i]{green} * $slope; $gene[$i]{red} = $gene[$i]{red} - $intercept; } } } ###################################################################### sub normRatioMean { my $M = 3000; # global multiplicative factor my $B = 3800; print("\n#Normalizing"); # regress gGood and rGood, which have already been sorted in backgnd subroutine print("\n#Using mean and std. deviation\n"); my $ls = Statistics::OLS->new(); $ls->setData(\@gGood, \@rGood) or die( $ls->error() ); $ls->regress() or die ($ls->error() ); ($gVar, $rVar, $cov) = $ls->var(); ($gMean, $rMean) = $ls->av(); $gStd = sqrt($gVar); $rStd = sqrt($rVar); # compute median and MAD @gSorted = sort by_green @geneGood; @rSorted = sort by_red @geneGood; $gMedian = $gSorted[ int( ($#gSorted+1) / 2) ]{"green"} ; #use median to estimate mean $rMedian = $rSorted[ int( ($#rSorted+1) / 2) ]{"red"} ; $avgMedian = ($gMedian + $rMedian) / 2; $gMAD = 1.3 * MAD($gMean, "green", @geneGood); $rMAD = 1.3 * MAD($rMean, "red", @geneGood); # scale minimums and thresholds to their future values $minGreen = $M * ($minGreen - $gMean) / $gMAD + $B; $minRed = $M * ($minRed - $rMean) / $rMAD + $B; if ($minGreen < $minRed) { $minValue = $minGreen; } else { $minValue = $minRed; } $gThresh = $M * ($gThresh - $gMean) / $gMAD + $B; $rThresh = $M * ($rThresh - $rMean) / $rMAD + $B; # loop over genes, scaling green and red values and taking ratio, etc. # The genes should end up along the line y=x, with smallest value at (1,1) for ($i=0; $i<$TOTALGENES; $i++) { # only modify green and red if spot found. if ($gene[$i]{flag} !~ /N/) { $gene[$i]{green} = ($M * ($gene[$i]{green} - $gMean) / $gMAD) + $B; $gene[$i]{red} = ($M * ($gene[$i]{red} - $rMean) / $rMAD) + $B; } } }