#!/usr/bin/perl # mergeReps # A script to MERGE REPlicate microarray files. Multiple measurements of a gene # ratio, present across multiple ratio files, are combined into one averaged # measurement. Ratio files are specified in a fileTable. In the table, a # reversed flag -r can be set to indicate a reverse labeling relative to the # other files. The -err option sorts by std. error, not by ratio, the default # The [-opt #] option only outputs unsaturated and above-background genes having # number of samples >= # # ###################################################################################### use POSIX; $USAGE = " mergeReps [OPTIONS] \n" . " -opt \n" . " -filter {on,off}\n" . " -exclude \n"; if ($#ARGV < 0) { die "\nCannot parse command line.\n$USAGE\n\n"; } # define global variables $filter = "on"; $opt = 0; $MAXSAMPLES = 1; $EXCLUDE_OUTLIER = 1; $GENENUM = 0; $THRESH = 800; $GBGND = $RBGND = $GBGNDSTD = $RBGNDSTD = 0.0; # parse command line for ($i=0; $i<=$#ARGV; $i++) { if ($ARGV[$i] eq "-filter") { $filter = $ARGV[++$i]; } elsif ($ARGV[$i] eq "-opt") { $opt = $ARGV[++$i]; } # get min number of genes elsif ($ARGV[$i] eq "-exclude"){ $exFile = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /^-/) { die ("Unrecognized parameter $ARGV[$i]\n$USAGE\n\n"); } elsif (!defined($filetable)) { $filetable = $ARGV[$i]; } # read file table with ratio files elsif (!defined($outfile)) { $outfile = $ARGV[$i]; } # read output file name } if (!defined($filetable) || !defined($outfile)) { die ("Must specify both and files\n$USAGE\n\n"); } if ($filter ne "on" && $filter ne "off") { die ("Don't understand value for -filter option: $filter\n"); } # first read filetable, which in turn reads the files listed in it # NOTE: samples which have N flag are not read in! readFileTable($filetable); readBadGenes($exFile) if ($exFile); print("#$GENENUM total genes read\n"); printf("#Average background: green = %6.1f, red = %6.1f\n", $GBGND, $RBGND); printf("#Average stddev background: green = %6.1f, red = %6.1f\n", $GBGNDSTD, $RBGNDSTD); if ($filter eq "on" && $samples > 30) { $EXCLUDE_OUTLIER = 0 ; print("#NOT EXCLUDING OUTLIERS: can only exclude if less than 30 samples\n"); } # reject outliers, determine mean and standard error of ratios, and compare to reference foreach $ORF (keys (%gene) ) { if ($filter eq "on") { if ($exFile && isBadGene( $gene{$ORF} )) { delete ( $gene{$ORF} ); $excludeNum++; next; } excludeOutliers( $gene{$ORF} ) if ($EXCLUDE_OUTLIER); } statisticsOnValues ( $gene{$ORF} ); getNumSlides ( $gene{$ORF} ); # calculate max samples $MAXSAMPLES = $gene{$ORF}->{"samples"} if ( $MAXSAMPLES < $gene{$ORF}->{"samples"} ); # compute coefficient of variation if ($gene{$ORF}->{"green"} > $THRESH) { $gCV += $gene{$ORF}->{"stdGreen"} / $gene{$ORF}->{"green"}; $gCVcount++; } if ($gene{$ORF}->{"red"} > $THRESH) { $rCV += $gene{$ORF}->{"stdRed"} / $gene{$ORF}->{"red"}; $rCVcount++; } } $gCV /= $gCVcount; $rCV /= $rCVcount; # take average coefficient of variation printf ("#Max samples per gene: %d\n", $MAXSAMPLES); printf ("#Estimate of Coeff. of Var: green = %6.2f, red = %6.2f\n", $gCV, $rCV); printf ("#Excluded $excludeNum genes as bad\n") if ($exFile); # step back through (sorted) genes and print out info die "Cannot access file $outfile\n" if (open(OUT, ">$outfile") == 0 ); print("#Writing merged results to file $outfile\n"); @sorted = sort by_mean keys(%gene); printOutput(OUT, @sorted); close(OUT); ###################################################################### # # SUBROUTINES # ###################################################################### # first and only argument is the filename of the table of ratio files sub readFileTable { my ($file, $dir, $slide, $count); die "Cannot access file $_[0]\n" if (open(FT, "$_[0]") == 0 ); while() { chop; next if /^\#/; # ignore line if a comment next if !/\w/; # ignore line if no words in it ($file, $dir, $slide) = split; print("#Reading file $file\n"); readRatioFile($file, $dir, $slide); $count++; } $GBGND /= $count; $RBGND /= $count; $GBGNDSTD /= $count; $RBGNDSTD /= $count; close(FT); } ###################################################################### # first and only argument is the filename of the bad genes file sub readBadGenes { my ($file); die "Cannot access file $_[0]\n" if (open(FT, "$_[0]") == 0 ); while () { chop; next if /^\#/; # ignore line if a comment next if !/\w/; # ignore line if no words in it if ( /^(\S+)/ ) { $badGenes{"$1"} = 1; } } close(FT); } ###################################################################### # arg [0] is a filename to read, arg [1] the direction (f or r) # and arg [2] is the slide identifier, usually 1, 2, 3, etc. sub readRatioFile { my ($dir, $slide); die "Cannot access file $_[0]\n" if (open(RFILE, "$_[0]") == 0 ); $dir = $_[1]; $slide = $_[2]; while () { chop; # look for background information in ratio file header if ( /GREEN BACKGROUND.+ (\d+\.\d+).+ (\d+\.\d+)/ ) { $GBGND += $1; $GBGNDSTD += $2; } elsif ( /RED\s+BACKGROUND.+ (\d+\.\d+).+ (\d+\.\d+)/ ) { $RBGND += $1; $RBGNDSTD += $2; } @entry = split; next if /^\#/; # ignore line if a comment next if !/\d/; # ignore line if no numbers in it $ORF = $entry[0]; $flag = $entry[6]; $flag =~ s/-//; # get rid of "null" flag # exclude SOME genes right away next if ( ($opt || $filter eq "on") && $flag =~ /N/); next if ($opt && $flag =~ /S/); if ($entry[2] <= 0) { print("Trouble processing line:\n $_\n"); next; } # get green and red ints, reversing if reverse labeling flag indicated if ($dir eq "f") { $ratio = $entry[2]; $green = $entry[4]; $red = $entry[5]; } elsif ($dir eq "r") { $ratio = 1 / $entry[2]; $green = $entry[5]; $red = $entry[4]; $flag =~ tr/XYAB/YXBA/; # reverse flags as well } else { die ("ERROR in file table : Labeling direction must be f or r\n"); } $ratio = .0001 if ($ratio == 0); # if ratio zero at this level of precision, up it. # if no spot found at this location if ($flag =~ /N/) { $ratio = 1; $green = 0; $red = 0; } if ( !defined($gene{$ORF}) ) { # if gene not been read in before $samples = 0; $gene{$ORF}->{"name"} = $entry[1]; $gene{$ORF}->{"values"} = [ () ]; # ptr to array $GENENUM++; } else { # if gene been read in before add to prev. record $samples = $gene{$ORF}->{"samples"}; } @values = @ {$gene{$ORF}->{"values"} }; $values[$samples]{"dir"} = $dir; $values[$samples]{"slide"} = $slide; $values[$samples]{"green"} = $green; $values[$samples]{"red"} = $red; $values[$samples]{"ratio"} = $ratio; $values[$samples]{"log"} = log($ratio) / log(10); $values[$samples]{"flag"} = $flag; $gene{$ORF}->{"values"} = [ @values ]; $gene{$ORF}->{"samples"} = $samples + 1; $gene{$ORF}->{"gFlag"}++ if ($entry[7] =~ /X/); $gene{$ORF}->{"rFlag"}++ if ($entry[7] =~ /Y/); $gene{$ORF}->{"orf"} = $ORF; } # end while loop stepping through this file close(RFILE); } ################################################################################# # first arg [0] is the output filehandle, second arg [1] is a list of keys to print sub printOutput { $FILE = $_[0]; if ($header == 0) { $row1 = "#GENE_NAME DESCRIPT. N S RATIO STD "; $row2 = "#--------- --------- - - ------- ------- "; for ($i=0; $i<$MAXSAMPLES; $i++) { $row1 .= " X$i " . " Y$i " . " F$i "; $row2 .= "------ " . "------ " . "---- "; } print $FILE ("$row1\n$row2\n"); $header = 1; } else { print $FILE ("\n" . "#" x 110 . "\n\n"); # print a bunch of #'s as a separator } foreach $ORF (@_[1..$#_]) { $name = $gene{$ORF}->{"name"}; $samples = $gene{$ORF}->{"samples"}; $slides = $gene{$ORF}->{"slides"}; $mean = $gene{$ORF}->{"mean"}; $stdRatio = $gene{$ORF}->{"stdRatio"}; $green = $gene{$ORF}->{"green"}; $stdGreen = $gene{$ORF}->{"stdGreen"}; $red = $gene{$ORF}->{"red"}; $stdRed = $gene{$ORF}->{"stdRed"}; @values = @ {$gene{$ORF}->{"values"} }; @outliers = @ {$gene{$ORF}->{"outliers"} } if (defined ($gene{$ORF}->{"outliers"} )) ; next if ($opt && $samples < $opt); # skip if too few samples for optimization printf $FILE ("%-10s\ %10s %1d %1d %7.4f %7.4f ", $ORF, $name, $samples, $slides, $mean, $stdRatio); for ($i = 0; $i < $samples; $i++) { $values[$i]{"flag"} = "-" if ($values[$i]{"flag"} eq ""); printf $FILE ("%6d %6d %4s ", $values[$i]{"green"}, $values[$i]{"red"}, $values[$i]{"flag"}); } for ($i = 0; $i <= $#outliers; $i++) { printf $FILE ("%6d %6d %4s ", $outliers[$i]{"green"}, $outliers[$i]{"red"}, $outliers[$i]{"flag"} . "O"); } for ($i = 0; $i < $MAXSAMPLES - $samples - $#outliers - 1; $i++) { printf $FILE (" - - - "); } print $FILE ("\n"); } # end foreach loop } #################################################################################### # only arg [0] is one element of the gene array. From this, extract array of values sub statisticsOnValues { @v = @ { $_[0]->{"values"} }; # first calculate mean of log ratio and of green and red ints. $logRatio = $green = $red = 0.0; $N = $#v + 1 ; for ($i=0; $i<$N; $i++) { $logRatio += $v[$i]{"log"}; $green += $v[$i]{"green"}; $red += $v[$i]{"red"}; } $logRatio /= $N; $green /= $N; $red /= $N; # then calculate standard error and correlation coefficient $stdRatio = $stdGreen = $stdRed = $corr = 0.0; $errRatio = $errGreen = $errRed = 0.0; if ($N > 1) { for ($i=0; $i<$N; $i++) { $stdRatio += ($v[$i]{"log"} - $logRatio)**2; $stdGreen += ($v[$i]{"green"} - $green)**2; $stdRed += ($v[$i]{"red"} - $red)**2; $corr += ($v[$i]{"green"} - $green) * ($v[$i]{"red"} - $red); } $corr = $corr / sqrt($stdGreen * $stdRed) if ($stdGreen > 0 && $stdRed > 0); $stdRatio /= ($N-1); $stdGreen /= ($N-1); $stdRed /= ($N-1); $stdRatio = sqrt($stdRatio); $errRatio = $stdRatio / sqrt($N); $stdGreen = sqrt($stdGreen); $errGreen = $stdGreen / sqrt($N); $stdRed = sqrt($stdRed); $errRed = $stdRed / sqrt($N); } # load these values back into gene array $_[0]->{"mean"} = $logRatio; $_[0]->{"stdRatio"} = $stdRatio; $_[0]->{"errRatio"} = $errRatio; $_[0]->{"green"} = $green; $_[0]->{"stdGreen"} = $stdGreen; $_[0]->{"errGreen"} = $errGreen; $_[0]->{"red"} = $red; $_[0]->{"stdRed"} = $stdRed; $_[0]->{"errRed"} = $errRed; $_[0]->{"corr"} = $corr; } ######################################################################### # the argument is a gene from the gene array, returns the number of unique # microarray slides which are represented by the values for that gene sub getNumSlides { my (@values, $value, $N); my ($id, $count, %slidehash); @values = @ { $_[0]->{"values"} }; $N = $#values+1; foreach $value (@values) { $id = $value->{"slide"}; if ( !defined($slidehash{$id}) ) { $count++; $slidehash{$id} = 1; } } $_[0]->{"slides"} = $count; } ############################################################################### # first arg [0] is one element of the gene array. Possibly removes values from # the values list, and pushes them on an outliers list. sub excludeOutliers { # use Dixon's Test (ref. Dunn & Clark 1987) # samples must be from a normal distribution my @outliers = (); my @values = @ { $_[0]->{"values"} }; my $samples = $_[0]->{"samples"} ; my $N; # distribution taken from Dixon (1951) and given in table A.6. of Dunn & Clark (1987) # Assuming alpha = .1 (which is standard for this test), .05, or .01, the dist array below # lists the value of the distribution for sample size n, used as the index to the array # The values provided can accomodate from 3 to 30 samples if (!defined @rdist10) { @rdist10 = (0, 0, 0, .886, .679, .557, .482, .434, .399, .370, .349, .332, .318, .305, .294, .285, .277, .269, .263, .258, .252, .247, .242, .238, .234, .230, .227, .224, .220, .218, .215); @rdist05 = (0, 0, 0, .941, .765, .642, .560, .507, .468, .437, .412, .392, .376, .631, .349, .338, .329, .320, .313, .306, .300, .295, .290, .285, .281, .277, .273, .269, .266, .263, .260); @rdist01 = (0, 0, 0, .988, .889, .780, .698, .637, .590, .555, .527, .502, .482, .465, .450, .438, .426, .416, .407, .398, .391, .384, .378, .372, .367, .362, .357, .353, .349, .345, .341); } foreach $color ("green", "red") { if ($samples >= 3) { # can only do outlier rejection if 3 or more samples if ($color eq "green") { @Y = sort by_green @values; } else { @Y = sort by_red @values; } $N = $samples; # compute statistics r10 and r01 next if (($Y[$N-1]{$color} - $Y[0]{$color}) == 0); # check for divByZero error $r10 = ($Y[1]{$color} - $Y[0]{$color}) / ($Y[$N-1]{$color} - $Y[0]{$color}) ; $r01 = ($Y[$N-1]{$color} - $Y[$N-2]{$color}) / ($Y[$N-1]{$color} - $Y[0]{$color}) ; if ( $r01 > $rdist05[$N] ) { $temp = pop(@Y); # remove value from the regular values array push( @outliers, $temp ); # and push it on outliers array $samples -= 1; # update number of samples } if ( $r10 > $rdist05[$N] ) { $temp = shift(@Y); # remove value from the regular values array unshift( @outliers, $temp ); $samples -= 1; # update number of samples } @values = @Y; } } $_[0]->{"values"} = [ @values ]; $_[0]->{"outliers"} = [ @outliers ]; $_[0]->{"samples"} = $samples; } ###################################################################### # first arg [0] is gene array. Returns True [1] if gene is BAD sub isBadGene { foreach $badG (keys (%badGenes) ) { return 1 if ( $_[0]->{"name"} eq $badG) ; return 1 if ( $_[0]->{"orf"} eq $badG) ; } return 0; } ###################################################################### sub by_mean { $aratio = $gene{$a}->{"mean"}; $bratio = $gene{$b}->{"mean"}; #print("a = $a, b = $b, ARATIO = $aratio, BRATIO = $bratio\n"); return $aratio <=> $bratio; } ###################################################################### sub by_sig_then_mean { $asig = $gene{$a}->{"sigFlag"}; $bsig = $gene{$b}->{"sigFlag"}; return $asig <=> $bsig if ($asig != $bsig) ; $aratio = $gene{$a}->{"mean"}; $bratio = $gene{$b}->{"mean"}; #print("a = $a, b = $b, ARATIO = $aratio, BRATIO = $bratio\n"); return $aratio <=> $bratio; } ###################################################################### sub by_err { $aerr = $gene{$a}->{"errRatio"}; $berr = $gene{$b}->{"errRatio"}; return $aerr <=> $berr; } ###################################################################### sub by_ratio { return $$a{"ratio"} <=> $$b{"ratio"}; } ###################################################################### sub by_green { return $$a{"green"} <=> $$b{"green"}; } ###################################################################### sub by_red { return $$a{"red"} <=> $$b{"red"}; }