#! /usr/bin/perl # # mergeConds [OPTIONS] -out -data [ ...] # # Trey Ideker 6/2000 # ################################################################################ # global variables and hashes my(%gene, %info); my($NUMCONDS, $NUMGENES); # parse command line $USAGE=" mergeConds [OPTIONS] -out -conds [ ...]\n". " -lam \n" . " -rat \n" . " -std \n" . " -n \n"; $NUMCONDS = -1; $LAMBDA = $RATIO = $SAMPLES = 0; $STD = 1000; # init STD to a very large number for ($i=0; $i<=$#ARGV; $i++) { if ($ARGV[$i] =~ /-lam/) { $LAMBDA = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-rat/) { $RATIO = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-std/) { $STD = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-n/) { $SAMPLES = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-out/) { $outFile = $ARGV[++$i]; } elsif ($ARGV[$i] =~ /-conds/) { $NUMCONDS = 0; } elsif ($NUMCONDS >= 0) { $condFiles[$NUMCONDS++] = $ARGV[$i]; } } if ( $NUMCONDS == -1 ) { die "\nCannot parse command line.\n$USAGE\n\n"; } if ( !defined($outFile)) { die "\nCannot parse command line.\n$USAGE\n\n"; } # define significant genes in hash structure print("#DETERMINING SIGNIFICANT GENES OVER ALL $NUMCONDS CONDITIONS...\n"); $filenum = 0; foreach $file (@condFiles) { $sigNum[$filenum] = getSigGenes($file, $filenum); $filenum++; } # read back through files, this time getting all of the ratios for sig genes @keys = keys(%gene); $NUMGENES = $#keys + 1; print("\n#FOUND $NUMGENES SIGNIFICANT GENES, READING RATIOS...\n"); $filenum = 0; foreach $file (@condFiles) { getRatios($file, $filenum++); } # print out matrix of ratios for each sig. gene over all conditions # first print header information die "Cannot access file $file\n" if (open(OUTFILE, ">$outFile") == 0 ); print OUTFILE ("\t\tRATIOS"); foreach $file (@condFiles) { print OUTFILE ("\t"); } print OUTFILE ("LAMBDAS\n"); print OUTFILE ("GENE\tDESCRIPT\t"); # print headings for ratios foreach $file (@condFiles) { @parts = split (/\//, $file); # split on / printf OUTFILE ("%s\t", pop(@parts) ); } # print headings for lambdas foreach $file (@condFiles) { @parts = split (/\//, $file); # split on / printf OUTFILE ("%s\t", pop(@parts) ); } print OUTFILE ("NumSigConds\n"); # now print actual gene ratios (and lambdas) foreach $ORF (keys(%gene)) { printRatios(OUTFILE, $ORF); } # print summary information print OUTFILE ("NumSigGenes:\t"); foreach $num (@sigNum) { printf OUTFILE ("\t%6d", $num); } # finish up close(OUTFILE); print ("# Done\n\n"); # print bit vector info for Phylip tree # printBitVector(); ################################################################################ # this routine just defines the significant genes for a file, it does not get # info for them. Argument [0] is a filename. Argument [1] is the number of this file. # Returns the number of significant genes in the file. sub getSigGenes { my (@entry, $i, $sigcount, $condNum, @sig, @ratios, @lambdas); $condNum = $_[1]; die "Cannot access file $file\n" if (open(FILE, "$_[0]") == 0 ); print("# $file\n"); # determine if header contains lambda info. $_ = ; chomp; @entry = split; $LAMCOL = -1; for ($i=0; $i<$#entry; $i++) { $LAMCOL = $i if ($entry[$i] =~ /lambda/i); } while () { chomp; next if /^\#/; # ignore line if a comment next if !/\w/; # ignore line if no words in it @entry = split; if ($entry[1] eq "") {die "Blank name for $ORF, file $_[0]\n" ; } $ORF = $entry[0]; $n = $entry[2]; $std = $entry[5]; # if present in file, take lambda and ratio from end if ($LAMCOL >= 0) { $lam = $entry[$LAMCOL]; $rat = $entry[$LAMCOL+1]; } else { $lam = 0; $rat = $entry[4]; } # based on input params, decide whether to reject gene or to record it next if ($lam < $LAMBDA); # skip if lambda does not make cutoff value next if (abs($rat) < $RATIO); # skip if ratio does not make cutoff value next if ($n < $SAMPLES); # skip if too few samples next if ($std > $STD); # skip if stdev in ratio is too large # made it to here- must be significant gene!!! if ( !defined($gene{$ORF}) ) { $gene{$ORF}->{"name"} = $entry[1]; # initialize ratios array to all zeros; for ($i=0; $i<$NUMCONDS; $i++) { $ratios[$i] = 0.0; $lambdas[$i] = 0.0; $sig[$i] = 0.0; } $sig[$condNum] = 1; $gene{$ORF}->{"ratios"} = [ @ratios ]; # ptr to array $gene{$ORF}->{"lambdas"} = [ @lambdas ]; # ptr to array $gene{$ORF}->{"sig"} = [ @sig ]; # ptr to array $gene{$ORF}->{"sigConds"} = 1; } else { $gene{$ORF}->{"sigConds"}++; $gene{$ORF}->{"sig"}[$condNum] = 1; } $sigcount++; } close(FILE); return $sigcount; } ################################################################################ # argument [0] is a filename, arg [1] is a filenumber sub getRatios { my(@entry); die "Cannot access file $file\n" if (open(FILE, "$_[0]") == 0 ); print("# $file\n"); # determine if header contains lambda info. $_ = ; chomp; @entry = split; $LAMCOL = -1; for ($i=0; $i<$#entry; $i++) { $LAMCOL = $i if ($entry[$i] =~ /lambda/i); } while () { chomp; next if /^\#/; # ignore line if a comment next if !/\w/; # ignore line if no words in it @entry = split; $ORF = $entry[0]; if ($LAMCOL >= 0) { $l = $entry[$LAMCOL]; $rat = $entry[$LAMCOL+1]; } else { $l = 0; $rat = $entry[4]; } # if in sig. list, record information for this ORF if (defined($gene{$ORF})) { @{ $gene{$ORF}->{"ratios"}}[ $_[1] ] = $rat; @{ $gene{$ORF}->{"lambdas"}}[ $_[1] ] = $l; } } close(FILE); } ################################################################################ # first arg [0] is a file handle, second arg [1] is an ORF name sub printRatios { my($FILE, $ORF); $FILE = $_[0]; $ORF = $_[1]; printf $FILE ("%s\t%s", $ORF, $gene{$ORF}->{"name"}); @ratios = @{ $gene{$ORF}->{"ratios"} }; @lambdas = @{ $gene{$ORF}->{"lambdas"} }; for ($i=0; $i<$NUMCONDS; $i++) { printf $FILE ("\t%6.3f", $ratios[$i]); } for ($i=0; $i<$NUMCONDS; $i++) { printf $FILE ("\t%6.3f", $lambdas[$i]); } printf $FILE ("\t%d\n", $gene{$ORF}->{"sigConds"}); } ################################################################################ sub printBitVector { my($cond, $gene, $i, @ratios); $thisFile = $outFile . ".bv"; open(OUTFILE, ">$thisFile"); printf OUTFILE (" $NUMCONDS $NUMGENES\n"); for ($cond = 0; $cond < $NUMCONDS; $cond++) { @line = split(/\./, $condFiles[$cond]); $condName = $line[0] . (" " x 10); $condName = substr($condName, 0, 9); printf OUTFILE ("$condName"); foreach $ORF (keys(%gene)) { @sigs = @{ $gene{$ORF}->{"sig"} }; printf OUTFILE (" $sigs[$cond]"); } printf OUTFILE ("\n"); } close(OUTFILE); } ###################################################################### # arg [0] is the key file name- constructs a global hash called info sub readKeyFile { my ($name, $orf, @entry, $count); if ( open(KEYFILE, "$_[0]") == 0 ) { die "\nCannot open ann file\n\n" ; } while () { if (/\d/) { # if line not blank, read it in and parse it @entry = split(/\t/); # split on tabs $name = $entry[0]; $orf = $entry[2]; # get only some columns of YPD file $info{"$orf"}->{"majorLoc"} = ($entry[23] || "?"); $info{"$orf"}->{"minorLoc"} = ($entry[24] || "?"); $info{"$orf"}->{"environ"} = ($entry[25] || "?"); $info{"$orf"}->{"function"} = ($entry[26] || "?"); $info{"$orf"}->{"descript"} = ($entry[67] || "?"); $count++; } } close(KEYFILE); print("# Processed $count gene entries\n"); }