# LIBSHUFF v1.2 # 12/15/2003 # First load the sample file and put it in the variable @matrix open (MATRIX, "sample") || open (MATRIX, "sample.txt") || die "Could not find sample file.\n"; while () { @tmp=split; push @matrix, [ @tmp ]; } open (RESULTS, ">results"); open (CVALUES, ">coverages"); print ("\nWelcome to LIBSHUFF v1.2\n"); print ("Last updated: December 15, 2003\n\n"); print ("EARLIER VERSIONS OF LIBSHUFF CONTAINED AN ERROR - DO NOT USE THEM\n\n"); print ("Please make sure that your square distance matrix is sorted\n"); print ("such that X and Y are grouped together, and all sequences\n"); print ("in X are first in the distance matrix.\n\n"); print ("How many sequences are in X? "); $numX = ; print ("How many sequences are in Y? "); $numY = ; $totalseqs = $numX + $numY; $rounds = 1000; # Determine lowest distances for X # Distances stored in @X_lds $i = 0; for ($i..$numX-1) { $ld = 100; $j = 0; for ($j..$numX-1) { if ($matrix[$j][$i] < $ld && ($i != $j)) { $ld = $matrix[$j][$i]; @X_lds[$i] = $ld; } $j++; } $i++; } # Determine lowest distances of Y # LDs stored in @Y_lds $i = $numX; $k = 0; for ($i..$totalseqs-1) { $ld = 100; $j = $numX; for ($j..$totalseqs-1) { if ($matrix[$j][$i] < $ld && ($i != $j)) { $ld = $matrix[$j][$i]; @Y_lds[$k] = $ld; } $j++; } $i++; $k++; } # Determine lowest distances of XY # LDs stored in @XY_lds $i = 0; for ($i..$numX-1) { $ld = 100; $j = $numX; for($j..$totalseqs-1) { if($matrix[$j][$i] < $ld) { $ld = $matrix[$j][$i]; @XY_lds[$i]= $ld; } $j++; } $i++; } # Determine lowest distances of YX # LDs stored in @YX_lds $i = $numX; $k = 0; for ($i..$totalseqs-1) { $ld = 100; $j = 0; for($j..$numX-1) { if($matrix[$j][$i] < $ld) { $ld = $matrix[$j][$i]; @YX_lds[$k]= $ld; } $j++; } $i++; $k++; } # Convert all arrays into number of uniques # For the X and XY comparisons $d = 0; $j = 0; while ($d < 0.51) { $i = 0; $sumX = 0; $sumXY = 0; for ($i..$numX-1) { if ($X_lds[$i] < $d || $X_lds[$i] == $d) { $sumX++; } if ($XY_lds[$i] < $d || $XY_lds[$i] == $d) { $sumXY++; } $i++; } @X_uniqs[$j] = ($numX - $sumX); @XY_uniqs[$j] = ($numX - $sumXY); $d = $d + 0.01; $j++; $sum = 0; } # For the Y and YX comparisons $d = 0; $j = 0; while ($d < 0.51) { $i = 0; $sumY = 0; $sumYX = 0; for ($i..$numY-1) { if ($Y_lds[$i] < $d || $Y_lds[$i] == $d) { $sumY++; } if ($YX_lds[$i] < $d || $YX_lds[$i] == $d) { $sumYX++; } $i++; } @Y_uniqs[$j] = ($numY - $sumY); @YX_uniqs[$j] = ($numY - $sumYX); $d = $d + 0.01; $j++; $sum = 0; } # Now convert all values into coverage values $i = 0; $d = 0; $XY_deltaC = 0; $YX_deltaC = 0; print CVALUES (" D X Y XY YX diff-XY/X diff-YX/Y\n"); for ($i..50) { printf CVALUES ("%.2f",$d); print CVALUES (" "); $X_coverages[$i] = (1-($X_uniqs[$i]/$numX)); printf CVALUES ("%.7f",$X_coverages[$i]); print CVALUES (" "); $Y_coverages[$i] = (1-($Y_uniqs[$i]/$numY)); printf CVALUES ("%.7f",$Y_coverages[$i]); print CVALUES (" "); $XY_coverages[$i] = (1-($XY_uniqs[$i]/$numX)); printf CVALUES ("%.7f",$XY_coverages[$i]); print CVALUES (" "); $YX_coverages[$i] = (1-($YX_uniqs[$i]/$numY)); printf CVALUES ("%.7f",$YX_coverages[$i]," "); print CVALUES (" "); $diffXY = (($X_coverages[$i] - $XY_coverages[$i])**2); $diffYX = (($Y_coverages[$i] - $YX_coverages[$i])**2); printf CVALUES ("%.7f",$diffXY); print CVALUES (" "); printf CVALUES ("%.7f",$diffYX); print CVALUES ("\n"); $XY_deltaC = $XY_deltaC + (($X_coverages[$i] - $XY_coverages[$i])**2); $YX_deltaC = $YX_deltaC + (($Y_coverages[$i] - $YX_coverages[$i])**2); $i++; $d = $d + 0.01; } print("\nCompleted homologous and heterolgous coverage calculations for X and Y.\n\n"); print("The next calculations may take some time.\nPlease be patient.\n"); print ("\nBeginning shuffling of matrix for dimensions of XY.\n\n"); # Begin the shuffling of the matrix # since this needs to be done twice, we need an additional variable to # change the size of the matrix from X x Y to Y x X $trip = 0; while ($trip != 2) { if($trip == 1) { $temporary = $numX; $numX = $numY; $numY = $temporary; print ("\nBeginning shuffling of matrix for dimensions of YX.\n\n"); } $shuffle_number = 0; while ($shuffle_number != $rounds) { # First, set up a null array filled with zeros. $i = 0; for ($i..$totalseqs) { @null[$i] = 0; $i++; } # Pick random numbers and change the zeros in the null array into # ones once one is chosen. $i = 0; while ($i != $numY) { RANDOM: $randomnumber = int(rand($totalseqs)); # ranges between zero and $totalseqs if ($null[$randomnumber] == 1) { goto RANDOM; } @Y_shuffled[$i] = $randomnumber; @null[$randomnumber] = 1; $i++ } # Ok, let's put every other number into the X_shuffled array $i = 0; $j = 0; while ($i != $totalseqs) { if ($null[$i] == 0) { @X_shuffled[$j] = $i; $j++; } $i++; } # Get the lowest distance values of Y for each value of X $i = 0; for ($i..$numX-1) { $ld = 100; $j = 0; for ($j..$numY-1) { if ($matrix[$X_shuffled[$i]][$Y_shuffled[$j]] < $ld) { $ld = $matrix[$X_shuffled[$i]][$Y_shuffled[$j]]; @hetero_lds[$i] = $ld; } $j++; } $i++; } # Determine the lowest distances for the homologous X shuffle $i = 0; for ($i..$numX-1) { $ld = 100; $j = 0; for ($j..$numX-1) { if ($matrix[$X_shuffled[$j]][$X_shuffled[$i]] < $ld && ($i != $j)) { $ld = $matrix[$X_shuffled[$j]][$X_shuffled[$i]]; @homol_lds[$i] = $ld; } $j++; } $i++; } # Convert lds to number of uniques $d = 0; $j = 0; while ($d < 0.51) { $i = 0; $sumX = 0; $sumXY = 0; for ($i..$numX-1) { if ($homol_lds[$i] < $d || $homol_lds[$i] == $d) { $sumX++; } if ($hetero_lds[$i] < $d || $hetero_lds[$i] == $d) { $sumXY++; } $i++; } @X_shuffled_uniqs[$j] = ($numX - $sumX); @XY_shuffled_uniqs[$j] = ($numX - $sumXY); $d = $d + 0.01; $j++; } # Convert number of uniques to coverage values $i = 0; $d = 0; for ($i..50) { $X_shuffled_coverages[$i] = (1-($X_shuffled_uniqs[$i]/$numX)); $XY_shuffled_coverages[$i] = (1-($XY_shuffled_uniqs[$i]/$numX)); $i++; $d = $d + 0.01; } # Calculate delta C values for this shuffle comparison $i = 0; $delta = 0; $deltad = 0; for ($i..50) { $deltad = (($X_shuffled_coverages[$i]-$XY_shuffled_coverages[$i])**2); $delta = $delta + $deltad; if($trip == 0) { $XYmatrix[$i][$shuffle_number] = ( $deltad ); } if($trip == 1) { $YXmatrix[$i][$shuffle_number] = ( $deltad ); } $deltad = 0; $i++; } # Clean out all the arrays for re-use @null=0; @X_shuffled=0; @Y_shuffled=0; @hetero_lds=0; @homol_lds=0; @X_shuffled_uniqs=0; @XY_shuffled_uniqs=0; @X_shuffled_coverages=0; @XY_shuffled_coverages=0; if ($trip == 0) { @XY_values[$shuffle_number] = $delta;} if ($trip == 1) { @YX_values[$shuffle_number] = $delta;} if ($shuffle_number == 250) { print("25% finished.\n"); } if ($shuffle_number == 500) { print("50% finished.\n"); } if ($shuffle_number == 750) { print("75% finished.\n"); } $shuffle_number++; } $trip++; } # Sort all of the delta C values for the shuffles @XY_results = sort { $a <=> $b } @XY_values; @YX_results = sort { $a <=> $b } @YX_values; # Determine p-values for the comparisons $XY_pvalue = 1; $YX_pvalue = 1; $i = 0; for ($i..998) { if($XY_deltaC > $XY_results[$i] || $XY_deltaC == $XY_results[$i]) { $XY_pvalue = $XY_pvalue - 0.001; } if($YX_deltaC > $YX_results[$i] || $YX_deltaC == $YX_results[$i]) { $YX_pvalue = $YX_pvalue - 0.001; } $i++; } # Set up the file for output of the coverage values for each # value of d at p = 0.950 print RESULTS (" D 95% XY 95% YX\n"); $i=0; $d=0; for($i..50) { $j=0; printf RESULTS ("%.2f",$d); print RESULTS (" "); $d = $d + 0.01; for($j..999) { @XY[$j] = $XYmatrix[$i][$j]; @YX[$j] = $YXmatrix[$i][$j]; $j++; } @XY_sorted = sort {$a <=> $b} @XY; @YX_sorted = sort {$a <=> $b} @YX; printf RESULTS ("%.10f",$XY_sorted[949]); print RESULTS (" "); printf RESULTS ("%.10f",$YX_sorted[949]); print RESULTS ("\n"); $i++; } # Output the results to the screen and end. print ("\nThe XY comparison had a delta-C of "); printf ("%.3f",$XY_deltaC); print (" for a p-value of "); printf ("%.3f",$XY_pvalue); print(".\n\n"); print ("The YX comparison had a delta-C of "); printf ("%.3f",$YX_deltaC); print (" for a p-value of "); printf ("%.3f",$YX_pvalue); print (".\n\n"); print RESULTS ("\nThe XY comparison had a delta-C of "); printf RESULTS ("%.3f",$XY_deltaC); print RESULTS (" for a p-value of "); printf RESULTS ("%.3f",$XY_pvalue); print RESULTS (".\n\n"); print RESULTS ("The YX comparison had a delta-C of "); printf RESULTS ("%.3f",$YX_deltaC); print RESULTS (" for a p-value of "); printf RESULTS ("%.3f",$YX_pvalue); close CVALUES; close RESULTS; print("Program finished.\n");