#!/usr/bin/perl # --- (c) Molinspiration Cheminformatics 2007 # --- simulated screening experiment # --- data set is divided randomly in training and test set # --- method is trained by using the training set # --- and validated by "predicting" activities for the test set # --- the procedure may be repeated several times # # --- this script is provided without any warranty # --- feel free to modify it according to your requirements ! # --- edit the following parameters $fraction = 50.; # --- fraction of the data used for training (usually 50 %) $nruns = 4; # --- number of runs (use 1 - 10) $actdata = "act.smi"; # --- file with active molecules $inadata = "ina.smi"; # --- file with inactive molecules $job = "tt"; # --- name of the validation job (used in output files) $htmlgraph = 1; # do simple html graph $detail10 = 1; # provides detailed output for first 10 % of the dataset # --- location of miscreen $miscreen = "java -jar miscreen.jar"; # --- start of the run open (ACT,"$actdata") || die "Cannot open file $actdata\n"; open (INA,"$inadata") || die "Cannot open file $inadata\n"; ($d,$totact) = split(/\s+/,`wc $actdata`); ($d,$totina) = split(/\s+/,`wc $inadata`); $tot = $totact + $totina; $ninstep = int($tot * $fraction / 100.); $log = $job.".log"; open (LOG,">$log"); $date = `date`; print LOG "$date\n"; print LOG "active $actdata\n"; print LOG "inactive $inadata\n"; $p = sprintf("%.2f",$totact * 100 / ($totact + $totina)); print LOG "nact $totact, nina $totina, $p % actives in data\n"; print LOG "nruns $nruns\n"; print LOG "fraction $fraction %\n"; print LOG "$ninstep molecules used for training\n"; print LOG "averaging $nruns\n"; close (LOG); # --- generation of pool $pool = $job.".pool"; open POOL,">$pool"; while () { ($smi) = split(/\s+/,$_); print POOL "$smi\t1\n"; } close (ACT); while () { ($smi) = split(/\s+/,$_); print POOL "$smi\t0\n"; } close (INA); close (POOL); # --- $nruns used for averaging for ($run=1;$run<=$nruns;$run++) { print "= = = = = = = = run # $run\n"; # --- filenames (updated after each step to enable debugging $act = $job.".act".$run; $ina = $job.".ina".$run; $actfragments = $job.".af".$run; $inafragments = $job.".if".$run; $score = $job.".score".$run; $tmppred = $job.".tmppred".$run; $pred = $job.".pred".$run; $test = $job.".test".$run; open (LOG,">>$log"); print LOG "run $run = = = = = = = =\n"; close (LOG); # --- generation of trainig set &randomSelection(); # --- generation of fragments `$miscreen -fragment $act > $actfragments`; `$miscreen -fragment $ina > $inafragments`; # --- calculation of scores; ($d,$nact) = split(/\s+/,`wc $act`); ($d,$nina) = split(/\s+/,`wc $ina`); `$miscreen -createmodel -af $actfragments -if $inafragments > $score`; `$miscreen -screen $test -model $score > $tmppred`; # --- sorting based on the calculated activity # --- this requires UNIX / cygwin sort ! `sort +2 -3 -rn $tmppred > $pred`; # --- evaluation of screening performance &eval($pred); # --- log open (LOG,">>$log"); $pact = sprintf("%.2f",$nact * 100 / $totact); $pprocessed = sprintf("%.2f",($nact+$nina) * 100 / $tot); print LOG "step $step - $nact $pact $pprocessed\n"; for ($i=0;$i<=100;$i++) { $value = sprintf("%.2f",$eval[$i]); print LOG "$i $value\n"; } print LOG $out[101]; @x = split(/\s+/,$out[$i]); $sumclas += $x[2]; $sumclasscore += $x[5]; close (LOG); $found[$step] += $nact; close (ACT); close (INA); close (POOL); close (FILE); } # --- end of run loop # --- average summary open (LOG,">>$log"); print LOG "Average from $nruns runs * * * *\n"; for ($i=0;$i<=100;$i++) { $seval[$i] /= $nruns; $value = sprintf("%.2f",$seval[$i]); print LOG "$i $value\n"; } close (LOG); # --- log for first 10 % if ($detail10) { $detaillog = $job."10.log"; open (DETAIL,">$detaillog"); for ($i=0;$i<=100;$i++) { $seval10[$i] /= $nruns; $p = sprintf("%.1f",$i/10.); $value = sprintf("%.2f",$seval10[$i]); print DETAIL "$p $value\n"; } close (DETAIL); } print "End of validation.\n"; print "Summary of results available at file $log\n"; if ($detail10) { print "Summary of results for first 10 % of data available at file $detaillog\n"; } # --- simple HTML graph if ($htmlgraph) { $f = $job.".html"; open (HTML,">$f"); print HTML "
"; print HTML "

Molinspiration fragment based selection

"; print HTML ""; print HTML ""; print HTML ""; print HTML "
%
 
a
c
t
 
f
o
u
n
d
"; print HTML ""; print HTML "
"; print HTML ""; for ($y=100;$y>=1;$y--) { print HTML ""; for ($x=1;$x<=100;$x++) { $color = 'white'; if ($x == $y) {$color='blue'}; if ($y <= int($seval[$x]+.5) && $y > $x) {$color='red';} print HTML ""; } print HTML "
"; } print HTML "
"; print HTML "
"; print HTML "
 % of database screened
"; print HTML "

"; print HTML "Random selection      "; print HTML "Fragment-based selection enrichment"; print HTML "

"; close (HTML); print "Graph showing enrichment at file $f\n"; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - sub randomSelection { # select $ninstep molecules from the $pool # fills $act $ina, updates $pool # repeats random selection, untill at least one active molecule is found ($d,$poolsize) = split(/\s+/,`wc $pool`); $na = 0; open (POOL,"$pool"); open (TEST,">$test"); open (ACT,">$act"); open (INA,">$ina"); $n = 0; pool: while () { ($smiles,$expba) = split(/\s+/,$_); if (rand() < $fraction / 100.) { if ($expba == 1) {print ACT "$smiles\n"; $na++; next pool} elsif ($expba == 0) {print INA "$smiles\n"; next pool} } else { print TEST $_; } } close (ACT); close (INA); close (POOL); close (TEST); if ($na > 0) {return 1;} else {return 0;} } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - sub eval { # --- evaluation of the screening performance # --- comparison of predicted and real activity class # --- molecules must be sorted according to calculated score $tota = 0; $toti = 0; open (FILE,"<$_[0]") || die "Cannot opne $_[0]\n"; while () { ($smi,$ba,$score) = split(/\s+/,$_); if ($ba==1) {$tota++;} else {$toti++;} } $tot = $tota + $toti; close(FILE); for ($i=0;$i<=100;$i++) {$eval[$i] = 0;} open (FILE,$_[0]); $n = 0; $na = 0; $ni = 0; $max_p = 0; $last_percent = 0; $last_percent10 = 0; while () { $n++; ($smi,$ba,$score) = split(/\s+/,$_); if ($ba==1) {$na++;} else {$ni++;} $a = $na / $tota; $a = sprintf("%.4f",$a); $i = ($toti - $ni) / $toti; $i = sprintf("%.4f",$i); $p = ($a + $i) *.5; $p = sprintf("%.4f",$p); $percent = 100 * $n / $tot; if ($p > $max_p) { $max_p = $p; $opt_percent = sprintf("%.2f",$percent); $opt_score = $score; $opt_n = $n; } $ip = int($percent); if ($ip > $last_percent) { #print "$pp $p $a $i"; print "$ip % processed"; $ap = $a * 100; print " $ap % actives found\n"; $eval[$ip] = $ap; # --- eval needed in averaging more runs # --- when number of mols smaller than 100 $last_percent = $ip; } $ip = int($percent*10); if ($detail10 && $ip <= 100) { # detailed output for first 10 % if ($ip > $last_percent10) { $ip = int($percent * 10); $ap = $a * 100; $eval10[$ip] = $ap; # --- eval10 needed in averaging more runs $last_percent10 = $ip; } } } # --- summing eval data for more runs for ($i=1;$i<=100;$i++) { if ($eval[$i] < $eval[$i-1]) {$eval[$i] = $eval[$i-1];} # less than 100 mols $seval[$i] += $eval[$i]; $seval10[$i] += $eval10[$i]; } } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -