#!/bin/csh -f
# This is an example shell script that can be modified to easily run
# a program numerous times with parameter variations.
#
# Place this file in the directory with your data and give it a
# chmod 700 to set executable permission.
#
# For every data set it:
#   1. splits the data file into a random test set and a training set,
#   2. computes a series of training-subsample sizes from 10 up to the
#      full training-set size,
#   3. runs the program three times for each of those sizes.

# size of test set
set testsetsize = 30

# Replace "dataset1 dataset2 dataset3" with the base names of your
# data sets, e.g. "vote", "monks", and your third set.  The variable
# U below will represent a file name.
foreach U (dataset1 dataset2 dataset3)
    set D = $U-train
    set T = $U-test

    if (! -r $U) then
        echo "Error: cannot read data file '${U}'"
        exit 1
    endif

    # Backquotes execute a command in a subshell and return the result.
    # Reading the file through "<" makes wc print only the count; with a
    # file name argument it would print "count name", which is two words.
    set mainsetsize = `wc -l < $U`

    # The selection below needs testsetsize distinct line numbers, so the
    # data file must hold more lines than the test set (this also keeps
    # the training set non-empty).
    if ($testsetsize >= $mainsetsize) then
        echo "Error: test set size $testsetsize is not smaller than the $mainsetsize lines in '${U}'"
        exit 1
    endif

    # Split the data file into training and test sets (D and T).  Writes
    # $testsetsize randomly-selected examples into $T and the rest into $D.
    #
    # Assumes one example per line.
    #
    # Every run draws a new random split, so run this only once if you
    # need results that are consistent across multiple runs.
    #
    # Old output files are removed first so that a re-run replaces the
    # previous split instead of appending to it.
    rm -f $D $T

    # The BEGIN block keeps drawing line numbers in 1..mainsetsize until
    # testsize distinct ones have been picked, so the test set always has
    # exactly the requested size.  Inside csh single quotes each line of
    # the awk program must end in a backslash; csh then passes a real
    # newline through to awk.
    awk -v testsize=$testsetsize -v mainsetsize=$mainsetsize \
        -v dfile=$D -v tfile=$T \
        'BEGIN { srand(); n = 0; \
                 while (n < testsize) { \
                     r = int(rand() * mainsetsize) + 1; \
                     if (!(r in picked)) { picked[r] = 1; n++ } \
                 } \
               } \
         { if (NR in picked) print $0 > tfile; else print $0 > dfile }' $U

    set trainsize = `wc -l < $D`
    # testsize is not used below; it is kept for anyone adapting the script.
    set testsize = `wc -l < $T`

    # Compute the training set sizes: start at 10 and step by a quarter of
    # (trainsize - 10), then finish with trainsize itself.  This normally
    # gives 5 sizes, or 6 when trainsize - 10 is not evenly divisible by 4.
    # The step is clamped to at least 1 so that a training set of 11 to 13
    # examples cannot produce a zero step and loop forever.
    set sizes = `echo 10 $trainsize | awk '{interv=int(($2-$1)/4); if (interv<1) interv=1; for (i=$1;i<$2;i+=interv) print i; print $2}'`

    # run every computed training set size
    set i = 1
    while ($i <= $#sizes)
        # run three iterations
        set j = 1
        while ($j <= 3)
            # Here we assume that your code will subsample the training
            # set for us.  Alternatively one can adapt the random
            # selection code from above.
            ./runmyprogramrightnow -trainset $D -testset $T \
                -sizeofsubsample $sizes[$i] \
                -treeoutputfile treefile.$U.$i.$j -plotoutputfile plotfile.$U.$i.$j
            @ j++
        end
        @ i++
    end
end