#!/usr/bin/env perl ############################################################################### # # deftest.pl Define SpeechDat FDB test set by a controlled # random selection, in which every accent region and # gender is treated separately. # # Input file: session.tbl # # Output files:
trn.ses,
tst.ses # # Version 1.1 Finn Tore Johansen, Telenor R&D, 09.12.97 # 1.2 Christoph Draxler, University of Munich, 12.04.2000 # * script modified to handle SpeechDat-Car session and # recording condition tables # * selection criteria now are SEX and ENVClass, i.e. speaker # gender and the seven environment classes that are computed # from the street conditions and the car equipment used # 1.3 Jan Vuylsteke, L&H Ieper, 04.08.2000 # * Sessions with the same speaker are now in one set # (not in both) # ############################################################################### # --- Configuration variables ------------------------------------------------- # desired testset size, this is a function of the database size $N = 200; # For 1000- databases # $N = 500; # For 2000+ databases # --- database and language code --------------------------------------------- # modify the language code for your language! $DD = "V1"; $LL = "NV"; # input file, you may need to edit the field description line below # if more than the mandatory fields are present, or if any of # the SEX or ACC fields are empty $rccfile = "REC_COND.TBL"; $spkfile = "SPEAKER.TBL"; $sesfile = "SESSION.TBL"; # --- output files ----------------------------------------------------------- $trnsetfile = "${DD}TRN$LL.SES"; $tstsetfile = "${DD}TST$LL.SES"; # --- load recording conditions file ----------------------------------------- open(FILE,$rccfile) || die "Could not open recording conditions file $rccfile"; while (){ next if /^SES/; # Ignore header s/[\r\n]*//g; ($SES,$REG,$NET,$PHM,$CAR,$CEQ,$SPP,$MIP,$MIT,$SCC,$WTC) = split(/\t/,$_); # --- from SCC and CEQ compute one of the seven environment classes ------- $_ = $CEQ; if($SCC eq "HIGH_SPEED_GOOD_ROAD") { if (/AUDIO=ON/) { $ENVClass=7; } else { $ENVClass=6; } } elsif ($SCC eq "LOW_SPEED_ROUGH_ROAD") { if ((/=ON/) || (/=OPEN/)) { $ENVClass=5; } else { $ENVClass=4; } } elsif ($SCC eq "TOWN_TRAFFIC") { if ((/=ON/) || (/=OPEN/)) { $ENVClass=3; } else { $ENVClass=2; } } else { $ENVClass=1; } $recconds{$SES} = $ENVClass; } close(FILE); # --- load speaker file, compute the total number of speakers in each class -- open(FILE,$spkfile) || die "Could not open speaker file $spkfile"; while (){ next if /^SCD/; # Ignore header s/[\r\n]*//g; ($SCD,$SEX,$AGE,$ACC) = split(/\t/,$_); $spkinfo{$SCD} = "$SEX"; } close(FILE); # --- load session file, compute the total number of speakers in each class -- open(FILE,$sesfile) || die "Could not open session file $sesfile"; while (){ next if /^SES/; # Ignore header s/[\r\n]*//g; ($SES,$SCD,$REP,$RED,$RET,$EXN,$TXF,$HLT,$STR,$TRD,$SHT) = split(/\t/,$_); $class{$SES} = "$spkinfo{$SCD}$recconds{$SES}"; $count{$class{$SES}}++; push(@{ $spksessions{$SCD} },$SES); $speaker{$SES} = $SCD; } close(FILE); # --- start generation of sets ----------------------------------------------- @sessions = sort numerically keys(%class); $F = $#sessions + 1; @classes = keys(%count); $K = $#classes + 1; printf("Desired testset size N=%d\n",$N); printf("Total database size F=%d\n",$F); printf("Number of classes in selection K=%d\n\n",$K); # --- generate optimum (non integer) speaker distribution -------------------- foreach $bin (@classes){ $opt{$bin} = $count{$bin}*$N/$F; $diff{$bin} = $count{$bin}-$opt{$bin}; } printf("Initial speaker distribution:\n"); &printtab; # --- find testset with integer number of speakers --------------------------- srand(0); # Use same seed to get same results every time printf("Generating testset distribution...\n"); while (&sumcount > $N){ # Select the class with largest difference between #speakers and optimum @sorted = sort bydiff (@classes); $top = $sorted[0]; # Remove a random speaker from the selected class @tmp = (); foreach $ses (@sessions){ push(@tmp,$ses) if ( !($trainset{$ses}) && ($class{$ses} eq $top)); } $ses = $tmp[rand(@tmp)]; $spk = $speaker{$ses}; #Select all sessions for the current speaker @sessions_same_speaker = @{ $spksessions{$spk} }; foreach $session (@sessions_same_speaker) { $trainset{$session} = 1; $count{$top}--; $diff{$top} = $count{$top} - $opt{$top}; } } &printtab; # --- save train and testset to list files ---------------------------------- open(TRNFILE,">$trnsetfile") || die "Can't open $trnsetfile"; open(TSTFILE,">$tstsetfile") || die "Can't open $tstsetfile"; foreach $ses (@sessions){ if ($trainset{$ses}) { printf(TRNFILE "$ses\r\n"); } else { printf(TSTFILE "$ses\r\n"); } } close(TRNFILE); close(TSTFILE); printf("Trainset saved to %s\n",$trnsetfile); printf("Testset saved to %s\n",$tstsetfile); # --- subroutines ----------------------------------------------------------- sub bydiff { $diff{$b} <=> $diff{$a} } sub numerically { $a <=> $b } sub sumcount { $acc = 0; foreach $bin (@classes){ $acc += $count{$bin}; } return $acc; } sub printtab { printf("\n%s\t%s\t%s\t\t%s\n","CLASS","#SPKS","OPT","DIFF"); foreach $bin (sort (@classes)){ printf("%s\t%d\t%f\t%f\n",$bin,$count{$bin},$opt{$bin},$diff{$bin}); } printf("%s\t%d\n\n","SUM",&sumcount); }