#!/usr/bin/env perl
###############################################################################
#
# deftest.pl Define SpeechDat FDB test set by a controlled
# random selection, in which every accent region and
# gender is treated separately.
#
# Input file: session.tbl
#
# Output files:
trn.ses, tst.ses
#
# Version 1.1 Finn Tore Johansen, Telenor R&D, 09.12.97
# 1.2 Christoph Draxler, University of Munich, 12.04.2000
# * script modified to handle SpeechDat-Car session and
# recording condition tables
# * selection criteria now are SEX and ENVClass, i.e. speaker
# gender and the seven environment classes that are computed
# from the street conditions and the car equipment used
# 1.3 Jan Vuylsteke, L&H Ieper, 04.08.2000
# * Sessions with the same speaker are now in one set
# (not in both)
#
###############################################################################
# --- Configuration variables -------------------------------------------------
# desired testset size, this is a function of the database size
$N = 200; # For 1000- databases
# $N = 500; # For 2000+ databases
# --- database and language code ---------------------------------------------
# modify the language code for your language!
$DD = "V1";
$LL = "NV";
# input file, you may need to edit the field description line below
# if more than the mandatory fields are present, or if any of
# the SEX or ACC fields are empty
$rccfile = "REC_COND.TBL";
$spkfile = "SPEAKER.TBL";
$sesfile = "SESSION.TBL";
# --- output files -----------------------------------------------------------
$trnsetfile = "${DD}TRN$LL.SES";
$tstsetfile = "${DD}TST$LL.SES";
# --- load recording conditions file -----------------------------------------
open(FILE,$rccfile) || die "Could not open recording conditions file $rccfile";
while (){
next if /^SES/; # Ignore header
s/[\r\n]*//g;
($SES,$REG,$NET,$PHM,$CAR,$CEQ,$SPP,$MIP,$MIT,$SCC,$WTC) = split(/\t/,$_);
# --- from SCC and CEQ compute one of the seven environment classes -------
$_ = $CEQ;
if($SCC eq "HIGH_SPEED_GOOD_ROAD") {
if (/AUDIO=ON/) {
$ENVClass=7;
} else {
$ENVClass=6;
}
} elsif ($SCC eq "LOW_SPEED_ROUGH_ROAD") {
if ((/=ON/) || (/=OPEN/)) {
$ENVClass=5;
} else {
$ENVClass=4;
}
} elsif ($SCC eq "TOWN_TRAFFIC") {
if ((/=ON/) || (/=OPEN/)) {
$ENVClass=3;
} else {
$ENVClass=2;
}
} else {
$ENVClass=1;
}
$recconds{$SES} = $ENVClass;
}
close(FILE);
# --- load speaker file, compute the total number of speakers in each class --
open(FILE,$spkfile) || die "Could not open speaker file $spkfile";
while (){
next if /^SCD/; # Ignore header
s/[\r\n]*//g;
($SCD,$SEX,$AGE,$ACC) = split(/\t/,$_);
$spkinfo{$SCD} = "$SEX";
}
close(FILE);
# --- load session file, compute the total number of speakers in each class --
open(FILE,$sesfile) || die "Could not open session file $sesfile";
while (){
next if /^SES/; # Ignore header
s/[\r\n]*//g;
($SES,$SCD,$REP,$RED,$RET,$EXN,$TXF,$HLT,$STR,$TRD,$SHT) = split(/\t/,$_);
$class{$SES} = "$spkinfo{$SCD}$recconds{$SES}";
$count{$class{$SES}}++;
push(@{ $spksessions{$SCD} },$SES);
$speaker{$SES} = $SCD;
}
close(FILE);
# --- start generation of sets -----------------------------------------------
@sessions = sort numerically keys(%class);
$F = $#sessions + 1;
@classes = keys(%count);
$K = $#classes + 1;
printf("Desired testset size N=%d\n",$N);
printf("Total database size F=%d\n",$F);
printf("Number of classes in selection K=%d\n\n",$K);
# --- generate optimum (non integer) speaker distribution --------------------
foreach $bin (@classes){
$opt{$bin} = $count{$bin}*$N/$F;
$diff{$bin} = $count{$bin}-$opt{$bin};
}
printf("Initial speaker distribution:\n");
&printtab;
# --- find testset with integer number of speakers ---------------------------
srand(0); # Use same seed to get same results every time
printf("Generating testset distribution...\n");
while (&sumcount > $N){
# Select the class with largest difference between #speakers and optimum
@sorted = sort bydiff (@classes);
$top = $sorted[0];
# Remove a random speaker from the selected class
@tmp = ();
foreach $ses (@sessions){
push(@tmp,$ses) if ( !($trainset{$ses}) && ($class{$ses} eq $top));
}
$ses = $tmp[rand(@tmp)];
$spk = $speaker{$ses};
#Select all sessions for the current speaker
@sessions_same_speaker = @{ $spksessions{$spk} };
foreach $session (@sessions_same_speaker) {
$trainset{$session} = 1;
$count{$top}--;
$diff{$top} = $count{$top} - $opt{$top};
}
}
&printtab;
# --- save train and testset to list files ----------------------------------
open(TRNFILE,">$trnsetfile") || die "Can't open $trnsetfile";
open(TSTFILE,">$tstsetfile") || die "Can't open $tstsetfile";
foreach $ses (@sessions){
if ($trainset{$ses}) {
printf(TRNFILE "$ses\r\n");
}
else {
printf(TSTFILE "$ses\r\n");
}
}
close(TRNFILE);
close(TSTFILE);
printf("Trainset saved to %s\n",$trnsetfile);
printf("Testset saved to %s\n",$tstsetfile);
# --- subroutines -----------------------------------------------------------
sub bydiff { $diff{$b} <=> $diff{$a} }
sub numerically { $a <=> $b }
sub sumcount {
$acc = 0;
foreach $bin (@classes){
$acc += $count{$bin};
}
return $acc;
}
sub printtab {
printf("\n%s\t%s\t%s\t\t%s\n","CLASS","#SPKS","OPT","DIFF");
foreach $bin (sort (@classes)){
printf("%s\t%d\t%f\t%f\n",$bin,$count{$bin},$opt{$bin},$diff{$bin});
}
printf("%s\t%d\n\n","SUM",&sumcount);
}