| |
| |
| |
| |
|
|
| use warnings; |
| use strict; |
| |
| no warnings 'once'; |
| use utf8; |
|
|
| use Cwd ('abs_path'); |
| use File::Spec::Functions; |
| use File::Basename ('dirname'); |
| use IPC::Run3; |
| use Getopt::Long; |
| use Test::More; |
|
|
# Command-line handling.
#   --results-dir  (required) existing, writable directory that receives one
#                  sub-directory of artefacts per test case.
#   --detokenizer  (optional) path of the detokenizer script under test.
my ($detokenizer, $results_dir);
GetOptions(
    "detokenizer=s" => \$detokenizer,
    "results-dir=s" => \$results_dir,
) or exit 1;

# Without a results directory there is nowhere to put test artefacts.
if (!defined $results_dir) {
    print STDERR "Usage: run-test-detokenizer.t --results-dir <RESULTS-DIRECTORY> [--detokenizer <DETOKENIZER-SCRIPT>]\n";
    exit 1;
}

if (!(-d $results_dir) || !(-w $results_dir)) {
    die "ERROR: Results directory $results_dir doesn't exist or is not a writable directory. Dying";
}

# Default detokenizer: scripts/tokenizer/detokenizer.perl, resolved relative
# to the parent of the directory that contains this test script.
$detokenizer ||= catfile(dirname(dirname(abs_path($0))), "scripts", "tokenizer", "detokenizer.perl");
if (!(-f $detokenizer)) {
    die "ERROR: Detokenizer script $detokenizer does not exist. Dying";
}
|
|
|
|
# Registry of every detokenizer test case, in the order it will be run.
my @testCases = ();

# Each call below registers one case:
#   test name, language code (undef = run the detokenizer without -l),
#   tokenized input, expected detokenized output.
# The two heredocs are stacked on the call line: the first body (TOK) is the
# tokenized input, the second body (EXP) is the expected output.

# Plain English sentences with commas and full stops, on two lines.
addDetokenizerTest("TEST_ENGLISH_EASY", "en", <<'TOK', <<'EXP');
This sentence is really simple , so it should not be hard to detokenize .
This one is no more difficult , but , hey , it is on a new line .
TOK
This sentence is really simple, so it should not be hard to detokenize.
This one is no more difficult, but, hey, it is on a new line.
EXP

# English sentence containing a double-quoted phrase.
addDetokenizerTest("TEST_ENGLISH_DOUBLEQUOTES", "en", <<'TOK', <<'EXP');
This is a somewhat " less simple " test .
TOK
This is a somewhat "less simple" test.
EXP

# Plain French sentence.
addDetokenizerTest("TEST_FRENCH_EASY", "fr", <<'TOK', <<'EXP');
Voici une phrase simple .
TOK
Voici une phrase simple.
EXP

# French elision: the apostrophe must be re-attached to the following word.
addDetokenizerTest("TEST_FRENCH_APOSTROPHE", "fr", <<'TOK', <<'EXP');
Moi , j' ai une apostrophe .
TOK
Moi, j'ai une apostrophe.
EXP

# French elision where the apostrophe token is the second-to-last token.
addDetokenizerTest("TEST_FRENCH_APOSTROPHE_PENULTIMATE", "fr", <<'TOK', <<'EXP');
de musique rap issus de l' immigration
TOK
de musique rap issus de l'immigration
EXP

# German text with non-ASCII characters; no language option is passed.
addDetokenizerTest("TEST_GERMAN_NONASCII", undef, <<'TOK', <<'EXP');
Ich hoffe , daß Sie schöne Ferien hatten .
Frau Präsidentin ! Frau Díez González und ich hatten einige Anfragen
TOK
Ich hoffe, daß Sie schöne Ferien hatten.
Frau Präsidentin! Frau Díez González und ich hatten einige Anfragen
EXP

# Chinese sentence; no language option is passed.
addDetokenizerTest("TEST_CHINESE_EASY", undef, <<'TOK', <<'EXP');
这 是 一个 简单 的的 汉语 句子 。
TOK
这是一个简单的的汉语句子。
EXP

# Japanese sentences; no language option is passed.
addDetokenizerTest("TEST_JAPANESE_EASY", undef, <<'TOK', <<'EXP');
どう しょ う か な 。
どこ で 食べ たい 。
TOK
どうしょうかな。
どこで食べたい。
EXP
|
|
|
|
| |
| |
| |
|
|
# Exactly one Test::More result is produced per registered test case.
plan(tests => scalar @testCases);

# Run the cases in registration order.
for my $registeredCase (@testCases) {
    runDetokenizerTest($registeredCase);
}
|
|
| |
| |
| |
|
|
| |
# Creates a DetokenizerTestCase and appends it to the file-level @testCases
# list so that it is counted in the plan and executed by the run loop.
#
# Parameters:
#   $testName      - name of the test; also used as the name of its output
#                    sub-directory by runDetokenizerTest
#   $language      - language code handed to the detokenizer with -l, or
#                    undef to run the detokenizer without -l
#   $tokenizedText - tokenized text fed to the detokenizer on standard input
#   $rightAnswer   - detokenized text the detokenizer is expected to print
#
# Returns the new DetokenizerTestCase object.
sub addDetokenizerTest {
    my ($testName, $language, $tokenizedText, $rightAnswer) = @_;

    # Direct method call rather than indirect object syntax ("new Class(...)"),
    # which Perl can misparse when a sub named "new" is visible in this package.
    my $testCase = DetokenizerTestCase->new($testName, $language, $tokenizedText, $rightAnswer);
    push(@testCases, $testCase);
    return $testCase;
}
|
|
# Runs a single detokenizer test case end to end.
#
# A directory named after the test is created under $results_dir, the
# tokenized input and the expected output are written there as UTF-8 files
# ("input.txt" and "expected.txt"), the detokenizer is run with the input on
# standard input, and its standard output ("stdout.txt") is compared with the
# expected file.
#
# Parameters:
#   $testCase - DetokenizerTestCase object describing the test
#
# Returns the result of the Test::More call that reported the outcome; a
# failure to create the directory or to write a fixture file is reported with
# fail() and ends the test early.
sub runDetokenizerTest {
    my ($testCase) = @_;

    my $testName      = $testCase->getName();
    my $testOutputDir = catfile($results_dir, $testName);
    my $tokenizedFile = catfile($testOutputDir, "input.txt");
    my $expectedFile  = catfile($testOutputDir, "expected.txt");

    # mkdir also fails when the directory already exists, e.g. when the
    # results directory still holds output from an earlier run.
    unless (mkdir($testOutputDir)) {
        return fail($testName.": Failed to create output directory ".$testOutputDir." [".$!."]");
    }

    # Write the detokenizer input and the reference answer to disk.
    my @fixtures = (
        [$tokenizedFile, $testCase->getTokenizedText()],
        [$expectedFile,  $testCase->getRightAnswer()],
    );
    foreach my $fixture (@fixtures) {
        my ($path, $text) = @{$fixture};
        my $error = writeTestFile($path, $text);
        if (defined $error) {
            return fail($testName.": Failed to write ".$path." [".$error."]");
        }
    }

    return runTest($testName, $testOutputDir, $tokenizedFile, sub {
        # Only pass -l when the test case names a language.
        return defined($testCase->getLanguage()) ? [$detokenizer, "-l", $testCase->getLanguage()] : [$detokenizer];
    }, sub {
        verifyIdentical($testName, $expectedFile, catfile($testOutputDir, "stdout.txt"))
    }, 1, $testCase->getFailureExplanation());
}

# Writes $text to $path as UTF-8, replacing any existing file.
# Returns undef on success, or the system error message when opening,
# printing to, or closing the file fails.
sub writeTestFile {
    my ($path, $text) = @_;

    open(my $handle, '>:encoding(UTF-8)', $path) or return "$!";
    unless (print {$handle} $text) {
        my $error = "$!";
        close($handle);
        return $error;
    }
    # Buffered write errors only surface when the handle is closed.
    close($handle) or return "$!";
    return;
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
# Runs one command and validates its output.
#
# Parameters:
#   $testName                     - name used in failure messages
#   $outputDir                    - directory that receives the captured output
#   $stdinFile                    - file fed to the command on standard input
#   $buildCommandRoutineReference - code ref returning the command as an array
#                                   reference
#   $validationRoutineReference   - code ref that checks the output and reports
#                                   the test result
#   $separateStdoutFromStderr     - true: capture into stdout.txt and stderr.txt;
#                                   false: capture both into stdout-and-stderr.txt
#   $failureExplanation           - when defined, the validation runs inside a
#                                   TODO block with this text as the reason
#
# A non-zero exit status is reported with fail() and skips the validation.
sub runTest {
    my ($testName, $outputDir, $stdinFile, $buildCommandRoutineReference, $validationRoutineReference, $separateStdoutFromStderr, $failureExplanation) = @_;

    # Decide where the two output streams are captured.
    my $stdoutFile = catfile($outputDir, $separateStdoutFromStderr ? "stdout.txt" : "stdout-and-stderr.txt");
    my $stderrFile = $separateStdoutFromStderr ? catfile($outputDir, "stderr.txt") : $stdoutFile;

    my $commandRef = $buildCommandRoutineReference->();
    my $exitStatus = runVerbosely($commandRef, $stdinFile, $stdoutFile, $stderrFile);
    if ($exitStatus != 0) {
        return fail($testName.": command exited with status ".$exitStatus);
    }

    # Ordinary test: validate directly.
    return $validationRoutineReference->() unless defined $failureExplanation;

    # Expected failure: let Test::More treat the validation result as TODO.
    TODO: {
        local $TODO = $failureExplanation;
        return $validationRoutineReference->();
    }
}
|
|
| |
| |
| |
| |
| |
# Logs a command and its stream redirections with note(), then runs it
# through run3.
#
# Parameters:
#   $commandRef  - array reference holding the command and its arguments
#   $stdinFile   - standard input source handed to run3 (may be undef)
#   $stdoutFile  - standard output destination handed to run3 (may be undef)
#   $stderrFile  - standard error destination handed to run3 (may be undef)
#
# Returns the value of $? after run3 has finished.
sub runVerbosely {
    my ($commandRef, $stdinFile, $stdoutFile, $stderrFile) = @_;

    my @argv = @{$commandRef};
    note("Executing command:\n @argv\n");

    # Mention only the redirections that were actually supplied.
    my @redirections = (
        ["standard input coming from: ", $stdinFile],
        ["standard output going to: ",   $stdoutFile],
        ["standard error going to: ",    $stderrFile],
    );
    foreach my $redirection (@redirections) {
        my ($label, $file) = @{$redirection};
        note($label.$file) if defined $file;
    }

    run3($commandRef, $stdinFile, $stdoutFile, $stderrFile);
    return $?;
}
|
|
| |
# Reports, through Test::More, whether two files contain the same lines.
#
# Parameters:
#   $testName      - name used as the prefix of every reported message
#   $referenceFile - path of the file holding the expected content
#   $outputFile    - path of the file holding the actual content
#
# Returns the result of is_deeply() on the two line lists, or the result of
# fail() when either file cannot be opened.
sub verifyIdentical {
    my ($testName, $referenceFile, $outputFile) = @_;

    # Three-argument open with lexical handles: the file name can never be
    # read as an open mode, and a handle that is already open is closed
    # automatically if we return early because the second open failed.
    open(my $referenceHandle, '<', $referenceFile)
        or return fail($testName.": Can't open reference file ".$referenceFile." [".$!."].");
    open(my $outputHandle, '<', $outputFile)
        or return fail($testName.": Can't open output file ".$outputFile." [".$!."].");

    my @referenceFileAsArray = <$referenceHandle>;
    my @outputFileAsArray    = <$outputHandle>;
    close($referenceHandle);
    close($outputHandle);

    return is_deeply(\@outputFileAsArray, \@referenceFileAsArray, $testName.": Output file ".$outputFile." matches reference file ".$referenceFile.".");
}
|
|
|
|
| |
| |
|
|
# Value object describing one detokenizer test: its name, the language code,
# the tokenized input, the expected output, and an optional explanation
# marking the test as expected to fail.
package DetokenizerTestCase;

# Constructor.
# Arguments, in order: test name, language code (may be undef), tokenized
# text, expected detokenized text.
# A new test case starts with no failure explanation.
sub new {
    my ($class, $name, $language, $tokenizedText, $rightAnswer) = @_;

    my %fields = (
        _name               => $name,
        _language           => $language,
        _tokenizedText      => $tokenizedText,
        _rightAnswer        => $rightAnswer,
        _failureExplanation => undef,
    );
    return bless \%fields, $class;
}

# Returns the test name.
sub getName {
    my $self = shift;
    return $self->{_name};
}

# Returns the language code, or undef when none was given.
sub getLanguage {
    my $self = shift;
    return $self->{_language};
}

# Returns the tokenized input text.
sub getTokenizedText {
    my $self = shift;
    return $self->{_tokenizedText};
}

# Returns the expected detokenized text.
sub getRightAnswer {
    my $self = shift;
    return $self->{_rightAnswer};
}

# Marks the test as expected to fail.
# A false or missing explanation is replaced by a generic default message.
# Returns the explanation that was stored.
sub setExpectedToFail {
    my ($self, $failureExplanation) = @_;

    $failureExplanation = "This test is expected to fail." unless $failureExplanation;
    return $self->{_failureExplanation} = $failureExplanation;
}

# Returns the failure explanation, or undef when the test is expected to pass.
sub getFailureExplanation {
    my $self = shift;
    return $self->{_failureExplanation};
}
|
|