File size: 1,743 Bytes
fd49381 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
# Command line: exactly three arguments are required.
#   ref - reference SGM file to read segments from
#   src - file scanned for docid="..." attributes (fixes the document order)
#   out - output file name (extended with an index when several systems
#         are found in the reference file)
if (scalar(@ARGV) != 3) {
    die("ERROR syntax: reference-from-sgm.perl ref src out");
}
my ($ref, $src, $txt) = @ARGV;
# get order of the documents
# Every docid="..." attribute found in the source file is recorded in order of
# appearance (at most one per line). Output is later written in this order.
my @ORDER;
# Three-argument open with a lexical handle: the file name is always treated
# as a plain path to read, never as a mode prefix or a pipe command.
open(my $order_fh, '<', $src) or die("ERROR not found: $src ($!)");
while (my $src_line = <$order_fh>) {
    # The attribute name is matched case-insensitively, consistent with the
    # docid lookup applied to the reference file.
    next unless $src_line =~ /docid="([^\"]+)"/i;
    push @ORDER, $1;
}
close($order_fh);
# get from sgm file which lines belong to which system
# %DOC maps system id -> document id -> list of segment texts in file order.
# The system id is the refid attribute of a <refset ...> tag once one has been
# seen; otherwise it is the sysid attribute of each <doc ...> tag.
my %DOC;
my $system_from_refset = 0;
my ($doc,$system);
open(my $ref_fh, '<', $ref) or die "Cannot open $ref: $!";
while (my $line = <$ref_fh>) {
    if ($line =~ /<refset/ && $line =~ /refid="([^\"]+)"/i) {
        $system = $1;
        $system_from_refset = 1;
    }
    if ($line =~ /<doc/i) {
        # A refset-level refid takes precedence over any per-document sysid.
        unless ($system_from_refset) {
            $line =~ /sysid="([^\"]+)"/i
                or die("ERROR: <doc> tag has no sysid and no <refset> refid was seen in $ref");
            $system = $1;
        }
        $line =~ /docid="([^\"]+)"/i
            or die("ERROR: <doc> tag has no docid in $ref");
        $doc = $1;
    }
    # A segment whose closing tag is not on the same line continues on the
    # following lines: pull them in until an opening tag followed by </seg>
    # is present. Each line break is replaced by a single space so that words
    # on adjacent lines are neither glued together nor separated by a newline
    # that '.' could not match.
    while ($line =~ /<seg[^>]+>/i && $line !~ /<seg[^>]+>.*<\/seg>/is) {
        my $next_line = <$ref_fh>;
        # Stop cleanly on a truncated file instead of appending undef.
        die("ERROR: unterminated <seg> at end of $ref") unless defined $next_line;
        $line =~ s/\s*\z/ /;
        $line .= $next_line;
    }
    # Non-greedy capture so the trailing \s* really strips whitespace before
    # </seg>; /g so that several complete segments on one line are all kept.
    while ($line =~ /<seg[^>]+>\s*(.+?)\s*<\/seg>/gi) {
        push @{$DOC{$system}{$doc}}, $1;
    }
}
close($ref_fh);
# Write one plain-text file per system found in the reference file, one
# segment per line, with the documents in the order recorded in @ORDER.
# Systems are visited in sorted order so that the index appended to each
# output name is the same on every run (plain hash key order is not stable).
my $i=0;
foreach my $system (sort keys %DOC) {
    my $outfile = $txt;
    # With a single system the output name is used as is. With several, the
    # running index is appended: ".ref<i>" when the name already ends in
    # ".<digits>", otherwise just "<i>".
    if (scalar(keys %DOC) > 1) {
        $outfile .= ($outfile =~ /\.\d+$/) ? ".ref$i" : $i;
    }
    # Three-argument open: the name is never parsed for a mode or pipe.
    open(my $txt_fh, '>', $outfile) or die("ERROR cannot write $outfile: $!");
    foreach my $doc (@ORDER) {
        # Every document listed in the source must exist for every system.
        die("can't find '$doc' for ref '$system'") unless defined $DOC{$system}{$doc};
        foreach my $segment (@{$DOC{$system}{$doc}}) {
            print {$txt_fh} $segment."\n";
        }
    }
    # Buffered write errors (e.g. disk full) only surface at close.
    close($txt_fh) or die("ERROR cannot close $outfile: $!");
    $i++;
}
|