| |
| |
| |
| |
|
|
| use warnings; |
| use strict; |
| use Getopt::Long "GetOptions"; |
| use FindBin qw($RealBin); |
|
|
# Separator between the factors of one token (e.g. "house|NN|house").
my $___FACTOR_DELIMITER = "|";

# Decompression commands used by open_or_zcat() for .gz and .bz2 input.
my $ZCAT = "gzip -cd";
my $BZCAT = "bzcat";

# Command-line options:
#   --corpus FILE           factored input corpus
#   --reduced-corpus FILE   output corpus to create
#   --factor LIST           comma-separated factor indices to keep
#   --xml                   strip XML-style tags from each line before reducing
my ($CORPUS,$REDUCED,$FACTOR,$_XML);
die("ERROR: wrong syntax when invoking reduce-factors")
    unless GetOptions('corpus=s' => \$CORPUS,
                      'reduced-corpus=s' => \$REDUCED,
                      'factor=s' => \$FACTOR,
                      'xml' => \$_XML);

# Fail early with a clear message instead of letting an undefined value
# surface later as "uninitialized" warnings and a confusing open() failure.
foreach my $required (['corpus', $CORPUS],
                      ['reduced-corpus', $REDUCED],
                      ['factor', $FACTOR]) {
    my ($option, $value) = @$required;
    die("ERROR: missing required option --$option when invoking reduce-factors")
        unless defined $value && length $value;
}

reduce_factors($CORPUS,$REDUCED,$FACTOR);
|
|
| |
# Write a copy of the corpus $full that keeps only the requested factors.
#
# Parameters:
#   $full    - path of the factored input corpus (opened via open_or_zcat()).
#   $reduced - path of the output corpus to create.
#   $factors - comma-separated list of factor indices to keep, e.g. "0,2";
#              the indices are emitted in ascending numeric order.
#
# Behaviour:
#   * Waits (polling every 10 seconds) while "$reduced.lock" exists, then
#     returns without doing anything if $reduced or "$reduced.gz" exists.
#   * Unless --xml was given: if the requested factors are exactly all the
#     factors of the first token of the corpus, a symlink to the input is
#     created instead of copying it.
#   * Otherwise every token of every line is rewritten with only the
#     requested factors, joined by the factor delimiter.
#
# Dies if the corpus is empty, if a token lacks a requested factor, or if a
# file cannot be opened/written. On failure the lock file and the partially
# written output are removed so that other processes neither wait forever
# nor reuse a truncated file.
sub reduce_factors {
    my ($full, $reduced, $factors) = @_;

    my @INCLUDE = sort { $a <=> $b } split(/,/, $factors);

    print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`;

    my $lock = $reduced.".lock";
    while (-e $lock) {
        sleep(10);
    }
    if (-e $reduced) {
        print STDERR " $reduced in place, reusing\n";
        return;
    }
    if (-e $reduced.".gz") {
        print STDERR " $reduced.gz in place, reusing\n";
        return;
    }

    unless ($_XML) {
        # Peek at the first token to find out how many factors the corpus has.
        my $inh = open_or_zcat($full);
        my $firstline = <$inh>;
        die "Corpus file $full is empty" unless defined $firstline;
        close $inh;

        # A blank first line yields no token; the shortcut is then skipped.
        my ($first_word) = $firstline =~ /(\S+)/;
        if (defined $first_word) {
            # The delimiter must be quoted: an unquoted "|" is an empty regex
            # alternation that would split the word into single characters.
            my @FACTOR = split(/\Q$___FACTOR_DELIMITER\E/, $first_word);
            my $maxfactorindex = scalar(@FACTOR) - 1;
            if (join(",", @INCLUDE) eq join(",", 0 .. $maxfactorindex)) {
                # All factors are requested: link to the input instead of copying.
                my $realfull = $full;
                if (!-e $realfull && -e $realfull.".gz") {
                    $realfull .= ".gz";
                    $reduced =~ s/(\.gz)?$/.gz/;
                }
                # List form avoids the shell, so paths with quotes or spaces work.
                safesystem("ln", "-s", $realfull, $reduced)
                    or die "Failed to create symlink $realfull -> $reduced";
                return;
            }
        }
    }

    # Mark the output as "in progress" for concurrently running processes.
    open(my $lockh, '>>', $lock)
        or die "ERROR: Can't create lock file $lock: $!";
    close($lockh);

    # Run the conversion inside eval so the lock is always released.
    my $ok = eval {
        my $inh = open_or_zcat($full);
        open(my $outh, '>', $reduced) or die "ERROR: Can't write $reduced";
        my $nr = 0;
        while (my $line = <$inh>) {
            $nr++;
            print STDERR "." if $nr % 10000 == 0;
            print STDERR "($nr)" if $nr % 100000 == 0;
            $line =~ s/<\S[^>]*>/ /g if $_XML;
            chomp $line;

            my @reduced_tokens;
            # split ' ' skips leading whitespace and collapses runs of it.
            foreach my $token (split ' ', $line) {
                my @FACTOR = split(/\Q$___FACTOR_DELIMITER\E/, $token);
                my @kept;
                foreach my $outfactor (@INCLUDE) {
                    my $out = $FACTOR[$outfactor];
                    die "ERROR: Couldn't find factor $outfactor in token \"$token\" in $full LINE $nr" if !defined $out;
                    push @kept, $out;
                }
                push @reduced_tokens, join($___FACTOR_DELIMITER, @kept);
            }
            print {$outh} join(" ", @reduced_tokens), "\n";
        }
        print STDERR "\n";
        # Buffered write errors (e.g. disk full) only surface at close.
        close($outh) or die "ERROR: Can't finish writing $reduced: $!";
        close($inh);
        1;
    };
    my $error = $@;

    unlink($lock);
    unless ($ok) {
        # Do not leave a truncated corpus behind for later runs to reuse.
        unlink($reduced);
        die $error;
    }
    return;
}
|
|
# Open a possibly compressed file for reading and return the file handle.
#
# If $fn does not exist but "$fn.gz" or "$fn.bz2" does, that file is used
# instead. Names ending in ".gz" are read through $ZCAT and names ending in
# ".bz2" through $BZCAT; anything else is opened as a plain file.
#
# The file name is never passed through a shell: plain files use the
# three-argument open and compressed files use the list form of a pipe
# open, so names containing spaces or shell metacharacters are handled.
#
# Dies if the file (or the decompression command) cannot be opened.
sub open_or_zcat {
    my $fn = shift;
    $fn = $fn.".gz"  if ! -e $fn && -e $fn.".gz";
    $fn = $fn.".bz2" if ! -e $fn && -e $fn.".bz2";

    # The command strings may carry options ("gzip -cd"), so split them
    # into a program name plus arguments for the list-form open.
    my @decompress;
    if ($fn =~ /\.bz2$/) {
        @decompress = split(' ', $BZCAT);
    }
    elsif ($fn =~ /\.gz$/) {
        @decompress = split(' ', $ZCAT);
    }

    my $hdl;
    if (@decompress) {
        open($hdl, '-|', @decompress, $fn)
            or die "Can't read $fn (@decompress $fn|): $!";
    }
    else {
        open($hdl, '<', $fn) or die "Can't read $fn ($fn): $!";
    }
    return $hdl;
}
|
|
# Run an external command via system() and report problems on STDERR.
#
# Parameters: the command, passed straight to system() (a single string is
# run through the shell, a list is executed directly).
#
# Returns true if the command exited with status 0 and false otherwise
# (after printing the non-zero exit code). If the command could not be
# started or was killed by a signal, an error is printed and the whole
# script exits with status 1.
sub safesystem {
    my @command = @_;
    print STDERR "Executing: @command\n";
    system(@command);
    my $status = $?;    # snapshot before anything else can overwrite $?

    if ($status == -1) {
        print STDERR "ERROR: Failed to execute: @command\n $!\n";
        exit(1);
    }

    if ($status & 127) {
        # The command goes through %s rather than into the format string,
        # so a "%" inside it is not interpreted as a conversion.
        printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
            "@command", ($status & 127), ($status & 128) ? 'with' : 'without';
        exit(1);
    }

    my $exitcode = $status >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return ! $exitcode;
}
|
|
|
|
|
|