|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line driver that reads pairs of strings and prints, for each pair,
# the cost returned by NLP::stringDistance's
# quick_romanized_string_distance_by_chart.
#
# Options (one or more leading dashes are accepted):
#   -lc1 <code>          language code of the first string; must be exactly
#                        three lowercase ASCII letters (default: "eng")
#   -lc2 <code>          language code of the second string (default: "eng")
#   -v|-verbose [<n>]    verbosity; an optional non-negative integer level
#                        may follow, otherwise the level is 1
# Any other argument is reported on STDERR and ignored.
#
# Input:  every command-line argument is consumed by option parsing, so the
#         pairs are always read from STDIN. Each line holds two fields
#         separated by $separator (a tab); a field is either a double-quoted
#         string (with \" as an escaped quote) or a run of non-whitespace
#         characters. Blank lines and lines starting with '#' are skipped.
# Output: a "# Lang-code-1: ..." header line, then one line per parsed pair
#         of the form <s1> TAB <s2> TAB <cost>. A line that cannot be parsed
#         produces an empty output line.

use strict;
use warnings;

$| = 1;    # autoflush STDOUT so results appear as soon as they are computed

use FindBin;
use Cwd "abs_path";
use File::Basename qw(dirname);
use File::Spec;

# The data directory is located relative to the directory holding this script.
my $bin_dir  = abs_path(dirname($0));
my $root_dir = File::Spec->catdir($bin_dir, File::Spec->updir());
my $data_dir = File::Spec->catdir($root_dir, "data");

use lib "$FindBin::Bin/../lib";
use List::Util qw(min max);
use NLP::utilities;
use NLP::stringDistance;

# Both modules are used through class-method calls.
my $util = "NLP::utilities";
my $sd   = "NLP::stringDistance";

my $verbose   = 0;
my $separator = "\t";

my $cost_rule_filename = File::Spec->catfile($data_dir, "string-distance-cost-rules.txt");

my $lang_code1 = "eng";
my $lang_code2 = "eng";

# %ht is handed to NLP::stringDistance as a typeglob (*ht), so it has to be a
# package variable rather than a lexical.
our %ht = ();

while (@ARGV) {
    my $arg = shift @ARGV;
    if ($arg =~ /^-+lc([12])$/) {
        my $which     = $1;
        my $candidate = shift @ARGV;
        if (defined $candidate && $candidate =~ /\A[a-z]{3}\z/) {
            if ($which == 1) {
                $lang_code1 = $candidate;
            } else {
                $lang_code2 = $candidate;
            }
        } else {
            # An unusable code leaves the current value in place, but say so
            # instead of dropping it silently.
            my $shown = defined $candidate ? "'$candidate'" : "(missing)";
            print STDERR "Ignoring invalid language code $shown for $arg; keeping previous value\n";
        }
    } elsif ($arg =~ /^-+(?:v|verbose)$/) {
        # Only consume the following argument when it is a numeric level, so
        # that a bare -v switches verbosity on and never swallows an option.
        if (@ARGV && $ARGV[0] =~ /\A\d+\z/) {
            $verbose = shift @ARGV;
        } else {
            $verbose = 1;
        }
    } else {
        print STDERR "Ignoring unrecognized arg $arg\n";
    }
}

$sd->load_string_distance_data($cost_rule_filename, *ht, $verbose);
print STDERR "Loaded resources.\n" if $verbose;

# Two fields, each either a double-quoted string or a run of non-whitespace,
# separated by the (literal) separator; trailing whitespace is tolerated.
my $pair_re = qr/^("(?:\\"|[^"])*"|\S+)\Q$separator\E("(?:\\"|[^"])*"|\S+)\s*$/;

my $line_number = 0;
print "# Lang-code-1: $lang_code1 Lang-code-2: $lang_code2\n";
while (my $line = <>) {
    $line_number++;

    # Progress indicator on STDERR: a dot every 1,000 lines, the running
    # line count every 10,000 lines.
    if ($verbose && $line_number % 1000 == 0) {
        print STDERR ($line_number % 10000 == 0 ? $line_number : ".");
    }

    $line =~ s/^\xEF\xBB\xBF//;                # strip a UTF-8 byte-order mark
    next if $line =~ /^\s*(?:\#.*)?$/;         # skip blank and comment lines

    my ($s1, $s2) = $line =~ $pair_re;
    unless (defined $s2) {
        print STDERR "Could not process line $line_number: $line" if $verbose;
        print "\n";                            # placeholder line for the unparsable input
        next;
    }
    $s1 = $util->dequote_string($s1);
    $s2 = $util->dequote_string($s2);

    my $cost = $sd->quick_romanized_string_distance_by_chart($s1, $s2, *ht, "", $lang_code1, $lang_code2);
    print join($separator, $s1, $s2, $cost), "\n";
}
print STDERR "\n" if $verbose;                 # terminate the progress-indicator line

exit 0;
|
|
|
|
|