|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Command-line handling and input setup.
#
# Recognised options (each consumes the following argument as its value):
#   -d <dir-align-file>   required
#   -i <inv-align-file>   required
#   -c <count-file>       optional
# Any other argument is silently ignored.
($cnt,$dir,$inv)=();

# Test with defined() so that a false-valued argument such as "0" or ""
# does not end option parsing early.
while (defined($w=shift @ARGV)){
    if    ($w eq "-d") { $dir=shift(@ARGV); }
    elsif ($w eq "-i") { $inv=shift(@ARGV); }
    elsif ($w eq "-c") { $cnt=shift(@ARGV); }
}

# Record counter; incremented by ReadBiAlign for every record it reads and
# quoted in its mismatch message. Must stay a file-level variable declared
# before the sub so the sub refers to this same variable.
my $lc = 0;

if (!$dir || !$inv){
    print "usage: giza2bal.pl [-c <count-file>] -d <dir-align-file> -i <inv-align-file>\n";
    print "input files can be also commands, e.g. -d \"gunzip -c file.gz\"\n";
    exit(0);
}

# Autoflush the currently selected output handle.
$|=1;

# Each input is first tried as a plain file; if that open fails the same
# string is run as a command and its output is read through a pipe.
# The error message names the input that actually failed.
open(DIR,"<$dir") || open(DIR,"$dir|") || die "cannot open $dir\n";
open(INV,"<$inv") || open(INV,"$inv|") || die "cannot open $inv\n";

# The count input is optional; when it is absent the CNT handle stays unopened.
if ($cnt){
    open(CNT,"<$cnt") || open(CNT,"$cnt|") || die "cannot open $cnt\n";
}
|
|
|
|
|
|
|
|
# ReadBiAlign($fd0, $fd1, $fd2, *s1, *s2, *a, *b, *c)
#
# Reads one record from each of three input handles and stores the result in
# the caller's variables, which are passed as typeglobs and aliased locally.
#
#   $fd0  count input: the first line is the count, the next two lines are
#         read and discarded. A missing, empty or zero count becomes 1.
#   $fd1  first alignment input: one line is discarded, the next line is the
#         sentence (stored in $s1), the third is the alignment line.
#   $fd2  second alignment input, same three-line layout (sentence -> $s2).
#   *s1, *s2  receive the two sentence lines (line terminator removed).
#   *a    for every position p listed inside a "word ({ p ... })" entry of the
#         $fd1 alignment line, $a[p] is the 1-based index of that word;
#         positions 1..(#words of the $fd2 alignment line) never listed are 0.
#   *b    the same, built from the $fd2 alignment line.
#   *c    receives the count.
#
# An alignment line is expected to look like
#   NULL ({ ... }) w1 ({ p p ... }) w2 ({ ... }) ...
# The NULL entry is dropped before the words are numbered.
#
# If the number of words on one side's alignment line differs from the number
# of whitespace-separated words on the other side's sentence line, a message
# is written to STDERR and a placeholder record is returned instead: both
# sentences are "ALIGN_ERR" and @a / @b each hold the single value 1 at
# index 1.
#
# Always returns 1. Increments the file-level record counter $lc.
sub ReadBiAlign{
    my ($fd0, $fd1, $fd2) = @_[0 .. 2];
    local (*s1, *s2, *a, *b, *c) = @_[3 .. 7];

    # Read one line and strip only a real line terminator. chomp (rather than
    # chop) keeps the final character of a last line that has no trailing
    # newline; losing the closing ")" there made the last word unparsable and
    # produced a spurious mismatch error.
    my $read_line = sub {
        my ($fh) = @_;
        my $line = <$fh>;
        chomp($line) if defined $line;
        return $line;
    };

    # Parse one alignment line into @$links and return (number of words + 1).
    # split(' ', ...) skips leading whitespace, so no empty field is produced
    # and index 0 is never written.
    my $parse_links = sub {
        my ($line, $links) = @_;
        my $next = 1;
        return $next if !defined $line;
        $line =~ s/NULL \(\{((\s+\d+)*)\s+\}\)//;
        while ($line =~ s/(\S+)\s+\(\{((\s+\d+)*)\s+\}\)//) {
            for my $pos (split(' ', $2)) {
                $links->[$pos] = $next;
            }
            $next++;
        }
        return $next;
    };

    # Count record: count line followed by two discarded lines.
    $c = $read_line->($fd0);
    $read_line->($fd0);
    $read_line->($fd0);
    $c = 1 if !$c;

    # First alignment record: discarded line, sentence, alignment line.
    $read_line->($fd1);
    $s1 = $read_line->($fd1);
    my $t1 = $read_line->($fd1);

    # Second alignment record, same layout.
    $read_line->($fd2);
    $s2 = $read_line->($fd2);
    my $t2 = $read_line->($fd2);

    @a = ();
    @b = ();
    $lc++;

    my $n = $parse_links->($t1, \@a);
    my $m = $parse_links->($t2, \@b);

    # Word counts of the two sentence lines. Assign to arrays first: a list
    # assignment to () would make split apply an implicit limit.
    my @words1 = defined $s1 ? split(' ', $s1) : ();
    my @words2 = defined $s2 ? split(' ', $s2) : ();
    my $M = scalar @words1;
    my $N = scalar @words2;

    if ($m != ($M + 1) || $n != ($N + 1)) {
        print STDERR "Sentence mismatch error! Line #$lc\n";
        $s1 = "ALIGN_ERR";
        $s2 = "ALIGN_ERR";
        @a = (undef, 1);
        @b = (undef, 1);
        return 1;
    }

    # Positions that no word pointed at are reported as 0.
    for my $j (1 .. $m - 1) {
        $a[$j] = 0 if !$a[$j];
    }
    for my $i (1 .. $n - 1) {
        $b[$i] = 0 if !$b[$i];
    }

    return 1;
}
|
|
|
|
|
# Main loop: keep reading records until the DIR input is exhausted.
# For every record ReadBiAlign accepts, three lines are printed:
#   the count,
#   "<last index of @a> <src> # <@a from index 1>",
#   "<last index of @b> <tgt> # <@b from index 1>".
# A rejected record prints an empty line instead and is tallied in $skip,
# with a progress dot on STDERR every 1000 skips.
$skip = 0;
$ccc  = 0;

until (eof(DIR)) {
    unless (ReadBiAlign(CNT, DIR, INV, *src, *tgt, *a, *b, *c)) {
        print "\n";
        $skip++;
        print STDERR "." unless $skip % 1000;
        next;
    }

    $ccc++;
    print "$c\n";
    print $#a, " $src \# @a[1..$#a]\n";
    print $#b, " $tgt \# @b[1..$#b]\n";
}

# Final summary of skipped and emitted records.
print STDERR "skip=<$skip> counts=<$ccc>\n";
|
|
|