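################################################################
#                                                              #
# NLP::Chinese                                                 #
#                                                              #
# Pinyin lookup tables (Unihan, CEDICT, simple pinyin list)    #
# and tone-number to tone-mark conversion for Chinese text.    #
#                                                              #
################################################################
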
package NLP::Chinese;

# Class name on which the NLP::UTF8 helper methods are invoked
# (e.g. $utf8->split_into_utf8_characters(...)).
$utf8 = "NLP::UTF8";

# Permanently empty hash; handed over by typeglob (*empty_ht) where a callee
# expects a hash argument and there is nothing to pass.
%empty_ht = ();

# NOTE(review): subs in this file call methods on $util (e.g. $util->trim),
# but $util is never assigned in this file -- presumably it is set from
# outside (as $NLP::Chinese::util); confirm against the callers.

# Load Chinese pronunciation tables from data files into the caller's hash.
#
# Arguments:
#   $caller    - class/object the method was invoked on; number_to_accent_tone
#                is called on it.
#   *ht        - typeglob of the hash to fill (passed as *some_hash).
#   @filenames - files to read. The format is picked by a case-insensitive
#                match on the file name, tested in this order:
#       /unihan/             tab-separated "codepoint, type, value" records;
#                            only the types kHanyuPinlu, kXHC1983,
#                            kHanyuPinyin and kMandarin are kept, each in
#                            $ht{<type>}->{<char>}.
#       /cedict/             lines "<word> <word> [<pinyin>] ..."; the second
#                            word is the key. $ht{cedict} holds the first
#                            reading seen, $ht{cedicts} all distinct readings
#                            joined by single spaces.
#       /chinese_to_pinyin/  "<chars>\t<pinyin>" lines, stored in
#                            $ht{simple_pinyin}.
#
# Lines starting with "#" are skipped. Unopenable or unrecognised files are
# reported on STDERR and skipped. For every Unihan file a summary line is
# printed to STDOUT. Returns nothing useful.
sub read_chinese_tonal_pinyin_files {
    my ($caller, $ht_glob, @filenames) = @_;
    local *ht = $ht_glob;    # alias %ht to the caller's hash for this call

    # Package-level counters (left as globals, as before); they are reset per
    # call and accumulate over all files of the call.
    $n_kHanyuPinlu   = 0;
    $n_kXHC1983      = 0;
    $n_kHanyuPinyin  = 0;
    $n_kMandarin     = 0;
    $n_cedict        = 0;
    $n_simple_pinyin = 0;

    foreach my $filename (@filenames) {
        if ($filename =~ /unihan/i) {
            my $in;
            unless (open($in, '<', $filename)) {
                print STDERR "Can't open $filename\n";
                next;
            }
            while (my $line = <$in>) {
                next if $line =~ /^#/;
                $line =~ s/\s*$//;
                my ($u, $type, $value) = split(/\t/, $line);
                # Skip blank or truncated records instead of storing undef.
                next unless defined($type) && defined($value);
                next unless $type =~ /^(?:kHanyuPinlu|kXHC1983|kHanyuPinyin|kMandarin)$/;

                # NOTE(review): $util is not assigned in this file -- see caller.
                $u     = $util->trim($u);
                $value = $util->trim($value);
                # Key under which the record is stored; presumably the
                # character denoted by the code point column -- confirm
                # against NLP::UTF8.
                my $f = $utf8->unicode_string2string($u);

                if ($type eq "kHanyuPinlu") {
                    # Drop parenthesised annotations, then turn tone digits
                    # into tone marks.
                    $value =~ s/\(.*?\)//g;
                    $value = $util->trim($value);
                    $ht{"kHanyuPinlu"}->{$f} = $caller->number_to_accent_tone($value);
                    $n_kHanyuPinlu++;
                } elsif ($type eq "kXHC1983") {
                    # Keep only the non-space text following each colon.
                    my @translits = ($value =~ /:(\S+)/g);
                    $ht{"kXHC1983"}->{$f} = join(" ", @translits);
                    $n_kXHC1983++;
                } elsif ($type eq "kHanyuPinyin") {
                    # Remove everything up to the last colon; readings are
                    # comma-separated and stored space-separated.
                    $value =~ s/^.*://;
                    $value =~ s/,/ /g;
                    $ht{"kHanyuPinyin"}->{$f} = $value;
                    $n_kHanyuPinyin++;
                } else {    # kMandarin: stored as is
                    $ht{"kMandarin"}->{$f} = $value;
                    $n_kMandarin++;
                }
            }
            close($in);
            print "Read in $n_kHanyuPinlu kHanyuPinlu, $n_kXHC1983 n_kXHC1983, $n_kHanyuPinyin n_kHanyuPinyin $n_kMandarin n_kMandarin\n";
        } elsif ($filename =~ /cedict/i) {
            my $in;
            unless (open($in, '<', $filename)) {
                print STDERR "Can't open $filename\n";
                next;
            }
            while (my $line = <$in>) {
                next if $line =~ /^#/;
                $line =~ s/\s*$//;
                next unless $line =~ /^\S+\s+(\S+)\s+\[([^\[\]]+)\]/;
                my ($f, $translit) = ($1, $2);

                $translit = $utf8->extended_lower_case($translit);
                $translit = $caller->number_to_accent_tone($translit);
                $translit =~ s/\s//g;

                if ($ht{"cedict"}->{$f}) {
                    # Further entry for a known word: record the reading only
                    # if it is not already listed (compare against ALL known
                    # readings, not just the first one).
                    my $known_s = $ht{"cedicts"}->{$f};
                    my @known = defined($known_s) ? split(/ /, $known_s) : ();
                    unless (grep { $_ eq $translit } @known) {
                        $ht{"cedicts"}->{$f} = join(" ", @known, $translit);
                    }
                } else {
                    $ht{"cedict"}->{$f}  = $translit;
                    $ht{"cedicts"}->{$f} = $translit;
                }
                $n_cedict++;
            }
            close($in);
        } elsif ($filename =~ /chinese_to_pinyin/i) {
            my $in;
            unless (open($in, '<', $filename)) {
                print STDERR "Can't open $filename\n";
                next;
            }
            while (my $line = <$in>) {
                next if $line =~ /^#/;
                next unless $line =~ /^(\S+)\t(\S+)\s*$/;
                $ht{"simple_pinyin"}->{$1} = $2;
                $n_simple_pinyin++;
            }
            close($in);
        } else {
            print STDERR "Don't know what to do with file $filename (in read_chinese_tonal_pinyin_files)\n";
        }
    }
    return;
}

# Return the tonal pinyin for string $s, using the tables in the caller's hash.
#
# Arguments:
#   $caller - class/object the method was invoked on (not used here).
#   $s      - string to transliterate.
#   *ht     - typeglob of the hash holding the pronunciation tables
#             (simple_pinyin, kHanyuPinlu, kXHC1983, kHanyuPinyin, cedict,
#             cedicts); it is also used as a cache and is written to.
#   $gloss  - free text, only copied into the conflict log.
#
# Result: for strings of more than one character the whole-string "cedict"
# reading is returned when there is one; otherwise the character-by-character
# reading (unknown characters become "?", a reading made up only of "?"
# counts as empty), falling back to the "cedict" reading.
#
# Side effects on %ht:
#   $ht{COMBINED}->{$s}      the result (looked up first on later calls)
#   $ht{PINYIN_TITLE}->{$s}  newline-separated list of the known readings and
#                            the tables they come from
#   $ht{CONFLICT}->{$s}      log text, set only when the whole-string and the
#                            character-by-character readings disagree
sub tonal_pinyin {
    my ($caller, $s, $ht_glob, $gloss) = @_;
    local *ht = $ht_glob;    # alias %ht to the caller's hash for this call

    my $cached = $ht{COMBINED}->{$s};
    return $cached if defined($cached);

    my $cedict_pinyin  = $ht{"cedict"}->{$s}  || "";
    my $cedicts_pinyin = $ht{"cedicts"}->{$s} || "";
    my @characters = $utf8->split_into_utf8_characters($s, "return only chars", *empty_ht);
    my $multi_char = (@characters > 1);

    # Character-by-character reading. The simple pinyin table wins and is
    # used as is; from the other tables only the first listed reading is used.
    my @fallback_sources = ("kHanyuPinlu", "kXHC1983", "kHanyuPinyin", "cedicts");
    my $unihan_pinyin = "";
    CHARACTER:
    foreach my $c (@characters) {
        if (my $simple = $ht{"simple_pinyin"}->{$c}) {
            $unihan_pinyin .= $simple;
            next CHARACTER;
        }
        foreach my $source (@fallback_sources) {
            my $readings = $ht{$source}->{$c};
            next unless $readings;
            $readings =~ s/^(\S+)\s.*$/$1/;    # keep the first reading only
            $unihan_pinyin .= $readings;
            next CHARACTER;
        }
        # Middle dot, katakana middle dot, multiplication sign and printable
        # ASCII are copied through unchanged; anything else is unknown.
        if ($c =~ /^(?:\xC2\xB7|\xE3\x83\xBB|\xC3\x97|[\x21-\x7E])$/) {
            $unihan_pinyin .= $c;
        } else {
            $unihan_pinyin .= "?";
        }
    }

    # Overview of all known readings: whole-string CEDICT readings first,
    # then, per character, every reading with the tables that list it.
    my $pinyin_title = "";
    if ($multi_char && $cedicts_pinyin) {
        foreach my $pinyin (split(/\s+/, $cedicts_pinyin)) {
            $pinyin_title .= "$s $pinyin (CEDICT)\n";
        }
        $pinyin_title .= "\n";
    }
    foreach my $c (@characters) {
        my @pinyins    = ();    # readings in order of first appearance
        my %sources_of = ();    # reading -> "table, table, ..."
        foreach my $type ("kHanyuPinlu", "kXHC1983", "kHanyuPinyin", "cedicts") {
            my $pinyin_s = $ht{$type}->{$c};
            next unless $pinyin_s;
            my $label = ($type eq "cedicts") ? "CEDICT" : $type;
            foreach my $pinyin (split(/\s+/, $pinyin_s)) {
                if (exists $sources_of{$pinyin}) {
                    $sources_of{$pinyin} .= ", $label";
                } else {
                    push(@pinyins, $pinyin);
                    $sources_of{$pinyin} = $label;
                }
            }
        }
        foreach my $pinyin (@pinyins) {
            $pinyin_title .= "$c $pinyin ($sources_of{$pinyin})\n";
        }
    }
    $pinyin_title =~ s/\n$//;
    # NOTE(review): as found, this substitution replaces each newline by a
    # literal newline, i.e. it changes nothing. The replacement text may once
    # have been an escape such as "&#xA;" -- confirm before relying on it.
    $pinyin_title =~ s/\n/\n/g;

    $unihan_pinyin = "" if $unihan_pinyin =~ /^\?+$/;

    # Record a log entry when the two ways of reading a multi-character
    # string disagree.
    if ($multi_char && $cedict_pinyin && $unihan_pinyin && ($unihan_pinyin ne $cedict_pinyin)) {
        my $log = "Gloss($s): $gloss\nCEdict($s): $cedicts_pinyin\nUnihan($s): $unihan_pinyin\n";
        foreach my $type ("kHanyuPinlu", "kXHC1983", "kHanyuPinyin") {
            my $log_line = "$type($s): ";
            foreach my $c (@characters) {
                my $pinyin = $ht{$type}->{$c} || "";
                if ($pinyin =~ / /) {
                    $log_line .= "($pinyin)";    # several readings: bracket them
                } elsif ($pinyin) {
                    $log_line .= $pinyin;
                } else {
                    $log_line .= "?";
                }
            }
            $log .= "$log_line\n";
        }
        $ht{CONFLICT}->{$s} = $log;
    }

    my $result = $unihan_pinyin || $cedict_pinyin;
    $result = $cedict_pinyin if $multi_char && $cedict_pinyin;
    $ht{COMBINED}->{$s}     = $result;
    $ht{PINYIN_TITLE}->{$s} = $pinyin_title;
    return $result;
}

# Maps "<vowel><tone digit>" to the UTF-8 bytes of the vowel carrying the
# corresponding tone mark. "u:" and the UTF-8 u-umlaut ("\xC3\xBC") are both
# accepted as spellings of the same vowel.
%number_to_accent_tone_ht = (
    "a1" => "\xC4\x81", "a2" => "\xC3\xA1", "a3" => "\xC7\x8E", "a4" => "\xC3\xA0",
    "e1" => "\xC4\x93", "e2" => "\xC3\xA9", "e3" => "\xC4\x9B", "e4" => "\xC3\xA8",
    "i1" => "\xC4\xAB", "i2" => "\xC3\xAD", "i3" => "\xC7\x90", "i4" => "\xC3\xAC",
    "o1" => "\xC5\x8D", "o2" => "\xC3\xB3", "o3" => "\xC7\x92", "o4" => "\xC3\xB2",
    "u1" => "\xC5\xAB", "u2" => "\xC3\xBA", "u3" => "\xC7\x94", "u4" => "\xC3\xB9",
    "u:1" => "\xC7\x96", "u:2" => "\xC7\x98", "u:3" => "\xC7\x9A", "u:4" => "\xC7\x9C",
    "\xC3\xBC1" => "\xC7\x96", "\xC3\xBC2" => "\xC7\x98",
    "\xC3\xBC3" => "\xC7\x9A", "\xC3\xBC4" => "\xC7\x9C",
);

# Convert numbered pinyin (e.g. "ni3 hao3") in $s to pinyin with tone marks.
#
# Arguments:
#   $caller - class/object the method was invoked on (not used here).
#   $s      - byte string; letter runs directly followed by a digit 1-5 are
#             treated as syllables.
#
# For each syllable the digit is removed and, for tones 1-4, one vowel is
# replaced by its accented form from %number_to_accent_tone_ht:
#   - the last "a" or "e" if there is one,
#   - otherwise the "o" of an "ou",
#   - otherwise the last vowel (i, o, u, "u:" or u-umlaut).
# Tone 5 just drops the digit. Syllables without a usable lower-case vowel are
# left unchanged, digit included. Any remaining "u:" is finally written as
# u-umlaut. All other text is copied through. Returns the converted string.
sub number_to_accent_tone {
    my ($caller, $s) = @_;

    my $result = "";
    while ($s =~ /^(.*?)((?:[a-z]|u:|\xC3\xBC)+)([1-5])(.*)$/i) {
        my ($pre, $alpha, $tone_number, $rest) = ($1, $2, $3, $4);
        if ($tone_number eq "5") {
            $result .= "$pre$alpha";
        } elsif ($alpha =~ /^(.*)([ae])(.*)$/
              || $alpha =~ /^(.*)(o)(u.*)$/
              || $alpha =~ /^(.*)(u:|[iou]|\xC3\xBC)([^aeiou]*)$/) {
            # $1..$3 belong to whichever of the three patterns matched.
            my ($pre_acc, $acc_letter, $post_acc) = ($1, $2, $3);
            my $key = $acc_letter . $tone_number;
            $result .= "$pre$pre_acc" . ($number_to_accent_tone_ht{$key} || $key) . $post_acc;
        } else {
            $result .= "$pre$alpha$tone_number";
        }
        $s = $rest;
    }
    $result .= $s;
    $result =~ s/u:/\xC3\xBC/g;
    return $result;
}

# Test whether the UTF-8 byte string $s contains a byte sequence that starts
# a character in one of these code point ranges:
#   lead byte \xE4-\xE9                  U+4000  .. U+9FFF
#   \xE3 followed by \x90-\xBF           U+3400  .. U+3FFF
#   \xF0 followed by \xA0-\xAC           U+20000 .. U+2CFFF
# Only the leading one or two bytes are inspected; $s is not validated.
#
# Arguments:
#   $caller - class/object the method was invoked on (not used here).
#   $s      - byte string to inspect.
#
# Returns the result of the match: true/false in scalar context; in list
# context the matched lead byte(s), or the empty list.
sub string_contains_utf8_cjk_unified_ideograph_p {
    my ($caller, $s) = @_;

    return ($s =~ /([\xE4-\xE9]|\xE3[\x90-\xBF]|\xF0[\xA0-\xAC])/);
}

# A module file must end with a true value.
1;