Spaces:
Runtime error
Runtime error
| import re | |
| import six | |
| from . import messages | |
| from .unicode_block import ( | |
| unicode_block, | |
| UNICODE_BASIC_LATIN, | |
| UNICODE_LATIN_1_SUPPLEMENT, | |
| UNICODE_LATIN_EXTENDED_B, | |
| UNICODE_GENERAL_PUNCTUATION, | |
| UNICODE_ARABIC, | |
| UNICODE_LATIN_EXTENDED_ADDITIONAL, | |
| UNICODE_HIRAGANA, | |
| UNICODE_KATAKANA, | |
| UNICODE_BOPOMOFO, | |
| UNICODE_BOPOMOFO_EXTENDED, | |
| UNICODE_CJK_UNIFIED_IDEOGRAPHS, | |
| UNICODE_HANGUL_SYLLABLES, | |
| ) | |
class NGram(object):
    '''Rolling buffer that produces normalized character n-grams.

    Characters are fed one at a time through add_char(); get(n) returns
    the trailing n characters of the buffer. A single space is used as
    the word-boundary marker inside the buffer.
    '''

    # Latin-1 Supplement characters that normalize() replaces with a space.
    LATIN1_EXCLUDED = messages.get_string('NGram.LATIN1_EXCLUDE')
    # Maximum number of characters kept in the buffer / returned by get().
    N_GRAM = 3

    def __init__(self):
        # The buffer starts as a lone space, i.e. "at a word boundary".
        self.grams = ' '
        # True once two consecutive upper-case characters have been added
        # within the current word; get() yields nothing while it is set.
        self.capitalword = False

    def add_char(self, ch):
        '''Append a character into ngram buffer.

        The character is normalized first. When the previous character
        was a space the buffer restarts from a single space (and a
        normalized space is dropped entirely); otherwise the oldest
        character is discarded once the buffer already holds N_GRAM
        characters.
        '''
        ch = self.normalize(ch)
        last_char = self.grams[-1]
        if last_char == ' ':
            # Word boundary: start a new word and forget capital state.
            self.grams = ' '
            self.capitalword = False
            if ch == ' ':
                # Consecutive separators collapse into one.
                return
        elif len(self.grams) >= self.N_GRAM:
            self.grams = self.grams[1:]
        self.grams += ch

        if ch.isupper():
            if last_char.isupper():
                self.capitalword = True
        else:
            self.capitalword = False

    def get(self, n):
        '''Get n-gram.

        Returns the last n characters of the buffer, or None when:
        the current word is flagged as a capital word, n is outside
        1..N_GRAM, the buffer holds fewer than n characters, or n == 1
        and the last character is a space.
        '''
        if self.capitalword:
            return None
        if n < 1 or n > self.N_GRAM or len(self.grams) < n:
            return None
        if n == 1:
            ch = self.grams[-1]
            return None if ch == ' ' else ch
        return self.grams[-n:]

    @classmethod
    def normalize(cls, ch):
        '''Normalize a single character according to its Unicode block.

        Returns the replacement character; characters in blocks that are
        not handled below are returned unchanged.
        '''
        block = unicode_block(ch)
        if block == UNICODE_BASIC_LATIN:
            # Anything that is not A-Z or a-z becomes a separator.
            if ch < 'A' or ('Z' < ch < 'a') or 'z' < ch:
                ch = ' '
        elif block == UNICODE_LATIN_1_SUPPLEMENT:
            if ch in cls.LATIN1_EXCLUDED:
                ch = ' '
        elif block == UNICODE_LATIN_EXTENDED_B:
            # normalization for Romanian
            if ch == six.u('\u0219'):  # Small S with comma below => with cedilla
                ch = six.u('\u015f')
            elif ch == six.u('\u021b'):  # Small T with comma below => with cedilla
                ch = six.u('\u0163')
        elif block == UNICODE_GENERAL_PUNCTUATION:
            ch = ' '
        elif block == UNICODE_ARABIC:
            if ch == six.u('\u06cc'):
                ch = six.u('\u064a')  # Farsi yeh => Arabic yeh
        elif block == UNICODE_LATIN_EXTENDED_ADDITIONAL:
            # Everything from U+1EA0 upward collapses to U+1EC3.
            if ch >= six.u('\u1ea0'):
                ch = six.u('\u1ec3')
        elif block == UNICODE_HIRAGANA:
            # Whole block collapses to one representative character.
            ch = six.u('\u3042')
        elif block == UNICODE_KATAKANA:
            ch = six.u('\u30a2')
        elif block in (UNICODE_BOPOMOFO, UNICODE_BOPOMOFO_EXTENDED):
            ch = six.u('\u3105')
        elif block == UNICODE_CJK_UNIFIED_IDEOGRAPHS:
            # Mapped characters become their class representative;
            # unmapped ones are kept as they are.
            ch = cls.CJK_MAP.get(ch, ch)
        elif block == UNICODE_HANGUL_SYLLABLES:
            ch = six.u('\uac00')
        return ch

    @classmethod
    def normalize_vi(cls, text):
        '''Normalizer for Vietnamese.
        Normalize Alphabet + Diacritical Mark(U+03xx) into U+1Exx.

        Every match of ALPHABET_WITH_DMARK (a base letter followed by a
        diacritical mark) is replaced by the single character found at
        NORMALIZED_VI_CHARS[mark index][letter index].
        '''
        def repl(m):
            alphabet = cls.TO_NORMALIZE_VI_CHARS.find(m.group(1))
            dmark = cls.DMARK_CLASS.find(m.group(2))  # Diacritical Mark
            return cls.NORMALIZED_VI_CHARS[dmark][alphabet]
        return cls.ALPHABET_WITH_DMARK.sub(repl, text)

    # One row per diacritical mark; a row is selected by the mark's
    # position in DMARK_CLASS and indexed by the base letter's position
    # in TO_NORMALIZE_VI_CHARS (see normalize_vi).
    NORMALIZED_VI_CHARS = [
        messages.get_string('NORMALIZED_VI_CHARS_0300'),
        messages.get_string('NORMALIZED_VI_CHARS_0301'),
        messages.get_string('NORMALIZED_VI_CHARS_0303'),
        messages.get_string('NORMALIZED_VI_CHARS_0309'),
        messages.get_string('NORMALIZED_VI_CHARS_0323')]
    TO_NORMALIZE_VI_CHARS = messages.get_string('TO_NORMALIZE_VI_CHARS')
    DMARK_CLASS = messages.get_string('DMARK_CLASS')
    ALPHABET_WITH_DMARK = re.compile(
        '([' + TO_NORMALIZE_VI_CHARS + '])([' + DMARK_CLASS + '])',
        re.UNICODE)

    # CJK Kanji Normalization Mapping
    # Each entry is a string of characters treated as one class; the
    # first character of the string is the class representative.
    CJK_CLASS = [
        messages.get_string('NGram.KANJI_1_0'),
        messages.get_string('NGram.KANJI_1_2'),
        messages.get_string('NGram.KANJI_1_4'),
        messages.get_string('NGram.KANJI_1_8'),
        messages.get_string('NGram.KANJI_1_11'),
        messages.get_string('NGram.KANJI_1_12'),
        messages.get_string('NGram.KANJI_1_13'),
        messages.get_string('NGram.KANJI_1_14'),
        messages.get_string('NGram.KANJI_1_16'),
        messages.get_string('NGram.KANJI_1_18'),
        messages.get_string('NGram.KANJI_1_22'),
        messages.get_string('NGram.KANJI_1_27'),
        messages.get_string('NGram.KANJI_1_29'),
        messages.get_string('NGram.KANJI_1_31'),
        messages.get_string('NGram.KANJI_1_35'),
        messages.get_string('NGram.KANJI_2_0'),
        messages.get_string('NGram.KANJI_2_1'),
        messages.get_string('NGram.KANJI_2_4'),
        messages.get_string('NGram.KANJI_2_9'),
        messages.get_string('NGram.KANJI_2_10'),
        messages.get_string('NGram.KANJI_2_11'),
        messages.get_string('NGram.KANJI_2_12'),
        messages.get_string('NGram.KANJI_2_13'),
        messages.get_string('NGram.KANJI_2_15'),
        messages.get_string('NGram.KANJI_2_16'),
        messages.get_string('NGram.KANJI_2_18'),
        messages.get_string('NGram.KANJI_2_21'),
        messages.get_string('NGram.KANJI_2_22'),
        messages.get_string('NGram.KANJI_2_23'),
        messages.get_string('NGram.KANJI_2_28'),
        messages.get_string('NGram.KANJI_2_29'),
        messages.get_string('NGram.KANJI_2_30'),
        messages.get_string('NGram.KANJI_2_31'),
        messages.get_string('NGram.KANJI_2_32'),
        messages.get_string('NGram.KANJI_2_35'),
        messages.get_string('NGram.KANJI_2_36'),
        messages.get_string('NGram.KANJI_2_37'),
        messages.get_string('NGram.KANJI_2_38'),
        messages.get_string('NGram.KANJI_3_1'),
        messages.get_string('NGram.KANJI_3_2'),
        messages.get_string('NGram.KANJI_3_3'),
        messages.get_string('NGram.KANJI_3_4'),
        messages.get_string('NGram.KANJI_3_5'),
        messages.get_string('NGram.KANJI_3_8'),
        messages.get_string('NGram.KANJI_3_9'),
        messages.get_string('NGram.KANJI_3_11'),
        messages.get_string('NGram.KANJI_3_12'),
        messages.get_string('NGram.KANJI_3_13'),
        messages.get_string('NGram.KANJI_3_15'),
        messages.get_string('NGram.KANJI_3_16'),
        messages.get_string('NGram.KANJI_3_18'),
        messages.get_string('NGram.KANJI_3_19'),
        messages.get_string('NGram.KANJI_3_22'),
        messages.get_string('NGram.KANJI_3_23'),
        messages.get_string('NGram.KANJI_3_27'),
        messages.get_string('NGram.KANJI_3_29'),
        messages.get_string('NGram.KANJI_3_30'),
        messages.get_string('NGram.KANJI_3_31'),
        messages.get_string('NGram.KANJI_3_32'),
        messages.get_string('NGram.KANJI_3_35'),
        messages.get_string('NGram.KANJI_3_36'),
        messages.get_string('NGram.KANJI_3_37'),
        messages.get_string('NGram.KANJI_3_38'),
        messages.get_string('NGram.KANJI_4_0'),
        messages.get_string('NGram.KANJI_4_9'),
        messages.get_string('NGram.KANJI_4_10'),
        messages.get_string('NGram.KANJI_4_16'),
        messages.get_string('NGram.KANJI_4_17'),
        messages.get_string('NGram.KANJI_4_18'),
        messages.get_string('NGram.KANJI_4_22'),
        messages.get_string('NGram.KANJI_4_24'),
        messages.get_string('NGram.KANJI_4_28'),
        messages.get_string('NGram.KANJI_4_34'),
        messages.get_string('NGram.KANJI_4_39'),
        messages.get_string('NGram.KANJI_5_10'),
        messages.get_string('NGram.KANJI_5_11'),
        messages.get_string('NGram.KANJI_5_12'),
        messages.get_string('NGram.KANJI_5_13'),
        messages.get_string('NGram.KANJI_5_14'),
        messages.get_string('NGram.KANJI_5_18'),
        messages.get_string('NGram.KANJI_5_26'),
        messages.get_string('NGram.KANJI_5_29'),
        messages.get_string('NGram.KANJI_5_34'),
        messages.get_string('NGram.KANJI_5_39'),
        messages.get_string('NGram.KANJI_6_0'),
        messages.get_string('NGram.KANJI_6_3'),
        messages.get_string('NGram.KANJI_6_9'),
        messages.get_string('NGram.KANJI_6_10'),
        messages.get_string('NGram.KANJI_6_11'),
        messages.get_string('NGram.KANJI_6_12'),
        messages.get_string('NGram.KANJI_6_16'),
        messages.get_string('NGram.KANJI_6_18'),
        messages.get_string('NGram.KANJI_6_20'),
        messages.get_string('NGram.KANJI_6_21'),
        messages.get_string('NGram.KANJI_6_22'),
        messages.get_string('NGram.KANJI_6_23'),
        messages.get_string('NGram.KANJI_6_25'),
        messages.get_string('NGram.KANJI_6_28'),
        messages.get_string('NGram.KANJI_6_29'),
        messages.get_string('NGram.KANJI_6_30'),
        messages.get_string('NGram.KANJI_6_32'),
        messages.get_string('NGram.KANJI_6_34'),
        messages.get_string('NGram.KANJI_6_35'),
        messages.get_string('NGram.KANJI_6_37'),
        messages.get_string('NGram.KANJI_6_39'),
        messages.get_string('NGram.KANJI_7_0'),
        messages.get_string('NGram.KANJI_7_3'),
        messages.get_string('NGram.KANJI_7_6'),
        messages.get_string('NGram.KANJI_7_7'),
        messages.get_string('NGram.KANJI_7_9'),
        messages.get_string('NGram.KANJI_7_11'),
        messages.get_string('NGram.KANJI_7_12'),
        messages.get_string('NGram.KANJI_7_13'),
        messages.get_string('NGram.KANJI_7_16'),
        messages.get_string('NGram.KANJI_7_18'),
        messages.get_string('NGram.KANJI_7_19'),
        messages.get_string('NGram.KANJI_7_20'),
        messages.get_string('NGram.KANJI_7_21'),
        messages.get_string('NGram.KANJI_7_23'),
        messages.get_string('NGram.KANJI_7_25'),
        messages.get_string('NGram.KANJI_7_28'),
        messages.get_string('NGram.KANJI_7_29'),
        messages.get_string('NGram.KANJI_7_32'),
        messages.get_string('NGram.KANJI_7_33'),
        messages.get_string('NGram.KANJI_7_35'),
        messages.get_string('NGram.KANJI_7_37')]
    # Filled in by _init_cjk_map(): character -> class representative.
    CJK_MAP = {}

    @classmethod
    def _init_cjk_map(cls):
        '''Populate CJK_MAP so each character in a CJK_CLASS entry maps
        to the first character of that entry.'''
        for cjk_list in cls.CJK_CLASS:
            representative = cjk_list[0]
            for ch in cjk_list:
                cls.CJK_MAP[ch] = representative


# Build the CJK lookup table once, when the module is imported.
NGram._init_cjk_map()