| from parsivar import Normalizer |
| from parsivar import SpellCheck |
|
|
| import num2fawords |
| import re |
| import string |
|
|
| from dictionary import dictionary_mapping, fixator_dictionary |
|
|
# Shared parsivar tools for the whole module: a statistical normalizer that
# uses U+200C (ZWNJ, the Persian half-space) as its half-space character,
# and a spell checker used only when `is_spell_check` is requested.
_normalizer = Normalizer(half_space_char="\u200c", statistical_space_correction=True)
_spell = SpellCheck()
# Characters to strip from sentences: Latin and Persian punctuation, quote
# variants, replacement characters, a few diacritics, plus (below) all ASCII
# lowercase letters and digits.  Duplicates in the list are harmless since
# the entries end up inside a regex character class.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š', 'ّ', 'ْ',
]
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
# Escape each entry before building the character class.  Joining the raw
# characters would leave literal "-" entries in the middle of the class,
# creating unintended ranges (e.g. "!-;" also matches "&", "*", "+", "/").
chars_to_ignore = f"""[{"".join(map(re.escape, chars_to_ignore))}]"""
# Zero-width non-joiner: the Persian half-space.
zwnj = "\u200c"
# Characters that may directly precede "آ" without needing a ZWNJ separator
# (used by normalizer()'s "آ" pass) — presumably the non-joining letters plus
# ZWNJ and space themselves.
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]
|
|
|
|
def multiple_replace(text, chars_to_mapping):
    """Replace every key of *chars_to_mapping* found in *text* by its value.

    All keys are combined into one alternation pattern so the whole
    substitution happens in a single pass over the text.
    """
    alternation = "|".join(re.escape(key) for key in chars_to_mapping)

    def substitute(match):
        return chars_to_mapping[match.group()]

    return re.sub(alternation, substitute, str(text))
|
|
|
|
def remove_special_characters(text, chars_to_ignore_regex):
    """Drop every character matching *chars_to_ignore_regex*, lowercase
    the remainder, and append a single trailing space."""
    cleaned = re.sub(chars_to_ignore_regex, '', text)
    return cleaned.lower() + " "
|
|
|
|
| def convert_word_nums_to_text(word): |
| try: |
| word = int(word) |
| word = num2fawords.words(word) |
| except: |
| word = word |
|
|
| return word |
|
|
|
|
def normalizer_at_word_level(text):
    """Normalize *text* word by word.

    Each whitespace-separated token is first converted from digits to its
    Persian word form (if numeric), then looked up in ``fixator_dictionary``
    (falling back to the token itself).  Tokens are re-joined with single
    spaces and a trailing space is appended.
    """
    fixed_words = [
        fixator_dictionary.get(converted, converted)
        for converted in map(convert_word_nums_to_text, text.split())
    ]
    return " ".join(fixed_words) + " "
|
|
|
|
def finder(ss, s, starter=False):
    """Locate every regex match of *ss* inside *s*.

    Returns a list of start offsets when *starter* is true, otherwise a
    list of ``(start, end)`` offset pairs.
    """
    matches = re.finditer(ss, s)
    if starter:
        return [match.start() for match in matches]
    return [(match.start(), match.end()) for match in matches]
|
|
|
|
def substring_replace(ss, s, start, end, stripped=True):
    """Splice *ss* into *s* in place of ``s[start:end]``.

    When *stripped* is true, trailing whitespace on the left part is
    removed before splicing.  Returns ``(new_string, counter)`` where
    *counter* is 1 if the left part ended with a space (so the caller can
    compensate its offsets), else 0.
    """
    left = s[:start]
    right = s[end:]

    dropped = 0
    if stripped:
        dropped = int(left.endswith(" "))
        left = left.rstrip()

    return left + ss + right, dropped
|
|
|
|
def normalizer(
    batch,
    is_normalize=True,
    is_spell_check=False,
    return_dict=True,
    filter_trivials=False,
    remove_extra_space=False
):
    """Full normalization pipeline for one sample's "sentence" field.

    Steps: lowercase/strip, optional parsivar normalization, dictionary
    character mapping, removal of ignored characters, ZWNJ (half-space)
    insertion before "آ" and before common suffixes ("ها…", "افزار",
    "تر/ترین"), optional spell correction, and word-level fixes.

    Args:
        batch: dict with at least a "sentence" key.
        is_normalize: run the parsivar statistical normalizer first.
        is_spell_check: run parsivar spell correction near the end.
        return_dict: if True, return *batch* with "sentence" replaced;
            otherwise return the normalized string itself.
        filter_trivials: replace results of length <= 2 with None.
        remove_extra_space: strip the trailing space instead of keeping
            exactly one.

    Throughout the suffix passes, `pointer` accumulates how much the text
    has grown since the match offsets were computed, and `extra_pointer`
    (from substring_replace) compensates for a stripped space.  The
    repeated `1 + 1 - 1 - extra_pointer` expression therefore evaluates to
    `1 - extra_pointer`: +1 for the inserted ZWNJ, -1 if a space was
    removed.  Left as written to preserve the original arithmetic exactly.
    """
    text = batch["sentence"].lower().strip()

    # Optional statistical space/half-space correction (parsivar).
    if is_normalize:
        text = _normalizer.normalize(text)

    # Project-specific character mapping, then collapse repeated spaces.
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Drop ignored characters (punctuation, Latin letters/digits) and
    # collapse spaces again.
    text = remove_special_characters(text, chars_to_ignore)
    text = re.sub(" +", " ", text)

    # Pass 1: insert a ZWNJ before every "آ" whose preceding character is
    # not in silent_chars (e.g. "میآیید" -> "می‌آیید").
    special, pointer = "آ", int("0")
    for f in sorted(finder(special, text, True)):
        index = f + pointer - 1  # offset of the character just before "آ"
        if len(text) >= index:
            if text[index] not in silent_chars:
                # Re-emit the preceding character followed by a ZWNJ.
                new_text, extra_pointer = substring_replace(
                    f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
                text = new_text
                pointer += 1 + 1 - 1 - extra_pointer

    # Pass 2: attach plural/possessive "ها…" suffixes to the preceding word
    # with a ZWNJ, but only when the suffix ends the word (end of text, or
    # followed by a space).  NOTE(review): longer suffixes appear first in
    # the list, presumably so they are handled before the bare "ها".
    pointer = int("0")
    special_list = [

        "هایمان", "هایم", "هایت", "هایش",
        "هایتان", "هایشان", "هام", "هات",
        "هاتان", "هامون", "هامان", "هاش",
        "هاتون", "هاشان", "هاشون",
        "هایی", "های", "هاس", "ها"
    ]
    for special in special_list:
        pointer = 0
        text = text  # NOTE(review): no-op; kept as-is
        for f in sorted(finder(special, text, False)):
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            if len(text) >= (end + 1):
                if len(text) == (end + 1):
                    # Suffix sits at the very end of the text.
                    new_text, extra_pointer = substring_replace(
                        f"{zwnj}{special}",
                        text,
                        start + 1,
                        end + 1,
                        stripped=True)
                    text = new_text
                    pointer += 1 + 1 - 1 - extra_pointer
                else:
                    if text[end + 1] == " ":
                        # Suffix ends a word in the middle of the text.
                        new_text, extra_pointer = substring_replace(
                            f"{zwnj}{special}",
                            text,
                            start + 1,
                            end + 1,
                            stripped=True)
                        text = new_text
                        pointer += 1 + 1 - 1 - extra_pointer

    # Pass 3: separate "افزار" ("-ware", as in "نرم‌افزار") with a ZWNJ —
    # unlike pass 2, with no end-of-word requirement.
    special, pointer = "افزار", int("0")
    for f in sorted(finder(special, text, False)):
        start, end = f[0] + pointer - 1, f[1] + pointer - 1

        if len(text) >= (end + 1):
            new_text, extra_pointer = substring_replace(f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
            text = new_text
            pointer += 1 + 1 - 1 - extra_pointer

    # Pass 4: comparative/superlative suffixes "تر"/"ترین", same
    # end-of-word rule as pass 2 ("ترین" first so it wins over "تر").
    pointer = int("0")
    special_list = [
        "ترین", "تر"
    ]
    for special in special_list:
        pointer = 0
        text = text  # NOTE(review): no-op; kept as-is
        for f in sorted(finder(special, text, False)):
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            if len(text) >= (end + 1):
                if len(text) == (end + 1):
                    new_text, extra_pointer = substring_replace(
                        f"{zwnj}{special}",
                        text,
                        start + 1,
                        end + 1,
                        stripped=True)
                    text = new_text
                    pointer += 1 + 1 - 1 - extra_pointer
                else:
                    if text[end + 1] == " ":
                        new_text, extra_pointer = substring_replace(
                            f"{zwnj}{special}",
                            text,
                            start + 1,
                            end + 1,
                            stripped=True)
                        text = new_text
                        pointer += 1 + 1 - 1 - extra_pointer

    # Optional spell correction followed by re-normalization.
    if is_spell_check:
        text = _normalizer.normalize(_spell.spell_corrector(text))

    # Word-level fixes (number spelling, fixator dictionary), then collapse
    # spaces a final time.
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    if remove_extra_space:
        text = text.strip()
    else:
        # Keep exactly one trailing space.
        text = text.strip() + " "

    # Optionally discard trivially short results.
    if filter_trivials:
        if not len(text) > 2:
            text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
|
|
|
|
if __name__ == '__main__':
    # Demo sentences, each exercising one of the normalization passes
    # ("آ" separation, "ها…" suffixes, "افزار", "تر/ترین", word-level fixes).
    demo_sentences = [
        "سلام بر شما که میآیید و میآموزید که بیآرآیم",
        "کتابهایمان میدانی کجاها ماههاس که کیهامون و کیهان دنبالههاشون برای بهای هستند.",
        " میانافزارهای امروزی نرمافزار سخت افزار امروز نوشتافزار ها",
        "این کتاب بهترین در نوع شتر آسانتر هست",
        "سه چیز هست که از پژوهش در این زمینه آموختهام",
    ]
    for sentence in demo_sentences:
        print(normalizer({"sentence": sentence}, return_dict=False))
|