| | import hazm |
| | import re |
| | import string |
| |
|
| | from regexes.currency import CURRENCY_REGEX |
| | from regexes.email import EMAIL_REGEX |
| | from regexes.latin import LATIN_REGEX |
| | from regexes.latin import LATIN_REGEX, LATIN_WITH_SPECIAL_REGEX |
| | from regexes.number import NUMBERS_REGEX |
| | from regexes.phone import PHONE_REGEX |
| | from regexes.quote import DOUBLE_QUOTE_REGEX, SINGLE_QUOTE_REGEX |
| | from regexes.url import URL_REGEX |
| | from regexes.persian import PERSIAN_REGEX |
| | from regexes.punk import PUNK_REGEX |
| | import dictionary |
| |
|
# Characters that survive clean_url's URL-fragment reconstruction pass:
# ASCII letters/digits plus the URL/email punctuation ':/@_-. '.
allowed_char = string.ascii_letters + string.digits + ':/@_-. '
| |
|
| |
|
def make_trans(list_a, list_b):
    """Build a str.translate table: each character of *list_a* (keyed by its
    code point) maps to the character of *list_b* at the same position."""
    return {ord(source): target for source, target in zip(list_a, list_b)}
| |
|
| |
|
def multiple_replace(text, chars_to_mapping):
    """Replace every occurrence of each key of *chars_to_mapping* with its value.

    Keys are matched longest-first so overlapping keys (e.g. "abc" vs "ab")
    resolve to the longest match rather than whichever key happened to come
    first in the dict's insertion order.

    Args:
        text: input value; coerced to ``str`` before substitution.
        chars_to_mapping: dict mapping source substrings to replacements.

    Returns:
        The substituted string; ``str(text)`` unchanged when the mapping
        is empty.
    """
    if not chars_to_mapping:
        # Bug fix: an empty dict produced an empty alternation pattern, and
        # re.sub("") matches at every position, raising KeyError('') in the
        # replacement lambda. Empty mapping means "no replacements".
        return str(text)
    # Longest keys first: regex alternation takes the leftmost alternative,
    # so a shorter key listed earlier would shadow a longer overlapping key.
    keys = sorted(chars_to_mapping, key=len, reverse=True)
    pattern = "|".join(map(re.escape, keys))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
| |
|
| |
|
def remove_adv_by_tag_name(text, tag_name):
    """Truncate *text* at the first occurrence of *tag_name*.

    Used to drop a trailing advertisement/tag-list footer. Returns the text
    unchanged when the tag is absent.
    """
    found = text.find(tag_name)

    # Bug fix: the original tested `found > 0`, which silently kept the
    # whole text when the tag appeared at index 0 (str.find returns 0 there,
    # and -1 only when absent).
    if found != -1:
        text = text[:found]

    return text
| |
|
| |
|
def clean_url(text):
    """Strip HTML tags and URL/email remnants from *text*.

    First removes markup and regex-matched URLs, then rebuilds candidate
    URL/email fragments from the surviving allowed characters and deletes
    any fragment that still appears verbatim in the text.
    """
    # Drop anything that looks like an HTML/XML tag.
    text = re.sub('<.*?>', '', text)

    # Remove URLs (with or without an explicit http/https scheme).
    text = re.sub(r'(?:(?:http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?', "",
                  text)

    # Keep only the allowed characters, squeeze out spaces, and split on ':'
    # to obtain candidate leftover fragments.
    kept = ''.join(ch for ch in text if ch in allowed_char)
    kept = kept.replace(' ', '')
    for fragment in kept.split(':'):
        if '//' in fragment:
            # A scheme-less URL tail: remove it together with its broken
            # "https :" / "http :" prefix if that exact form survives.
            if ('https :' + fragment) in text:
                text = text.replace('https :' + fragment, '')
            elif ('http :' + fragment) in text:
                text = text.replace('http :' + fragment, '')
        elif '@' in fragment:
            # Leftover email-like fragment.
            if fragment in text:
                text = text.replace(fragment, '')

    return text
| |
|
| |
|
# Translation tables: Arabic-Indic digits -> Persian digits, then Persian
# digits (and the Arabic percent sign ٪) -> ASCII digits / '%'.
ar2fa_digits = make_trans("٠١٢٣٤٥٦٧٨٩٪", "۰۱۲۳۴۵۶۷۸۹٪")
fa2en_digits = make_trans("۰۱۲۳۴۵۶۷۸۹٪", "0123456789%")
# Shared hazm normalizer; punctuation spacing is disabled because spacing
# around punctuation is handled by the regex substitutions in normalize().
normalizer = hazm.Normalizer(persian_numbers=True, punctuation_spacing=False)
| |
|
| |
|
def normalize(text, zwnj="\u200c", tokenized=False):
    """Normalize a Persian text string for downstream processing.

    Pipeline: whitespace flattening, hazm normalization, project dictionary
    replacements, digit unification, quote/currency/URL/email/phone/number
    cleanup, zero-width non-joiner (ZWNJ) repair, and tokenization.

    Args:
        text: raw input string.
        zwnj: the zero-width non-joiner character used for joining/stripping.
        tokenized: when True, return a list of tokens instead of a string.

    Returns:
        The normalized string, or a list of tokens when ``tokenized`` is True.
    """
    # Flatten line breaks and tabs into single spaces.
    text = text.replace("\n", " ").replace("\t", " ")
    # Collapse runs of ZWNJ into a single one.
    text = re.sub(r"\u200c+", "\u200c", text)
    # Remove the tatweel (kashida) stretching character.
    text = text.replace('ـ', '')
    text = normalizer.normalize(text)

    # Project-specific character-level replacements, when configured.
    if len(dictionary.characters) > 0:
        text = multiple_replace(text, dictionary.characters)

    # Project-specific word-level replacements, when configured.
    if len(dictionary.words_map) > 0:
        text = multiple_replace(text, dictionary.words_map)

    # Arabic-Indic digits -> Persian digits, then Persian digits -> ASCII.
    text = text.translate(ar2fa_digits)
    text = text.translate(fa2en_digits)

    # Canonicalize quotes, then pad matched entities with surrounding spaces
    # (each of these regexes is assumed to expose one capture group — TODO
    # confirm against regexes/*).
    text = SINGLE_QUOTE_REGEX.sub("'", text)
    text = DOUBLE_QUOTE_REGEX.sub('"', text)
    text = CURRENCY_REGEX.sub(r" \1 ", text)
    text = clean_url(text)
    # Cut off the trailing tag-list/advertisement section.
    text = remove_adv_by_tag_name(text, tag_name="برچسب ها :")
    text = URL_REGEX.sub(" ", text)
    text = EMAIL_REGEX.sub(" ", text)
    text = PHONE_REGEX.sub(r" \1 ", text)
    text = NUMBERS_REGEX.sub(r" \1 ", text)
    text = LATIN_REGEX.sub(r" \1 ", text)

    # NOTE(review): presumably PERSIAN_REGEX matches characters *outside*
    # the allowed Persian set, which are replaced by spaces — confirm
    # against regexes/persian.py.
    text = re.sub(PERSIAN_REGEX, " ", text)

    # Re-attach ZWNJs that became separated from their words by spaces.
    text = text.replace(f" {zwnj} ", f"{zwnj}")
    text = text.replace(f"{zwnj} ", f"{zwnj}")
    text = text.replace(f" {zwnj}", f"{zwnj}")

    if len(dictionary.special_tokens) > 0:
        text = multiple_replace(text, dictionary.special_tokens)

    # Strip stray ZWNJs from token boundaries (at most two per side overall:
    # one via the both-ends branch, one more via the single-end branch).
    tokens = []
    for token in text.split():
        token = token.strip()
        if token:
            if token.startswith(zwnj) and token.endswith(zwnj):
                token = token[1:-1]
            if token.startswith(zwnj):
                token = token[1:]
            elif token.endswith(zwnj):
                token = token[:-1]
            else:
                token = token

            tokens.append(token)

    if tokenized:
        return tokens

    return " ".join(tokens)
| |
|
| |
|
| |
|
if __name__ == '__main__':
    import textwrap

    # NOTE(review): the demo/usage code that presumably followed this import
    # appears to have been removed or lost in extraction; textwrap is
    # imported but never used here — confirm against the original file.
| |
|