File size: 1,992 Bytes
37c48d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import re
import emoji
_CHAR_MAP = str.maketrans({
"أ": "ا", # noqa: RUF001
"إ": "ا", # noqa: RUF001
"آ": "ا", # noqa: RUF001
"ى": "ي",
"ة": "ه", # noqa: RUF001
"ؤ": "و",
"ئ": "ي",
"ـ": "",
})
# Patterns
# Each regex is compiled once at import time.

# Arabic diacritic marks in the range U+064B (fathatan) .. U+0652 (sukun).
_DIACRITICS_PATTERN = re.compile(r"[\u064B-\u0652]")
# One character followed by one or more copies of itself. ``.`` does not
# match a newline here because the DOTALL flag is not set.
_REPEATED_CHAR_PATTERN = re.compile(r"(.)\1+")
# Loose URL matcher: ``(?x)`` enables verbose mode (whitespace and ``#``
# comments inside the pattern are ignored), ``(?i)`` makes it case-insensitive.
# Scheme and ``www.`` are both optional, so a bare ``name.tld`` token matches.
# Only ASCII letters/digits are accepted in the host and TLD parts.
_URL_PATTERN = re.compile(
r"""(?xi)
\b
(?:https?:\/\/)? # optional http or https
(?:www\.)? # optional www.
[a-z0-9\-._~%]+ # domain or subdomain
\.
[a-z]{2,} # TLD
(?:[\/?#][^\s]*)? # optional path/query
\b
""",
)
# A run of 8 to 15 digits delimited by word boundaries on both sides.
_PHONE_PATTERN = re.compile(r"\b\d{8,15}\b")
# Any character that is not a word character, whitespace, ``<`` or ``>``.
# Keeping the angle brackets lets tokens such as ``<url>`` pass through intact.
_PUNCTUATION_PATTERN = re.compile(r"[^\w\s<>]")
# One or more consecutive whitespace characters.
_WHITESPACE_PATTERN = re.compile(r"\s+")
def normalize_arabic_letters(text: str) -> str:
    """Return ``text`` with every character translated through ``_CHAR_MAP``.

    Characters that have no entry in the table are left unchanged.
    """
    folded = text.translate(_CHAR_MAP)
    return folded
def remove_diacritics(text: str) -> str:
    """Return ``text`` with every match of ``_DIACRITICS_PATTERN`` deleted."""
    return re.sub(_DIACRITICS_PATTERN, "", text)
def reduce_repeated_characters(text: str, *, max_repeats: int = 1) -> str:
    """Shorten every run of one repeated character to ``max_repeats`` copies.

    With the default ``max_repeats=1`` each run collapses to a single
    character (``"cooool"`` -> ``"col"``). Be aware that this also collapses
    legitimately doubled characters, e.g. ``"good"`` -> ``"god"`` and
    ``"100"`` -> ``"10"``. Pass ``max_repeats=2`` to squeeze elongations
    while keeping genuine doubles (``"cooool"`` -> ``"cool"``).

    Runs are found with ``_REPEATED_CHAR_PATTERN``; newlines are never part
    of a run because ``.`` does not match them.

    Args:
        text: The string to process.
        max_repeats: Maximum number of consecutive copies of a character to
            keep. Must be at least 1.

    Returns:
        The text with long runs shortened.

    Raises:
        ValueError: If ``max_repeats`` is smaller than 1.
    """
    if max_repeats < 1:
        raise ValueError(f"max_repeats must be >= 1, got {max_repeats}")
    if max_repeats == 1:
        # Fast path: a plain template replacement, no Python callback needed.
        return _REPEATED_CHAR_PATTERN.sub(r"\1", text)

    def _shorten(run: "re.Match[str]") -> str:
        # A run shorter than the limit is emitted unchanged.
        return run.group(1) * min(max_repeats, len(run.group(0)))

    return _REPEATED_CHAR_PATTERN.sub(_shorten, text)
def replace_urls(text: str) -> str:
    """Replace each match of ``_URL_PATTERN`` with a space-padded ``<url>`` token."""
    placeholder = " <url> "
    return re.sub(_URL_PATTERN, placeholder, text)
# One or more digit groups separated only by whitespace, e.g. "050 123 4567".
_PHONE_CANDIDATE_PATTERN = re.compile(r"\b\d+(?:\s+\d+)*\b")
# Digit-count window for a phone number (same bounds as ``_PHONE_PATTERN``).
_PHONE_MIN_DIGITS = 8
_PHONE_MAX_DIGITS = 15


def _mask_phone_candidate(match: "re.Match[str]") -> str:
    """Return the phone token for a phone-sized digit run, else the run as-is."""
    run = match.group(0)
    # The run holds only digits and whitespace, so non-space chars are digits.
    digit_count = sum(1 for ch in run if not ch.isspace())
    if _PHONE_MIN_DIGITS <= digit_count <= _PHONE_MAX_DIGITS:
        return " <phone> "
    return run


def replace_phone_numbers(text: str) -> str:
    """Replace phone-like digit sequences with a space-padded ``<phone>`` token.

    A candidate is a run of digit groups separated only by whitespace, so
    numbers written with spaces (``"050 123 4567"``) are recognised as well
    as unbroken ones. A candidate is replaced when it contains 8 to 15
    digits in total.

    Candidates outside that window are returned untouched, including their
    internal spacing. Whitespace between digits is no longer stripped from
    the whole text up front, so unrelated numbers such as ``"3 5 kg"`` are
    not fused into ``"35 kg"``.

    Args:
        text: The string to process.

    Returns:
        The text with each phone-like sequence replaced by ``" <phone> "``.
    """
    return _PHONE_CANDIDATE_PATTERN.sub(_mask_phone_candidate, text)
def replace_emojis(text: str) -> str:
    """Replace emojis with a space-padded ``<emoji>`` token.

    Detection and substitution are delegated to ``emoji.replace_emoji``.
    """
    placeholder = " <emoji> "
    return emoji.replace_emoji(text, replace=placeholder)
def remove_punctuation(text: str) -> str:
    """Return ``text`` with every match of ``_PUNCTUATION_PATTERN`` deleted."""
    stripped = re.sub(_PUNCTUATION_PATTERN, "", text)
    return stripped
def normalize_whitespace(text: str) -> str:
    """Squeeze each whitespace run to one space and trim both ends."""
    single_spaced = re.sub(_WHITESPACE_PATTERN, " ", text)
    return single_spaced.strip()
def clean_text(text: str) -> str:
    """Run the full cleaning pipeline over ``text`` and return the result.

    The input is first stripped and lower-cased, then passed through each
    step in the order listed below. The order matters: URL, phone and emoji
    tokens are inserted before repeated characters are squeezed and
    punctuation is removed, and whitespace is normalised last.
    """
    pipeline = (
        normalize_arabic_letters,
        remove_diacritics,
        replace_urls,
        replace_phone_numbers,
        replace_emojis,
        reduce_repeated_characters,
        remove_punctuation,
        normalize_whitespace,
    )
    cleaned = text.strip().lower()
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned
|