File size: 1,992 Bytes
37c48d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import re

import emoji

_CHAR_MAP = str.maketrans({
    "أ": "ا",  # noqa: RUF001
    "إ": "ا",  # noqa: RUF001
    "آ": "ا",  # noqa: RUF001
    "ى": "ي",
    "ة": "ه",  # noqa: RUF001
    "ؤ": "و",
    "ئ": "ي",
    "ـ": "",
})


# Patterns
_DIACRITICS_PATTERN = re.compile(r"[\u064B-\u0652]")
_REPEATED_CHAR_PATTERN = re.compile(r"(.)\1+")
_URL_PATTERN = re.compile(
    r"""(?xi)
    \b
    (?:https?:\/\/)?               # optional http or https
    (?:www\.)?                     # optional www.
    [a-z0-9\-._~%]+                # domain or subdomain
    \.
    [a-z]{2,}                      # TLD
    (?:[\/?#][^\s]*)?              # optional path/query
    \b
    """,
)

_PHONE_PATTERN = re.compile(r"\b\d{8,15}\b")
_PUNCTUATION_PATTERN = re.compile(r"[^\w\s<>]")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def normalize_arabic_letters(text: str) -> str:
    return text.translate(_CHAR_MAP)


def remove_diacritics(text: str) -> str:
    return _DIACRITICS_PATTERN.sub("", text)


def reduce_repeated_characters(text: str) -> str:
    return _REPEATED_CHAR_PATTERN.sub(r"\1", text)


def replace_urls(text: str) -> str:
    return _URL_PATTERN.sub(" <url> ", text)


def replace_phone_numbers(text: str) -> str:
    text = re.sub(r"(?<=\d)\s+(?=\d)", "", text)
    return _PHONE_PATTERN.sub(" <phone> ", text)


def replace_emojis(text: str) -> str:
    return emoji.replace_emoji(text, replace=" <emoji> ")


def remove_punctuation(text: str) -> str:
    return _PUNCTUATION_PATTERN.sub("", text)


def normalize_whitespace(text: str) -> str:
    return _WHITESPACE_PATTERN.sub(" ", text).strip()


def clean_text(text: str) -> str:
    text = text.strip().lower()
    text = normalize_arabic_letters(text)
    text = remove_diacritics(text)
    text = replace_urls(text)
    text = replace_phone_numbers(text)
    text = replace_emojis(text)
    text = reduce_repeated_characters(text)
    text = remove_punctuation(text)

    return normalize_whitespace(text)