Spaces:
Sleeping
Sleeping
| import re | |
| import emoji | |
def clean_arabic_text(text):
    """Return a cleaned, orthographically normalized copy of *text*.

    The pipeline, applied in this order:

    1. Falsy input (``None``, ``""``, ``0``, empty containers, ...) yields
       ``""`` immediately; anything else is coerced with ``str()``.
    2. URL-like tokens (starting with ``http``/``https``/``www``) and
       ``@mentions`` are deleted.
    3. Emoji are replaced by their textual names via ``emoji.demojize``
       (e.g. the "tears of joy" face becomes ``:face_with_tears_of_joy:``).
    4. Arabic letter variants are unified: hamza/madda forms of Alif become
       bare Alif, Alif Maqsura becomes Yaa, Ta-Marbuta becomes Haa, and the
       Tatweel (kashida) elongation character is dropped.
    5. Runs of whitespace collapse to a single space and the ends are
       trimmed.

    Args:
        text: The value to clean; non-string values are stringified.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    # Guard clause: nothing to clean for falsy input.
    if not text:
        return ""

    cleaned = str(text)

    # Links go first, then user mentions.
    cleaned = re.sub(r"http\S+|www\S+|https\S+", "", cleaned, flags=re.MULTILINE)
    cleaned = re.sub(r"@\w+", "", cleaned)

    # Turn emoji into their ":name:" text form so they survive as tokens.
    cleaned = emoji.demojize(cleaned)

    # (pattern, replacement) pairs for orthographic normalization, applied
    # sequentially in the order listed.
    orthographic_rules = (
        (r"[أإآ]", "ا"),  # Alif variants -> bare Alif
        (r"ى", "ي"),  # Alif Maqsura -> Yaa
        (r"ة", "ه"),  # Ta-Marbuta -> Haa
        (r"ـ", ""),  # Tatweel removed
    )
    for pattern, replacement in orthographic_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Squeeze whitespace runs and trim both ends.
    return re.sub(r"\s+", " ", cleaned).strip()