import re

import emoji


def clean_arabic_text(text):
    """Return a cleaned, orthographically normalized copy of *text*.

    Steps, applied in this order:

    1. Any falsy input (``None``, ``""``, ``0``, ...) short-circuits to ``""``;
       this check runs before the value is converted with ``str()``.
    2. URLs (tokens starting with ``http``/``www``) and ``@mentions`` are
       removed.
    3. Emoji are passed through ``emoji.demojize`` (e.g. 😂 becomes
       ``:face_with_tears_of_joy:``).
    4. Arabic letters are normalized: Alif variants (أ, إ, آ) become ا,
       Alif Maqsura (ى) becomes ي, Ta-Marbuta (ة) becomes ه, and Tatweel (ـ)
       is dropped.
    5. Runs of whitespace collapse to a single space and the ends are
       stripped.

    Args:
        text: The value to clean; non-string values are stringified.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""

    cleaned = str(text)

    # Noise removal: (pattern, flags) pairs, applied in order.
    noise_rules = (
        (r"http\S+|www\S+|https\S+", re.MULTILINE),  # URLs
        (r"@\w+", 0),  # @mentions
    )
    for pattern, flags in noise_rules:
        cleaned = re.sub(pattern, "", cleaned, flags=flags)

    # Replace emoji with their textual names.
    cleaned = emoji.demojize(cleaned)

    # Orthographic normalization: (pattern, replacement) pairs, in order.
    letter_rules = (
        (r"[أإآ]", "ا"),  # Alif variants -> bare Alif
        (r"ى", "ي"),  # Alif Maqsura -> Yaa
        (r"ة", "ه"),  # Ta-Marbuta -> Haa
        (r"ـ", ""),  # Tatweel (elongation mark) removed
    )
    for pattern, replacement in letter_rules:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Collapse whitespace runs and trim both ends.
    return re.sub(r"\s+", " ", cleaned).strip()