Spaces:

rana811
/

NLP_project2.0

Sleeping

rana811 committed on Jan 4

Commit

e6fc24e

verified ·

1 Parent(s): 0037c19

Create preprocess.py

Files changed (1) hide show

preprocess.py ADDED Viewed

+import re
+import emoji
+def clean_arabic_text(text):
+    if not text:
+        return ""
+    # 1. Convert to String
+    text = str(text)
+    # 2. Remove URLs and Mentions
+    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)
+    text = re.sub(r"@\w+", "", text)
+    # 3. Demojize (Convert 😂 to :face_with_tears_of_joy:)
+    text = emoji.demojize(text)
+    # 4. Orthographic Normalization
+    # Normalize Alif (أ, إ, آ -> ا)
+    text = re.sub(r"[أإآ]", "ا", text)
+    # Normalize Yaa (ى -> ي)
+    text = re.sub(r"ى", "ي", text)
+    # Normalize Ta-Marbuta (ة -> ه)
+    text = re.sub(r"ة", "ه", text)
+    # Remove Tatweel (ـ)
+    text = re.sub(r"ـ", "", text)
+    # 5. Remove Extra Whitespace
+    text = re.sub(r"\s+", " ", text).strip()
+    return text