rana811 committed on
Commit
e6fc24e
·
verified ·
1 Parent(s): 0037c19

Create preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +31 -0
preprocess.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import emoji
3
+
4
# Patterns and the character table are built once at import time so that
# repeated calls (e.g. over every row of a dataset) do no setup work.
# `http\S+` already covers `https...`, so no separate alternative is needed.
_URL_RE = re.compile(r"http\S+|www\S+")
_MENTION_RE = re.compile(r"@\w+")
_WHITESPACE_RE = re.compile(r"\s+")
_ARABIC_NORMALIZATION = str.maketrans(
    {
        # Alif variants -> bare Alif
        "أ": "ا",
        "إ": "ا",
        "آ": "ا",
        # Alif Maqsura -> Yaa
        "ى": "ي",
        # Ta-Marbuta -> Haa
        "ة": "ه",
        # Tatweel (elongation character) is dropped entirely
        "ـ": None,
    }
)


def clean_arabic_text(text) -> str:
    """Return a cleaned, orthographically normalized copy of *text*.

    Steps, in order:
      1. Convert the input to ``str``.
      2. Remove URLs (tokens starting with ``http`` or ``www``) and
         ``@mentions``.
      3. Replace emoji with their textual aliases via ``emoji.demojize``.
      4. Normalize Alif variants, Alif Maqsura and Ta-Marbuta, and strip
         Tatweel.
      5. Collapse runs of whitespace to a single space and trim the ends.

    Args:
        text: The value to clean. Non-string values are stringified.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy (``None``, ``""``,
        ``0``, ...) or is a float NaN.
    """
    if not text:
        return ""

    # NaN is truthy, so without this guard a missing value would be
    # stringified and returned as the literal text "nan".
    # `x != x` is true only for NaN.
    if isinstance(text, float) and text != text:
        return ""

    # 1. Convert to string
    text = str(text)

    # 2. Remove URLs and mentions
    text = _URL_RE.sub("", text)
    text = _MENTION_RE.sub("", text)

    # 3. Demojize (convert 😂 to :face_with_tears_of_joy:)
    text = emoji.demojize(text)

    # 4. Orthographic normalization in a single pass; the replacement
    #    characters are never themselves keys of the table, so this matches
    #    applying the substitutions one after another.
    text = text.translate(_ARABIC_NORMALIZATION)

    # 5. Remove extra whitespace
    return _WHITESPACE_RE.sub(" ", text).strip()