NLP_project2.0 / preprocess.py
rana811's picture
Create preprocess.py
e6fc24e verified
raw
history blame contribute delete
822 Bytes
import re
import emoji
# Patterns are compiled once at import time so repeated calls skip the
# per-call pattern lookup.
# "http\S+" already matches every token starting with "https", so a separate
# "https\S+" alternative would never be reached.
_URL_RE = re.compile(r"http\S+|www\S+")
_MENTION_RE = re.compile(r"@\w+")
_WHITESPACE_RE = re.compile(r"\s+")

# Orthographic normalization applied in a single pass with str.translate.
_ARABIC_NORMALIZATION = str.maketrans(
    {
        "\u0623": "\u0627",  # Alif with Hamza above -> bare Alif
        "\u0625": "\u0627",  # Alif with Hamza below -> bare Alif
        "\u0622": "\u0627",  # Alif with Madda       -> bare Alif
        "\u0649": "\u064A",  # Alif Maqsura          -> Yaa
        "\u0629": "\u0647",  # Ta-Marbuta            -> Haa
        "\u0640": None,      # Tatweel (elongation)  -> removed
    }
)


def clean_arabic_text(text: object) -> str:
    """Clean and normalize a piece of Arabic text.

    Steps, in order:
      1. Convert the input to ``str``.
      2. Remove URLs (tokens starting with ``http`` or ``www``) and
         ``@mentions``.
      3. Replace emoji with their textual names via ``emoji.demojize``
         (e.g. 😂 becomes ``:face_with_tears_of_joy:``).
      4. Normalize orthography: Alif variants to bare Alif, Alif Maqsura to
         Yaa, Ta-Marbuta to Haa, and drop Tatweel.
      5. Collapse whitespace runs to a single space and strip both ends.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``.

    Returns:
        The cleaned string. Falsy input (``None``, ``""``, ``0``, ...) and
        float NaN yield ``""``.
    """
    if not text:
        return ""
    # NaN is truthy, so without this guard a missing value stored as a float
    # NaN would be stringified into the literal token "nan". NaN is the only
    # float that compares unequal to itself.
    if isinstance(text, float) and text != text:
        return ""

    # 1. Convert to string
    cleaned = str(text)

    # 2. Remove URLs and mentions
    cleaned = _URL_RE.sub("", cleaned)
    cleaned = _MENTION_RE.sub("", cleaned)

    # 3. Demojize (convert 😂 to :face_with_tears_of_joy:)
    cleaned = emoji.demojize(cleaned)

    # 4. Orthographic normalization (Alif, Yaa, Ta-Marbuta, Tatweel)
    cleaned = cleaned.translate(_ARABIC_NORMALIZATION)

    # 5. Remove extra whitespace
    return _WHITESPACE_RE.sub(" ", cleaned).strip()