# arabic_text_cleaner_app / text_preprocessor.py
# Uploaded by fatttty — "Update text_preprocessor.py", commit 8aa048a (verified)
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import qalsadi.analex as qa
import qalsadi.lemmatizer
# Download required NLTK resources at import time:
#   - 'punkt': tokenizer models backing word_tokenize
#   - 'stopwords': corpus providing the Arabic stop-word list used in __init__
# NOTE(review): this runs on every import of the module; downloads are cached
# locally but still emit status output — consider nltk.download(..., quiet=True)
# or a LookupError guard around nltk.data.find().
nltk.download('punkt')
nltk.download('stopwords')
class TextPreprocessor:
    """Pipeline of cleaning steps for Arabic text.

    The steps cover punctuation stripping, removal of embedded English
    words and digits, diacritic removal, whitespace collapsing,
    orthographic normalization (alef / yaa / taa-marbuta / kaf variants)
    and removal of common definite-article prefixes.  ``preprocess``
    chains them in a fixed order; each step is also usable standalone.
    """

    # Standalone runs of ASCII letters (used by remove_english).
    _ENGLISH_RE = re.compile(r'\b[a-zA-Z]+\b')

    # ASCII digits plus the full Arabic-Indic (U+0660-0669) and Extended
    # Arabic-Indic / Persian (U+06F0-06F9) digit ranges.
    # FIX: the previous class listed digits individually and mixed in the
    # Persian '۲' (U+06F2) while omitting the Arabic '٢' (U+0662), so the
    # Arabic digit two was never removed.
    _DIGITS_RE = re.compile(r'[0-9\u0660-\u0669\u06F0-\u06F9]+')

    # Arabic diacritics — shadda, fatha, tanwin fath, damma, tanwin damm,
    # kasra, tanwin kasr, sukun — plus the tatwil/kashida filler (U+0640).
    _DIACRITICS_RE = re.compile(
        r'[\u0651\u064E\u064B\u064F\u064C\u0650\u064D\u0652\u0640]'
    )

    # Hamza-carrying / madda alef forms collapsed to bare alef.
    # FIX: the previous character class accidentally contained the tanwin
    # fath combining mark, which silently turned 'ً' into an extra alef.
    _ALEF_RE = re.compile('[إأآ]')

    # One-to-one character normalizations, applied in a single
    # str.translate pass instead of fourteen chained re.sub calls
    # (the duplicated پ→ب substitution is also gone).
    _CHAR_MAP = str.maketrans({
        'ى': 'ي',
        'ؤ': 'ء', 'ئ': 'ء', '۽': 'ء',
        'ة': 'ه',
        'ڱ': 'ك', 'ګ': 'ك', 'گ': 'ك', 'ݣ': 'ك', 'ڪ': 'ك',
        'ڤ': 'ف',
        'چ': 'ج',
        'ژ': 'ز', 'ڒ': 'ز',
        'ٺ': 'ت',
        'پ': 'ب',
    })

    def __init__(self):
        # Arabic stop-word list from NLTK (requires the 'stopwords' corpus).
        self.stop_words = set(stopwords.words('arabic'))
        # Arabic punctuation not covered by string.punctuation.
        self.arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        self.english_punctuations = string.punctuation
        self.punctuations_list = self.arabic_punctuations + self.english_punctuations

    def remove_punctuations(self, text):
        """Strip every Arabic and English punctuation character."""
        return text.translate(str.maketrans('', '', self.punctuations_list))

    def remove_english(self, text):
        """Delete standalone English (ASCII-letter) words."""
        return self._ENGLISH_RE.sub('', text)

    def remove_digits(self, text):
        """Delete ASCII, Arabic-Indic and Persian digit runs."""
        return self._DIGITS_RE.sub('', text)

    def remove_diacritics(self, text):
        """Delete Arabic diacritical marks and the tatwil filler."""
        return self._DIACRITICS_RE.sub('', text)

    def remove_extra_whitespaces(self, text):
        """Trim the ends and collapse internal whitespace runs to one space."""
        return re.sub(r'\s+', ' ', text.strip())

    def text_normalize(self, text):
        """Collapse orthographic variants to canonical Arabic letters."""
        text = self._ALEF_RE.sub('ا', text)
        return text.translate(self._CHAR_MAP)

    def remove_stop_words(self, text):
        """Drop NLTK Arabic stop words; returns the remaining tokens joined."""
        tokens = word_tokenize(text)
        kept = [tok for tok in tokens if tok.lower() not in self.stop_words]
        return ' '.join(kept)

    def remove_arabic_prefixes(self, text):
        """Strip common article prefixes (ال، وال، لل، بال) at word starts.

        Order matters: 'ال' is removed first; 'وال'/'بال' then match words
        where the leading و/ب prevented a word boundary before 'ال'.
        """
        text = re.sub(r"\bال", '', text)
        text = re.sub(r"\bوال", '', text)
        text = re.sub(r"\bلل", '', text)
        text = re.sub(r"\bبال", '', text)
        # Collapse a leftover internal 'الا' sequence.
        return re.sub("الا", "ا", text)

    def tokenize(self, text):
        """Return NLTK word tokens; coerces non-string input (e.g. NaN) to str."""
        return word_tokenize(str(text))

    def preprocess(self, text):
        """Run the full cleaning pipeline (stop-word removal disabled)."""
        text = self.remove_punctuations(text)
        text = self.remove_english(text)
        text = self.remove_digits(text)
        text = self.remove_diacritics(text)
        text = self.remove_extra_whitespaces(text)
        text = self.text_normalize(text)
        text = self.remove_arabic_prefixes(text)
        # text = self.remove_stop_words(text)  # optionally re-enable
        return text