import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


class TextPreprocessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('arabic'))
        self.arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        self.english_punctuations = string.punctuation
        self.punctuations_list = self.arabic_punctuations + self.english_punctuations

    def remove_punctuations(self, text):
        # Strip Arabic and English punctuation in a single pass
        translator = str.maketrans('', '', self.punctuations_list)
        return text.translate(translator)

    def remove_english(self, text):
        # Drop standalone Latin-script words
        english_pattern = re.compile(r'\b[a-zA-Z]+\b')
        return english_pattern.sub('', text)

    def remove_digits(self, text):
        text = re.sub(r'[0-9]+', '', text)   # Western (ASCII) digits
        text = re.sub(r'[٠-٩]+', '', text)   # Arabic-Indic digits
        text = re.sub(r'[۰-۹]+', '', text)   # Extended (Persian) Arabic-Indic digits
        return text

    def remove_diacritics(self, text):
        pattern = re.compile(r"""
            ّ  | # Tashdid
            َ  | # Fatha
            ً  | # Tanwin Fath
            ُ  | # Damma
            ٌ  | # Tanwin Damm
            ِ  | # Kasra
            ٍ  | # Tanwin Kasr
            ْ  | # Sukun
            ـ    # Tatwil/Kashida
        """, re.VERBOSE)
        return pattern.sub('', text)

    def remove_extra_whitespaces(self, text):
        # Collapse runs of whitespace and trim both ends
        return re.sub(r"\s+", ' ', text.strip())

    def text_normalize(self, text):
        # Map common Arabic letter variants to a canonical form
        text = re.sub("[إأآ]", "ا", text)  # alef variants -> bare alef
        text = re.sub("ى", "ي", text)
        text = re.sub("ؤ", "ء", text)
        text = re.sub("ئ", "ء", text)
        text = re.sub("۽", "ء", text)
        text = re.sub("ة", "ه", text)
        text = re.sub("[ڱګگݣڪ]", "ك", text)
        text = re.sub("ڤ", "ف", text)
        text = re.sub("چ", "ج", text)
        text = re.sub("ژ", "ز", text)
        text = re.sub("ڒ", "ز", text)
        text = re.sub("ٺ", "ت", text)
        text = re.sub("پ", "ب", text)
        # text = re.sub("ه", "ة", text)
        return text

    def remove_stop_words(self, text):
        tokens = word_tokenize(text)
        filtered_tokens = [word for word in tokens if word not in self.stop_words]
        return ' '.join(filtered_tokens)

    def remove_arabic_prefixes(self, text):
        # Strip common definite-article and preposition prefixes at word starts
        text = re.sub(r"\bال", '', text)
        text = re.sub(r"\bوال", '', text)
        text = re.sub(r"\bلل", '', text)
        text = re.sub(r"\bبال", '', text)
        text = re.sub("الا", "ا", text)
        return text

    def tokenize(self, text):
        # Cast to str to guard against non-string values (e.g. NaN)
        return word_tokenize(str(text))

    def preprocess(self, text):
        # Chain all preprocessing steps together
        text = self.remove_punctuations(text)
        text = self.remove_english(text)
        text = self.remove_digits(text)
        text = self.remove_diacritics(text)
        text = self.remove_extra_whitespaces(text)
        text = self.text_normalize(text)
        text = self.remove_arabic_prefixes(text)
        # text = self.remove_stop_words(text)
        return text
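

# Minimal usage sketch. The sample sentence and the printed expectations are
# illustrative assumptions, not part of the original module; only methods
# defined above are used.
if __name__ == "__main__":
    preprocessor = TextPreprocessor()
    sample = "مرحباً بكم في عالمِ الذكاءِ الاصطناعي 123 hello!"
    cleaned = preprocessor.preprocess(sample)
    print(cleaned)                          # punctuation, diacritics, digits, and Latin words removed
    print(preprocessor.tokenize(cleaned))   # NLTK word tokens of the cleaned text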