Spaces:
Sleeping
Sleeping
# --- Imports ----------------------------------------------------------------
# Standard library
import re
import string

# Third-party: NLTK (tokenization / stop-word lists) and Qalsadi (Arabic
# morphological analysis and lemmatization).
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import qalsadi.analex as qa
import qalsadi.lemmatizer

# Download required NLTK resources (no-op if they are already installed).
nltk.download('punkt')
nltk.download('stopwords')
class TextPreprocessor:
    """Clean and normalize Arabic text for downstream NLP tasks.

    The pipeline removes punctuation, Latin-letter words, digits and
    diacritics, collapses whitespace, folds Arabic letter variants to a
    canonical form, strips common definite-article prefixes, and can
    optionally remove stop words.
    """

    # Compiled once at class-creation time (not per call) and usable even on
    # a bare instance.  Matches the harakat (short-vowel marks) and tatwil.
    _DIACRITICS_RE = re.compile(
        r"""
        ّ    | # Tashdid
        َ    | # Fatha
        ً    | # Tanwin Fath
        ُ    | # Damma
        ٌ    | # Tanwin Damm
        ِ    | # Kasra
        ٍ    | # Tanwin Kasr
        ْ    | # Sukun
        ـ     # Tatwil/Kashida
        """,
        re.VERBOSE,
    )

    def __init__(self):
        # NLTK's Arabic stop-word list; requires the 'stopwords' corpus.
        self.stop_words = set(stopwords.words('arabic'))
        self.arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        self.english_punctuations = string.punctuation
        self.punctuations_list = self.arabic_punctuations + self.english_punctuations

    def remove_punctuations(self, text):
        """Strip every Arabic and English punctuation character in one pass."""
        translator = str.maketrans('', '', self.punctuations_list)
        return text.translate(translator)

    def remove_english(self, text):
        """Delete whole words made of Latin letters (a-z, A-Z)."""
        return re.sub(r'\b[a-zA-Z]+\b', '', text)

    def remove_digits(self, text):
        """Remove all decimal digits (Western 0-9, Arabic-Indic ٠-٩, ...).

        Uses the Unicode-aware ``\\d`` class, which covers every decimal-digit
        script in a single pass.  (The original per-script character class
        accidentally used the Persian '۲' U+06F2 instead of the Arabic '٢'
        U+0662, so '٢' was never removed.)
        """
        return re.sub(r'\d+', '', text)

    def remove_diacritics(self, text):
        """Strip harakat (short-vowel marks) and the tatwil character."""
        return self._DIACRITICS_RE.sub('', text)

    def remove_extra_whitespaces(self, text):
        """Trim the ends and collapse internal whitespace runs to one space."""
        return re.sub(r"\s+", ' ', text.strip())

    def text_normalize(self, text):
        """Fold Arabic letter variants to a canonical form.

        Hamza-carrying alefs become plain alef, alef maqsura becomes ya,
        waw/ya hamzas become bare hamza, ta marbuta becomes ha, and
        Persian/extended letters are mapped to their Arabic counterparts.

        Fix: the original alef class also contained the tanwin-fath mark
        'ً', which wrongly turned tanwin into an extra alef; tanwin removal
        belongs to remove_diacritics, so the mark is left untouched here.
        """
        text = re.sub("[إأآ]", "ا", text)
        text = re.sub("ى", "ي", text)
        text = re.sub("[ؤئ۽]", "ء", text)
        text = re.sub("ة", "ه", text)
        text = re.sub("[ڱګگݣڪ]", "ك", text)
        text = re.sub("ڤ", "ف", text)
        text = re.sub("چ", "ج", text)
        text = re.sub("[ژڒ]", "ز", text)
        text = re.sub("ٺ", "ت", text)
        text = re.sub("پ", "ب", text)  # original repeated this sub twice
        return text

    def remove_stop_words(self, text):
        """Drop NLTK Arabic stop words; return the remaining tokens joined."""
        tokens = word_tokenize(text)
        kept = [word for word in tokens if word.lower() not in self.stop_words]
        return ' '.join(kept)

    def remove_arabic_prefixes(self, text):
        """Strip common word-initial articles (ال, وال, لل, بال).

        Order matters: stripping word-initial 'ال' first lets a leading
        'الوال...' fall through to the 'وال' rule, matching the original
        behavior exactly.
        """
        text = re.sub(r"\bال", '', text)
        text = re.sub(r"\bوال", '', text)
        text = re.sub(r"\bلل", '', text)
        text = re.sub(r"\bبال", '', text)
        text = re.sub("الا", "ا", text)
        return text

    def tokenize(self, text):
        """Tokenize with NLTK; coerces non-string input (e.g. NaN) first."""
        return word_tokenize(str(text))

    def preprocess(self, text):
        """Run the full cleaning pipeline on a raw string and return it."""
        text = self.remove_punctuations(text)
        text = self.remove_english(text)
        text = self.remove_digits(text)
        text = self.remove_diacritics(text)
        text = self.remove_extra_whitespaces(text)
        text = self.text_normalize(text)
        text = self.remove_arabic_prefixes(text)
        # text = self.remove_stop_words(text)  # optional step, kept disabled
        return text