# Arabic text preprocessing utilities (TextPreprocessor).
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import qalsadi.analex as qa
import qalsadi.lemmatizer
# Download required NLTK resources at import time: 'punkt' (tokenizer models
# used by word_tokenize) and 'stopwords' (corpus read in TextPreprocessor).
# nltk.download is a no-op once the data is already cached locally.
nltk.download('punkt')
nltk.download('stopwords')
class TextPreprocessor:
    """Clean and normalize Arabic text.

    Provides composable steps (punctuation/diacritic/digit removal,
    orthographic normalization, prefix stripping) and a ``preprocess``
    method that chains them in a fixed order.
    """

    def __init__(self):
        # Arabic stop-word list from NLTK (requires the 'stopwords' corpus).
        self.stop_words = set(stopwords.words('arabic'))
        # Arabic-specific punctuation; duplicates (e.g. tatweel) are harmless
        # because str.maketrans deduplicates the deletion set.
        self.arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        self.english_punctuations = string.punctuation
        self.punctuations_list = self.arabic_punctuations + self.english_punctuations

    def remove_punctuations(self, text):
        """Strip every Arabic and ASCII punctuation character from *text*."""
        # One C-level pass via str.translate instead of chained .replace calls.
        translator = str.maketrans('', '', self.punctuations_list)
        return text.translate(translator)

    def remove_english(self, text):
        """Remove whole English (Latin-alphabet) words; surrounding
        whitespace is left in place."""
        return re.sub(r'\b[a-zA-Z]+\b', '', text)

    def remove_digits(self, text):
        """Remove Western (0-9), Arabic-Indic (٠-٩) and Extended
        Arabic-Indic / Persian (۰-۹) digits.

        Fix: the original enumerated digits one by one, mixing a Persian ۲
        (U+06F2) into the set and omitting Arabic-Indic ٢ (U+0662), so some
        digits survived; full Unicode ranges cover both scripts.
        """
        return re.sub(r'[0-9٠-٩۰-۹]+', '', text)

    def remove_diacritics(self, text):
        """Strip Arabic diacritical marks (tashkeel) and tatweel/kashida."""
        pattern = re.compile(r"""
            ّ | # Tashdid
            َ | # Fatha
            ً | # Tanwin Fath
            ُ | # Damma
            ٌ | # Tanwin Damm
            ِ | # Kasra
            ٍ | # Tanwin Kasr
            ْ | # Sukun
            ـ   # Tatwil/Kashida
        """, re.VERBOSE)
        return pattern.sub('', text)

    def remove_extra_whitespaces(self, text):
        """Collapse whitespace runs to single spaces and trim both ends."""
        return re.sub(r"\s+", ' ', text.strip())

    def text_normalize(self, text):
        """Normalize Arabic orthographic variants to canonical letters
        (hamza'd alefs -> ا, ى -> ي, hamza carriers -> ء, ة -> ه, and
        common Persian/regional letter forms to Arabic equivalents)."""
        # Fix: the original class "[إأآااً]" also matched a bare tanwin fath
        # (ً) and replaced it with alef, corrupting standalone diacritics;
        # only alef variants belong here (diacritics are handled separately).
        text = re.sub("[إأآ]", "ا", text)
        text = re.sub("ى", "ي", text)
        text = re.sub("[ؤئ۽]", "ء", text)
        text = re.sub("ة", "ه", text)
        text = re.sub("[ڱګگݣڪ]", "ك", text)
        text = re.sub("ڤ", "ف", text)
        text = re.sub("چ", "ج", text)
        text = re.sub("[ژڒ]", "ز", text)
        text = re.sub("ٺ", "ت", text)
        # Fix: this substitution was duplicated in the original; once suffices.
        text = re.sub("پ", "ب", text)
        return text

    def remove_stop_words(self, text):
        """Drop Arabic stop words (tokenized with NLTK) and re-join with
        single spaces. Requires self.stop_words."""
        tokens = word_tokenize(text)
        # .lower() is a no-op for Arabic script but harmless.
        filtered_tokens = [word for word in tokens if word.lower() not in self.stop_words]
        return ' '.join(filtered_tokens)

    def remove_arabic_prefixes(self, text):
        """Strip common Arabic definite-article prefixes (ال/وال/لل/بال) at
        word starts, then collapse any remaining 'الا' to 'ا'."""
        text = re.sub(r"\bال", '', text)
        text = re.sub(r"\bوال", '', text)
        text = re.sub(r"\bلل", '', text)
        text = re.sub(r"\bبال", '', text)
        text = re.sub("الا", "ا", text)
        return text

    def tokenize(self, text):
        """Tokenize *text* into words; coerces non-string input (e.g. NaN
        from pandas) to str first."""
        return word_tokenize(str(text))

    def preprocess(self, text):
        """Run the full cleaning pipeline in order: punctuation, English
        words, digits, diacritics, whitespace, normalization, prefixes.
        Stop-word removal is intentionally disabled (kept for reference)."""
        text = self.remove_punctuations(text)
        text = self.remove_english(text)
        text = self.remove_digits(text)
        text = self.remove_diacritics(text)
        text = self.remove_extra_whitespaces(text)
        text = self.text_normalize(text)
        text = self.remove_arabic_prefixes(text)
        # text = self.remove_stop_words(text)
        return text
|