File size: 3,690 Bytes
ffc124d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8aa048a
ffc124d
8aa048a
ffc124d
 
8aa048a
 
 
 
 
 
ffc124d
 
 
 
 
 
 
 
8aa048a
 
 
 
 
 
 
 
ffc124d
 
 
 
 
 
 
 
 
 
 
 
8aa048a
406f3af
ffc124d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import qalsadi.analex as qa
import qalsadi.lemmatizer

# Fetch the NLTK resources used below: the 'punkt' tokenizer models (for
# word_tokenize) and the stop-word corpora (for the Arabic list).
# quiet=True suppresses the per-import download log; already-present
# resources are skipped either way.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

class TextPreprocessor:
    """Cleaning/normalization pipeline for Arabic text.

    Strips punctuation, English words, digits, diacritics and extra
    whitespace, then normalizes Arabic letter variants and common
    definite-article prefixes. Use preprocess() to run the full chain.
    """

    def __init__(self):
        # Arabic stop-word list from NLTK (requires the 'stopwords' corpus
        # to have been downloaded at module import above).
        self.stop_words = set(stopwords.words('arabic'))
        self.arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        self.english_punctuations = string.punctuation
        self.punctuations_list = self.arabic_punctuations + self.english_punctuations

    def remove_punctuations(self, text):
        """Delete every Arabic and English punctuation character from *text*."""
        # str.translate with a deletion table is one C-level pass.
        translator = str.maketrans('', '', self.punctuations_list)
        return text.translate(translator)

    def remove_english(self, text):
        """Remove whole words made of ASCII letters; other text is untouched."""
        english_pattern = re.compile(r'\b[a-zA-Z]+\b')
        return english_pattern.sub('', text)

    def remove_digits(self, text):
        """Remove ASCII, Arabic-Indic (٠-٩) and Extended Arabic-Indic (۰-۹) digits.

        Bug fix: the original class listed the Persian digit ۲ (U+06F2)
        instead of the Arabic ٢ (U+0662), so Arabic 'two' was never removed.
        Full code-point ranges now cover both digit sets.
        """
        text = re.sub(r'[0-9]+', '', text)      # ASCII digits
        text = re.sub(r'[٠-٩۰-۹]+', '', text)   # U+0660-0669 and U+06F0-06F9
        return text

    def remove_diacritics(self, text):
        """Strip Arabic diacritics (harakat) and the tatweel/kashida filler."""
        pattern = re.compile(r"""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
        return pattern.sub('', text)

    def remove_extra_whitespaces(self, text):
        """Trim the ends and collapse internal whitespace runs to one space."""
        return re.sub(r"\s+", ' ', text.strip())

    def text_normalize(self, text):
        """Normalize Arabic letter variants to one canonical form.

        Fixes vs. the original: the alef class contained a stray tanwin
        fath (ً) which would have been wrongly replaced by a bare alef
        (diacritic removal belongs to remove_diacritics); a duplicated
        پ→ب substitution and a commented-out ه→ة line were dropped.
        """
        text = re.sub("[إأآ]", "ا", text)      # hamza-carrying alefs → bare alef
        text = re.sub("ى", "ي", text)          # alef maqsura → ya
        text = re.sub("[ؤئ۽]", "ء", text)      # hamza carriers → bare hamza
        text = re.sub("ة", "ه", text)          # ta marbuta → ha
        text = re.sub("[ڱګگݣڪ]", "ك", text)    # kaf variants → kaf
        text = re.sub("ڤ", "ف", text)
        text = re.sub("چ", "ج", text)
        text = re.sub("[ژڒ]", "ز", text)
        text = re.sub("ٺ", "ت", text)
        text = re.sub("پ", "ب", text)
        return text

    def remove_stop_words(self, text):
        """Drop tokens found in the Arabic stop-word list and rejoin with spaces."""
        tokens = word_tokenize(text)
        # .lower() is a no-op for Arabic but keeps any Latin tokens case-insensitive.
        filtered_tokens = [word for word in tokens if word.lower() not in self.stop_words]
        return ' '.join(filtered_tokens)

    def remove_arabic_prefixes(self, text):
        """Strip common definite-article prefixes (ال, وال, لل, بال) at word start.

        NOTE(review): this is a crude heuristic — it also truncates words
        whose root genuinely begins with these letters.
        """
        text = re.sub(r"\bال", '', text)
        text = re.sub(r"\bوال", '', text)
        text = re.sub(r"\bلل", '', text)
        text = re.sub(r"\bبال", '', text)
        text = re.sub("الا", "ا", text)
        return text

    def tokenize(self, text):
        """Tokenize into words; str() guards against non-string input (e.g. NaN)."""
        return word_tokenize(str(text))

    def preprocess(self, text):
        """Run the full cleaning chain (stop-word removal is intentionally off)."""
        text = self.remove_punctuations(text)
        text = self.remove_english(text)
        text = self.remove_digits(text)
        text = self.remove_diacritics(text)
        text = self.remove_extra_whitespaces(text)
        text = self.text_normalize(text)
        text = self.remove_arabic_prefixes(text)
        # text = self.remove_stop_words(text)
        return text