Spaces:

fatttty
/

arabic_text_cleaner_app

Sleeping

fatttty committed on Sep 21, 2024

Commit

8aa048a

verified ·

1 Parent(s): 68c82ec

Update text_preprocessor.py

Files changed (1) hide show

text_preprocessor.py CHANGED Viewed

@@ -56,9 +56,17 @@ class TextPreprocessor:
         text = re.sub("ى", "ي", text)
         text = re.sub("ؤ", "ء", text)
         text = re.sub("ئ", "ء", text)
         text = re.sub("ة", "ه", text)
         text = re.sub("ڤ", "ف", text)
         text = re.sub("چ", "ج", text)
         return text
     def remove_stop_words(self, text):
@@ -67,6 +75,14 @@ class TextPreprocessor:
         filtered_text = ' '.join(filtered_tokens)
         return filtered_text
     def tokenize(self, text):
         """Return the tokens produced by ``word_tokenize`` for ``str(text)``.
         Non-string input (e.g. a float NaN) is tokenized by its string form.
         """
         return word_tokenize(str(text))
@@ -79,5 +95,6 @@ class TextPreprocessor:
         text = self.remove_diacritics(text)
         text = self.remove_extra_whitespaces(text)
         text = self.text_normalize(text)
         # text = self.remove_stop_words(text)
         return text

         text = re.sub("ى", "ي", text)
         text = re.sub("ؤ", "ء", text)
         text = re.sub("ئ", "ء", text)
+        text = re.sub("۽", "ء", text)
         text = re.sub("ة", "ه", text)
+        text = re.sub("[ڱګگݣڪ]", "ك", text)
         text = re.sub("ڤ", "ف", text)
         text = re.sub("چ", "ج", text)
+        text = re.sub("ژ", "ز", text)
+        text = re.sub("ڒ", "ز", text)
+        text = re.sub("ٺ", "ت", text)
+        text = re.sub("پ", "ب", text)
+    #    text = re.sub("ه", "ة", text)
+        text = re.sub("پ", "ب", text)
         return text
     def remove_stop_words(self, text):
         filtered_text = ' '.join(filtered_tokens)
         return filtered_text
+    def remove_arabic_prefixes(self,text):
+        # Delete the prefixes "ال", "وال", "لل" and "بال" wherever they begin a
+        # word, then rewrite "الا" as "ا"; returns the resulting string.
+        # The substitutions run in sequence on the same string, so a word whose
+        # remainder again starts with a later pattern is stripped a second time.
+        text = re.sub(r"\bال", '', text)
+        text = re.sub(r"\bوال", '', text)
+        text = re.sub(r"\bلل", '', text)
+        text = re.sub(r"\bبال", '', text)
+        # NOTE(review): unlike the patterns above, this one has no \b anchor, so
+        # it also matches in the middle of a word - confirm that is intended.
+        text = re.sub("الا", "ا",  text)
+        return text
     def tokenize(self, text):
         """Return the tokens produced by ``word_tokenize`` for ``str(text)``.
         Non-string input (e.g. a float NaN) is tokenized by its string form.
         """
         return word_tokenize(str(text))
         text = self.remove_diacritics(text)
         text = self.remove_extra_whitespaces(text)
         text = self.text_normalize(text)
+        text = self.remove_arabic_prefixes(text)
         # text = self.remove_stop_words(text)
         return text