Spaces:
Sleeping
Sleeping
Update text_preprocessor.py
Browse files- text_preprocessor.py +17 -0
text_preprocessor.py
CHANGED
|
@@ -56,9 +56,17 @@ class TextPreprocessor:
|
|
| 56 |
text = re.sub("ى", "ي", text)
|
| 57 |
text = re.sub("ؤ", "ء", text)
|
| 58 |
text = re.sub("ئ", "ء", text)
|
|
|
|
| 59 |
text = re.sub("ة", "ه", text)
|
|
|
|
| 60 |
text = re.sub("ڤ", "ف", text)
|
| 61 |
text = re.sub("چ", "ج", text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
return text
|
| 63 |
|
| 64 |
def remove_stop_words(self, text):
|
|
@@ -67,6 +75,14 @@ class TextPreprocessor:
|
|
| 67 |
filtered_text = ' '.join(filtered_tokens)
|
| 68 |
return filtered_text
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def tokenize(self, text):
    """Split *text* into tokens with ``word_tokenize``.

    The input is passed through ``str()`` first, so non-string values are
    tokenized by their string form instead of raising. Note that a float
    NaN becomes the literal string "nan".
    """
    return word_tokenize(str(text))
|
|
@@ -79,5 +95,6 @@ class TextPreprocessor:
|
|
| 79 |
text = self.remove_diacritics(text)
|
| 80 |
text = self.remove_extra_whitespaces(text)
|
| 81 |
text = self.text_normalize(text)
|
|
|
|
| 82 |
# text = self.remove_stop_words(text)
|
| 83 |
return text
|
|
|
|
| 56 |
text = re.sub("ى", "ي", text)
|
| 57 |
text = re.sub("ؤ", "ء", text)
|
| 58 |
text = re.sub("ئ", "ء", text)
|
| 59 |
+
text = re.sub("۽", "ء", text)
|
| 60 |
text = re.sub("ة", "ه", text)
|
| 61 |
+
text = re.sub("[ڱګگݣڪ]", "ك", text)
|
| 62 |
text = re.sub("ڤ", "ف", text)
|
| 63 |
text = re.sub("چ", "ج", text)
|
| 64 |
+
text = re.sub("ژ", "ز", text)
|
| 65 |
+
text = re.sub("ڒ", "ز", text)
|
| 66 |
+
text = re.sub("ٺ", "ت", text)
|
| 67 |
+
text = re.sub("پ", "ب", text)
|
| 68 |
+
# text = re.sub("ه", "ة", text)
|
| 69 |
+
text = re.sub("پ", "ب", text)
|
| 70 |
return text
|
| 71 |
|
| 72 |
def remove_stop_words(self, text):
|
|
|
|
| 75 |
filtered_text = ' '.join(filtered_tokens)
|
| 76 |
return filtered_text
|
| 77 |
|
| 78 |
+
def remove_arabic_prefixes(self, text):
    """Strip common definite-article prefixes from the start of each word.

    Removes at most one leading "ال", "وال", "لل" or "بال" per word, then
    collapses every remaining "الا" sequence to "ا".

    Args:
        text: String to process.

    Returns:
        The text with the prefixes removed.
    """
    # A single alternation applied in one pass removes at most one prefix
    # per word. Running one substitution per prefix re-examined the already
    # stripped word, so "الوالد" lost "ال" and then "وال", leaving only "د".
    prefixes = ("وال", "بال", "لل", "ال")
    text = re.sub(r"\b(?:" + "|".join(prefixes) + ")", "", text)
    # Not anchored to a word boundary: rewrites "الا" anywhere inside a word.
    text = re.sub("الا", "ا", text)
    return text
|
| 85 |
+
|
| 86 |
def tokenize(self, text):
    """Split *text* into tokens with ``word_tokenize``.

    The input is passed through ``str()`` first, so non-string values are
    tokenized by their string form instead of raising. Note that a float
    NaN becomes the literal string "nan".
    """
    return word_tokenize(str(text))
|
|
|
|
| 95 |
text = self.remove_diacritics(text)
|
| 96 |
text = self.remove_extra_whitespaces(text)
|
| 97 |
text = self.text_normalize(text)
|
| 98 |
+
text = self.remove_arabic_prefixes(text)
|
| 99 |
# text = self.remove_stop_words(text)
|
| 100 |
return text
|