fatttty committed on
Commit
8aa048a
·
verified ·
1 Parent(s): 68c82ec

Update text_preprocessor.py

Browse files
Files changed (1) hide show
  1. text_preprocessor.py +17 -0
text_preprocessor.py CHANGED
@@ -56,9 +56,17 @@ class TextPreprocessor:
56
  text = re.sub("ى", "ي", text)
57
  text = re.sub("ؤ", "ء", text)
58
  text = re.sub("ئ", "ء", text)
 
59
  text = re.sub("ة", "ه", text)
 
60
  text = re.sub("ڤ", "ف", text)
61
  text = re.sub("چ", "ج", text)
 
 
 
 
 
 
62
  return text
63
 
64
  def remove_stop_words(self, text):
@@ -67,6 +75,14 @@ class TextPreprocessor:
67
  filtered_text = ' '.join(filtered_tokens)
68
  return filtered_text
69
 
 
 
 
 
 
 
 
 
70
  def tokenize(self, text):
71
  tokens = word_tokenize(str(text)) # Convert text to string if not NaN
72
  return tokens
@@ -79,5 +95,6 @@ class TextPreprocessor:
79
  text = self.remove_diacritics(text)
80
  text = self.remove_extra_whitespaces(text)
81
  text = self.text_normalize(text)
 
82
  # text = self.remove_stop_words(text)
83
  return text
 
56
  text = re.sub("ى", "ي", text)
57
  text = re.sub("ؤ", "ء", text)
58
  text = re.sub("ئ", "ء", text)
59
+ text = re.sub("۽", "ء", text)
60
  text = re.sub("ة", "ه", text)
61
+ text = re.sub("[ڱګگݣڪ]", "ك", text)
62
  text = re.sub("ڤ", "ف", text)
63
  text = re.sub("چ", "ج", text)
64
+ text = re.sub("ژ", "ز", text)
65
+ text = re.sub("ڒ", "ز", text)
66
+ text = re.sub("ٺ", "ت", text)
67
+ text = re.sub("پ", "ب", text)
68
+ # text = re.sub("ه", "ة", text)
69
+ text = re.sub("پ", "ب", text)
70
  return text
71
 
72
  def remove_stop_words(self, text):
 
75
  filtered_text = ' '.join(filtered_tokens)
76
  return filtered_text
77
 
78
+ def remove_arabic_prefixes(self,text):
79
+ text = re.sub(r"\bال", '', text)
80
+ text = re.sub(r"\bوال", '', text)
81
+ text = re.sub(r"\bلل", '', text)
82
+ text = re.sub(r"\bبال", '', text)
83
+ text = re.sub("الا", "ا", text)
84
+ return text
85
+
86
  def tokenize(self, text):
87
  tokens = word_tokenize(str(text)) # Convert text to string if not NaN
88
  return tokens
 
95
  text = self.remove_diacritics(text)
96
  text = self.remove_extra_whitespaces(text)
97
  text = self.text_normalize(text)
98
+ text = self.remove_arabic_prefixes(text)
99
  # text = self.remove_stop_words(text)
100
  return text