kdallash committed on
Commit
216e8a6
·
verified ·
1 Parent(s): 6969608

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +19 -3
utils.py CHANGED
@@ -1,10 +1,26 @@
1
  import re
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
def preprocess_arabic(text):
    """Return *text* stripped of diacritics and punctuation, with tidy spacing.

    Steps, in order:
      1. Delete the eight harakat U+064B..U+0652 and the tatweel U+0640.
      2. Replace every character that is neither a word character nor
         whitespace with a single space.
      3. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The input string.

    Returns:
        The cleaned string.
    """
    # Diacritics go first: combining marks are not matched by ``\w``, so the
    # punctuation pass below would otherwise turn them into spaces and split
    # words apart.
    text = re.sub(r"[\u064B-\u0652\u0640]", "", text)
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text
8
 
9
def bm25_tokenize(text):
    """Split *text* into whitespace-delimited tokens after preprocessing.

    Args:
        text: The input string.

    Returns:
        A list of token strings: ``preprocess_arabic(text)`` split on
        whitespace. An empty or all-whitespace result yields ``[]``.
    """
    return preprocess_arabic(text).split()
 
 
 
 
1
  import re
2
 
3
# Compiled once at import time instead of on every call.
# Covers U+0617..U+061A and U+064B..U+0652 (fathatan through sukun).
_TASHKEEL_RE = re.compile(r"[\u0617-\u061A\u064B-\u0652]")


def remove_tashkeel(text):
    """Return *text* with Arabic diacritic marks deleted.

    Only code points in U+0617..U+061A and U+064B..U+0652 are removed; every
    other character (including the tatweel U+0640) is left untouched.

    Args:
        text: The input string.

    Returns:
        A copy of *text* without the matched marks.
    """
    return _TASHKEEL_RE.sub("", text)
8
+
9
# Single-character folding table; applied in one pass by ``str.translate``.
_ARABIC_NORMALIZATION = str.maketrans({
    "\u0625": "\u0627",  # alef with hamza below -> bare alef
    "\u0623": "\u0627",  # alef with hamza above -> bare alef
    "\u0622": "\u0627",  # alef with madda above -> bare alef
    "\u0649": "\u064A",  # alef maksura -> yeh
    "\u0624": "\u0648",  # waw with hamza above -> waw
    "\u0626": "\u064A",  # yeh with hamza above -> yeh
    "\u0629": "\u0647",  # teh marbuta -> heh
})


def normalize_arabic(text):
    """Fold common Arabic letter variants onto a single canonical letter.

    The alef variants become bare alef, alef maksura and hamza-on-yeh become
    yeh, hamza-on-waw becomes waw, and teh marbuta becomes heh. No mapping's
    output is another mapping's input, so one simultaneous pass gives the
    same result as applying the substitutions one after another.

    Args:
        text: The input string.

    Returns:
        A copy of *text* with the variants folded; other characters are
        unchanged.
    """
    return text.translate(_ARABIC_NORMALIZATION)
16
+
17
def preprocess_arabic(text):
    """Strip diacritics from *text*, then fold Arabic letter variants.

    Args:
        text: The input string.

    Returns:
        ``normalize_arabic(remove_tashkeel(text))``.
    """
    # NOTE(review): the previous revision also removed the tatweel (U+0640),
    # replaced punctuation with spaces and collapsed whitespace. None of that
    # happens here any more, so punctuation stays attached to tokens. Confirm
    # this is intended.
    text = remove_tashkeel(text)
    text = normalize_arabic(text)
    return text
21
 
22
def bm25_tokenize(text):
    """Split *text* into whitespace-delimited tokens after preprocessing.

    Args:
        text: The input string.

    Returns:
        A list of token strings: ``preprocess_arabic(text)`` split on
        whitespace. An empty or all-whitespace result yields ``[]``.
    """
    return preprocess_arabic(text).split()
24
def preprocess_query(query):
    """Turn a search query into a list of tokens.

    Args:
        query: The query string.

    Returns:
        The token list produced by ``bm25_tokenize(query)``.
    """
    # ``bm25_tokenize`` already runs ``preprocess_arabic`` on its input, so
    # calling it here as well would only repeat the same work.
    return bm25_tokenize(query)