Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -1,10 +1,26 @@
|
|
| 1 |
import re
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
def preprocess_arabic(text):
|
| 4 |
-
text =
|
| 5 |
-
text =
|
| 6 |
-
text = re.sub(r"\s+", " ", text).strip()
|
| 7 |
return text
|
| 8 |
|
| 9 |
def bm25_tokenize(text):
|
| 10 |
return preprocess_arabic(text).split()
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
+
# Arabic marks in U+0617-U+061A and U+064B-U+0652 (fathatan through sukun).
# Compiled once at import time rather than rebuilt on every call.
_TASHKEEL_PATTERN = re.compile(r'[\u0617-\u061A\u064B-\u0652]')


def remove_tashkeel(text):
    """Return *text* with Arabic diacritic marks stripped.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which every code point in the ranges
        U+0617-U+061A and U+064B-U+0652 has been deleted. All other
        characters are left untouched.
    """
    return _TASHKEEL_PATTERN.sub('', text)
|
| 8 |
+
|
| 9 |
+
# One-pass translation table for orthographic normalization. Every entry is
# a single-character substitution, so str.translate replaces what would
# otherwise be several separate regex passes over the text.
_ARABIC_NORMALIZATION_TABLE = str.maketrans({
    "\u0625": "\u0627",  # alef with hamza below -> bare alef
    "\u0623": "\u0627",  # alef with hamza above -> bare alef
    "\u0622": "\u0627",  # alef with madda above -> bare alef
    "\u0649": "\u064A",  # alef maksura -> yeh
    "\u0624": "\u0648",  # waw with hamza above -> waw
    "\u0626": "\u064A",  # yeh with hamza above -> yeh
    "\u0629": "\u0647",  # teh marbuta -> heh
})


def normalize_arabic(text):
    """Collapse common Arabic letter variants onto a single canonical form.

    The following substitutions are applied to every character of ``text``:

    * alef with hamza above/below or madda -> bare alef
    * alef maksura -> yeh
    * waw with hamza -> waw
    * yeh with hamza -> yeh
    * teh marbuta -> heh

    Args:
        text: The string to normalize.

    Returns:
        A copy of ``text`` with the substitutions above applied. Characters
        outside the table are returned unchanged.
    """
    return text.translate(_ARABIC_NORMALIZATION_TABLE)
|
| 16 |
+
|
| 17 |
def preprocess_arabic(text):
    """Run the Arabic clean-up pipeline on *text*.

    The input is first passed through ``remove_tashkeel`` and the result is
    then handed to ``normalize_arabic``.

    Args:
        text: The string to preprocess.

    Returns:
        Whatever ``normalize_arabic`` returns for the tashkeel-free text.
    """
    return normalize_arabic(remove_tashkeel(text))
|
| 21 |
|
| 22 |
def bm25_tokenize(text):
    """Turn *text* into a list of whitespace-separated tokens.

    The text is run through ``preprocess_arabic`` before being split.

    Args:
        text: The string to tokenize.

    Returns:
        The list produced by calling ``split()`` (no arguments) on the
        preprocessed text.
    """
    cleaned = preprocess_arabic(text)
    tokens = cleaned.split()
    return tokens
|
| 24 |
+
def preprocess_query(query):
    """Convert a query string into a list of tokens.

    ``bm25_tokenize`` already runs ``preprocess_arabic`` on its input, so the
    query is handed over directly instead of being preprocessed a second
    time first.

    Args:
        query: The raw query string.

    Returns:
        The token list returned by ``bm25_tokenize`` for ``query``.
    """
    return bm25_tokenize(query)
|