Spaces:
Sleeping
Sleeping
File size: 678 Bytes
fde590b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 | import re
def remove_tashkeel(text):
tashkeel_pattern = re.compile(
r'[\u0617-\u061A\u064B-\u0652]'
)
return re.sub(tashkeel_pattern, '', text)
def normalize_arabic(text):
text = re.sub("[إأآا]", "ا", text)
text = re.sub("ى", "ي", text)
text = re.sub("ؤ", "و", text)
text = re.sub("ئ", "ي", text)
text = re.sub("ة", "ه", text)
return text
def preprocess_arabic(text):
text = remove_tashkeel(text)
text = normalize_arabic(text)
return text
def bm25_tokenize(text):
return preprocess_arabic(text).split()
def preprocess_query(query):
query = preprocess_arabic(query)
return bm25_tokenize(query)
|