Spaces:
Sleeping
Sleeping
| import re | |
# Arabic diacritics (tashkeel) to strip: the small high signs U+0617-U+061A
# and the harakat from fathatan through sukun, U+064B-U+0652.
# Compiled once at import time instead of on every call.
_TASHKEEL_PATTERN = re.compile(r'[\u0617-\u061A\u064B-\u0652]')


def remove_tashkeel(text):
    """Return *text* with Arabic diacritical marks (tashkeel) removed.

    Only code points in U+0617-U+061A and U+064B-U+0652 are deleted; every
    other character, including non-Arabic text, is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with the matched diacritics removed.
    """
    return _TASHKEEL_PATTERN.sub('', text)
# Single-character normalization table, built once at import time.
# Applying it with str.translate does all replacements in one pass instead of
# running a separate regex substitution per letter.
_ARABIC_NORMALIZATION_TABLE = str.maketrans({
    "إ": "ا",  # alef with hamza below -> bare alef
    "أ": "ا",  # alef with hamza above -> bare alef
    "آ": "ا",  # alef with madda -> bare alef
    "ى": "ي",  # alef maksura -> yeh
    "ؤ": "و",  # waw with hamza -> waw
    "ئ": "ي",  # yeh with hamza -> yeh
    "ة": "ه",  # teh marbuta -> heh
})


def normalize_arabic(text):
    """Collapse common Arabic letter variants to one canonical letter.

    The alef variants become bare alef, alef maksura and yeh-with-hamza
    become yeh, waw-with-hamza becomes waw, and teh marbuta becomes heh.
    All other characters are returned unchanged. The mapping is idempotent:
    none of the output letters is itself remapped.

    Args:
        text: The string to normalize.

    Returns:
        A new string with the letter variants replaced.
    """
    return text.translate(_ARABIC_NORMALIZATION_TABLE)
def preprocess_arabic(text):
    """Run the Arabic cleanup pipeline on *text*.

    Diacritics are stripped first via ``remove_tashkeel``; the result is then
    passed through ``normalize_arabic``.

    Args:
        text: The string to preprocess.

    Returns:
        The string returned by ``normalize_arabic``.
    """
    without_diacritics = remove_tashkeel(text)
    return normalize_arabic(without_diacritics)
def bm25_tokenize(text):
    """Preprocess *text* and split it into whitespace-delimited tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens: the output of ``preprocess_arabic`` split on runs
        of whitespace (empty list when there are no tokens).
    """
    cleaned = preprocess_arabic(text)
    tokens = cleaned.split()
    return tokens
def preprocess_query(query):
    """Turn a raw search query into a list of tokens.

    ``bm25_tokenize`` already applies ``preprocess_arabic`` before splitting,
    so the query is handed to it directly; preprocessing it here first would
    only repeat the same idempotent cleanup a second time.

    Args:
        query: The raw query string.

    Returns:
        The list of tokens produced by ``bm25_tokenize``.
    """
    return bm25_tokenize(query)