Hadith_Search / utils.py
NightPrince's picture
Deploy Vanilla HTML Hadith Engine via FastAPI (with LFS)
fde590b
raw
history blame contribute delete
678 Bytes
import re
# Compiled once at import time so the per-call cost is a single substitution.
# Character class covers U+0617-U+061A (small high zain, small fatha/damma/kasra)
# and U+064B-U+0652 (fathatan through sukun, including shadda).
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')


def remove_tashkeel(text):
    """Return *text* with Arabic diacritical marks (tashkeel) removed.

    Only the code points U+0617-U+061A and U+064B-U+0652 are stripped.
    Anything outside those two ranges (for example tatweel U+0640 or
    superscript alef U+0670) is returned unchanged.

    Args:
        text: The string to clean.

    Returns:
        A new string with every matching mark deleted; all other
        characters keep their original order.
    """
    return _TASHKEEL_RE.sub('', text)
# One-to-one letter folding applied in a single pass by str.translate.
# No replacement character is itself a key, so the result is the same as
# applying the rules one after another.
_ARABIC_NORMALIZATION_TABLE = str.maketrans({
    "\u0625": "\u0627",  # alef with hamza below -> bare alef
    "\u0623": "\u0627",  # alef with hamza above -> bare alef
    "\u0622": "\u0627",  # alef with madda above -> bare alef
    "\u0649": "\u064a",  # alef maksura -> yeh
    "\u0624": "\u0648",  # waw with hamza above -> waw
    "\u0626": "\u064a",  # yeh with hamza above -> yeh
    "\u0629": "\u0647",  # teh marbuta -> heh
})


def normalize_arabic(text):
    """Fold common Arabic letter variants onto a single canonical letter.

    The mapping is: hamza/madda forms of alef -> bare alef, alef maksura
    and yeh-with-hamza -> yeh, waw-with-hamza -> waw, teh marbuta -> heh.
    Every other character is left as is.

    Args:
        text: The string to normalize.

    Returns:
        A new string of the same length with the variants replaced.
    """
    return text.translate(_ARABIC_NORMALIZATION_TABLE)
def preprocess_arabic(text):
    """Strip diacritics from *text*, then fold letter variants.

    Applies ``remove_tashkeel`` first and feeds its result to
    ``normalize_arabic``; the normalized string is returned.
    """
    return normalize_arabic(remove_tashkeel(text))
def bm25_tokenize(text):
    """Preprocess *text* and split it into whitespace-delimited tokens.

    Returns:
        The list produced by ``str.split()`` on the preprocessed text,
        so runs of whitespace never yield empty tokens.
    """
    cleaned = preprocess_arabic(text)
    return cleaned.split()
def preprocess_query(query):
    """Turn a raw search query into a list of normalized tokens.

    ``bm25_tokenize`` already runs ``preprocess_arabic`` on its input, so
    the query is handed to it directly rather than being preprocessed a
    second time beforehand; the resulting tokens are the same.

    Args:
        query: The raw query string.

    Returns:
        The list of tokens returned by ``bm25_tokenize``.
    """
    return bm25_tokenize(query)