# File size: 678 Bytes
# fde590b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re

def remove_tashkeel(text):
    """Return *text* with Arabic diacritical marks deleted.

    Every code point in the ranges U+0617-U+061A and U+064B-U+0652 is
    removed; all other characters are left exactly as they were.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched marks.
    """
    diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    return diacritics.sub('', text)

def normalize_arabic(text):
    """Map Arabic letter variants onto a single canonical letter each.

    The substitutions are applied one after another, in the order listed
    in the table below.

    Args:
        text: The string to normalise.

    Returns:
        The string with every listed variant replaced.
    """
    # (regex pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        ("[إأآا]", "ا"),  # alef with hamza / madda -> bare alef
        ("ى", "ي"),       # alef maksura -> yeh
        ("ؤ", "و"),       # waw with hamza -> waw
        ("ئ", "ي"),       # yeh with hamza -> yeh
        ("ة", "ه"),       # teh marbuta -> heh
    )
    normalized = text
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

def preprocess_arabic(text):
    """Run the Arabic cleanup pipeline over *text*.

    The text is first passed through ``remove_tashkeel`` and the result is
    then passed through ``normalize_arabic``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    return normalize_arabic(remove_tashkeel(text))

def bm25_tokenize(text):
    """Split *text* into tokens after Arabic preprocessing.

    The text is cleaned with ``preprocess_arabic`` and then split with
    ``str.split()`` using no separator argument, so tokens are delimited
    by runs of whitespace and no empty tokens are produced.

    Args:
        text: The string to tokenise.

    Returns:
        A list of token strings.
    """
    cleaned = preprocess_arabic(text)
    tokens = cleaned.split()
    return tokens
def preprocess_query(query):
    """Convert a search query into its list of tokens.

    ``bm25_tokenize`` already runs ``preprocess_arabic`` on its input, so
    the query is handed to it directly instead of being preprocessed a
    second time here. The preprocessing is idempotent (none of the letters
    it produces is matched by any of its own rules), so the returned
    tokens are the same as with the extra pass.

    Args:
        query: The raw query string.

    Returns:
        A list of token strings.
    """
    return bm25_tokenize(query)