File size: 1,387 Bytes
a52abbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# ============================================
# file: app/utils.py
# ============================================
from __future__ import annotations

import re
from typing import List, Set


# Common Arabic function words excluded from the token stream.
# Includes both hamza-carrying and plain spellings (e.g. "التي"/"التى").
_ARABIC_STOPWORDS: Set[str] = {
    "في", "من", "على", "إلى", "عن", "أن", "هذا", "هذه", "التي", "الذي",
    "ما", "لا", "أو", "و", "كل", "ذلك", "بين", "كان", "قد", "هو", "هي",
    "لم", "بل", "ثم", "إذا", "حتى", "لكن", "منه", "فيه", "عند", "له",
    "بها", "لها", "منها", "فيها", "التى", "الذى", "ولا", "وفى", "كما",
    "تلك", "هنا", "أي", "دون", "ليس", "إلا", "أما", "مع", "عليه",
}


def arabic_tokenize(text: str) -> List[str]:
    """
    Arabic tokenization used by BM25 + metadata index.

    Pipeline:
      1. Strip diacritics (tashkeel), the dagger alif, and the tatweel
         (kashida) elongation mark.
      2. Replace every non-Arabic character with a space.
      3. Replace Arabic punctuation and Arabic-Indic digits with a space —
         they live inside the U+0600–U+06FF block, so step 2 keeps them.
      4. Split on whitespace; drop stopwords and single-character tokens.

    Args:
        text: Raw input text; may be empty or contain non-Arabic content.

    Returns:
        Normalized Arabic tokens, or [] when nothing survives filtering.
    """
    # 1. Tashkeel (U+064B-U+065F), dagger alif (U+0670), tatweel (U+0640).
    #    Tatweel was previously kept, so elongated words ("كـتاب") produced
    #    tokens that never matched their plain spelling.
    text = re.sub(r"[\u064B-\u065F\u0670\u0640]", "", text)
    # 2. Anything outside the Arabic block becomes a separator.
    text = re.sub(r"[^\u0600-\u06FF\s]", " ", text)
    # 3. Bug fix: Arabic punctuation (، ؛ ؟ ٪ ۔) and Arabic-Indic digits
    #    (U+0660-U+0669, U+06F0-U+06F9) are inside U+0600-U+06FF, so the
    #    previous pass left them glued to words (e.g. "كتاب،" never matched
    #    a stopword). Turn them into separators here.
    text = re.sub(
        r"[\u060C\u061B\u061F\u066A-\u066D\u06D4\u0660-\u0669\u06F0-\u06F9]",
        " ",
        text,
    )
    tokens = text.split()
    return [t for t in tokens if t not in _ARABIC_STOPWORDS and len(t) > 1]


def convert_to_eastern_arabic(text: str) -> str:
    """Return *text* with each Western digit 0-9 replaced by its
    Eastern Arabic-Indic counterpart (U+0660..U+0669); all other
    characters pass through unchanged."""
    # Map code point of each ASCII digit to the Arabic-Indic digit at the
    # same offset from U+0660.
    digit_table = {0x30 + d: chr(0x0660 + d) for d in range(10)}
    return text.translate(digit_table)