# ============================================
# file: app/utils.py
# ============================================
from __future__ import annotations
import re
from typing import List, Set
_ARABIC_STOPWORDS: Set[str] = {
"ูู", "ู
ู", "ุนูู", "ุฅูู", "ุนู", "ุฃู", "ูุฐุง", "ูุฐู", "ุงูุชู", "ุงูุฐู",
"ู
ุง", "ูุง", "ุฃู", "ู", "ูู", "ุฐูู", "ุจูู", "ูุงู", "ูุฏ", "ูู", "ูู",
"ูู
", "ุจู", "ุซู
", "ุฅุฐุง", "ุญุชู", "ููู", "ู
ูู", "ููู", "ุนูุฏ", "ูู",
"ุจูุง", "ููุง", "ู
ููุง", "ูููุง", "ุงูุชู", "ุงูุฐู", "ููุง", "ููู", "ูู
ุง",
"ุชูู", "ููุง", "ุฃู", "ุฏูู", "ููุณ", "ุฅูุง", "ุฃู
ุง", "ู
ุน", "ุนููู",
}
def arabic_tokenize(text: str) -> List[str]:
"""
Arabic tokenization used by BM25 + metadata index.
Removes diacritics, keeps Arabic letters/spaces, removes stopwords.
"""
text = re.sub(r"[\u064B-\u065F\u0670]", "", text) # strip tashkeel
text = re.sub(r"[^\u0600-\u06FF\s]", " ", text) # keep Arabic only
tokens = text.split()
return [t for t in tokens if t not in _ARABIC_STOPWORDS and len(t) > 1]
# Translation table built once at import time; str.translate then does a
# single C-level pass per call instead of rebuilding the table each time.
_WESTERN_TO_EASTERN = str.maketrans("0123456789", "٠١٢٣٤٥٦٧٨٩")

def convert_to_eastern_arabic(text: str) -> str:
    """Convert Western digits 0-9 in *text* to Eastern Arabic digits ٠-٩.

    All other characters pass through unchanged.
    """
    return text.translate(_WESTERN_TO_EASTERN)
|