# LegalChatbot — app/utils.py (source: mayar-waleed repository, commit a52abbb, FastAPI project)
# ============================================
# file: app/utils.py
# ============================================
from __future__ import annotations
import re
from typing import List, Set
_ARABIC_STOPWORDS: Set[str] = {
"ููŠ", "ู…ู†", "ุนู„ู‰", "ุฅู„ู‰", "ุนู†", "ุฃู†", "ู‡ุฐุง", "ู‡ุฐู‡", "ุงู„ุชูŠ", "ุงู„ุฐูŠ",
"ู…ุง", "ู„ุง", "ุฃูˆ", "ูˆ", "ูƒู„", "ุฐู„ูƒ", "ุจูŠู†", "ูƒุงู†", "ู‚ุฏ", "ู‡ูˆ", "ู‡ูŠ",
"ู„ู…", "ุจู„", "ุซู…", "ุฅุฐุง", "ุญุชู‰", "ู„ูƒู†", "ู…ู†ู‡", "ููŠู‡", "ุนู†ุฏ", "ู„ู‡",
"ุจู‡ุง", "ู„ู‡ุง", "ู…ู†ู‡ุง", "ููŠู‡ุง", "ุงู„ุชู‰", "ุงู„ุฐู‰", "ูˆู„ุง", "ูˆูู‰", "ูƒู…ุง",
"ุชู„ูƒ", "ู‡ู†ุง", "ุฃูŠ", "ุฏูˆู†", "ู„ูŠุณ", "ุฅู„ุง", "ุฃู…ุง", "ู…ุน", "ุนู„ูŠู‡",
}
def arabic_tokenize(text: str) -> List[str]:
"""
Arabic tokenization used by BM25 + metadata index.
Removes diacritics, keeps Arabic letters/spaces, removes stopwords.
"""
text = re.sub(r"[\u064B-\u065F\u0670]", "", text) # strip tashkeel
text = re.sub(r"[^\u0600-\u06FF\s]", " ", text) # keep Arabic only
tokens = text.split()
return [t for t in tokens if t not in _ARABIC_STOPWORDS and len(t) > 1]
def convert_to_eastern_arabic(text: str) -> str:
"""Converts 0123456789 to ู ูกูขูฃูคูฅูฆูงูจูฉ"""
western = "0123456789"
eastern = "ู ูกูขูฃูคูฅูฆูงูจูฉ"
return text.translate(str.maketrans(western, eastern))