Spaces:
Sleeping
Sleeping
| # ============================================ | |
| # file: app/utils.py | |
| # ============================================ | |
| from __future__ import annotations | |
| import re | |
| from typing import List, Set | |
# Stopword set consulted by arabic_tokenize(): any token equal to one of these
# strings is dropped from the tokenizer's output.
# NOTE(review): the literals below do not look like Arabic text; they contain
# Thai-range characters, presumably Arabic UTF-8 bytes decoded with the wrong
# codec. If so, no real Arabic token can ever match them. Confirm against the
# original file and restore the intended words; they are deliberately left
# byte-identical here because they cannot be reconstructed unambiguously.
# Some entries appear more than once; that is harmless in a set literal.
| _ARABIC_STOPWORDS: Set[str] = { | |
| "ูู", "ู ู", "ุนูู", "ุฅูู", "ุนู", "ุฃู", "ูุฐุง", "ูุฐู", "ุงูุชู", "ุงูุฐู", | |
| "ู ุง", "ูุง", "ุฃู", "ู", "ูู", "ุฐูู", "ุจูู", "ูุงู", "ูุฏ", "ูู", "ูู", | |
| "ูู ", "ุจู", "ุซู ", "ุฅุฐุง", "ุญุชู", "ููู", "ู ูู", "ููู", "ุนูุฏ", "ูู", | |
| "ุจูุง", "ููุง", "ู ููุง", "ูููุง", "ุงูุชู", "ุงูุฐู", "ููุง", "ููู", "ูู ุง", | |
| "ุชูู", "ููุง", "ุฃู", "ุฏูู", "ููุณ", "ุฅูุง", "ุฃู ุง", "ู ุน", "ุนููู", | |
| } | |
# Characters deleted outright: tashkeel (U+064B-U+065F, U+0670) and the
# tatweel/kashida elongation mark (U+0640). Both are typographic only, so
# removing them must not split a word in two.
_ARABIC_STRIP_RE = re.compile(r"[\u064B-\u065F\u0670\u0640]")

# Characters turned into a space: anything outside the Arabic block, plus the
# punctuation that lives *inside* U+0600-U+06FF (comma, date separator,
# semicolon, triple-dot mark, question mark, percent sign, star, full stop).
# Without the second alternative that punctuation stays glued to the adjacent
# word and produces tokens that never match the indexed form.
# The Arabic decimal/thousands separators (U+066B/U+066C) are intentionally
# left alone so that numbers are not broken apart.
_ARABIC_SEPARATOR_RE = re.compile(
    r"[^\u0600-\u06FF\s]|[\u060C\u060D\u061B\u061E\u061F\u066A\u066D\u06D4]"
)


def arabic_tokenize(text: str) -> List[str]:
    """Split *text* into normalized Arabic tokens.

    Per the original note, this is the tokenizer used by the BM25 and
    metadata indexes.

    Steps, in order:
      1. Delete diacritics (tashkeel) and tatweel.
      2. Replace every non-Arabic character and Arabic punctuation mark
         with a space.
      3. Split on whitespace.
      4. Drop single-character tokens and tokens in ``_ARABIC_STOPWORDS``.

    Args:
        text: Arbitrary input text; may be empty or contain no Arabic.

    Returns:
        The surviving tokens in their original order (possibly empty).
    """
    text = _ARABIC_STRIP_RE.sub("", text)
    text = _ARABIC_SEPARATOR_RE.sub(" ", text)
    # The cheap length test runs first so short tokens skip the set lookup.
    return [
        token
        for token in text.split()
        if len(token) > 1 and token not in _ARABIC_STOPWORDS
    ]
# Translation table from ASCII digits to Eastern Arabic (Arabic-Indic) digits
# U+0660-U+0669, built once at import time. The digits are written as \u
# escapes so the mapping cannot be damaged by a wrong file encoding: a
# mis-decoded literal is no longer 10 characters long and makes
# str.maketrans() raise ValueError.
_EASTERN_ARABIC_DIGIT_TABLE = str.maketrans(
    "0123456789",
    "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669",
)


def convert_to_eastern_arabic(text: str) -> str:
    """Return *text* with ASCII digits 0-9 replaced by Eastern Arabic digits.

    Each of ``0123456789`` maps to the corresponding code point in
    U+0660-U+0669; every other character is passed through unchanged.

    Args:
        text: Input string; may be empty.

    Returns:
        A new string of the same length with the digits substituted.
    """
    return text.translate(_EASTERN_ARABIC_DIGIT_TABLE)