File size: 1,387 Bytes
a52abbb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
# ============================================
# file: app/utils.py
# ============================================
from __future__ import annotations

import re
from typing import List, Set


# Common Arabic function words excluded from the token stream.
# Includes both hamza-carrying and plain spellings (e.g. "التي"/"التى").
_ARABIC_STOPWORDS: Set[str] = {
    "في", "من", "على", "إلى", "عن", "أن", "هذا", "هذه", "التي", "الذي",
    "ما", "لا", "أو", "و", "كل", "ذلك", "بين", "كان", "قد", "هو", "هي",
    "لم", "بل", "ثم", "إذا", "حتى", "لكن", "منه", "فيه", "عند", "له",
    "بها", "لها", "منها", "فيها", "التى", "الذى", "ولا", "وفى", "كما",
    "تلك", "هنا", "أي", "دون", "ليس", "إلا", "أما", "مع", "عليه",
}


def arabic_tokenize(text: str) -> List[str]:
    """
    Arabic tokenization used by BM25 + metadata index.

    Pipeline:
      1. Strip diacritics (tashkeel), the dagger alif, and the tatweel
         (kashida) elongation mark.
      2. Replace every non-Arabic character with a space.
      3. Replace Arabic punctuation and Arabic-Indic digits with a space —
         they live inside the U+0600–U+06FF block, so step 2 keeps them.
      4. Split on whitespace; drop stopwords and single-character tokens.

    Args:
        text: Raw input text; may be empty or contain non-Arabic content.

    Returns:
        Normalized Arabic tokens, or [] when nothing survives filtering.
    """
    # 1. Tashkeel (U+064B-U+065F), dagger alif (U+0670), tatweel (U+0640).
    #    Tatweel was previously kept, so elongated words ("كـتاب") produced
    #    tokens that never matched their plain spelling.
    text = re.sub(r"[\u064B-\u065F\u0670\u0640]", "", text)
    # 2. Anything outside the Arabic block becomes a separator.
    text = re.sub(r"[^\u0600-\u06FF\s]", " ", text)
    # 3. Bug fix: Arabic punctuation (، ؛ ؟ ٪ ۔) and Arabic-Indic digits
    #    (U+0660-U+0669, U+06F0-U+06F9) are inside U+0600-U+06FF, so the
    #    previous pass left them glued to words (e.g. "كتاب،" never matched
    #    a stopword). Turn them into separators here.
    text = re.sub(
        r"[\u060C\u061B\u061F\u066A-\u066D\u06D4\u0660-\u0669\u06F0-\u06F9]",
        " ",
        text,
    )
    tokens = text.split()
    return [t for t in tokens if t not in _ARABIC_STOPWORDS and len(t) > 1]


def convert_to_eastern_arabic(text: str) -> str:
    """Return *text* with each Western digit 0-9 replaced by its
    Eastern Arabic-Indic counterpart (U+0660..U+0669); all other
    characters pass through unchanged."""
    # Map code point of each ASCII digit to the Arabic-Indic digit at the
    # same offset from U+0660.
    digit_table = {0x30 + d: chr(0x0660 + d) for d in range(10)}
    return text.translate(digit_table)