Spaces:

momenalhamza
/

multilingual-chatbot

Sleeping

File size: 12,350 Bytes

469ef7f

"""Reusable multilingual preprocessor for the chatbot.

Provides a single class, MultilingualPreprocessor, with these methods:

  detect_language(text)   -> "AR" | "EN" | "FR" | "CS"
  detect_arabizi(text)    -> bool   (Arabic written in Latin script)
  normalize_arabic(text)  -> str    (strip tashkeel, tatweel; normalize hamza)
  clean_text(text)        -> str    (Unicode-NFC, drop URLs/control, collapse ws)
  tokenize_for_xlmr(text) -> dict   ({input_ids, attention_mask}; xlm-roberta-base)

Language detection algorithm (in order):
  1. Arabic script + Latin script in same string  -> CS
  2. Only Arabic script                            -> AR
  3. Latin only with Arabizi indicators            -> CS
  4. Latin only with both FR and EN indicators     -> CS
  5. Latin only, fall back to lingua-language-detector
                  (decides FR vs EN; the word lists break the tie only when
                  lingua returns no result or raises)

The lingua detector is built only over {AR, EN, FR} so it cannot mistakenly
return some unrelated language. The xlm-roberta tokenizer is loaded lazily
on first call (so importing this module is cheap).
"""

from __future__ import annotations

import re
import unicodedata
from functools import cached_property
from typing import Any

# pyarabic — pure-python, lightweight, always available in this venv
import pyarabic.araby as araby

# lingua — fast/accurate language detector (loaded eagerly; small memory)
from lingua import Language, LanguageDetectorBuilder


# ============================================================================
#                              Static resources
# ============================================================================

# Arabic script detection. Covers:
#   U+0600-U+06FF  Arabic
#   U+0750-U+077F  Arabic Supplement
#   U+0870-U+08FF  Arabic Extended-B and Extended-A
#   U+FB50-U+FDFF  Arabic Presentation Forms-A
#   U+FE70-U+FEFC  Arabic Presentation Forms-B
# The last range stops at U+FEFC on purpose: U+FEFF is the byte-order mark,
# which must not be mistaken for an Arabic letter.
ARABIC_SCRIPT_RE = re.compile(
    r"[\u0600-\u06FF\u0750-\u077F\u0870-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]"
)

# Stripping URLs from text (covers http, https, and bare www). Matching is
# case-insensitive because mobile keyboards often auto-capitalise the first
# letter ("Www.example.com", "Https://...").
URL_RE = re.compile(r"https?://\S+|www\.\S+", re.IGNORECASE)

# "Letter-digits" used in the Arabic chat alphabet (Arabizi):
#   2 = ء (hamza), 3 = ع, 5 = خ, 7 = ح, 9 = ق
ARABIZI_LETTER_DIGITS = set("23579")

# Common Arabizi tokens (Levantine + MSA flavour). Lowercase form.
# Entries must be plain tokens made of letters, digits and apostrophes only:
# the Arabizi tokenizer never yields punctuation such as "?", so an entry
# like "kifak?" could never match.
ARABIZI_WORDS: set[str] = {
    "ana", "enta", "enti", "howa", "heya", "ehna", "ento",
    "bde", "bdi", "bidi", "biddi",
    "kifak", "kifik", "kifkun",
    "shou", "shu", "eh",
    "yalla", "khalas",
    "wallahi", "wallah", "wala",
    "ma3leesh", "ma3lich", "mafi", "ma3i", "ma3a", "ma3", "m3a",
    "habibi", "habibti", "habayebi",
    "fi", "fih",
    "mochkil", "moshkil", "moshkila", "mushkila",
    "btehki", "lazem", "lezem", "kefi",
    "shi", "hayda", "haydi", "haydak",
    "3andi", "3and", "3andak", "3andik", "3andna",
    "7ub", "7ubbi", "7abibi",
    "9awi", "9ad", "9addesh",
    "5alas", "5all", "5ali",
    "akhouy", "okhti", "yaba", "yumma",
    "tab", "tabe", "ta3", "ta",
}

# Strong French indicators (lowercased, used with word-boundary regex).
# Words that are also ordinary English words ("comment", "pour", "non",
# "encore") are deliberately left out: a single hit from each list marks the
# text as code-switched, so an ambiguous word would turn plain English such
# as "leave a comment on the post" into CS.
FR_WORDS: list[str] = [
    "je", "le", "la", "les", "un", "une", "des", "du",
    "et", "est", "qui", "que", "quoi", "où", "quand",
    "avec", "ce", "ces", "cette",
    "dans", "sur", "sous", "vers", "chez",
    "très", "pourquoi", "mon", "ma", "mes",
    "votre", "vos", "notre", "nos",
    "merci", "bonjour", "salut", "oui",
    "vous", "nous", "tu", "moi", "toi", "lui", "elle",
    "alors", "donc", "mais", "ou", "ni",
    "déjà", "aussi", "même",
]
# French elision/contraction prefixes — extremely diagnostic.
FR_ELISIONS_RE = re.compile(r"\b(?:j'|qu'|n'|l'|d'|m'|s'|t'|c'|jusqu')", re.IGNORECASE)

# Strong English indicators.
# Words that are also common French words ("on", "or", "me", "but") are
# deliberately left out for the same reason as above: "on est en retard" or
# "tu me manques" must not be flagged as French/English code-switching.
EN_WORDS: list[str] = [
    "the", "is", "are", "was", "were",
    "have", "has", "had", "having",
    "i", "you", "your", "yours",
    "this", "that", "these", "those",
    "what", "how", "why", "where", "when",
    "with", "for", "to", "and",
    "of", "in", "at", "from", "by",
    "please", "thanks", "thank", "hello", "hi",
    "want", "need", "would", "could", "should", "will",
    "my", "do", "does", "did", "doing",
    "can", "must", "may", "might",
]


def _word_boundary_re(words: list[str]) -> re.Pattern[str]:
    """Build a single regex that matches any of the given words with custom
    boundaries that work for words preceded/followed by letters or apostrophes
    (so `j'ai` matches `j` and so does `j'`)."""
    escaped = [re.escape(w) for w in words]
    pat = r"(?<![a-zA-Zàâäéèêëïîôöùûüç])(?:" + "|".join(escaped) + r")(?![a-zA-Zàâäéèêëïîôöùûüç])"
    return re.compile(pat, re.IGNORECASE)


# Whole-word matchers for the French and English indicator lists, compiled
# once at import time so the language heuristics only pay for a search.
_FR_RE, _EN_RE = (_word_boundary_re(vocab) for vocab in (FR_WORDS, EN_WORDS))


# ============================================================================
#                              Preprocessor
# ============================================================================

class MultilingualPreprocessor:
    """Single-pass preprocessor. Stateless apart from the lazily-built
    tokenizer + lingua detector. Safe to instantiate once and reuse.
    """

    def __init__(self, xlmr_model_name: str = "xlm-roberta-base") -> None:
        """Create the preprocessor.

        Args:
            xlmr_model_name: HuggingFace model id whose tokenizer to load
                lazily for tokenize_for_xlmr(). Default xlm-roberta-base.
        """
        self._xlmr_name = xlmr_model_name
        self._tokenizer: Any = None  # loaded lazily
        # Build lingua detector over only {AR, EN, FR} so it cannot return
        # any other language by accident.
        self._detector = (
            LanguageDetectorBuilder
            .from_languages(Language.ARABIC, Language.ENGLISH, Language.FRENCH)
            .build()
        )

    # ------------------------------------------------------------------ tokenizer

    @cached_property
    def tokenizer(self) -> Any:
        """Return the xlm-roberta-base tokenizer (downloaded on first access)."""
        from transformers import AutoTokenizer
        return AutoTokenizer.from_pretrained(self._xlmr_name)

    def tokenize_for_xlmr(
        self,
        text: str,
        max_length: int = 128,
        return_tensors: str | None = None,
    ) -> dict[str, Any]:
        """Tokenize a single string with the xlm-roberta-base tokenizer.

        Args:
            text: input string.
            max_length: truncation length (defaults to 128).
            return_tensors: 'pt' / 'np' / None. None returns plain Python lists.

        Returns:
            dict with at least {input_ids, attention_mask}, optionally tensors.
        """
        return self.tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=return_tensors,
        )

    # ------------------------------------------------------------------ cleaning

    def clean_text(self, text: str) -> str:
        """Normalise unicode (NFC), strip URLs and control chars, collapse ws."""
        if not isinstance(text, str):
            return ""
        # NFC normalisation
        text = unicodedata.normalize("NFC", text)
        # Strip URLs
        text = URL_RE.sub(" ", text)
        # Drop control characters (category C*) except common whitespace
        text = "".join(
            c for c in text
            if not unicodedata.category(c).startswith("C") or c in (" ", "\n", "\t")
        )
        # Collapse whitespace
        text = re.sub(r"\s+", " ", text).strip()
        return text

    # ------------------------------------------------------------------ Arabic norm

    def normalize_arabic(self, text: str) -> str:
        """Strip tashkeel + tatweel; normalize hamza forms.

        Safe to call on non-Arabic text — pyarabic functions only touch Arabic
        characters, so Latin characters pass through unchanged. Also folds
        alef-maksura ى -> ي as a mild extra normalisation (very common in
        Arabic preprocessing pipelines).
        """
        if not text:
            return text
        text = araby.strip_tashkeel(text)
        text = araby.strip_tatweel(text)
        text = araby.normalize_hamza(text)  # أ إ آ -> ا
        # Mild extra: alef-maksura -> ya
        text = text.replace("ى", "ي")
        return text

    # ------------------------------------------------------------------ Arabizi

    def detect_arabizi(self, text: str) -> bool:
        """Heuristic: Arabic written in Latin script.

        True if either:
          (a) any token is in our hardcoded Arabizi word list, or
          (b) any token contains a digit from {2,3,5,7,9} acting as a letter
              (i.e., the token also has letters and is alnum).
        Returns False for non-Latin-only text.
        """
        if not text:
            return False
        # Pull out tokens (alnum + apostrophes); lowercase for comparison
        tokens = [t.lower() for t in re.findall(r"[A-Za-zÀ-ÿ0-9']+", text)]
        if not tokens:
            return False
        for t in tokens:
            if t in ARABIZI_WORDS:
                return True
            # Word with an Arabizi letter-digit (must also have real letters)
            if (
                len(t) >= 2
                and any(c in ARABIZI_LETTER_DIGITS for c in t)
                and any(c.isalpha() for c in t)
                and all(c.isalnum() or c == "'" for c in t)
            ):
                return True
        return False

    # ------------------------------------------------------------------ language

    def _has_french(self, text: str) -> bool:
        """True if text contains a strong French indicator word or elision."""
        return bool(FR_ELISIONS_RE.search(text)) or bool(_FR_RE.search(text))

    def _has_english(self, text: str) -> bool:
        """True if text contains a strong English indicator word."""
        return bool(_EN_RE.search(text))

    def detect_language(self, text: str) -> str:
        """Classify into AR / EN / FR / CS.

        See module docstring for the full algorithm.
        """
        if not text or not text.strip():
            return "EN"
        text = text.strip()

        has_arabic = bool(ARABIC_SCRIPT_RE.search(text))
        latin_part = ARABIC_SCRIPT_RE.sub(" ", text).strip()
        has_latin = bool(re.search(r"[A-Za-zÀ-ÿ]", latin_part))

        # 1. Both scripts present -> code-switched
        if has_arabic and has_latin:
            return "CS"

        # 2. Arabic script only
        if has_arabic:
            return "AR"

        # 3. Latin only — Arabizi indicates CS
        if self.detect_arabizi(text):
            return "CS"

        # 4. Both FR and EN words present -> CS
        has_fr = self._has_french(text)
        has_en = self._has_english(text)
        if has_fr and has_en:
            return "CS"

        # 5. Defer to lingua for the FR vs EN decision
        try:
            lang = self._detector.detect_language_of(text)
            if lang == Language.FRENCH:
                return "FR"
            if lang == Language.ENGLISH:
                return "EN"
            if lang == Language.ARABIC:
                # Pure-Arabic only happens if our regex missed; treat as AR.
                return "AR"
        except Exception:
            pass

        # 6. Final tiebreak via word lists
        if has_fr:
            return "FR"
        return "EN"


# ============================================================================
#                       Stand-alone smoke test
# ============================================================================

def _run_smoke_test() -> None:
    """Print language, Arabizi flag, cleaned and normalised form for a few
    hand-picked sample sentences (manual sanity check, no assertions)."""
    preprocessor = MultilingualPreprocessor()
    examples = (
        "ana bde booking بكرا please",
        "j'ai un problème avec mon compte",
        "I want to cancel my order الرجاء",
        "مرحبا hello bonjour كيف حالك",
        "3andi mochkil m3a l'application",
        # extras
        "Hello world",
        "Bonjour tout le monde",
        "كيف حالك يا صديقي العزيز",
        "أهلا بك في موقعنا",
    )
    for sample in examples:
        print(f"{sample!r}")
        print(f"  language : {preprocessor.detect_language(sample)}")
        print(f"  arabizi  : {preprocessor.detect_arabizi(sample)}")
        print(f"  cleaned  : {preprocessor.clean_text(sample)!r}")
        print(f"  norm-AR  : {preprocessor.normalize_arabic(sample)!r}")
        print()


if __name__ == "__main__":
    _run_smoke_test()