"""Reusable multilingual preprocessor for the chatbot. Provides a single class, MultilingualPreprocessor, with these methods: detect_language(text) -> "AR" | "EN" | "FR" | "CS" detect_arabizi(text) -> bool (Arabic written in Latin script) normalize_arabic(text) -> str (strip tashkeel, tatweel; normalize hamza) clean_text(text) -> str (Unicode-NFC, drop URLs/control, collapse ws) tokenize_for_xlmr(text) -> dict ({input_ids, attention_mask}; xlm-roberta-base) Language detection algorithm (in order): 1. Arabic script + Latin script in same string -> CS 2. Only Arabic script -> AR 3. Latin only with Arabizi indicators -> CS 4. Latin only with both FR and EN indicators -> CS 5. Latin only, fall back to lingua-language-detector (decides FR vs EN, with word-list tie-break on low confidence) The lingua detector is built only over {AR, EN, FR} so it cannot mistakenly return some unrelated language. The xlm-roberta tokenizer is loaded lazily on first call (so importing this module is cheap). """ from __future__ import annotations import re import unicodedata from functools import cached_property from typing import Any # pyarabic — pure-python, lightweight, always available in this venv import pyarabic.araby as araby # lingua — fast/accurate language detector (loaded eagerly; small memory) from lingua import Language, LanguageDetectorBuilder # ============================================================================ # Static resources # ============================================================================ # Arabic Unicode range (Arabic + Arabic Supplement + Arabic Extended-A/B) ARABIC_SCRIPT_RE = re.compile(r"[؀-ۿݐ-ݿࢠ-ࣿ]") # Stripping URLs from text (covers http, https, and bare www) URL_RE = re.compile(r"https?://\S+|www\.\S+") # "Letter-digits" used in the Arabic chat alphabet (Arabizi): # 2 = ء/همزة, 3 = ع, 5 = خ, 7 = ح, 9 = ق ARABIZI_LETTER_DIGITS = set("23579") # Common Arabizi tokens (Levantine + MSA flavour). Lowercase form. 
ARABIZI_WORDS: set[str] = {
    "ana", "enta", "enti", "howa", "heya", "ehna", "ento",
    "bde", "bdi", "bidi", "biddi",
    # NOTE(review): "kifak?" / "shou?" can only match if the consumer's
    # tokenizer keeps "?" attached to the word — verify, otherwise drop them.
    "kifak", "kifik", "kifkun", "kifak?",
    "shou", "shu", "eh", "shou?",
    "yalla", "khalas", "wallahi", "wallah", "wala",
    "ma3leesh", "ma3lich", "mafi", "ma3i", "ma3a", "ma3", "m3a",
    "habibi", "habibti", "habayebi",
    "fi", "fih",
    "mochkil", "moshkil", "moshkila", "mushkila",
    "btehki", "lazem", "lezem", "kefi", "shi",
    "hayda", "haydi", "haydak",
    "3andi", "3and", "3andak", "3andik", "3andna",
    "7ub", "7ubbi", "7abibi",
    "9awi", "9ad", "9addesh",
    "5alas", "5all", "5ali",
    "akhouy", "okhti", "yaba", "yumma",
    "tab", "tabe", "ta3", "ta",
}

# Strong French indicators (lowercased, used with word-boundary regex).
# NOTE(review): "comment", "pour", "encore" and "non" are also English
# words/prefixes, so an English sentence containing one of them can look
# French to a pure word-list check — confirm this is acceptable for callers.
FR_WORDS: list[str] = [
    "je", "le", "la", "les", "un", "une", "des", "du", "et", "est",
    "qui", "que", "quoi", "où", "quand", "avec", "pour",
    "ce", "ces", "cette", "dans", "sur", "sous", "vers", "chez",
    "très", "comment", "pourquoi",
    "mon", "ma", "mes", "votre", "vos", "notre", "nos",
    "merci", "bonjour", "salut", "oui", "non",
    "vous", "nous", "tu", "moi", "toi", "lui", "elle",
    "alors", "donc", "mais", "ou", "ni",
    "déjà", "encore", "aussi", "même",
]

# French elision/contraction prefixes — extremely diagnostic.
# Accepts both the ASCII apostrophe (') and the typographic one (’, U+2019):
# phone keyboards and autocorrect routinely emit the latter, and NFC
# normalisation does not convert it to the ASCII form.
FR_ELISIONS_RE = re.compile(
    r"\b(?:jusqu|qu|j|n|l|d|m|s|t|c)['’]",
    re.IGNORECASE,
)

# Strong English indicators.
EN_WORDS: list[str] = [
    "the", "is", "are", "was", "were", "have", "has", "had", "having",
    "i", "you", "your", "yours", "this", "that", "these", "those",
    "what", "how", "why", "where", "when", "with", "for", "to",
    "and", "but", "or", "of", "in", "on", "at", "from", "by",
    "please", "thanks", "thank", "hello", "hi", "want", "need",
    "would", "could", "should", "will", "my", "me",
    "do", "does", "did", "doing", "can", "must", "may", "might",
]


def _word_boundary_re(words: list[str]) -> re.Pattern[str]:
    """Build one case-insensitive regex matching any of ``words`` as a word.

    Uses custom letter-based boundaries instead of ``\\b`` so that a word
    next to an apostrophe still counts as a whole word (e.g. the ``un`` in
    ``d'un``), while a word embedded in a longer run of letters does not.

    Args:
        words: lowercase indicator words; each is regex-escaped.

    Returns:
        A compiled pattern suitable for ``.search()``.
    """
    escaped = [re.escape(w) for w in words]
    # NOTE(review): everything from this pattern down to the __init__
    # signature was lost in the copy of the file under review (the text
    # between "(?" and "-> None:" is missing). The lookarounds below are a
    # reconstruction from the docstring — confirm against the original.
    pat = r"(?<![A-Za-zÀ-ÿ])(?:" + "|".join(escaped) + r")(?![A-Za-zÀ-ÿ])"
    return re.compile(pat, re.IGNORECASE)


# NOTE(review): reconstructed — these names are referenced by
# MultilingualPreprocessor._has_french / _has_english below.
_FR_RE = _word_boundary_re(FR_WORDS)
_EN_RE = _word_boundary_re(EN_WORDS)


# ============================================================================
# Preprocessor
# ============================================================================

class MultilingualPreprocessor:
    """Language detection, cleaning and tokenisation for AR / EN / FR chat text.

    NOTE(review): the class header and original class docstring were lost in
    the reviewed copy; the name is taken from the module docstring and the
    smoke test at the bottom of the file.
    """

    def __init__(self, xlmr_model_name: str = "xlm-roberta-base") -> None:
        """Create the preprocessor.

        Args:
            xlmr_model_name: HuggingFace model id whose tokenizer to load
                lazily for tokenize_for_xlmr(). Default xlm-roberta-base.
        """
        # NOTE(review): signature reconstructed from the Args section and the
        # surviving "-> None:" / ``self._xlmr_name = xlmr_model_name`` lines.
        self._xlmr_name = xlmr_model_name
        # The tokenizer itself lives in the ``tokenizer`` cached_property, so
        # no placeholder attribute is needed here.

        # Build lingua detector over only {AR, EN, FR} so it cannot return
        # any other language by accident.
        self._detector = (
            LanguageDetectorBuilder
            .from_languages(Language.ARABIC, Language.ENGLISH, Language.FRENCH)
            .build()
        )

    # ------------------------------------------------------------------ tokenizer
    @cached_property
    def tokenizer(self) -> Any:
        """Return the tokenizer for ``self._xlmr_name`` (loaded on first access).

        The ``transformers`` import is deferred to this point so that merely
        importing the module does not pull it in.
        """
        from transformers import AutoTokenizer

        return AutoTokenizer.from_pretrained(self._xlmr_name)

    def tokenize_for_xlmr(
        self,
        text: str,
        max_length: int = 128,
        return_tensors: str | None = None,
    ) -> dict[str, Any]:
        """Tokenize a single string with the configured tokenizer.

        Args:
            text: input string.
            max_length: truncation length (defaults to 128).
            return_tensors: 'pt' / 'np' / None. None returns plain Python lists.

        Returns:
            Whatever the tokenizer call returns — expected to be a dict-like
            with at least {input_ids, attention_mask}, optionally tensors.
        """
        return self.tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=return_tensors,
        )

    # ------------------------------------------------------------------ cleaning
    def clean_text(self, text: str) -> str:
        """Normalise unicode (NFC), strip URLs and control chars, collapse ws.

        Args:
            text: raw input; anything that is not a ``str`` yields ``""``.

        Returns:
            The cleaned string with runs of whitespace collapsed to a single
            space and leading/trailing whitespace removed.
        """
        if not isinstance(text, str):
            return ""
        text = unicodedata.normalize("NFC", text)
        text = URL_RE.sub(" ", text)
        # Drop characters in the Unicode "C*" categories (control, format,
        # unassigned, ...) — but keep every whitespace character. "\r", "\v"
        # and "\f" are category Cc; deleting them outright would glue the
        # neighbouring words together ("a\rb" -> "ab"). Keeping them lets the
        # collapse step below turn them into a single space instead.
        text = "".join(
            c for c in text
            if c.isspace() or not unicodedata.category(c).startswith("C")
        )
        return re.sub(r"\s+", " ", text).strip()

    # ------------------------------------------------------------------ Arabic norm
    def normalize_arabic(self, text: str) -> str:
        """Strip tashkeel + tatweel; normalize hamza forms; fold ى -> ي.

        Intended to be safe on non-Arabic text (the pyarabic helpers are
        expected to leave Latin characters untouched — not verified here).

        Args:
            text: input string; falsy values are returned unchanged.

        Returns:
            The normalised string.
        """
        if not text:
            return text
        text = araby.strip_tashkeel(text)
        text = araby.strip_tatweel(text)
        # NOTE(review): the original comment here claimed "أ إ آ -> ا".
        # Confirm against pyarabic's normalize_hamza default method, which may
        # unify hamza carriers to a bare hamza (ء) rather than to alef.
        text = araby.normalize_hamza(text)
        # Mild extra: alef-maksura -> ya (common in Arabic preprocessing).
        text = text.replace("ى", "ي")
        return text

    # ------------------------------------------------------------------ Arabizi
    def detect_arabizi(self, text: str) -> bool:
        """Heuristic: does ``text`` contain Arabic written in Latin script?

        True if either:
          (a) any token is in the hardcoded ARABIZI_WORDS list, or
          (b) any token of length >= 2 mixes letters with a digit from
              {2,3,5,7,9} (the digit acting as a letter).

        Only Latin letters, digits and apostrophes are tokenised; any Arabic
        script in ``text`` is simply ignored, so mixed-script input can still
        return True. Callers that need "Latin-only" semantics must check the
        script themselves (detect_language does).

        NOTE(review): rule (b) also fires on ordinary alphanumerics such as
        "mp3" or "2nd" — known limitation of the heuristic.

        Args:
            text: input string; empty or non-``str`` input returns False.
        """
        if not text or not isinstance(text, str):
            return False
        # Pull out tokens (alnum + apostrophes); lowercase for comparison.
        tokens = [t.lower() for t in re.findall(r"[A-Za-zÀ-ÿ0-9']+", text)]
        for t in tokens:
            if t in ARABIZI_WORDS:
                return True
            # Word with an Arabizi letter-digit (must also have real letters).
            # The final all(...) rejects tokens containing × or ÷, which sit
            # inside the À-ÿ range but are not alphanumeric.
            if (
                len(t) >= 2
                and any(c in ARABIZI_LETTER_DIGITS for c in t)
                and any(c.isalpha() for c in t)
                and all(c.isalnum() or c == "'" for c in t)
            ):
                return True
        return False

    # ------------------------------------------------------------------ language
    def _has_french(self, text: str) -> bool:
        """True if text contains a strong French indicator word or elision."""
        return bool(FR_ELISIONS_RE.search(text)) or bool(_FR_RE.search(text))

    def _has_english(self, text: str) -> bool:
        """True if text contains a strong English indicator word."""
        return bool(_EN_RE.search(text))

    def detect_language(self, text: str) -> str:
        """Classify ``text`` into "AR" / "EN" / "FR" / "CS" (code-switched).

        Args:
            text: input string. Empty, whitespace-only or non-``str`` input
                is reported as "EN" (the default language).

        Returns:
            One of "AR", "EN", "FR", "CS".
        """
        if not isinstance(text, str) or not text.strip():
            return "EN"
        text = text.strip()

        has_arabic = bool(ARABIC_SCRIPT_RE.search(text))
        latin_part = ARABIC_SCRIPT_RE.sub(" ", text).strip()
        has_latin = bool(re.search(r"[A-Za-zÀ-ÿ]", latin_part))

        # 1. Both scripts present -> code-switched
        if has_arabic and has_latin:
            return "CS"
        # 2. Arabic script only
        if has_arabic:
            return "AR"
        # 3. Latin only — Arabizi indicates CS
        if self.detect_arabizi(text):
            return "CS"
        # 4. Both FR and EN words present -> CS
        has_fr = self._has_french(text)
        has_en = self._has_english(text)
        if has_fr and has_en:
            return "CS"

        # 5. Defer to lingua for the FR vs EN decision. Detection stays
        #    best-effort: a detector failure is logged, not raised, and we
        #    fall through to the word-list tiebreak.
        try:
            lang = self._detector.detect_language_of(text)
        except Exception:
            import logging

            logging.getLogger(__name__).warning(
                "lingua detection failed; falling back to word lists",
                exc_info=True,
            )
            lang = None
        if lang == Language.FRENCH:
            return "FR"
        if lang == Language.ENGLISH:
            return "EN"
        if lang == Language.ARABIC:
            # Pure-Arabic only happens if our regex missed; treat as AR.
            return "AR"

        # 6. Final tiebreak via word lists
        return "FR" if has_fr else "EN"


# ============================================================================
# Stand-alone smoke test
# ============================================================================

if __name__ == "__main__":
    pre = MultilingualPreprocessor()
    samples = [
        "ana bde booking بكرا please",
        "j'ai un problème avec mon compte",
        "I want to cancel my order الرجاء",
        "مرحبا hello bonjour كيف حالك",
        "3andi mochkil m3a l'application",
        # extras
        "Hello world",
        "Bonjour tout le monde",
        "كيف حالك يا صديقي العزيز",
        "أهلا بك في موقعنا",
    ]
    for s in samples:
        print(f"{s!r}")
        print(f" language : {pre.detect_language(s)}")
        print(f" arabizi : {pre.detect_arabizi(s)}")
        print(f" cleaned : {pre.clean_text(s)!r}")
        print(f" norm-AR : {pre.normalize_arabic(s)!r}")
        print()