Spaces:
Sleeping
Sleeping
| """Reusable multilingual preprocessor for the chatbot. | |
| Provides a single class, MultilingualPreprocessor, with these methods: | |
| detect_language(text) -> "AR" | "EN" | "FR" | "CS" | |
| detect_arabizi(text) -> bool (Arabic written in Latin script) | |
| normalize_arabic(text) -> str (strip tashkeel, tatweel; normalize hamza) | |
| clean_text(text) -> str (Unicode-NFC, drop URLs/control, collapse ws) | |
| tokenize_for_xlmr(text) -> dict ({input_ids, attention_mask}; xlm-roberta-base) | |
| Language detection algorithm (in order): | |
| 1. Arabic script + Latin script in same string -> CS | |
| 2. Only Arabic script -> AR | |
| 3. Latin only with Arabizi indicators -> CS | |
| 4. Latin only with both FR and EN indicators -> CS | |
| 5. Latin only, fall back to lingua-language-detector | |
| (decides FR vs EN; if lingua returns no usable result, the FR/EN word lists break the tie, defaulting to EN) | |
| The lingua detector is built only over {AR, EN, FR} so it cannot mistakenly | |
| return some unrelated language. The xlm-roberta tokenizer is loaded lazily | |
| on first call (so importing this module is cheap). | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
| from functools import cached_property | |
| from typing import Any | |
| # pyarabic — pure-python, lightweight, always available in this venv | |
| import pyarabic.araby as araby | |
| # lingua — fast/accurate language detector (loaded eagerly; small memory) | |
| from lingua import Language, LanguageDetectorBuilder | |
| # ============================================================================ | |
| # Static resources | |
| # ============================================================================ | |
# Arabic-script code points: Arabic (U+0600-06FF), Arabic Supplement
# (U+0750-077F), Arabic Extended-B (U+0870-089F) and Arabic Extended-A
# (U+08A0-08FF).  Written with \u escapes instead of literal characters:
# U+0600 is an invisible format character, so a literal range that starts
# with it is easily lost in copy/paste, which silently turns the class into
# "hyphen or U+06FF" and stops it matching ordinary Arabic letters.
ARABIC_SCRIPT_RE = re.compile(r"[\u0600-\u06FF\u0750-\u077F\u0870-\u08FF]")

# Matches URLs to strip from text (http, https, and bare www).
URL_RE = re.compile(r"https?://\S+|www\.\S+")

# "Letter-digits" used in the Arabic chat alphabet (Arabizi):
#   2 = ء/همزة,  3 = ع,  5 = خ,  7 = ح,  9 = ق
ARABIZI_LETTER_DIGITS = set("23579")

# Common Arabizi tokens (Levantine + MSA flavour). Lowercase form.
ARABIZI_WORDS: set[str] = {
    "ana", "enta", "enti", "howa", "heya", "ehna", "ento",
    "bde", "bdi", "bidi", "biddi",
    "kifak", "kifik", "kifkun", "kifak?",
    "shou", "shu", "eh", "shou?",
    "yalla", "khalas",
    "wallahi", "wallah", "wala",
    "ma3leesh", "ma3lich", "mafi", "ma3i", "ma3a", "ma3", "m3a",
    "habibi", "habibti", "habayebi",
    "fi", "fih",
    "mochkil", "moshkil", "moshkila", "mushkila",
    "btehki", "lazem", "lezem", "kefi",
    "shi", "hayda", "haydi", "haydak",
    "3andi", "3and", "3andak", "3andik", "3andna",
    "7ub", "7ubbi", "7abibi",
    "9awi", "9ad", "9addesh",
    "5alas", "5all", "5ali",
    "akhouy", "okhti", "yaba", "yumma",
    "tab", "tabe", "ta3", "ta",
}

# Strong French indicators (lowercased, used with word-boundary regex).
FR_WORDS: list[str] = [
    "je", "le", "la", "les", "un", "une", "des", "du",
    "et", "est", "qui", "que", "quoi", "où", "quand",
    "avec", "pour", "ce", "ces", "cette",
    "dans", "sur", "sous", "vers", "chez",
    "très", "comment", "pourquoi", "mon", "ma", "mes",
    "votre", "vos", "notre", "nos",
    "merci", "bonjour", "salut", "oui", "non",
    "vous", "nous", "tu", "moi", "toi", "lui", "elle",
    "alors", "donc", "mais", "ou", "ni",
    "déjà", "encore", "aussi", "même",
]

# French elision/contraction prefixes — extremely diagnostic.
FR_ELISIONS_RE = re.compile(r"\b(?:j'|qu'|n'|l'|d'|m'|s'|t'|c'|jusqu')", re.IGNORECASE)

# Strong English indicators.
EN_WORDS: list[str] = [
    "the", "is", "are", "was", "were",
    "have", "has", "had", "having",
    "i", "you", "your", "yours",
    "this", "that", "these", "those",
    "what", "how", "why", "where", "when",
    "with", "for", "to", "and", "but", "or",
    "of", "in", "on", "at", "from", "by",
    "please", "thanks", "thank", "hello", "hi",
    "want", "need", "would", "could", "should", "will",
    "my", "me", "do", "does", "did", "doing",
    "can", "must", "may", "might",
]
| def _word_boundary_re(words: list[str]) -> re.Pattern[str]: | |
| """Build a single regex that matches any of the given words with custom | |
| boundaries that work for words preceded/followed by letters or apostrophes | |
| (so `j'ai` matches `j` and so does `j'`).""" | |
| escaped = [re.escape(w) for w in words] | |
| pat = r"(?<![a-zA-Zàâäéèêëïîôöùûüç])(?:" + "|".join(escaped) + r")(?![a-zA-Zàâäéèêëïîôöùûüç])" | |
| return re.compile(pat, re.IGNORECASE) | |
# Whole-word matchers for the French and English indicator lists, compiled
# once at import time so the detection methods only pay for a search.
_FR_RE, _EN_RE = (_word_boundary_re(vocab) for vocab in (FR_WORDS, EN_WORDS))
| # ============================================================================ | |
| # Preprocessor | |
| # ============================================================================ | |
| class MultilingualPreprocessor: | |
| """Single-pass preprocessor. Stateless apart from the lazily-built | |
| tokenizer + lingua detector. Safe to instantiate once and reuse. | |
| """ | |
| def __init__(self, xlmr_model_name: str = "xlm-roberta-base") -> None: | |
| """Create the preprocessor. | |
| Args: | |
| xlmr_model_name: HuggingFace model id whose tokenizer to load | |
| lazily for tokenize_for_xlmr(). Default xlm-roberta-base. | |
| """ | |
| self._xlmr_name = xlmr_model_name | |
| self._tokenizer: Any = None # loaded lazily | |
| # Build lingua detector over only {AR, EN, FR} so it cannot return | |
| # any other language by accident. | |
| self._detector = ( | |
| LanguageDetectorBuilder | |
| .from_languages(Language.ARABIC, Language.ENGLISH, Language.FRENCH) | |
| .build() | |
| ) | |
| # ------------------------------------------------------------------ tokenizer | |
| def tokenizer(self) -> Any: | |
| """Return the xlm-roberta-base tokenizer (downloaded on first access).""" | |
| from transformers import AutoTokenizer | |
| return AutoTokenizer.from_pretrained(self._xlmr_name) | |
| def tokenize_for_xlmr( | |
| self, | |
| text: str, | |
| max_length: int = 128, | |
| return_tensors: str | None = None, | |
| ) -> dict[str, Any]: | |
| """Tokenize a single string with the xlm-roberta-base tokenizer. | |
| Args: | |
| text: input string. | |
| max_length: truncation length (defaults to 128). | |
| return_tensors: 'pt' / 'np' / None. None returns plain Python lists. | |
| Returns: | |
| dict with at least {input_ids, attention_mask}, optionally tensors. | |
| """ | |
| return self.tokenizer( | |
| text, | |
| truncation=True, | |
| max_length=max_length, | |
| padding=False, | |
| return_tensors=return_tensors, | |
| ) | |
| # ------------------------------------------------------------------ cleaning | |
| def clean_text(self, text: str) -> str: | |
| """Normalise unicode (NFC), strip URLs and control chars, collapse ws.""" | |
| if not isinstance(text, str): | |
| return "" | |
| # NFC normalisation | |
| text = unicodedata.normalize("NFC", text) | |
| # Strip URLs | |
| text = URL_RE.sub(" ", text) | |
| # Drop control characters (category C*) except common whitespace | |
| text = "".join( | |
| c for c in text | |
| if not unicodedata.category(c).startswith("C") or c in (" ", "\n", "\t") | |
| ) | |
| # Collapse whitespace | |
| text = re.sub(r"\s+", " ", text).strip() | |
| return text | |
| # ------------------------------------------------------------------ Arabic norm | |
| def normalize_arabic(self, text: str) -> str: | |
| """Strip tashkeel + tatweel; normalize hamza forms. | |
| Safe to call on non-Arabic text — pyarabic functions only touch Arabic | |
| characters, so Latin characters pass through unchanged. Also folds | |
| alef-maksura ى -> ي as a mild extra normalisation (very common in | |
| Arabic preprocessing pipelines). | |
| """ | |
| if not text: | |
| return text | |
| text = araby.strip_tashkeel(text) | |
| text = araby.strip_tatweel(text) | |
| text = araby.normalize_hamza(text) # أ إ آ -> ا | |
| # Mild extra: alef-maksura -> ya | |
| text = text.replace("ى", "ي") | |
| return text | |
| # ------------------------------------------------------------------ Arabizi | |
| def detect_arabizi(self, text: str) -> bool: | |
| """Heuristic: Arabic written in Latin script. | |
| True if either: | |
| (a) any token is in our hardcoded Arabizi word list, or | |
| (b) any token contains a digit from {2,3,5,7,9} acting as a letter | |
| (i.e., the token also has letters and is alnum). | |
| Returns False for non-Latin-only text. | |
| """ | |
| if not text: | |
| return False | |
| # Pull out tokens (alnum + apostrophes); lowercase for comparison | |
| tokens = [t.lower() for t in re.findall(r"[A-Za-zÀ-ÿ0-9']+", text)] | |
| if not tokens: | |
| return False | |
| for t in tokens: | |
| if t in ARABIZI_WORDS: | |
| return True | |
| # Word with an Arabizi letter-digit (must also have real letters) | |
| if ( | |
| len(t) >= 2 | |
| and any(c in ARABIZI_LETTER_DIGITS for c in t) | |
| and any(c.isalpha() for c in t) | |
| and all(c.isalnum() or c == "'" for c in t) | |
| ): | |
| return True | |
| return False | |
| # ------------------------------------------------------------------ language | |
| def _has_french(self, text: str) -> bool: | |
| """True if text contains a strong French indicator word or elision.""" | |
| return bool(FR_ELISIONS_RE.search(text)) or bool(_FR_RE.search(text)) | |
| def _has_english(self, text: str) -> bool: | |
| """True if text contains a strong English indicator word.""" | |
| return bool(_EN_RE.search(text)) | |
| def detect_language(self, text: str) -> str: | |
| """Classify into AR / EN / FR / CS. | |
| See module docstring for the full algorithm. | |
| """ | |
| if not text or not text.strip(): | |
| return "EN" | |
| text = text.strip() | |
| has_arabic = bool(ARABIC_SCRIPT_RE.search(text)) | |
| latin_part = ARABIC_SCRIPT_RE.sub(" ", text).strip() | |
| has_latin = bool(re.search(r"[A-Za-zÀ-ÿ]", latin_part)) | |
| # 1. Both scripts present -> code-switched | |
| if has_arabic and has_latin: | |
| return "CS" | |
| # 2. Arabic script only | |
| if has_arabic: | |
| return "AR" | |
| # 3. Latin only — Arabizi indicates CS | |
| if self.detect_arabizi(text): | |
| return "CS" | |
| # 4. Both FR and EN words present -> CS | |
| has_fr = self._has_french(text) | |
| has_en = self._has_english(text) | |
| if has_fr and has_en: | |
| return "CS" | |
| # 5. Defer to lingua for the FR vs EN decision | |
| try: | |
| lang = self._detector.detect_language_of(text) | |
| if lang == Language.FRENCH: | |
| return "FR" | |
| if lang == Language.ENGLISH: | |
| return "EN" | |
| if lang == Language.ARABIC: | |
| # Pure-Arabic only happens if our regex missed; treat as AR. | |
| return "AR" | |
| except Exception: | |
| pass | |
| # 6. Final tiebreak via word lists | |
| if has_fr: | |
| return "FR" | |
| return "EN" | |
| # ============================================================================ | |
| # Stand-alone smoke test | |
| # ============================================================================ | |
if __name__ == "__main__":
    # Manual smoke test: run every public text method over a handful of
    # mixed-language inputs and print the results for eyeballing.
    preprocessor = MultilingualPreprocessor()
    demo_inputs = (
        "ana bde booking بكرا please",
        "j'ai un problème avec mon compte",
        "I want to cancel my order الرجاء",
        "مرحبا hello bonjour كيف حالك",
        "3andi mochkil m3a l'application",
        # extras
        "Hello world",
        "Bonjour tout le monde",
        "كيف حالك يا صديقي العزيز",
        "أهلا بك في موقعنا",
    )
    for sample in demo_inputs:
        detected = preprocessor.detect_language(sample)
        print(f"{sample!r}")
        print(f" language : {detected}")
        print(f" arabizi : {preprocessor.detect_arabizi(sample)}")
        print(f" cleaned : {preprocessor.clean_text(sample)!r}")
        print(f" norm-AR : {preprocessor.normalize_arabic(sample)!r}")
        print()