multilingual-chatbot / src /preprocessor.py
momenalhamza's picture
Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)
469ef7f verified
"""Reusable multilingual preprocessor for the chatbot.
Provides a single class, MultilingualPreprocessor, with these methods:
detect_language(text) -> "AR" | "EN" | "FR" | "CS"
detect_arabizi(text) -> bool (Arabic written in Latin script)
normalize_arabic(text) -> str (strip tashkeel, tatweel; normalize hamza)
clean_text(text) -> str (Unicode-NFC, drop URLs/control, collapse ws)
tokenize_for_xlmr(text) -> dict ({input_ids, attention_mask}; xlm-roberta-base)
Language detection algorithm (in order):
1. Arabic script + Latin script in same string -> CS
2. Only Arabic script -> AR
3. Latin only with Arabizi indicators -> CS
4. Latin only with both FR and EN indicators -> CS
5. Latin only, fall back to lingua-language-detector
(decides FR vs EN; if lingua gives no answer, the word lists break the tie:
FR when a French indicator is present, otherwise EN)
The lingua detector is built only over {AR, EN, FR} so it cannot mistakenly
return some unrelated language. The xlm-roberta tokenizer is loaded lazily
on first call (so importing this module is cheap).
"""
from __future__ import annotations

import logging
import re
import unicodedata
from functools import cached_property
from typing import Any

# pyarabic — pure-python, lightweight, always available in this venv
import pyarabic.araby as araby
# lingua — fast/accurate language detector (loaded eagerly; small memory)
from lingua import Language, LanguageDetectorBuilder
# ============================================================================
# Static resources
# ============================================================================
# Arabic-script detector. The ranges are spelled as \u escapes so the covered
# Unicode blocks are unambiguous in any editor:
#   U+0600-U+06FF  Arabic
#   U+0750-U+077F  Arabic Supplement
#   U+0870-U+08FF  Arabic Extended-B followed by Arabic Extended-A
#   U+FB50-U+FDFF  Arabic Presentation Forms-A
#   U+FE70-U+FEFC  Arabic Presentation Forms-B; deliberately stops before
#                  U+FEFF (byte-order mark / zero-width no-break space) so a
#                  stray BOM is never mistaken for Arabic text
# Presentation forms are listed explicitly because NFC normalisation does not
# fold them back to the base Arabic letters.
ARABIC_SCRIPT_RE = re.compile(
    r"[\u0600-\u06FF\u0750-\u077F\u0870-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]"
)
# Stripping URLs from text (covers http, https, and bare www)
URL_RE = re.compile(r"https?://\S+|www\.\S+")
# "Letter-digits" used in the Arabic chat alphabet (Arabizi):
# 2 = ء/همزة, 3 = ع, 5 = خ, 7 = ح, 9 = ق
ARABIZI_LETTER_DIGITS = set("23579")
# Lowercase Arabizi vocabulary (Levantine + MSA flavour), grouped by theme.
# Entries ending in "?" can only match a tokenizer that keeps punctuation
# attached to the word.
ARABIZI_WORDS: set[str] = {
    # pronouns
    "ana", "enta", "enti", "howa", "heya", "ehna", "ento",
    # kinship / endearments
    "habibi", "habibti", "habayebi", "7abibi", "7ub", "7ubbi",
    "akhouy", "okhti", "yaba", "yumma",
    # "want" / "have"
    "bde", "bdi", "bidi", "biddi",
    "3and", "3andi", "3andak", "3andik", "3andna",
    # greetings and question words
    "kifak", "kifik", "kifkun", "kifak?",
    "shou", "shu", "shou?", "eh",
    "9ad", "9addesh", "9awi",
    # fillers, oaths, interjections
    "yalla", "khalas", "5alas", "5all", "5ali",
    "wallahi", "wallah", "wala",
    "ma3leesh", "ma3lich",
    "tab", "tabe", "ta3", "ta",
    # prepositions / existentials
    "fi", "fih", "mafi",
    "ma3", "ma3a", "ma3i", "m3a",
    # "problem"
    "mochkil", "moshkil", "moshkila", "mushkila",
    # miscellaneous
    "btehki", "lazem", "lezem", "kefi",
    "shi", "hayda", "haydi", "haydak",
}
# Strong French indicators (lowercased, used with word-boundary regex).
# NOTE(review): the list is matched case-insensitively and a few entries
# ("comment", "pour", "encore", "la", "un") are also spelled like English
# words or abbreviations — confirm this is acceptable for the CS heuristic.
FR_WORDS: list[str] = [
    "je", "le", "la", "les", "un", "une", "des", "du",
    "et", "est", "qui", "que", "quoi", "où", "quand",
    "avec", "pour", "ce", "ces", "cette",
    "dans", "sur", "sous", "vers", "chez",
    "très", "comment", "pourquoi", "mon", "ma", "mes",
    "votre", "vos", "notre", "nos",
    "merci", "bonjour", "salut", "oui", "non",
    "vous", "nous", "tu", "moi", "toi", "lui", "elle",
    "alors", "donc", "mais", "ou", "ni",
    "déjà", "encore", "aussi", "même",
]
# French elision/contraction prefixes — extremely diagnostic.
# Accepts both the ASCII apostrophe and the typographic one (U+2019) that
# phone keyboards and autocorrect commonly insert ("j’ai"). The leading \b
# keeps English contractions such as "don't" from matching the "n'" stem.
FR_ELISIONS_RE = re.compile(
    r"\b(?:j|qu|n|l|d|m|s|t|c|jusqu|lorsqu|puisqu|quelqu)['\u2019]",
    re.IGNORECASE,
)
# High-frequency English function words used as language indicators
# (lowercase; matched as whole words, case-insensitively).
EN_WORDS: list[str] = [
    # article, copulas, "have" forms
    "the", "is", "are", "was", "were", "have", "has", "had", "having",
    # pronouns and demonstratives
    "i", "you", "your", "yours", "this", "that", "these", "those",
    # question words
    "what", "how", "why", "where", "when",
    # prepositions and conjunctions
    "with", "for", "to", "and", "but", "or",
    "of", "in", "on", "at", "from", "by",
    # politeness and greetings
    "please", "thanks", "thank", "hello", "hi",
    # modal and request verbs
    "want", "need", "would", "could", "should", "will",
    # first person, "do" forms
    "my", "me", "do", "does", "did", "doing",
    # remaining modals
    "can", "must", "may", "might",
]
def _word_boundary_re(words: list[str]) -> re.Pattern[str]:
    """Compile one case-insensitive regex matching any of *words* as a whole word.

    A hit is rejected only when the character immediately before or after it
    is a letter. Apostrophes, digits, punctuation and whitespace all count as
    boundaries, so ``l'or`` still matches ``or`` while ``order`` does not.

    Args:
        words: literal words to match; each one is regex-escaped.

    Returns:
        A compiled pattern. For an empty *words* list the pattern never
        matches (a bare empty alternation would match everywhere).
    """
    if not words:
        return re.compile(r"(?!)")
    # \w minus digits and underscore: a letter in any script, so neighbours
    # such as "ñ" or "œ" block a match just like ASCII letters do.
    letter = r"[^\W\d_]"
    alternatives = "|".join(re.escape(word) for word in words)
    return re.compile(
        rf"(?<!{letter})(?:{alternatives})(?!{letter})",
        re.IGNORECASE,
    )
# Indicator patterns, compiled once at import time and reused by the
# preprocessor's French / English indicator checks.
_FR_RE, _EN_RE = (
    _word_boundary_re(word_list) for word_list in (FR_WORDS, EN_WORDS)
)
# ============================================================================
# Preprocessor
# ============================================================================
class MultilingualPreprocessor:
    """Preprocess chatbot input written in Arabic, English, French or a mix.

    Public methods: ``detect_language``, ``detect_arabizi``,
    ``normalize_arabic``, ``clean_text`` and ``tokenize_for_xlmr``. The only
    per-instance state is the lingua detector (built in ``__init__``) and the
    tokenizer (loaded on first access), so one instance can be created once
    and reused.
    """

    # Used to report detector failures instead of swallowing them silently.
    _logger = logging.getLogger(__name__)

    def __init__(self, xlmr_model_name: str = "xlm-roberta-base") -> None:
        """Create the preprocessor.

        Args:
            xlmr_model_name: HuggingFace model id whose tokenizer is loaded
                lazily for tokenize_for_xlmr(). Default xlm-roberta-base.
        """
        self._xlmr_name = xlmr_model_name
        # Build the lingua detector over only {AR, EN, FR} so it cannot
        # return any other language by accident.
        self._detector = (
            LanguageDetectorBuilder
            .from_languages(Language.ARABIC, Language.ENGLISH, Language.FRENCH)
            .build()
        )

    # ------------------------------------------------------------------ tokenizer
    @cached_property
    def tokenizer(self) -> Any:
        """Return the tokenizer for the configured model id.

        Loaded with ``AutoTokenizer.from_pretrained`` on first access and then
        cached on the instance by ``cached_property``. ``transformers`` is
        imported here so that importing this module stays cheap.
        """
        from transformers import AutoTokenizer

        return AutoTokenizer.from_pretrained(self._xlmr_name)

    def tokenize_for_xlmr(
        self,
        text: str,
        max_length: int = 128,
        return_tensors: str | None = None,
    ) -> dict[str, Any]:
        """Tokenize a single string with the configured tokenizer.

        Args:
            text: input string.
            max_length: truncation length (defaults to 128).
            return_tensors: 'pt' / 'np' / None. None returns plain Python lists.

        Returns:
            dict with at least {input_ids, attention_mask}, optionally tensors.
        """
        return self.tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=return_tensors,
        )

    # ------------------------------------------------------------------ cleaning
    def clean_text(self, text: str) -> str:
        """Normalise unicode (NFC), strip URLs and control chars, collapse ws.

        Args:
            text: raw user input.

        Returns:
            The cleaned string, or "" when *text* is not a ``str``.
        """
        if not isinstance(text, str):
            return ""
        text = unicodedata.normalize("NFC", text)
        text = URL_RE.sub(" ", text)
        # Drop control/format characters (category C*), but keep every
        # whitespace character: "\r", "\v" and "\f" are category Cc too, and
        # deleting them outright would glue the neighbouring words together
        # ("hello\rworld" -> "helloworld"). They are collapsed to a single
        # space below instead.
        text = "".join(
            ch for ch in text
            if ch.isspace() or not unicodedata.category(ch).startswith("C")
        )
        return re.sub(r"\s+", " ", text).strip()

    # ------------------------------------------------------------------ Arabic norm
    def normalize_arabic(self, text: str) -> str:
        """Strip tashkeel + tatweel; normalize hamza forms; fold ى -> ي.

        Intended to be safe on non-Arabic text: the steps below target Arabic
        characters, so Latin characters are expected to pass through.

        Args:
            text: input string (may be empty or None).

        Returns:
            The normalised string; falsy input is returned unchanged.
        """
        if not text:
            return text
        text = araby.strip_tashkeel(text)
        text = araby.strip_tatweel(text)
        # NOTE(review): an earlier comment here claimed "أ إ آ -> ا", but
        # pyarabic documents the default ("uniform") mode as folding every
        # hamza carrier to a bare hamza (ء); method="tasheel" is the mode that
        # folds to alef/waw/yeh. Behaviour is left unchanged — confirm which
        # form the downstream classifiers were trained on before switching.
        text = araby.normalize_hamza(text)
        # Mild extra: alef-maksura -> ya
        text = text.replace("ى", "ي")
        return text

    # ------------------------------------------------------------------ Arabizi
    def detect_arabizi(self, text: str) -> bool:
        """Heuristic: does *text* contain Arabic written in Latin script?

        True if either:
            (a) any token is in the hardcoded Arabizi word list, or
            (b) any token of length >= 2 contains a digit from {2,3,5,7,9}
                together with at least one letter (the digit is presumed to
                stand for an Arabic letter).

        Only Latin-script tokens are inspected; Arabic-script characters are
        ignored rather than rejected, so mixed-script input can still return
        True. Rule (b) is purely lexical: tokens such as "mp3" satisfy it too.

        Args:
            text: input string.

        Returns:
            True when an Arabizi indicator is found; False otherwise,
            including for empty or non-``str`` input.
        """
        if not isinstance(text, str) or not text:
            return False
        # Pull out tokens (alnum + apostrophes); lowercase for comparison
        tokens = [tok.lower() for tok in re.findall(r"[A-Za-zÀ-ÿ0-9']+", text)]
        for tok in tokens:
            if tok in ARABIZI_WORDS:
                return True
            # Word with an Arabizi letter-digit (must also have real letters)
            if (
                len(tok) >= 2
                and any(ch in ARABIZI_LETTER_DIGITS for ch in tok)
                and any(ch.isalpha() for ch in tok)
                and all(ch.isalnum() or ch == "'" for ch in tok)
            ):
                return True
        return False

    # ------------------------------------------------------------------ language
    def _has_french(self, text: str) -> bool:
        """True if text contains a strong French indicator word or elision."""
        return bool(FR_ELISIONS_RE.search(text)) or bool(_FR_RE.search(text))

    def _has_english(self, text: str) -> bool:
        """True if text contains a strong English indicator word."""
        return bool(_EN_RE.search(text))

    def detect_language(self, text: str) -> str:
        """Classify *text* into "AR", "EN", "FR" or "CS" (code-switched).

        Decision order:
            1. Arabic and Latin letters both present -> "CS".
            2. Arabic script only -> "AR".
            3. Arabizi indicators (see ``detect_arabizi``) -> "CS".
            4. At least one French AND one English indicator -> "CS".
            5. Otherwise ask lingua (restricted to AR/EN/FR).
            6. If lingua gives no answer or fails: "FR" when a French
               indicator was seen, else "EN".

        Known limitation: step 4 fires on a single hit per language, so a
        sentence such as "je me ..." ("je" is a French indicator, "me" an
        English one) is reported as "CS".

        Args:
            text: input string.

        Returns:
            One of "AR", "EN", "FR", "CS". Empty, whitespace-only or
            non-``str`` input yields the default "EN".
        """
        if not isinstance(text, str):
            return "EN"
        text = text.strip()
        if not text:
            return "EN"

        has_arabic = bool(ARABIC_SCRIPT_RE.search(text))
        latin_part = ARABIC_SCRIPT_RE.sub(" ", text)
        has_latin = bool(re.search(r"[A-Za-zÀ-ÿ]", latin_part))

        # 1. Both scripts present -> code-switched
        if has_arabic and has_latin:
            return "CS"
        # 2. Arabic script only
        if has_arabic:
            return "AR"
        # 3. Latin only — Arabizi indicates CS
        if self.detect_arabizi(text):
            return "CS"
        # 4. Both FR and EN words present -> CS
        has_fr = self._has_french(text)
        has_en = self._has_english(text)
        if has_fr and has_en:
            return "CS"
        # 5. Defer to lingua for the FR vs EN decision. Detection is
        #    best-effort: a failure is logged (without the user text) and the
        #    word-list fallback below decides instead.
        try:
            lang = self._detector.detect_language_of(text)
        except Exception:
            self._logger.warning(
                "lingua language detection failed; using word-list fallback",
                exc_info=True,
            )
            lang = None
        if lang == Language.FRENCH:
            return "FR"
        if lang == Language.ENGLISH:
            return "EN"
        if lang == Language.ARABIC:
            # Pure-Arabic only happens if our regex missed; treat as AR.
            return "AR"
        # 6. Final tiebreak via word lists
        return "FR" if has_fr else "EN"
# ============================================================================
# Stand-alone smoke test
# ============================================================================
if __name__ == "__main__":
    # Manual smoke test: run every detector/normaliser over a few mixed-script
    # samples and print the results for eyeballing.
    preprocessor = MultilingualPreprocessor()
    demo_inputs = (
        "ana bde booking بكرا please",
        "j'ai un problème avec mon compte",
        "I want to cancel my order الرجاء",
        "مرحبا hello bonjour كيف حالك",
        "3andi mochkil m3a l'application",
        # extras
        "Hello world",
        "Bonjour tout le monde",
        "كيف حالك يا صديقي العزيز",
        "أهلا بك في موقعنا",
    )
    # (label, method, print result via repr?)
    probes = (
        ("language", preprocessor.detect_language, False),
        ("arabizi", preprocessor.detect_arabizi, False),
        ("cleaned", preprocessor.clean_text, True),
        ("norm-AR", preprocessor.normalize_arabic, True),
    )
    for sample in demo_inputs:
        print(repr(sample))
        for label, probe, as_repr in probes:
            outcome = probe(sample)
            if as_repr:
                print(f" {label} : {outcome!r}")
            else:
                print(f" {label} : {outcome}")
        print()