Spaces:

momenalhamza
/

multilingual-chatbot

Sleeping

App Files Files Community

multilingual-chatbot / src /preprocessor.py

momenalhamza

Deploy chatbot: code + RAG + Qwen (3 BERT classifiers loaded from HF Hub)

469ef7f verified 10 days ago

raw

history blame contribute delete

12.4 kB

	"""Reusable multilingual preprocessor for the chatbot.

	Provides a single class, MultilingualPreprocessor, with these methods:

	detect_language(text) -> "AR" | "EN" | "FR" | "CS"
	detect_arabizi(text) -> bool (Arabic written in Latin script)
	normalize_arabic(text) -> str (strip tashkeel, tatweel; normalize hamza)
	clean_text(text) -> str (Unicode-NFC, drop URLs/control, collapse ws)
	tokenize_for_xlmr(text) -> dict ({input_ids, attention_mask}; xlm-roberta-base)

	Language detection algorithm (in order):
	1. Arabic script + Latin script in same string -> CS
	2. Only Arabic script -> AR
	3. Latin only with Arabizi indicators -> CS
	4. Latin only with both FR and EN indicators -> CS
	5. Latin only, fall back to lingua-language-detector
	(decides FR vs EN, with word-list tie-break on low confidence)

	The lingua detector is built only over {AR, EN, FR} so it cannot mistakenly
	return some unrelated language. The xlm-roberta tokenizer is loaded lazily
	on first call (so importing this module is cheap).
	"""

	from __future__ import annotations

	import re
	import unicodedata
	from functools import cached_property
	from typing import Any

	# pyarabic — pure-python, lightweight, always available in this venv
	import pyarabic.araby as araby

	# lingua — fast/accurate language detector (loaded eagerly; small memory)
	from lingua import Language, LanguageDetectorBuilder


	# ============================================================================
	# Static resources
	# ============================================================================

	# Arabic-script detector. Covers the Arabic block (U+0600-U+06FF), Arabic
	# Supplement (U+0750-U+077F) and Arabic Extended-A (U+08A0-U+08FF).
	# NOTE(review): Arabic Extended-B (U+0870-U+089F) is NOT in this class even
	# though an earlier comment claimed it was - add it if those letters matter.
	ARABIC_SCRIPT_RE = re.compile(r"[؀-ۿݐ-ݿࢠ-ࣿ]")

	# URL matcher used by clean_text(): either an explicit http(s):// URL or a
	# bare "www." host, each running up to the next whitespace character.
	# The "|" must be an unescaped alternation; written as "\|" it would only
	# match a literal pipe character and no real URL would ever be stripped.
	URL_RE = re.compile(r"https?://\S+|www\.\S+")

	# "Letter-digits" used in the Arabic chat alphabet (Arabizi):
	# 2 = ء/همزة, 3 = ع, 5 = خ, 7 = ح, 9 = ق
	ARABIZI_LETTER_DIGITS = set("23579")

	# Lowercase Arabizi vocabulary (Levantine + MSA flavour) used as a direct
	# lookup table: a token that appears here is taken as evidence of Arabic
	# written in Latin script.
	ARABIZI_WORDS: set[str] = {
	    # --- spelled with letters only ---
	    "akhouy", "ana",
	    "bde", "bdi", "biddi", "bidi", "btehki",
	    "eh", "ehna", "enta", "enti", "ento",
	    "fi", "fih",
	    "habayebi", "habibi", "habibti", "hayda", "haydak", "haydi",
	    "heya", "howa",
	    "kefi", "khalas", "kifak", "kifik", "kifkun",
	    "lazem", "lezem",
	    "mafi", "mochkil", "moshkil", "moshkila", "mushkila",
	    "okhti",
	    "shi", "shou", "shu",
	    "ta", "tab", "tabe",
	    "wala", "wallah", "wallahi",
	    "yaba", "yalla", "yumma",
	    # --- spelled with a letter-digit (2/3/5/7/9 standing in for a letter) ---
	    "3and", "3andak", "3andi", "3andik", "3andna",
	    "5alas", "5ali", "5all",
	    "7abibi", "7ub", "7ubbi",
	    "9ad", "9addesh", "9awi",
	    "m3a", "ma3", "ma3a", "ma3i", "ma3leesh", "ma3lich",
	    "ta3",
	    # --- entries carrying a trailing "?" ---
	    # Only reachable by direct membership tests: a tokenizer that keeps
	    # just letters, digits and apostrophes never yields a "?".
	    "kifak?", "shou?",
	}

	# French indicator words (lowercase). They are compiled into a single
	# boundary-aware regex elsewhere in this module.
	# NOTE(review): several entries are also ordinary English words or common
	# in English text ("comment", "pour", "encore", "non", "des", "la") and
	# can make an English sentence look mixed - confirm this is acceptable.
	FR_WORDS: list[str] = [
	    "je", "le", "la", "les", "un", "une", "des", "du",
	    "et", "est", "qui", "que", "quoi", "où", "quand",
	    "avec", "pour", "ce", "ces", "cette",
	    "dans", "sur", "sous", "vers", "chez",
	    "très", "comment", "pourquoi", "mon", "ma", "mes",
	    "votre", "vos", "notre", "nos",
	    "merci", "bonjour", "salut", "oui", "non",
	    "vous", "nous", "tu", "moi", "toi", "lui", "elle",
	    "alors", "donc", "mais", "ou", "ni",
	    "déjà", "encore", "aussi", "même",
	]

	# French elision/contraction prefixes (j', qu', n', l', d', m', s', t', c',
	# jusqu') - extremely diagnostic of French.
	# The alternatives must be joined with a real "|"; escaped as "\|" the
	# pattern would only match the literal text "j'|qu'|...". Both the ASCII
	# apostrophe and the typographic one (U+2019, what most phone keyboards
	# insert) are accepted. "\b" keeps English contractions such as "don't"
	# from matching, because there is no word boundary before their last
	# letter.
	FR_ELISIONS_RE = re.compile(
	    r"\b(?:j|qu|n|l|d|m|s|t|c|jusqu)['’]",
	    re.IGNORECASE,
	)

	# English indicator words (lowercase): high-frequency function words plus
	# a few chat staples. Presence of any of them counts as English evidence.
	EN_WORDS: list[str] = [
	    # article, "be" and "have" forms
	    "the", "is", "are", "was", "were", "have", "has", "had", "having",
	    # pronouns and demonstratives
	    "i", "you", "your", "yours", "this", "that", "these", "those",
	    # question words
	    "what", "how", "why", "where", "when",
	    # prepositions and conjunctions
	    "with", "for", "to", "and", "but", "or",
	    "of", "in", "on", "at", "from", "by",
	    # politeness and greetings
	    "please", "thanks", "thank", "hello", "hi",
	    # wants, needs and modals
	    "want", "need", "would", "could", "should", "will",
	    # first person, "do" forms and remaining modals
	    "my", "me", "do", "does", "did", "doing",
	    "can", "must", "may", "might",
	]


	def _word_boundary_re(words: list[str]) -> re.Pattern[str]:
	"""Build a single regex that matches any of the given words with custom
	boundaries that work for words preceded/followed by letters or apostrophes
	(so `j'ai` matches `j` and so does `j'`)."""
	escaped = [re.escape(w) for w in words]
	pat = r"(?<![a-zA-Zàâäéèêëïîôöùûüç])(?:" + "\|".join(escaped) + r")(?![a-zA-Zàâäéèêëïîôöùûüç])"
	return re.compile(pat, re.IGNORECASE)


	# Compile the two indicator word lists once, at import time. The class
	# below only ever calls .search() on these (see _has_french / _has_english).
	_FR_RE = _word_boundary_re(FR_WORDS)
	_EN_RE = _word_boundary_re(EN_WORDS)


	# ============================================================================
	# Preprocessor
	# ============================================================================

	class MultilingualPreprocessor:
	    """Language detection and text normalisation for AR / EN / FR chat input.

	    The lingua detector is built eagerly in ``__init__``; the HuggingFace
	    tokenizer is only loaded on first access of :attr:`tokenizer`. No
	    per-call state is kept, so one instance can be created and reused.
	    """

	    # Helper patterns compiled once for the whole class.
	    _LATIN_LETTER_RE = re.compile(r"[A-Za-zÀ-ÿ]")
	    _LATIN_TOKEN_RE = re.compile(r"[A-Za-zÀ-ÿ0-9']+")
	    _WHITESPACE_RE = re.compile(r"\s+")

	    def __init__(self, xlmr_model_name: str = "xlm-roberta-base") -> None:
	        """Create the preprocessor.

	        Args:
	            xlmr_model_name: HuggingFace model id whose tokenizer is loaded
	                lazily for tokenize_for_xlmr(). Default xlm-roberta-base.
	        """
	        self._xlmr_name = xlmr_model_name
	        # Restrict lingua to {AR, EN, FR} so it cannot return any other
	        # language by accident.
	        self._detector = (
	            LanguageDetectorBuilder
	            .from_languages(Language.ARABIC, Language.ENGLISH, Language.FRENCH)
	            .build()
	        )

	    # ------------------------------------------------------------------ tokenizer

	    @cached_property
	    def tokenizer(self) -> Any:
	        """Return the tokenizer for ``xlmr_model_name``.

	        Loaded on first access and then cached on the instance by
	        ``cached_property``. ``transformers`` is imported here rather than
	        at module level so importing this module stays cheap.
	        """
	        from transformers import AutoTokenizer
	        return AutoTokenizer.from_pretrained(self._xlmr_name)

	    def tokenize_for_xlmr(
	        self,
	        text: str,
	        max_length: int = 128,
	        return_tensors: str | None = None,
	    ) -> dict[str, Any]:
	        """Tokenize a single string with the configured tokenizer.

	        Args:
	            text: input string.
	            max_length: truncation length (defaults to 128).
	            return_tensors: 'pt' / 'np' / None. None returns plain Python lists.

	        Returns:
	            The tokenizer's output for ``text``, truncated to ``max_length``
	            and not padded. Presumably a mapping with at least
	            ``input_ids`` and ``attention_mask`` - confirm against the
	            tokenizer in use.
	        """
	        return self.tokenizer(
	            text,
	            truncation=True,
	            max_length=max_length,
	            padding=False,
	            return_tensors=return_tensors,
	        )

	    # ------------------------------------------------------------------ cleaning

	    def clean_text(self, text: str) -> str:
	        """Normalise unicode (NFC), strip URLs and control chars, collapse ws.

	        Args:
	            text: raw input. Anything that is not a ``str`` yields ``""``.

	        Returns:
	            The cleaned string with single spaces between words and no
	            leading/trailing whitespace.
	        """
	        if not isinstance(text, str):
	            return ""
	        text = unicodedata.normalize("NFC", text)
	        text = URL_RE.sub(" ", text)
	        # Drop characters in any Unicode "C*" category (control, format, ...),
	        # but turn every whitespace character - including control-category
	        # ones such as "\r", "\v" and "\f" - into a plain space first.
	        # Dropping them outright would glue neighbouring words together
	        # ("a\rb" -> "ab").
	        text = "".join(
	            " " if ch.isspace() else ch
	            for ch in text
	            if ch.isspace() or not unicodedata.category(ch).startswith("C")
	        )
	        return self._WHITESPACE_RE.sub(" ", text).strip()

	    # ------------------------------------------------------------------ Arabic norm

	    def normalize_arabic(self, text: str) -> str:
	        """Strip tashkeel + tatweel; normalize hamza forms; fold ى -> ي.

	        Intended to be safe on non-Arabic or mixed text: the pyarabic
	        helpers are expected to leave Latin characters untouched.

	        Args:
	            text: input string; falsy input is returned unchanged.

	        Returns:
	            The normalised string.
	        """
	        if not text:
	            return text
	        text = araby.strip_tashkeel(text)
	        text = araby.strip_tatweel(text)
	        # NOTE(review): the original comment claimed this maps "أ إ آ -> ا".
	        # pyarabic's default normalize_hamza method appears to unify hamza
	        # forms to a bare hamza (ء) instead - confirm against the installed
	        # pyarabic version and switch method if alef-folding is what the
	        # downstream models were trained on.
	        text = araby.normalize_hamza(text)
	        # Mild extra: alef-maksura -> ya
	        return text.replace("ى", "ي")

	    # ------------------------------------------------------------------ Arabizi

	    def detect_arabizi(self, text: str) -> bool:
	        """Heuristic: is this Arabic written in Latin script?

	        True if any token (runs of Latin letters, digits and apostrophes,
	        lowercased) either:
	          (a) is in the hardcoded ARABIZI_WORDS list, or
	          (b) is at least two characters long, contains a digit from
	              {2,3,5,7,9} and at least one letter, and consists only of
	              alphanumerics/apostrophes.

	        NOTE(review): rule (b) also fires on ordinary tokens such as
	        "mp3" or "covid19" - confirm that is acceptable for callers.

	        Args:
	            text: input string.

	        Returns:
	            True when an Arabizi indicator is found, else False (including
	            for empty input or text with no Latin/digit tokens).
	        """
	        if not text:
	            return False
	        for raw in self._LATIN_TOKEN_RE.findall(text):
	            token = raw.lower()
	            if token in ARABIZI_WORDS:
	                return True
	            # Letter-digit acting as a letter: needs real letters next to it.
	            if (
	                len(token) >= 2
	                and any(c in ARABIZI_LETTER_DIGITS for c in token)
	                and any(c.isalpha() for c in token)
	                and all(c.isalnum() or c == "'" for c in token)
	            ):
	                return True
	        return False

	    # ------------------------------------------------------------------ language

	    def _has_french(self, text: str) -> bool:
	        """True if text contains a French indicator word or elision."""
	        return bool(FR_ELISIONS_RE.search(text)) or bool(_FR_RE.search(text))

	    def _has_english(self, text: str) -> bool:
	        """True if text contains an English indicator word."""
	        return bool(_EN_RE.search(text))

	    def detect_language(self, text: str) -> str:
	        """Classify text as "AR", "EN", "FR" or "CS" (code-switched).

	        Decision order:
	          1. Arabic script and Latin letters together -> "CS"
	          2. Arabic script only -> "AR"
	          3. Arabizi indicators -> "CS"
	          4. Both French and English indicator words -> "CS"
	          5. Otherwise ask lingua (restricted to AR/EN/FR)
	          6. If lingua gives no answer, "FR" when a French indicator was
	             seen, else "EN"

	        Args:
	            text: input string.

	        Returns:
	            One of "AR", "EN", "FR", "CS". Empty or whitespace-only input
	            returns "EN".
	        """
	        if not text or not text.strip():
	            return "EN"
	        text = text.strip()

	        has_arabic = bool(ARABIC_SCRIPT_RE.search(text))
	        # Blank out Arabic characters, then look for Latin letters in the rest.
	        latin_part = ARABIC_SCRIPT_RE.sub(" ", text).strip()
	        has_latin = bool(self._LATIN_LETTER_RE.search(latin_part))

	        # 1. Both scripts present -> code-switched
	        if has_arabic and has_latin:
	            return "CS"

	        # 2. Arabic script only
	        if has_arabic:
	            return "AR"

	        # 3. Latin only - Arabizi indicates CS
	        if self.detect_arabizi(text):
	            return "CS"

	        # 4. Both FR and EN words present -> CS
	        has_fr = self._has_french(text)
	        has_en = self._has_english(text)
	        if has_fr and has_en:
	            return "CS"

	        # 5. Defer to lingua for the FR vs EN decision. Detection is
	        # best-effort: a failure is logged at debug level and the word-list
	        # fallback below decides instead of the request crashing.
	        try:
	            lang = self._detector.detect_language_of(text)
	        except Exception:
	            import logging

	            logging.getLogger(__name__).debug(
	                "lingua detection failed; using word-list fallback",
	                exc_info=True,
	            )
	            lang = None

	        if lang == Language.FRENCH:
	            return "FR"
	        if lang == Language.ENGLISH:
	            return "EN"
	        if lang == Language.ARABIC:
	            # Only reachable if the Arabic-script regex missed; treat as AR.
	            return "AR"

	        # 6. Final tiebreak via word lists
	        return "FR" if has_fr else "EN"


	# ============================================================================
	# Stand-alone smoke test
	# ============================================================================

	if __name__ == "__main__":
	    # Smoke test: run every text-level method over a handful of mixed
	    # AR / EN / FR / Arabizi samples and print the results for eyeballing.
	    pre = MultilingualPreprocessor()

	    # (label, method, renderer) - str() for plain values, repr() where the
	    # exact string content (quotes, escapes) should be visible.
	    probes = (
	        (" language : ", pre.detect_language, str),
	        (" arabizi : ", pre.detect_arabizi, str),
	        (" cleaned : ", pre.clean_text, repr),
	        (" norm-AR : ", pre.normalize_arabic, repr),
	    )

	    samples = (
	        "ana bde booking بكرا please",
	        "j'ai un problème avec mon compte",
	        "I want to cancel my order الرجاء",
	        "مرحبا hello bonjour كيف حالك",
	        "3andi mochkil m3a l'application",
	        # extras
	        "Hello world",
	        "Bonjour tout le monde",
	        "كيف حالك يا صديقي العزيز",
	        "أهلا بك في موقعنا",
	    )

	    for sample in samples:
	        print(repr(sample))
	        for label, probe, render in probes:
	            print(label + render(probe(sample)))
	        print()