""" engine/nlp.py — Multilingual query handling and spell correction. Extracted from finalized_search_engine_full_script.py (lines 80-364). Contains: - MultilingualHandler: language detection + dictionary-based translation - SpellCorrector: Norvig-style spell correction built from the product catalog """ import re import logging from typing import List, Tuple, Set from collections import Counter __all__ = ["MultilingualHandler", "SpellCorrector"] logger = logging.getLogger("asos_search") # ═══════════════════════════════════════════════════════════════════════════════ # MULTILINGUAL SUPPORT — lightweight language detection + translation # ═══════════════════════════════════════════════════════════════════════════════ class MultilingualHandler: """ Detects non-English queries and translates them to English using a dictionary-based approach for common fashion terms in major languages. For production, swap this with a proper translation API (Google Translate, DeepL, or a local model like Helsinki-NLP/opus-mt-*). """ # Common fashion terms in multiple languages → English FASHION_DICT = { # French 'robe': 'dress', 'jupe': 'skirt', 'chemise': 'shirt', 'pantalon': 'trousers', 'veste': 'jacket', 'manteau': 'coat', 'chaussures': 'shoes', 'bottes': 'boots', 'sac': 'bag', 'ceinture': 'belt', 'rouge': 'red', 'bleu': 'blue', 'noir': 'black', 'blanc': 'white', 'vert': 'green', 'jaune': 'yellow', 'rose': 'pink', 'gris': 'grey', 'violet': 'purple', 'marron': 'brown', 'orange': 'orange', 'élégant': 'elegant', 'décontracté': 'casual', 'chic': 'chic', 'femme': 'women', 'homme': 'men', 'fille': 'girl', 'soie': 'silk', 'coton': 'cotton', 'cuir': 'leather', 'lin': 'linen', 'floral': 'floral', 'rayé': 'striped', 'imprimé': 'printed', 'été': 'summer', 'hiver': 'winter', 'printemps': 'spring', 'automne': 'autumn', 'mini': 'mini', 'maxi': 'maxi', 'midi': 'midi', 'pas cher': 'budget', 'luxe': 'luxury', 'bon marché': 'cheap', # Spanish 'vestido': 'dress', 'falda': 'skirt', 'camisa': 'shirt', 'pantalón': 'trousers', 'pantalones': 'trousers', 'chaqueta': 'jacket', 'abrigo': 'coat', 'zapatos': 'shoes', 'botas': 'boots', 'bolso': 'bag', 'cinturón': 'belt', 'sombrero': 'hat', 'rojo': 'red', 'azul': 'blue', 'negro': 'black', 'blanco': 'white', 'verde': 'green', 'amarillo': 'yellow', 'rosado': 'pink', 'morado': 'purple', 'marrón': 'brown', 'gris': 'grey', 'naranja': 'orange', 'elegante': 'elegant', 'informal': 'casual', 'moderno': 'modern', 'mujer': 'women', 'hombre': 'men', 'barato': 'cheap', 'algodón': 'cotton', 'seda': 'silk', 'cuero': 'leather', 'verano': 'summer', 'invierno': 'winter', # German 'kleid': 'dress', 'rock': 'skirt', 'hemd': 'shirt', 'bluse': 'blouse', 'hose': 'trousers', 'jacke': 'jacket', 'mantel': 'coat', 'schuhe': 'shoes', 'stiefel': 'boots', 'tasche': 'bag', 'gürtel': 'belt', 'hut': 'hat', 'pullover': 'sweater', 'rot': 'red', 'blau': 'blue', 'schwarz': 'black', 'weiß': 'white', 'weiss': 'white', 'grün': 'green', 'gelb': 'yellow', 'rosa': 'pink', 'lila': 'purple', 'braun': 'brown', 'grau': 'grey', 'frau': 'women', 'herren': 'men', 'damen': 'women', 'seide': 'silk', 'baumwolle': 'cotton', 'leder': 'leather', 'sommer': 'summer', 'winter': 'winter', # Italian 'abito': 'dress', 'gonna': 'skirt', 'camicia': 'shirt', 'giacca': 'jacket', 'cappotto': 'coat', 'scarpe': 'shoes', 'stivali': 'boots', 'borsa': 'bag', 'cintura': 'belt', 'rosso': 'red', 'blu': 'blue', 'nero': 'black', 'bianco': 'white', 'grigio': 'grey', 'giallo': 'yellow', 'donna': 'women', 'uomo': 'men', 'seta': 'silk', 'cotone': 'cotton', 'pelle': 'leather', 'estate': 'summer', 'inverno': 'winter', # Portuguese 'vestido': 'dress', 'saia': 'skirt', 'calça': 'trousers', 'jaqueta': 'jacket', 'casaco': 'coat', 'sapatos': 'shoes', 'bolsa': 'bag', 'vermelho': 'red', 'preto': 'black', 'branco': 'white', 'mulher': 'women', 'homem': 'men', # Japanese (romaji) 'doresu': 'dress', 'sukato': 'skirt', 'shatsu': 'shirt', 'zubon': 'trousers', 'jaketto': 'jacket', 'kutsu': 'shoes', 'baggu': 'bag', 'aka': 'red', 'ao': 'blue', 'kuro': 'black', 'shiro': 'white', # Common multilingual fashion terms 'kimono': 'kimono', 'sari': 'sari', 'hijab': 'hijab', 'kaftan': 'kaftan', 'poncho': 'poncho', } # Character-range heuristics for script detection _LATIN_EXTENDED = re.compile(r'[àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]', re.I) _CJK = re.compile(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]') _CYRILLIC = re.compile(r'[\u0400-\u04ff]') _ARABIC = re.compile(r'[\u0600-\u06ff]') _DEVANAGARI = re.compile(r'[\u0900-\u097f]') @classmethod def detect_language(cls, text: str) -> str: """Return a rough language tag: 'en', 'fr', 'es', 'de', 'it', 'pt', 'ja', 'zh', 'ar', 'hi', 'ru', or 'other'.""" if cls._CJK.search(text): return 'ja' if re.search(r'[\u3040-\u30ff]', text) else 'zh' if cls._CYRILLIC.search(text): return 'ru' if cls._ARABIC.search(text): return 'ar' if cls._DEVANAGARI.search(text): return 'hi' words = set(re.findall(r'\b[a-zàáâãäåæçèéêëìíîïñòóôõöùúûüýÿ]+\b', text.lower())) # French markers fr_markers = {'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'et', 'en', 'pour', 'avec', 'je', 'ce', 'cette'} es_markers = {'el', 'la', 'los', 'las', 'un', 'una', 'de', 'en', 'y', 'para', 'con', 'por', 'que', 'muy'} de_markers = {'der', 'die', 'das', 'ein', 'eine', 'und', 'für', 'mit', 'ich', 'ist', 'nicht', 'auch'} it_markers = {'il', 'lo', 'la', 'gli', 'le', 'un', 'una', 'di', 'e', 'per', 'con', 'che', 'sono'} pt_markers = {'o', 'a', 'os', 'as', 'um', 'uma', 'de', 'em', 'para', 'com', 'que', 'não'} scores = { 'fr': len(words & fr_markers), 'es': len(words & es_markers), 'de': len(words & de_markers), 'it': len(words & it_markers), 'pt': len(words & pt_markers), } best = max(scores, key=scores.get) if scores[best] >= 2: return best # Check if any words are in our fashion dictionary dict_words = words & set(cls.FASHION_DICT.keys()) en_words = {'the', 'a', 'an', 'in', 'on', 'for', 'with', 'and', 'or', 'is', 'are'} if dict_words and not (words & en_words): return 'other' return 'en' @classmethod def translate_query(cls, query: str) -> Tuple[str, str, bool]: """ Translate a query to English using the fashion dictionary. Returns: (translated_query, detected_language, was_translated) """ lang = cls.detect_language(query) if lang == 'en': return query, 'en', False # For non-Latin scripts, we can't do dictionary translation if lang in ('ja', 'zh', 'ar', 'hi', 'ru'): logger.info(f"Non-Latin script detected ({lang}). Passing through to CLIP.") return query, lang, False # Dictionary-based word-by-word translation for Latin-script languages words = query.lower().split() translated = [] was_translated = False i = 0 while i < len(words): # Try 2-word phrases first if i + 1 < len(words): bigram = f"{words[i]} {words[i+1]}" if bigram in cls.FASHION_DICT: translated.append(cls.FASHION_DICT[bigram]) was_translated = True i += 2 continue word = words[i] if word in cls.FASHION_DICT: translated.append(cls.FASHION_DICT[word]) was_translated = True else: translated.append(word) i += 1 result = ' '.join(translated) if was_translated: logger.info(f"Translated [{lang}]: \"{query}\" → \"{result}\"") return result, lang, was_translated # ═══════════════════════════════════════════════════════════════════════════════ # QUERY SPELL-CORRECTION # ═══════════════════════════════════════════════════════════════════════════════ class SpellCorrector: """ Lightweight spell correction for fashion search queries. Uses a vocabulary built from the product catalog + common fashion terms. Based on Peter Norvig's spell corrector algorithm. """ def __init__(self): self.word_freq: Counter = Counter() self._ready = False def fit(self, texts: List[str]): """Build vocabulary from product catalog texts.""" for text in texts: words = re.findall(r'\b[a-z]+\b', str(text).lower()) self.word_freq.update(words) # Boost common fashion terms fashion_boost = [ 'dress', 'dresses', 'skirt', 'shirt', 'blouse', 'jacket', 'coat', 'jeans', 'trousers', 'shorts', 'hoodie', 'sweater', 'cardigan', 'boots', 'sneakers', 'trainers', 'sandals', 'heels', 'shoes', 'bag', 'handbag', 'tote', 'backpack', 'clutch', 'black', 'white', 'blue', 'red', 'green', 'pink', 'yellow', 'purple', 'brown', 'grey', 'gray', 'navy', 'beige', 'cream', 'casual', 'formal', 'elegant', 'vintage', 'boho', 'minimalist', 'streetwear', 'oversized', 'cropped', 'fitted', 'floral', 'leather', 'denim', 'satin', 'silk', 'cotton', 'linen', 'summer', 'winter', 'spring', 'autumn', 'party', 'office', 'midi', 'mini', 'maxi', 'sequin', 'lace', 'velvet', ] for w in fashion_boost: self.word_freq[w] += 1000 self._ready = True logger.info(f"SpellCorrector fitted with {len(self.word_freq):,} words") def _edits1(self, word: str) -> Set[str]: """All edits that are one edit distance away from `word`.""" letters = 'abcdefghijklmnopqrstuvwxyz' splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] deletes = [L + R[1:] for L, R in splits if R] transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1] replaces = [L + c + R[1:] for L, R in splits if R for c in letters] inserts = [L + c + R for L, R in splits for c in letters] return set(deletes + transposes + replaces + inserts) def _edits2(self, word: str) -> Set[str]: """All edits that are two edits away from `word`.""" return set(e2 for e1 in self._edits1(word) for e2 in self._edits1(e1)) def _known(self, words: Set[str]) -> Set[str]: """Subset of words that are in the vocabulary.""" return words & set(self.word_freq.keys()) def correct_word(self, word: str) -> str: """Return the most likely spelling correction for a single word.""" if not self._ready or len(word) <= 2: return word word_lower = word.lower() # Already known if word_lower in self.word_freq: return word # Edit distance 1 candidates = self._known(self._edits1(word_lower)) if candidates: best = max(candidates, key=self.word_freq.get) if self.word_freq[best] > 10: # Only correct if the candidate is common enough return best # Edit distance 2 (only for longer words) if len(word_lower) >= 5: candidates = self._known(self._edits2(word_lower)) if candidates: best = max(candidates, key=self.word_freq.get) if self.word_freq[best] > 50: return best return word def correct_query(self, query: str) -> Tuple[str, bool]: """ Correct a full query string. Returns: (corrected_query, was_corrected) """ if not self._ready: return query, False words = query.split() corrected = [] was_corrected = False for word in words: # Don't correct price tokens, numbers, or currency symbols if re.match(r'^[£$€]?\d', word) or len(word) <= 2: corrected.append(word) continue fixed = self.correct_word(word) if fixed != word: was_corrected = True corrected.append(fixed) else: corrected.append(word) result = ' '.join(corrected) if was_corrected: logger.info(f"Spell-corrected: \"{query}\" → \"{result}\"") return result, was_corrected