Spaces:

ParshvPatel
/

Multimodal_Search_Engine

Running

App Files Files Community

Multimodal_Search_Engine / backend /app /engine /nlp.py

ParshvPatel

feat: HuggingFace Spaces deployment

d992912 29 days ago

raw

history blame contribute delete

13.7 kB

	"""
	engine/nlp.py — Multilingual query handling and spell correction.

	Extracted from finalized_search_engine_full_script.py (lines 80-364).
	Contains:
	- MultilingualHandler: language detection + dictionary-based translation
	- SpellCorrector: Norvig-style spell correction built from the product catalog
	"""

	import re
	import logging
	from typing import List, Tuple, Set
	from collections import Counter

	# Public API of this module: only these two classes are exported by
	# `from <module> import *`.
	__all__ = ["MultilingualHandler", "SpellCorrector"]

	# Module-level logger used by both classes below. It is looked up under the
	# fixed name "asos_search" rather than this module's own name
	# (presumably so it shares handlers with the rest of the app — TODO confirm).
	logger = logging.getLogger("asos_search")


	# ═══════════════════════════════════════════════════════════════════════════════
	# MULTILINGUAL SUPPORT — lightweight language detection + translation
	# ═══════════════════════════════════════════════════════════════════════════════
	class MultilingualHandler:
	    """
	    Detect non-English fashion queries and translate them to English.

	    Detection is heuristic: Unicode-range checks for non-Latin scripts,
	    stop-word ("marker") counting for French/Spanish/German/Italian/Portuguese,
	    and finally a lookup of the query's words in ``FASHION_DICT``.
	    Translation is a word-by-word (plus two-word phrase) dictionary lookup.

	    For production, swap this with a proper translation API (Google Translate,
	    DeepL, or a local model like Helsinki-NLP/opus-mt-*).
	    """

	    # Common fashion terms in multiple languages → English
	    FASHION_DICT = {
	        # French
	        'robe': 'dress', 'jupe': 'skirt', 'chemise': 'shirt', 'pantalon': 'trousers',
	        'veste': 'jacket', 'manteau': 'coat', 'chaussures': 'shoes',
	        'bottes': 'boots', 'sac': 'bag', 'ceinture': 'belt',
	        'rouge': 'red', 'bleu': 'blue', 'noir': 'black', 'blanc': 'white',
	        'vert': 'green', 'jaune': 'yellow', 'rose': 'pink', 'gris': 'grey',
	        'violet': 'purple', 'marron': 'brown', 'orange': 'orange',
	        'élégant': 'elegant', 'décontracté': 'casual', 'chic': 'chic',
	        'femme': 'women', 'homme': 'men', 'fille': 'girl',
	        'soie': 'silk', 'coton': 'cotton', 'cuir': 'leather', 'lin': 'linen',
	        'floral': 'floral', 'rayé': 'striped', 'imprimé': 'printed',
	        'été': 'summer', 'hiver': 'winter', 'printemps': 'spring', 'automne': 'autumn',
	        'mini': 'mini', 'maxi': 'maxi', 'midi': 'midi',
	        'pas cher': 'budget', 'luxe': 'luxury', 'bon marché': 'cheap',

	        # Spanish
	        'vestido': 'dress', 'falda': 'skirt', 'camisa': 'shirt',
	        'pantalón': 'trousers', 'pantalones': 'trousers', 'chaqueta': 'jacket',
	        'abrigo': 'coat', 'zapatos': 'shoes', 'botas': 'boots',
	        'bolso': 'bag', 'cinturón': 'belt', 'sombrero': 'hat',
	        'rojo': 'red', 'azul': 'blue', 'negro': 'black', 'blanco': 'white',
	        'verde': 'green', 'amarillo': 'yellow', 'rosado': 'pink', 'morado': 'purple',
	        # 'gris' repeats the French entry with the same value (harmless)
	        'marrón': 'brown', 'gris': 'grey', 'naranja': 'orange',
	        'elegante': 'elegant', 'informal': 'casual', 'moderno': 'modern',
	        'mujer': 'women', 'hombre': 'men', 'barato': 'cheap',
	        'algodón': 'cotton', 'seda': 'silk', 'cuero': 'leather',
	        'verano': 'summer', 'invierno': 'winter',

	        # German
	        'kleid': 'dress', 'rock': 'skirt', 'hemd': 'shirt', 'bluse': 'blouse',
	        'hose': 'trousers', 'jacke': 'jacket', 'mantel': 'coat',
	        'schuhe': 'shoes', 'stiefel': 'boots', 'tasche': 'bag',
	        'gürtel': 'belt', 'hut': 'hat', 'pullover': 'sweater',
	        'rot': 'red', 'blau': 'blue', 'schwarz': 'black', 'weiß': 'white',
	        'weiss': 'white', 'grün': 'green', 'gelb': 'yellow', 'rosa': 'pink',
	        'lila': 'purple', 'braun': 'brown', 'grau': 'grey',
	        'frau': 'women', 'herren': 'men', 'damen': 'women',
	        'seide': 'silk', 'baumwolle': 'cotton', 'leder': 'leather',
	        'sommer': 'summer', 'winter': 'winter',

	        # Italian
	        'abito': 'dress', 'gonna': 'skirt', 'camicia': 'shirt',
	        'giacca': 'jacket', 'cappotto': 'coat', 'scarpe': 'shoes',
	        'stivali': 'boots', 'borsa': 'bag', 'cintura': 'belt',
	        'rosso': 'red', 'blu': 'blue', 'nero': 'black', 'bianco': 'white',
	        'grigio': 'grey', 'giallo': 'yellow', 'donna': 'women', 'uomo': 'men',
	        'seta': 'silk', 'cotone': 'cotton', 'pelle': 'leather',
	        'estate': 'summer', 'inverno': 'winter',

	        # Portuguese ('vestido' repeats the Spanish entry with the same value)
	        'vestido': 'dress', 'saia': 'skirt', 'calça': 'trousers',
	        'jaqueta': 'jacket', 'casaco': 'coat', 'sapatos': 'shoes',
	        'bolsa': 'bag', 'vermelho': 'red', 'preto': 'black', 'branco': 'white',
	        'mulher': 'women', 'homem': 'men',

	        # Japanese (romaji)
	        'doresu': 'dress', 'sukato': 'skirt', 'shatsu': 'shirt',
	        'zubon': 'trousers', 'jaketto': 'jacket', 'kutsu': 'shoes',
	        'baggu': 'bag', 'aka': 'red', 'ao': 'blue', 'kuro': 'black',
	        'shiro': 'white',

	        # Common multilingual fashion terms
	        'kimono': 'kimono', 'sari': 'sari', 'hijab': 'hijab',
	        'kaftan': 'kaftan', 'poncho': 'poncho',
	    }

	    # Dictionary keys that actually change when translated. Identity entries
	    # ('winter' → 'winter', 'mini' → 'mini', ...) are spelled the same in
	    # English, so they are no evidence that a query is foreign.
	    _FOREIGN_TERMS = frozenset(k for k, v in FASHION_DICT.items() if k != v)

	    # Character-range heuristics for script detection
	    _LATIN_EXTENDED = re.compile(r'[àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]', re.I)
	    _CJK = re.compile(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]')
	    _KANA = re.compile(r'[\u3040-\u30ff]')
	    _CYRILLIC = re.compile(r'[\u0400-\u04ff]')
	    _ARABIC = re.compile(r'[\u0600-\u06ff]')
	    _DEVANAGARI = re.compile(r'[\u0900-\u097f]')

	    # A word is a run of Unicode letters delimited by word boundaries. Using
	    # the generic letter class (instead of an explicit accent list) keeps
	    # characters such as 'ß' inside the token, so 'weiß' is found.
	    _WORD = re.compile(r'\b[^\W\d_]+\b')

	    # Stop-word markers per language. Insertion order is the tie-break order
	    # used by ``max`` in ``detect_language`` (French wins ties).
	    _MARKERS = {
	        'fr': frozenset({'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'et',
	                         'en', 'pour', 'avec', 'je', 'ce', 'cette'}),
	        'es': frozenset({'el', 'la', 'los', 'las', 'un', 'una', 'de', 'en', 'y',
	                         'para', 'con', 'por', 'que', 'muy'}),
	        'de': frozenset({'der', 'die', 'das', 'ein', 'eine', 'und', 'für', 'mit',
	                         'ich', 'ist', 'nicht', 'auch'}),
	        'it': frozenset({'il', 'lo', 'la', 'gli', 'le', 'un', 'una', 'di', 'e',
	                         'per', 'con', 'che', 'sono'}),
	        'pt': frozenset({'o', 'a', 'os', 'as', 'um', 'uma', 'de', 'em', 'para',
	                         'com', 'que', 'não'}),
	    }
	    _EN_MARKERS = frozenset({'the', 'a', 'an', 'in', 'on', 'for', 'with', 'and',
	                             'or', 'is', 'are'})

	    # Languages detected by script; the Latin-script dictionary cannot help.
	    _NON_LATIN_LANGS = ('ja', 'zh', 'ar', 'hi', 'ru')

	    @classmethod
	    def detect_language(cls, text: str) -> str:
	        """
	        Return a rough language tag for ``text``.

	        Possible values: 'en', 'fr', 'es', 'de', 'it', 'pt', 'ja', 'zh', 'ar',
	        'hi', 'ru', or 'other' (a Latin-script query that contains a foreign
	        dictionary term but too few stop words to name the language).
	        """
	        # Non-Latin scripts are identified purely by Unicode range.
	        if cls._CJK.search(text):
	            return 'ja' if cls._KANA.search(text) else 'zh'
	        if cls._CYRILLIC.search(text):
	            return 'ru'
	        if cls._ARABIC.search(text):
	            return 'ar'
	        if cls._DEVANAGARI.search(text):
	            return 'hi'

	        tokens = cls._WORD.findall(text.lower())
	        words = set(tokens)

	        # A language needs at least two distinct stop words to be named.
	        scores = {lang: len(words & markers) for lang, markers in cls._MARKERS.items()}
	        best = max(scores, key=scores.get)
	        if scores[best] >= 2:
	            return best

	        # Otherwise fall back to the fashion dictionary. Adjacent word pairs
	        # are included so two-word entries ('pas cher') can be recognised.
	        bigrams = {f"{first} {second}" for first, second in zip(tokens, tokens[1:])}
	        foreign_hits = (words | bigrams) & cls._FOREIGN_TERMS
	        if foreign_hits and not (words & cls._EN_MARKERS):
	            return 'other'

	        return 'en'

	    @classmethod
	    def translate_query(cls, query: str) -> Tuple[str, str, bool]:
	        """
	        Translate a query to English using the fashion dictionary.

	        English queries and non-Latin-script queries are returned unchanged.
	        Any other query is lower-cased, split on whitespace and translated
	        token by token; two-word dictionary phrases take precedence over
	        single words.

	        Returns: (translated_query, detected_language, was_translated)
	        ``was_translated`` is True only if at least one token was replaced by
	        a different English term.
	        """
	        lang = cls.detect_language(query)

	        if lang == 'en':
	            return query, 'en', False

	        # For non-Latin scripts, we can't do dictionary translation
	        if lang in cls._NON_LATIN_LANGS:
	            logger.info("Non-Latin script detected (%s). Passing through to CLIP.", lang)
	            return query, lang, False

	        # Dictionary-based word-by-word translation for Latin-script languages
	        tokens = query.lower().split()
	        translated = []
	        was_translated = False

	        i = 0
	        while i < len(tokens):
	            # Try the 2-word phrase starting here before the single word.
	            phrase = ' '.join(tokens[i:i + 2])
	            if i + 1 < len(tokens) and phrase in cls.FASHION_DICT:
	                source, step = phrase, 2
	            else:
	                source, step = tokens[i], 1

	            target = cls.FASHION_DICT.get(source, source)
	            # Identity entries and unknown words leave the token as it is and
	            # therefore do not count as a translation.
	            if target != source:
	                was_translated = True
	            translated.append(target)
	            i += step

	        result = ' '.join(translated)
	        if was_translated:
	            logger.info('Translated [%s]: "%s" → "%s"', lang, query, result)

	        return result, lang, was_translated


	# ═══════════════════════════════════════════════════════════════════════════════
	# QUERY SPELL-CORRECTION
	# ═══════════════════════════════════════════════════════════════════════════════
	class SpellCorrector:
	    """
	    Lightweight spell correction for fashion search queries.

	    Uses a word-frequency vocabulary built from the product catalog plus a
	    boosted list of common fashion terms. Based on Peter Norvig's spell
	    corrector: a misspelt word is replaced by the most frequent vocabulary
	    word within edit distance 1, or (for words of 5+ letters) distance 2.
	    """

	    _ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
	    _WORD = re.compile(r'\b[a-z]+\b')
	    # Price tokens / numbers, optionally prefixed by a currency symbol
	    _PRICE_TOKEN = re.compile(r'^[£$€]?\d')

	    # Common fashion terms whose frequency is boosted in ``fit``
	    _FASHION_BOOST = (
	        'dress', 'dresses', 'skirt', 'shirt', 'blouse', 'jacket', 'coat',
	        'jeans', 'trousers', 'shorts', 'hoodie', 'sweater', 'cardigan',
	        'boots', 'sneakers', 'trainers', 'sandals', 'heels', 'shoes',
	        'bag', 'handbag', 'tote', 'backpack', 'clutch',
	        'black', 'white', 'blue', 'red', 'green', 'pink', 'yellow',
	        'purple', 'brown', 'grey', 'gray', 'navy', 'beige', 'cream',
	        'casual', 'formal', 'elegant', 'vintage', 'boho', 'minimalist',
	        'streetwear', 'oversized', 'cropped', 'fitted', 'floral',
	        'leather', 'denim', 'satin', 'silk', 'cotton', 'linen',
	        'summer', 'winter', 'spring', 'autumn', 'party', 'office',
	        'midi', 'mini', 'maxi', 'sequin', 'lace', 'velvet',
	    )
	    _BOOST_WEIGHT = 1000

	    # A candidate must be more frequent than these to be accepted
	    _MIN_FREQ_EDIT1 = 10
	    _MIN_FREQ_EDIT2 = 50

	    def __init__(self):
	        self.word_freq: Counter = Counter()
	        self._ready = False

	    def fit(self, texts: List[str]) -> None:
	        """
	        Build vocabulary from product catalog texts.

	        Counts accumulate across calls, and every call adds the fashion-term
	        boost again. Missing values (``None`` / NaN) are skipped so they do
	        not enter the vocabulary as the words 'none' / 'nan'.
	        """
	        for text in texts:
	            if text is None or (isinstance(text, float) and text != text):
	                continue
	            self.word_freq.update(self._WORD.findall(str(text).lower()))

	        # Boost common fashion terms
	        for term in self._FASHION_BOOST:
	            self.word_freq[term] += self._BOOST_WEIGHT

	        self._ready = True
	        logger.info("SpellCorrector fitted with %s words", format(len(self.word_freq), ','))

	    def _edits1(self, word: str) -> Set[str]:
	        """All edits that are one edit distance away from `word`."""
	        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
	        deletes = [head + tail[1:] for head, tail in splits if tail]
	        transposes = [head + tail[1] + tail[0] + tail[2:]
	                      for head, tail in splits if len(tail) > 1]
	        replaces = [head + c + tail[1:]
	                    for head, tail in splits if tail for c in self._ALPHABET]
	        inserts = [head + c + tail for head, tail in splits for c in self._ALPHABET]
	        return set(deletes + transposes + replaces + inserts)

	    def _edits2(self, word: str) -> Set[str]:
	        """All edits that are two edits away from `word`."""
	        return {e2 for e1 in self._edits1(word) for e2 in self._edits1(e1)}

	    def _known(self, words: Set[str]) -> Set[str]:
	        """Subset of words that are in the vocabulary."""
	        # Membership tests cost O(len(words)); no copy of the vocabulary.
	        return {w for w in words if w in self.word_freq}

	    def _pick(self, candidates: Set[str], min_freq: int):
	        """
	        Return the most frequent candidate if its count exceeds ``min_freq``,
	        else ``None``. Frequency ties go to the alphabetically first word so
	        the result does not depend on set iteration order.
	        """
	        if not candidates:
	            return None
	        best = min(candidates, key=lambda w: (-self.word_freq[w], w))
	        return best if self.word_freq[best] > min_freq else None

	    def correct_word(self, word: str) -> str:
	        """
	        Return the most likely spelling correction for a single word.

	        The word is returned unchanged when the corrector is not fitted, the
	        word has 2 or fewer characters, it is already in the vocabulary
	        (case-insensitively), or no sufficiently frequent candidate exists.
	        A correction is always returned in lower case.
	        """
	        if not self._ready or len(word) <= 2:
	            return word

	        word_lower = word.lower()

	        # Already known
	        if word_lower in self.word_freq:
	            return word

	        # Edit distance 1 (only correct if the candidate is common enough)
	        first_edits = self._edits1(word_lower)
	        best = self._pick(self._known(first_edits), self._MIN_FREQ_EDIT1)
	        if best is not None:
	            return best

	        # Edit distance 2 (only for longer words). Filter while generating so
	        # the full set of two-edit strings is never held in memory.
	        if len(word_lower) >= 5:
	            second_known = {
	                e2
	                for e1 in first_edits
	                for e2 in self._edits1(e1)
	                if e2 in self.word_freq
	            }
	            best = self._pick(second_known, self._MIN_FREQ_EDIT2)
	            if best is not None:
	                return best

	        return word

	    def correct_query(self, query: str) -> Tuple[str, bool]:
	        """
	        Correct a full query string.

	        The query is split on whitespace and re-joined with single spaces.
	        Price tokens, numbers and tokens of 2 or fewer characters are kept
	        as they are.

	        Returns: (corrected_query, was_corrected)
	        """
	        if not self._ready:
	            return query, False

	        corrected = []
	        was_corrected = False

	        for token in query.split():
	            # Don't correct price tokens, numbers, or currency symbols
	            if self._PRICE_TOKEN.match(token) or len(token) <= 2:
	                corrected.append(token)
	                continue

	            fixed = self.correct_word(token)
	            if fixed != token:
	                was_corrected = True
	            corrected.append(fixed)

	        result = ' '.join(corrected)
	        if was_corrected:
	            logger.info('Spell-corrected: "%s" → "%s"', query, result)
	        return result, was_corrected