# Source: Hugging Face Space "ParshvPatel/Multimodal_Search_Engine" (status: Running)
# File size: 13,720 bytes · revision d992912

"""
engine/nlp.py — Multilingual query handling and spell correction.

Extracted from finalized_search_engine_full_script.py (lines 80-364).
Contains:
  - MultilingualHandler: language detection + dictionary-based translation
  - SpellCorrector: Norvig-style spell correction built from the product catalog
"""

import re
import logging
from typing import List, Tuple, Set
from collections import Counter

__all__ = ["MultilingualHandler", "SpellCorrector"]

logger = logging.getLogger("asos_search")


# ═══════════════════════════════════════════════════════════════════════════════
# MULTILINGUAL SUPPORT — lightweight language detection + translation
# ═══════════════════════════════════════════════════════════════════════════════
class MultilingualHandler:
    """
    Detects non-English queries and translates them to English using a
    dictionary-based approach for common fashion terms in major languages.
    For production, swap this with a proper translation API (Google Translate,
    DeepL, or a local model like Helsinki-NLP/opus-mt-*).

    All methods are classmethods; the class holds only lookup tables and
    compiled patterns, so it never needs to be instantiated.
    """

    # Common fashion terms in multiple languages → English.
    # Entries that map a term to itself (e.g. 'orange', 'mini') are loanwords
    # that are already valid English: they pass through translation unchanged
    # and are NOT treated as evidence that a query is non-English.
    FASHION_DICT = {
        # French
        'robe': 'dress', 'jupe': 'skirt', 'chemise': 'shirt', 'pantalon': 'trousers',
        'veste': 'jacket', 'manteau': 'coat', 'chaussures': 'shoes',
        'bottes': 'boots', 'sac': 'bag', 'ceinture': 'belt',
        'rouge': 'red', 'bleu': 'blue', 'noir': 'black', 'blanc': 'white',
        'vert': 'green', 'jaune': 'yellow', 'rose': 'pink', 'gris': 'grey',
        'violet': 'purple', 'marron': 'brown', 'orange': 'orange',
        'élégant': 'elegant', 'décontracté': 'casual', 'chic': 'chic',
        'femme': 'women', 'homme': 'men', 'fille': 'girl',
        'soie': 'silk', 'coton': 'cotton', 'cuir': 'leather', 'lin': 'linen',
        'floral': 'floral', 'rayé': 'striped', 'imprimé': 'printed',
        'été': 'summer', 'hiver': 'winter', 'printemps': 'spring', 'automne': 'autumn',
        'mini': 'mini', 'maxi': 'maxi', 'midi': 'midi',
        'pas cher': 'budget', 'luxe': 'luxury', 'bon marché': 'cheap',

        # Spanish ('gris' → 'grey' is already listed under French)
        'vestido': 'dress', 'falda': 'skirt', 'camisa': 'shirt',
        'pantalón': 'trousers', 'pantalones': 'trousers', 'chaqueta': 'jacket',
        'abrigo': 'coat', 'zapatos': 'shoes', 'botas': 'boots',
        'bolso': 'bag', 'cinturón': 'belt', 'sombrero': 'hat',
        'rojo': 'red', 'azul': 'blue', 'negro': 'black', 'blanco': 'white',
        'verde': 'green', 'amarillo': 'yellow', 'rosado': 'pink', 'morado': 'purple',
        'marrón': 'brown', 'naranja': 'orange',
        'elegante': 'elegant', 'informal': 'casual', 'moderno': 'modern',
        'mujer': 'women', 'hombre': 'men', 'barato': 'cheap',
        'algodón': 'cotton', 'seda': 'silk', 'cuero': 'leather',
        'verano': 'summer', 'invierno': 'winter',

        # German
        'kleid': 'dress', 'rock': 'skirt', 'hemd': 'shirt', 'bluse': 'blouse',
        'hose': 'trousers', 'jacke': 'jacket', 'mantel': 'coat',
        'schuhe': 'shoes', 'stiefel': 'boots', 'tasche': 'bag',
        'gürtel': 'belt', 'hut': 'hat', 'pullover': 'sweater',
        'rot': 'red', 'blau': 'blue', 'schwarz': 'black', 'weiß': 'white',
        'weiss': 'white', 'grün': 'green', 'gelb': 'yellow', 'rosa': 'pink',
        'lila': 'purple', 'braun': 'brown', 'grau': 'grey',
        'frau': 'women', 'herren': 'men', 'damen': 'women',
        'seide': 'silk', 'baumwolle': 'cotton', 'leder': 'leather',
        'sommer': 'summer', 'winter': 'winter',

        # Italian
        'abito': 'dress', 'gonna': 'skirt', 'camicia': 'shirt',
        'giacca': 'jacket', 'cappotto': 'coat', 'scarpe': 'shoes',
        'stivali': 'boots', 'borsa': 'bag', 'cintura': 'belt',
        'rosso': 'red', 'blu': 'blue', 'nero': 'black', 'bianco': 'white',
        'grigio': 'grey', 'giallo': 'yellow', 'donna': 'women', 'uomo': 'men',
        'seta': 'silk', 'cotone': 'cotton', 'pelle': 'leather',
        'estate': 'summer', 'inverno': 'winter',

        # Portuguese ('vestido' → 'dress' is already listed under Spanish)
        'saia': 'skirt', 'calça': 'trousers',
        'jaqueta': 'jacket', 'casaco': 'coat', 'sapatos': 'shoes',
        'bolsa': 'bag', 'vermelho': 'red', 'preto': 'black', 'branco': 'white',
        'mulher': 'women', 'homem': 'men',

        # Japanese (romaji)
        'doresu': 'dress', 'sukato': 'skirt', 'shatsu': 'shirt',
        'zubon': 'trousers', 'jaketto': 'jacket', 'kutsu': 'shoes',
        'baggu': 'bag', 'aka': 'red', 'ao': 'blue', 'kuro': 'black',
        'shiro': 'white',

        # Common multilingual fashion terms
        'kimono': 'kimono', 'sari': 'sari', 'hijab': 'hijab',
        'kaftan': 'kaftan', 'poncho': 'poncho',
    }

    # Character-range heuristics for script detection
    _LATIN_EXTENDED = re.compile(r'[àáâãäåæçèéêëìíîïðñòóôõöùúûüýþÿ]', re.I)
    _CJK = re.compile(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]')
    _KANA = re.compile(r'[\u3040-\u30ff]')  # hiragana/katakana ⇒ Japanese rather than Chinese
    _CYRILLIC = re.compile(r'[\u0400-\u04ff]')
    _ARABIC = re.compile(r'[\u0600-\u06ff]')
    _DEVANAGARI = re.compile(r'[\u0900-\u097f]')

    # A "word" is a run of Unicode letters (no digits, no underscore) delimited
    # by word boundaries. Using the Unicode letter class instead of a
    # hand-written list keeps letters such as 'ß' (as in 'weiß') matchable.
    _WORD = re.compile(r'\b[^\W\d_]+\b')

    # Function words used to score Latin-script languages. Insertion order
    # matters: on a tied score the first language listed here wins.
    _LANGUAGE_MARKERS = {
        'fr': frozenset({'le', 'la', 'les', 'un', 'une', 'des', 'du', 'de', 'et', 'en', 'pour', 'avec', 'je', 'ce', 'cette'}),
        'es': frozenset({'el', 'la', 'los', 'las', 'un', 'una', 'de', 'en', 'y', 'para', 'con', 'por', 'que', 'muy'}),
        'de': frozenset({'der', 'die', 'das', 'ein', 'eine', 'und', 'für', 'mit', 'ich', 'ist', 'nicht', 'auch'}),
        'it': frozenset({'il', 'lo', 'la', 'gli', 'le', 'un', 'una', 'di', 'e', 'per', 'con', 'che', 'sono'}),
        'pt': frozenset({'o', 'a', 'os', 'as', 'um', 'uma', 'de', 'em', 'para', 'com', 'que', 'não'}),
    }
    _ENGLISH_MARKERS = frozenset({'the', 'a', 'an', 'in', 'on', 'for', 'with', 'and', 'or', 'is', 'are'})

    # Languages detected by script that the Latin-script dictionary cannot translate.
    _NON_LATIN_LANGS = ('ja', 'zh', 'ar', 'hi', 'ru')

    @classmethod
    def _is_foreign_term(cls, term: str) -> bool:
        """Return True if `term` is in FASHION_DICT and maps to a *different* English term."""
        english = cls.FASHION_DICT.get(term)
        return english is not None and english != term

    @classmethod
    def detect_language(cls, text: str) -> str:
        """
        Return a rough language tag for `text`.

        Detection order:
          1. Script ranges: CJK ('ja' if kana is present, else 'zh'),
             Cyrillic ('ru'), Arabic ('ar'), Devanagari ('hi').
          2. Function-word scoring for 'fr', 'es', 'de', 'it', 'pt'; a language
             needs at least two distinct marker words to win.
          3. If the text contains no English function word and at least one
             dictionary term (single word or two-word phrase) whose translation
             differs from the term itself, return 'other'.
          4. Otherwise 'en'.

        Args:
            text: The raw user query.

        Returns:
            One of 'en', 'fr', 'es', 'de', 'it', 'pt', 'ja', 'zh', 'ar', 'hi',
            'ru', or 'other' (Latin-script text recognised only via FASHION_DICT).
        """
        if cls._CJK.search(text):
            return 'ja' if cls._KANA.search(text) else 'zh'
        if cls._CYRILLIC.search(text):
            return 'ru'
        if cls._ARABIC.search(text):
            return 'ar'
        if cls._DEVANAGARI.search(text):
            return 'hi'

        tokens = cls._WORD.findall(text.lower())
        words = set(tokens)

        scores = {
            lang: len(words & markers)
            for lang, markers in cls._LANGUAGE_MARKERS.items()
        }
        best = max(scores, key=scores.get)
        if scores[best] >= 2:
            return best

        # Any English function word outweighs dictionary hits.
        if words & cls._ENGLISH_MARKERS:
            return 'en'

        # Only terms that would actually change under translation count as
        # evidence; identity entries such as 'orange' or 'mini' are plain English.
        if any(cls._is_foreign_term(word) for word in words):
            return 'other'
        # FASHION_DICT also holds two-word phrases ('pas cher', 'bon marché').
        if any(cls._is_foreign_term(f"{first} {second}")
               for first, second in zip(tokens, tokens[1:])):
            return 'other'

        return 'en'

    @classmethod
    def translate_query(cls, query: str) -> Tuple[str, str, bool]:
        """
        Translate a query to English using the fashion dictionary.

        English queries and non-Latin-script queries are returned unchanged.
        Other queries are lowercased, split on whitespace and translated token
        by token, trying two-word phrases before single words; unknown tokens
        are kept as they are.

        Args:
            query: The raw user query.

        Returns:
            (translated_query, detected_language, was_translated), where
            was_translated is True only if at least one token was replaced by
            a different English term.
        """
        lang = cls.detect_language(query)

        if lang == 'en':
            return query, 'en', False

        # For non-Latin scripts, we can't do dictionary translation
        if lang in cls._NON_LATIN_LANGS:
            logger.info(f"Non-Latin script detected ({lang}). Passing through to CLIP.")
            return query, lang, False

        # Dictionary-based word-by-word translation for Latin-script languages
        words = query.lower().split()
        translated = []
        was_translated = False

        i = 0
        while i < len(words):
            # Try 2-word phrases first
            if i + 1 < len(words):
                bigram = f"{words[i]} {words[i+1]}"
                english = cls.FASHION_DICT.get(bigram)
                if english is not None:
                    translated.append(english)
                    if english != bigram:
                        was_translated = True
                    i += 2
                    continue

            word = words[i]
            english = cls.FASHION_DICT.get(word, word)
            translated.append(english)
            # Identity mappings (and unknown words) are not a translation.
            if english != word:
                was_translated = True
            i += 1

        result = ' '.join(translated)
        if was_translated:
            logger.info(f"Translated [{lang}]: \"{query}\" → \"{result}\"")

        return result, lang, was_translated


# ═══════════════════════════════════════════════════════════════════════════════
# QUERY SPELL-CORRECTION
# ═══════════════════════════════════════════════════════════════════════════════
class SpellCorrector:
    """
    Lightweight spell correction for fashion search queries.
    Uses a vocabulary built from the product catalog + common fashion terms.
    Based on Peter Norvig's spell corrector algorithm.

    Usage: call `fit()` once with catalog texts, then `correct_query()` (or
    `correct_word()`) per request. Until `fit()` has run, both correction
    methods return their input unchanged.
    """

    # A candidate is accepted only if its vocabulary frequency is strictly
    # greater than the threshold for the edit distance at which it was found.
    EDIT1_MIN_FREQ = 10
    EDIT2_MIN_FREQ = 50
    # Distance-2 search is only attempted for words at least this long.
    EDIT2_MIN_LENGTH = 5
    # Frequency added to each boosted fashion term on every fit() call.
    FASHION_BOOST = 1000

    _FASHION_TERMS = (
        'dress', 'dresses', 'skirt', 'shirt', 'blouse', 'jacket', 'coat',
        'jeans', 'trousers', 'shorts', 'hoodie', 'sweater', 'cardigan',
        'boots', 'sneakers', 'trainers', 'sandals', 'heels', 'shoes',
        'bag', 'handbag', 'tote', 'backpack', 'clutch',
        'black', 'white', 'blue', 'red', 'green', 'pink', 'yellow',
        'purple', 'brown', 'grey', 'gray', 'navy', 'beige', 'cream',
        'casual', 'formal', 'elegant', 'vintage', 'boho', 'minimalist',
        'streetwear', 'oversized', 'cropped', 'fitted', 'floral',
        'leather', 'denim', 'satin', 'silk', 'cotton', 'linen',
        'summer', 'winter', 'spring', 'autumn', 'party', 'office',
        'midi', 'mini', 'maxi', 'sequin', 'lace', 'velvet',
    )

    _VOCAB_WORD = re.compile(r'\b[a-z]+\b')
    # Tokens starting with an optional currency symbol followed by a digit.
    _PRICE_OR_NUMBER = re.compile(r'^[£$€]?\d')

    def __init__(self):
        self.word_freq: Counter = Counter()
        self._ready = False

    def fit(self, texts: List[str]):
        """
        Build vocabulary from product catalog texts.

        Each text is stringified, lowercased and split into runs of a-z; the
        counts are added to `word_freq`, then every built-in fashion term is
        boosted by FASHION_BOOST. Counts accumulate across calls, so calling
        `fit()` again adds to (rather than replaces) the existing vocabulary.

        Args:
            texts: Iterable of catalog strings (non-string items are passed
                through `str()`).
        """
        for text in texts:
            self.word_freq.update(self._VOCAB_WORD.findall(str(text).lower()))

        # Boost common fashion terms
        for term in self._FASHION_TERMS:
            self.word_freq[term] += self.FASHION_BOOST

        self._ready = True
        logger.info(f"SpellCorrector fitted with {len(self.word_freq):,} words")

    def _edits1(self, word: str) -> Set[str]:
        """All edits that are one edit distance away from `word`."""
        letters = 'abcdefghijklmnopqrstuvwxyz'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def _edits2(self, word: str) -> Set[str]:
        """All edits that are two edits away from `word`."""
        return set(e2 for e1 in self._edits1(word) for e2 in self._edits1(e1))

    def _known(self, words: Set[str]) -> Set[str]:
        """Subset of words that are in the vocabulary."""
        # Membership tests against the Counter avoid copying the whole
        # vocabulary into a fresh set on every call.
        return {w for w in words if w in self.word_freq}

    def _best_candidate(self, candidates: Set[str], min_freq: int) -> str:
        """
        Pick the most frequent candidate, or '' if none is frequent enough.

        Ties on frequency are broken alphabetically so the result does not
        depend on set iteration order (which varies with string-hash
        randomization between processes).
        """
        if not candidates:
            return ''
        best = min(candidates, key=lambda w: (-self.word_freq[w], w))
        return best if self.word_freq[best] > min_freq else ''

    def correct_word(self, word: str) -> str:
        """
        Return the most likely spelling correction for a single word.

        Words of two characters or fewer, and words already in the vocabulary
        (compared case-insensitively), are returned unchanged with their
        original casing. A correction, when found, is returned in lowercase.

        Args:
            word: A single query token.

        Returns:
            The corrected word, or `word` itself if no sufficiently common
            candidate exists within edit distance 2.
        """
        if not self._ready or len(word) <= 2:
            return word

        word_lower = word.lower()

        # Already known
        if word_lower in self.word_freq:
            return word

        # Edit distance 1
        best = self._best_candidate(
            self._known(self._edits1(word_lower)), self.EDIT1_MIN_FREQ
        )
        if best:
            return best

        # Edit distance 2 (only for longer words; requires a more common candidate)
        if len(word_lower) >= self.EDIT2_MIN_LENGTH:
            best = self._best_candidate(
                self._known(self._edits2(word_lower)), self.EDIT2_MIN_FREQ
            )
            if best:
                return best

        return word

    def correct_query(self, query: str) -> Tuple[str, bool]:
        """
        Correct a full query string.

        The query is split on whitespace and re-joined with single spaces.
        Tokens that start with a digit (optionally preceded by £, $ or €) and
        tokens of two characters or fewer are left untouched.

        Returns: (corrected_query, was_corrected)
        """
        if not self._ready:
            return query, False

        corrected = []
        was_corrected = False

        for word in query.split():
            # Don't correct price tokens, numbers, or currency symbols
            if self._PRICE_OR_NUMBER.match(word) or len(word) <= 2:
                corrected.append(word)
                continue

            fixed = self.correct_word(word)
            if fixed != word:
                was_corrected = True
            corrected.append(fixed)

        result = ' '.join(corrected)
        if was_corrected:
            logger.info(f"Spell-corrected: \"{query}\" → \"{result}\"")
        return result, was_corrected