Spaces:

Ali00922
/

scoutsearch

Sleeping

File size: 4,983 Bytes

da6a0a4

"""
Advanced Text Processor with Ultimate Optimization
Handles normalization, fuzzy matching, and intelligent tokenization
"""

import re
import unicodedata
from functools import lru_cache

class AdvancedTextProcessor:
    """Ultra-fast text processing with caching and optimizations"""
    
    def __init__(self):
        # Pre-compile regex patterns for speed
        self._special_chars_pattern = re.compile(r'[^\w\s-]')
        self._whitespace_pattern = re.compile(r'\s+')
        self._number_pattern = re.compile(r'\d+')
        
        # Fast lookup tables (pre-computed)
        self._accent_map = self._build_accent_map()
        self._abbreviations = {
            'cf': 'center forward', 'cam': 'attacking midfielder',
            'cdm': 'defensive midfielder', 'lb': 'left back',
            'rb': 'right back', 'cb': 'center back',
            'gk': 'goalkeeper', 'st': 'striker',
            'cm': 'central midfielder', 'lw': 'left wing',
            'rw': 'right wing', 'lm': 'left mid', 'rm': 'right mid'
        }
    
    def _build_accent_map(self):
        """Pre-build accent removal map for O(1) lookups"""
        accents = {
            '': 'e', '': 'e', '': 'e', '': 'e',
            '': 'a', '': 'a', '': 'a', '': 'a', '': 'a', '': 'a',
            '': 'i', '': 'i', '': 'i', '': 'i',
            '': 'o', '': 'o', '': 'o', '': 'o', '': 'o',
            '': 'u', '': 'u', '': 'u', '': 'u',
            '': 'n', '': 'c', '': 'ss', '': 'y', '': 'y'
        }
        # Add uppercase versions
        for k, v in list(accents.items()):
            accents[k.upper()] = v.upper()
        return accents
    
    @lru_cache(maxsize=10000)
    def normalize_text(self, text):
        """
        Ultra-fast normalization with LRU cache
        Converts: Mbapp  mbappe, So Paulo  sao paulo
        """
        if not text:
            return ""
        
        text = str(text).lower()
        
        # Fast accent removal using pre-built map
        chars = []
        for c in text:
            chars.append(self._accent_map.get(c, c))
        text = ''.join(chars)
        
        # Unicode normalization (fallback for missed characters)
        text = unicodedata.normalize('NFKD', text)
        text = ''.join([c for c in text if not unicodedata.combining(c)])
        
        # Remove special characters (keep hyphens)
        text = self._special_chars_pattern.sub(' ', text)
        
        # Normalize whitespace
        text = self._whitespace_pattern.sub(' ', text).strip()
        
        return text
    
    @lru_cache(maxsize=5000)
    def tokenize(self, text):
        """Fast tokenization with caching"""
        normalized = self.normalize_text(text)
        tokens = normalized.split()
        
        # Expand abbreviations inline
        result = []
        for token in tokens:
            if token in self._abbreviations:
                result.append(self._abbreviations[token])
            else:
                result.append(token)
                # Handle hyphenated names
                if '-' in token:
                    result.extend(token.split('-'))
        
        return tuple(result)  # Tuple for caching
    
    def quick_match(self, query, text):
        """
        Ultra-fast substring matching
        Returns match score (0-1)
        """
        q_norm = self.normalize_text(query)
        t_norm = self.normalize_text(text)
        
        if not q_norm or not t_norm:
            return 0.0
        
        # Exact match
        if q_norm == t_norm:
            return 1.0
        
        # Contains match
        if q_norm in t_norm:
            return 0.9
        
        # Token overlap
        q_tokens = set(q_norm.split())
        t_tokens = set(t_norm.split())
        if q_tokens and t_tokens:
            overlap = len(q_tokens & t_tokens) / len(q_tokens | t_tokens)
            return overlap * 0.7
        
        return 0.0
    
    def fuzzy_similarity(self, str1, str2, threshold=0.6):
        """
        Fast fuzzy matching using optimized Jaro-Winkler
        Only computes if strings are similar length
        """
        s1 = self.normalize_text(str1)
        s2 = self.normalize_text(str2)
        
        if not s1 or not s2:
            return 0.0
        
        # Quick reject if length difference is too large
        len_diff = abs(len(s1) - len(s2))
        if len_diff > max(len(s1), len(s2)) * 0.5:
            return 0.0
        
        # Simple edit distance approximation
        if s1 == s2:
            return 1.0
        
        # Character overlap score (fast approximation)
        chars1 = set(s1)
        chars2 = set(s2)
        overlap = len(chars1 & chars2) / len(chars1 | chars2)
        
        return overlap

# Global singleton
_text_processor = None

def get_text_processor():
    """Get singleton instance"""
    global _text_processor
    if _text_processor is None:
        _text_processor = AdvancedTextProcessor()
    return _text_processor