Spaces:
Sleeping
Sleeping
| """ | |
| Advanced Text Processor with Ultimate Optimization | |
| Handles normalization, fuzzy matching, and intelligent tokenization | |
| """ | |
| import re | |
| import unicodedata | |
| from functools import lru_cache | |
| class AdvancedTextProcessor: | |
| """Ultra-fast text processing with caching and optimizations""" | |
| def __init__(self): | |
| # Pre-compile regex patterns for speed | |
| self._special_chars_pattern = re.compile(r'[^\w\s-]') | |
| self._whitespace_pattern = re.compile(r'\s+') | |
| self._number_pattern = re.compile(r'\d+') | |
| # Fast lookup tables (pre-computed) | |
| self._accent_map = self._build_accent_map() | |
| self._abbreviations = { | |
| 'cf': 'center forward', 'cam': 'attacking midfielder', | |
| 'cdm': 'defensive midfielder', 'lb': 'left back', | |
| 'rb': 'right back', 'cb': 'center back', | |
| 'gk': 'goalkeeper', 'st': 'striker', | |
| 'cm': 'central midfielder', 'lw': 'left wing', | |
| 'rw': 'right wing', 'lm': 'left mid', 'rm': 'right mid' | |
| } | |
| def _build_accent_map(self): | |
| """Pre-build accent removal map for O(1) lookups""" | |
| accents = { | |
| '': 'e', '': 'e', '': 'e', '': 'e', | |
| '': 'a', '': 'a', '': 'a', '': 'a', '': 'a', '': 'a', | |
| '': 'i', '': 'i', '': 'i', '': 'i', | |
| '': 'o', '': 'o', '': 'o', '': 'o', '': 'o', | |
| '': 'u', '': 'u', '': 'u', '': 'u', | |
| '': 'n', '': 'c', '': 'ss', '': 'y', '': 'y' | |
| } | |
| # Add uppercase versions | |
| for k, v in list(accents.items()): | |
| accents[k.upper()] = v.upper() | |
| return accents | |
| def normalize_text(self, text): | |
| """ | |
| Ultra-fast normalization with LRU cache | |
| Converts: Mbapp mbappe, So Paulo sao paulo | |
| """ | |
| if not text: | |
| return "" | |
| text = str(text).lower() | |
| # Fast accent removal using pre-built map | |
| chars = [] | |
| for c in text: | |
| chars.append(self._accent_map.get(c, c)) | |
| text = ''.join(chars) | |
| # Unicode normalization (fallback for missed characters) | |
| text = unicodedata.normalize('NFKD', text) | |
| text = ''.join([c for c in text if not unicodedata.combining(c)]) | |
| # Remove special characters (keep hyphens) | |
| text = self._special_chars_pattern.sub(' ', text) | |
| # Normalize whitespace | |
| text = self._whitespace_pattern.sub(' ', text).strip() | |
| return text | |
| def tokenize(self, text): | |
| """Fast tokenization with caching""" | |
| normalized = self.normalize_text(text) | |
| tokens = normalized.split() | |
| # Expand abbreviations inline | |
| result = [] | |
| for token in tokens: | |
| if token in self._abbreviations: | |
| result.append(self._abbreviations[token]) | |
| else: | |
| result.append(token) | |
| # Handle hyphenated names | |
| if '-' in token: | |
| result.extend(token.split('-')) | |
| return tuple(result) # Tuple for caching | |
| def quick_match(self, query, text): | |
| """ | |
| Ultra-fast substring matching | |
| Returns match score (0-1) | |
| """ | |
| q_norm = self.normalize_text(query) | |
| t_norm = self.normalize_text(text) | |
| if not q_norm or not t_norm: | |
| return 0.0 | |
| # Exact match | |
| if q_norm == t_norm: | |
| return 1.0 | |
| # Contains match | |
| if q_norm in t_norm: | |
| return 0.9 | |
| # Token overlap | |
| q_tokens = set(q_norm.split()) | |
| t_tokens = set(t_norm.split()) | |
| if q_tokens and t_tokens: | |
| overlap = len(q_tokens & t_tokens) / len(q_tokens | t_tokens) | |
| return overlap * 0.7 | |
| return 0.0 | |
| def fuzzy_similarity(self, str1, str2, threshold=0.6): | |
| """ | |
| Fast fuzzy matching using optimized Jaro-Winkler | |
| Only computes if strings are similar length | |
| """ | |
| s1 = self.normalize_text(str1) | |
| s2 = self.normalize_text(str2) | |
| if not s1 or not s2: | |
| return 0.0 | |
| # Quick reject if length difference is too large | |
| len_diff = abs(len(s1) - len(s2)) | |
| if len_diff > max(len(s1), len(s2)) * 0.5: | |
| return 0.0 | |
| # Simple edit distance approximation | |
| if s1 == s2: | |
| return 1.0 | |
| # Character overlap score (fast approximation) | |
| chars1 = set(s1) | |
| chars2 = set(s2) | |
| overlap = len(chars1 & chars2) / len(chars1 | chars2) | |
| return overlap | |
| # Global singleton | |
| _text_processor = None | |
| def get_text_processor(): | |
| """Get singleton instance""" | |
| global _text_processor | |
| if _text_processor is None: | |
| _text_processor = AdvancedTextProcessor() | |
| return _text_processor | |