"""
Advanced Text Processor with Ultimate Optimization
Handles normalization, fuzzy matching, and intelligent tokenization
"""
import re
import unicodedata
from functools import lru_cache
class AdvancedTextProcessor:
    """Text normalization, tokenization and lightweight similarity scoring.

    Normalization lowercases the input, strips accents, replaces every
    character that is not a word character, whitespace or hyphen with a
    space, and collapses runs of whitespace. Results of ``normalize_text``
    and ``tokenize`` are memoized per instance.
    """

    def __init__(self):
        """Compile the regex patterns, build lookup tables and set up caches."""
        # Anything that is not a word character, whitespace or a hyphen.
        self._special_chars_pattern = re.compile(r'[^\w\s-]')
        self._whitespace_pattern = re.compile(r'\s+')
        # Not referenced by any method of this class; retained in case other
        # code reads the attribute.
        self._number_pattern = re.compile(r'\d+')

        self._accent_map = self._build_accent_map()
        # Abbreviation -> expansion. An expansion is emitted by ``tokenize``
        # as ONE token even when it contains a space.
        self._abbreviations = {
            'cf': 'center forward', 'cam': 'attacking midfielder',
            'cdm': 'defensive midfielder', 'lb': 'left back',
            'rb': 'right back', 'cb': 'center back',
            'gk': 'goalkeeper', 'st': 'striker',
            'cm': 'central midfielder', 'lw': 'left wing',
            'rw': 'right wing', 'lm': 'left mid', 'rm': 'right mid',
        }

        # Per-instance caches. Decorating the methods themselves with
        # ``lru_cache`` would create one class-level cache that keeps every
        # instance alive and rejects unhashable arguments; wrapping the bound
        # helpers here ties each cache's lifetime to its instance instead.
        self._normalize_cached = lru_cache(maxsize=10000)(self._normalize_uncached)
        self._tokenize_cached = lru_cache(maxsize=5000)(self._tokenize_uncached)

    def _build_accent_map(self):
        """Return a dict mapping accented characters to ASCII replacements.

        Keys are written as escapes so the table survives tools that drop
        non-ASCII characters. Single-character uppercase variants are added
        as well; ``normalize_text`` lowercases before the lookup, so those
        only matter to code that consults the map directly.
        """
        accents = {
            # e with acute, grave, circumflex, diaeresis
            '\u00e9': 'e', '\u00e8': 'e', '\u00ea': 'e', '\u00eb': 'e',
            # a with acute, grave, circumflex, diaeresis, tilde, ring
            '\u00e1': 'a', '\u00e0': 'a', '\u00e2': 'a', '\u00e4': 'a',
            '\u00e3': 'a', '\u00e5': 'a',
            # i with acute, grave, circumflex, diaeresis
            '\u00ed': 'i', '\u00ec': 'i', '\u00ee': 'i', '\u00ef': 'i',
            # o with acute, grave, circumflex, diaeresis, tilde
            '\u00f3': 'o', '\u00f2': 'o', '\u00f4': 'o', '\u00f6': 'o',
            '\u00f5': 'o',
            # u with acute, grave, circumflex, diaeresis
            '\u00fa': 'u', '\u00f9': 'u', '\u00fb': 'u', '\u00fc': 'u',
            # n tilde, c cedilla, sharp s, y acute, y diaeresis
            '\u00f1': 'n', '\u00e7': 'c', '\u00df': 'ss',
            '\u00fd': 'y', '\u00ff': 'y',
        }
        for lower, replacement in list(accents.items()):
            upper = lower.upper()
            # Sharp s uppercases to the two-letter "SS", which can never
            # match in a per-character lookup, so it is skipped.
            if len(upper) == 1:
                accents[upper] = replacement.upper()
        return accents

    def _normalize_uncached(self, text: str) -> str:
        """Normalize an already-stringified, non-empty *text* (no caching)."""
        text = text.lower()
        accent_map = self._accent_map
        text = ''.join(accent_map.get(ch, ch) for ch in text)
        # Fallback for accented characters the map does not list: decompose,
        # then drop the combining marks.
        text = unicodedata.normalize('NFKD', text)
        text = ''.join(ch for ch in text if not unicodedata.combining(ch))
        # Hyphens are kept; every other special character becomes a space.
        text = self._special_chars_pattern.sub(' ', text)
        return self._whitespace_pattern.sub(' ', text).strip()

    def normalize_text(self, text) -> str:
        """Return *text* lowercased, accent-free and whitespace-collapsed.

        Examples: "Mbappé" -> "mbappe", "São Paulo" -> "sao paulo".

        Args:
            text: Value to normalize. Non-string values are converted with
                ``str()``; any falsy value yields ``""``.

        Returns:
            The normalized string (possibly empty).
        """
        if not text:
            return ""
        # The cache is keyed on the string form, so unhashable inputs work
        # and hash-equal values such as 1 and 1.0 do not share an entry.
        return self._normalize_cached(str(text))

    def _tokenize_uncached(self, normalized: str) -> tuple:
        """Split an already-normalized string into tokens (no caching)."""
        tokens = []
        for token in normalized.split():
            expansion = self._abbreviations.get(token)
            tokens.append(token if expansion is None else expansion)
            # A hyphenated token is kept whole (above) and its parts are
            # added too; empty parts from a leading, trailing or doubled
            # hyphen are dropped.
            if '-' in token:
                tokens.extend(part for part in token.split('-') if part)
        return tuple(tokens)

    def tokenize(self, text) -> tuple:
        """Normalize *text* and split it into a tuple of tokens.

        Known abbreviations are replaced by their expansion. For a
        hyphenated token the full token is followed by its non-empty parts.

        Args:
            text: Value to tokenize; normalized with ``normalize_text``.

        Returns:
            A tuple of token strings, empty when *text* normalizes to "".
        """
        return self._tokenize_cached(self.normalize_text(text))

    def quick_match(self, query, text) -> float:
        """Score how well *query* matches *text* on their normalized forms.

        Returns:
            1.0 when the normalized strings are equal, 0.9 when the query is
            a substring of the text, otherwise 0.7 times the Jaccard index
            of their whitespace-separated token sets. 0.0 when either side
            normalizes to an empty string.
        """
        q_norm = self.normalize_text(query)
        t_norm = self.normalize_text(text)
        if not q_norm or not t_norm:
            return 0.0
        if q_norm == t_norm:
            return 1.0
        if q_norm in t_norm:
            return 0.9

        q_tokens = set(q_norm.split())
        t_tokens = set(t_norm.split())
        if not q_tokens or not t_tokens:
            return 0.0
        jaccard = len(q_tokens & t_tokens) / len(q_tokens | t_tokens)
        return jaccard * 0.7

    def fuzzy_similarity(self, str1, str2, threshold=0.6) -> float:
        """Return a cheap character-overlap similarity in the range 0-1.

        The score is the Jaccard index of the two normalized strings'
        character sets. It ignores character order and repetition, so it is
        only a rough approximation and not an edit-distance or Jaro-Winkler
        measure.

        Args:
            str1: First value to compare.
            str2: Second value to compare.
            threshold: Accepted for interface compatibility; it is not
                applied to the result. Callers compare the returned score
                against their own cut-off.

        Returns:
            1.0 for identical normalized strings; 0.0 when either is empty
            or when their lengths differ by more than half of the longer
            one; otherwise the character-set Jaccard index.
        """
        s1 = self.normalize_text(str1)
        s2 = self.normalize_text(str2)
        if not s1 or not s2:
            return 0.0
        if s1 == s2:
            return 1.0
        # Quick reject: very different lengths are treated as no match.
        if abs(len(s1) - len(s2)) > max(len(s1), len(s2)) * 0.5:
            return 0.0
        chars1 = set(s1)
        chars2 = set(s2)
        return len(chars1 & chars2) / len(chars1 | chars2)
# Module-wide processor instance, created on first request.
_text_processor = None


def get_text_processor():
    """Return the shared AdvancedTextProcessor, creating it on first use."""
    global _text_processor
    if _text_processor is not None:
        return _text_processor
    _text_processor = AdvancedTextProcessor()
    return _text_processor