# scoutsearch/Backend/src/text_processor.py
# Hosting-page residue kept for provenance:
#   uploaded by Ali00922 ("Upload 37 files")
#   commit da6a0a4 (verified)
"""
Advanced Text Processor with Ultimate Optimization
Handles normalization, fuzzy matching, and intelligent tokenization
"""
import re
import unicodedata
from functools import lru_cache
class AdvancedTextProcessor:
    """Text normalization, tokenization and lightweight similarity scoring.

    Normalization lowercases the input, strips accents, replaces punctuation
    (except hyphens) with spaces and collapses whitespace. Tokenization
    additionally expands football position abbreviations and splits
    hyphenated tokens into their parts.

    ``normalize_text`` and ``tokenize`` are memoized with
    ``functools.lru_cache``; the cache is keyed on ``(self, text)``, so the
    arguments must be hashable and cached instances stay alive as long as
    their entries do.
    """

    def __init__(self):
        # Pre-compile regex patterns for speed
        self._special_chars_pattern = re.compile(r'[^\w\s-]')
        self._whitespace_pattern = re.compile(r'\s+')
        self._number_pattern = re.compile(r'\d+')
        # Fast lookup tables (pre-computed)
        self._accent_map = self._build_accent_map()
        self._abbreviations = {
            'cf': 'center forward', 'cam': 'attacking midfielder',
            'cdm': 'defensive midfielder', 'lb': 'left back',
            'rb': 'right back', 'cb': 'center back',
            'gk': 'goalkeeper', 'st': 'striker',
            'cm': 'central midfielder', 'lw': 'left wing',
            'rw': 'right wing', 'lm': 'left mid', 'rm': 'right mid'
        }

    def _build_accent_map(self) -> dict:
        """Build the accented-character -> ASCII replacement table.

        Returns:
            A dict mapping single accented characters (lowercase and, where
            the uppercase form is also a single character, uppercase) to
            their ASCII replacement string.
        """
        # (replacement, accented characters that map to it)
        groups = (
            ('e', 'éèêë'),
            ('a', 'áàâäãå'),
            ('i', 'íìîï'),
            ('o', 'óòôöõ'),
            ('u', 'úùûü'),
            ('n', 'ñ'),
            ('c', 'ç'),
            ('ss', 'ß'),
            ('y', 'ýÿ'),
        )
        accents = {ch: repl for repl, chars in groups for ch in chars}
        # Add uppercase versions. 'ß'.upper() is the two-character 'SS',
        # which can never match a single-character lookup, so skip it.
        for lower, repl in list(accents.items()):
            upper = lower.upper()
            if len(upper) == 1:
                accents[upper] = repl.upper()
        return accents

    @lru_cache(maxsize=10000)
    def normalize_text(self, text) -> str:
        """Return a lowercase, accent-free, punctuation-free form of *text*.

        Examples: "Mbappé" -> "mbappe", "São Paulo" -> "sao paulo",
        "Straße" -> "strasse".

        Args:
            text: Value to normalize; converted with ``str()``. Must be
                hashable because results are cached.

        Returns:
            The normalized string, or "" when *text* is falsy.
        """
        if not text:
            return ""
        lowered = str(text).lower()
        # Explicit map first: it covers characters such as 'ß' that Unicode
        # decomposition does not reduce to ASCII.
        accent_map = self._accent_map
        mapped = ''.join(accent_map.get(ch, ch) for ch in lowered)
        # Unicode decomposition as a fallback: drop combining marks left
        # over from any accented character not present in the map.
        decomposed = unicodedata.normalize('NFKD', mapped)
        stripped = ''.join(
            ch for ch in decomposed if not unicodedata.combining(ch)
        )
        # Replace special characters with spaces (hyphens are kept)
        cleaned = self._special_chars_pattern.sub(' ', stripped)
        # Collapse runs of whitespace and trim the ends
        return self._whitespace_pattern.sub(' ', cleaned).strip()

    @lru_cache(maxsize=5000)
    def tokenize(self, text) -> tuple:
        """Split *text* into normalized tokens.

        Known position abbreviations (e.g. "st") are replaced by their
        expansion. A token containing a hyphen is kept as-is and followed by
        its non-empty hyphen-separated parts.

        Args:
            text: Value to tokenize; normalized via ``normalize_text``.

        Returns:
            A tuple of tokens (a tuple so the result is safe to cache).
        """
        abbreviations = self._abbreviations
        result = []
        for token in self.normalize_text(text).split():
            result.append(abbreviations.get(token, token))
            if '-' in token:
                # Skip empty fragments produced by leading, trailing or
                # repeated hyphens ("a-", "-", "a--b").
                result.extend(part for part in token.split('-') if part)
        return tuple(result)

    def quick_match(self, query, text) -> float:
        """Score how well *query* matches *text* on a 0-1 scale.

        Both inputs are normalized first. Scoring:
            * 1.0 - normalized strings are equal
            * 0.9 - normalized query is a substring of the normalized text
            * otherwise 0.7 * Jaccard overlap of the whitespace tokens
            * 0.0 - either input normalizes to an empty string

        Returns:
            The match score as a float.
        """
        q_norm = self.normalize_text(query)
        t_norm = self.normalize_text(text)
        if not q_norm or not t_norm:
            return 0.0
        # Exact match
        if q_norm == t_norm:
            return 1.0
        # Contains match
        if q_norm in t_norm:
            return 0.9
        # Token overlap. Both strings are non-empty and stripped here, so
        # each yields at least one token and the union is never empty.
        q_tokens = set(q_norm.split())
        t_tokens = set(t_norm.split())
        return len(q_tokens & t_tokens) / len(q_tokens | t_tokens) * 0.7

    def fuzzy_similarity(self, str1, str2, threshold=0.6) -> float:
        """Cheap similarity approximation between two strings.

        Both inputs are normalized first. The score is the Jaccard overlap
        of the two strings' character sets; it ignores character order and
        repetition, so it is only a rough proxy for edit-distance style
        measures.

        Args:
            str1: First string.
            str2: Second string.
            threshold: Accepted for backward compatibility; not used by the
                current implementation.

        Returns:
            1.0 for equal normalized strings; 0.0 when either is empty or
            their lengths differ by more than half of the longer one;
            otherwise the character-set overlap in the range 0-1.
        """
        s1 = self.normalize_text(str1)
        s2 = self.normalize_text(str2)
        if not s1 or not s2:
            return 0.0
        # Quick reject if length difference is too large
        if abs(len(s1) - len(s2)) > max(len(s1), len(s2)) * 0.5:
            return 0.0
        if s1 == s2:
            return 1.0
        # Character overlap score (fast approximation)
        chars1 = set(s1)
        chars2 = set(s2)
        return len(chars1 & chars2) / len(chars1 | chars2)
# Lazily created, module-wide shared processor (None until first requested)
_text_processor = None


def get_text_processor():
    """Return the shared AdvancedTextProcessor, creating it on first call."""
    global _text_processor
    if _text_processor is not None:
        return _text_processor
    _text_processor = AdvancedTextProcessor()
    return _text_processor