File size: 4,983 Bytes
da6a0a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
"""
Advanced Text Processor with Ultimate Optimization
Handles normalization, fuzzy matching, and intelligent tokenization
"""

import re
import unicodedata
from functools import lru_cache

class AdvancedTextProcessor:
    """Normalize, tokenize and score short text strings.

    Normalization lower-cases the input, strips accents, replaces every
    character that is not a word character, whitespace or hyphen with a
    space, and collapses runs of whitespace. ``normalize_text`` and
    ``tokenize`` memoize their results with ``functools.lru_cache``.

    NOTE: ``lru_cache`` on a method keys on ``self`` and keeps the instance
    alive for as long as the cache holds an entry for it. The decorators are
    kept so ``cache_info()`` / ``cache_clear()`` remain available.
    """

    def __init__(self):
        # Pre-compile regex patterns once per instance.
        self._special_chars_pattern = re.compile(r'[^\w\s-]')
        self._whitespace_pattern = re.compile(r'\s+')
        self._number_pattern = re.compile(r'\d+')

        # Lookup tables built once up front.
        self._accent_map = self._build_accent_map()
        self._abbreviations = {
            'cf': 'center forward', 'cam': 'attacking midfielder',
            'cdm': 'defensive midfielder', 'lb': 'left back',
            'rb': 'right back', 'cb': 'center back',
            'gk': 'goalkeeper', 'st': 'striker',
            'cm': 'central midfielder', 'lw': 'left wing',
            'rw': 'right wing', 'lm': 'left mid', 'rm': 'right mid'
        }

    def _build_accent_map(self):
        """Return a dict mapping accented characters to ASCII replacements.

        The accented characters are written as ``\\u`` escapes so the table
        survives tools that drop non-ASCII bytes from the source file (which
        would silently turn every key into the empty string).
        """
        groups = {
            'e': '\u00e9\u00e8\u00ea\u00eb',              # e: acute, grave, circumflex, diaeresis
            'a': '\u00e1\u00e0\u00e2\u00e4\u00e3\u00e5',  # a: acute, grave, circumflex, diaeresis, tilde, ring
            'i': '\u00ed\u00ec\u00ee\u00ef',              # i: acute, grave, circumflex, diaeresis
            'o': '\u00f3\u00f2\u00f4\u00f6\u00f5',        # o: acute, grave, circumflex, diaeresis, tilde
            'u': '\u00fa\u00f9\u00fb\u00fc',              # u: acute, grave, circumflex, diaeresis
            'n': '\u00f1',                                # n with tilde
            'c': '\u00e7',                                # c with cedilla
            'ss': '\u00df',                               # sharp s (has no NFKD decomposition)
            'y': '\u00ff\u00fd',                          # y: diaeresis, acute
        }
        accents = {}
        for replacement, accented_chars in groups.items():
            for char in accented_chars:
                accents[char] = replacement

        # Add uppercase versions. normalize_text lower-cases before the
        # lookup, so these only matter to code that reads the map directly.
        for k, v in list(accents.items()):
            accents[k.upper()] = v.upper()
        return accents

    # typed=True keeps 1, 1.0 and True in separate cache slots; they compare
    # equal as keys but str() renders them differently.
    @lru_cache(maxsize=10000, typed=True)
    def normalize_text(self, text):
        """Return a lower-case, accent-free, whitespace-collapsed string.

        Examples: ``"Mbapp\\u00e9"`` -> ``"mbappe"``,
        ``"S\\u00e3o Paulo"`` -> ``"sao paulo"``.

        Args:
            text: Value to normalize. Non-strings are converted with
                ``str()``. Must be hashable because results are cached.

        Returns:
            The normalized string, or ``""`` for any falsy input.
        """
        if not text:
            return ""

        text = str(text).lower()

        # Explicit replacements first: covers characters such as the sharp s
        # that Unicode decomposition below cannot reduce to ASCII.
        accent_map = self._accent_map
        text = ''.join(accent_map.get(c, c) for c in text)

        # Unicode decomposition as a fallback: split base character and
        # combining marks, then drop the marks.
        text = unicodedata.normalize('NFKD', text)
        text = ''.join(c for c in text if not unicodedata.combining(c))

        # Replace special characters with spaces (hyphens are kept).
        text = self._special_chars_pattern.sub(' ', text)

        # Collapse whitespace runs and trim the ends.
        return self._whitespace_pattern.sub(' ', text).strip()

    @lru_cache(maxsize=5000, typed=True)
    def tokenize(self, text):
        """Split normalized text into a tuple of tokens.

        A token found in the abbreviation table is replaced by its expansion
        (kept as one multi-word token). Any other token is emitted as-is and,
        if it contains a hyphen, is followed by its non-empty hyphen-separated
        parts.

        Args:
            text: Value to tokenize; normalized with ``normalize_text`` first.

        Returns:
            A tuple of token strings (a tuple so the result is safe to cache).
        """
        abbreviations = self._abbreviations
        result = []
        for token in self.normalize_text(text).split():
            expansion = abbreviations.get(token)
            if expansion is not None:
                result.append(expansion)
                continue
            result.append(token)
            if '-' in token:
                # Skip the empty pieces produced by leading/trailing/lone
                # hyphens ("jean-" -> "jean", not "jean" and "").
                result.extend(part for part in token.split('-') if part)

        return tuple(result)

    def quick_match(self, query, text):
        """Score how well ``query`` matches ``text`` on a 0-1 scale.

        Both inputs are normalized first. Scores:
            * 1.0 - normalized strings are equal
            * 0.9 - normalized query is a substring of normalized text
            * otherwise 0.7 * (shared tokens / distinct tokens in either)
            * 0.0 - either input normalizes to an empty string
        """
        q_norm = self.normalize_text(query)
        t_norm = self.normalize_text(text)

        if not q_norm or not t_norm:
            return 0.0

        if q_norm == t_norm:
            return 1.0

        if q_norm in t_norm:
            return 0.9

        # Both strings are non-empty and stripped here, so each token set has
        # at least one element and the union below is never empty.
        q_tokens = set(q_norm.split())
        t_tokens = set(t_norm.split())
        overlap = len(q_tokens & t_tokens) / len(q_tokens | t_tokens)
        return overlap * 0.7

    def fuzzy_similarity(self, str1, str2, threshold=0.6):
        """Return a cheap character-overlap similarity in the range 0-1.

        This is NOT an edit-distance or Jaro-Winkler score: after
        normalization it is the Jaccard overlap of the two strings' character
        sets, so character order and repetition are ignored.

        Args:
            str1: First value to compare.
            str2: Second value to compare.
            threshold: Accepted for backward compatibility; the score is not
                filtered by it, callers compare the result themselves.

        Returns:
            0.0 if either input normalizes to an empty string or the lengths
            differ by more than half of the longer one; 1.0 if the normalized
            strings are equal; otherwise the character-set overlap ratio.
        """
        s1 = self.normalize_text(str1)
        s2 = self.normalize_text(str2)

        if not s1 or not s2:
            return 0.0

        # Quick reject if the length difference is too large.
        len_diff = abs(len(s1) - len(s2))
        if len_diff > max(len(s1), len(s2)) * 0.5:
            return 0.0

        if s1 == s2:
            return 1.0

        chars1 = set(s1)
        chars2 = set(s2)
        return len(chars1 & chars2) / len(chars1 | chars2)

# Module-wide processor instance, created lazily on first request
_text_processor = None

def get_text_processor():
    """Return the shared AdvancedTextProcessor, creating it on first call."""
    global _text_processor
    if _text_processor is not None:
        return _text_processor
    _text_processor = AdvancedTextProcessor()
    return _text_processor