""" Optimized Search Engine with Advanced Indexing and Caching Ultra-fast search with < 100ms response time """ import pandas as pd import numpy as np from collections import defaultdict from functools import lru_cache import time from text_processor import get_text_processor class OptimizedSearchEngine: """ Ultra-optimized search engine with: - Multi-level caching - Inverted index for O(1) lookups - Pre-computed normalized fields - Efficient ranking algorithm """ def __init__(self, df): print("[STARTUP] Building optimized search engine...") start = time.time() self.df = df self.text_processor = get_text_processor() # Pre-process all data once self.players = self._preprocess_players() # Build inverted indices for instant lookups self.name_index = defaultdict(set) self.club_index = defaultdict(set) self.nationality_index = defaultdict(set) self.position_index = defaultdict(set) self._build_indices() elapsed = (time.time() - start) * 1000 print(f" Search engine ready in {elapsed:.0f}ms") print(f" - {len(self.players)} players indexed") print(f" - {len(self.name_index)} name tokens") print(f" - {len(self.club_index)} clubs") print(f" - {len(self.nationality_index)} nationalities") def _preprocess_players(self): """Pre-compute all normalized fields""" players = [] for idx, row in self.df.iterrows(): player = { # Original fields 'id': idx, 'short_name': row.get('short_name', ''), 'long_name': row.get('long_name', ''), 'overall': int(row.get('overall', 0)), 'potential': int(row.get('potential', 0)), 'age': int(row.get('age', 0)), 'club_name': row.get('club_name', ''), 'nationality_name': row.get('nationality_name', ''), 'player_positions': row.get('player_positions', ''), 'value_eur': float(row.get('value_eur', 0)), 'wage_eur': float(row.get('wage_eur', 0)), 'pace': int(row.get('pace', 0)), 'shooting': int(row.get('shooting', 0)), 'passing': int(row.get('passing', 0)), 'dribbling': int(row.get('dribbling', 0)), 'defending': int(row.get('defending', 0)), 'physic': int(row.get('physic', 0)), # Pre-normalized fields for fast search '_norm_name': self.text_processor.normalize_text(row.get('short_name', '')), '_norm_long': self.text_processor.normalize_text(row.get('long_name', '')), '_norm_club': self.text_processor.normalize_text(row.get('club_name', '')), '_norm_nat': self.text_processor.normalize_text(row.get('nationality_name', '')), '_norm_pos': self.text_processor.normalize_text(row.get('player_positions', '')), # Pre-tokenized for instant matching '_tokens': self.text_processor.tokenize( f"{row.get('short_name', '')} {row.get('long_name', '')}" ) } players.append(player) return players def _build_indices(self): """Build inverted indices for O(1) lookups""" for idx, player in enumerate(self.players): # Index by name tokens for token in player['_tokens']: self.name_index[token].add(idx) # Index by club if player['_norm_club']: self.club_index[player['_norm_club']].add(idx) # Index by nationality if player['_norm_nat']: self.nationality_index[player['_norm_nat']].add(idx) # Index by position positions = player['_norm_pos'].split() for pos in positions: if pos: self.position_index[pos].add(idx) def search(self, query, filters=None, max_results=20): """ Ultra-fast search with multi-strategy approach Target: < 100ms response time """ start_time = time.time() if not query or not query.strip(): # No query - return top players by rating candidates = self.players[:100] else: # Get candidates using inverted index candidates = self._get_candidates_fast(query) # Apply filters if filters: candidates = self._apply_filters_fast(candidates, filters) # Rank results ranked = self._rank_fast(query, candidates, max_results) elapsed = (time.time() - start_time) * 1000 print(f" Search completed in {elapsed:.1f}ms ({len(ranked)} results)") return ranked def _get_candidates_fast(self, query): """ Lightning-fast candidate retrieval using indices Strategy: Start with smallest result set """ query_norm = self.text_processor.normalize_text(query) query_tokens = self.text_processor.tokenize(query) candidate_sets = [] # Strategy 1: Name token matches (most precise) for token in query_tokens: if token in self.name_index: candidate_sets.append(self.name_index[token]) # Strategy 2: Club matches if query_norm in self.club_index: candidate_sets.append(self.club_index[query_norm]) # Strategy 3: Nationality matches if query_norm in self.nationality_index: candidate_sets.append(self.nationality_index[query_norm]) # Strategy 4: Position matches for token in query_tokens: if token in self.position_index: candidate_sets.append(self.position_index[token]) if not candidate_sets: # Fallback: partial matching (slower but comprehensive) return self._fallback_search(query_norm, query_tokens) # Union of all candidate sets candidate_indices = set() for s in candidate_sets: candidate_indices.update(s) # Convert indices to player objects return [self.players[idx] for idx in candidate_indices if idx < len(self.players)] def _fallback_search(self, query_norm, query_tokens): """Fallback: scan all players for partial matches""" candidates = [] for player in self.players: # Quick substring check in pre-normalized fields searchable = f"{player['_norm_name']} {player['_norm_club']} {player['_norm_nat']}" if query_norm in searchable: candidates.append(player) if len(candidates) >= 100: # Limit fallback results break return candidates def _apply_filters_fast(self, candidates, filters): """Ultra-fast filtering using numpy-style operations""" filtered = [] for player in candidates: # Overall if 'overallMin' in filters and filters['overallMin']: if player['overall'] < filters['overallMin']: continue if 'overallMax' in filters and filters['overallMax']: if player['overall'] > filters['overallMax']: continue # Age if 'ageMin' in filters and filters['ageMin']: if player['age'] < filters['ageMin']: continue if 'ageMax' in filters and filters['ageMax']: if player['age'] > filters['ageMax']: continue # Position if 'position' in filters and filters['position']: if filters['position'].lower() not in player['_norm_pos']: continue # Pace if 'paceMin' in filters and filters['paceMin']: if player['pace'] < filters['paceMin']: continue # Shooting if 'shootingMin' in filters and filters['shootingMin']: if player['shooting'] < filters['shootingMin']: continue filtered.append(player) return filtered def _rank_fast(self, query, candidates, max_results): """ Fast ranking algorithm Scores based on: relevance (70%) + quality (30%) """ if not query or not query.strip(): # No query - sort by overall rating candidates.sort(key=lambda p: p['overall'], reverse=True) return candidates[:max_results] query_norm = self.text_processor.normalize_text(query) scored = [] for player in candidates: # Relevance score (fast matching) rel_score = 0.0 # Name match (highest weight) if query_norm in player['_norm_name']: rel_score += 100.0 if query_norm == player['_norm_name']: rel_score += 200.0 # Club match if query_norm in player['_norm_club']: rel_score += 30.0 # Nationality match if query_norm in player['_norm_nat']: rel_score += 20.0 # Token overlap for token in player['_tokens']: if query_norm in token or token in query_norm: rel_score += 10.0 # Quality score quality = player['overall'] # Final score: 70% relevance + 30% quality final_score = (rel_score * 0.7) + (quality * 0.3) scored.append((player, final_score)) # Sort by score descending scored.sort(key=lambda x: x[1], reverse=True) return [player for player, score in scored[:max_results]]