""" Autocomplete System: Trie-based real-time query suggestions Provides 3-5 relevant suggestions as user types """ import json import os from typing import List, Tuple, Optional from collections import defaultdict import logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) class TrieNode: """Node in the Trie structure""" def __init__(self): self.children: dict = {} self.is_end_of_word: bool = False self.frequency: int = 0 self.word: str = "" class AutocompleteTrie: """ Trie-based autocomplete system for fast prefix matching Supports frequency-based ranking of suggestions """ def __init__(self, lexicon_path: str = None, query_log_path: str = None): """ Initialize Autocomplete Trie Args: lexicon_path: Path to lexicon JSON for building trie query_log_path: Optional path to query log for frequency data """ self.root = TrieNode() self.lexicon_path = lexicon_path self.query_log_path = query_log_path self.word_count = 0 # Common football terms to boost self.boosted_terms = { 'striker', 'midfielder', 'defender', 'goalkeeper', 'winger', 'forward', 'attacking', 'defensive', 'fast', 'strong', 'young', 'experienced', 'veteran', 'talented', 'skillful' } if lexicon_path and os.path.exists(lexicon_path): self.build_from_lexicon() logger.info(f"AutocompleteTrie initialized with {self.word_count:,} words") def insert(self, word: str, frequency: int = 1): """ Insert a word into the trie Args: word: Word to insert frequency: Usage frequency (higher = more common) """ if not word: return word = word.lower().strip() node = self.root for char in word: if char not in node.children: node.children[char] = TrieNode() node = node.children[char] node.is_end_of_word = True node.word = word node.frequency = max(node.frequency, frequency) # Keep highest frequency # Boost common football terms if word in self.boosted_terms: node.frequency *= 3 def build_from_lexicon(self): """Build trie from lexicon file""" try: with open(self.lexicon_path, 'r', encoding='utf-8') as f: lexicon = json.load(f) # If large (like in Render's 512MB RAM tier), heavily truncate to the top 15k most common words if len(lexicon) > 15000: lexicon = sorted(lexicon, key=lambda x: x.get('df', 1), reverse=True)[:15000] logger.info(f"Building trie from {len(lexicon):,} lexicon entries (truncated for memory)...") for entry in lexicon: token = entry.get('token', '') df = entry.get('df', 1) # Document frequency as proxy for importance if token and len(token) >= 2: # Skip single chars self.insert(token, frequency=df) self.word_count += 1 logger.info(f"Trie built with {self.word_count:,} words") except Exception as e: logger.error(f"Error building trie from lexicon: {e}") def add_common_queries(self, queries: List[Tuple[str, int]]): """ Add common query phrases Args: queries: List of (query, frequency) tuples """ for query, freq in queries: # Split multi-word queries words = query.lower().split() for word in words: if len(word) >= 2: self.insert(word, frequency=freq) def _collect_suggestions(self, node: TrieNode, suggestions: List[Tuple[str, int]], limit: int = 5): """ Recursively collect suggestions from a node Args: node: Current trie node suggestions: List to accumulate suggestions limit: Maximum number of suggestions """ if len(suggestions) >= limit * 3: # Collect extra to sort later return if node.is_end_of_word: suggestions.append((node.word, node.frequency)) # Traverse children for char in sorted(node.children.keys()): self._collect_suggestions(node.children[char], suggestions, limit) def get_suggestions(self, prefix: str, limit: int = 5) -> List[dict]: """ Get autocomplete suggestions for a prefix Args: prefix: Partial word typed by user limit: Maximum number of suggestions to return Returns: List of suggestion dictionaries with word and score """ if not prefix: return [] prefix = prefix.lower().strip() node = self.root # Navigate to prefix node for char in prefix: if char not in node.children: return [] # Prefix not found node = node.children[char] # Collect all words with this prefix suggestions = [] self._collect_suggestions(node, suggestions, limit) # Sort by frequency (descending) and return top N suggestions.sort(key=lambda x: x[1], reverse=True) results = [] for word, freq in suggestions[:limit]: results.append({ 'word': word, 'score': freq, 'highlight': prefix # Part to highlight in UI }) return results def search_exact(self, word: str) -> bool: """Check if exact word exists in trie""" word = word.lower().strip() node = self.root for char in word: if char not in node.children: return False node = node.children[char] return node.is_end_of_word def get_statistics(self) -> dict: """Get trie statistics""" return { 'total_words': self.word_count, 'boosted_terms': len(self.boosted_terms) } class SmartAutocomplete: """ Enhanced autocomplete with context awareness Handles multi-word queries and football-specific context """ def __init__(self, trie: AutocompleteTrie): self.trie = trie # Common football query patterns self.query_templates = [ "best {position}", "top {position}", "young {position}", "fast {position}", "{nationality} {position}", "{position} in {league}" ] # Position keywords self.positions = { 'striker', 'forward', 'midfielder', 'defender', 'goalkeeper', 'winger', 'fullback', 'center-back' } # Context-aware suggestions self.context_keywords = { 'best': ['striker', 'midfielder', 'defender', 'goalkeeper'], 'top': ['scorer', 'rated', 'young', 'players'], 'fast': ['striker', 'winger', 'players'], 'young': ['talent', 'prospect', 'players'] } def get_smart_suggestions(self, query: str, limit: int = 5) -> List[dict]: """ Get context-aware suggestions for partial query Args: query: Partial query (may be multi-word) limit: Number of suggestions Returns: List of suggestion dictionaries """ query = query.strip().lower() if not query: return self._get_popular_queries(limit) # Multi-word query handling words = query.split() if len(words) > 1: # Complete last word, show full query last_word = words[-1] prefix = ' '.join(words[:-1]) suggestions = self.trie.get_suggestions(last_word, limit) # Format as complete queries results = [] for sugg in suggestions: results.append({ 'word': f"{prefix} {sugg['word']}" if prefix else sugg['word'], 'score': sugg['score'], 'highlight': last_word }) # Add context-aware completions if len(words) >= 2 and words[0] in self.context_keywords: context_words = self.context_keywords[words[0]] for context_word in context_words: if context_word.startswith(last_word): results.append({ 'word': f"{prefix} {context_word}", 'score': 1000, # Boost context suggestions 'highlight': last_word }) # Sort and return results.sort(key=lambda x: x['score'], reverse=True) return results[:limit] else: # Single word - standard autocomplete suggestions = self.trie.get_suggestions(query, limit) # Add template-based suggestions for positions if any(pos.startswith(query) for pos in self.positions): for pos in self.positions: if pos.startswith(query): suggestions.append({ 'word': f"best {pos}", 'score': 800, 'highlight': query }) suggestions.sort(key=lambda x: x['score'], reverse=True) return suggestions[:limit] def _get_popular_queries(self, limit: int = 5) -> List[dict]: """Get popular/suggested queries when input is empty""" popular = [ {'word': 'best striker', 'score': 1000, 'highlight': ''}, {'word': 'top midfielder', 'score': 900, 'highlight': ''}, {'word': 'young talent', 'score': 850, 'highlight': ''}, {'word': 'fast winger', 'score': 800, 'highlight': ''}, {'word': 'goalkeeper', 'score': 750, 'highlight': ''} ] return popular[:limit] # Global instance (initialized in app.py) autocomplete_engine: Optional[SmartAutocomplete] = None def initialize_autocomplete(lexicon_path: str) -> SmartAutocomplete: """ Initialize global autocomplete engine Args: lexicon_path: Path to lexicon file Returns: SmartAutocomplete instance """ global autocomplete_engine trie = AutocompleteTrie(lexicon_path=lexicon_path) autocomplete_engine = SmartAutocomplete(trie) logger.info("Autocomplete engine initialized") return autocomplete_engine