# add_document.py # DYNAMIC DOCUMENT ADDITION - Incrementally add new players without full rebuild import csv import json import math import os import re import time from collections import defaultdict # ---------- PATHS ---------- SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) PROJECT_ROOT = os.path.dirname(SCRIPT_DIR) INDEX_DIR = os.path.join(PROJECT_ROOT, 'data', 'index') BARREL_DIR = os.path.join(INDEX_DIR, 'barrels') LEXICON_PATH = os.path.join(INDEX_DIR, 'lexicon_complete.json') FORWARD_INDEX_PATH = os.path.join(INDEX_DIR, 'forward_index_termid.json') INVERTED_INDEX_PATH = os.path.join(INDEX_DIR, 'inverted_index_termid.json') TERM_TO_BARREL_MAP_PATH = os.path.join(BARREL_DIR, 'term_to_barrel_map.json') # ---------- TEXT NORMALIZATION (MUST MATCH BUILD PIPELINE) ---------- COMPREHENSIVE_STOP_WORDS = { "the", "and", "in", "for", "with", "on", "at", "from", "by", "as", "is", "was", "are", "were", "be", "been", "have", "has", "had", "to", "of", "a", "an", "that", "this", "these", "those", "it", "its", "or", "but", "not", "what", "which", "who", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "only", "own", "same", "so", "than", "too", "very", "can", "will", "just", "should", "now", "player", "club", "team", "football", "soccer", "match", "game", "season", "league", "cup", "champions", "premier", "la", "bundesliga", "serie", "current", "main", "position", "nationality", "birth", "place", # Universal terms that appear in ALL documents (filtering for memory/performance) "comprehensive", "international", "performance", "transfermarkt", "injury", "summary", "market", "history", "database", "value", # Stemmed versions and other universal terms "data", "teammat", "sourc", "career", "assist", "app", "minut", "available", "national", "significant", "teammate", "transfer", "goal" } def simple_stemmer(word: str) -> str: if word.endswith("ing") and len(word) > 5: return word[:-3] elif word.endswith("ed") and len(word) > 4: return word[:-2] elif word.endswith("es") and len(word) > 4: return word[:-2] elif word.endswith("s") and len(word) > 3: return word[:-1] return word def normalize_and_tokenize(text: str): text = text.lower() tokens = re.findall(r"\b[a-z]+\b", text) result = [] for w in tokens: if w in COMPREHENSIVE_STOP_WORDS or len(w) <= 2: continue result.append(simple_stemmer(w)) return result # ---------- LOAD EXISTING INDEXES ---------- def load_indexes(): """Load all existing indexes into memory.""" print("[load] Loading lexicon...") with open(LEXICON_PATH, 'r', encoding='utf-8') as f: lexicon = json.load(f) token_to_entry = {entry["token"]: entry for entry in lexicon} max_term_id = max(entry["term_id"] for entry in lexicon) print(f"[done] Loaded {len(lexicon):,} tokens (max_term_id={max_term_id})") print("[load] Loading forward index...") with open(FORWARD_INDEX_PATH, 'r', encoding='utf-8') as f: forward_index = json.load(f) doc_by_id = {doc["player_id"]: doc for doc in forward_index} print(f"[done] Loaded {len(forward_index):,} documents") print("[load] Loading term-to-barrel mapping...") with open(TERM_TO_BARREL_MAP_PATH, 'r', encoding='utf-8') as f: term_to_barrel = json.load(f) print(f"[done] Loaded {len(term_to_barrel):,} mappings") return { 'lexicon': lexicon, 'token_to_entry': token_to_entry, 'max_term_id': max_term_id, 'forward_index': forward_index, 'doc_by_id': doc_by_id, 'term_to_barrel': term_to_barrel } # ---------- ADD NEW DOCUMENT ---------- def add_document(player_data: dict, indexes: dict): """ Add a new player document to the search engine. player_data format: { "player_id": 999999, "player_name": "New Player", "detailed_content": "Long text with player bio, stats, etc...", # ... other metadata fields } Returns: dict with statistics about the update """ start_time = time.perf_counter() player_id = player_data.get("player_id") player_name = player_data.get("player_name", "") detailed_content = player_data.get("detailed_content", "") if not player_id or not player_name: return {"error": "Missing required fields: player_id, player_name"} if player_id in indexes['doc_by_id']: return {"error": f"Player ID {player_id} already exists"} print(f"\n[add] Adding player: {player_name} (ID={player_id})") # 1. Tokenize content print("[step 1/5] Tokenizing content...") all_text = f"{player_name} {detailed_content}" tokens = normalize_and_tokenize(all_text) if not tokens: return {"error": "No valid tokens found in document"} # Count term frequencies term_freq = defaultdict(int) for token in tokens: term_freq[token] += 1 total_terms = len(tokens) unique_terms = len(term_freq) print(f" Found {total_terms} tokens, {unique_terms} unique") # 2. Update lexicon (assign term_ids to new tokens) print("[step 2/5] Updating lexicon...") new_tokens = [] next_term_id = indexes['max_term_id'] + 1 for token, tf in term_freq.items(): if token not in indexes['token_to_entry']: # New token - add to lexicon new_entry = { "token": token, "df": 1, # This document is the first "term_id": next_term_id } indexes['lexicon'].append(new_entry) indexes['token_to_entry'][token] = new_entry new_tokens.append(token) next_term_id += 1 else: # Existing token - increment document frequency indexes['token_to_entry'][token]["df"] += 1 indexes['max_term_id'] = next_term_id - 1 print(f" Added {len(new_tokens)} new tokens to lexicon") # 3. Update forward index print("[step 3/5] Updating forward index...") term_ids_in_doc = {} for token, tf in term_freq.items(): entry = indexes['token_to_entry'][token] term_id = entry["term_id"] term_ids_in_doc[term_id] = { "token": token, "tf": tf } forward_entry = { "player_id": player_id, "player_name": player_name, "total_terms": total_terms, "unique_terms": unique_terms, "terms": term_ids_in_doc } indexes['forward_index'].append(forward_entry) indexes['doc_by_id'][player_id] = forward_entry print(f" Added document to forward index") # 4. Update barrels (inverted index distributed) print("[step 4/5] Updating barrels...") barrels_updated = set() for token, tf in term_freq.items(): entry = indexes['token_to_entry'][token] term_id = entry["term_id"] term_id_str = str(term_id) # Determine which barrel this term belongs to barrel_name = indexes['term_to_barrel'].get(term_id_str) if not barrel_name: # New term - assign to a barrel (use simple mod distribution) num_barrels = max(int(bn.split('_')[1]) for bn in set(indexes['term_to_barrel'].values())) + 1 barrel_idx = term_id % num_barrels barrel_name = f"barrel_{barrel_idx:03d}" indexes['term_to_barrel'][term_id_str] = barrel_name # Load barrel, update, save back barrel_path = os.path.join(BARREL_DIR, f"{barrel_name}.json") if os.path.exists(barrel_path): with open(barrel_path, 'r', encoding='utf-8') as f: barrel_data = json.load(f) else: barrel_data = { 'metadata': { 'term_count': 0, 'posting_count': 0, 'barrel_name': barrel_name }, 'inverted_index': {} } # Update postings for this term if term_id_str not in barrel_data['inverted_index']: barrel_data['inverted_index'][term_id_str] = { 'token': token, 'df': entry['df'], 'postings': {} } # Add this document to postings barrel_data['inverted_index'][term_id_str]['postings'][str(player_id)] = { "tf": tf } # Update df in barrel barrel_data['inverted_index'][term_id_str]['df'] = entry['df'] # Update metadata barrel_data['metadata']['term_count'] = len(barrel_data['inverted_index']) barrel_data['metadata']['posting_count'] = sum( len(term_data['postings']) for term_data in barrel_data['inverted_index'].values() ) # Save barrel with open(barrel_path, 'w', encoding='utf-8') as f: json.dump(barrel_data, f, ensure_ascii=False) barrels_updated.add(barrel_name) print(f" Updated {len(barrels_updated)} barrels: {sorted(barrels_updated)}") # 5. Save updated indexes print("[step 5/5] Saving updated indexes...") # Save lexicon with open(LEXICON_PATH, 'w', encoding='utf-8') as f: json.dump(indexes['lexicon'], f, ensure_ascii=False) # Save forward index with open(FORWARD_INDEX_PATH, 'w', encoding='utf-8') as f: json.dump(indexes['forward_index'], f, ensure_ascii=False) # Save term-to-barrel mapping with open(TERM_TO_BARREL_MAP_PATH, 'w', encoding='utf-8') as f: json.dump(indexes['term_to_barrel'], f, ensure_ascii=False) print(f" Saved all indexes") elapsed = time.perf_counter() - start_time stats = { "success": True, "player_id": player_id, "player_name": player_name, "total_terms": total_terms, "unique_terms": unique_terms, "new_tokens_added": len(new_tokens), "barrels_updated": len(barrels_updated), "time_seconds": elapsed, "meets_requirement": elapsed < 60 # Must be under 1 minute } print(f"\n[done] Document added in {elapsed:.2f} seconds") if stats["meets_requirement"]: print("[perf]Under 1 minute requirement") else: print("[perf]Exceeded 1 minute requirement") return stats # ---------- CLI ---------- if __name__ == "__main__": print("=" * 60) print("DYNAMIC DOCUMENT ADDITION SYSTEM") print("=" * 60) # Load indexes indexes = load_indexes() print("\n[ready] System ready to add new documents.") print("[info] Enter player data in JSON format or type 'exit' to quit.\n") # Example usage print("Example player data format:") example = { "player_id": 999999, "player_name": "Test Player", "detailed_content": "This is a test player from Manchester United. He plays as a striker and has won multiple trophies." } print(json.dumps(example, indent=2)) print("\n" + "-" * 60 + "\n") while True: print("Enter player data (JSON) or 'exit':") user_input = input("> ").strip() if user_input.lower() == 'exit': break try: player_data = json.loads(user_input) result = add_document(player_data, indexes) print("\n[result]") print(json.dumps(result, indent=2)) except json.JSONDecodeError as e: print(f"[error] Invalid JSON: {e}") except Exception as e: print(f"[error] {e}") print("\n" + "-" * 60 + "\n") print("\n[exit] Exiting document addition system.")