Spaces:
Sleeping
Sleeping
| # add_document.py | |
| # DYNAMIC DOCUMENT ADDITION - Incrementally add new players without full rebuild | |
| import csv | |
| import json | |
| import math | |
| import os | |
| import re | |
| import time | |
| from collections import defaultdict | |
| # ---------- PATHS ---------- | |
# All index locations are derived from this script's own location:
# <project root>/<script dir>/add_document.py  ->  <project root>/data/index/...
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
PROJECT_ROOT = os.path.split(SCRIPT_DIR)[0]

# Directory holding the index files, and the sub-directory holding the barrels.
INDEX_DIR = os.path.join(PROJECT_ROOT, 'data', 'index')
BARREL_DIR = os.path.join(INDEX_DIR, 'barrels')

# JSON files that live directly inside INDEX_DIR.
LEXICON_PATH, FORWARD_INDEX_PATH, INVERTED_INDEX_PATH = (
    os.path.join(INDEX_DIR, filename)
    for filename in (
        'lexicon_complete.json',
        'forward_index_termid.json',
        'inverted_index_termid.json',
    )
)

# The term_id -> barrel name mapping is stored next to the barrel files.
TERM_TO_BARREL_MAP_PATH = os.path.join(BARREL_DIR, 'term_to_barrel_map.json')
| # ---------- TEXT NORMALIZATION (MUST MATCH BUILD PIPELINE) ---------- | |
# Words discarded during tokenization. The list has to stay in sync with the
# build pipeline so incremental updates produce the same vocabulary.
# NOTE(review): normalize_and_tokenize tests membership on the raw (unstemmed)
# word, so the stemmed spellings below ("teammat", "sourc", "minut") only match
# text that literally contains them - confirm this mirrors the build pipeline.
COMPREHENSIVE_STOP_WORDS = set(
    (
        # General English function words and generic football vocabulary
        "the and in for with on at from by as is was "
        "are were be been have has had to of a an that "
        "this these those it its or but not what which who "
        "when where why how all any both each few more most "
        "other some such no nor only own same so than too "
        "very can will just should now player club team football "
        "soccer match game season league cup champions premier la "
        "bundesliga serie current main position nationality birth place "
        # Universal terms that appear in ALL documents (filtering for memory/performance)
        "comprehensive international performance transfermarkt injury "
        "summary market history database value "
        # Stemmed versions and other universal terms
        "data teammat sourc career assist app minut "
        "available national significant teammate transfer goal"
    ).split()
)
def simple_stemmer(word: str) -> str:
    """Strip one common English suffix from *word*, if the word is long enough.

    The rules are tried in order and the first one that applies wins:
    "ing" (word longer than 5), "ed" (longer than 4), "es" (longer than 4)
    and "s" (longer than 3). A word matching no rule is returned unchanged.
    """
    # (suffix, length the word must exceed) - order matters, e.g. "es" before "s"
    suffix_rules = (("ing", 5), ("ed", 4), ("es", 4), ("s", 3))
    size = len(word)
    for suffix, floor in suffix_rules:
        if size > floor and word.endswith(suffix):
            return word[:-len(suffix)]
    return word
def normalize_and_tokenize(text: str):
    """Turn raw text into a list of stemmed index tokens.

    The text is lower-cased and split into runs of the letters a-z. Words of
    one or two characters and words listed in COMPREHENSIVE_STOP_WORDS are
    dropped; every remaining word is passed through simple_stemmer. Order and
    duplicates are preserved.
    """
    words = re.findall(r"\b[a-z]+\b", text.lower())
    # The stop-word test is applied to the raw word, before stemming.
    return [
        simple_stemmer(word)
        for word in words
        if len(word) > 2 and word not in COMPREHENSIVE_STOP_WORDS
    ]
| # ---------- LOAD EXISTING INDEXES ---------- | |
def load_indexes():
    """Read the on-disk indexes and return them bundled in one dict.

    Loads the lexicon (LEXICON_PATH), the forward index (FORWARD_INDEX_PATH)
    and the term-to-barrel map (TERM_TO_BARREL_MAP_PATH), printing progress
    along the way.

    Returns a dict with the keys:
        'lexicon'        - the list of lexicon entries as loaded
        'token_to_entry' - lexicon entries keyed by their "token"
        'max_term_id'    - largest "term_id" found in the lexicon
        'forward_index'  - the list of forward-index documents as loaded
        'doc_by_id'      - forward-index documents keyed by "player_id"
        'term_to_barrel' - the term-to-barrel mapping as loaded
    """
    def read_json(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    print("[load] Loading lexicon...")
    lexicon = read_json(LEXICON_PATH)
    token_to_entry = dict((record["token"], record) for record in lexicon)
    max_term_id = max([record["term_id"] for record in lexicon])
    print(f"[done] Loaded {len(lexicon):,} tokens (max_term_id={max_term_id})")

    print("[load] Loading forward index...")
    forward_index = read_json(FORWARD_INDEX_PATH)
    doc_by_id = dict((document["player_id"], document) for document in forward_index)
    print(f"[done] Loaded {len(forward_index):,} documents")

    print("[load] Loading term-to-barrel mapping...")
    term_to_barrel = read_json(TERM_TO_BARREL_MAP_PATH)
    print(f"[done] Loaded {len(term_to_barrel):,} mappings")

    return dict(
        lexicon=lexicon,
        token_to_entry=token_to_entry,
        max_term_id=max_term_id,
        forward_index=forward_index,
        doc_by_id=doc_by_id,
        term_to_barrel=term_to_barrel,
    )
| # ---------- ADD NEW DOCUMENT ---------- | |
def _update_lexicon(term_freq: dict, indexes: dict) -> list:
    """Give unseen tokens a fresh term_id, bump df of known ones; return the new tokens."""
    new_tokens = []
    next_term_id = indexes['max_term_id'] + 1
    for token in term_freq:
        known_entry = indexes['token_to_entry'].get(token)
        if known_entry is None:
            # New token - this document is its first (and so far only) occurrence
            new_entry = {"token": token, "df": 1, "term_id": next_term_id}
            indexes['lexicon'].append(new_entry)
            indexes['token_to_entry'][token] = new_entry
            new_tokens.append(token)
            next_term_id += 1
        else:
            # Existing token - one more document contains it
            known_entry["df"] += 1
    indexes['max_term_id'] = next_term_id - 1
    return new_tokens


def _group_terms_by_barrel(term_freq: dict, indexes: dict) -> dict:
    """Return {barrel_name: [(term_id_str, token, tf, lexicon_entry), ...]} for the document.

    Terms without a barrel yet are assigned one by ``term_id % num_barrels``
    and recorded in ``indexes['term_to_barrel']``.
    """
    term_to_barrel = indexes['term_to_barrel']
    num_barrels = None  # computed lazily, only if the document has an unmapped term
    terms_by_barrel = defaultdict(list)
    for token, tf in term_freq.items():
        entry = indexes['token_to_entry'][token]
        term_id = entry["term_id"]
        term_id_str = str(term_id)
        barrel_name = term_to_barrel.get(term_id_str)
        if not barrel_name:
            if num_barrels is None:
                # Highest existing barrel index + 1. New assignments always land
                # below this value, so it is safe to compute it only once.
                highest = max(
                    (int(name.split('_')[1]) for name in set(term_to_barrel.values())),
                    default=-1,
                )
                # An empty mapping means no barrels yet: start with a single one.
                num_barrels = max(highest + 1, 1)
            barrel_name = f"barrel_{term_id % num_barrels:03d}"
            term_to_barrel[term_id_str] = barrel_name
        terms_by_barrel[barrel_name].append((term_id_str, token, tf, entry))
    return terms_by_barrel


def _write_barrel_postings(terms_by_barrel: dict, player_id) -> set:
    """Add the document's postings to each affected barrel file; return the barrel names."""
    barrels_updated = set()
    for barrel_name, terms in terms_by_barrel.items():
        # Each barrel is read and written exactly once, however many terms it holds.
        barrel_path = os.path.join(BARREL_DIR, f"{barrel_name}.json")
        if os.path.exists(barrel_path):
            with open(barrel_path, 'r', encoding='utf-8') as f:
                barrel_data = json.load(f)
        else:
            barrel_data = {
                'metadata': {
                    'term_count': 0,
                    'posting_count': 0,
                    'barrel_name': barrel_name
                },
                'inverted_index': {}
            }
        inverted = barrel_data['inverted_index']
        for term_id_str, token, tf, entry in terms:
            if term_id_str not in inverted:
                inverted[term_id_str] = {
                    'token': token,
                    'df': entry['df'],
                    'postings': {}
                }
            inverted[term_id_str]['postings'][str(player_id)] = {"tf": tf}
            # Keep the barrel's df in step with the lexicon
            inverted[term_id_str]['df'] = entry['df']
        barrel_data['metadata']['term_count'] = len(inverted)
        barrel_data['metadata']['posting_count'] = sum(
            len(term_data['postings']) for term_data in inverted.values()
        )
        with open(barrel_path, 'w', encoding='utf-8') as f:
            json.dump(barrel_data, f, ensure_ascii=False)
        barrels_updated.add(barrel_name)
    return barrels_updated


def _save_indexes(indexes: dict) -> None:
    """Write the lexicon, forward index and term-to-barrel map back to disk."""
    for path, payload in (
        (LEXICON_PATH, indexes['lexicon']),
        (FORWARD_INDEX_PATH, indexes['forward_index']),
        (TERM_TO_BARREL_MAP_PATH, indexes['term_to_barrel']),
    ):
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False)


def add_document(player_data: dict, indexes: dict):
    """
    Add a new player document to the search engine.

    player_data format:
    {
        "player_id": 999999,
        "player_name": "New Player",
        "detailed_content": "Long text with player bio, stats, etc...",
        # ... other metadata fields
    }

    indexes is the dict produced by load_indexes(); it is updated in place
    (lexicon, token_to_entry, max_term_id, forward_index, doc_by_id,
    term_to_barrel) and the lexicon, forward index, term-to-barrel map and the
    affected barrel files are rewritten on disk.

    Returns: dict with statistics about the update, or {"error": "..."} when
    the input is not a dict, required fields are missing, the player_id is
    already indexed, or the text yields no tokens.

    NOTE(review): the in-memory indexes are modified before the files are
    written, so an I/O failure part-way leaves memory and disk out of sync.
    NOTE(review): the file at INVERTED_INDEX_PATH is not touched here -
    confirm nothing reads it after incremental updates.
    """
    start_time = time.perf_counter()
    if not isinstance(player_data, dict):
        # e.g. the caller parsed a JSON array/string instead of an object
        return {"error": "player_data must be a JSON object (dict)"}
    player_id = player_data.get("player_id")
    player_name = player_data.get("player_name", "")
    # `or ""` also covers an explicit null, which would otherwise be indexed
    # as the literal word "none".
    detailed_content = player_data.get("detailed_content") or ""
    if not player_id or not player_name:
        return {"error": "Missing required fields: player_id, player_name"}
    if player_id in indexes['doc_by_id']:
        return {"error": f"Player ID {player_id} already exists"}
    print(f"\n[add] Adding player: {player_name} (ID={player_id})")

    # 1. Tokenize content
    print("[step 1/5] Tokenizing content...")
    all_text = f"{player_name} {detailed_content}"
    tokens = normalize_and_tokenize(all_text)
    if not tokens:
        return {"error": "No valid tokens found in document"}
    # Count term frequencies
    term_freq = defaultdict(int)
    for token in tokens:
        term_freq[token] += 1
    total_terms = len(tokens)
    unique_terms = len(term_freq)
    print(f" Found {total_terms} tokens, {unique_terms} unique")

    # 2. Update lexicon (assign term_ids to new tokens)
    print("[step 2/5] Updating lexicon...")
    new_tokens = _update_lexicon(term_freq, indexes)
    print(f" Added {len(new_tokens)} new tokens to lexicon")

    # 3. Update forward index
    print("[step 3/5] Updating forward index...")
    term_ids_in_doc = {
        indexes['token_to_entry'][token]["term_id"]: {"token": token, "tf": tf}
        for token, tf in term_freq.items()
    }
    forward_entry = {
        "player_id": player_id,
        "player_name": player_name,
        "total_terms": total_terms,
        "unique_terms": unique_terms,
        "terms": term_ids_in_doc
    }
    indexes['forward_index'].append(forward_entry)
    indexes['doc_by_id'][player_id] = forward_entry
    print(f" Added document to forward index")

    # 4. Update barrels (inverted index distributed)
    print("[step 4/5] Updating barrels...")
    terms_by_barrel = _group_terms_by_barrel(term_freq, indexes)
    barrels_updated = _write_barrel_postings(terms_by_barrel, player_id)
    print(f" Updated {len(barrels_updated)} barrels: {sorted(barrels_updated)}")

    # 5. Save updated indexes
    print("[step 5/5] Saving updated indexes...")
    _save_indexes(indexes)
    print(f" Saved all indexes")

    elapsed = time.perf_counter() - start_time
    stats = {
        "success": True,
        "player_id": player_id,
        "player_name": player_name,
        "total_terms": total_terms,
        "unique_terms": unique_terms,
        "new_tokens_added": len(new_tokens),
        "barrels_updated": len(barrels_updated),
        "time_seconds": elapsed,
        "meets_requirement": elapsed < 60  # Must be under 1 minute
    }
    print(f"\n[done] Document added in {elapsed:.2f} seconds")
    if stats["meets_requirement"]:
        print("[perf]Under 1 minute requirement")
    else:
        print("[perf]Exceeded 1 minute requirement")
    return stats
| # ---------- CLI ---------- | |
def main() -> None:
    """Run the interactive prompt that adds player documents to the index.

    Loads the indexes once, prints an example payload, then repeatedly reads
    one line of JSON from stdin and passes it to add_document(). The loop ends
    on 'exit', on end of input (Ctrl-D / exhausted pipe) or on Ctrl-C.
    """
    print("=" * 60)
    print("DYNAMIC DOCUMENT ADDITION SYSTEM")
    print("=" * 60)
    # Load indexes
    indexes = load_indexes()
    print("\n[ready] System ready to add new documents.")
    print("[info] Enter player data in JSON format or type 'exit' to quit.\n")
    # Example usage
    print("Example player data format:")
    example = {
        "player_id": 999999,
        "player_name": "Test Player",
        "detailed_content": "This is a test player from Manchester United. He plays as a striker and has won multiple trophies."
    }
    print(json.dumps(example, indent=2))
    print("\n" + "-" * 60 + "\n")
    while True:
        print("Enter player data (JSON) or 'exit':")
        try:
            user_input = input("> ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin closed or the user interrupted: leave cleanly, no traceback
            print()
            break
        if user_input.lower() == 'exit':
            break
        if not user_input:
            # Blank line: just prompt again instead of reporting a JSON error
            continue
        try:
            player_data = json.loads(user_input)
            result = add_document(player_data, indexes)
            print("\n[result]")
            print(json.dumps(result, indent=2))
        except json.JSONDecodeError as e:
            print(f"[error] Invalid JSON: {e}")
        except Exception as e:
            # Top-level boundary: report the failure and keep the prompt alive
            print(f"[error] {e}")
        print("\n" + "-" * 60 + "\n")
    print("\n[exit] Exiting document addition system.")


if __name__ == "__main__":
    main()