Spaces:

Ali00922
/

scoutsearch

Sleeping

App Files Files Community

scoutsearch / Backend /src /add_document.py

Ali00922

Upload 37 files

da6a0a4 verified 2 months ago

raw

history blame contribute delete

11.9 kB

	# add_document.py
	# DYNAMIC DOCUMENT ADDITION - Incrementally add new players without full rebuild
	import csv
	import json
	import math
	import os
	import re
	import time
	from collections import defaultdict

	# ---------- PATHS ----------

	# All index artifacts live under <parent of this script's directory>/data/index.
	SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
	PROJECT_ROOT = os.path.split(SCRIPT_DIR)[0]
	INDEX_DIR = os.path.join(PROJECT_ROOT, "data", "index")
	BARREL_DIR = os.path.join(INDEX_DIR, "barrels")

	# Individual index files.
	LEXICON_PATH = os.path.join(INDEX_DIR, "lexicon_complete.json")
	FORWARD_INDEX_PATH = os.path.join(INDEX_DIR, "forward_index_termid.json")
	# NOTE(review): nothing in this part of the file reads or writes the
	# monolithic inverted index -- confirm whether another module relies on it.
	INVERTED_INDEX_PATH = os.path.join(INDEX_DIR, "inverted_index_termid.json")
	# The term -> barrel map is stored next to the barrel files themselves.
	TERM_TO_BARREL_MAP_PATH = os.path.join(BARREL_DIR, "term_to_barrel_map.json")

	# ---------- TEXT NORMALIZATION (MUST MATCH BUILD PIPELINE) ----------

	# Words dropped during tokenization. The set is written as whitespace-separated
	# groups and split once at import time; membership is what matters, not order.
	COMPREHENSIVE_STOP_WORDS = set(
	    (
	        # Common English function words
	        "the and in for with on at from by as is was "
	        "are were be been have has had to of a an that "
	        "this these those it its or but not what which who "
	        "when where why how all any both each few more most "
	        "other some such no nor only own same so than too "
	        "very can will just should now "
	        # Football vocabulary and profile field labels
	        "player club team football "
	        "soccer match game season league cup champions premier la "
	        "bundesliga serie current main position nationality birth place "
	        # Universal terms that appear in ALL documents (filtering for memory/performance)
	        "comprehensive international performance transfermarkt injury "
	        "summary market history database value "
	        # Stemmed versions and other universal terms
	        "data teammat sourc career assist app minut "
	        "available national significant teammate transfer goal"
	    ).split()
	)

	def simple_stemmer(word: str) -> str:
	    """Strip one common English suffix from *word*.

	    Rules are tried in order and the first one that applies wins:
	    ``ing`` (word longer than 5), ``ed`` (longer than 4), ``es`` (longer
	    than 4), ``s`` (longer than 3). A word matching no rule is returned
	    unchanged. The rules must stay identical to the ones used when the
	    index was built, otherwise new documents would produce different terms.
	    """
	    # (suffix, word must be strictly longer than this to be stripped)
	    suffix_rules = (("ing", 5), ("ed", 4), ("es", 4), ("s", 3))
	    for suffix, min_length in suffix_rules:
	        if len(word) > min_length and word.endswith(suffix):
	            return word[:-len(suffix)]
	    return word

	def normalize_and_tokenize(text: str):
	    """Turn raw text into the list of index terms.

	    The text is lower-cased, split into runs of the letters a-z, filtered
	    (stop words and words of one or two letters are dropped) and each
	    surviving word is passed through ``simple_stemmer``. Order and
	    duplicates are preserved.

	    Filtering happens BEFORE stemming, so a word whose stem is a stop word
	    (but which is not itself listed) is still emitted.

	    The pattern only matches letter runs delimited by word boundaries, so a
	    run that touches a digit, underscore or non-ASCII letter (e.g. the
	    pieces of "müller") produces no token at all. This is kept as-is because
	    tokenization has to match the pipeline that built the existing index.
	    """
	    words = re.findall(r"\b[a-z]+\b", text.lower())
	    return [
	        simple_stemmer(word)
	        for word in words
	        if len(word) > 2 and word not in COMPREHENSIVE_STOP_WORDS
	    ]

	# ---------- LOAD EXISTING INDEXES ----------

	def load_indexes():
	    """Read the lexicon, forward index and term-to-barrel map from disk.

	    Returns:
	        dict with the keys
	        ``lexicon`` (list as stored in the lexicon file),
	        ``token_to_entry`` (token -> lexicon entry, sharing the same entry
	        objects as ``lexicon``),
	        ``max_term_id`` (largest ``term_id`` in the lexicon),
	        ``forward_index`` (list as stored in the forward-index file),
	        ``doc_by_id`` (``player_id`` -> forward-index entry) and
	        ``term_to_barrel`` (mapping as stored in the barrel-map file).

	    Raises:
	        Whatever ``open``/``json.load`` raise for missing or malformed files,
	        and ``ValueError`` from ``max`` when the lexicon is empty.
	    """
	    def _read_json(path):
	        with open(path, 'r', encoding='utf-8') as handle:
	            return json.load(handle)

	    print("[load] Loading lexicon...")
	    lexicon = _read_json(LEXICON_PATH)
	    token_to_entry = {}
	    for item in lexicon:
	        token_to_entry[item["token"]] = item
	    max_term_id = max(item["term_id"] for item in lexicon)
	    print(f"[done] Loaded {len(lexicon):,} tokens (max_term_id={max_term_id})")

	    print("[load] Loading forward index...")
	    forward_index = _read_json(FORWARD_INDEX_PATH)
	    doc_by_id = {}
	    for document in forward_index:
	        doc_by_id[document["player_id"]] = document
	    print(f"[done] Loaded {len(forward_index):,} documents")

	    print("[load] Loading term-to-barrel mapping...")
	    term_to_barrel = _read_json(TERM_TO_BARREL_MAP_PATH)
	    print(f"[done] Loaded {len(term_to_barrel):,} mappings")

	    loaded = {}
	    loaded['lexicon'] = lexicon
	    loaded['token_to_entry'] = token_to_entry
	    loaded['max_term_id'] = max_term_id
	    loaded['forward_index'] = forward_index
	    loaded['doc_by_id'] = doc_by_id
	    loaded['term_to_barrel'] = term_to_barrel
	    return loaded

	# ---------- ADD NEW DOCUMENT ----------

	def _write_json_atomic(path, payload):
	    """Write *payload* as JSON to *path* without exposing a half-written file.

	    The data is dumped to a sibling ``<path>.tmp`` file which is then moved
	    over the target with ``os.replace``. If serialization or the move fails,
	    the previous contents of *path* are left untouched.
	    """
	    tmp_path = f"{path}.tmp"
	    try:
	        with open(tmp_path, 'w', encoding='utf-8') as f:
	            json.dump(payload, f, ensure_ascii=False)
	        os.replace(tmp_path, path)
	    finally:
	        # The temp file only still exists when the dump or the move failed.
	        if os.path.exists(tmp_path):
	            os.remove(tmp_path)


	def _player_id_exists(player_id, doc_by_id):
	    """Return True if *player_id* is already indexed under an equivalent key.

	    Postings are keyed by ``str(player_id)``, so the int ``7`` and the string
	    ``"7"`` address the same posting slot. Both spellings are therefore
	    checked against the forward-index lookup, whatever key type it uses.
	    """
	    if player_id in doc_by_id or str(player_id) in doc_by_id:
	        return True
	    if isinstance(player_id, str):
	        try:
	            as_int = int(player_id)
	        except ValueError:
	            return False
	        # Only treat it as the same ID when it round-trips exactly ("007" != "7").
	        return str(as_int) == player_id and as_int in doc_by_id
	    return False


	def _barrel_count(term_to_barrel):
	    """Return the number of barrels implied by the term-to-barrel map.

	    Barrel names are parsed as ``<prefix>_<index>``; the count is the highest
	    index plus one. An empty map yields 1 so the first term lands in
	    ``barrel_000`` instead of crashing.
	    """
	    highest = max(
	        (int(name.split('_')[1]) for name in set(term_to_barrel.values())),
	        default=0,
	    )
	    return highest + 1


	def _update_barrel(barrel_name, term_rows, player_id):
	    """Add one document's postings to a single barrel file (one read, one write).

	    *term_rows* is a list of ``(term_id_str, token, tf, df)`` tuples for the
	    terms of the document that live in this barrel.
	    """
	    barrel_path = os.path.join(BARREL_DIR, f"{barrel_name}.json")

	    if os.path.exists(barrel_path):
	        with open(barrel_path, 'r', encoding='utf-8') as f:
	            barrel_data = json.load(f)
	    else:
	        barrel_data = {
	            'metadata': {
	                'term_count': 0,
	                'posting_count': 0,
	                'barrel_name': barrel_name
	            },
	            'inverted_index': {}
	        }

	    inverted = barrel_data['inverted_index']
	    doc_key = str(player_id)
	    for term_id_str, token, tf, df in term_rows:
	        term_data = inverted.setdefault(
	            term_id_str, {'token': token, 'df': df, 'postings': {}}
	        )
	        term_data['postings'][doc_key] = {"tf": tf}
	        # Keep the barrel's df in sync with the lexicon.
	        term_data['df'] = df

	    barrel_data['metadata']['term_count'] = len(inverted)
	    barrel_data['metadata']['posting_count'] = sum(
	        len(term_data['postings']) for term_data in inverted.values()
	    )

	    _write_json_atomic(barrel_path, barrel_data)


	def add_document(player_data: dict, indexes: dict):
	    """
	    Add a new player document to the search engine.

	    player_data format:
	    {
	        "player_id": 999999,
	        "player_name": "New Player",
	        "detailed_content": "Long text with player bio, stats, etc...",
	        # ... other metadata fields
	    }

	    indexes is the dict returned by load_indexes(); it is updated in place
	    (lexicon, token_to_entry, max_term_id, forward_index, doc_by_id,
	    term_to_barrel).

	    On disk, the affected barrel files, the lexicon, the forward index and
	    the term-to-barrel map are rewritten. Each barrel is read and written
	    once per call, and every file is replaced atomically. The file at
	    INVERTED_INDEX_PATH is NOT touched by this function.

	    NOTE: the in-memory indexes are updated before the files are written, so
	    if a write raises, `indexes` is ahead of what is on disk.

	    Returns: dict with statistics about the update, or {"error": "..."} when
	    the input is rejected (not a dict, missing fields, duplicate ID, or no
	    indexable tokens). Rejected input leaves indexes and files unchanged.
	    """
	    start_time = time.perf_counter()

	    if not isinstance(player_data, dict):
	        return {"error": "player_data must be a JSON object"}

	    player_id = player_data.get("player_id")
	    player_name = player_data.get("player_name", "")
	    # `or ""` so an explicit null does not get indexed as the word "none".
	    detailed_content = player_data.get("detailed_content") or ""

	    if not player_id or not player_name:
	        return {"error": "Missing required fields: player_id, player_name"}

	    if _player_id_exists(player_id, indexes['doc_by_id']):
	        return {"error": f"Player ID {player_id} already exists"}

	    print(f"\n[add] Adding player: {player_name} (ID={player_id})")

	    # 1. Tokenize content
	    print("[step 1/5] Tokenizing content...")
	    all_text = f"{player_name} {detailed_content}"
	    tokens = normalize_and_tokenize(all_text)

	    if not tokens:
	        return {"error": "No valid tokens found in document"}

	    # Count term frequencies (dict order = first occurrence in the text)
	    term_freq = defaultdict(int)
	    for token in tokens:
	        term_freq[token] += 1

	    total_terms = len(tokens)
	    unique_terms = len(term_freq)
	    print(f" Found {total_terms} tokens, {unique_terms} unique")

	    # 2. Update lexicon (assign term_ids to new tokens)
	    print("[step 2/5] Updating lexicon...")
	    token_to_entry = indexes['token_to_entry']
	    new_tokens = []
	    next_term_id = indexes['max_term_id'] + 1

	    for token in term_freq:
	        if token not in token_to_entry:
	            # New token - this document is its first occurrence
	            new_entry = {
	                "token": token,
	                "df": 1,
	                "term_id": next_term_id
	            }
	            indexes['lexicon'].append(new_entry)
	            token_to_entry[token] = new_entry
	            new_tokens.append(token)
	            next_term_id += 1
	        else:
	            # Existing token - one more document contains it
	            token_to_entry[token]["df"] += 1

	    indexes['max_term_id'] = next_term_id - 1
	    print(f" Added {len(new_tokens)} new tokens to lexicon")

	    # 3. Update forward index
	    print("[step 3/5] Updating forward index...")
	    # Keys are int term_ids here; json.dump writes them as strings.
	    term_ids_in_doc = {
	        token_to_entry[token]["term_id"]: {"token": token, "tf": tf}
	        for token, tf in term_freq.items()
	    }

	    forward_entry = {
	        "player_id": player_id,
	        "player_name": player_name,
	        "total_terms": total_terms,
	        "unique_terms": unique_terms,
	        "terms": term_ids_in_doc
	    }
	    indexes['forward_index'].append(forward_entry)
	    indexes['doc_by_id'][player_id] = forward_entry
	    print(f" Added document to forward index")

	    # 4. Update barrels (inverted index distributed)
	    print("[step 4/5] Updating barrels...")
	    term_to_barrel = indexes['term_to_barrel']
	    num_barrels = None  # computed lazily, at most once per call
	    rows_by_barrel = defaultdict(list)

	    for token, tf in term_freq.items():
	        entry = token_to_entry[token]
	        term_id = entry["term_id"]
	        term_id_str = str(term_id)

	        barrel_name = term_to_barrel.get(term_id_str)
	        if not barrel_name:
	            # New term - assign to a barrel (simple mod distribution). The
	            # result is always an existing index, so the barrel count cannot
	            # change while this loop runs.
	            if num_barrels is None:
	                num_barrels = _barrel_count(term_to_barrel)
	            barrel_idx = term_id % num_barrels
	            barrel_name = f"barrel_{barrel_idx:03d}"
	            term_to_barrel[term_id_str] = barrel_name

	        rows_by_barrel[barrel_name].append((term_id_str, token, tf, entry['df']))

	    # Group first, then touch each barrel file exactly once; only one barrel
	    # is held in memory at a time.
	    for barrel_name, term_rows in rows_by_barrel.items():
	        _update_barrel(barrel_name, term_rows, player_id)

	    barrels_updated = set(rows_by_barrel)
	    print(f" Updated {len(barrels_updated)} barrels: {sorted(barrels_updated)}")

	    # 5. Save updated indexes
	    print("[step 5/5] Saving updated indexes...")
	    _write_json_atomic(LEXICON_PATH, indexes['lexicon'])
	    _write_json_atomic(FORWARD_INDEX_PATH, indexes['forward_index'])
	    _write_json_atomic(TERM_TO_BARREL_MAP_PATH, term_to_barrel)
	    print(f" Saved all indexes")

	    elapsed = time.perf_counter() - start_time

	    stats = {
	        "success": True,
	        "player_id": player_id,
	        "player_name": player_name,
	        "total_terms": total_terms,
	        "unique_terms": unique_terms,
	        "new_tokens_added": len(new_tokens),
	        "barrels_updated": len(barrels_updated),
	        "time_seconds": elapsed,
	        "meets_requirement": elapsed < 60  # Must be under 1 minute
	    }

	    print(f"\n[done] Document added in {elapsed:.2f} seconds")
	    if stats["meets_requirement"]:
	        print("[perf]Under 1 minute requirement")
	    else:
	        print("[perf]Exceeded 1 minute requirement")

	    return stats

	# ---------- CLI ----------

	if __name__ == "__main__":
	    # Interactive loop: load the indexes once, then add one document per
	    # line of JSON typed on stdin until the user types 'exit' or stdin ends.
	    print("=" * 60)
	    print("DYNAMIC DOCUMENT ADDITION SYSTEM")
	    print("=" * 60)

	    # Load indexes
	    indexes = load_indexes()

	    print("\n[ready] System ready to add new documents.")
	    print("[info] Enter player data in JSON format or type 'exit' to quit.\n")

	    # Example usage
	    print("Example player data format:")
	    example = {
	        "player_id": 999999,
	        "player_name": "Test Player",
	        "detailed_content": "This is a test player from Manchester United. He plays as a striker and has won multiple trophies."
	    }
	    print(json.dumps(example, indent=2))
	    print("\n" + "-" * 60 + "\n")

	    while True:
	        print("Enter player data (JSON) or 'exit':")
	        try:
	            # input() reads a single line, so the JSON must fit on one line.
	            user_input = input("> ").strip()
	        except (EOFError, KeyboardInterrupt):
	            # stdin closed (piped input, Ctrl-D) or Ctrl-C: leave cleanly
	            # instead of dying with a traceback.
	            print()
	            break

	        if user_input.lower() == 'exit':
	            break

	        if not user_input:
	            # Blank line: just prompt again.
	            continue

	        try:
	            player_data = json.loads(user_input)
	            result = add_document(player_data, indexes)
	            print("\n[result]")
	            print(json.dumps(result, indent=2))
	        except json.JSONDecodeError as e:
	            print(f"[error] Invalid JSON: {e}")
	        except Exception as e:
	            # Top-level boundary: report the failure and keep the session alive.
	            print(f"[error] {e}")

	        print("\n" + "-" * 60 + "\n")

	    print("\n[exit] Exiting document addition system.")