Spaces:

Ali00922
/

scoutsearch

Sleeping

App Files Files Community

scoutsearch / Backend /src /search_engine.py

Ali00922

Upload 37 files

da6a0a4 verified 2 months ago

raw

history blame contribute delete

14.6 kB

	# search_engine_barrels.py
	# BARREL-OPTIMIZED SEARCH ENGINE - Loads only required barrels per query
	import csv
	import json
	import math
	import os
	import re
	import time
	from collections import defaultdict

	# ---------- CONFIG & PATHS ----------

	# Every data location is derived from the absolute project root (the parent
	# of this file's directory), so no path depends on the current working
	# directory or carries a ".." segment.
	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
	INDEX_DIR = os.path.join(PROJECT_ROOT, "data", "index")
	BARREL_DIR = os.path.join(INDEX_DIR, "barrels")
	LEXICON_PATH = os.path.join(INDEX_DIR, "lexicon_complete.json")
	FORWARD_INDEX_PATH = os.path.join(INDEX_DIR, "forward_index_termid.json")
	TERM_TO_BARREL_MAP_PATH = os.path.join(BARREL_DIR, "term_to_barrel_map.json")
	MARKET_VALUE_PATH = os.path.join(
	    PROJECT_ROOT,
	    "data",
	    "raw",
	    "player_latest_market_value",
	    "player_latest_market_value.csv",
	)
	PROFILE_DATA_PATH = os.path.join(
	    PROJECT_ROOT, "data", "processed", "complete_player_profiles.json"
	)

	# BM25 parameters (used as the default k1 / b arguments of bm25_score)
	K1 = 1.2  # k1: multiplies the length-normalisation term in the BM25 denominator
	B = 0.75  # b: weight given to doc_len / avg_doc_len inside that term

	# Scoring boosts (added to a document's BM25 score inside search())
	NAME_TOKEN_WEIGHT = 0.75  # added once per query token found among the player's name tokens
	NAME_PREFIX_BONUS = 1.25  # added when the normalized name starts with the normalized query
	EXACT_NAME_BONUS = 3.0  # added when the normalized name equals the normalized query
	RAW_SUBSTRING_BONUS = 0.25  # added when the lower-cased query is a substring of the lower-cased name
	MARKET_VALUE_WEIGHT = 12.0  # multiplies log1p(market value) / log1p(largest market value)
	PROFILE_LENGTH_WEIGHT = 4.0  # multiplies log1p(profile length) / log1p(longest profile length)
	NON_NAME_MATCH_PENALTY = 1.5  # subtracted when none of the name checks matched

	# ---------- TEXT NORMALIZATION ----------

	# Words that normalize_and_tokenize discards. Each group is a
	# whitespace-separated string; the groups are concatenated and split into
	# one set.
	COMPREHENSIVE_STOP_WORDS = set(
	    (
	        # General English function words
	        "the and in for with on at from by as is was "
	        "are were be been have has had to of a an that "
	        "this these those it its or but not what which who "
	        "when where why how all any both each few more most "
	        "other some such no nor only own same so than too "
	        "very can will just should now "
	        # Football vocabulary and profile field labels
	        "player club team football "
	        "soccer match game season league cup champions premier la "
	        "bundesliga serie current main position nationality birth place "
	        # Universal terms that appear in ALL documents (filtering for memory/performance)
	        "comprehensive international performance transfermarkt injury "
	        "summary market history database value "
	        # Stemmed versions and other universal terms
	        "data teammat sourc career assist app minut "
	        "available national significant teammate transfer goal"
	    ).split()
	)

	def simple_stemmer(word: str) -> str:
	    """Strip at most one English suffix from *word*.

	    The rules are tried in order ("ing", "ed", "es", "s"). A rule applies
	    only when the word ends with the suffix and is longer than that rule's
	    minimum length; the first rule that applies wins. Words that match no
	    rule are returned unchanged.
	    """
	    # (suffix, length the word must exceed for the suffix to be removed)
	    suffix_rules = (("ing", 5), ("ed", 4), ("es", 4), ("s", 3))
	    for suffix, min_length in suffix_rules:
	        if len(word) > min_length and word.endswith(suffix):
	            return word[: -len(suffix)]
	    return word

	def normalize_and_tokenize(text: str):
	    """Turn free text into a list of stemmed search tokens.

	    The text is lower-cased and split into runs of the letters a-z that
	    sit between word boundaries. A word is dropped when it has two
	    characters or fewer, or when it is a stop word either before or after
	    stemming. Surviving words are returned as stems, in their original
	    order (duplicates are kept).
	    """
	    tokens = []
	    for word in re.findall(r"\b[a-z]+\b", text.lower()):
	        if len(word) <= 2 or word in COMPREHENSIVE_STOP_WORDS:
	            continue
	        stem = simple_stemmer(word)
	        # The stop list also contains stemmed forms (e.g. "teammat",
	        # "minut") and singulars whose plurals only match after stemming
	        # (e.g. "goals" -> "goal"), so the stem has to be checked as well.
	        if stem in COMPREHENSIVE_STOP_WORDS:
	            continue
	        tokens.append(stem)
	    return tokens

	def normalize_name_tokens(value: str):
	    """Return the stemmed a-z letter runs of *value*, in order.

	    Anything that is not a string yields an empty list. No stop-word or
	    length filtering is applied here.
	    """
	    if isinstance(value, str):
	        letter_runs = re.findall(r"[a-z]+", value.lower())
	        return [simple_stemmer(piece) for piece in letter_runs]
	    return []

	def build_name_metadata(name: str):
	    """Precompute the name fields used for name matching.

	    Returns a dict with:
	      tokens     - stemmed name tokens from normalize_name_tokens
	      token_set  - the same tokens as a set
	      normalized - the tokens joined with single spaces
	      raw_lower  - the lower-cased name, or "" when *name* is not a string
	    """
	    stems = normalize_name_tokens(name)
	    lowered = name.lower() if isinstance(name, str) else ""
	    return dict(
	        tokens=stems,
	        token_set=set(stems),
	        normalized=" ".join(stems),
	        raw_lower=lowered,
	    )

	def _date_recency_key(raw):
	    """Build a sortable key for a raw ``date_unix`` cell.

	    Finite numeric cells compare by numeric value and rank above every
	    non-numeric cell; non-numeric cells (including empty ones) compare as
	    plain text among themselves.
	    """
	    text = (raw or "").strip()
	    try:
	        number = float(text)
	    except ValueError:
	        return (0, 0.0, text)
	    if not math.isfinite(number):
	        return (0, 0.0, text)
	    return (1, number, "")


	def load_market_values(path: str):
	    """Read the newest market value of each player from a CSV file.

	    The file is parsed with ``csv.DictReader`` using the columns
	    ``player_id``, ``value`` and ``date_unix``. Rows are skipped when
	    ``player_id`` is not an integer or ``value`` is not a finite number.
	    When a player occurs several times, the row with the greatest
	    ``date_unix`` wins; on a tie the earlier row is kept.

	    Args:
	        path: Location of the CSV file.

	    Returns:
	        A dict mapping player id (int) to market value (float). If the
	        file does not exist a warning is printed and ``{}`` is returned.
	    """
	    latest = {}
	    try:
	        # newline="" lets the csv module handle line endings itself.
	        with open(path, "r", encoding="utf-8", newline="") as handle:
	            for row in csv.DictReader(handle):
	                try:
	                    player_id = int(row.get("player_id", ""))
	                    value = float(row.get("value"))
	                except (TypeError, ValueError):
	                    continue
	                # NaN / infinity would poison the max() and log1p() that
	                # are later applied to these values.
	                if not math.isfinite(value):
	                    continue
	                # Compare dates numerically: as strings, "999999999"
	                # would sort after "1000000000".
	                stamp = _date_recency_key(row.get("date_unix"))
	                known = latest.get(player_id)
	                if known is None or stamp > known[0]:
	                    latest[player_id] = (stamp, value)
	    except FileNotFoundError:
	        print(f"[warn] Market value file not found at {path}")
	        return {}
	    return {pid: value for pid, (_, value) in latest.items()}

	def load_profile_lengths(path: str):
	    """Map each player id to the character count of its detailed profile.

	    The JSON file at *path* is iterated entry by entry. An entry
	    contributes only when its ``player_id`` is an int and its
	    ``detailed_content`` is a non-empty string; a later entry for the same
	    id replaces an earlier one. If the file does not exist a warning is
	    printed and ``{}`` is returned.
	    """
	    try:
	        with open(path, "r", encoding="utf-8") as handle:
	            profiles = json.load(handle)
	    except FileNotFoundError:
	        print(f"[warn] Profile data file not found at {path}")
	        return {}

	    id_and_text = (
	        (item.get("player_id"), item.get("detailed_content"))
	        for item in profiles
	    )
	    return {
	        pid: len(text)
	        for pid, text in id_and_text
	        if isinstance(pid, int) and isinstance(text, str) and text
	    }

	# ---------- LOAD STATIC INDEXES (NOT INVERTED INDEX) ----------

	# Lexicon: one entry per token, carrying its term id and document frequency.
	print("[init] Loading lexicon...")
	with open(LEXICON_PATH, "r", encoding="utf-8") as f:
	    lexicon_entries = json.load(f)
	# Three lookups over the same entries: token -> id, id -> token, id -> df.
	token_to_id = dict((item["token"], item["term_id"]) for item in lexicon_entries)
	termid_to_token = dict((item["term_id"], item["token"]) for item in lexicon_entries)
	term_document_frequency = dict(
	    (item["term_id"], item["df"]) for item in lexicon_entries
	)
	print(f"[done] Lexicon loaded: {len(token_to_id):,} tokens")

	# Forward index: per-document records, keyed below by their player_id.
	print("[init] Loading forward index...")
	with open(FORWARD_INDEX_PATH, "r", encoding="utf-8") as f:
	    forward_index = json.load(f)
	doc_by_id = dict((record["player_id"], record) for record in forward_index)
	N = len(doc_by_id)
	if N > 0:
	    avg_doc_len = sum(record["total_terms"] for record in forward_index) / N
	else:
	    avg_doc_len = 0.0
	# Name fields are precomputed once so that queries only do lookups.
	name_metadata = dict(
	    (doc_id, build_name_metadata(record.get("player_name")))
	    for doc_id, record in doc_by_id.items()
	)
	print(f"[done] Forward index: {N:,} documents (avg_len={avg_doc_len:.2f})")

	# Term-to-barrel map: search() looks terms up here by str(term_id).
	print("[init] Loading term-to-barrel mapping...")
	with open(TERM_TO_BARREL_MAP_PATH, "r", encoding="utf-8") as f:
	    term_to_barrel = json.load(f)
	print(f"[done] Term-to-barrel map loaded: {len(term_to_barrel):,} mappings")

	# Market values and the log-scale maximum used to normalise them.
	print("[init] Loading market values...")
	player_market_value = load_market_values(MARKET_VALUE_PATH)
	max_market_value = max(player_market_value.values(), default=0.0)
	if max_market_value > 0:
	    market_value_log_max = math.log1p(max_market_value)
	else:
	    # No usable values: fall back to 1.0 so the normaliser is never zero.
	    market_value_log_max = 1.0
	print(f"[done] Market values loaded for {len(player_market_value):,} players")

	# Profile lengths and the log-scale maximum used to normalise them.
	print("[init] Loading profile metadata...")
	profile_length_by_id = load_profile_lengths(PROFILE_DATA_PATH)
	max_profile_length = max(profile_length_by_id.values(), default=0)
	if max_profile_length > 0:
	    profile_length_log_max = math.log1p(max_profile_length)
	else:
	    profile_length_log_max = 1.0
	print(f"[done] Profile metadata loaded for {len(profile_length_by_id):,} players")

	# ---------- BARREL CACHE (LRU-like) ----------

	# Barrel name -> parsed barrel JSON. Insertion order doubles as recency
	# order: the first key is the least recently used barrel.
	barrel_cache = {}
	MAX_CACHED_BARRELS = 10 # Keep only 10 barrels in memory at once


	def load_barrel(barrel_name: str):
	    """Return the parsed contents of a barrel, reading the file on a miss.

	    Barrels are cached in ``barrel_cache`` with least-recently-used
	    eviction: a cache hit moves the barrel to the most-recent end, and a
	    miss evicts from the least-recent end until there is room for the new
	    barrel.

	    Args:
	        barrel_name: File name of the barrel inside BARREL_DIR, without
	            the ``.json`` extension.

	    Returns:
	        The decoded JSON of the barrel, or ``None`` (after printing an
	        error) when the barrel file does not exist. Missing barrels are
	        not cached.
	    """
	    if barrel_name in barrel_cache:
	        # Re-insert the entry so it becomes the most recently used one;
	        # without this, eviction would be first-in-first-out.
	        barrel_data = barrel_cache.pop(barrel_name)
	        barrel_cache[barrel_name] = barrel_data
	        return barrel_data

	    barrel_path = os.path.join(BARREL_DIR, f"{barrel_name}.json")
	    try:
	        with open(barrel_path, "r", encoding="utf-8") as f:
	            barrel_data = json.load(f)
	    except FileNotFoundError:
	        print(f"[error] Barrel file not found: {barrel_path}")
	        return None

	    # Make room by dropping the least recently used barrels.
	    while barrel_cache and len(barrel_cache) >= MAX_CACHED_BARRELS:
	        del barrel_cache[next(iter(barrel_cache))]

	    barrel_cache[barrel_name] = barrel_data
	    return barrel_data

	# ---------- QUERY TO TERM IDs ----------

	def tokens_to_term_ids(tokens):
	    """Translate tokens into lexicon term ids.

	    Tokens missing from ``token_to_id`` are ignored. Each term id appears
	    once in the result, at the position of its first occurrence.
	    """
	    looked_up = (token_to_id.get(token) for token in tokens)
	    # dict.fromkeys de-duplicates while keeping first-seen order.
	    return list(dict.fromkeys(tid for tid in looked_up if tid is not None))

	# ---------- BM25 SCORING ----------

	def bm25_score(tf, df, doc_len, N, avg_doc_len, k1=K1, b=B):
	    """Return the BM25 contribution of one term to one document.

	    Args:
	        tf: Occurrences of the term in the document.
	        df: Number of documents that contain the term.
	        doc_len: Length of the document in terms.
	        N: Number of documents in the collection.
	        avg_doc_len: Average document length over the collection.
	        k1: Multiplier of the length-normalisation term (default K1).
	        b: Weight of doc_len / avg_doc_len in that term (default B).

	    Returns:
	        idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len)),
	        with idf = log((N - df + 0.5) / (df + 0.5) + 1).
	    """
	    idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
	    # avg_doc_len is 0.0 when the collection is empty; treat the document
	    # as average-length then instead of dividing by zero.
	    length_ratio = doc_len / avg_doc_len if avg_doc_len > 0 else 1.0
	    denom = tf + k1 * (1 - b + b * length_ratio)
	    return idf * (tf * (k1 + 1) / denom)

	# ---------- BARREL-BASED SEARCH ----------

	def _required_barrel_names(term_ids):
	    """Return the names of the barrels that hold postings for *term_ids*."""
	    mapped = (term_to_barrel.get(str(tid)) for tid in term_ids)
	    return {name for name in mapped if name}


	def _bm25_scores(term_ids, loaded_barrels):
	    """Sum the BM25 contribution of every query term per document id."""
	    scores = defaultdict(float)
	    for tid in term_ids:
	        df = term_document_frequency.get(tid, 0)
	        if df == 0:
	            continue

	        # A term whose barrel is unmapped or failed to load is skipped.
	        barrel_data = loaded_barrels.get(term_to_barrel.get(str(tid)))
	        if not barrel_data:
	            continue

	        term_data = barrel_data.get("inverted_index", {}).get(str(tid))
	        if not term_data:
	            continue

	        for doc_id_str, info in term_data.get("postings", {}).items():
	            doc_id = int(doc_id_str)
	            doc = doc_by_id.get(doc_id)
	            if doc is None:
	                # The posting points at a document that is absent from the
	                # forward index; skip it rather than fail the whole query.
	                continue
	            scores[doc_id] += bm25_score(
	                info["tf"], df, doc["total_terms"], N, avg_doc_len
	            )
	    return scores


	def _metadata_boost(doc_id, query_tokens, query_name, raw_query_lower):
	    """Return the name/popularity adjustment added to a document's score."""
	    boost = 0.0
	    has_name_match = False
	    meta = name_metadata.get(doc_id)

	    if meta:
	        if query_tokens:
	            match_count = sum(
	                1 for tok in query_tokens if tok in meta["token_set"]
	            )
	            if match_count:
	                boost += NAME_TOKEN_WEIGHT * match_count
	                has_name_match = True

	        if query_name:
	            if meta["normalized"] == query_name:
	                boost += EXACT_NAME_BONUS
	                has_name_match = True
	            elif meta["normalized"].startswith(query_name):
	                boost += NAME_PREFIX_BONUS
	                has_name_match = True

	        if raw_query_lower and raw_query_lower in meta["raw_lower"]:
	            boost += RAW_SUBSTRING_BONUS
	            has_name_match = True

	    if not has_name_match:
	        if query_tokens:
	            boost -= NON_NAME_MATCH_PENALTY
	        return boost

	    # NOTE(review): the source's indentation was lost; both popularity
	    # priors below are assumed to apply only to name matches - confirm.
	    value = player_market_value.get(doc_id)
	    # log1p is undefined for values <= -1, so only positive values count.
	    if value and value > 0 and market_value_log_max > 0.0:
	        boost += MARKET_VALUE_WEIGHT * (math.log1p(value) / market_value_log_max)

	    length = profile_length_by_id.get(doc_id)
	    if length and profile_length_log_max > 0.0:
	        boost += PROFILE_LENGTH_WEIGHT * (
	            math.log1p(length) / profile_length_log_max
	        )
	    return boost


	def search(query: str, top_k: int = 10, verbose: bool = True):
	    """Rank players for *query*, loading only the barrels the query needs.

	    The query is tokenized, mapped to term ids, and scored with BM25 over
	    the postings of the required barrels. Each matching document then
	    receives a name/popularity adjustment before the best results are
	    selected.

	    Args:
	        query: Free-text query.
	        top_k: Maximum number of results; values below 1 yield no results.
	        verbose: When true, progress, results and timings are printed.

	    Returns:
	        A list of dicts ordered by descending score, each with the keys
	        ``rank``, ``doc_id``, ``player_id``, ``player_name``, ``score`` and
	        ``market_value``. The list is empty when no query term is in the
	        lexicon or no document matches.
	    """
	    start_time = time.perf_counter()

	    # The silent logger must accept whatever print() accepts.
	    log = print if verbose else (lambda *args, **kwargs: None)

	    log(f"\n[query] {query}")
	    query_tokens = normalize_and_tokenize(query)
	    term_ids = tokens_to_term_ids(query_tokens)

	    if not term_ids:
	        elapsed = (time.perf_counter() - start_time) * 1000
	        log(f"No query terms found in lexicon. (took {elapsed:.2f} ms)")
	        return []

	    log("Query tokens -> term_ids:",
	        [(termid_to_token.get(tid, "?"), tid) for tid in term_ids])

	    # KEY OPTIMIZATION: only the barrels that contain a query term are read.
	    required_barrels = _required_barrel_names(term_ids)
	    log(f"[barrels] Loading {len(required_barrels)} barrel(s): {sorted(required_barrels)}")

	    barrel_load_start = time.perf_counter()
	    loaded_barrels = {}
	    for barrel_name in required_barrels:
	        barrel_data = load_barrel(barrel_name)
	        if barrel_data:
	            loaded_barrels[barrel_name] = barrel_data
	    barrel_load_time = (time.perf_counter() - barrel_load_start) * 1000
	    log(f"[barrels] Loaded in {barrel_load_time:.2f} ms")

	    scores = _bm25_scores(term_ids, loaded_barrels)
	    if not scores:
	        elapsed = (time.perf_counter() - start_time) * 1000
	        log(f"No documents matched these terms. (took {elapsed:.2f} ms)")
	        return []

	    # Name and popularity adjustments on top of the BM25 scores.
	    query_name = " ".join(normalize_name_tokens(query))
	    raw_query_lower = query.lower().strip()
	    for doc_id in scores:
	        scores[doc_id] += _metadata_boost(
	            doc_id, query_tokens, query_name, raw_query_lower
	        )

	    # A negative top_k would otherwise slice off the tail of the ranking.
	    limit = max(top_k, 0)
	    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:limit]

	    results = []
	    for rank, (doc_id, score) in enumerate(ranked, start=1):
	        doc = doc_by_id[doc_id]
	        results.append({
	            "rank": rank,
	            "doc_id": doc_id,
	            "player_id": doc["player_id"],
	            "player_name": doc["player_name"],
	            "score": score,
	            "market_value": player_market_value.get(doc_id),
	        })

	    elapsed = (time.perf_counter() - start_time) * 1000

	    if verbose:
	        # Result lines are only formatted when they will be printed.
	        log("\n[top] Results:")
	        for r in results:
	            value = r["market_value"]
	            length = profile_length_by_id.get(r["doc_id"])
	            extras = []
	            if value:
	                extras.append(f"market_value~{value:,.0f} EUR")
	            if length:
	                extras.append(f"profile_chars={length}")
	            extra_text = f" [{', '.join(extras)}]" if extras else ""
	            log(f"{r['rank']:2d}. [{r['score']:.3f}] {r['player_name']} (player_id={r['player_id']}){extra_text}")

	        log(f"\n[time] {elapsed:.2f} ms (barrel_load={barrel_load_time:.2f} ms)")
	        log(f"[memory] {len(barrel_cache)} barrels cached, {len(required_barrels)} loaded for this query")

	        if elapsed < 500:
	            log("[perf]Under 500 ms goal")
	        else:
	            log("[perf]Above 500 ms goal")

	    return results

	# ---------- CLI ----------

	if __name__ == "__main__":
	    # Interactive prompt: run one search per line until an empty line,
	    # end-of-input (Ctrl-D) or an interrupt (Ctrl-C) is received.
	    print("\n[ready] BARREL-OPTIMIZED search engine ready.")
	    print(f"[info] System loads only required barrels per query (max {MAX_CACHED_BARRELS} cached)")
	    print("[info] Type a query or press Enter to exit.\n")

	    while True:
	        try:
	            q = input("Query> ").strip()
	        except (EOFError, KeyboardInterrupt):
	            # Closing stdin or interrupting the prompt is a normal way to
	            # leave; exit cleanly instead of with a traceback.
	            break
	        if not q:
	            break
	        search(q, top_k=10)

	    print("\n[exit] Exiting search engine.")