Spaces:

Ali00922
/

scoutsearch

Sleeping

File size: 14,597 Bytes

da6a0a4

# search_engine_barrels.py
# BARREL-OPTIMIZED SEARCH ENGINE - Loads only required barrels per query
import csv
import json
import math
import os
import re
import time
from collections import defaultdict

# ---------- CONFIG & PATHS ----------

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
INDEX_DIR = os.path.join(PROJECT_ROOT, "data", "index")
BARREL_DIR = os.path.join(INDEX_DIR, "barrels")
LEXICON_PATH = os.path.join(INDEX_DIR, "lexicon_complete.json")
FORWARD_INDEX_PATH = os.path.join(INDEX_DIR, "forward_index_termid.json")
TERM_TO_BARREL_MAP_PATH = os.path.join(BARREL_DIR, "term_to_barrel_map.json")
MARKET_VALUE_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "raw", "player_latest_market_value", "player_latest_market_value.csv")
PROFILE_DATA_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "processed", "complete_player_profiles.json")

# BM25 parameters
K1 = 1.2
B = 0.75

# Scoring boosts
NAME_TOKEN_WEIGHT = 0.75
NAME_PREFIX_BONUS = 1.25
EXACT_NAME_BONUS = 3.0
RAW_SUBSTRING_BONUS = 0.25
MARKET_VALUE_WEIGHT = 12.0
PROFILE_LENGTH_WEIGHT = 4.0
NON_NAME_MATCH_PENALTY = 1.5

# ---------- TEXT NORMALIZATION ----------

COMPREHENSIVE_STOP_WORDS = {
    "the", "and", "in", "for", "with", "on", "at", "from", "by", "as", "is", "was",
    "are", "were", "be", "been", "have", "has", "had", "to", "of", "a", "an", "that",
    "this", "these", "those", "it", "its", "or", "but", "not", "what", "which", "who",
    "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "only", "own", "same", "so", "than", "too",
    "very", "can", "will", "just", "should", "now", "player", "club", "team", "football",
    "soccer", "match", "game", "season", "league", "cup", "champions", "premier", "la",
    "bundesliga", "serie", "current", "main", "position", "nationality", "birth", "place",
    # Universal terms that appear in ALL documents (filtering for memory/performance)
    "comprehensive", "international", "performance", "transfermarkt", "injury",
    "summary", "market", "history", "database", "value",
    # Stemmed versions and other universal terms
    "data", "teammat", "sourc", "career", "assist", "app", "minut",
    "available", "national", "significant", "teammate", "transfer", "goal"
}

def simple_stemmer(word: str) -> str:
    if word.endswith("ing") and len(word) > 5:
        return word[:-3]
    elif word.endswith("ed") and len(word) > 4:
        return word[:-2]
    elif word.endswith("es") and len(word) > 4:
        return word[:-2]
    elif word.endswith("s") and len(word) > 3:
        return word[:-1]
    return word

def normalize_and_tokenize(text: str):
    text = text.lower()
    tokens = re.findall(r"\b[a-z]+\b", text)
    result = []
    for w in tokens:
        if w in COMPREHENSIVE_STOP_WORDS or len(w) <= 2:
            continue
        result.append(simple_stemmer(w))
    return result

def normalize_name_tokens(value: str):
    if not isinstance(value, str):
        return []
    tokens = re.findall(r"[a-z]+", value.lower())
    return [simple_stemmer(tok) for tok in tokens if tok]

def build_name_metadata(name: str):
    tokens = normalize_name_tokens(name)
    token_set = set(tokens)
    normalized = " ".join(tokens)
    return {
        "tokens": tokens,
        "token_set": token_set,
        "normalized": normalized,
        "raw_lower": name.lower() if isinstance(name, str) else "",
    }

def load_market_values(path: str):
    values = {}
    try:
        with open(path, "r", encoding="utf-8") as handle:
            reader = csv.DictReader(handle)
            for row in reader:
                try:
                    player_id = int(row.get("player_id", ""))
                except (TypeError, ValueError):
                    continue
                raw_value = row.get("value")
                try:
                    value = float(raw_value)
                except (TypeError, ValueError):
                    continue
                date_key = row.get("date_unix", "") or ""
                current = values.get(player_id)
                if current is None or date_key > current[0]:
                    values[player_id] = (date_key, value)
    except FileNotFoundError:
        print(f"[warn] Market value file not found at {path}")
        return {}
    return {pid: info[1] for pid, info in values.items()}

def load_profile_lengths(path: str):
    try:
        with open(path, "r", encoding="utf-8") as handle:
            data = json.load(handle)
    except FileNotFoundError:
        print(f"[warn] Profile data file not found at {path}")
        return {}

    lengths = {}
    for entry in data:
        player_id = entry.get("player_id")
        if not isinstance(player_id, int):
            continue
        detailed = entry.get("detailed_content")
        if isinstance(detailed, str) and detailed:
            lengths[player_id] = len(detailed)
    return lengths

# ---------- LOAD STATIC INDEXES (NOT INVERTED INDEX) ----------

print("[init] Loading lexicon...")
with open(LEXICON_PATH, "r", encoding="utf-8") as f:
    lexicon_entries = json.load(f)
token_to_id = {entry["token"]: entry["term_id"] for entry in lexicon_entries}
termid_to_token = {entry["term_id"]: entry["token"] for entry in lexicon_entries}
term_document_frequency = {entry["term_id"]: entry["df"] for entry in lexicon_entries}
print(f"[done] Lexicon loaded: {len(token_to_id):,} tokens")

print("[init] Loading forward index...")
with open(FORWARD_INDEX_PATH, "r", encoding="utf-8") as f:
    forward_index = json.load(f)
doc_by_id = {doc["player_id"]: doc for doc in forward_index}
N = len(doc_by_id)
avg_doc_len = sum(d["total_terms"] for d in forward_index) / N if N > 0 else 0.0
name_metadata = {doc_id: build_name_metadata(doc.get("player_name"))
                 for doc_id, doc in doc_by_id.items()}
print(f"[done] Forward index: {N:,} documents (avg_len={avg_doc_len:.2f})")

print("[init] Loading term-to-barrel mapping...")
with open(TERM_TO_BARREL_MAP_PATH, "r", encoding="utf-8") as f:
    term_to_barrel = json.load(f)
print(f"[done] Term-to-barrel map loaded: {len(term_to_barrel):,} mappings")

print("[init] Loading market values...")
player_market_value = load_market_values(MARKET_VALUE_PATH)
max_market_value = max(player_market_value.values(), default=0.0)
market_value_log_max = math.log1p(max_market_value) if max_market_value > 0 else 1.0
print(f"[done] Market values loaded for {len(player_market_value):,} players")

print("[init] Loading profile metadata...")
profile_length_by_id = load_profile_lengths(PROFILE_DATA_PATH)
max_profile_length = max(profile_length_by_id.values(), default=0)
profile_length_log_max = math.log1p(max_profile_length) if max_profile_length > 0 else 1.0
print(f"[done] Profile metadata loaded for {len(profile_length_by_id):,} players")

# ---------- BARREL CACHE (LRU-like) ----------

barrel_cache = {}
MAX_CACHED_BARRELS = 10  # Keep only 10 barrels in memory at once

def load_barrel(barrel_name: str):
    """Load a barrel file and cache it. Implements simple LRU eviction."""
    if barrel_name in barrel_cache:
        return barrel_cache[barrel_name]
    
    barrel_path = os.path.join(BARREL_DIR, f"{barrel_name}.json")
    try:
        with open(barrel_path, "r", encoding="utf-8") as f:
            barrel_data = json.load(f)
        
        # Cache management
        if len(barrel_cache) >= MAX_CACHED_BARRELS:
            # Remove oldest (first) entry
            oldest_key = next(iter(barrel_cache))
            del barrel_cache[oldest_key]
        
        barrel_cache[barrel_name] = barrel_data
        return barrel_data
    except FileNotFoundError:
        print(f"[error] Barrel file not found: {barrel_path}")
        return None

# ---------- QUERY TO TERM IDs ----------

def tokens_to_term_ids(tokens):
    seen = set()
    unique_term_ids = []
    for tok in tokens:
        tid = token_to_id.get(tok)
        if tid is None or tid in seen:
            continue
        seen.add(tid)
        unique_term_ids.append(tid)
    return unique_term_ids

# ---------- BM25 SCORING ----------

def bm25_score(tf, df, doc_len, N, avg_doc_len, k1=K1, b=B):
    idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
    denom = tf + k1 * (1 - b + b * (doc_len / avg_doc_len))
    return idf * (tf * (k1 + 1) / denom)

# ---------- BARREL-BASED SEARCH ----------

def search(query: str, top_k: int = 10, verbose: bool = True):
    start_time = time.perf_counter()
    
    log = print if verbose else (lambda *args, **kwargs: None)
    
    log(f"\n[query] {query}")
    query_tokens = normalize_and_tokenize(query)
    term_ids = tokens_to_term_ids(query_tokens)
    
    if not term_ids:
        elapsed = (time.perf_counter() - start_time) * 1000
        log(f"No query terms found in lexicon. (took {elapsed:.2f} ms)")
        return []
    
    log("Query tokens -> term_ids:",
        [(termid_to_token.get(tid, "?"), tid) for tid in term_ids])
    
    # **KEY OPTIMIZATION: Determine which barrels to load**
    required_barrels = set()
    for tid in term_ids:
        barrel_name = term_to_barrel.get(str(tid))
        if barrel_name:
            required_barrels.add(barrel_name)
    
    log(f"[barrels] Loading {len(required_barrels)} barrel(s): {sorted(required_barrels)}")
    
    # Load only required barrels
    barrel_load_start = time.perf_counter()
    loaded_barrels = {}
    for barrel_name in required_barrels:
        barrel_data = load_barrel(barrel_name)
        if barrel_data:
            loaded_barrels[barrel_name] = barrel_data
    barrel_load_time = (time.perf_counter() - barrel_load_start) * 1000
    log(f"[barrels] Loaded in {barrel_load_time:.2f} ms")
    
    # BM25 scoring using barrel data
    scores = defaultdict(float)
    
    for tid in term_ids:
        df = term_document_frequency.get(tid, 0)
        if df == 0:
            continue
        
        # Get barrel for this term
        barrel_name = term_to_barrel.get(str(tid))
        if not barrel_name or barrel_name not in loaded_barrels:
            continue
        
        barrel_data = loaded_barrels[barrel_name]
        inverted_index_part = barrel_data.get("inverted_index", {})
        
        # Get postings for this term
        term_data = inverted_index_part.get(str(tid))
        if not term_data:
            continue
        
        postings = term_data.get("postings", {})
        
        for doc_id_str, info in postings.items():
            doc_id = int(doc_id_str)
            tf = info["tf"]
            doc_len = doc_by_id[doc_id]["total_terms"]
            scores[doc_id] += bm25_score(tf, df, doc_len, N, avg_doc_len)
    
    # Metadata boosting (same as before)
    if scores:
        query_name_tokens = normalize_name_tokens(query)
        query_name = " ".join(query_name_tokens)
        raw_query_lower = query.lower().strip()
        
        for doc_id in scores:
            boost = 0.0
            meta = name_metadata.get(doc_id)
            has_name_match = False
            match_count = 0
            
            if meta:
                if query_tokens:
                    match_count = sum(1 for tok in query_tokens if tok in meta["token_set"])
                    if match_count:
                        boost += NAME_TOKEN_WEIGHT * match_count
                        has_name_match = True
                
                if query_name:
                    if meta["normalized"] == query_name:
                        boost += EXACT_NAME_BONUS
                        has_name_match = True
                    elif meta["normalized"].startswith(query_name):
                        boost += NAME_PREFIX_BONUS
                        has_name_match = True
                
                if raw_query_lower and raw_query_lower in meta["raw_lower"]:
                    boost += RAW_SUBSTRING_BONUS
                    has_name_match = True
            
            if not has_name_match and query_tokens:
                boost -= NON_NAME_MATCH_PENALTY
            
            if has_name_match:
                value = player_market_value.get(doc_id)
                if value and market_value_log_max > 0.0:
                    boost += MARKET_VALUE_WEIGHT * (math.log1p(value) / market_value_log_max)
                
                length = profile_length_by_id.get(doc_id)
                if length and profile_length_log_max > 0.0:
                    boost += PROFILE_LENGTH_WEIGHT * (math.log1p(length) / profile_length_log_max)
            
            scores[doc_id] += boost
    
    if not scores:
        elapsed = (time.perf_counter() - start_time) * 1000
        log(f"No documents matched these terms. (took {elapsed:.2f} ms)")
        return []
    
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    
    results = []
    for rank, (doc_id, score) in enumerate(ranked, start=1):
        doc = doc_by_id[doc_id]
        results.append({
            "rank": rank,
            "doc_id": doc_id,
            "player_id": doc["player_id"],
            "player_name": doc["player_name"],
            "score": score,
            "market_value": player_market_value.get(doc_id),
        })
    
    elapsed = (time.perf_counter() - start_time) * 1000
    
    log("\n[top] Results:")
    for r in results:
        value = r["market_value"]
        length = profile_length_by_id.get(r["doc_id"])
        extras = []
        if value:
            extras.append(f"market_value~{value:,.0f} EUR")
        if length:
            extras.append(f"profile_chars={length}")
        extra_text = f" [{', '.join(extras)}]" if extras else ""
        log(f"{r['rank']:2d}. [{r['score']:.3f}] {r['player_name']} (player_id={r['player_id']}){extra_text}")
    
    log(f"\n[time] {elapsed:.2f} ms (barrel_load={barrel_load_time:.2f} ms)")
    log(f"[memory] {len(barrel_cache)} barrels cached, {len(required_barrels)} loaded for this query")
    
    if elapsed < 500:
        log("[perf]Under 500 ms goal")
    else:
        log("[perf]Above 500 ms goal")
    
    return results

# ---------- CLI ----------

if __name__ == "__main__":
    print("\n[ready] BARREL-OPTIMIZED search engine ready.")
    print(f"[info] System loads only required barrels per query (max {MAX_CACHED_BARRELS} cached)")
    print("[info] Type a query or press Enter to exit.\n")
    
    while True:
        q = input("Query> ").strip()
        if not q:
            break
        search(q, top_k=10)
    
    print("\n[exit] Exiting search engine.")