File size: 14,597 Bytes
da6a0a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
# search_engine_barrels.py
# BARREL-OPTIMIZED SEARCH ENGINE - Loads only required barrels per query
import csv
import json
import math
import os
import re
import time
from collections import defaultdict

# ---------- CONFIG & PATHS ----------

# Directory containing this script, and the project root one level above it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
# Index artifacts read at import time: lexicon, forward index, and the barrel
# directory with its term -> barrel lookup table.
INDEX_DIR = os.path.join(PROJECT_ROOT, "data", "index")
BARREL_DIR = os.path.join(INDEX_DIR, "barrels")
LEXICON_PATH = os.path.join(INDEX_DIR, "lexicon_complete.json")
FORWARD_INDEX_PATH = os.path.join(INDEX_DIR, "forward_index_termid.json")
TERM_TO_BARREL_MAP_PATH = os.path.join(BARREL_DIR, "term_to_barrel_map.json")
# Auxiliary ranking data (market values CSV, player profiles JSON).
# NOTE(review): these two are built from the raw ``__file__`` plus ".." instead
# of PROJECT_ROOT like the paths above; they point at the same tree but are not
# normalised. Consider PROJECT_ROOT for consistency.
MARKET_VALUE_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "raw", "player_latest_market_value", "player_latest_market_value.csv")
PROFILE_DATA_PATH = os.path.join(os.path.dirname(__file__), "..", "data", "processed", "complete_player_profiles.json")

# BM25 parameters (defaults of bm25_score): K1 is the term-frequency
# saturation constant, B the strength of document-length normalisation.
K1 = 1.2
B = 0.75

# Scoring boosts applied by search() on top of the BM25 score.
NAME_TOKEN_WEIGHT = 0.75       # added once per query token found in the player's name tokens
NAME_PREFIX_BONUS = 1.25       # normalised name starts with the normalised query
EXACT_NAME_BONUS = 3.0         # normalised name equals the normalised query
RAW_SUBSTRING_BONUS = 0.25     # lower-cased raw query is a substring of the lower-cased name
MARKET_VALUE_WEIGHT = 12.0     # scales log1p(value) / log1p(max value); name matches only
PROFILE_LENGTH_WEIGHT = 4.0    # scales log1p(profile chars) / log1p(max chars); name matches only
NON_NAME_MATCH_PENALTY = 1.5   # subtracted when nothing in the query matched the name

# ---------- TEXT NORMALIZATION ----------

# Words dropped by normalize_and_tokenize(): generic English function words
# plus domain terms that carry no discriminating power for this corpus.
# NOTE(review): normalize_and_tokenize() tests membership on the raw lower-cased
# token *before* stemming, so the stemmed entries below ("teammat", "sourc",
# "minut", ...) only filter input that is literally spelled that way; e.g.
# "teammates" is not in the set and would pass through as "teammat".
COMPREHENSIVE_STOP_WORDS = {
    "the", "and", "in", "for", "with", "on", "at", "from", "by", "as", "is", "was",
    "are", "were", "be", "been", "have", "has", "had", "to", "of", "a", "an", "that",
    "this", "these", "those", "it", "its", "or", "but", "not", "what", "which", "who",
    "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "only", "own", "same", "so", "than", "too",
    "very", "can", "will", "just", "should", "now", "player", "club", "team", "football",
    "soccer", "match", "game", "season", "league", "cup", "champions", "premier", "la",
    "bundesliga", "serie", "current", "main", "position", "nationality", "birth", "place",
    # Universal terms that appear in ALL documents (filtering for memory/performance)
    "comprehensive", "international", "performance", "transfermarkt", "injury",
    "summary", "market", "history", "database", "value",
    # Stemmed versions and other universal terms
    "data", "teammat", "sourc", "career", "assist", "app", "minut",
    "available", "national", "significant", "teammate", "transfer", "goal"
}

def simple_stemmer(word: str) -> str:
    """Strip at most one common English suffix from *word*.

    Rules are tried in priority order and the first one that applies wins:
    "ing" (word longer than 5), "ed" (longer than 4), "es" (longer than 4),
    "s" (longer than 3). A word matching no rule is returned unchanged.
    """
    # (suffix, length the word must exceed for the suffix to be removed)
    suffix_rules = (("ing", 5), ("ed", 4), ("es", 4), ("s", 3))
    for suffix, floor in suffix_rules:
        if len(word) > floor and word.endswith(suffix):
            return word[: len(word) - len(suffix)]
    return word

def normalize_and_tokenize(text: str):
    """Turn free text into a list of stemmed index tokens.

    The text is lower-cased and split into runs of ASCII letters delimited by
    word boundaries. Tokens of one or two characters and tokens listed in
    COMPREHENSIVE_STOP_WORDS are discarded; the survivors are stemmed with
    simple_stemmer. Order and duplicates are preserved.
    """
    candidates = re.findall(r"\b[a-z]+\b", text.lower())
    # Filtering is done on the raw token; stemming happens only afterwards.
    return [
        simple_stemmer(word)
        for word in candidates
        if len(word) > 2 and word not in COMPREHENSIVE_STOP_WORDS
    ]

def normalize_name_tokens(value: str):
    """Split a player name (or a query) into stemmed lower-case letter runs.

    Unlike normalize_and_tokenize, no stop-word or minimum-length filtering is
    applied. Anything that is not a string yields an empty list.
    """
    if isinstance(value, str):
        letter_runs = re.findall(r"[a-z]+", value.lower())
        return [simple_stemmer(piece) for piece in letter_runs]
    return []

def build_name_metadata(name: str):
    """Precompute the name views used for name-based score boosting.

    Returns a dict with:
      "tokens"     - stemmed name tokens in order (normalize_name_tokens),
      "token_set"  - the same tokens as a set for membership tests,
      "normalized" - the tokens joined by single spaces,
      "raw_lower"  - the lower-cased original name, or "" when *name* is not
                     a string.
    """
    name_tokens = normalize_name_tokens(name)
    lowered = name.lower() if isinstance(name, str) else ""
    return dict(
        tokens=name_tokens,
        token_set=set(name_tokens),
        normalized=" ".join(name_tokens),
        raw_lower=lowered,
    )

def _market_date_sort_key(date_key):
    """Return a key that orders ``date_unix`` cells chronologically.

    Cells that parse as a finite number (e.g. Unix timestamps) are compared
    numerically, so "1000000000" is newer than "999999999". Anything else
    (ISO dates, empty cells) is compared as text, which is chronological for
    ISO-8601 dates. Numeric cells always rank above non-numeric ones.
    """
    try:
        stamp = float(date_key)
    except (TypeError, ValueError):
        stamp = None
    if stamp is not None and math.isfinite(stamp):
        return (1, stamp, "")
    return (0, 0.0, date_key)


def load_market_values(path: str):
    """Load the most recent market value per player from a CSV file.

    The CSV must have ``player_id``, ``value`` and ``date_unix`` columns.
    Rows whose ``player_id`` is not an integer, or whose ``value`` is not a
    finite number, are skipped. When a player appears several times, the row
    with the latest ``date_unix`` wins; on a tie the first row seen is kept.

    Args:
        path: Location of the CSV file.

    Returns:
        Dict mapping ``player_id`` (int) to market value (float). Empty when
        the file does not exist (a warning is printed in that case).
    """
    latest = {}
    try:
        # newline="" is what the csv module requires so that embedded
        # newlines inside quoted fields are parsed correctly.
        with open(path, "r", encoding="utf-8", newline="") as handle:
            for row in csv.DictReader(handle):
                try:
                    player_id = int(row.get("player_id", ""))
                    value = float(row.get("value"))
                except (TypeError, ValueError):
                    continue
                # NaN/inf would poison max() and the log scaling downstream.
                if not math.isfinite(value):
                    continue
                sort_key = _market_date_sort_key(row.get("date_unix", "") or "")
                current = latest.get(player_id)
                if current is None or sort_key > current[0]:
                    latest[player_id] = (sort_key, value)
    except FileNotFoundError:
        print(f"[warn] Market value file not found at {path}")
        return {}
    return {pid: info[1] for pid, info in latest.items()}

def load_profile_lengths(path: str):
    """Map each player id to the character length of its detailed profile.

    Reads a JSON array of profile objects. An entry contributes only when its
    "player_id" is an int and its "detailed_content" is a non-empty string;
    if an id occurs more than once, the last usable entry wins.

    Returns an empty dict (after printing a warning) when the file is missing.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            profiles = json.load(handle)
    except FileNotFoundError:
        print(f"[warn] Profile data file not found at {path}")
        return {}

    id_and_text = (
        (profile.get("player_id"), profile.get("detailed_content"))
        for profile in profiles
    )
    return {
        pid: len(text)
        for pid, text in id_and_text
        if isinstance(pid, int) and isinstance(text, str) and text
    }

# ---------- LOAD STATIC INDEXES (NOT INVERTED INDEX) ----------

# Everything below runs at import time and keeps the small, query-independent
# structures in memory. The inverted index itself is NOT loaded here; it is
# read barrel by barrel on demand (see load_barrel).

# Lexicon: a JSON list of entries carrying "token", "term_id" and "df"
# (document frequency). Three lookup tables are derived from it.
print("[init] Loading lexicon...")
with open(LEXICON_PATH, "r", encoding="utf-8") as f:
    lexicon_entries = json.load(f)
token_to_id = {entry["token"]: entry["term_id"] for entry in lexicon_entries}
termid_to_token = {entry["term_id"]: entry["token"] for entry in lexicon_entries}
term_document_frequency = {entry["term_id"]: entry["df"] for entry in lexicon_entries}
print(f"[done] Lexicon loaded: {len(token_to_id):,} tokens")

# Forward index: a JSON list of documents keyed by "player_id", each with a
# "total_terms" count (used as the BM25 document length) and, optionally, a
# "player_name" from which the name-matching metadata is precomputed.
print("[init] Loading forward index...")
with open(FORWARD_INDEX_PATH, "r", encoding="utf-8") as f:
    forward_index = json.load(f)
doc_by_id = {doc["player_id"]: doc for doc in forward_index}
N = len(doc_by_id)
# Guard against an empty index so the average does not divide by zero.
avg_doc_len = sum(d["total_terms"] for d in forward_index) / N if N > 0 else 0.0
name_metadata = {doc_id: build_name_metadata(doc.get("player_name"))
                 for doc_id, doc in doc_by_id.items()}
print(f"[done] Forward index: {N:,} documents (avg_len={avg_doc_len:.2f})")

# Term-to-barrel map: looked up with str(term_id) and yields the barrel name
# whose JSON file holds that term's postings.
print("[init] Loading term-to-barrel mapping...")
with open(TERM_TO_BARREL_MAP_PATH, "r", encoding="utf-8") as f:
    term_to_barrel = json.load(f)
print(f"[done] Term-to-barrel map loaded: {len(term_to_barrel):,} mappings")

# Market values: the log1p of the maximum is the denominator used to scale the
# market-value boost; it falls back to 1.0 when no positive value is known.
print("[init] Loading market values...")
player_market_value = load_market_values(MARKET_VALUE_PATH)
max_market_value = max(player_market_value.values(), default=0.0)
market_value_log_max = math.log1p(max_market_value) if max_market_value > 0 else 1.0
print(f"[done] Market values loaded for {len(player_market_value):,} players")

# Profile lengths: same normalisation scheme as the market values above.
print("[init] Loading profile metadata...")
profile_length_by_id = load_profile_lengths(PROFILE_DATA_PATH)
max_profile_length = max(profile_length_by_id.values(), default=0)
profile_length_log_max = math.log1p(max_profile_length) if max_profile_length > 0 else 1.0
print(f"[done] Profile metadata loaded for {len(profile_length_by_id):,} players")

# ---------- BARREL CACHE (LRU-like) ----------

# Barrel name -> parsed barrel JSON. Insertion order doubles as recency order:
# the first key is the least recently used barrel, the last key the most recent.
barrel_cache = {}
MAX_CACHED_BARRELS = 10  # Keep only 10 barrels in memory at once

def load_barrel(barrel_name: str):
    """Return the parsed barrel *barrel_name*, reading it from disk if needed.

    Barrels are kept in ``barrel_cache`` with least-recently-used eviction:
    a cache hit marks the barrel as most recently used, and when a newly
    loaded barrel would exceed ``MAX_CACHED_BARRELS`` the least recently used
    entries are dropped first.

    Args:
        barrel_name: File stem of the barrel inside ``BARREL_DIR``.

    Returns:
        The decoded JSON content of the barrel, or ``None`` (after printing an
        error) when the barrel file does not exist.
    """
    if barrel_name in barrel_cache:
        # Re-insert on a hit so the entry moves to the "most recent" end;
        # without this the cache would evict in plain FIFO order.
        barrel_data = barrel_cache.pop(barrel_name)
        barrel_cache[barrel_name] = barrel_data
        return barrel_data

    barrel_path = os.path.join(BARREL_DIR, f"{barrel_name}.json")
    try:
        with open(barrel_path, "r", encoding="utf-8") as f:
            barrel_data = json.load(f)
    except FileNotFoundError:
        print(f"[error] Barrel file not found: {barrel_path}")
        return None

    # Make room before inserting; the first key is the least recently used.
    while barrel_cache and len(barrel_cache) >= MAX_CACHED_BARRELS:
        del barrel_cache[next(iter(barrel_cache))]

    barrel_cache[barrel_name] = barrel_data
    return barrel_data

# ---------- QUERY TO TERM IDs ----------

def tokens_to_term_ids(tokens):
    """Translate tokens to lexicon term ids, dropping unknowns and repeats.

    Tokens absent from ``token_to_id`` are ignored. Each term id appears once,
    at the position of its first occurrence in *tokens*.
    """
    looked_up = (token_to_id.get(tok) for tok in tokens)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(tid for tid in looked_up if tid is not None))

# ---------- BM25 SCORING ----------

def bm25_score(tf, df, doc_len, N, avg_doc_len, k1=K1, b=B):
    """Return the BM25 contribution of one term to one document.

    Args:
        tf: Frequency of the term in the document.
        df: Number of documents containing the term.
        doc_len: Length of the document in terms.
        N: Total number of documents in the collection.
        avg_doc_len: Average document length over the collection.
        k1: Term-frequency saturation parameter.
        b: Document-length normalisation strength.

    Returns:
        ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))``
        where ``idf = log((N - df + 0.5) / (df + 0.5) + 1)``.
    """
    idf = math.log((N - df + 0.5) / (df + 0.5) + 1.0)
    # avg_doc_len is 0.0 for an empty collection; treat the document as being
    # of average length instead of dividing by zero.
    length_ratio = doc_len / avg_doc_len if avg_doc_len else 1.0
    denom = tf + k1 * (1 - b + b * length_ratio)
    return idf * (tf * (k1 + 1) / denom)

# ---------- BARREL-BASED SEARCH ----------

def _score_postings(term_ids, loaded_barrels):
    """Accumulate BM25 scores over the postings of *term_ids*.

    Terms with no document frequency, no barrel mapping, an unloaded barrel or
    no postings entry contribute nothing. Postings that reference a document
    absent from the forward index are counted and skipped rather than raising
    ``KeyError``.

    Returns:
        ``(scores, skipped)`` where ``scores`` maps doc id -> BM25 sum and
        ``skipped`` is the number of postings ignored for unknown documents.
    """
    scores = defaultdict(float)
    skipped = 0

    for tid in term_ids:
        df = term_document_frequency.get(tid, 0)
        if df == 0:
            continue

        # Barrel map and barrel contents are keyed by the stringified term id.
        tid_key = str(tid)
        barrel_name = term_to_barrel.get(tid_key)
        if not barrel_name or barrel_name not in loaded_barrels:
            continue

        term_data = loaded_barrels[barrel_name].get("inverted_index", {}).get(tid_key)
        if not term_data:
            continue

        for doc_id_str, info in term_data.get("postings", {}).items():
            doc_id = int(doc_id_str)
            doc = doc_by_id.get(doc_id)
            if doc is None:
                skipped += 1
                continue
            scores[doc_id] += bm25_score(info["tf"], df, doc["total_terms"], N, avg_doc_len)

    return scores, skipped


def _metadata_boost(doc_id, query_tokens, query_name, raw_query_lower):
    """Return the additive score adjustment for *doc_id* based on its name.

    Name evidence (token overlap, exact/prefix match of the normalised name,
    raw substring match) adds the corresponding bonuses. Documents with no
    name evidence are penalised; documents with name evidence additionally
    receive log-scaled market-value and profile-length boosts.
    """
    boost = 0.0
    has_name_match = False
    meta = name_metadata.get(doc_id)

    if meta:
        if query_tokens:
            match_count = sum(1 for tok in query_tokens if tok in meta["token_set"])
            if match_count:
                boost += NAME_TOKEN_WEIGHT * match_count
                has_name_match = True

        if query_name:
            if meta["normalized"] == query_name:
                boost += EXACT_NAME_BONUS
                has_name_match = True
            elif meta["normalized"].startswith(query_name):
                boost += NAME_PREFIX_BONUS
                has_name_match = True

        if raw_query_lower and raw_query_lower in meta["raw_lower"]:
            boost += RAW_SUBSTRING_BONUS
            has_name_match = True

    if not has_name_match:
        if query_tokens:
            boost -= NON_NAME_MATCH_PENALTY
        return boost

    # Popularity signals only break ties among documents whose name matched.
    value = player_market_value.get(doc_id)
    if value and market_value_log_max > 0.0:
        boost += MARKET_VALUE_WEIGHT * (math.log1p(value) / market_value_log_max)

    length = profile_length_by_id.get(doc_id)
    if length and profile_length_log_max > 0.0:
        boost += PROFILE_LENGTH_WEIGHT * (math.log1p(length) / profile_length_log_max)

    return boost


def search(query: str, top_k: int = 10, verbose: bool = True):
    """Rank players for *query* using BM25 plus name/popularity boosts.

    Only the barrels that hold postings for the query's terms are loaded
    (through the barrel cache); each matching document gets the sum of its
    per-term BM25 scores adjusted by the name-based boost.

    Args:
        query: Free-text query.
        top_k: Maximum number of results to return.
        verbose: When true, progress, results and timings are printed.

    Returns:
        A list of at most *top_k* dicts ordered by descending score, each with
        the keys "rank", "doc_id", "player_id", "player_name", "score" and
        "market_value". Empty when no query term is in the lexicon or no
        document matches.
    """
    start_time = time.perf_counter()

    log = print if verbose else (lambda *args, **kwargs: None)

    log(f"\n[query] {query}")
    query_tokens = normalize_and_tokenize(query)
    term_ids = tokens_to_term_ids(query_tokens)

    if not term_ids:
        elapsed = (time.perf_counter() - start_time) * 1000
        log(f"No query terms found in lexicon. (took {elapsed:.2f} ms)")
        return []

    log("Query tokens -> term_ids:",
        [(termid_to_token.get(tid, "?"), tid) for tid in term_ids])

    # **KEY OPTIMIZATION: Determine which barrels to load**
    required_barrels = set()
    for tid in term_ids:
        barrel_name = term_to_barrel.get(str(tid))
        if barrel_name:
            required_barrels.add(barrel_name)

    log(f"[barrels] Loading {len(required_barrels)} barrel(s): {sorted(required_barrels)}")

    # Load only required barrels
    barrel_load_start = time.perf_counter()
    loaded_barrels = {}
    for barrel_name in required_barrels:
        barrel_data = load_barrel(barrel_name)
        if barrel_data:
            loaded_barrels[barrel_name] = barrel_data
    barrel_load_time = (time.perf_counter() - barrel_load_start) * 1000
    log(f"[barrels] Loaded in {barrel_load_time:.2f} ms")

    # BM25 scoring using barrel data
    scores, skipped = _score_postings(term_ids, loaded_barrels)
    if skipped:
        log(f"[warn] Skipped {skipped} posting(s) for documents missing from the forward index")

    if not scores:
        elapsed = (time.perf_counter() - start_time) * 1000
        log(f"No documents matched these terms. (took {elapsed:.2f} ms)")
        return []

    # Metadata boosting. Only values of existing keys are updated, so mutating
    # the dict while iterating over it is safe.
    query_name = " ".join(normalize_name_tokens(query))
    raw_query_lower = query.lower().strip()
    for doc_id in scores:
        scores[doc_id] += _metadata_boost(doc_id, query_tokens, query_name, raw_query_lower)

    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]

    results = []
    for rank, (doc_id, score) in enumerate(ranked, start=1):
        doc = doc_by_id[doc_id]
        results.append({
            "rank": rank,
            "doc_id": doc_id,
            "player_id": doc["player_id"],
            "player_name": doc["player_name"],
            "score": score,
            "market_value": player_market_value.get(doc_id),
        })

    elapsed = (time.perf_counter() - start_time) * 1000

    log("\n[top] Results:")
    for r in results:
        value = r["market_value"]
        length = profile_length_by_id.get(r["doc_id"])
        extras = []
        if value:
            extras.append(f"market_value~{value:,.0f} EUR")
        if length:
            extras.append(f"profile_chars={length}")
        extra_text = f" [{', '.join(extras)}]" if extras else ""
        log(f"{r['rank']:2d}. [{r['score']:.3f}] {r['player_name']} (player_id={r['player_id']}){extra_text}")

    log(f"\n[time] {elapsed:.2f} ms (barrel_load={barrel_load_time:.2f} ms)")
    log(f"[memory] {len(barrel_cache)} barrels cached, {len(required_barrels)} loaded for this query")

    if elapsed < 500:
        log("[perf]Under 500 ms goal")
    else:
        log("[perf]Above 500 ms goal")

    return results

# ---------- CLI ----------

if __name__ == "__main__":
    # Interactive loop: one query per line, an empty line ends the session.
    print("\n[ready] BARREL-OPTIMIZED search engine ready.")
    print(f"[info] System loads only required barrels per query (max {MAX_CACHED_BARRELS} cached)")
    print("[info] Type a query or press Enter to exit.\n")

    while True:
        try:
            q = input("Query> ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C or a closed stdin: leave cleanly instead of
            # ending the session with a traceback.
            print()
            break
        if not q:
            break
        search(q, top_k=10)

    print("\n[exit] Exiting search engine.")