# File size: 11,927 Bytes
# da6a0a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
# add_document.py
# DYNAMIC DOCUMENT ADDITION - Incrementally add new players without full rebuild
import csv
import json
import math
import os
import re
import time
from collections import defaultdict

# ---------- PATHS ----------

# All locations are derived from this script's own path, so the tool does not
# depend on the current working directory. PROJECT_ROOT is the parent of the
# directory that contains this script.
SCRIPT_DIR = os.path.split(os.path.abspath(__file__))[0]
PROJECT_ROOT = os.path.split(SCRIPT_DIR)[0]
INDEX_DIR = os.path.join(PROJECT_ROOT, 'data', 'index')
BARREL_DIR = os.path.join(INDEX_DIR, 'barrels')

# Individual index artifacts (all JSON files).
LEXICON_PATH = os.path.join(INDEX_DIR, 'lexicon_complete.json')
FORWARD_INDEX_PATH = os.path.join(INDEX_DIR, 'forward_index_termid.json')
INVERTED_INDEX_PATH = os.path.join(INDEX_DIR, 'inverted_index_termid.json')
TERM_TO_BARREL_MAP_PATH = os.path.join(BARREL_DIR, 'term_to_barrel_map.json')

# ---------- TEXT NORMALIZATION (MUST MATCH BUILD PIPELINE) ----------

# Lowercase words that normalize_and_tokenize() discards. Membership is tested
# on the raw (pre-stemming) token, so a word is only filtered if the exact
# surface form appears here. Per the section header, this list has to stay in
# sync with the one used when the indexes were originally built.
COMPREHENSIVE_STOP_WORDS = {
    # Common English function words, followed by generic football vocabulary
    "the", "and", "in", "for", "with", "on", "at", "from", "by", "as", "is", "was",
    "are", "were", "be", "been", "have", "has", "had", "to", "of", "a", "an", "that",
    "this", "these", "those", "it", "its", "or", "but", "not", "what", "which", "who",
    "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "only", "own", "same", "so", "than", "too",
    "very", "can", "will", "just", "should", "now", "player", "club", "team", "football",
    "soccer", "match", "game", "season", "league", "cup", "champions", "premier", "la",
    "bundesliga", "serie", "current", "main", "position", "nationality", "birth", "place",
    # Universal terms that appear in ALL documents (filtering for memory/performance)
    "comprehensive", "international", "performance", "transfermarkt", "injury",
    "summary", "market", "history", "database", "value",
    # Stemmed versions and other universal terms
    "data", "teammat", "sourc", "career", "assist", "app", "minut",
    "available", "national", "significant", "teammate", "transfer", "goal"
}

def simple_stemmer(word: str) -> str:
    """Strip one common English suffix from *word*, if the word is long enough.

    Rules are tried in order and the first one that applies wins:
    "ing" (word longer than 5), "ed" (longer than 4), "es" (longer than 4),
    "s" (longer than 3). A word matching no rule is returned unchanged.
    """
    # (suffix, length the word must exceed), in priority order.
    suffix_rules = (("ing", 5), ("ed", 4), ("es", 4), ("s", 3))
    size = len(word)
    for suffix, floor in suffix_rules:
        if size > floor and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

def normalize_and_tokenize(text: str):
    """Lowercase *text* and return its stemmed, filtered word tokens.

    Tokens are maximal runs of the letters a-z delimited by word boundaries.
    A token is dropped when it has two characters or fewer or appears in
    COMPREHENSIVE_STOP_WORDS; the check is made before stemming. Surviving
    tokens are passed through simple_stemmer() and returned in order.
    """
    words = re.findall(r"\b[a-z]+\b", text.lower())
    return [
        simple_stemmer(word)
        for word in words
        if len(word) > 2 and word not in COMPREHENSIVE_STOP_WORDS
    ]

# ---------- LOAD EXISTING INDEXES ----------

def load_indexes():
    """Read the lexicon, forward index and term-to-barrel map from disk.

    Returns:
        A dict with these keys:
        'lexicon'        - the list loaded from LEXICON_PATH
        'token_to_entry' - lexicon entries keyed by their "token"
        'max_term_id'    - largest "term_id" found in the lexicon
        'forward_index'  - the list loaded from FORWARD_INDEX_PATH
        'doc_by_id'      - forward-index entries keyed by their "player_id"
        'term_to_barrel' - the mapping loaded from TERM_TO_BARREL_MAP_PATH

    Raises:
        ValueError: if the lexicon is empty (no maximum term id exists).
    """
    def _read_json(path):
        # Every index file is UTF-8 encoded JSON.
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    print("[load] Loading lexicon...")
    lexicon = _read_json(LEXICON_PATH)
    token_to_entry = {}
    for item in lexicon:
        token_to_entry[item["token"]] = item
    max_term_id = max(item["term_id"] for item in lexicon)
    print(f"[done] Loaded {len(lexicon):,} tokens (max_term_id={max_term_id})")

    print("[load] Loading forward index...")
    forward_index = _read_json(FORWARD_INDEX_PATH)
    doc_by_id = {}
    for record in forward_index:
        doc_by_id[record["player_id"]] = record
    print(f"[done] Loaded {len(forward_index):,} documents")

    print("[load] Loading term-to-barrel mapping...")
    term_to_barrel = _read_json(TERM_TO_BARREL_MAP_PATH)
    print(f"[done] Loaded {len(term_to_barrel):,} mappings")

    return dict(
        lexicon=lexicon,
        token_to_entry=token_to_entry,
        max_term_id=max_term_id,
        forward_index=forward_index,
        doc_by_id=doc_by_id,
        term_to_barrel=term_to_barrel,
    )

# ---------- ADD NEW DOCUMENT ----------

def _load_barrel(barrel_path, barrel_name):
    """Return the barrel stored at *barrel_path*, or a new empty barrel."""
    if os.path.exists(barrel_path):
        with open(barrel_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return {
        'metadata': {
            'term_count': 0,
            'posting_count': 0,
            'barrel_name': barrel_name
        },
        'inverted_index': {}
    }


def _write_json(path, payload):
    """Write *payload* to *path* as UTF-8 JSON, keeping non-ASCII text as-is."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, ensure_ascii=False)


def add_document(player_data: dict, indexes: dict):
    """
    Add a new player document to the search engine.

    player_data format:
    {
        "player_id": 999999,
        "player_name": "New Player",
        "detailed_content": "Long text with player bio, stats, etc...",
        # ... other metadata fields
    }

    indexes is the dict produced by load_indexes(). It is updated in place
    (lexicon, token_to_entry, max_term_id, forward_index, doc_by_id,
    term_to_barrel), and the lexicon, forward index, term-to-barrel map and
    every affected barrel file are rewritten on disk.

    Returns: dict with statistics about the update, or {"error": "..."} when
    the document is rejected (missing/falsy player_id or player_name,
    duplicate player_id, or no indexable tokens). Rejections happen before
    anything is modified.
    """
    start_time = time.perf_counter()

    player_id = player_data.get("player_id")
    player_name = player_data.get("player_name", "")
    detailed_content = player_data.get("detailed_content", "")

    if not player_id or not player_name:
        return {"error": "Missing required fields: player_id, player_name"}

    if player_id in indexes['doc_by_id']:
        return {"error": f"Player ID {player_id} already exists"}

    print(f"\n[add] Adding player: {player_name} (ID={player_id})")

    # 1. Tokenize content (name and body are indexed together)
    print("[step 1/5] Tokenizing content...")
    tokens = normalize_and_tokenize(f"{player_name} {detailed_content}")

    if not tokens:
        return {"error": "No valid tokens found in document"}

    # Count term frequencies
    term_freq = defaultdict(int)
    for token in tokens:
        term_freq[token] += 1

    total_terms = len(tokens)
    unique_terms = len(term_freq)
    print(f"   Found {total_terms} tokens, {unique_terms} unique")

    # 2. Update lexicon (assign term_ids to new tokens)
    print("[step 2/5] Updating lexicon...")
    token_to_entry = indexes['token_to_entry']
    new_tokens = []
    next_term_id = indexes['max_term_id'] + 1

    for token in term_freq:
        existing = token_to_entry.get(token)
        if existing is None:
            # New token - this document is its first occurrence
            new_entry = {
                "token": token,
                "df": 1,
                "term_id": next_term_id
            }
            indexes['lexicon'].append(new_entry)
            token_to_entry[token] = new_entry
            new_tokens.append(token)
            next_term_id += 1
        else:
            # Existing token - one more document contains it
            existing["df"] += 1

    indexes['max_term_id'] = next_term_id - 1
    print(f"   Added {len(new_tokens)} new tokens to lexicon")

    # 3. Update forward index
    print("[step 3/5] Updating forward index...")
    term_ids_in_doc = {
        token_to_entry[token]["term_id"]: {"token": token, "tf": tf}
        for token, tf in term_freq.items()
    }

    forward_entry = {
        "player_id": player_id,
        "player_name": player_name,
        "total_terms": total_terms,
        "unique_terms": unique_terms,
        "terms": term_ids_in_doc
    }
    indexes['forward_index'].append(forward_entry)
    indexes['doc_by_id'][player_id] = forward_entry
    print("   Added document to forward index")

    # 4. Update barrels (inverted index distributed)
    print("[step 4/5] Updating barrels...")
    term_to_barrel = indexes['term_to_barrel']

    # First pass: decide each term's barrel and group the pending updates, so
    # that every barrel file is read and written exactly once below instead
    # of once per term.
    num_barrels = None  # computed lazily; only needed when a term is new
    updates_by_barrel = defaultdict(list)
    for token, tf in term_freq.items():
        entry = token_to_entry[token]
        term_id = entry["term_id"]
        term_id_str = str(term_id)

        barrel_name = term_to_barrel.get(term_id_str)
        if not barrel_name:
            # New term - assign to a barrel (simple mod distribution).
            if num_barrels is None:
                # Highest existing barrel index + 1. Assignments made in this
                # loop always land on an existing index, so the count is
                # loop-invariant. With no barrels yet, fall back to a single
                # barrel rather than failing on max() of an empty sequence.
                num_barrels = max(
                    (int(name.split('_')[1]) for name in term_to_barrel.values()),
                    default=0,
                ) + 1
            barrel_name = f"barrel_{term_id % num_barrels:03d}"
            term_to_barrel[term_id_str] = barrel_name

        updates_by_barrel[barrel_name].append((term_id_str, token, tf, entry['df']))

    # Second pass: apply all updates for a barrel, then save it once.
    doc_key = str(player_id)
    for barrel_name, term_updates in updates_by_barrel.items():
        barrel_path = os.path.join(BARREL_DIR, f"{barrel_name}.json")
        barrel_data = _load_barrel(barrel_path, barrel_name)
        inverted = barrel_data['inverted_index']

        for term_id_str, token, tf, df in term_updates:
            term_data = inverted.setdefault(
                term_id_str, {'token': token, 'df': df, 'postings': {}}
            )
            term_data['postings'][doc_key] = {"tf": tf}
            # Keep the barrel's df in step with the lexicon
            term_data['df'] = df

        barrel_data['metadata']['term_count'] = len(inverted)
        barrel_data['metadata']['posting_count'] = sum(
            len(term_data['postings']) for term_data in inverted.values()
        )
        _write_json(barrel_path, barrel_data)

    barrels_updated = set(updates_by_barrel)
    print(f"   Updated {len(barrels_updated)} barrels: {sorted(barrels_updated)}")

    # 5. Save updated indexes
    print("[step 5/5] Saving updated indexes...")
    _write_json(LEXICON_PATH, indexes['lexicon'])
    _write_json(FORWARD_INDEX_PATH, indexes['forward_index'])
    _write_json(TERM_TO_BARREL_MAP_PATH, term_to_barrel)
    print("   Saved all indexes")

    elapsed = time.perf_counter() - start_time

    stats = {
        "success": True,
        "player_id": player_id,
        "player_name": player_name,
        "total_terms": total_terms,
        "unique_terms": unique_terms,
        "new_tokens_added": len(new_tokens),
        "barrels_updated": len(barrels_updated),
        "time_seconds": elapsed,
        "meets_requirement": elapsed < 60  # Must be under 1 minute
    }

    print(f"\n[done] Document added in {elapsed:.2f} seconds")
    if stats["meets_requirement"]:
        print("[perf]Under 1 minute requirement")
    else:
        print("[perf]Exceeded 1 minute requirement")

    return stats

# ---------- CLI ----------

if __name__ == "__main__":
    # Interactive entry point: load the indexes once, then read one JSON
    # document per line from stdin and add it, until 'exit' or end of input.
    print("=" * 60)
    print("DYNAMIC DOCUMENT ADDITION SYSTEM")
    print("=" * 60)

    # Load indexes
    indexes = load_indexes()

    print("\n[ready] System ready to add new documents.")
    print("[info] Enter player data in JSON format or type 'exit' to quit.\n")

    # Example usage
    print("Example player data format:")
    example = {
        "player_id": 999999,
        "player_name": "Test Player",
        "detailed_content": "This is a test player from Manchester United. He plays as a striker and has won multiple trophies."
    }
    print(json.dumps(example, indent=2))
    print("\n" + "-" * 60 + "\n")

    while True:
        print("Enter player data (JSON) or 'exit':")
        try:
            user_input = input("> ").strip()
        except EOFError:
            # stdin was closed (Ctrl-D, or piped input ran out): treat it
            # like 'exit' instead of dying with a traceback.
            break

        if user_input.lower() == 'exit':
            break

        try:
            player_data = json.loads(user_input)
            result = add_document(player_data, indexes)
            print("\n[result]")
            print(json.dumps(result, indent=2))
        except json.JSONDecodeError as e:
            print(f"[error] Invalid JSON: {e}")
        except Exception as e:
            # Top-level boundary: report the failure and keep the session alive
            print(f"[error] {e}")

        print("\n" + "-" * 60 + "\n")

    print("\n[exit] Exiting document addition system.")