npmaiecosystem commited on
Commit
4aaa65b
Β·
verified Β·
1 Parent(s): 586cf1f

Create appp.py

Browse files
Files changed (1) hide show
  1. appp.py +434 -0
appp.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ═══════════════════════════════════════════════════════════════════
2
+ # LARA β€” Step 1: Generate Training Data for ML K-Prediction Model
3
+ # By Sonu Kumar, NPMAI ECOSYSTEM
4
+ #
5
+ # What this script does:
6
+ # For each question in NaturalQuestions, it runs LARA's cross-encoder
7
+ # and finds the OPTIMAL candidate pool size β€” the smallest number of
8
+ # candidates that captures all chunks scoring above 0.3.
9
+ # It also records index structure statistics (IVF centroid distances,
10
+ # cluster sizes) that the ML model will learn from.
11
+ #
12
+ # Output: training_data.csv β€” one row per query, ready for ML training
13
+ # ═══════════════════════════════════════════════════════════════════
14
+
15
+ # ── Install everything needed (run this cell first in Colab) ───────
16
+ # !pip install datasets sentence-transformers faiss-cpu scikit-learn pandas numpy
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+ import faiss
21
+ import time
22
+ import math
23
+ import os
24
+ from sentence_transformers import SentenceTransformer, CrossEncoder
25
+ from datasets import load_dataset
26
+
27
+ # ══════════════════════════════════════════════════════════════════
28
+ # CONFIGURATION β€” change these if needed
29
+ # ══════════════════════════════════════════════════════════════════
30
+
31
+ NUM_QUESTIONS = 500 # how many questions to process
32
+ # 500 is manageable on free Colab CPU
33
+ # increase to 2000+ if you have more time
34
+
35
+ CHUNK_SIZE = 1000 # characters per chunk β€” same as LARA paper
36
+ CHUNK_OVERLAP = 200 # overlap between chunks
37
+ THRESHOLD = 0.3 # cross-encoder quality threshold
38
+ NUM_IVF_CLUSTERS = 50 # number of IVF clusters for the index
39
+ # rule of thumb: sqrt(total_vectors)
40
+ OUTPUT_FILE = "training_data.csv"
41
+
42
+ # ══════════════════════════════════════════════════════════════════
43
+ # STEP 1 β€” Load Models
44
+ # These load once and are reused for every query.
45
+ # ══════════════════════════════════════════════════════════════════
46
+
47
+ print("Loading bi-encoder (for creating vector embeddings)...")
48
+ # The bi-encoder converts text to vectors
49
+ # Same model used in LARA paper
50
+ bi_encoder = SentenceTransformer("BAAI/bge-small-en-v1.5")
51
+
52
+ print("Loading cross-encoder (for quality scoring)...")
53
+ # The cross-encoder scores query-document pairs
54
+ # This is the 0.3 threshold model from LARA
55
+ cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
56
+
57
+ print("Models loaded.\n")
58
+
59
+ # ══════════════════════════════════════════════════════════════════
60
+ # STEP 2 β€” Load NaturalQuestions Dataset
61
+ # NaturalQuestions has real Google search queries with Wikipedia answers
62
+ # We use a small subset for training data generation
63
+ # ══════════════════════════════════════════════════════════════════
64
+
65
+ print(f"Loading NaturalQuestions dataset ({NUM_QUESTIONS} questions)...")
66
+
67
+ # Load from HuggingFace β€” this downloads automatically
68
+ # validation split is smaller and faster to load
69
+ dataset = load_dataset(
70
+ "natural_questions",
71
+ split=f"validation[:{NUM_QUESTIONS}]",
72
+ trust_remote_code=True
73
+ )
74
+
75
+ print(f"Loaded {len(dataset)} questions.\n")
76
+
77
+ # ══════════════════════════════════════════════════════════════════
78
+ # HELPER FUNCTIONS
79
+ # ══════════════════════════════════════════════════════════════════
80
+
81
+ def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
82
+ """
83
+ Split text into overlapping chunks.
84
+ chunk_size=1000 chars, overlap=200 chars β€” same as LARA paper.
85
+
86
+ Example: text of 2500 chars with chunk_size=1000, overlap=200
87
+ Chunk 1: chars 0 to 1000
88
+ Chunk 2: chars 800 to 1800
89
+ Chunk 3: chars 1600 to 2500
90
+ """
91
+ chunks = []
92
+ start = 0
93
+ while start < len(text):
94
+ end = start + chunk_size
95
+ chunks.append(text[start:end])
96
+ start += (chunk_size - overlap)
97
+ if start >= len(text):
98
+ break
99
+ return chunks
100
+
101
+
102
+ def build_ivf_index(chunks, bi_enc, n_clusters=NUM_IVF_CLUSTERS):
103
+ """
104
+ Build an IVF (Inverted File) index from a list of text chunks.
105
+
106
+ IVF works like this:
107
+ 1. Embed all chunks into vectors
108
+ 2. Group vectors into n_clusters clusters using k-means
109
+ 3. Each cluster has a centroid (the average vector)
110
+ 4. At search time, find nearest centroids first, then search those clusters
111
+
112
+ Returns:
113
+ - index: the FAISS IVF index object
114
+ - embeddings: the raw vectors (numpy array)
115
+ - centroids: the cluster centres (shape: n_clusters Γ— vector_dim)
116
+ """
117
+ # Embed all chunks β€” convert text to vectors
118
+ embeddings = bi_enc.encode(
119
+ chunks,
120
+ normalize_embeddings=True, # normalise to unit length
121
+ show_progress_bar=False
122
+ ).astype("float32")
123
+
124
+ dim = embeddings.shape[1] # vector dimension (384 for bge-small)
125
+
126
+ # If corpus is too small for IVF, fall back to Flat (brute force)
127
+ actual_clusters = min(n_clusters, max(1, len(chunks) // 4))
128
+
129
+ if len(chunks) < 10:
130
+ # Very small corpus β€” use brute force
131
+ index = faiss.IndexFlatIP(dim) # IP = Inner Product (cosine sim)
132
+ index.add(embeddings)
133
+ centroids = embeddings # for small corpus, vectors ARE the centroids
134
+ index_type = "flat"
135
+ else:
136
+ # Build IVF index
137
+ quantizer = faiss.IndexFlatIP(dim)
138
+ index = faiss.IndexIVFFlat(
139
+ quantizer, dim, actual_clusters,
140
+ faiss.METRIC_INNER_PRODUCT
141
+ )
142
+ index.train(embeddings) # learn cluster structure
143
+ index.add(embeddings) # add vectors to index
144
+ index.nprobe = max(1, actual_clusters // 5) # search this many clusters
145
+
146
+ # Extract centroids from the trained quantizer
147
+ centroids = faiss.vector_to_array(
148
+ index.quantizer.xb
149
+ ).reshape(-1, dim)
150
+ index_type = "ivf"
151
+
152
+ return index, embeddings, centroids, index_type
153
+
154
+
155
+ def get_index_features(query_vec, centroids, embeddings, index_type):
156
+ """
157
+ Extract structural features from the index for a given query.
158
+ These are the features the ML model will learn from.
159
+
160
+ For IVF:
161
+ - distances from query to all centroids
162
+ - how many centroids are "close" (within threshold)
163
+ - spread (variance) of close centroid distances
164
+ - estimated relevant pool size (sum of cluster sizes for close centroids)
165
+
166
+ Returns a dictionary of features.
167
+ """
168
+ # query_vec shape: (dim,) β€” a single vector
169
+ query_vec = query_vec.reshape(1, -1).astype("float32")
170
+
171
+ # Distance from query to every centroid
172
+ # Higher inner product = closer (we use cosine similarity)
173
+ centroid_distances = np.dot(centroids, query_vec.T).flatten()
174
+
175
+ # Sort distances descending (most similar first)
176
+ sorted_distances = np.sort(centroid_distances)[::-1]
177
+
178
+ # Define "close" as top 20% of centroids
179
+ threshold_distance = np.percentile(centroid_distances, 80)
180
+ close_mask = centroid_distances >= threshold_distance
181
+
182
+ # How many centroids are close
183
+ n_close_centroids = int(np.sum(close_mask))
184
+
185
+ # Spread of close centroid distances β€” high spread = relevant
186
+ # chunks scattered across corpus
187
+ close_distances = centroid_distances[close_mask]
188
+ spread = float(np.std(close_distances)) if len(close_distances) > 1 else 0.0
189
+
190
+ # Estimate relevant pool β€” for IVF, approximate cluster size
191
+ corpus_size = len(embeddings)
192
+ avg_cluster_size = corpus_size / max(len(centroids), 1)
193
+ estimated_pool = int(n_close_centroids * avg_cluster_size)
194
+
195
+ # Top-1 and top-3 centroid distances
196
+ top1_dist = float(sorted_distances[0]) if len(sorted_distances) > 0 else 0.0
197
+ top3_dist = float(np.mean(sorted_distances[:3])) if len(sorted_distances) >= 3 else top1_dist
198
+
199
+ return {
200
+ "corpus_size": corpus_size,
201
+ "n_centroids": len(centroids),
202
+ "n_close_centroids": n_close_centroids,
203
+ "centroid_spread": spread,
204
+ "estimated_pool": estimated_pool,
205
+ "top1_centroid_dist": top1_dist,
206
+ "top3_centroid_dist": top3_dist,
207
+ "index_type": 0 if index_type == "flat" else 1,
208
+ }
209
+
210
+
211
+ def find_optimal_k(query, chunks, query_vec, index, embeddings,
212
+ bi_enc, cross_enc, threshold=THRESHOLD):
213
+ """
214
+ Find the OPTIMAL candidate pool size for this query.
215
+
216
+ This is the KEY function for generating training data.
217
+
218
+ The optimal k is the SMALLEST candidate pool that captures
219
+ ALL chunks scoring above 0.3 on the cross-encoder.
220
+
221
+ How we find it:
222
+ 1. First retrieve ALL chunks (pool = len(chunks))
223
+ 2. Run cross-encoder on all of them
224
+ 3. Find which ones score >= 0.3 β€” these are the "gold" relevant chunks
225
+ 4. Find the smallest pool that would have retrieved all of them
226
+
227
+ Returns:
228
+ - optimal_k: the smallest pool size that captures all relevant chunks
229
+ - n_relevant: how many chunks scored above 0.3
230
+ - dynamic_k_size: same as n_relevant (what LARA's threshold would give)
231
+ """
232
+ if len(chunks) == 0:
233
+ return 0, 0, 0
234
+
235
+ # Retrieve ALL chunks β€” we want to know ground truth
236
+ all_pool_size = len(chunks)
237
+
238
+ # Search the full index
239
+ query_vec_2d = query_vec.reshape(1, -1).astype("float32")
240
+
241
+ # Get similarity scores for all chunks
242
+ scores_matrix = np.dot(embeddings, query_vec_2d.T).flatten()
243
+
244
+ # Rank by similarity (highest first)
245
+ ranked_indices = np.argsort(scores_matrix)[::-1]
246
+ ranked_chunks = [chunks[i] for i in ranked_indices]
247
+
248
+ # Run cross-encoder on ALL ranked chunks
249
+ # This is expensive but we only do it once per query for training data
250
+ pairs = [(query, chunk) for chunk in ranked_chunks]
251
+
252
+ if len(pairs) > 100:
253
+ # Cap at 100 for speed β€” still gives good training signal
254
+ pairs = pairs[:100]
255
+ ranked_chunks = ranked_chunks[:100]
256
+
257
+ ce_scores = cross_enc.predict(pairs)
258
+
259
+ # Find which chunks are "relevant" (score >= threshold)
260
+ relevant_flags = ce_scores >= threshold
261
+ n_relevant = int(np.sum(relevant_flags))
262
+
263
+ if n_relevant == 0:
264
+ # No relevant chunks found β€” optimal k is small
265
+ return min(10, len(chunks)), 0, 0
266
+
267
+ # Find the position of the LAST relevant chunk in the ranking
268
+ # optimal_k = that position + 1
269
+ # (we need a pool at least this large to capture all relevant chunks)
270
+ last_relevant_position = 0
271
+ for i, flag in enumerate(relevant_flags):
272
+ if flag:
273
+ last_relevant_position = i
274
+
275
+ optimal_k = last_relevant_position + 1
276
+
277
+ return optimal_k, n_relevant, n_relevant
278
+
279
+
280
+ def get_query_features(query, bi_enc):
281
+ """
282
+ Extract features from the query itself.
283
+ These help the model understand query complexity.
284
+ """
285
+ # Encode query to vector
286
+ query_vec = bi_enc.encode(
287
+ query,
288
+ normalize_embeddings=True,
289
+ show_progress_bar=False
290
+ ).astype("float32")
291
+
292
+ # Query length in characters
293
+ query_len = len(query)
294
+
295
+ # Number of words
296
+ query_words = len(query.split())
297
+
298
+ # Does query contain a question word? (factual vs complex)
299
+ question_words = ["what", "who", "when", "where", "which",
300
+ "how", "why", "is", "are", "was", "were"]
301
+ has_question_word = int(
302
+ any(w in query.lower().split() for w in question_words)
303
+ )
304
+
305
+ # Use first 32 dimensions of query embedding as features
306
+ # (using all 384 would make the feature vector too large)
307
+ query_embedding_compressed = query_vec[:32].tolist()
308
+
309
+ return query_vec, {
310
+ "query_len": query_len,
311
+ "query_words": query_words,
312
+ "has_question_word": has_question_word,
313
+ "query_emb": query_embedding_compressed,
314
+ }
315
+
316
+
317
+ # ══════════════════════════════════════════════════════════════════
318
+ # STEP 3 β€” Main Data Generation Loop
319
+ # ══════════════════════════════════════════════════════════════════
320
+
321
+ print("Starting training data generation...")
322
+ print(f"Processing {NUM_QUESTIONS} questions.\n")
323
+
324
+ all_rows = [] # will become our training dataset
325
+ errors = 0
326
+
327
+ for idx, example in enumerate(dataset):
328
+
329
+ # ── Extract question and context from NaturalQuestions ────────
330
+ # NaturalQuestions has the question and a Wikipedia document
331
+ question = example["question"]["text"]
332
+
333
+ # Get the Wikipedia article text as our "corpus"
334
+ # NaturalQuestions stores the document as a list of tokens
335
+ # We join them to get the full text
336
+ try:
337
+ doc_tokens = example["document"]["tokens"]["token"]
338
+ context = " ".join(doc_tokens[:3000]) # first 3000 tokens
339
+ except Exception:
340
+ errors += 1
341
+ continue
342
+
343
+ if len(context) < 200:
344
+ # Skip very short documents
345
+ continue
346
+
347
+ # ── Chunk the context ─────────────────────────────────────────
348
+ chunks = chunk_text(context)
349
+
350
+ if len(chunks) < 2:
351
+ continue
352
+
353
+ # ── Build IVF index for this corpus ───────────────────────────
354
+ try:
355
+ index, embeddings, centroids, index_type = build_ivf_index(
356
+ chunks, bi_encoder
357
+ )
358
+ except Exception as e:
359
+ errors += 1
360
+ continue
361
+
362
+ # ── Get query features and embedding ──────────────────────────
363
+ query_vec, query_feats = get_query_features(question, bi_encoder)
364
+
365
+ # ── Get index structure features ──────────────────────────────
366
+ index_feats = get_index_features(
367
+ query_vec, centroids, embeddings, index_type
368
+ )
369
+
370
+ # ── Find optimal k β€” THE KEY STEP ─────────────────────────────
371
+ t0 = time.time()
372
+ optimal_k, n_relevant, dynamic_k = find_optimal_k(
373
+ question, chunks, query_vec, index,
374
+ embeddings, bi_encoder, cross_encoder
375
+ )
376
+ elapsed = time.time() - t0
377
+
378
+ # ── Build one training row ────────────────────────────────────
379
+ row = {
380
+ # Target variable β€” what the ML model needs to predict
381
+ "optimal_k": optimal_k,
382
+
383
+ # Query features
384
+ "query_len": query_feats["query_len"],
385
+ "query_words": query_feats["query_words"],
386
+ "has_question_word": query_feats["has_question_word"],
387
+
388
+ # Index structure features
389
+ "corpus_size": index_feats["corpus_size"],
390
+ "n_centroids": index_feats["n_centroids"],
391
+ "n_close_centroids": index_feats["n_close_centroids"],
392
+ "centroid_spread": index_feats["centroid_spread"],
393
+ "estimated_pool": index_feats["estimated_pool"],
394
+ "top1_centroid_dist": index_feats["top1_centroid_dist"],
395
+ "top3_centroid_dist": index_feats["top3_centroid_dist"],
396
+ "index_type": index_feats["index_type"],
397
+
398
+ # Extra info (not used for training, useful for analysis)
399
+ "n_relevant_chunks": n_relevant,
400
+ "n_total_chunks": len(chunks),
401
+ "question": question,
402
+ "elapsed_sec": round(elapsed, 3),
403
+ }
404
+
405
+ # Add compressed query embedding as features
406
+ # qe_0, qe_1, ... qe_31 β€” 32 numbers from the query vector
407
+ for i, val in enumerate(query_feats["query_emb"]):
408
+ row[f"qe_{i}"] = round(float(val), 6)
409
+
410
+ all_rows.append(row)
411
+
412
+ # ── Progress update ───────────────────────────────────────────
413
+ if (idx + 1) % 50 == 0:
414
+ print(f" Processed {idx+1}/{NUM_QUESTIONS} questions. "
415
+ f"Rows collected: {len(all_rows)}. "
416
+ f"Errors: {errors}.")
417
+
418
+ # ══════════════════════════════════════════════════════════════════
419
+ # STEP 4 β€” Save Training Data
420
+ # ══════════════════════════════════════════════════════════════════
421
+
422
+ df = pd.DataFrame(all_rows)
423
+
424
+ print(f"\nDone. Total rows: {len(df)}")
425
+ print(f"Errors skipped: {errors}")
426
+ print(f"\nDataset preview:")
427
+ print(df[["question","corpus_size","n_relevant_chunks",
428
+ "optimal_k","top1_centroid_dist"]].head(10))
429
+ print(f"\nOptimal K statistics:")
430
+ print(df["optimal_k"].describe())
431
+
432
+ df.to_csv(OUTPUT_FILE, index=False)
433
+ print(f"\nSaved to: {OUTPUT_FILE}")
434
+ print("Download this file β€” you will use it in Step 2 (ML training).")