# ═══════════════════════════════════════════════════════════════════
# LARA — Step 1: Generate Training Data for ML K-Prediction Model
# By Sonu Kumar, NPMAI ECOSYSTEM
#
# What this script does:
#   For each question in NaturalQuestions, it runs LARA's cross-encoder
#   and finds the OPTIMAL candidate pool size — the smallest number of
#   candidates that captures all chunks scoring at or above the
#   threshold (0.3 by default).
#   It also records index structure statistics (centroid similarities,
#   average cluster sizes) that the ML model will learn from.
#
# Output: training_data.csv — one row per query, ready for ML training
# ═══════════════════════════════════════════════════════════════════

# ── Install everything needed (run this cell first in Colab) ───────
# !pip install datasets sentence-transformers faiss-cpu scikit-learn pandas numpy

import numpy as np
import pandas as pd
import faiss
import time
import math
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
from datasets import load_dataset

# ══════════════════════════════════════════════════════════════════
# CONFIGURATION — change these if needed
# ══════════════════════════════════════════════════════════════════

NUM_QUESTIONS = 2        # how many questions to process
                         # (2 is only a smoke test;
                         #  500 is manageable on free Colab CPU,
                         #  increase to 2000+ if you have more time)

CHUNK_SIZE = 1000        # characters per chunk — same as LARA paper
CHUNK_OVERLAP = 200      # overlap between consecutive chunks
THRESHOLD = 0.3          # cross-encoder quality threshold

NUM_IVF_CLUSTERS = 50    # upper bound on IVF clusters for the index
                         # rule of thumb: sqrt(total_vectors)

OUTPUT_FILE = "training_data.csv"

# ══════════════════════════════════════════════════════════════════
# STEP 1 — Load Models
# These load once and are reused for every query.
# ══════════════════════════════════════════════════════════════════

print("Loading bi-encoder (for creating vector embeddings)...")
# The bi-encoder converts text to vectors.
# Same model used in LARA paper.
bi_encoder = SentenceTransformer("BAAI/bge-small-en-v1.5")

print("Loading cross-encoder (for quality scoring)...")
# The cross-encoder scores query-document pairs.
# This is the model the 0.3 threshold from LARA is applied to.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

print("Models loaded.\n")

# ══════════════════════════════════════════════════════════════════
# STEP 2 — Load NaturalQuestions Dataset
# NaturalQuestions has real Google search queries with Wikipedia answers.
# We use a small subset for training data generation.
# ══════════════════════════════════════════════════════════════════

print(f"Loading NaturalQuestions dataset ({NUM_QUESTIONS} questions)...")

# Load from HuggingFace — this downloads automatically.
# Only the first NUM_QUESTIONS examples of the validation split are used.
dataset = load_dataset(
    "natural_questions",
    split=f"validation[:{NUM_QUESTIONS}]",
    trust_remote_code=True
)

print(f"Loaded {len(dataset)} questions.\n")

# ══════════════════════════════════════════════════════════════════
# HELPER FUNCTIONS
# ══════════════════════════════════════════════════════════════════


def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """
    Split text into overlapping chunks.

    chunk_size=1000 chars, overlap=200 chars — same as LARA paper.

    Example: text of 2500 chars with chunk_size=1000, overlap=200
        Chunk 1: chars 0 to 1000
        Chunk 2: chars 800 to 1800
        Chunk 3: chars 1600 to 2500

    Args:
        text: the string to split.
        chunk_size: maximum number of characters per chunk.
        overlap: number of characters shared by consecutive chunks;
            must be smaller than chunk_size.

    Returns:
        A list of chunk strings (empty list for empty text).

    Raises:
        ValueError: if overlap >= chunk_size, because the window would
            never advance.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text we are done; advancing
        # again would only emit a tail that lies entirely inside this chunk.
        if end >= len(text):
            break
        start += step
    return chunks


def build_ivf_index(chunks, bi_enc, n_clusters=NUM_IVF_CLUSTERS):
    """
    Build an IVF (Inverted File) index from a list of text chunks.

    IVF works like this:
        1. Embed all chunks into vectors
        2. Group vectors into clusters using k-means
        3. Each cluster has a centroid (the average vector)
        4. At search time, find nearest centroids first,
           then search those clusters

    Corpora with fewer than 10 chunks use a brute-force flat index instead.

    Args:
        chunks: list of text chunks to index.
        bi_enc: encoder exposing ``encode(texts, normalize_embeddings=...,
            show_progress_bar=...)`` (a SentenceTransformer in this script).
        n_clusters: upper bound on the number of IVF clusters; the actual
            number is capped at len(chunks) // 4.

    Returns:
        A 4-tuple ``(index, embeddings, centroids, index_type)``:
        - index: the FAISS index object
        - embeddings: the chunk vectors as a float32 numpy array
        - centroids: the cluster centres (n_clusters × dim); for the flat
          fallback the embeddings themselves are returned
        - index_type: "flat" or "ivf"
    """
    # Embed all chunks — convert text to unit-length vectors so that
    # inner product equals cosine similarity.
    embeddings = bi_enc.encode(
        chunks,
        normalize_embeddings=True,
        show_progress_bar=False
    ).astype("float32")

    dim = embeddings.shape[1]  # vector dimension

    # Keep at least ~4 vectors per cluster so k-means has data to work with.
    actual_clusters = min(n_clusters, max(1, len(chunks) // 4))

    if len(chunks) < 10:
        # Very small corpus — use brute force.
        index = faiss.IndexFlatIP(dim)  # IP = Inner Product
        index.add(embeddings)
        centroids = embeddings  # for a small corpus, vectors ARE the centroids
        index_type = "flat"
    else:
        # Build IVF index.
        quantizer = faiss.IndexFlatIP(dim)
        index = faiss.IndexIVFFlat(
            quantizer, dim, actual_clusters, faiss.METRIC_INNER_PRODUCT
        )
        index.train(embeddings)  # learn cluster structure
        index.add(embeddings)    # add vectors to index
        index.nprobe = max(1, actual_clusters // 5)  # clusters searched per query

        # After training, the coarse quantizer stores one vector per cluster.
        # Read them back through the public reconstruct API on our own
        # IndexFlatIP handle: `index.quantizer` is returned as a base Index
        # wrapper and the raw `xb` attribute is not available on it.
        centroids = np.asarray(
            quantizer.reconstruct_n(0, quantizer.ntotal), dtype="float32"
        ).reshape(-1, dim)
        index_type = "ivf"

    return index, embeddings, centroids, index_type


def get_index_features(query_vec, centroids, embeddings, index_type):
    """
    Extract structural features from the index for a given query.
    These are the features the ML model will learn from.

    Features computed:
        - similarity from the query to every centroid
        - how many centroids are "close" (at or above the 80th percentile)
        - spread (standard deviation) of the close centroid similarities
        - estimated relevant pool size (close centroids × average cluster size)

    Args:
        query_vec: 1-D query embedding.
        centroids: 2-D array of centroid vectors.
        embeddings: 2-D array of all chunk vectors (only its length is used).
        index_type: "flat" or anything else (encoded as 0 / 1).

    Returns:
        A dictionary of features.
    """
    query_2d = query_vec.reshape(1, -1).astype("float32")

    # Inner product of the query with every centroid.
    # Higher inner product = closer.
    centroid_distances = np.dot(centroids, query_2d.T).flatten()

    # Most similar first.
    sorted_distances = np.sort(centroid_distances)[::-1]

    # Define "close" as the top 20% of centroids.
    threshold_distance = np.percentile(centroid_distances, 80)
    close_mask = centroid_distances >= threshold_distance
    n_close_centroids = int(np.sum(close_mask))

    # Spread of close centroid similarities — high spread suggests relevant
    # chunks are scattered across the corpus.
    close_distances = centroid_distances[close_mask]
    spread = float(np.std(close_distances)) if len(close_distances) > 1 else 0.0

    # Estimate relevant pool using the average cluster size
    # (actual per-cluster sizes are not consulted).
    corpus_size = len(embeddings)
    avg_cluster_size = corpus_size / max(len(centroids), 1)
    estimated_pool = int(n_close_centroids * avg_cluster_size)

    # Top-1 similarity and mean of the top-3 similarities.
    top1_dist = float(sorted_distances[0]) if len(sorted_distances) > 0 else 0.0
    if len(sorted_distances) >= 3:
        top3_dist = float(np.mean(sorted_distances[:3]))
    else:
        top3_dist = top1_dist

    return {
        "corpus_size": corpus_size,
        "n_centroids": len(centroids),
        "n_close_centroids": n_close_centroids,
        "centroid_spread": spread,
        "estimated_pool": estimated_pool,
        "top1_centroid_dist": top1_dist,
        "top3_centroid_dist": top3_dist,
        "index_type": 0 if index_type == "flat" else 1,
    }


def find_optimal_k(query, chunks, query_vec, index, embeddings,
                   bi_enc, cross_enc, threshold=THRESHOLD,
                   max_candidates=100):
    """
    Find the OPTIMAL candidate pool size for this query.

    This is the KEY function for generating training data.

    The optimal k is the SMALLEST candidate pool (in bi-encoder ranking
    order) that captures every chunk scoring >= threshold on the
    cross-encoder.

    How we find it:
        1. Rank all chunks by inner product with the query embedding
        2. Run the cross-encoder on the top `max_candidates` of them
        3. Mark the ones scoring >= threshold as relevant
        4. optimal_k = rank of the last relevant chunk + 1

    Args:
        query: the question text.
        chunks: list of chunk strings, aligned with `embeddings`.
        query_vec: 1-D query embedding.
        index: unused here; kept for interface compatibility.
        embeddings: 2-D array of chunk vectors.
        bi_enc: unused here; kept for interface compatibility.
        cross_enc: scorer exposing ``predict(pairs)``.
        threshold: minimum cross-encoder score for a chunk to be relevant.
        max_candidates: only this many top-ranked chunks are scored, for
            speed; relevant chunks ranked lower are not seen.

    Returns:
        (optimal_k, n_relevant, dynamic_k_size) where dynamic_k_size equals
        n_relevant. When nothing is relevant, returns
        (min(10, len(chunks)), 0, 0). For empty `chunks`, returns (0, 0, 0).
    """
    if len(chunks) == 0:
        return 0, 0, 0

    query_vec_2d = query_vec.reshape(1, -1).astype("float32")

    # Brute-force similarity against every chunk — this is the ground-truth
    # ranking, so the approximate index is deliberately bypassed.
    scores = np.dot(embeddings, query_vec_2d.T).flatten()

    # Highest similarity first; cap for speed — still gives good training
    # signal.
    ranked_indices = np.argsort(scores)[::-1][:max_candidates]
    pairs = [(query, chunks[i]) for i in ranked_indices]

    # NOTE(review): the threshold is compared against whatever scale
    # cross_enc.predict returns (logits vs. probabilities depends on the
    # model's configured activation) — confirm it matches the LARA setup.
    ce_scores = np.asarray(cross_enc.predict(pairs))

    relevant_positions = np.flatnonzero(ce_scores >= threshold)
    n_relevant = int(relevant_positions.size)

    if n_relevant == 0:
        # No relevant chunks found — fall back to a small default pool.
        return min(10, len(chunks)), 0, 0

    # The pool must reach the last relevant chunk in the ranking.
    optimal_k = int(relevant_positions[-1]) + 1

    return optimal_k, n_relevant, n_relevant


def get_query_features(query, bi_enc):
    """
    Extract features from the query itself.
    These help the model understand query complexity.

    Args:
        query: the question text.
        bi_enc: encoder exposing ``encode(text, normalize_embeddings=...,
            show_progress_bar=...)``.

    Returns:
        (query_vec, features) where query_vec is the float32 query embedding
        and features is a dict with keys "query_len", "query_words",
        "has_question_word" and "query_emb" (first 32 embedding values).
    """
    query_vec = bi_enc.encode(
        query,
        normalize_embeddings=True,
        show_progress_bar=False
    ).astype("float32")

    words = query.split()

    # Does the query contain a question word? (factual vs complex)
    question_words = {"what", "who", "when", "where", "which",
                      "how", "why", "is", "are", "was", "were"}
    has_question_word = int(
        not question_words.isdisjoint(query.lower().split())
    )

    # Use only the first 32 dimensions of the query embedding as features
    # to keep the feature vector small.
    query_embedding_compressed = query_vec[:32].tolist()

    return query_vec, {
        "query_len": len(query),
        "query_words": len(words),
        "has_question_word": has_question_word,
        "query_emb": query_embedding_compressed,
    }


# ══════════════════════════════════════════════════════════════════
# STEP 3 — Main Data Generation Loop
# ══════════════════════════════════════════════════════════════════

print("Starting training data generation...")
print(f"Processing {NUM_QUESTIONS} questions.\n")

all_rows = []  # will become our training dataset
errors = 0

for idx, example in enumerate(dataset):

    # ── Extract question and context from NaturalQuestions ────────
    question = example["question"]["text"]

    # Use the document's token list, joined with spaces, as our "corpus".
    # NOTE(review): the token list may include HTML markup tokens; consider
    # filtering them if the dataset schema provides a flag — confirm.
    try:
        doc_tokens = example["document"]["tokens"]["token"]
        context = " ".join(doc_tokens[:3000])  # first 3000 tokens
    except (KeyError, TypeError, IndexError) as exc:
        errors += 1
        print(f" Skipping question {idx}: cannot read document tokens ({exc!r})")
        continue

    if len(context) < 200:
        # Skip very short documents
        continue

    # ── Chunk the context ─────────────────────────────────────────
    chunks = chunk_text(context)

    if len(chunks) < 2:
        continue

    # ── Build IVF index for this corpus ───────────────────────────
    # Best-effort: a failure on one document must not stop the run,
    # but the reason is reported so systematic failures are visible.
    try:
        index, embeddings, centroids, index_type = build_ivf_index(
            chunks, bi_encoder
        )
    except Exception as exc:
        errors += 1
        print(f" Skipping question {idx}: index build failed ({exc!r})")
        continue

    # ── Get query features and embedding ──────────────────────────
    query_vec, query_feats = get_query_features(question, bi_encoder)

    # ── Get index structure features ──────────────────────────────
    index_feats = get_index_features(
        query_vec, centroids, embeddings, index_type
    )

    # ── Find optimal k — THE KEY STEP ─────────────────────────────
    t0 = time.time()
    optimal_k, n_relevant, dynamic_k = find_optimal_k(
        question, chunks, query_vec, index,
        embeddings, bi_encoder, cross_encoder
    )
    elapsed = time.time() - t0

    # ── Build one training row ────────────────────────────────────
    row = {
        # Target variable — what the ML model needs to predict
        "optimal_k": optimal_k,

        # Query features
        "query_len": query_feats["query_len"],
        "query_words": query_feats["query_words"],
        "has_question_word": query_feats["has_question_word"],

        # Index structure features
        "corpus_size": index_feats["corpus_size"],
        "n_centroids": index_feats["n_centroids"],
        "n_close_centroids": index_feats["n_close_centroids"],
        "centroid_spread": index_feats["centroid_spread"],
        "estimated_pool": index_feats["estimated_pool"],
        "top1_centroid_dist": index_feats["top1_centroid_dist"],
        "top3_centroid_dist": index_feats["top3_centroid_dist"],
        "index_type": index_feats["index_type"],

        # Extra info (not used for training, useful for analysis)
        "n_relevant_chunks": n_relevant,
        "n_total_chunks": len(chunks),
        "question": question,
        "elapsed_sec": round(elapsed, 3),
    }

    # Add compressed query embedding as features
    # qe_0, qe_1, ... qe_31 — 32 numbers from the query vector
    for i, val in enumerate(query_feats["query_emb"]):
        row[f"qe_{i}"] = round(float(val), 6)

    all_rows.append(row)

    # ── Progress update ───────────────────────────────────────────
    if (idx + 1) % 50 == 0:
        print(f" Processed {idx+1}/{NUM_QUESTIONS} questions. "
              f"Rows collected: {len(all_rows)}. "
              f"Errors: {errors}.")

# ══════════════════════════════════════════════════════════════════
# STEP 4 — Save Training Data
# ══════════════════════════════════════════════════════════════════

df = pd.DataFrame(all_rows)

print(f"\nDone. Total rows: {len(df)}")
print(f"Errors skipped: {errors}")

if df.empty:
    # An empty frame has no columns, so the preview below would raise
    # KeyError; report the situation instead of crashing.
    print("\nNo training rows were generated — nothing to save. "
          "Check the skip messages above or increase NUM_QUESTIONS.")
else:
    print("\nDataset preview:")
    print(df[["question", "corpus_size", "n_relevant_chunks",
              "optimal_k", "top1_centroid_dist"]].head(10))

    print("\nOptimal K statistics:")
    print(df["optimal_k"].describe())

    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSaved to: {OUTPUT_FILE}")
    print("Download this file — you will use it in Step 2 (ML training).")