Spaces:
Runtime error
Runtime error
# ───────────────────────────────────────────────────────────────────
# LARA — Step 1: Generate Training Data for ML K-Prediction Model
# By Sonu Kumar, NPMAI ECOSYSTEM
#
# What this script does:
# For each question in NaturalQuestions, it runs LARA's cross-encoder
# and finds the OPTIMAL candidate pool size — the smallest number of
# candidates that captures all chunks scoring above 0.3.
# It also records index structure statistics (IVF centroid distances,
# cluster sizes) that the ML model will learn from.
#
# Output: training_data.csv — one row per query, ready for ML training
# ───────────────────────────────────────────────────────────────────
# ── Install everything needed (run this cell first in Colab) ───────
# !pip install datasets sentence-transformers faiss-cpu scikit-learn pandas numpy
| import numpy as np | |
| import pandas as pd | |
| import faiss | |
| import time | |
| import math | |
| import os | |
| from sentence_transformers import SentenceTransformer, CrossEncoder | |
| from datasets import load_dataset | |
# ──────────────────────────────────────────────────────────────────
# CONFIGURATION — change these if needed
# ──────────────────────────────────────────────────────────────────

# How many questions to process.
# 500 is manageable on free Colab CPU; increase to 2000+ if you have
# more time.
NUM_QUESTIONS = 2

# Characters per chunk — same as the LARA paper.
CHUNK_SIZE = 1000

# Characters shared between consecutive chunks.
CHUNK_OVERLAP = 200

# Cross-encoder quality threshold.
THRESHOLD = 0.3

# Number of IVF clusters for the index.
# Rule of thumb: sqrt(total_vectors).
NUM_IVF_CLUSTERS = 50

# Where the generated training rows are written.
OUTPUT_FILE = "training_data.csv"
# ──────────────────────────────────────────────────────────────────
# STEP 1 — Load Models
# Both models are created once here and reused for every query.
# ──────────────────────────────────────────────────────────────────

# Bi-encoder: turns text into embedding vectors.
# Same model used in the LARA paper.
print("Loading bi-encoder (for creating vector embeddings)...")
bi_encoder = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Cross-encoder: scores (query, document) pairs.
# This is the model the 0.3 threshold from LARA is applied to.
print("Loading cross-encoder (for quality scoring)...")
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

print("Models loaded.", end="\n\n")
# ──────────────────────────────────────────────────────────────────
# STEP 2 — Load NaturalQuestions Dataset
# NaturalQuestions has real Google search queries with Wikipedia answers.
# Only the first NUM_QUESTIONS validation examples are requested.
# ──────────────────────────────────────────────────────────────────
print(f"Loading NaturalQuestions dataset ({NUM_QUESTIONS} questions)...")

# Loaded from the HuggingFace hub (downloads automatically). The
# validation split is used because it is smaller and faster to load.
#
# NOTE(review): trust_remote_code=True lets the dataset's loading script
# run arbitrary code on this machine. Confirm the source is trusted and
# that the installed `datasets` version still accepts this argument.
dataset = load_dataset(
    "natural_questions",
    trust_remote_code=True,
    split=f"validation[:{NUM_QUESTIONS}]",
)

print(f"Loaded {len(dataset)} questions.", end="\n\n")
# ──────────────────────────────────────────────────────────────────
# HELPER FUNCTIONS
# ──────────────────────────────────────────────────────────────────
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """
    Split text into overlapping, fixed-size character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart.
    Chunking stops as soon as a chunk reaches the end of ``text``, so the
    result never ends with a tail chunk that is entirely contained in the
    previous one.

    Example: text of 2500 chars with chunk_size=1000, overlap=200
        Chunk 1: chars 0 to 1000
        Chunk 2: chars 800 to 1800
        Chunk 3: chars 1600 to 2500

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.

    Returns:
        A list of chunk strings; empty if ``text`` is empty.

    Raises:
        ValueError: If ``overlap >= chunk_size``; the window would never
            advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            "overlap must be smaller than chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    text_len = len(text)
    chunks = []
    for start in range(0, text_len, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # This chunk already covers the end of the text; another window
        # would only repeat characters we have just emitted.
        if end >= text_len:
            break
    return chunks
def build_ivf_index(chunks, bi_enc, n_clusters=NUM_IVF_CLUSTERS):
    """
    Build a FAISS inner-product index over a list of text chunks.

    The chunks are embedded with ``bi_enc`` (unit-normalised, float32).
    Corpora with fewer than 10 chunks get a brute-force flat index;
    larger ones get an IVF (inverted file) index:
        1. Embed all chunks into vectors
        2. Group vectors into clusters using k-means
        3. Each cluster has a centroid (the average vector)
        4. At search time, find nearest centroids first, then search
           only those clusters

    Args:
        chunks: List of chunk strings to index.
        bi_enc: Encoder exposing ``encode(texts, normalize_embeddings=...,
            show_progress_bar=...)`` and returning a 2-D array.
        n_clusters: Upper bound on the number of IVF clusters; the actual
            number is capped at ``len(chunks) // 4`` (minimum 1).

    Returns:
        A 4-tuple ``(index, embeddings, centroids, index_type)``:
        - index: the FAISS index object
        - embeddings: the chunk vectors (float32 numpy array)
        - centroids: cluster centres, shape (n_clusters × vector_dim);
          for the flat fallback this is ``embeddings`` itself
        - index_type: ``"flat"`` or ``"ivf"``

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    if len(chunks) == 0:
        raise ValueError("cannot build an index from an empty chunk list")

    # Embed all chunks — convert text to unit-length vectors so that
    # inner product equals cosine similarity.
    embeddings = bi_enc.encode(
        chunks,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).astype("float32")
    dim = embeddings.shape[1]

    if len(chunks) < 10:
        # Very small corpus — brute force; every vector acts as its own
        # centroid.
        index = faiss.IndexFlatIP(dim)
        index.add(embeddings)
        return index, embeddings, embeddings, "flat"

    # Cap the cluster count so k-means sees several points per cluster.
    actual_clusters = min(n_clusters, max(1, len(chunks) // 4))

    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFFlat(
        quantizer, dim, actual_clusters,
        faiss.METRIC_INNER_PRODUCT,
    )
    index.train(embeddings)  # learn cluster structure
    index.add(embeddings)    # add vectors to index
    index.nprobe = max(1, actual_clusters // 5)  # clusters visited per search

    # After training, the quantizer stores one vector per cluster.
    # reconstruct_n reads them back through the generic Index API, which
    # works regardless of how the flat quantizer stores its data
    # internally (the previous `.xb` access is not available on the
    # object returned by `index.quantizer`).
    centroids = quantizer.reconstruct_n(0, actual_clusters)
    return index, embeddings, centroids, "ivf"
def get_index_features(query_vec, centroids, embeddings, index_type):
    """
    Extract structural index features for one query.

    These are the features the ML model will learn from:
        - inner-product similarity from the query to every centroid
        - how many centroids are "close" (at or above the 80th
          percentile of those similarities, i.e. roughly the top 20%)
        - spread (standard deviation) of the close similarities
        - estimated relevant pool size (close centroids × average
          cluster size)

    Args:
        query_vec: 1-D query embedding.
        centroids: 2-D array with one centroid per row.
        embeddings: Corpus vectors; only their count is used.
        index_type: ``"flat"`` or any other value (treated as IVF).

    Returns:
        Dict with keys ``corpus_size``, ``n_centroids``,
        ``n_close_centroids``, ``centroid_spread``, ``estimated_pool``,
        ``top1_centroid_dist``, ``top3_centroid_dist`` and ``index_type``
        (0 for flat, 1 otherwise). With no centroids, all centroid-derived
        values are zero.
    """
    corpus_size = len(embeddings)
    n_centroids = len(centroids)
    index_flag = 0 if index_type == "flat" else 1

    if n_centroids == 0:
        # The percentile cut below is undefined for an empty array, so
        # return neutral values explicitly instead of failing there.
        return {
            "corpus_size": corpus_size,
            "n_centroids": 0,
            "n_close_centroids": 0,
            "centroid_spread": 0.0,
            "estimated_pool": 0,
            "top1_centroid_dist": 0.0,
            "top3_centroid_dist": 0.0,
            "index_type": index_flag,
        }

    query_row = np.asarray(query_vec).reshape(1, -1).astype("float32")

    # Higher inner product = closer (vectors are compared by cosine
    # similarity when they are unit-normalised).
    similarities = np.dot(centroids, query_row.T).flatten()

    # "Close" centroids: at or above the 80th percentile.
    cutoff = np.percentile(similarities, 80)
    close = similarities[similarities >= cutoff]
    n_close = int(close.size)

    # High spread = relevant chunks scattered across the corpus.
    spread = float(np.std(close)) if n_close > 1 else 0.0

    # Approximate each cluster by the average cluster size.
    avg_cluster_size = corpus_size / n_centroids
    estimated_pool = int(n_close * avg_cluster_size)

    # Best and mean-of-best-three centroid similarities.
    descending = np.sort(similarities)[::-1]
    top1 = float(descending[0])
    top3 = float(np.mean(descending[:3])) if descending.size >= 3 else top1

    return {
        "corpus_size": corpus_size,
        "n_centroids": n_centroids,
        "n_close_centroids": n_close,
        "centroid_spread": spread,
        "estimated_pool": estimated_pool,
        "top1_centroid_dist": top1,
        "top3_centroid_dist": top3,
        "index_type": index_flag,
    }
def find_optimal_k(query, chunks, query_vec, index, embeddings,
                   bi_enc, cross_enc, threshold=THRESHOLD):
    """
    Find the OPTIMAL candidate pool size for this query.

    This is the KEY function for generating training data. The optimal k
    is the SMALLEST bi-encoder candidate pool that still contains every
    chunk the cross-encoder scores at or above ``threshold``.

    How it works:
        1. Rank all chunks by inner product with the query vector
        2. Score the top-ranked chunks (at most 100) with the cross-encoder
        3. Mark those scoring >= threshold as relevant
        4. optimal_k = rank position of the LAST relevant chunk + 1

    Args:
        query: The question text.
        chunks: List of chunk strings, aligned with ``embeddings`` rows.
        query_vec: 1-D query embedding.
        index: Unused here; kept for interface compatibility.
        embeddings: 2-D array of chunk vectors.
        bi_enc: Unused here; kept for interface compatibility.
        cross_enc: Scorer exposing ``predict(pairs)``.
        threshold: Minimum cross-encoder score for a chunk to count as
            relevant.

    Returns:
        ``(optimal_k, n_relevant, dynamic_k_size)`` where
        ``dynamic_k_size`` equals ``n_relevant``. Returns ``(0, 0, 0)``
        for an empty corpus and ``(min(10, len(chunks)), 0, 0)`` when no
        scored chunk reaches the threshold.
    """
    if len(chunks) == 0:
        return 0, 0, 0

    # Cap on cross-encoder calls per query — for speed.
    max_scored = 100
    # Pool size reported when nothing is relevant.
    fallback_k = 10

    query_col = query_vec.reshape(1, -1).astype("float32").T

    # Rank every chunk by similarity to the query (highest first), then
    # keep only the head of the ranking for the expensive scoring step.
    similarities = np.dot(embeddings, query_col).flatten()
    ranked_indices = np.argsort(similarities)[::-1][:max_scored]
    pairs = [(query, chunks[i]) for i in ranked_indices]

    # Coerce to a flat array so the comparison below works even if
    # predict returns a plain list.
    # NOTE(review): `threshold` is compared on whatever scale
    # `cross_enc.predict` returns (raw logits vs. probabilities) —
    # confirm it matches the scale the 0.3 cut-off was chosen on.
    ce_scores = np.asarray(cross_enc.predict(pairs)).reshape(-1)

    relevant_positions = np.flatnonzero(ce_scores >= threshold)
    n_relevant = int(relevant_positions.size)
    if n_relevant == 0:
        # No relevant chunks found — optimal k is small.
        return min(fallback_k, len(chunks)), 0, 0

    # The pool must reach the lowest-ranked relevant chunk.
    optimal_k = int(relevant_positions[-1]) + 1
    return optimal_k, n_relevant, n_relevant
def get_query_features(query, bi_enc):
    """
    Embed the query and derive simple surface features from it.

    Args:
        query: The question text.
        bi_enc: Encoder exposing ``encode(text, normalize_embeddings=...,
            show_progress_bar=...)``.

    Returns:
        ``(query_vec, features)`` where ``query_vec`` is the float32
        embedding and ``features`` holds:
        - ``query_len``: length in characters
        - ``query_words``: number of whitespace-separated words
        - ``has_question_word``: 1 if any lower-cased word is a question
          word, else 0
        - ``query_emb``: first 32 embedding values as a list (the full
          vector would make the feature row too large)
    """
    question_words = {
        "what", "who", "when", "where", "which",
        "how", "why", "is", "are", "was", "were",
    }

    query_vec = bi_enc.encode(
        query,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).astype("float32")

    # Whole-word match on whitespace tokens (factual vs complex queries).
    lowered_tokens = set(query.lower().split())
    mentions_question_word = not lowered_tokens.isdisjoint(question_words)

    features = {
        "query_len": len(query),
        "query_words": len(query.split()),
        "has_question_word": int(mentions_question_word),
        "query_emb": query_vec[:32].tolist(),
    }
    return query_vec, features
# ──────────────────────────────────────────────────────────────────
# STEP 3 — Main Data Generation Loop
# One training row is produced per question that survives the checks
# below; skipped questions are reported instead of silently dropped.
# ──────────────────────────────────────────────────────────────────
print("Starting training data generation...")
print(f"Processing {NUM_QUESTIONS} questions.\n")

all_rows = []  # will become our training dataset
errors = 0     # questions skipped because of an exception

for idx, example in enumerate(dataset):
    # ── Extract question and context from NaturalQuestions ────────
    question = example["question"]["text"]

    # The document is read as a list of tokens; the first 3000 are
    # joined with spaces to form this question's "corpus".
    try:
        doc_tokens = example["document"]["tokens"]["token"]
        context = " ".join(doc_tokens[:3000])
    except (KeyError, TypeError, IndexError) as exc:
        errors += 1
        print(f"  [skip] question {idx}: could not read document tokens "
              f"({type(exc).__name__}: {exc})")
        continue

    if len(context) < 200:
        # Skip very short documents
        continue

    # ── Chunk the context ─────────────────────────────────────────
    chunks = chunk_text(context)
    if len(chunks) < 2:
        continue

    # ── Build IVF index for this corpus ───────────────────────────
    # Broad catch on purpose: one bad document must not abort the run,
    # but the reason is printed so systematic failures are visible.
    try:
        index, embeddings, centroids, index_type = build_ivf_index(
            chunks, bi_encoder
        )
    except Exception as exc:
        errors += 1
        print(f"  [skip] question {idx}: index build failed "
              f"({type(exc).__name__}: {exc})")
        continue

    # ── Get query features and embedding ──────────────────────────
    query_vec, query_feats = get_query_features(question, bi_encoder)

    # ── Get index structure features ──────────────────────────────
    index_feats = get_index_features(
        query_vec, centroids, embeddings, index_type
    )

    # ── Find optimal k — THE KEY STEP ─────────────────────────────
    t0 = time.time()
    optimal_k, n_relevant, dynamic_k = find_optimal_k(
        question, chunks, query_vec, index,
        embeddings, bi_encoder, cross_encoder
    )
    elapsed = time.time() - t0

    # ── Build one training row ────────────────────────────────────
    row = {
        # Target variable — what the ML model needs to predict
        "optimal_k": optimal_k,
        # Query features
        "query_len": query_feats["query_len"],
        "query_words": query_feats["query_words"],
        "has_question_word": query_feats["has_question_word"],
        # Index structure features
        "corpus_size": index_feats["corpus_size"],
        "n_centroids": index_feats["n_centroids"],
        "n_close_centroids": index_feats["n_close_centroids"],
        "centroid_spread": index_feats["centroid_spread"],
        "estimated_pool": index_feats["estimated_pool"],
        "top1_centroid_dist": index_feats["top1_centroid_dist"],
        "top3_centroid_dist": index_feats["top3_centroid_dist"],
        "index_type": index_feats["index_type"],
        # Extra info (not used for training, useful for analysis)
        "n_relevant_chunks": n_relevant,
        "n_total_chunks": len(chunks),
        "question": question,
        "elapsed_sec": round(elapsed, 3),
    }

    # Add compressed query embedding as features:
    # qe_0, qe_1, ... — one column per retained embedding value.
    for i, val in enumerate(query_feats["query_emb"]):
        row[f"qe_{i}"] = round(float(val), 6)

    all_rows.append(row)

    # ── Progress update ───────────────────────────────────────────
    if (idx + 1) % 50 == 0:
        print(f" Processed {idx+1}/{NUM_QUESTIONS} questions. "
              f"Rows collected: {len(all_rows)}. "
              f"Errors: {errors}.")
# ──────────────────────────────────────────────────────────────────
# STEP 4 — Save Training Data
# ──────────────────────────────────────────────────────────────────
df = pd.DataFrame(all_rows)
print(f"\nDone. Total rows: {len(df)}")
print(f"Errors skipped: {errors}")

if df.empty:
    # With no rows the frame has no columns, so the preview and the
    # statistics below would fail with a KeyError. Report the situation
    # instead and leave no output file behind.
    print("\nNo training rows were generated — nothing to preview or save.")
    print("Check the skip messages above for the cause.")
else:
    print("\nDataset preview:")
    print(df[["question", "corpus_size", "n_relevant_chunks",
              "optimal_k", "top1_centroid_dist"]].head(10))
    print("\nOptimal K statistics:")
    print(df["optimal_k"].describe())

    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSaved to: {OUTPUT_FILE}")
    print("Download this file — you will use it in Step 2 (ML training).")