# ML_Model / appp.py
# npmaiecosystem's picture
# Update appp.py
# 89da3cb verified
# ═══════════════════════════════════════════════════════════════════
# LARA — Step 1: Generate Training Data for ML K-Prediction Model
# By Sonu Kumar, NPMAI ECOSYSTEM
#
# What this script does:
# For each question in NaturalQuestions, it runs LARA's cross-encoder
# and finds the OPTIMAL candidate pool size — the smallest number of
# candidates that captures all chunks scoring at or above 0.3.
# It also records index structure statistics (IVF centroid distances,
# cluster sizes) that the ML model will learn from.
#
# Output: training_data.csv — one row per query, ready for ML training
# ═══════════════════════════════════════════════════════════════════
# ── Install everything needed (run this cell first in Colab) ───────
# !pip install datasets sentence-transformers faiss-cpu scikit-learn pandas numpy
import numpy as np
import pandas as pd
import faiss
import time
import math
import os
from sentence_transformers import SentenceTransformer, CrossEncoder
from datasets import load_dataset
# ══════════════════════════════════════════════════════════════════
# CONFIGURATION — change these if needed
# ══════════════════════════════════════════════════════════════════
NUM_QUESTIONS = 2 # how many validation questions to load and process
# (the progress line in the main loop only prints every 50 questions)
# 500 is manageable on free Colab CPU
# increase to 2000+ if you have more time
CHUNK_SIZE = 1000 # characters per chunk — same as LARA paper
CHUNK_OVERLAP = 200 # characters shared by consecutive chunks
THRESHOLD = 0.3 # cross-encoder score at or above which a chunk counts as relevant
NUM_IVF_CLUSTERS = 50 # upper bound on IVF clusters; lowered automatically for small corpora
# rule of thumb: sqrt(total_vectors)
OUTPUT_FILE = "training_data.csv" # CSV file the collected rows are written to
# ══════════════════════════════════════════════════════════════════
# STEP 1 — Load Models
# These load once and are reused for every query.
# ══════════════════════════════════════════════════════════════════
print("Loading bi-encoder (for creating vector embeddings)...")
# The bi-encoder converts text (chunks and queries) to vectors
# Same model used in LARA paper
bi_encoder = SentenceTransformer("BAAI/bge-small-en-v1.5")
print("Loading cross-encoder (for quality scoring)...")
# The cross-encoder scores (query, chunk) pairs; those scores are
# compared against THRESHOLD to decide which chunks are relevant
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
print("Models loaded.\n")
# ══════════════════════════════════════════════════════════════════
# STEP 2 — Load NaturalQuestions Dataset
# NaturalQuestions has real Google search queries with Wikipedia answers
# We use a small subset for training data generation
# ══════════════════════════════════════════════════════════════════
print(f"Loading NaturalQuestions dataset ({NUM_QUESTIONS} questions)...")
# Load from HuggingFace — this downloads automatically
# validation split is smaller and faster to load
# Only the first NUM_QUESTIONS examples of the split are selected.
# NOTE(review): trust_remote_code=True lets the dataset's own loading code
# run on this machine — confirm it is still needed (and accepted) by the
# installed `datasets` version.
dataset = load_dataset(
"natural_questions",
split=f"validation[:{NUM_QUESTIONS}]",
trust_remote_code=True
)
print(f"Loaded {len(dataset)} questions.\n")
# ══════════════════════════════════════════════════════════════════
# HELPER FUNCTIONS
# ══════════════════════════════════════════════════════════════════
def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    """
    Split text into overlapping, fixed-width character chunks.

    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
    each chunk repeats the last ``overlap`` characters of the previous one.
    Chunking stops as soon as a chunk reaches the end of the text, so no
    trailing chunk is produced whose content is already fully contained in
    the chunk before it.

    Example: text of 2500 chars with chunk_size=1000, overlap=200
        Chunk 1: chars 0 to 1000
        Chunk 2: chars 800 to 1800
        Chunk 3: chars 1600 to 2500

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by neighbouring chunks; must
            be smaller than chunk_size, otherwise the window would never
            move forward.

    Returns:
        A list of chunk strings (empty when text is empty).

    Raises:
        ValueError: If chunk_size is not positive or overlap >= chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        # A non-positive step would make the loop below run forever.
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    text_len = len(text)
    chunks = []
    start = 0
    while start < text_len:
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_len:
            # This chunk already covers the tail of the text; another
            # window would only repeat part of it.
            break
        start += step
    return chunks
def build_ivf_index(chunks, bi_enc, n_clusters=NUM_IVF_CLUSTERS):
    """
    Embed text chunks and build a FAISS inner-product index over them.

    IVF (Inverted File) works like this:
      1. Embed all chunks into vectors
      2. Group vectors into clusters using k-means
      3. Each cluster has a centroid (the average vector)
      4. At search time, find nearest centroids first, then search those
         clusters

    Corpora with fewer than 10 chunks are too small to cluster, so they
    get a brute-force flat index instead; in that case the chunk vectors
    themselves are returned in place of centroids.

    Args:
        chunks: List of text chunks to index; must not be empty.
        bi_enc: Encoder exposing ``encode(texts, normalize_embeddings=...,
            show_progress_bar=...)`` and returning a 2-D array.
        n_clusters: Upper bound on the number of IVF clusters. The actual
            number is capped at ``len(chunks) // 4``.

    Returns:
        A tuple ``(index, embeddings, centroids, index_type)``:
          - index: the FAISS index object (flat or IVF)
          - embeddings: the float32 chunk vectors
          - centroids: cluster centres (n_clusters × vector_dim), or the
            embeddings themselves for a flat index
          - index_type: ``"flat"`` or ``"ivf"``

    Raises:
        ValueError: If chunks is empty.
    """
    if len(chunks) == 0:
        raise ValueError("cannot build an index from an empty chunk list")

    # Embed all chunks; vectors are normalised to unit length so that the
    # inner product equals cosine similarity.
    embeddings = bi_enc.encode(
        chunks,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).astype("float32")
    dim = embeddings.shape[1]

    if len(chunks) < 10:
        # Very small corpus: brute-force search, vectors act as centroids.
        index = faiss.IndexFlatIP(dim)
        index.add(embeddings)
        return index, embeddings, embeddings, "flat"

    # Keep roughly four or more vectors per cluster, and at least one cluster.
    actual_clusters = max(1, min(n_clusters, len(chunks) // 4))

    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFFlat(
        quantizer, dim, actual_clusters,
        faiss.METRIC_INNER_PRODUCT,
    )
    index.train(embeddings)  # learn cluster structure
    index.add(embeddings)    # add vectors to index
    index.nprobe = max(1, actual_clusters // 5)  # clusters visited per search

    # After training, the coarse quantizer stores one vector per cluster.
    # Read them back through the public reconstruct_n API on the quantizer
    # object we created, rather than through internal storage attributes
    # that are not reliably exposed on the index's quantizer handle.
    centroids = quantizer.reconstruct_n(0, quantizer.ntotal)
    return index, embeddings, centroids, "ivf"
def get_index_features(query_vec, centroids, embeddings, index_type):
    """
    Summarise how the index's centroids relate to a single query.

    The query is compared with every centroid by inner product (higher
    means more similar). Centroids whose similarity is at or above the
    80th percentile of all centroid similarities are treated as "close".

    Args:
        query_vec: Query embedding; reshaped to a single float32 row.
        centroids: 2-D array with one row per centroid.
        embeddings: The indexed vectors; only their count is used.
        index_type: ``"flat"`` or any other value (treated as IVF).

    Returns:
        A dict with the keys corpus_size, n_centroids, n_close_centroids,
        centroid_spread (std of the close similarities, 0.0 when there is
        at most one), estimated_pool (close centroids × average cluster
        size), top1_centroid_dist, top3_centroid_dist (mean of the three
        best similarities, or the best one when fewer than three exist)
        and index_type (0 for flat, 1 otherwise).
    """
    query_row = query_vec.reshape(1, -1).astype("float32")

    # Similarity of the query to each centroid, plus a best-first copy.
    similarities = np.dot(centroids, query_row.T).flatten()
    best_first = np.sort(similarities)[::-1]

    # "Close" centroids are the top 20% by similarity.
    cutoff = np.percentile(similarities, 80)
    is_close = similarities >= cutoff
    close_count = int(np.sum(is_close))

    # A large spread among the close centroids suggests the relevant
    # chunks are scattered across the corpus.
    close_values = similarities[is_close]
    if len(close_values) > 1:
        spread = float(np.std(close_values))
    else:
        spread = 0.0

    # Approximate every cluster by the average cluster size.
    n_vectors = len(embeddings)
    n_centroids = len(centroids)
    mean_cluster_size = n_vectors / max(n_centroids, 1)
    pool_estimate = int(close_count * mean_cluster_size)

    if len(best_first) > 0:
        best = float(best_first[0])
    else:
        best = 0.0
    if len(best_first) >= 3:
        best_three = float(np.mean(best_first[:3]))
    else:
        best_three = best

    if index_type == "flat":
        index_code = 0
    else:
        index_code = 1

    return {
        "corpus_size": n_vectors,
        "n_centroids": n_centroids,
        "n_close_centroids": close_count,
        "centroid_spread": spread,
        "estimated_pool": pool_estimate,
        "top1_centroid_dist": best,
        "top3_centroid_dist": best_three,
        "index_type": index_code,
    }
def find_optimal_k(query, chunks, query_vec, index, embeddings,
                   bi_enc, cross_enc, threshold=THRESHOLD,
                   max_candidates=100, fallback_k=10):
    """
    Compute the training label: the smallest candidate pool for this query.

    The optimal k is the SMALLEST bi-encoder pool that still contains
    every chunk the cross-encoder considers relevant.

    How we find it:
      1. Rank all chunks by inner product between their embedding and
         the query vector (brute force, highest first).
      2. Score the top ``max_candidates`` of that ranking with the
         cross-encoder. Chunks ranked below the cap are never scored, so
         the returned k can never exceed the cap.
      3. Mark chunks scoring >= threshold as relevant.
      4. optimal_k is the rank (1-based) of the lowest-ranked relevant
         chunk.

    Args:
        query: The question text.
        chunks: List of chunk strings, aligned with ``embeddings`` rows.
        query_vec: Query embedding.
        index: Unused; kept so existing callers keep working.
        embeddings: 2-D array of chunk vectors.
        bi_enc: Unused; kept so existing callers keep working.
        cross_enc: Scorer exposing ``predict(pairs)`` for a list of
            (query, chunk) tuples.
        threshold: Minimum cross-encoder score for a chunk to be relevant.
        max_candidates: How many top-ranked chunks are scored by the
            cross-encoder; ``None`` scores all of them.
        fallback_k: Pool size reported when no chunk is relevant (capped
            at the number of chunks).

    Returns:
        A tuple ``(optimal_k, n_relevant, dynamic_k_size)``. The last two
        values are identical: the number of scored chunks at or above the
        threshold. For an empty chunk list the result is ``(0, 0, 0)``.
    """
    if len(chunks) == 0:
        return 0, 0, 0

    # Brute-force similarity of every chunk to the query, best first.
    query_vec_2d = query_vec.reshape(1, -1).astype("float32")
    similarities = np.dot(embeddings, query_vec_2d.T).flatten()
    ranked_indices = np.argsort(similarities)[::-1]

    # Cap the expensive cross-encoder pass before building any pairs.
    if max_candidates is not None:
        ranked_indices = ranked_indices[:max_candidates]

    pairs = [(query, chunks[i]) for i in ranked_indices]
    ce_scores = np.asarray(cross_enc.predict(pairs))

    # Positions (within the ranking) of the chunks judged relevant.
    relevant_positions = np.flatnonzero(ce_scores >= threshold)
    n_relevant = int(relevant_positions.size)
    if n_relevant == 0:
        # Nothing relevant found: report a small default pool.
        return min(fallback_k, len(chunks)), 0, 0

    # The pool must reach the last relevant chunk, hence position + 1.
    optimal_k = int(relevant_positions[-1]) + 1
    return optimal_k, n_relevant, n_relevant
def get_query_features(query, bi_enc):
    """
    Encode the query and derive simple descriptive features from it.

    Args:
        query: The question text.
        bi_enc: Encoder exposing ``encode(text, normalize_embeddings=...,
            show_progress_bar=...)``.

    Returns:
        A tuple ``(query_vec, features)`` where query_vec is the float32
        embedding and features is a dict with:
          - query_len: number of characters
          - query_words: number of whitespace-separated words
          - has_question_word: 1 if any lower-cased word is one of the
            listed question words, else 0
          - query_emb: the first 32 embedding values as a list (the full
            vector is not used, to keep the feature vector small)
    """
    embedding = bi_enc.encode(
        query,
        normalize_embeddings=True,
        show_progress_bar=False,
    ).astype("float32")

    # Whole-word matching against the lower-cased query.
    lowered_words = query.lower().split()
    interrogatives = ("what", "who", "when", "where", "which",
                      "how", "why", "is", "are", "was", "were")
    found_question_word = 0
    for candidate in interrogatives:
        if candidate in lowered_words:
            found_question_word = 1
            break

    features = {
        "query_len": len(query),
        "query_words": len(query.split()),
        "has_question_word": found_question_word,
        "query_emb": embedding[:32].tolist(),
    }
    return embedding, features
# ══════════════════════════════════════════════════════════════════
# STEP 3 — Main Data Generation Loop
# For every question: chunk its document, index the chunks, extract
# query and index features, compute the optimal-k label, and collect
# everything as one training row.
# ══════════════════════════════════════════════════════════════════
print("Starting training data generation...")
print(f"Processing {NUM_QUESTIONS} questions.\n")
all_rows = []  # will become our training dataset
errors = 0     # questions dropped because an exception was raised
for idx, example in enumerate(dataset):
    # ── Extract question and context from NaturalQuestions ────────
    question = example["question"]["text"]
    # The document is read as a list of tokens; the first 3000 are
    # joined with spaces to form this question's "corpus".
    try:
        doc_tokens = example["document"]["tokens"]["token"]
        context = " ".join(doc_tokens[:3000])
    except Exception as exc:
        # Report the reason instead of dropping the example silently,
        # so a systematic schema problem is visible immediately.
        errors += 1
        print(f"  Skipping question {idx + 1}: "
              f"could not read document tokens ({exc!r}).")
        continue
    if len(context) < 200:
        # Skip very short documents
        continue
    # ── Chunk the context ─────────────────────────────────────────
    chunks = chunk_text(context)
    if len(chunks) < 2:
        continue
    # ── Build IVF index for this corpus ───────────────────────────
    try:
        index, embeddings, centroids, index_type = build_ivf_index(
            chunks, bi_encoder
        )
    except Exception as exc:
        errors += 1
        print(f"  Skipping question {idx + 1}: "
              f"index build failed ({exc!r}).")
        continue
    # ── Get query features and embedding ──────────────────────────
    query_vec, query_feats = get_query_features(question, bi_encoder)
    # ── Get index structure features ──────────────────────────────
    index_feats = get_index_features(
        query_vec, centroids, embeddings, index_type
    )
    # ── Find optimal k — THE KEY STEP ─────────────────────────────
    t0 = time.time()
    # The third return value duplicates n_relevant and is not stored.
    optimal_k, n_relevant, _dynamic_k = find_optimal_k(
        question, chunks, query_vec, index,
        embeddings, bi_encoder, cross_encoder
    )
    elapsed = time.time() - t0
    # ── Build one training row ────────────────────────────────────
    # Key order defines the CSV column order.
    row = {
        # Target variable — what the ML model needs to predict
        "optimal_k": optimal_k,
        # Query features
        "query_len": query_feats["query_len"],
        "query_words": query_feats["query_words"],
        "has_question_word": query_feats["has_question_word"],
        # Index structure features
        "corpus_size": index_feats["corpus_size"],
        "n_centroids": index_feats["n_centroids"],
        "n_close_centroids": index_feats["n_close_centroids"],
        "centroid_spread": index_feats["centroid_spread"],
        "estimated_pool": index_feats["estimated_pool"],
        "top1_centroid_dist": index_feats["top1_centroid_dist"],
        "top3_centroid_dist": index_feats["top3_centroid_dist"],
        "index_type": index_feats["index_type"],
        # Extra info (not used for training, useful for analysis)
        "n_relevant_chunks": n_relevant,
        "n_total_chunks": len(chunks),
        "question": question,
        "elapsed_sec": round(elapsed, 3),
    }
    # Add compressed query embedding as features
    # qe_0, qe_1, ... qe_31 — 32 numbers from the query vector
    for i, val in enumerate(query_feats["query_emb"]):
        row[f"qe_{i}"] = round(float(val), 6)
    all_rows.append(row)
    # ── Progress update ───────────────────────────────────────────
    if (idx + 1) % 50 == 0:
        print(f" Processed {idx+1}/{NUM_QUESTIONS} questions. "
              f"Rows collected: {len(all_rows)}. "
              f"Errors: {errors}.")
# ══════════════════════════════════════════════════════════════════
# STEP 4 — Save Training Data
# Turn the collected rows into a DataFrame, show a short summary and
# write the result to OUTPUT_FILE.
# ══════════════════════════════════════════════════════════════════
df = pd.DataFrame(all_rows)
print(f"\nDone. Total rows: {len(df)}")
print(f"Errors skipped: {errors}")
if df.empty:
    # With no rows the frame has no columns, so the preview and the
    # statistics below would fail with a KeyError.
    print("\nNo training rows were generated, so nothing was saved.")
    print("Check the error count above and the skipped-question messages.")
else:
    print("\nDataset preview:")
    print(df[["question", "corpus_size", "n_relevant_chunks",
              "optimal_k", "top1_centroid_dist"]].head(10))
    print("\nOptimal K statistics:")
    print(df["optimal_k"].describe())
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSaved to: {OUTPUT_FILE}")
    print("Download this file — you will use it in Step 2 (ML training).")