Spaces:

npmaiecosystem
/

ML_Model

Runtime error

App Files Files Community

ML_Model / appp.py

npmaiecosystem

Update appp.py

89da3cb verified about 13 hours ago

raw

history blame contribute delete

18.4 kB

	# ═══════════════════════════════════════════════════════════════════
	# LARA — Step 1: Generate Training Data for ML K-Prediction Model
	# By Sonu Kumar, NPMAI ECOSYSTEM
	#
	# What this script does:
	# For each question in NaturalQuestions, it runs LARA's cross-encoder
	# over the bi-encoder ranking (at most the top 100 chunks) and finds the
	# OPTIMAL candidate pool size — the smallest number of candidates that
	# captures every chunk scoring at or above the 0.3 threshold.
	# It also records index structure statistics (IVF centroid distances,
	# cluster sizes) that the ML model will learn from.
	#
	# Output: training_data.csv — one row per query, ready for ML training
	# ═══════════════════════════════════════════════════════════════════

	# ── Install everything needed (run this cell first in Colab) ───────
	# !pip install datasets sentence-transformers faiss-cpu scikit-learn pandas numpy

	import numpy as np
	import pandas as pd
	import faiss
	import time
	import math
	import os
	from sentence_transformers import SentenceTransformer, CrossEncoder
	from datasets import load_dataset

	# ══════════════════════════════════════════════════════════════════
	# CONFIGURATION — change these if needed
	# ══════════════════════════════════════════════════════════════════

	NUM_QUESTIONS = 2 # number of validation questions to load and process
	# NOTE: 2 is only enough for a quick smoke test of the pipeline.
	# The original guidance: ~500 is manageable on a free Colab CPU,
	# increase to 2000+ if you have more time.

	CHUNK_SIZE = 1000 # default characters per chunk in chunk_text() — same as LARA paper
	CHUNK_OVERLAP = 200 # default characters shared by consecutive chunks
	THRESHOLD = 0.3 # a chunk counts as relevant when its cross-encoder score is >= this
	NUM_IVF_CLUSTERS = 50 # upper bound on IVF clusters; build_ivf_index() lowers it for small corpora
	# rule of thumb: sqrt(total_vectors)
	OUTPUT_FILE = "training_data.csv" # path of the CSV written at the end of the script

	# ══════════════════════════════════════════════════════════════════
	# STEP 1 — Load Models
	# These load once and are reused for every query.
	# ══════════════════════════════════════════════════════════════════

	print("Loading bi-encoder (for creating vector embeddings)...")
	# Bi-encoder: turns a text (a chunk or a question) into one dense vector.
	# It is handed to build_ivf_index() and get_query_features() below.
	# Author's note: same model as used in the LARA paper.
	bi_encoder = SentenceTransformer("BAAI/bge-small-en-v1.5")

	print("Loading cross-encoder (for quality scoring)...")
	# Cross-encoder: scores (question, chunk) pairs; find_optimal_k() compares
	# those scores against THRESHOLD to decide which chunks are relevant.
	cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

	print("Models loaded.\n")

	# ══════════════════════════════════════════════════════════════════
	# STEP 2 — Load NaturalQuestions Dataset
	# NaturalQuestions has real Google search queries with Wikipedia answers
	# We use a small subset for training data generation
	# ══════════════════════════════════════════════════════════════════

	print(f"Loading NaturalQuestions dataset ({NUM_QUESTIONS} questions)...")

	# Stream the validation split and keep only the first NUM_QUESTIONS
	# examples. A sliced split such as "validation[:N]" is applied only after
	# the dataset has been downloaded and prepared in full, which is far more
	# data than a small sample needs. Streaming fetches examples lazily.
	# The result is materialised into a plain list so that len() and repeated
	# iteration keep working for the rest of the script.
	dataset = list(
	    load_dataset(
	        "natural_questions",
	        split="validation",
	        streaming=True,
	        trust_remote_code=True,
	    ).take(NUM_QUESTIONS)
	)

	print(f"Loaded {len(dataset)} questions.\n")

	# ══════════════════════════════════════════════════════════════════
	# HELPER FUNCTIONS
	# ══════════════════════════════════════════════════════════════════

	def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
	    """
	    Split text into overlapping, fixed-width character chunks.

	    Consecutive chunks start ``chunk_size - overlap`` characters apart.
	    Chunking stops as soon as a chunk reaches the end of the text, so the
	    last chunk is never a fragment that the previous chunk already contains.

	    Example: text of 2500 chars with chunk_size=1000, overlap=200
	        Chunk 1: chars 0 to 1000
	        Chunk 2: chars 800 to 1800
	        Chunk 3: chars 1600 to 2500

	    Args:
	        text: the string to split.
	        chunk_size: maximum number of characters per chunk; must be positive.
	        overlap: characters shared by neighbouring chunks; must be smaller
	            than chunk_size.

	    Returns:
	        List of chunk strings in document order (empty list for empty text).

	    Raises:
	        ValueError: if chunk_size is not positive or overlap >= chunk_size,
	            because the window would then never move forward.
	    """
	    if chunk_size <= 0 or overlap >= chunk_size:
	        raise ValueError(
	            f"need chunk_size > 0 and overlap < chunk_size, "
	            f"got chunk_size={chunk_size}, overlap={overlap}"
	        )

	    step = chunk_size - overlap
	    chunks = []
	    for start in range(0, len(text), step):
	        end = start + chunk_size
	        chunks.append(text[start:end])
	        if end >= len(text):
	            # This chunk already covers the tail of the text; another
	            # window would only repeat characters we have just emitted.
	            break
	    return chunks


	def build_ivf_index(chunks, bi_enc, n_clusters=NUM_IVF_CLUSTERS):
	    """
	    Embed text chunks and build a FAISS inner-product index over them.

	    IVF (Inverted File) works like this:
	    1. Embed all chunks into vectors
	    2. Group vectors into clusters using k-means
	    3. Each cluster has a centroid (the average vector)
	    4. At search time, find nearest centroids first, then search those clusters

	    Corpora with fewer than 10 chunks are too small for IVF, so a flat
	    (brute-force) index is built instead and the chunk vectors themselves
	    are returned in place of centroids.

	    Args:
	        chunks: non-empty list of chunk strings.
	        bi_enc: encoder exposing ``encode(texts, normalize_embeddings=...,
	            show_progress_bar=...)`` and returning a 2-D array.
	        n_clusters: upper bound on the number of IVF clusters; the actual
	            number is reduced to ``len(chunks) // 4`` when that is smaller.

	    Returns:
	        Tuple ``(index, embeddings, centroids, index_type)``:
	        - index: the FAISS index object (flat or IVF)
	        - embeddings: float32 chunk vectors, one row per chunk
	        - centroids: cluster centres (n_clusters x dim) for IVF, or the
	          embeddings themselves for the flat fallback
	        - index_type: the string "flat" or "ivf"

	    Raises:
	        ValueError: if chunks is empty.
	    """
	    if len(chunks) == 0:
	        raise ValueError("build_ivf_index needs at least one chunk")

	    # Embed all chunks. Vectors are normalised to unit length, so the
	    # inner product used by the index equals cosine similarity.
	    embeddings = bi_enc.encode(
	        chunks,
	        normalize_embeddings=True,
	        show_progress_bar=False
	    ).astype("float32")

	    dim = embeddings.shape[1]

	    if len(chunks) < 10:
	        # Very small corpus — brute force is exact and cheap.
	        index = faiss.IndexFlatIP(dim)
	        index.add(embeddings)
	        # For a flat index the vectors ARE the "centroids".
	        return index, embeddings, embeddings, "flat"

	    # Keep roughly four or more vectors per cluster, and never fewer than
	    # one cluster even if the caller passes a non-positive n_clusters.
	    actual_clusters = max(1, min(n_clusters, len(chunks) // 4))

	    quantizer = faiss.IndexFlatIP(dim)
	    index = faiss.IndexIVFFlat(
	        quantizer, dim, actual_clusters,
	        faiss.METRIC_INNER_PRODUCT
	    )
	    index.train(embeddings)  # learn cluster structure
	    index.add(embeddings)    # add vectors to index
	    index.nprobe = max(1, actual_clusters // 5)  # clusters visited per search

	    # After training, the quantizer stores exactly the centroids. Read
	    # them back through the public reconstruct API instead of reaching
	    # into the raw ``xb`` storage attribute, which is not reliably exposed
	    # by the Python wrapper.
	    centroids = quantizer.reconstruct_n(0, quantizer.ntotal)

	    return index, embeddings, centroids, "ivf"


	def get_index_features(query_vec, centroids, embeddings, index_type):
	    """
	    Describe how a query sits relative to the index centroids.

	    The returned numbers are the index-structure features the ML model is
	    trained on.

	    Args:
	        query_vec: the query embedding (any shape that flattens to one row).
	        centroids: 2-D array with one centroid per row.
	        embeddings: the corpus vectors; only their count is used.
	        index_type: "flat" is encoded as 0, anything else as 1.

	    Returns:
	        Dict with keys corpus_size, n_centroids, n_close_centroids,
	        centroid_spread, estimated_pool, top1_centroid_dist,
	        top3_centroid_dist and index_type.
	    """
	    # Work with the query as a single float32 row vector.
	    q_row = query_vec.reshape(1, -1).astype("float32")

	    # Inner product of the query with every centroid (higher = closer).
	    sims = np.dot(centroids, q_row.T).flatten()
	    sims_desc = np.sort(sims)[::-1]

	    # "Close" centroids are those at or above the 80th percentile,
	    # i.e. roughly the top 20%.
	    cutoff = np.percentile(sims, 80)
	    near = sims[sims >= cutoff]
	    near_count = int(near.size)

	    # Spread of the close similarities; undefined for fewer than two.
	    if near_count > 1:
	        near_spread = float(np.std(near))
	    else:
	        near_spread = 0.0

	    # Pool estimate: close centroids times the average cluster size.
	    total_vectors = len(embeddings)
	    total_centroids = len(centroids)
	    pool_guess = int(near_count * (total_vectors / max(total_centroids, 1)))

	    # Best similarity, and mean of the best three when there are three.
	    best = float(sims_desc[0]) if len(sims_desc) > 0 else 0.0
	    if len(sims_desc) >= 3:
	        best_three = float(np.mean(sims_desc[:3]))
	    else:
	        best_three = best

	    features = {}
	    features["corpus_size"] = total_vectors
	    features["n_centroids"] = total_centroids
	    features["n_close_centroids"] = near_count
	    features["centroid_spread"] = near_spread
	    features["estimated_pool"] = pool_guess
	    features["top1_centroid_dist"] = best
	    features["top3_centroid_dist"] = best_three
	    features["index_type"] = 0 if index_type == "flat" else 1
	    return features


	def find_optimal_k(query, chunks, query_vec, index, embeddings,
	                   bi_enc, cross_enc, threshold=THRESHOLD, max_scored=100):
	    """
	    Find the OPTIMAL candidate pool size for this query.

	    This is the KEY function for generating training data.

	    The optimal k is the SMALLEST bi-encoder candidate pool that contains
	    every chunk the cross-encoder scores at or above the threshold.

	    How it is found:
	    1. Rank all chunks by inner product between their embedding and the query
	    2. Score the top ``max_scored`` ranked chunks with the cross-encoder
	    3. Chunks scoring >= threshold are the "gold" relevant chunks
	    4. optimal k = rank position of the last relevant chunk + 1

	    Args:
	        query: the question text.
	        chunks: list of chunk strings, aligned row-for-row with embeddings.
	        query_vec: the query embedding.
	        index: unused; kept so existing callers keep working.
	        embeddings: 2-D array of chunk vectors.
	        bi_enc: unused; kept so existing callers keep working.
	        cross_enc: scorer exposing ``predict(list_of_(query, chunk)_pairs)``.
	        threshold: minimum cross-encoder score for a chunk to be relevant.
	        max_scored: cap on how many top-ranked chunks are scored (for
	            speed); ``None`` scores every chunk. Relevant chunks ranked
	            beyond the cap are not seen, so optimal k never exceeds it.

	    Returns:
	        Tuple ``(optimal_k, n_relevant, dynamic_k_size)``:
	        - optimal_k: smallest pool size capturing all relevant chunks;
	          0 for an empty corpus, ``min(10, len(chunks))`` when nothing
	          reaches the threshold
	        - n_relevant: number of scored chunks at or above the threshold
	        - dynamic_k_size: same value as n_relevant
	    """
	    if len(chunks) == 0:
	        return 0, 0, 0

	    query_vec_2d = query_vec.reshape(1, -1).astype("float32")

	    # Bi-encoder similarity of every chunk to the query.
	    similarities = np.dot(embeddings, query_vec_2d.T).flatten()

	    # Rank by similarity (highest first) and keep only the candidates that
	    # will actually be scored, so no text is gathered for the rest.
	    ranked_indices = np.argsort(similarities)[::-1][:max_scored]
	    pairs = [(query, chunks[i]) for i in ranked_indices]

	    # Expensive step: one cross-encoder pass per query.
	    ce_scores = cross_enc.predict(pairs)

	    # Rank positions (0-based) of the chunks that reach the threshold.
	    relevant_positions = [
	        pos for pos, score in enumerate(ce_scores) if score >= threshold
	    ]
	    n_relevant = len(relevant_positions)

	    if n_relevant == 0:
	        # No relevant chunks found — fall back to a small pool.
	        return min(10, len(chunks)), 0, 0

	    # The pool must reach the last relevant chunk in the ranking.
	    optimal_k = relevant_positions[-1] + 1

	    return optimal_k, n_relevant, n_relevant


	def get_query_features(query, bi_enc):
	    """
	    Encode the query and derive simple features describing it.

	    Args:
	        query: the question text.
	        bi_enc: encoder exposing ``encode(text, normalize_embeddings=...,
	            show_progress_bar=...)``.

	    Returns:
	        Tuple ``(query_vec, features)`` where query_vec is the float32
	        query embedding and features is a dict with keys query_len,
	        query_words, has_question_word and query_emb.
	    """
	    embedded = bi_enc.encode(
	        query,
	        normalize_embeddings=True,
	        show_progress_bar=False
	    )
	    query_vec = embedded.astype("float32")

	    # Flag queries containing a question word as a whole lower-cased token.
	    lowered_tokens = set(query.lower().split())
	    markers = ("what", "who", "when", "where", "which",
	               "how", "why", "is", "are", "was", "were")
	    contains_marker = not lowered_tokens.isdisjoint(markers)

	    features = {
	        # Length in characters and in whitespace-separated words.
	        "query_len": len(query),
	        "query_words": len(query.split()),
	        "has_question_word": int(contains_marker),
	        # Only the first 32 embedding dimensions are kept as features
	        # (the full vector would make the feature row too wide).
	        "query_emb": query_vec[:32].tolist(),
	    }
	    return query_vec, features


	# ══════════════════════════════════════════════════════════════════
	# STEP 3 — Main Data Generation Loop
	# ══════════════════════════════════════════════════════════════════

	print("Starting training data generation...")
	print(f"Processing {NUM_QUESTIONS} questions.\n")

	all_rows = []  # one dict per successfully processed question
	errors = 0     # questions skipped because an exception was raised

	for idx, example in enumerate(dataset):

	    # ── Extract question and context from NaturalQuestions ────────
	    question = example["question"]["text"]

	    # The Wikipedia article is stored as a list of tokens; the first
	    # 3000 are joined with spaces to form this question's "corpus".
	    try:
	        doc_tokens = example["document"]["tokens"]["token"]
	        context = " ".join(doc_tokens[:3000])
	    except (KeyError, TypeError) as exc:
	        # Report the cause instead of skipping silently, so a schema
	        # mismatch does not look like an ordinary empty result.
	        errors += 1
	        print(f" Skipped question {idx + 1}: cannot read document tokens "
	              f"({type(exc).__name__}: {exc}).")
	        continue

	    if len(context) < 200:
	        # Skip very short documents
	        continue

	    # ── Chunk the context ─────────────────────────────────────────
	    chunks = chunk_text(context)

	    if len(chunks) < 2:
	        continue

	    # ── Build the index for this corpus ───────────────────────────
	    # Best effort: a failure here skips the question, but the reason
	    # is printed so a systematic failure is visible.
	    try:
	        index, embeddings, centroids, index_type = build_ivf_index(
	            chunks, bi_encoder
	        )
	    except Exception as exc:
	        errors += 1
	        print(f" Skipped question {idx + 1}: index build failed "
	              f"({type(exc).__name__}: {exc}).")
	        continue

	    # ── Get query features and embedding ──────────────────────────
	    query_vec, query_feats = get_query_features(question, bi_encoder)

	    # ── Get index structure features ──────────────────────────────
	    index_feats = get_index_features(
	        query_vec, centroids, embeddings, index_type
	    )

	    # ── Find optimal k — THE KEY STEP ─────────────────────────────
	    # The third return value duplicates n_relevant and is not needed.
	    t0 = time.time()
	    optimal_k, n_relevant, _ = find_optimal_k(
	        question, chunks, query_vec, index,
	        embeddings, bi_encoder, cross_encoder
	    )
	    elapsed = time.time() - t0

	    # ── Build one training row ────────────────────────────────────
	    row = {
	        # Target variable — what the ML model needs to predict
	        "optimal_k": optimal_k,

	        # Query features
	        "query_len": query_feats["query_len"],
	        "query_words": query_feats["query_words"],
	        "has_question_word": query_feats["has_question_word"],

	        # Index structure features
	        "corpus_size": index_feats["corpus_size"],
	        "n_centroids": index_feats["n_centroids"],
	        "n_close_centroids": index_feats["n_close_centroids"],
	        "centroid_spread": index_feats["centroid_spread"],
	        "estimated_pool": index_feats["estimated_pool"],
	        "top1_centroid_dist": index_feats["top1_centroid_dist"],
	        "top3_centroid_dist": index_feats["top3_centroid_dist"],
	        "index_type": index_feats["index_type"],

	        # Extra info (not used for training, useful for analysis)
	        "n_relevant_chunks": n_relevant,
	        "n_total_chunks": len(chunks),
	        "question": question,
	        "elapsed_sec": round(elapsed, 3),
	    }

	    # Compressed query embedding as columns qe_0, qe_1, ...
	    for i, val in enumerate(query_feats["query_emb"]):
	        row[f"qe_{i}"] = round(float(val), 6)

	    all_rows.append(row)

	    # ── Progress update every 50 questions ────────────────────────
	    if (idx + 1) % 50 == 0:
	        print(f" Processed {idx+1}/{NUM_QUESTIONS} questions. "
	              f"Rows collected: {len(all_rows)}. "
	              f"Errors: {errors}.")

	# ══════════════════════════════════════════════════════════════════
	# STEP 4 — Save Training Data
	# ══════════════════════════════════════════════════════════════════

	df = pd.DataFrame(all_rows)

	print(f"\nDone. Total rows: {len(df)}")
	print(f"Errors skipped: {errors}")

	if df.empty:
	    # A frame built from zero rows has no columns, so selecting the
	    # preview columns would raise KeyError. Report the situation instead
	    # and do not write a CSV that contains no data.
	    print("\nNo training rows were collected — nothing to preview or save.")
	    print("Check the error count above, the dataset fields and the "
	          "index-building step.")
	else:
	    print("\nDataset preview:")
	    print(df[["question", "corpus_size", "n_relevant_chunks",
	              "optimal_k", "top1_centroid_dist"]].head(10))
	    print("\nOptimal K statistics:")
	    print(df["optimal_k"].describe())

	    df.to_csv(OUTPUT_FILE, index=False)
	    print(f"\nSaved to: {OUTPUT_FILE}")
	    print("Download this file — you will use it in Step 2 (ML training).")