Spaces:

npmaiecosystem
/

ML_Model

Runtime error

App Files Files Community

npmaiecosystem commited on 1 day ago

Commit

4aaa65b

verified ·

1 Parent(s): 586cf1f

Create appp.py

Browse files

Files changed (1) hide show

appp.py +434 -0

appp.py ADDED Viewed

	@@ -0,0 +1,434 @@

+# ═══════════════════════════════════════════════════════════════════
+# LARA — Step 1: Generate Training Data for ML K-Prediction Model
+# By Sonu Kumar, NPMAI ECOSYSTEM
+#
+# What this script does:
+# For each question in NaturalQuestions, it runs LARA's cross-encoder
+# and finds the OPTIMAL candidate pool size — the smallest number of
+# candidates that captures all chunks scoring above 0.3.
+# It also records index structure statistics (IVF centroid distances,
+# cluster sizes) that the ML model will learn from.
+#
+# Output: training_data.csv — one row per query, ready for ML training
+# ═══════════════════════════════════════════════════════════════════
+# ── Install everything needed (run this cell first in Colab) ───────
+# !pip install datasets sentence-transformers faiss-cpu scikit-learn pandas numpy
+import numpy as np
+import pandas as pd
+import faiss
+import time
+import math
+import os
+from sentence_transformers import SentenceTransformer, CrossEncoder
+from datasets import load_dataset
+# ══════════════════════════════════════════════════════════════════
+# CONFIGURATION — change these if needed
+# ══════════════════════════════════════════════════════════════════
+NUM_QUESTIONS    = 500     # how many questions to process
+                           # 500 is manageable on free Colab CPU
+                           # increase to 2000+ if you have more time
+CHUNK_SIZE       = 1000    # characters per chunk — same as LARA paper
+CHUNK_OVERLAP    = 200     # overlap between chunks
+THRESHOLD        = 0.3     # cross-encoder quality threshold
+NUM_IVF_CLUSTERS = 50      # number of IVF clusters for the index
+                           # rule of thumb: sqrt(total_vectors)
+OUTPUT_FILE      = "training_data.csv"
+# ══════════════════════════════════════════════════════════════════
+# STEP 1 — Load Models
+# These load once and are reused for every query.
+# ══════════════════════════════════════════════════════════════════
+print("Loading bi-encoder (for creating vector embeddings)...")
+# The bi-encoder converts text to vectors
+# Same model used in LARA paper
+bi_encoder = SentenceTransformer("BAAI/bge-small-en-v1.5")
+print("Loading cross-encoder (for quality scoring)...")
+# The cross-encoder scores query-document pairs
+# This is the 0.3 threshold model from LARA
+cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
+print("Models loaded.\n")
+# ══════════════════════════════════════════════════════════════════
+# STEP 2 — Load NaturalQuestions Dataset
+# NaturalQuestions has real Google search queries with Wikipedia answers
+# We use a small subset for training data generation
+# ══════════════════════════════════════════════════════════════════
+print(f"Loading NaturalQuestions dataset ({NUM_QUESTIONS} questions)...")
+# Load from HuggingFace — this downloads automatically
+# validation split is smaller and faster to load
+dataset = load_dataset(
+    "natural_questions",
+    split=f"validation[:{NUM_QUESTIONS}]",
+    trust_remote_code=True
+)
+print(f"Loaded {len(dataset)} questions.\n")
+# ══════════════════════════════════════════════════════════════════
+# HELPER FUNCTIONS
+# ══════════════════════════════════════════════════════════════════
+def chunk_text(text, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
+    """
+    Split text into overlapping chunks.
+    chunk_size=1000 chars, overlap=200 chars — same as LARA paper.
+    Example: text of 2500 chars with chunk_size=1000, overlap=200
+    Chunk 1: chars 0    to 1000
+    Chunk 2: chars 800  to 1800
+    Chunk 3: chars 1600 to 2500
+    """
+    chunks = []
+    start = 0
+    while start < len(text):
+        end = start + chunk_size
+        chunks.append(text[start:end])
+        start += (chunk_size - overlap)
+        if start >= len(text):
+            break
+    return chunks
+def build_ivf_index(chunks, bi_enc, n_clusters=NUM_IVF_CLUSTERS):
+    """
+    Build an IVF (Inverted File) index from a list of text chunks.
+    IVF works like this:
+    1. Embed all chunks into vectors
+    2. Group vectors into n_clusters clusters using k-means
+    3. Each cluster has a centroid (the average vector)
+    4. At search time, find nearest centroids first, then search those clusters
+    Returns:
+    - index: the FAISS IVF index object
+    - embeddings: the raw vectors (numpy array)
+    - centroids: the cluster centres (shape: n_clusters × vector_dim)
+    """
+    # Embed all chunks — convert text to vectors
+    embeddings = bi_enc.encode(
+        chunks,
+        normalize_embeddings=True,  # normalise to unit length
+        show_progress_bar=False
+    ).astype("float32")
+    dim = embeddings.shape[1]  # vector dimension (384 for bge-small)
+    # If corpus is too small for IVF, fall back to Flat (brute force)
+    actual_clusters = min(n_clusters, max(1, len(chunks) // 4))
+    if len(chunks) < 10:
+        # Very small corpus — use brute force
+        index = faiss.IndexFlatIP(dim)  # IP = Inner Product (cosine sim)
+        index.add(embeddings)
+        centroids = embeddings  # for small corpus, vectors ARE the centroids
+        index_type = "flat"
+    else:
+        # Build IVF index
+        quantizer = faiss.IndexFlatIP(dim)
+        index = faiss.IndexIVFFlat(
+            quantizer, dim, actual_clusters,
+            faiss.METRIC_INNER_PRODUCT
+        )
+        index.train(embeddings)   # learn cluster structure
+        index.add(embeddings)     # add vectors to index
+        index.nprobe = max(1, actual_clusters // 5)  # search this many clusters
+        # Extract centroids from the trained quantizer
+        centroids = faiss.vector_to_array(
+            index.quantizer.xb
+        ).reshape(-1, dim)
+        index_type = "ivf"
+    return index, embeddings, centroids, index_type
+def get_index_features(query_vec, centroids, embeddings, index_type):
+    """
+    Extract structural features from the index for a given query.
+    These are the features the ML model will learn from.
+    For IVF:
+    - distances from query to all centroids
+    - how many centroids are "close" (within threshold)
+    - spread (variance) of close centroid distances
+    - estimated relevant pool size (sum of cluster sizes for close centroids)
+    Returns a dictionary of features.
+    """
+    # query_vec shape: (dim,) — a single vector
+    query_vec = query_vec.reshape(1, -1).astype("float32")
+    # Distance from query to every centroid
+    # Higher inner product = closer (we use cosine similarity)
+    centroid_distances = np.dot(centroids, query_vec.T).flatten()
+    # Sort distances descending (most similar first)
+    sorted_distances = np.sort(centroid_distances)[::-1]
+    # Define "close" as top 20% of centroids
+    threshold_distance = np.percentile(centroid_distances, 80)
+    close_mask = centroid_distances >= threshold_distance
+    # How many centroids are close
+    n_close_centroids = int(np.sum(close_mask))
+    # Spread of close centroid distances — high spread = relevant
+    # chunks scattered across corpus
+    close_distances = centroid_distances[close_mask]
+    spread = float(np.std(close_distances)) if len(close_distances) > 1 else 0.0
+    # Estimate relevant pool — for IVF, approximate cluster size
+    corpus_size = len(embeddings)
+    avg_cluster_size = corpus_size / max(len(centroids), 1)
+    estimated_pool = int(n_close_centroids * avg_cluster_size)
+    # Top-1 and top-3 centroid distances
+    top1_dist = float(sorted_distances[0]) if len(sorted_distances) > 0 else 0.0
+    top3_dist = float(np.mean(sorted_distances[:3])) if len(sorted_distances) >= 3 else top1_dist
+    return {
+        "corpus_size":        corpus_size,
+        "n_centroids":        len(centroids),
+        "n_close_centroids":  n_close_centroids,
+        "centroid_spread":    spread,
+        "estimated_pool":     estimated_pool,
+        "top1_centroid_dist": top1_dist,
+        "top3_centroid_dist": top3_dist,
+        "index_type":         0 if index_type == "flat" else 1,
+    }
+def find_optimal_k(query, chunks, query_vec, index, embeddings,
+                   bi_enc, cross_enc, threshold=THRESHOLD):
+    """
+    Find the OPTIMAL candidate pool size for this query.
+    This is the KEY function for generating training data.
+    The optimal k is the SMALLEST candidate pool that captures
+    ALL chunks scoring above 0.3 on the cross-encoder.
+    How we find it:
+    1. First retrieve ALL chunks (pool = len(chunks))
+    2. Run cross-encoder on all of them
+    3. Find which ones score >= 0.3 — these are the "gold" relevant chunks
+    4. Find the smallest pool that would have retrieved all of them
+    Returns:
+    - optimal_k: the smallest pool size that captures all relevant chunks
+    - n_relevant: how many chunks scored above 0.3
+    - dynamic_k_size: same as n_relevant (what LARA's threshold would give)
+    """
+    if len(chunks) == 0:
+        return 0, 0, 0
+    # Retrieve ALL chunks — we want to know ground truth
+    all_pool_size = len(chunks)
+    # Search the full index
+    query_vec_2d = query_vec.reshape(1, -1).astype("float32")
+    # Get similarity scores for all chunks
+    scores_matrix = np.dot(embeddings, query_vec_2d.T).flatten()
+    # Rank by similarity (highest first)
+    ranked_indices = np.argsort(scores_matrix)[::-1]
+    ranked_chunks  = [chunks[i] for i in ranked_indices]
+    # Run cross-encoder on ALL ranked chunks
+    # This is expensive but we only do it once per query for training data
+    pairs = [(query, chunk) for chunk in ranked_chunks]
+    if len(pairs) > 100:
+        # Cap at 100 for speed — still gives good training signal
+        pairs = pairs[:100]
+        ranked_chunks = ranked_chunks[:100]
+    ce_scores = cross_enc.predict(pairs)
+    # Find which chunks are "relevant" (score >= threshold)
+    relevant_flags = ce_scores >= threshold
+    n_relevant = int(np.sum(relevant_flags))
+    if n_relevant == 0:
+        # No relevant chunks found — optimal k is small
+        return min(10, len(chunks)), 0, 0
+    # Find the position of the LAST relevant chunk in the ranking
+    # optimal_k = that position + 1
+    # (we need a pool at least this large to capture all relevant chunks)
+    last_relevant_position = 0
+    for i, flag in enumerate(relevant_flags):
+        if flag:
+            last_relevant_position = i
+    optimal_k = last_relevant_position + 1
+    return optimal_k, n_relevant, n_relevant
+def get_query_features(query, bi_enc):
+    """
+    Extract features from the query itself.
+    These help the model understand query complexity.
+    """
+    # Encode query to vector
+    query_vec = bi_enc.encode(
+        query,
+        normalize_embeddings=True,
+        show_progress_bar=False
+    ).astype("float32")
+    # Query length in characters
+    query_len = len(query)
+    # Number of words
+    query_words = len(query.split())
+    # Does query contain a question word? (factual vs complex)
+    question_words = ["what", "who", "when", "where", "which",
+                      "how", "why", "is", "are", "was", "were"]
+    has_question_word = int(
+        any(w in query.lower().split() for w in question_words)
+    )
+    # Use first 32 dimensions of query embedding as features
+    # (using all 384 would make the feature vector too large)
+    query_embedding_compressed = query_vec[:32].tolist()
+    return query_vec, {
+        "query_len":           query_len,
+        "query_words":         query_words,
+        "has_question_word":   has_question_word,
+        "query_emb":           query_embedding_compressed,
+    }
+# ══════════════════════════════════════════════════════════════════
+# STEP 3 — Main Data Generation Loop
+# ══════════════════════════════════════════════════════════════════
+print("Starting training data generation...")
+print(f"Processing {NUM_QUESTIONS} questions.\n")
+all_rows = []   # will become our training dataset
+errors   = 0
+for idx, example in enumerate(dataset):
+    # ── Extract question and context from NaturalQuestions ────────
+    # NaturalQuestions has the question and a Wikipedia document
+    question = example["question"]["text"]
+    # Get the Wikipedia article text as our "corpus"
+    # NaturalQuestions stores the document as a list of tokens
+    # We join them to get the full text
+    try:
+        doc_tokens = example["document"]["tokens"]["token"]
+        context    = " ".join(doc_tokens[:3000])  # first 3000 tokens
+    except Exception:
+        errors += 1
+        continue
+    if len(context) < 200:
+        # Skip very short documents
+        continue
+    # ── Chunk the context ─────────────────────────────────────────
+    chunks = chunk_text(context)
+    if len(chunks) < 2:
+        continue
+    # ── Build IVF index for this corpus ───────────────────────────
+    try:
+        index, embeddings, centroids, index_type = build_ivf_index(
+            chunks, bi_encoder
+        )
+    except Exception as e:
+        errors += 1
+        continue
+    # ── Get query features and embedding ──────────────────────────
+    query_vec, query_feats = get_query_features(question, bi_encoder)
+    # ── Get index structure features ──────────────────────────────
+    index_feats = get_index_features(
+        query_vec, centroids, embeddings, index_type
+    )
+    # ── Find optimal k — THE KEY STEP ─────────────────────────────
+    t0 = time.time()
+    optimal_k, n_relevant, dynamic_k = find_optimal_k(
+        question, chunks, query_vec, index,
+        embeddings, bi_encoder, cross_encoder
+    )
+    elapsed = time.time() - t0
+    # ── Build one training row ────────────────────────────────────
+    row = {
+        # Target variable — what the ML model needs to predict
+        "optimal_k":           optimal_k,
+        # Query features
+        "query_len":           query_feats["query_len"],
+        "query_words":         query_feats["query_words"],
+        "has_question_word":   query_feats["has_question_word"],
+        # Index structure features
+        "corpus_size":         index_feats["corpus_size"],
+        "n_centroids":         index_feats["n_centroids"],
+        "n_close_centroids":   index_feats["n_close_centroids"],
+        "centroid_spread":     index_feats["centroid_spread"],
+        "estimated_pool":      index_feats["estimated_pool"],
+        "top1_centroid_dist":  index_feats["top1_centroid_dist"],
+        "top3_centroid_dist":  index_feats["top3_centroid_dist"],
+        "index_type":          index_feats["index_type"],
+        # Extra info (not used for training, useful for analysis)
+        "n_relevant_chunks":   n_relevant,
+        "n_total_chunks":      len(chunks),
+        "question":            question,
+        "elapsed_sec":         round(elapsed, 3),
+    }
+    # Add compressed query embedding as features
+    # qe_0, qe_1, ... qe_31 — 32 numbers from the query vector
+    for i, val in enumerate(query_feats["query_emb"]):
+        row[f"qe_{i}"] = round(float(val), 6)
+    all_rows.append(row)
+    # ── Progress update ───────────────────────────────────────────
+    if (idx + 1) % 50 == 0:
+        print(f"  Processed {idx+1}/{NUM_QUESTIONS} questions. "
+              f"Rows collected: {len(all_rows)}. "
+              f"Errors: {errors}.")
+# ══════════════════════════════════════════════════════════════════
+# STEP 4 — Save Training Data
+# ══════════════════════════════════════════════════════════════════
+df = pd.DataFrame(all_rows)
+print(f"\nDone. Total rows: {len(df)}")
+print(f"Errors skipped: {errors}")
+print(f"\nDataset preview:")
+print(df[["question","corpus_size","n_relevant_chunks",
+          "optimal_k","top1_centroid_dist"]].head(10))
+print(f"\nOptimal K statistics:")
+print(df["optimal_k"].describe())
+df.to_csv(OUTPUT_FILE, index=False)
+print(f"\nSaved to: {OUTPUT_FILE}")
+print("Download this file — you will use it in Step 2 (ML training).")