""" Standalone benchmark runner — runs all 50 reference questions through Baseline LLM and RAG, computes all 8 metrics, saves to data/benchmark_cache.json. Run once: python run_benchmark.py """ import os, json, time import numpy as np # rouge_score MUST be imported before heavy ML libs to avoid segfault from rouge_score import rouge_scorer as rs from dotenv import load_dotenv load_dotenv() # ── Import system modules (after rouge_score) ───────────────────────────────── from langchain_huggingface import HuggingFaceEmbeddings from langchain_community.vectorstores import FAISS from openai import OpenAI # ── Load reference answers ──────────────────────────────────────────────────── with open("data/reference_answers.json", encoding="utf-8") as f: ref_answers = json.load(f) QUESTIONS = list(ref_answers.keys()) # ── Load vector store ───────────────────────────────────────────────────────── INDEX_PATH = "data/faiss_index" print("Loading vector store...") emb = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") vs = FAISS.load_local(INDEX_PATH, emb, allow_dangerous_deserialization=True) print("Vector store ready.\n") # ── Groq client ─────────────────────────────────────────────────────────────── client = OpenAI( api_key=os.getenv("GROQ_API_KEY"), base_url="https://api.groq.com/openai/v1", ) MODEL = "llama-3.1-8b-instant" BASELINE_SYS = ( "You are a programming tutor specializing in Data Structures, Algorithms, " "and Web Development. Answer questions clearly and concisely." ) RAG_SYS = ( "You are a programming tutor. Use only the provided context to answer. " "If the answer is not in the context, say 'I don't have that in my knowledge base.'" ) def ask_baseline(q: str) -> dict: t = time.time() r = client.chat.completions.create( model=MODEL, messages=[{"role": "system", "content": BASELINE_SYS}, {"role": "user", "content": q}], max_tokens=300, temperature=0.3, ) return {"answer": r.choices[0].message.content.strip(), "response_time": round(time.time()-t, 2)} def ask_rag(q: str) -> dict: t = time.time() docs = vs.similarity_search(q, k=3) context = "\n\n".join([d.page_content for d in docs]) prompt = f"Context:\n{context}\n\nQuestion: {q}\nAnswer:" r = client.chat.completions.create( model=MODEL, messages=[{"role": "system", "content": RAG_SYS}, {"role": "user", "content": prompt}], max_tokens=300, temperature=0.3, ) return {"answer": r.choices[0].message.content.strip(), "response_time": round(time.time()-t, 2), "context": context} # ── Metric helpers ──────────────────────────────────────────────────────────── scorer = rs.RougeScorer(["rougeL"], use_stemmer=True) def _cosine(a, b): n = np.linalg.norm(a) * np.linalg.norm(b) return float(np.dot(a, b) / (n + 1e-8)) def compute_metrics(answer: str, question: str, context: str = "") -> dict: if not answer or not answer.strip(): return {"accuracy": 0, "rouge_l": 0, "groundedness": 0, "answer_relevance": 0, "faithfulness": 0} try: a_emb = np.array(vs.embeddings.embed_query(answer)) q_emb = np.array(vs.embeddings.embed_query(question)) answer_relevance = round(max(0.0, _cosine(a_emb, q_emb)), 3) ref = ref_answers.get(question.strip().lower(), "") accuracy, rouge_l = 0.0, 0.0 if ref: rouge_l = round(scorer.score(ref, answer)["rougeL"].fmeasure, 3) r_emb = np.array(vs.embeddings.embed_query(ref)) accuracy = round(max(0.0, _cosine(a_emb, r_emb)), 3) if context and context.strip(): c_emb = np.array(vs.embeddings.embed_query(context[:1000])) groundedness = round(max(0.0, _cosine(a_emb, c_emb)), 3) faithfulness = round(scorer.score(context[:1000], answer)["rougeL"].fmeasure, 3) else: groundedness = accuracy faithfulness = rouge_l return {"accuracy": accuracy, "rouge_l": rouge_l, "groundedness": groundedness, "answer_relevance": answer_relevance, "faithfulness": faithfulness} except Exception as e: print(f" [metric error] {e}") return {"accuracy": 0, "rouge_l": 0, "groundedness": 0, "answer_relevance": 0, "faithfulness": 0} def _cost(answer: str, system: str) -> float: tokens = max(1, len(answer.split())) if system == "r1": return round(0.001 + tokens * 0.0000059, 4) elif system == "r2": return round(0.0015 + tokens * 0.0000059 * 1.8, 4) else: return round(tokens * 0.0000015, 4) # ── Run benchmark ───────────────────────────────────────────────────────────── # Load existing partial results to resume if interrupted OUT_PATH = "data/benchmark_cache.json" if os.path.exists(OUT_PATH): with open(OUT_PATH, encoding="utf-8") as f: results = json.load(f) done_qs = {r["question"] for r in results} print(f"Resuming — {len(results)} already done.\n") else: results = [] done_qs = set() total = len(QUESTIONS) print(f"Running benchmark on {total} questions...\n") for i, q in enumerate(QUESTIONS): if q in done_qs: print(f"[{i+1:02d}/{total}] SKIP (cached): {q[:55]}") continue print(f"[{i+1:02d}/{total}] {q[:60]}") r1 = ask_baseline(q) r2 = ask_rag(q) ctx = r2.get("context", "") m1 = compute_metrics(r1["answer"], q) m2 = compute_metrics(r2["answer"], q, context=ctx) results.append({ "question": q, "r1_time": r1["response_time"], "r2_time": r2["response_time"], "r3_time": 0, "r1_rouge": m1["rouge_l"], "r2_rouge": m2["rouge_l"], "r3_rouge": 0, "r1_sim": m1["accuracy"], "r2_sim": m2["accuracy"], "r3_sim": 0, "r1_ground": m1["groundedness"], "r2_ground": m2["groundedness"], "r3_ground": 0, "r1_relev": m1["answer_relevance"], "r2_relev": m2["answer_relevance"], "r3_relev": 0, "r1_faith": m1["faithfulness"], "r2_faith": m2["faithfulness"], "r3_faith": 0, "r1_cost": _cost(r1["answer"], "r1"), "r2_cost": _cost(r2["answer"], "r2"), "r3_cost": 0, }) print(f" r1_acc={m1['accuracy']:.2f} r2_acc={m2['accuracy']:.2f} | " f"r1={r1['response_time']}s r2={r2['response_time']}s") # Save after every question so we can resume if interrupted with open(OUT_PATH, "w", encoding="utf-8") as f: json.dump(results, f, indent=2) # ── Summary ─────────────────────────────────────────────────────────────────── n = len(results) r1_acc = round(sum(r["r1_sim"] for r in results) / n * 100, 1) r2_acc = round(sum(r["r2_sim"] for r in results) / n * 100, 1) r1_t = round(sum(r["r1_time"] for r in results) / n, 2) r2_t = round(sum(r["r2_time"] for r in results) / n, 2) print(f"\nDone! {n} rows saved to {OUT_PATH}") print(f" Baseline — accuracy {r1_acc}% avg_time {r1_t}s") print(f" RAG — accuracy {r2_acc}% avg_time {r2_t}s")