"""
Runs all 3 systems on the test questions and saves results to
results/evaluation_results.csv.

After running, open the CSV and manually fill in, for each system prefix
(baseline, rag, finetuned):
- <system>_correctness (1-5): 1=wrong, 3=partial, 5=perfect
- <system>_hallucination (Yes/No): Yes if the answer contains false information

Usage:
    python evaluate.py            # run the evaluation and write the CSV
    python evaluate.py summary    # print averages from the filled-in CSV
"""

import csv
import os
import time

from system1_baseline import ask_baseline
from system2_rag import ask_rag, load_vectorstore, build_vectorstore

QUESTIONS_PATH = "data/questions.csv"
RESULTS_PATH = "results/evaluation_results.csv"
FAISS_INDEX_PATH = "data/faiss_index"
NUM_QUESTIONS = 30  # increase to 50 when ready

# Column prefixes of the three evaluated systems, in CSV column order.
_SYSTEMS = ("baseline", "rag", "finetuned")

# Stand-in response used while the fine-tuned model cannot be queried.
_FINETUNED_PLACEHOLDER = {"answer": "Model not available yet.", "response_time": 0}


def load_questions():
    """Return up to NUM_QUESTIONS questions from QUESTIONS_PATH.

    The file must be a CSV with a ``question`` column. Rows whose question
    is missing or blank are skipped.

    Raises:
        OSError: if the questions file cannot be opened.
        KeyError: if the CSV has rows but no ``question`` column.
    """
    # utf-8-sig also reads plain UTF-8, but strips the BOM that spreadsheet
    # tools prepend and that would otherwise corrupt the first header name.
    with open(QUESTIONS_PATH, newline="", encoding="utf-8-sig") as f:
        questions = [row["question"] for row in csv.DictReader(f)]
    non_blank = [q for q in questions if q and q.strip()]
    return non_blank[:NUM_QUESTIONS]


def _load_finetuned():
    """Return the fine-tuned system's ask function, or None if unavailable."""
    try:
        from system3_inference import ask_finetuned
    except Exception as exc:
        # Deliberately broad: the module may be absent or may fail while
        # loading the model; either way the run continues with placeholders.
        print(f"Fine-tuned system unavailable ({exc!r}); using placeholder answers.")
        return None
    return ask_finetuned


def _ask_finetuned_or_placeholder(ask_finetuned, question):
    """Query the fine-tuned system, falling back to the placeholder response."""
    if ask_finetuned is None:
        return dict(_FINETUNED_PLACEHOLDER)
    try:
        return ask_finetuned(question)
    except Exception as exc:
        # Best effort: one failing question must not abort the whole run.
        print(f"  Fine-tuned system failed ({exc!r}); using placeholder answer.")
        return dict(_FINETUNED_PLACEHOLDER)


def _result_row(index, question, responses):
    """Build one CSV row; ``responses`` is ordered like ``_SYSTEMS``."""
    row = {"id": index, "question": question}
    for system, response in zip(_SYSTEMS, responses):
        row[f"{system}_answer"] = response["answer"]
        row[f"{system}_time"] = response["response_time"]
        row[f"{system}_correctness"] = ""  # Fill manually: 1-5
        row[f"{system}_hallucination"] = ""  # Fill manually: Yes/No
    return row


def run_evaluation():
    """Ask every question to all three systems and write RESULTS_PATH.

    Builds the FAISS index first if it does not exist yet. If no questions
    are found, nothing is written and an existing results file is kept.
    """
    os.makedirs(os.path.dirname(RESULTS_PATH) or ".", exist_ok=True)

    # Load RAG vectorstore
    if not os.path.exists(FAISS_INDEX_PATH):
        print("Building FAISS index first...")
        vs = build_vectorstore()
    else:
        print("Loading existing FAISS index...")
        vs = load_vectorstore()

    questions = load_questions()
    if not questions:
        # Bail out before opening the results file for writing, so previous
        # results are not truncated by an empty run.
        print(f"No questions found in {QUESTIONS_PATH}; nothing to evaluate.")
        return

    print(f"\nEvaluating {len(questions)} questions across 3 systems...\n")

    # System 3 — placeholder until fine-tuned model is ready.
    # Resolved once: a failed import is not cached by Python, so retrying it
    # for every question would repeat the failing work each time.
    ask_finetuned = _load_finetuned()

    results = []
    for i, q in enumerate(questions, 1):
        print(f"[{i}/{len(questions)}] {q}")

        r1 = ask_baseline(q)
        time.sleep(0.5)  # avoid rate limiting

        r2 = ask_rag(q, vs)
        time.sleep(0.5)

        r3 = _ask_finetuned_or_placeholder(ask_finetuned, q)

        results.append(_result_row(i, q, (r1, r2, r3)))

        print(f" Baseline ({r1['response_time']}s): {r1['answer'][:70]}...")
        print(f" RAG ({r2['response_time']}s): {r2['answer'][:70]}...")
        print()

    # Save CSV
    with open(RESULTS_PATH, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=list(results[0]))
        writer.writeheader()
        writer.writerows(results)

    print(f"Saved {len(results)} results to {RESULTS_PATH}")
    print("\nNext step: open the CSV and fill in 'correctness' (1-5) and 'hallucination' (Yes/No) columns manually.")


def _to_float(text):
    """Return ``text`` parsed as a float, or None if it is blank or invalid."""
    text = (text or "").strip()
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def print_summary():
    """Print average scores after you have manually filled in the CSV.

    For each system, only rows with a numeric correctness score are counted.
    A hallucination cell counts as a hallucination only when it reads "yes"
    (case-insensitive); a blank or unreadable response time counts as 0.
    """
    if not os.path.exists(RESULTS_PATH):
        print("No results file found. Run evaluate.py first.")
        return

    scores = {system: [] for system in _SYSTEMS}
    hallucinations = {system: [] for system in _SYSTEMS}
    times = {system: [] for system in _SYSTEMS}

    # utf-8-sig: the file has usually been re-saved by a spreadsheet tool,
    # which may add a BOM; plain UTF-8 is still read correctly.
    with open(RESULTS_PATH, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        for row in reader:
            for system in _SYSTEMS:
                raw_score = row.get(f"{system}_correctness")
                score = _to_float(raw_score)
                if score is None:
                    # Blank means "not graded yet"; anything else is a typo
                    # worth reporting rather than crashing the whole summary.
                    if raw_score and raw_score.strip():
                        print(
                            f"Warning: line {reader.line_num}: ignoring invalid "
                            f"{system}_correctness value {raw_score!r}"
                        )
                    continue
                scores[system].append(score)
                flagged = (row.get(f"{system}_hallucination") or "").strip().lower() == "yes"
                hallucinations[system].append(1 if flagged else 0)
                times[system].append(_to_float(row.get(f"{system}_time")) or 0)

    def avg(lst):
        return round(sum(lst) / len(lst), 2) if lst else "N/A"

    def pct(lst):
        return f"{round(sum(lst)/len(lst)*100)}%" if lst else "N/A"

    table = (
        ("Avg Correctness (1-5)", [avg(scores[s]) for s in _SYSTEMS]),
        ("Hallucination Rate", [pct(hallucinations[s]) for s in _SYSTEMS]),
        ("Avg Response Time (s)", [avg(times[s]) for s in _SYSTEMS]),
    )

    print("\n===== EVALUATION SUMMARY =====")
    print(f"{'Metric':<30} {'Baseline':>12} {'RAG':>12} {'Fine-tuned':>12}")
    print("-" * 66)
    for label, cells in table:
        print(f"{label:<30} " + " ".join(f"{cell:>12}" for cell in cells))


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "summary":
        print_summary()
    else:
        run_evaluation()