Spaces:

Adityax-07
/

CodeSage

Sleeping

CodeSage / evaluate.py

Aditya

Add LLM vs RAG vs Fine-Tuning comparison project

4a3f117 3 months ago

5.2 kB

	"""
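	Runs all 3 systems (baseline, RAG, fine-tuned) on the test questions and saves
	the results to results/evaluation_results.csv.

	After running, open the CSV and manually fill in, for each system's column
	group (baseline_*, rag_*, finetuned_*):
	- <system>_correctness (1-5): 1=wrong, 3=partial, 5=perfect
	- <system>_hallucination (Yes/No): Yes if the answer contains false information

	Then run `python evaluate.py summary` to print the averaged scores.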
	"""
	import csv
	import os
	import time

	from system1_baseline import ask_baseline
	from system2_rag import ask_rag, load_vectorstore, build_vectorstore

	# Input CSV of test questions; load_questions() reads its "question" column.
	QUESTIONS_PATH: str = "data/questions.csv"

	# Output CSV written by run_evaluation() and read back by print_summary().
	RESULTS_PATH: str = "results/evaluation_results.csv"

	# Upper bound on how many questions are evaluated per run
	# (increase to 50 when ready).
	NUM_QUESTIONS: int = 30


	def load_questions(path=None, limit=None):
	    """Return the first questions from a CSV file's "question" column.

	    Args:
	        path: CSV file to read. Defaults to ``QUESTIONS_PATH``.
	        limit: Maximum number of questions to return. Defaults to
	            ``NUM_QUESTIONS``.

	    Returns:
	        A list of question strings in file order, truncated to ``limit``.

	    Raises:
	        FileNotFoundError: If the CSV file does not exist.
	        KeyError: If the CSV has no "question" column.
	    """
	    csv_path = QUESTIONS_PATH if path is None else path
	    max_questions = NUM_QUESTIONS if limit is None else limit

	    # newline="" lets the csv module handle line endings itself, so quoted
	    # multi-line questions survive intact. "utf-8-sig" reads plain UTF-8 and
	    # also drops a leading BOM (as written by spreadsheet exports), which
	    # would otherwise be glued to the first header name.
	    with open(csv_path, newline="", encoding="utf-8-sig") as f:
	        questions = [row["question"] for row in csv.DictReader(f)]

	    return questions[:max_questions]


	def _load_finetuned():
	    """Return ``ask_finetuned`` from ``system3_inference``, or ``None`` if importing it fails."""
	    try:
	        from system3_inference import ask_finetuned
	    except Exception as exc:
	        # Deliberately broad: System 3 is optional until the fine-tuned model
	        # is ready, so any import-time failure falls back to placeholders.
	        print(f"Fine-tuned system unavailable ({type(exc).__name__}: {exc}); using placeholder answers.")
	        return None
	    return ask_finetuned


	def _ask_finetuned_or_placeholder(ask_finetuned, question):
	    """Answer ``question`` with System 3, or return the placeholder result if it is missing or fails."""
	    placeholder = {"answer": "Model not available yet.", "response_time": 0}
	    if ask_finetuned is None:
	        return placeholder
	    try:
	        return ask_finetuned(question)
	    except Exception as exc:
	        # Best-effort per question, but report the failure instead of hiding it.
	        print(f" Fine-tuned system failed ({type(exc).__name__}: {exc}); using placeholder.")
	        return placeholder


	def run_evaluation():
	    """Run every question through all three systems and write the results CSV.

	    Builds the FAISS vectorstore when ``data/faiss_index`` does not exist,
	    otherwise loads it. Each question is sent to the baseline and RAG systems
	    (with a short pause after each call to avoid rate limiting) and to the
	    fine-tuned system when it is available. One row per question is written
	    to ``RESULTS_PATH``; the ``*_correctness`` and ``*_hallucination``
	    columns are left blank for manual grading.

	    The CSV header is always written, even when there are no questions.
	    """
	    # Create whatever directory RESULTS_PATH actually points into.
	    results_dir = os.path.dirname(RESULTS_PATH)
	    if results_dir:
	        os.makedirs(results_dir, exist_ok=True)

	    # Load RAG vectorstore
	    if not os.path.exists("data/faiss_index"):
	        print("Building FAISS index first...")
	        vs = build_vectorstore()
	    else:
	        print("Loading existing FAISS index...")
	        vs = load_vectorstore()

	    questions = load_questions()
	    print(f"\nEvaluating {len(questions)} questions across 3 systems...\n")

	    # Resolve System 3 once rather than re-attempting the import per question.
	    ask_finetuned = _load_finetuned()

	    # Explicit column order, so the header does not depend on a first row existing.
	    systems = ("baseline", "rag", "finetuned")
	    fieldnames = ["id", "question"] + [
	        f"{system}_{suffix}"
	        for system in systems
	        for suffix in ("answer", "time", "correctness", "hallucination")
	    ]

	    results = []

	    for i, q in enumerate(questions, 1):
	        print(f"[{i}/{len(questions)}] {q}")

	        r1 = ask_baseline(q)
	        time.sleep(0.5)  # avoid rate limiting
	        r2 = ask_rag(q, vs)
	        time.sleep(0.5)

	        # System 3 — placeholder until fine-tuned model is ready
	        r3 = _ask_finetuned_or_placeholder(ask_finetuned, q)

	        row = {"id": i, "question": q}
	        for system, response in zip(systems, (r1, r2, r3)):
	            row[f"{system}_answer"] = response["answer"]
	            row[f"{system}_time"] = response["response_time"]
	            row[f"{system}_correctness"] = ""  # Fill manually: 1-5
	            row[f"{system}_hallucination"] = ""  # Fill manually: Yes/No
	        results.append(row)

	        print(f" Baseline ({r1['response_time']}s): {r1['answer'][:70]}...")
	        print(f" RAG ({r2['response_time']}s): {r2['answer'][:70]}...")
	        print()

	    # Save CSV
	    with open(RESULTS_PATH, "w", newline="", encoding="utf-8") as f:
	        writer = csv.DictWriter(f, fieldnames=fieldnames)
	        writer.writeheader()
	        writer.writerows(results)

	    print(f"Saved {len(results)} results to {RESULTS_PATH}")
	    print("\nNext step: open the CSV and fill in 'correctness' (1-5) and 'hallucination' (Yes/No) columns manually.")


	def print_summary(path=None):
	    """Print average scores after you have manually filled in the CSV.

	    For each system (baseline, RAG, fine-tuned) this reports the average
	    correctness score, the share of answers labelled as hallucinations, and
	    the average response time. Only rows whose ``<system>_correctness`` cell
	    is filled in count towards that system. Within those rows, a blank
	    hallucination label or blank time is skipped, not counted as "No" or 0.

	    Args:
	        path: Results CSV to summarise. Defaults to ``RESULTS_PATH``.

	    Raises:
	        KeyError: If an expected column is missing from the CSV.
	        ValueError: If a filled-in score or time is not a number.
	    """
	    csv_path = RESULTS_PATH if path is None else path
	    if not os.path.exists(csv_path):
	        print("No results file found. Run evaluate.py first.")
	        return

	    systems = ("baseline", "rag", "finetuned")
	    scores = {name: [] for name in systems}
	    hallucinations = {name: [] for name in systems}
	    times = {name: [] for name in systems}

	    def cell(row, column):
	        # DictReader yields None for cells missing from a short row; treat
	        # those, and whitespace-only cells, as blank.
	        return (row[column] or "").strip()

	    with open(csv_path, newline="", encoding="utf-8") as f:
	        for row in csv.DictReader(f):
	            for name in systems:
	                score = cell(row, f"{name}_correctness")
	                if not score:
	                    continue  # this system has not been graded on this row yet
	                scores[name].append(float(score))

	                label = cell(row, f"{name}_hallucination")
	                if label:
	                    hallucinations[name].append(1 if label.lower() == "yes" else 0)

	                elapsed = cell(row, f"{name}_time")
	                if elapsed:
	                    times[name].append(float(elapsed))

	    def avg(lst):
	        return round(sum(lst) / len(lst), 2) if lst else "N/A"

	    def pct(lst):
	        return f"{round(sum(lst) / len(lst) * 100)}%" if lst else "N/A"

	    def table_row(label, fmt, data):
	        # One 30-wide label followed by a 12-wide right-aligned cell per system.
	        cells = " ".join(f"{fmt(data[name]):>12}" for name in systems)
	        return f"{label:<30} {cells}"

	    print("\n===== EVALUATION SUMMARY =====")
	    print(f"{'Metric':<30} {'Baseline':>12} {'RAG':>12} {'Fine-tuned':>12}")
	    print("-" * 66)
	    print(table_row("Avg Correctness (1-5)", avg, scores))
	    print(table_row("Hallucination Rate", pct, hallucinations))
	    print(table_row("Avg Response Time (s)", avg, times))


	if __name__ == "__main__":
	    import sys

	    # Dispatch on the first CLI argument: "summary" prints the report from an
	    # existing results file; anything else (or no argument) runs the evaluation.
	    cli_args = sys.argv[1:]
	    entry_point = print_summary if cli_args[:1] == ["summary"] else run_evaluation
	    entry_point()