CodeSage / evaluate.py
Aditya
Add LLM vs RAG vs Fine-Tuning comparison project
4a3f117
Raw
History Blame Contribute Delete
5.2 kB
"""
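Runs all 3 systems on the test questions and saves the results to results/evaluation_results.csv.
After running, open the CSV and manually fill in, for each system (baseline, rag, finetuned):
- <system>_correctness (1-5): 1=wrong, 3=partial, 5=perfect
- <system>_hallucination (Yes/No): Yes if the answer contains false information
Then run `python evaluate.py summary` to print the averages of the filled-in rows.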
"""
import csv
import os
import time
from system1_baseline import ask_baseline
from system2_rag import ask_rag, load_vectorstore, build_vectorstore
# CSV of test questions; load_questions() reads its "question" column.
QUESTIONS_PATH: str = "data/questions.csv"
# Output CSV written by run_evaluation() and read back by print_summary().
RESULTS_PATH: str = "results/evaluation_results.csv"
# Maximum number of questions taken from QUESTIONS_PATH per run.
NUM_QUESTIONS: int = 30 # increase to 50 when ready
def load_questions(path=None, limit=None):
    """Load the evaluation questions from a CSV file.

    Args:
        path: CSV file to read; its header row must contain a ``question``
            column. Defaults to ``QUESTIONS_PATH``.
        limit: Maximum number of questions to return (applied as a list
            slice). Defaults to ``NUM_QUESTIONS``.

    Returns:
        The values of the ``question`` column in file order, truncated to
        ``limit`` entries.

    Raises:
        KeyError: If a data row has no ``question`` column.
    """
    path = QUESTIONS_PATH if path is None else path
    limit = NUM_QUESTIONS if limit is None else limit
    # "utf-8-sig" reads plain UTF-8 too, but also drops a leading BOM so a
    # spreadsheet-exported file does not end up with a "\ufeffquestion" header.
    # newline="" is what the csv module expects so that line breaks inside
    # quoted fields are parsed by the reader, not by the text layer.
    with open(path, newline="", encoding="utf-8-sig") as f:
        questions = [row["question"] for row in csv.DictReader(f)]
    return questions[:limit]
# Column order of the results CSV. The *_correctness / *_hallucination
# columns are written empty and filled in by hand afterwards.
_RESULT_FIELDS = (
    "id",
    "question",
    "baseline_answer",
    "baseline_time",
    "baseline_correctness",
    "baseline_hallucination",
    "rag_answer",
    "rag_time",
    "rag_correctness",
    "rag_hallucination",
    "finetuned_answer",
    "finetuned_time",
    "finetuned_correctness",
    "finetuned_hallucination",
)


def _get_vectorstore():
    """Return the RAG vectorstore, building it when no index exists on disk."""
    if not os.path.exists("data/faiss_index"):
        print("Building FAISS index first...")
        return build_vectorstore()
    print("Loading existing FAISS index...")
    return load_vectorstore()


def _import_finetuned():
    """Return ``ask_finetuned`` from ``system3_inference``, or ``None`` if the import fails."""
    try:
        from system3_inference import ask_finetuned
    except Exception as exc:
        # Deliberately best-effort: System 3 is optional until the model is
        # ready, so any import-time failure just switches to placeholders.
        print(f"System 3 unavailable ({type(exc).__name__}: {exc}); using placeholder answers.")
        return None
    return ask_finetuned


def _ask_finetuned_or_placeholder(ask_finetuned, question):
    """Query System 3, falling back to the placeholder result on any failure."""
    placeholder = {"answer": "Model not available yet.", "response_time": 0}
    if ask_finetuned is None:
        return placeholder
    try:
        return ask_finetuned(question)
    except Exception as exc:
        # Keep the run going, but say why this answer is a placeholder.
        print(f"  Fine-tuned model failed ({type(exc).__name__}: {exc}); recording placeholder.")
        return placeholder


def _result_row(index, question, r1, r2, r3):
    """Build one CSV row from the three systems' result dicts."""
    return {
        "id": index,
        "question": question,
        # Baseline
        "baseline_answer": r1["answer"],
        "baseline_time": r1["response_time"],
        "baseline_correctness": "",  # Fill manually: 1-5
        "baseline_hallucination": "",  # Fill manually: Yes/No
        # RAG
        "rag_answer": r2["answer"],
        "rag_time": r2["response_time"],
        "rag_correctness": "",
        "rag_hallucination": "",
        # Fine-tuned
        "finetuned_answer": r3["answer"],
        "finetuned_time": r3["response_time"],
        "finetuned_correctness": "",
        "finetuned_hallucination": "",
    }


def run_evaluation():
    """Run every question through all three systems and write the results CSV.

    Each question is sent to the baseline, the RAG system and, when
    ``system3_inference`` can be imported, the fine-tuned model. One row per
    question is written to ``RESULTS_PATH`` with the manual-review columns
    left empty. The file is written (header only) even when no questions
    were loaded.
    """
    # Create whatever directory RESULTS_PATH points into.
    results_dir = os.path.dirname(RESULTS_PATH)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    vs = _get_vectorstore()
    questions = load_questions()
    print(f"\nEvaluating {len(questions)} questions across 3 systems...\n")

    # Resolve System 3 once instead of re-attempting the import per question.
    ask_finetuned = _import_finetuned()

    results = []
    for i, q in enumerate(questions, 1):
        print(f"[{i}/{len(questions)}] {q}")
        r1 = ask_baseline(q)
        time.sleep(0.5)  # avoid rate limiting
        r2 = ask_rag(q, vs)
        time.sleep(0.5)
        r3 = _ask_finetuned_or_placeholder(ask_finetuned, q)
        results.append(_result_row(i, q, r1, r2, r3))
        print(f" Baseline ({r1['response_time']}s): {r1['answer'][:70]}...")
        print(f" RAG ({r2['response_time']}s): {r2['answer'][:70]}...")
        print()

    # Save CSV. Fixed field names mean an empty run still yields a valid file
    # instead of failing on results[0].
    with open(RESULTS_PATH, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=_RESULT_FIELDS)
        writer.writeheader()
        writer.writerows(results)
    print(f"Saved {len(results)} results to {RESULTS_PATH}")
    print("\nNext step: open the CSV and fill in 'correctness' (1-5) and 'hallucination' (Yes/No) columns manually.")
def _parse_float(text):
    """Return ``text`` as a float, or ``None`` when it is blank or not numeric."""
    text = (text or "").strip()
    if not text:
        return None
    try:
        return float(text)
    except ValueError:
        return None


def _collect_system_metrics(rows, prefix):
    """Gather the manually scored rows of one system.

    Args:
        rows: Dict rows read from the results CSV.
        prefix: Column prefix of the system (e.g. ``"baseline"``).

    Returns:
        A ``(scores, hallucination_flags, times)`` tuple of lists. Only rows
        whose ``<prefix>_correctness`` cell holds a number contribute.
    """
    scores, flags, times = [], [], []
    for row in rows:
        raw_score = row.get(f"{prefix}_correctness")
        score = _parse_float(raw_score)
        if score is None:
            # Blank means "not reviewed yet"; anything else is a typo worth
            # reporting rather than crashing the whole summary.
            if (raw_score or "").strip():
                print(f"Warning: ignoring non-numeric {prefix}_correctness value {raw_score!r}")
            continue
        scores.append(score)
        verdict = (row.get(f"{prefix}_hallucination") or "").strip().lower()
        flags.append(1 if verdict == "yes" else 0)
        # A missing or unreadable time is left out instead of counted as 0s.
        elapsed = _parse_float(row.get(f"{prefix}_time"))
        if elapsed is not None:
            times.append(elapsed)
    return scores, flags, times


def print_summary(path=None):
    """Print average scores after you have manually filled in the CSV.

    Args:
        path: Results CSV to summarise. Defaults to ``RESULTS_PATH``.

    For each system the table shows the mean correctness, the share of
    reviewed rows marked ``Yes`` for hallucination, and the mean response
    time. A metric with no usable rows is shown as ``N/A``.
    """
    path = RESULTS_PATH if path is None else path
    if not os.path.exists(path):
        print("No results file found. Run evaluate.py first.")
        return

    # "utf-8-sig" tolerates the BOM some spreadsheet tools add when the
    # hand-edited file is saved back as CSV.
    with open(path, newline="", encoding="utf-8-sig") as f:
        rows = list(csv.DictReader(f))

    def avg(lst):
        return round(sum(lst) / len(lst), 2) if lst else "N/A"

    def pct(lst):
        return f"{round(sum(lst) / len(lst) * 100)}%" if lst else "N/A"

    def table_row(label, cells):
        return f"{label:<30} " + " ".join(f"{cell:>12}" for cell in cells)

    per_system = [
        _collect_system_metrics(rows, prefix)
        for prefix in ("baseline", "rag", "finetuned")
    ]
    scores, flags, times = zip(*per_system)

    print("\n===== EVALUATION SUMMARY =====")
    print(table_row("Metric", ("Baseline", "RAG", "Fine-tuned")))
    print("-" * 66)
    print(table_row("Avg Correctness (1-5)", [avg(s) for s in scores]))
    print(table_row("Hallucination Rate", [pct(h) for h in flags]))
    print(table_row("Avg Response Time (s)", [avg(t) for t in times]))
if __name__ == "__main__":
    import sys

    # `python evaluate.py summary` prints the averages of an already scored
    # CSV; any other invocation runs the full evaluation.
    wants_summary = sys.argv[1:2] == ["summary"]
    if not wants_summary:
        run_evaluation()
    else:
        print_summary()