"""
Unified Evaluation Runner for RAG System
This script provides comprehensive evaluation capabilities including:
- Deterministic groundedness evaluation with reproducible scoring
- Enhanced citation accuracy validation
- Performance benchmarking and latency analysis
- Comprehensive evaluation metrics and reporting
Features:
- LLM-based groundedness evaluation (with fallback to token overlap)
- Citation accuracy checking with filename validation
- Deterministic evaluation with fixed seeds for reproducibility
- Performance tier analysis (fast/normal/slow responses)
- Comprehensive reporting with statistical analysis
"""
import json
import math
import os
import re
import statistics
import time
from difflib import SequenceMatcher
from typing import Any, Dict, List

import requests
from tqdm import tqdm
ROOT = os.path.dirname(os.path.abspath(__file__))
EVAL_DIR = ROOT  # evaluation assets live alongside this script
QUESTIONS_FILE = os.path.join(EVAL_DIR, "questions.json")
GOLD_FILE = os.path.join(EVAL_DIR, "gold_answers.json")
OUT_FILE = os.path.join(EVAL_DIR, "results.json")
EVAL_RESULTS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "evaluation_results")
os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)
TARGET_URL = os.getenv("EVAL_TARGET_URL", "https://msse-team-3-ai-engineering-project.hf.space")
CHAT_ENDPOINT = os.getenv("EVAL_CHAT_PATH", "/chat")
TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "30"))
def load_json(path: str) -> Any:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def token_overlap_score(gold: str, response: str) -> float:
"""Simple partial match score based on token overlap."""
gold_tokens = set(gold.lower().split())
resp_tokens = set(response.lower().split())
if not gold_tokens:
return 0.0
overlap = gold_tokens & resp_tokens
return len(overlap) / len(gold_tokens)
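# Illustrative example:
#   token_overlap_score("blue whale", "the blue whale is the largest animal") -> 1.0
# Splitting is plain whitespace-based, so punctuation glued to a word ("whale,")
# would not match the bare gold token ("whale").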
def citation_matches(expected: List[str], returned_sources: List[Dict[str, Any]]) -> float:
"""Fraction of expected sources that appear in returned sources by filename match."""
# If no expected sources, treat as correct only if model returned none
if not expected:
return 1.0 if not returned_sources else 0.0
    # Helper: normalize a filename or URL -> lowercase basename without common extensions
def normalize(s: str) -> str:
if not s:
return ""
s = s.strip()
# If it's a URL or path-like, take the basename
# Remove query string / fragments
s = re.sub(r"[?#].*$", "", s)
base = os.path.basename(s)
# remove common extensions
base = re.sub(r"\.(md|markdown|txt|html|htm|pdf|csv|json|yaml|yml|py|ipynb)$", "", base, flags=re.IGNORECASE)
return base.lower()
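    # e.g. normalize("https://host/docs/setup.md?ref=1") -> "setup" (hypothetical URL)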
# Build a set of normalized returned filenames from various possible keys
returned_filenames = set()
for s in returned_sources or []:
# s may be a dict containing keys like filename, source_file, file, url, path
if isinstance(s, dict):
candidates = [s.get(k) for k in ("filename", "source_file", "file", "url", "path", "source")]
# also some sources embed metadata
meta = s.get("metadata") or {}
if isinstance(meta, dict):
candidates += [meta.get(k) for k in ("filename", "file", "source_file")]
else:
# s might be a plain string
candidates = [s]
for c in candidates:
if c:
returned_filenames.add(normalize(str(c)))
# Now for each expected source, try exact normalized match, substring, or fuzzy match
matched = 0
# threshold can be tuned via environment variable
try:
env_thresh = float(os.getenv("EVAL_CITATION_FUZZY_THRESHOLD", "0.72"))
except Exception:
env_thresh = 0.72
for e in expected:
ne = normalize(str(e))
if not ne:
continue
found = False
# exact
if ne in returned_filenames:
found = True
else:
# substring match
for rf in returned_filenames:
if ne in rf or rf in ne:
found = True
break
if not found:
# fuzzy match using SequenceMatcher
best = 0.0
for rf in returned_filenames:
if not rf:
continue
score = SequenceMatcher(None, ne, rf).ratio()
if score > best:
best = score
            # treat as a match if similarity meets the fuzzy threshold (default 0.72)
if best >= env_thresh:
found = True
if found:
matched += 1
return matched / len(expected)
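# Illustrative example:
#   citation_matches(["setup.md"], [{"filename": "docs/setup.md"}]) -> 1.0
# Both sides normalize to the basename "setup" (path and extension stripped),
# so the exact normalized comparison succeeds before substring or fuzzy
# matching is needed.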
def run_eval(target: str = TARGET_URL):
questions = load_json(QUESTIONS_FILE)
golds = load_json(GOLD_FILE)
results = []
latencies = []
for q in tqdm(questions, desc="Questions"):
qid = str(q["id"])
payload = {"message": q["question"], "include_sources": True}
url = target.rstrip("/") + CHAT_ENDPOINT
start = time.time()
try:
r = requests.post(url, json=payload, timeout=TIMEOUT)
latency = time.time() - start
latencies.append(latency)
if r.status_code != 200:
results.append(
{
"id": qid,
"question": q["question"],
"status_code": r.status_code,
"error": r.text,
}
)
continue
data = r.json()
response_text = data.get("response", "")
returned_sources = data.get("sources", []) or []
gold_answer = golds.get(qid, {}).get("answer", "")
expected_sources = golds.get(qid, {}).get("expected_sources", [])
overlap = token_overlap_score(gold_answer, response_text)
citation_acc = citation_matches(expected_sources, returned_sources)
results.append(
{
"id": qid,
"question": q["question"],
"response": response_text,
"latency_s": latency,
"overlap_score": overlap,
"citation_accuracy": citation_acc,
"returned_sources": returned_sources,
}
)
except Exception as e:
latency = time.time() - start
latencies.append(latency)
results.append(
{
"id": qid,
"question": q["question"],
"status_code": "error",
"error": str(e),
}
)
    # summary metrics; latencies include failed requests so slow failures
    # (e.g. timeouts) still show up in the percentiles
    n = len(latencies)
    p50 = statistics.median(latencies) if latencies else None
    # nearest-rank p95: the ceil(0.95 * n)-th smallest observation
    p95 = sorted(latencies)[min(n - 1, math.ceil(n * 0.95) - 1)] if latencies else None
# compute averages for overlap and citation (only for successful responses)
overlaps = [r.get("overlap_score") for r in results if isinstance(r.get("overlap_score"), float)]
citations = [r.get("citation_accuracy") for r in results if isinstance(r.get("citation_accuracy"), float)]
summary = {
"target": target,
"n_questions": len(questions),
"latency_p50_s": p50,
"latency_p95_s": p95,
"avg_overlap": sum(overlaps) / len(overlaps) if overlaps else None,
"avg_citation_accuracy": sum(citations) / len(citations) if citations else None,
}
out = {"summary": summary, "results": results}
with open(OUT_FILE, "w", encoding="utf-8") as f:
json.dump(out, f, indent=2)
# Also write a compact summary copy for CI collection
try:
summary_path = os.path.join(EVAL_RESULTS_DIR, "results_summary.json")
with open(summary_path, "w", encoding="utf-8") as sf:
json.dump(summary, sf, indent=2)
except Exception:
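        # best-effort summary copy for CI collection; never fail the run over it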
pass
print("Evaluation complete. Summary:")
print(json.dumps(summary, indent=2))
print(f"Results written to {OUT_FILE}")
if __name__ == "__main__":
    # TARGET_URL already reflects EVAL_TARGET_URL, so no need to re-read the env
    run_eval(TARGET_URL)