Spaces:

msse-team-3
/

ai-engineering-project

Sleeping

ai-engineering-project / evaluation /enhanced_evaluation.py

GitHub Action

Clean deployment without binary files

f884e6e 3 months ago

13.6 kB

	"""
	Enhanced evaluation with proper groundedness checking.

	This module implements LLM-based groundedness evaluation that checks if
	generated answers are factually consistent with and fully supported by
	the retrieved evidence, going beyond simple token overlap.
	"""

	import json
	import os
	import statistics
	import time
	from typing import Any, Dict, List

	import requests
	from tqdm import tqdm

	ROOT = os.path.dirname(os.path.abspath(__file__))
	EVAL_DIR = os.path.join(ROOT)
	QUESTIONS_FILE = os.path.join(EVAL_DIR, "questions.json")
	GOLD_FILE = os.path.join(EVAL_DIR, "gold_answers.json")
	OUT_FILE = os.path.join(EVAL_DIR, "enhanced_results.json")
	EVAL_RESULTS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "evaluation_results")

	# Ensure results directory exists
	os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)

	TARGET_URL = os.getenv("EVAL_TARGET_URL", "https://msse-team-3-ai-engineering-project.hf.space")
	CHAT_ENDPOINT = os.getenv("EVAL_CHAT_PATH", "/chat")
	TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "30"))

	# LLM API for groundedness evaluation
	OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
	GROUNDEDNESS_MODEL = "microsoft/wizardlm-2-8x22b"


	def load_json(path: str) -> Any:
	with open(path, "r", encoding="utf-8") as f:
	return json.load(f)


	def evaluate_groundedness_llm(generated_answer: str, retrieved_context: List[str]) -> Dict[str, Any]:
	"""
	Use LLM to evaluate if the generated answer is grounded in the retrieved context.

	Args:
	generated_answer: The generated response text
	retrieved_context: List of retrieved document excerpts

	Returns:
	Dictionary with groundedness score and explanation
	"""
	if not OPENROUTER_API_KEY:
	# Fallback to token overlap if no API key
	return {
	"grounded": True,
	"confidence": 0.5,
	"explanation": "Using fallback token overlap method - no OpenRouter API key available",
	"method": "token_overlap_fallback",
	}

	# Create context from retrieved documents
	context_text = "\n\n".join([f"Document {i+1}: {ctx}" for i, ctx in enumerate(retrieved_context)])

	# Groundedness evaluation prompt
	prompt = f"""You are an expert evaluator tasked with determining if a generated answer is
	factually grounded in the provided context.

	CONTEXT (Retrieved Documents):
	{context_text}

	GENERATED ANSWER:
	{generated_answer}

	TASK:
	Evaluate whether the generated answer is:
	1. FACTUALLY CONSISTENT with the context (no contradictions)
	2. FULLY SUPPORTED by the context (all claims can be verified)
	3. NOT HALLUCINATED (no information absent from context)

	Respond with a JSON object containing:
	- "grounded": boolean (true if fully grounded, false otherwise)
	- "confidence": float 0-1 (how confident you are in this assessment)
	- "explanation": string (detailed reasoning for your assessment)
	- "unsupported_claims": list of strings (any claims not supported by context)

	Be strict: if ANY part of the answer contains information not present in or
	contradicted by the context, mark as false."""

	try:
	response = requests.post(
	"https://openrouter.ai/api/v1/chat/completions",
	headers={
	"Authorization": f"Bearer {OPENROUTER_API_KEY}",
	"Content-Type": "application/json",
	},
	json={
	"model": GROUNDEDNESS_MODEL,
	"messages": [{"role": "user", "content": prompt}],
	"temperature": 0.1,
	"max_tokens": 500,
	},
	timeout=30,
	)

	if response.status_code == 200:
	result = response.json()
	content = result["choices"][0]["message"]["content"]

	# Try to parse JSON response
	try:
	evaluation = json.loads(content)
	evaluation["method"] = "llm_evaluation"
	return evaluation
	except json.JSONDecodeError:
	# Fallback if LLM didn't return valid JSON
	is_grounded = "true" in content.lower() and "grounded" in content.lower()
	return {
	"grounded": is_grounded,
	"confidence": 0.7,
	"explanation": f"LLM evaluation (non-JSON): {content[:200]}...",
	"method": "llm_evaluation_parsed",
	}
	else:
	# API error fallback
	return {
	"grounded": True,
	"confidence": 0.3,
	"explanation": f"API error {response.status_code}, using neutral assessment",
	"method": "api_error_fallback",
	}

	except Exception as e:
	# Exception fallback
	return {
	"grounded": True,
	"confidence": 0.3,
	"explanation": f"Evaluation error: {str(e)}, using neutral assessment",
	"method": "exception_fallback",
	}


	def evaluate_citation_accuracy_enhanced(
	expected_sources: List[str],
	returned_sources: List[Dict[str, Any]],
	generated_answer: str,
	) -> Dict[str, Any]:
	"""
	Enhanced citation accuracy that considers both source presence and relevance.

	Args:
	expected_sources: List of expected source filenames
	returned_sources: List of returned source dictionaries
	generated_answer: The generated response text

	Returns:
	Dictionary with citation accuracy metrics
	"""
	if not expected_sources:
	return {
	"citation_accuracy": 1.0 if not returned_sources else 0.0,
	"expected_count": 0,
	"returned_count": len(returned_sources),
	"correctly_cited": 0,
	"method": "exact_match",
	}

	# Extract returned filenames
	returned_filenames = {
	s.get("document") or s.get("filename") or s.get("source_file") or s.get("file") for s in returned_sources
	}
	returned_filenames = {f for f in returned_filenames if f}

	# Count correct citations
	correctly_cited = 0
	for expected in expected_sources:
	if expected in returned_filenames:
	correctly_cited += 1

	citation_accuracy = correctly_cited / len(expected_sources) if expected_sources else 0.0

	return {
	"citation_accuracy": citation_accuracy,
	"expected_count": len(expected_sources),
	"returned_count": len(returned_filenames),
	"correctly_cited": correctly_cited,
	"expected_sources": expected_sources,
	"returned_sources": list(returned_filenames),
	"method": "exact_match",
	}


	def token_overlap_score(gold: str, response: str) -> float:
	"""Simple partial match score based on token overlap."""
	gold_tokens = set(gold.lower().split())
	resp_tokens = set(response.lower().split())
	if not gold_tokens:
	return 0.0
	overlap = gold_tokens & resp_tokens
	return len(overlap) / len(gold_tokens)


	def run_enhanced_evaluation(target: str = TARGET_URL):
	"""Run enhanced evaluation with proper groundedness checking."""
	questions = load_json(QUESTIONS_FILE)
	golds = load_json(GOLD_FILE)

	results = []
	latencies = []
	groundedness_scores = []
	citation_accuracies = []

	print(f"Running enhanced evaluation against {target}")
	print(f"Using groundedness evaluation: {'LLM-based' if OPENROUTER_API_KEY else 'Token overlap fallback'}")

	for q in tqdm(questions, desc="Enhanced Evaluation"):
	qid = str(q["id"])
	payload = {"message": q["question"], "include_sources": True}
	url = target.rstrip("/") + CHAT_ENDPOINT
	start = time.time()

	try:
	# Add progress info
	print(f"\nEvaluating question {qid}: {q['question'][:50]}...")
	r = requests.post(url, json=payload, timeout=TIMEOUT)
	latency = time.time() - start
	latencies.append(latency)
	print(f"Response received in {latency:.2f}s")

	if r.status_code != 200:
	results.append(
	{
	"id": qid,
	"question": q["question"],
	"status_code": r.status_code,
	"error": r.text,
	"latency_s": latency,
	}
	)
	continue

	data = r.json()
	response_text = data.get("response", "")
	returned_sources = data.get("sources", []) or []

	gold_answer = golds.get(qid, {}).get("answer", "")
	expected_sources = golds.get(qid, {}).get("expected_sources", [])

	# Enhanced groundedness evaluation
	context_excerpts = [s.get("excerpt", "") for s in returned_sources if s.get("excerpt")]
	groundedness_eval = evaluate_groundedness_llm(response_text, context_excerpts)

	# Enhanced citation accuracy
	citation_eval = evaluate_citation_accuracy_enhanced(expected_sources, returned_sources, response_text)

	# Traditional token overlap for comparison
	overlap_score = token_overlap_score(gold_answer, response_text)

	# Store comprehensive results
	result = {
	"id": qid,
	"question": q["question"],
	"response": response_text,
	"latency_s": latency,
	# Enhanced groundedness metrics
	"groundedness": groundedness_eval,
	# Enhanced citation metrics
	"citation_evaluation": citation_eval,
	# Traditional metrics for comparison
	"overlap_score": overlap_score,
	"citation_accuracy": citation_eval["citation_accuracy"],
	# Source information
	"returned_sources": returned_sources,
	"expected_sources": expected_sources,
	"gold_answer": gold_answer,
	}

	results.append(result)

	# Track metrics for summary
	if groundedness_eval.get("grounded") is not None:
	groundedness_scores.append(1.0 if groundedness_eval["grounded"] else 0.0)
	citation_accuracies.append(citation_eval["citation_accuracy"])

	except Exception as e:
	latency = time.time() - start
	latencies.append(latency)
	results.append(
	{
	"id": qid,
	"question": q["question"],
	"status_code": "error",
	"error": str(e),
	"latency_s": latency,
	}
	)

	# Calculate summary metrics
	success_latencies = [lat for lat in latencies if lat is not None]
	p50 = statistics.median(success_latencies) if success_latencies else None
	p95 = sorted(success_latencies)[max(0, int(len(success_latencies) * 0.95) - 1)] if success_latencies else None

	# Enhanced summary metrics
	avg_groundedness = sum(groundedness_scores) / len(groundedness_scores) if groundedness_scores else None
	avg_citation_accuracy = sum(citation_accuracies) / len(citation_accuracies) if citation_accuracies else None

	# Count successful evaluations
	successful_evals = len([r for r in results if r.get("groundedness") is not None])
	total_questions = len(questions)

	summary = {
	"target": target,
	"evaluation_method": "enhanced_llm_based",
	"n_questions": total_questions,
	"successful_evaluations": successful_evals,
	"success_rate": (successful_evals / total_questions if total_questions > 0 else 0),
	# Performance metrics
	"latency_p50_s": p50,
	"latency_p95_s": p95,
	"avg_latency_s": (sum(success_latencies) / len(success_latencies) if success_latencies else None),
	# Quality metrics (enhanced)
	"avg_groundedness_score": avg_groundedness,
	"avg_citation_accuracy": avg_citation_accuracy,
	"groundedness_method": ("llm_evaluation" if OPENROUTER_API_KEY else "token_overlap_fallback"),
	# Additional insights
	"grounded_responses": sum(groundedness_scores),
	"ungrounded_responses": (len(groundedness_scores) - sum(groundedness_scores) if groundedness_scores else 0),
	"perfect_citations": len([c for c in citation_accuracies if c == 1.0]),
	"no_citations": len([c for c in citation_accuracies if c == 0.0]),
	}

	# Save enhanced results
	output = {
	"summary": summary,
	"results": results,
	"metadata": {
	"evaluation_timestamp": time.time(),
	"evaluation_version": "enhanced_v1.0",
	"groundedness_model": (GROUNDEDNESS_MODEL if OPENROUTER_API_KEY else "token_overlap"),
	"target_endpoint": target + CHAT_ENDPOINT,
	},
	}

	# Save to evaluation directory and a centralized evaluation_results folder
	with open(OUT_FILE, "w", encoding="utf-8") as f:
	json.dump(output, f, indent=2)

	# Also write a copy into evaluation_results for CI aggregation
	try:
	out_summary_path = os.path.join(EVAL_RESULTS_DIR, "enhanced_results_summary.json")
	with open(out_summary_path, "w", encoding="utf-8") as f2:
	json.dump(output["summary"], f2, indent=2)
	except Exception:
	pass

	print("\nEnhanced Evaluation Complete!")
	print("=" * 50)
	print(json.dumps(summary, indent=2))
	print(f"\nDetailed results saved to {OUT_FILE}")

	return output


	if __name__ == "__main__":
	target = os.getenv("EVAL_TARGET_URL", TARGET_URL)
	run_enhanced_evaluation(target)