ai-engineering-project / evaluation /enhanced_evaluation.py
GitHub Action
Clean deployment without binary files
f884e6e
"""
Enhanced evaluation with proper groundedness checking.
This module implements LLM-based groundedness evaluation that checks if
generated answers are factually consistent with and fully supported by
the retrieved evidence, going beyond simple token overlap.
"""
import json
import os
import statistics
import time
from typing import Any, Dict, List
import requests
from tqdm import tqdm
ROOT = os.path.dirname(os.path.abspath(__file__))
EVAL_DIR = os.path.join(ROOT)
QUESTIONS_FILE = os.path.join(EVAL_DIR, "questions.json")
GOLD_FILE = os.path.join(EVAL_DIR, "gold_answers.json")
OUT_FILE = os.path.join(EVAL_DIR, "enhanced_results.json")
EVAL_RESULTS_DIR = os.path.join(os.path.dirname(EVAL_DIR), "evaluation_results")
# Ensure results directory exists
os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)
TARGET_URL = os.getenv("EVAL_TARGET_URL", "https://msse-team-3-ai-engineering-project.hf.space")
CHAT_ENDPOINT = os.getenv("EVAL_CHAT_PATH", "/chat")
TIMEOUT = int(os.getenv("EVAL_TIMEOUT", "30"))
# LLM API for groundedness evaluation
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
GROUNDEDNESS_MODEL = "microsoft/wizardlm-2-8x22b"
def load_json(path: str) -> Any:
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def evaluate_groundedness_llm(generated_answer: str, retrieved_context: List[str]) -> Dict[str, Any]:
"""
Use LLM to evaluate if the generated answer is grounded in the retrieved context.
Args:
generated_answer: The generated response text
retrieved_context: List of retrieved document excerpts
Returns:
Dictionary with groundedness score and explanation
"""
if not OPENROUTER_API_KEY:
# Fallback to token overlap if no API key
return {
"grounded": True,
"confidence": 0.5,
"explanation": "Using fallback token overlap method - no OpenRouter API key available",
"method": "token_overlap_fallback",
}
# Create context from retrieved documents
context_text = "\n\n".join([f"Document {i+1}: {ctx}" for i, ctx in enumerate(retrieved_context)])
# Groundedness evaluation prompt
prompt = f"""You are an expert evaluator tasked with determining if a generated answer is
factually grounded in the provided context.
CONTEXT (Retrieved Documents):
{context_text}
GENERATED ANSWER:
{generated_answer}
TASK:
Evaluate whether the generated answer is:
1. FACTUALLY CONSISTENT with the context (no contradictions)
2. FULLY SUPPORTED by the context (all claims can be verified)
3. NOT HALLUCINATED (no information absent from context)
Respond with a JSON object containing:
- "grounded": boolean (true if fully grounded, false otherwise)
- "confidence": float 0-1 (how confident you are in this assessment)
- "explanation": string (detailed reasoning for your assessment)
- "unsupported_claims": list of strings (any claims not supported by context)
Be strict: if ANY part of the answer contains information not present in or
contradicted by the context, mark as false."""
try:
response = requests.post(
"https://openrouter.ai/api/v1/chat/completions",
headers={
"Authorization": f"Bearer {OPENROUTER_API_KEY}",
"Content-Type": "application/json",
},
json={
"model": GROUNDEDNESS_MODEL,
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.1,
"max_tokens": 500,
},
timeout=30,
)
if response.status_code == 200:
result = response.json()
content = result["choices"][0]["message"]["content"]
# Try to parse JSON response
try:
evaluation = json.loads(content)
evaluation["method"] = "llm_evaluation"
return evaluation
except json.JSONDecodeError:
# Fallback if LLM didn't return valid JSON
is_grounded = "true" in content.lower() and "grounded" in content.lower()
return {
"grounded": is_grounded,
"confidence": 0.7,
"explanation": f"LLM evaluation (non-JSON): {content[:200]}...",
"method": "llm_evaluation_parsed",
}
else:
# API error fallback
return {
"grounded": True,
"confidence": 0.3,
"explanation": f"API error {response.status_code}, using neutral assessment",
"method": "api_error_fallback",
}
except Exception as e:
# Exception fallback
return {
"grounded": True,
"confidence": 0.3,
"explanation": f"Evaluation error: {str(e)}, using neutral assessment",
"method": "exception_fallback",
}
def evaluate_citation_accuracy_enhanced(
expected_sources: List[str],
returned_sources: List[Dict[str, Any]],
generated_answer: str,
) -> Dict[str, Any]:
"""
Enhanced citation accuracy that considers both source presence and relevance.
Args:
expected_sources: List of expected source filenames
returned_sources: List of returned source dictionaries
generated_answer: The generated response text
Returns:
Dictionary with citation accuracy metrics
"""
if not expected_sources:
return {
"citation_accuracy": 1.0 if not returned_sources else 0.0,
"expected_count": 0,
"returned_count": len(returned_sources),
"correctly_cited": 0,
"method": "exact_match",
}
# Extract returned filenames
returned_filenames = {
s.get("document") or s.get("filename") or s.get("source_file") or s.get("file") for s in returned_sources
}
returned_filenames = {f for f in returned_filenames if f}
# Count correct citations
correctly_cited = 0
for expected in expected_sources:
if expected in returned_filenames:
correctly_cited += 1
citation_accuracy = correctly_cited / len(expected_sources) if expected_sources else 0.0
return {
"citation_accuracy": citation_accuracy,
"expected_count": len(expected_sources),
"returned_count": len(returned_filenames),
"correctly_cited": correctly_cited,
"expected_sources": expected_sources,
"returned_sources": list(returned_filenames),
"method": "exact_match",
}
def token_overlap_score(gold: str, response: str) -> float:
"""Simple partial match score based on token overlap."""
gold_tokens = set(gold.lower().split())
resp_tokens = set(response.lower().split())
if not gold_tokens:
return 0.0
overlap = gold_tokens & resp_tokens
return len(overlap) / len(gold_tokens)
def run_enhanced_evaluation(target: str = TARGET_URL):
"""Run enhanced evaluation with proper groundedness checking."""
questions = load_json(QUESTIONS_FILE)
golds = load_json(GOLD_FILE)
results = []
latencies = []
groundedness_scores = []
citation_accuracies = []
print(f"Running enhanced evaluation against {target}")
print(f"Using groundedness evaluation: {'LLM-based' if OPENROUTER_API_KEY else 'Token overlap fallback'}")
for q in tqdm(questions, desc="Enhanced Evaluation"):
qid = str(q["id"])
payload = {"message": q["question"], "include_sources": True}
url = target.rstrip("/") + CHAT_ENDPOINT
start = time.time()
try:
# Add progress info
print(f"\nEvaluating question {qid}: {q['question'][:50]}...")
r = requests.post(url, json=payload, timeout=TIMEOUT)
latency = time.time() - start
latencies.append(latency)
print(f"Response received in {latency:.2f}s")
if r.status_code != 200:
results.append(
{
"id": qid,
"question": q["question"],
"status_code": r.status_code,
"error": r.text,
"latency_s": latency,
}
)
continue
data = r.json()
response_text = data.get("response", "")
returned_sources = data.get("sources", []) or []
gold_answer = golds.get(qid, {}).get("answer", "")
expected_sources = golds.get(qid, {}).get("expected_sources", [])
# Enhanced groundedness evaluation
context_excerpts = [s.get("excerpt", "") for s in returned_sources if s.get("excerpt")]
groundedness_eval = evaluate_groundedness_llm(response_text, context_excerpts)
# Enhanced citation accuracy
citation_eval = evaluate_citation_accuracy_enhanced(expected_sources, returned_sources, response_text)
# Traditional token overlap for comparison
overlap_score = token_overlap_score(gold_answer, response_text)
# Store comprehensive results
result = {
"id": qid,
"question": q["question"],
"response": response_text,
"latency_s": latency,
# Enhanced groundedness metrics
"groundedness": groundedness_eval,
# Enhanced citation metrics
"citation_evaluation": citation_eval,
# Traditional metrics for comparison
"overlap_score": overlap_score,
"citation_accuracy": citation_eval["citation_accuracy"],
# Source information
"returned_sources": returned_sources,
"expected_sources": expected_sources,
"gold_answer": gold_answer,
}
results.append(result)
# Track metrics for summary
if groundedness_eval.get("grounded") is not None:
groundedness_scores.append(1.0 if groundedness_eval["grounded"] else 0.0)
citation_accuracies.append(citation_eval["citation_accuracy"])
except Exception as e:
latency = time.time() - start
latencies.append(latency)
results.append(
{
"id": qid,
"question": q["question"],
"status_code": "error",
"error": str(e),
"latency_s": latency,
}
)
# Calculate summary metrics
success_latencies = [lat for lat in latencies if lat is not None]
p50 = statistics.median(success_latencies) if success_latencies else None
p95 = sorted(success_latencies)[max(0, int(len(success_latencies) * 0.95) - 1)] if success_latencies else None
# Enhanced summary metrics
avg_groundedness = sum(groundedness_scores) / len(groundedness_scores) if groundedness_scores else None
avg_citation_accuracy = sum(citation_accuracies) / len(citation_accuracies) if citation_accuracies else None
# Count successful evaluations
successful_evals = len([r for r in results if r.get("groundedness") is not None])
total_questions = len(questions)
summary = {
"target": target,
"evaluation_method": "enhanced_llm_based",
"n_questions": total_questions,
"successful_evaluations": successful_evals,
"success_rate": (successful_evals / total_questions if total_questions > 0 else 0),
# Performance metrics
"latency_p50_s": p50,
"latency_p95_s": p95,
"avg_latency_s": (sum(success_latencies) / len(success_latencies) if success_latencies else None),
# Quality metrics (enhanced)
"avg_groundedness_score": avg_groundedness,
"avg_citation_accuracy": avg_citation_accuracy,
"groundedness_method": ("llm_evaluation" if OPENROUTER_API_KEY else "token_overlap_fallback"),
# Additional insights
"grounded_responses": sum(groundedness_scores),
"ungrounded_responses": (len(groundedness_scores) - sum(groundedness_scores) if groundedness_scores else 0),
"perfect_citations": len([c for c in citation_accuracies if c == 1.0]),
"no_citations": len([c for c in citation_accuracies if c == 0.0]),
}
# Save enhanced results
output = {
"summary": summary,
"results": results,
"metadata": {
"evaluation_timestamp": time.time(),
"evaluation_version": "enhanced_v1.0",
"groundedness_model": (GROUNDEDNESS_MODEL if OPENROUTER_API_KEY else "token_overlap"),
"target_endpoint": target + CHAT_ENDPOINT,
},
}
# Save to evaluation directory and a centralized evaluation_results folder
with open(OUT_FILE, "w", encoding="utf-8") as f:
json.dump(output, f, indent=2)
# Also write a copy into evaluation_results for CI aggregation
try:
out_summary_path = os.path.join(EVAL_RESULTS_DIR, "enhanced_results_summary.json")
with open(out_summary_path, "w", encoding="utf-8") as f2:
json.dump(output["summary"], f2, indent=2)
except Exception:
pass
print("\nEnhanced Evaluation Complete!")
print("=" * 50)
print(json.dumps(summary, indent=2))
print(f"\nDetailed results saved to {OUT_FILE}")
return output
if __name__ == "__main__":
target = os.getenv("EVAL_TARGET_URL", TARGET_URL)
run_enhanced_evaluation(target)