"""
Evaluation Dashboard
A web-based dashboard for running evaluations, viewing results, and monitoring
RAG system performance metrics in real-time.
"""
import json
import logging
import os
import sys
import time
from typing import Any, Dict, List, Optional
from flask import Blueprint, jsonify, render_template, request
# Make sibling evaluation modules importable when this file is run directly
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
logger = logging.getLogger(__name__)
# Helper to load a module from a specific filepath if it exists
def _load_module_from_path(module_name: str, path: str):
import importlib.util
if not os.path.exists(path):
return None
spec = importlib.util.spec_from_file_location(module_name, path)
if spec is None or spec.loader is None:
return None
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
return mod
# Safe fallbacks to avoid import recursion during HF deployment
def safe_run_enhanced_evaluation(*args, **kwargs):
    """Safe wrapper for enhanced evaluation that returns a fallback response."""
    return {"status": "error", "message": "Enhanced evaluation not available"}
def safe_run_eval(*args, **kwargs):
    """Safe wrapper for basic evaluation that returns a fallback response."""
    return {"status": "error", "message": "Basic evaluation not available"}
# Use the safe wrappers by default to prevent recursion
run_enhanced_evaluation = safe_run_enhanced_evaluation
run_eval = safe_run_eval
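# If the real evaluation modules are present, they could be wired in here
# (e.g. via _load_module_from_path) to replace these fallbacks.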
evaluation_bp = Blueprint("evaluation", __name__, url_prefix="/evaluation")
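# Results live in two places: timestamped runs under evaluation_results/ and the
# latest single-run files (results.json, enhanced_results.json) under evaluation/.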
RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation_results")
EVAL_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation")
def ensure_results_dir():
"""Ensure results directory exists."""
os.makedirs(RESULTS_DIR, exist_ok=True)
def load_latest_results() -> Optional[Dict[str, Any]]:
"""Load the most recent evaluation results."""
ensure_results_dir()
# Check for enhanced results first
enhanced_results_file = os.path.join(EVAL_DIR, "enhanced_results.json")
basic_results_file = os.path.join(EVAL_DIR, "results.json")
latest_file = None
latest_time = 0
for results_file in [enhanced_results_file, basic_results_file]:
if os.path.exists(results_file):
mtime = os.path.getmtime(results_file)
if mtime > latest_time:
latest_time = mtime
latest_file = results_file
if latest_file:
with open(latest_file, "r") as f:
return json.load(f)
return None
def get_evaluation_history() -> List[Dict[str, Any]]:
"""Get history of all evaluation runs."""
ensure_results_dir()
history = []
# Check evaluation_results directory for timestamped files
if os.path.exists(RESULTS_DIR):
for filename in os.listdir(RESULTS_DIR):
if filename.endswith("_results.json"):
filepath = os.path.join(RESULTS_DIR, filename)
try:
with open(filepath, "r") as f:
data = json.load(f)
history.append(
{
"filename": filename,
"timestamp": os.path.getmtime(filepath),
"summary": data.get("summary", {}),
"filepath": filepath,
}
)
                except Exception as e:
                    logger.warning(f"Error loading {filename}: {e}")
# Sort by timestamp, newest first
history.sort(key=lambda x: x["timestamp"], reverse=True)
return history
@evaluation_bp.route("/")
def dashboard():
"""Main evaluation dashboard."""
return render_template("evaluation/dashboard.html")
@evaluation_bp.route("/api/status")
def api_status():
"""API endpoint for dashboard status."""
latest_results = load_latest_results()
history = get_evaluation_history()
status = {
"has_results": latest_results is not None,
"last_evaluation": None,
"total_evaluations": len(history),
"evaluation_available": True,
}
if latest_results:
summary = latest_results.get("summary", {})
metadata = latest_results.get("metadata", {})
status.update(
{
"last_evaluation": {
"timestamp": metadata.get("evaluation_timestamp", time.time()),
"n_questions": summary.get("n_questions", 0),
"success_rate": summary.get("success_rate", 0),
"avg_latency": summary.get("avg_latency_s", 0),
"groundedness_score": summary.get("avg_groundedness_score", 0),
"citation_accuracy": summary.get("avg_citation_accuracy", 0),
}
}
)
return jsonify(status)
@evaluation_bp.route("/api/results")
def api_results():
"""API endpoint for latest evaluation results."""
results = load_latest_results()
if results:
return jsonify(results)
else:
return jsonify({"error": "No evaluation results found"}), 404
@evaluation_bp.route("/api/history")
def api_history():
"""API endpoint for evaluation history."""
history = get_evaluation_history()
return jsonify(history)
@evaluation_bp.route("/api/run-evaluation", methods=["POST"])
def api_run_evaluation():
"""API endpoint to run a new evaluation."""
try:
data = request.get_json() or {}
evaluation_type = data.get("type", "enhanced") # 'basic' or 'enhanced'
target_url = data.get("target_url")
# Set target URL if provided
if target_url:
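            # Assumes the evaluation runners read EVAL_TARGET_URL from the environment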
os.environ["EVAL_TARGET_URL"] = target_url
# Run the appropriate evaluation
if evaluation_type == "enhanced":
results = run_enhanced_evaluation()
else:
results = run_eval()
return jsonify(
{
"status": "success",
"message": f"{evaluation_type.title()} evaluation completed",
"results": results,
}
)
except Exception as e:
return (
jsonify({"status": "error", "message": f"Evaluation failed: {str(e)}"}),
500,
)
@evaluation_bp.route("/api/metrics-summary")
def api_metrics_summary():
"""API endpoint for metrics summary with trends."""
history = get_evaluation_history()
if not history:
return jsonify({"error": "No evaluation history found"}), 404
# Calculate trends over last 5 evaluations
recent_history = history[:5]
metrics = {
"latency_trend": [],
"groundedness_trend": [],
"citation_trend": [],
"success_rate_trend": [],
"timestamps": [],
}
for eval_data in reversed(recent_history): # Reverse to get chronological order
summary = eval_data.get("summary", {})
timestamp = eval_data.get("timestamp", 0)
metrics["timestamps"].append(timestamp)
metrics["latency_trend"].append(summary.get("avg_latency_s", 0))
metrics["groundedness_trend"].append(summary.get("avg_groundedness_score", 0))
metrics["citation_trend"].append(summary.get("avg_citation_accuracy", 0))
metrics["success_rate_trend"].append(summary.get("success_rate", 0))
    # Trend = average change per evaluation across the recent window
    def calc_trend(values):
        if len(values) < 2:
            return 0
        return (values[-1] - values[0]) / (len(values) - 1)
summary_metrics = {
"current_metrics": recent_history[0]["summary"] if recent_history else {},
"trends": {
"latency": calc_trend(metrics["latency_trend"]),
"groundedness": calc_trend(metrics["groundedness_trend"]),
"citation_accuracy": calc_trend(metrics["citation_trend"]),
"success_rate": calc_trend(metrics["success_rate_trend"]),
},
"historical_data": metrics,
}
return jsonify(summary_metrics)
@evaluation_bp.route("/detailed/<filename>")
def detailed_results(filename):
"""Detailed view of a specific evaluation run."""
filepath = os.path.join(RESULTS_DIR, filename)
if not os.path.exists(filepath):
# Also check main evaluation directory
filepath = os.path.join(EVAL_DIR, filename)
if not os.path.exists(filepath):
return "Evaluation results not found", 404
try:
with open(filepath, "r") as f:
results = json.load(f)
return render_template("evaluation/detailed.html", results=results, filename=filename)
except Exception as e:
return f"Error loading results: {str(e)}", 500
# Standalone Flask app for testing
if __name__ == "__main__":
from flask import Flask
app = Flask(__name__)
app.register_blueprint(evaluation_bp)
# Add templates directory
app.template_folder = os.path.join(os.path.dirname(__file__), "..", "..", "templates")
print("Starting evaluation dashboard on http://localhost:8080")
print("Available endpoints:")
print(" - GET /evaluation/ - Main dashboard")
print(" - GET /evaluation/api/status - Dashboard status")
print(" - GET /evaluation/api/results - Latest results")
print(" - POST /evaluation/api/run-evaluation - Run new evaluation")
app.run(debug=True, port=8080)