"""
Evaluation Dashboard
A web-based dashboard for running evaluations, viewing results, and monitoring
RAG system performance metrics in real-time.
"""
import json
import logging
import os
import sys
import time
from typing import Any, Dict, List, Optional

from flask import Blueprint, jsonify, render_template, request

# Make sibling evaluation modules importable (e.g. when run standalone)
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

logger = logging.getLogger(__name__)

# Helper to load a module from a specific filepath if it exists
def _load_module_from_path(module_name: str, path: str):
    import importlib.util

    if not os.path.exists(path):
        return None
    spec = importlib.util.spec_from_file_location(module_name, path)
    if spec is None or spec.loader is None:
        return None
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod

# Safe fallbacks to avoid recursive imports during HF deployment
def safe_run_enhanced_evaluation(*args, **kwargs):
    """Safe wrapper for enhanced evaluation that returns a fallback response."""
    return {"status": "error", "message": "Enhanced evaluation not available"}


def safe_run_eval(*args, **kwargs):
    """Safe wrapper for basic evaluation that returns a fallback response."""
    return {"status": "error", "message": "Basic evaluation not available"}


# Use the safe wrappers by default to prevent recursion
run_enhanced_evaluation = safe_run_enhanced_evaluation
run_eval = safe_run_eval

evaluation_bp = Blueprint("evaluation", __name__, url_prefix="/evaluation")
RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation_results")
EVAL_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation")
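
# Optionally wire in the real evaluation entry points using the
# _load_module_from_path helper above. This is a sketch, not confirmed by this
# file: the module filenames ("enhanced_evaluation.py", "evaluate.py") and the
# attribute names looked up on them are assumptions about the evaluation
# package layout. On any failure the safe fallbacks above stay in place.
try:
    _enhanced_mod = _load_module_from_path(
        "enhanced_evaluation", os.path.join(EVAL_DIR, "enhanced_evaluation.py")
    )
    if _enhanced_mod is not None and hasattr(_enhanced_mod, "run_enhanced_evaluation"):
        run_enhanced_evaluation = _enhanced_mod.run_enhanced_evaluation

    _basic_mod = _load_module_from_path("evaluate", os.path.join(EVAL_DIR, "evaluate.py"))
    if _basic_mod is not None and hasattr(_basic_mod, "run_eval"):
        run_eval = _basic_mod.run_eval
except Exception as e:
    logger.warning("Falling back to stub evaluations: %s", e)
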
def ensure_results_dir():
    """Ensure the results directory exists."""
    os.makedirs(RESULTS_DIR, exist_ok=True)

def load_latest_results() -> Optional[Dict[str, Any]]:
    """Load the most recent evaluation results."""
    ensure_results_dir()

    # Consider both enhanced and basic results; use whichever is newest
    enhanced_results_file = os.path.join(EVAL_DIR, "enhanced_results.json")
    basic_results_file = os.path.join(EVAL_DIR, "results.json")

    latest_file = None
    latest_time = 0
    for results_file in [enhanced_results_file, basic_results_file]:
        if os.path.exists(results_file):
            mtime = os.path.getmtime(results_file)
            if mtime > latest_time:
                latest_time = mtime
                latest_file = results_file

    if latest_file:
        with open(latest_file, "r") as f:
            return json.load(f)
    return None

def get_evaluation_history() -> List[Dict[str, Any]]:
    """Get the history of all evaluation runs."""
    ensure_results_dir()
    history = []

    # Scan the results directory for timestamped result files
    if os.path.exists(RESULTS_DIR):
        for filename in os.listdir(RESULTS_DIR):
            if filename.endswith("_results.json"):
                filepath = os.path.join(RESULTS_DIR, filename)
                try:
                    with open(filepath, "r") as f:
                        data = json.load(f)
                    history.append(
                        {
                            "filename": filename,
                            "timestamp": os.path.getmtime(filepath),
                            "summary": data.get("summary", {}),
                            "filepath": filepath,
                        }
                    )
                except Exception as e:
                    logger.warning("Error loading %s: %s", filename, e)

    # Sort by timestamp, newest first
    history.sort(key=lambda x: x["timestamp"], reverse=True)
    return history

@evaluation_bp.route("/")
def dashboard():
"""Main evaluation dashboard."""
return render_template("evaluation/dashboard.html")
@evaluation_bp.route("/api/status")
def api_status():
"""API endpoint for dashboard status."""
latest_results = load_latest_results()
history = get_evaluation_history()
status = {
"has_results": latest_results is not None,
"last_evaluation": None,
"total_evaluations": len(history),
"evaluation_available": True,
}
if latest_results:
summary = latest_results.get("summary", {})
metadata = latest_results.get("metadata", {})
status.update(
{
"last_evaluation": {
"timestamp": metadata.get("evaluation_timestamp", time.time()),
"n_questions": summary.get("n_questions", 0),
"success_rate": summary.get("success_rate", 0),
"avg_latency": summary.get("avg_latency_s", 0),
"groundedness_score": summary.get("avg_groundedness_score", 0),
"citation_accuracy": summary.get("avg_citation_accuracy", 0),
}
}
)
return jsonify(status)
@evaluation_bp.route("/api/results")
def api_results():
"""API endpoint for latest evaluation results."""
results = load_latest_results()
if results:
return jsonify(results)
else:
return jsonify({"error": "No evaluation results found"}), 404
@evaluation_bp.route("/api/history")
def api_history():
"""API endpoint for evaluation history."""
history = get_evaluation_history()
return jsonify(history)
@evaluation_bp.route("/api/run-evaluation", methods=["POST"])
def api_run_evaluation():
"""API endpoint to run a new evaluation."""
try:
data = request.get_json() or {}
evaluation_type = data.get("type", "enhanced") # 'basic' or 'enhanced'
target_url = data.get("target_url")
# Set target URL if provided
if target_url:
os.environ["EVAL_TARGET_URL"] = target_url
# Run the appropriate evaluation
if evaluation_type == "enhanced":
results = run_enhanced_evaluation()
else:
results = run_eval()
return jsonify(
{
"status": "success",
"message": f"{evaluation_type.title()} evaluation completed",
"results": results,
}
)
except Exception as e:
return (
jsonify({"status": "error", "message": f"Evaluation failed: {str(e)}"}),
500,
)
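# Example client call for the endpoint above (a sketch, assuming the standalone
# server from the __main__ block below is running on port 8080 and that the
# `requests` package is installed; the target_url is a hypothetical placeholder):
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8080/evaluation/api/run-evaluation",
#       json={"type": "basic", "target_url": "http://localhost:5000"},
#   )
#   print(resp.json()["status"])
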
@evaluation_bp.route("/api/metrics-summary")
def api_metrics_summary():
"""API endpoint for metrics summary with trends."""
history = get_evaluation_history()
if not history:
return jsonify({"error": "No evaluation history found"}), 404
# Calculate trends over last 5 evaluations
recent_history = history[:5]
metrics = {
"latency_trend": [],
"groundedness_trend": [],
"citation_trend": [],
"success_rate_trend": [],
"timestamps": [],
}
for eval_data in reversed(recent_history): # Reverse to get chronological order
summary = eval_data.get("summary", {})
timestamp = eval_data.get("timestamp", 0)
metrics["timestamps"].append(timestamp)
metrics["latency_trend"].append(summary.get("avg_latency_s", 0))
metrics["groundedness_trend"].append(summary.get("avg_groundedness_score", 0))
metrics["citation_trend"].append(summary.get("avg_citation_accuracy", 0))
metrics["success_rate_trend"].append(summary.get("success_rate", 0))
# Calculate averages and trends
def calc_trend(values):
if len(values) < 2:
return 0
return (values[-1] - values[0]) / len(values) if values[0] != 0 else 0
summary_metrics = {
"current_metrics": recent_history[0]["summary"] if recent_history else {},
"trends": {
"latency": calc_trend(metrics["latency_trend"]),
"groundedness": calc_trend(metrics["groundedness_trend"]),
"citation_accuracy": calc_trend(metrics["citation_trend"]),
"success_rate": calc_trend(metrics["success_rate_trend"]),
},
"historical_data": metrics,
}
return jsonify(summary_metrics)
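# Worked example for calc_trend above: with latency values [2.0, 1.5, 1.0]
# across three runs, the trend is (1.0 - 2.0) / 3 ≈ -0.33, i.e. latency falls
# by roughly a third of a second per run. Note the denominator is the number
# of points rather than the number of intervals, so this is a conservative
# per-run delta rather than a true slope.
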
@evaluation_bp.route("/detailed/<filename>")
def detailed_results(filename):
"""Detailed view of a specific evaluation run."""
filepath = os.path.join(RESULTS_DIR, filename)
if not os.path.exists(filepath):
# Also check main evaluation directory
filepath = os.path.join(EVAL_DIR, filename)
if not os.path.exists(filepath):
return "Evaluation results not found", 404
try:
with open(filepath, "r") as f:
results = json.load(f)
return render_template("evaluation/detailed.html", results=results, filename=filename)
except Exception as e:
return f"Error loading results: {str(e)}", 500
# Standalone Flask app for testing
if __name__ == "__main__":
    from flask import Flask

    app = Flask(__name__)
    app.register_blueprint(evaluation_bp)

    # Point the app at the shared templates directory
    app.template_folder = os.path.join(os.path.dirname(__file__), "..", "..", "templates")

    print("Starting evaluation dashboard on http://localhost:8080")
    print("Available endpoints:")
    print("  - GET  /evaluation/                    - Main dashboard")
    print("  - GET  /evaluation/api/status          - Dashboard status")
    print("  - GET  /evaluation/api/results         - Latest results")
    print("  - GET  /evaluation/api/history         - Evaluation history")
    print("  - GET  /evaluation/api/metrics-summary - Metrics summary with trends")
    print("  - POST /evaluation/api/run-evaluation  - Run a new evaluation")
    app.run(debug=True, port=8080)
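
# To mount this dashboard inside an existing Flask application instead of the
# standalone app above (a sketch; adjust the import to the real module name):
#
#   from flask import Flask
#   from evaluation_dashboard import evaluation_bp
#
#   app = Flask(__name__)
#   app.register_blueprint(evaluation_bp)  # routes served under /evaluation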