""" Evaluation Dashboard A web-based dashboard for running evaluations, viewing results, and monitoring RAG system performance metrics in real-time. """ import json import logging import os # Import evaluation modules import sys import time from typing import Any, Dict, List, Optional from flask import Blueprint, jsonify, render_template, request sys.path.append(os.path.dirname(os.path.abspath(__file__))) logger = logging.getLogger(__name__) # Helper to load a module from a specific filepath if it exists def _load_module_from_path(module_name: str, path: str): import importlib.util if not os.path.exists(path): return None spec = importlib.util.spec_from_file_location(module_name, path) if spec is None or spec.loader is None: return None mod = importlib.util.module_from_spec(spec) spec.loader.exec_module(mod) return mod # Safe fallback imports to avoid recursion during HF deployment run_enhanced_evaluation = None run_eval = None def safe_run_enhanced_evaluation(*args, **kwargs): """Safe wrapper for enhanced evaluation that returns fallback response""" return {"status": "error", "message": "Enhanced evaluation not available"} def safe_run_eval(*args, **kwargs): """Safe wrapper for basic evaluation that returns fallback response""" return {"status": "error", "message": "Basic evaluation not available"} # Use safe wrappers by default to prevent recursion run_enhanced_evaluation = safe_run_enhanced_evaluation run_eval = safe_run_eval evaluation_bp = Blueprint("evaluation", __name__, url_prefix="/evaluation") RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation_results") EVAL_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation") def ensure_results_dir(): """Ensure results directory exists.""" os.makedirs(RESULTS_DIR, exist_ok=True) def load_latest_results() -> Optional[Dict[str, Any]]: """Load the most recent evaluation results.""" ensure_results_dir() # Check for enhanced results first enhanced_results_file = os.path.join(EVAL_DIR, "enhanced_results.json") basic_results_file = os.path.join(EVAL_DIR, "results.json") latest_file = None latest_time = 0 for results_file in [enhanced_results_file, basic_results_file]: if os.path.exists(results_file): mtime = os.path.getmtime(results_file) if mtime > latest_time: latest_time = mtime latest_file = results_file if latest_file: with open(latest_file, "r") as f: return json.load(f) return None def get_evaluation_history() -> List[Dict[str, Any]]: """Get history of all evaluation runs.""" ensure_results_dir() history = [] # Check evaluation_results directory for timestamped files if os.path.exists(RESULTS_DIR): for filename in os.listdir(RESULTS_DIR): if filename.endswith("_results.json"): filepath = os.path.join(RESULTS_DIR, filename) try: with open(filepath, "r") as f: data = json.load(f) history.append( { "filename": filename, "timestamp": os.path.getmtime(filepath), "summary": data.get("summary", {}), "filepath": filepath, } ) except Exception as e: print(f"Error loading {filename}: {e}") # Sort by timestamp, newest first history.sort(key=lambda x: x["timestamp"], reverse=True) return history @evaluation_bp.route("/") def dashboard(): """Main evaluation dashboard.""" return render_template("evaluation/dashboard.html") @evaluation_bp.route("/api/status") def api_status(): """API endpoint for dashboard status.""" latest_results = load_latest_results() history = get_evaluation_history() status = { "has_results": latest_results is not None, "last_evaluation": None, "total_evaluations": len(history), "evaluation_available": True, } if latest_results: summary = latest_results.get("summary", {}) metadata = latest_results.get("metadata", {}) status.update( { "last_evaluation": { "timestamp": metadata.get("evaluation_timestamp", time.time()), "n_questions": summary.get("n_questions", 0), "success_rate": summary.get("success_rate", 0), "avg_latency": summary.get("avg_latency_s", 0), "groundedness_score": summary.get("avg_groundedness_score", 0), "citation_accuracy": summary.get("avg_citation_accuracy", 0), } } ) return jsonify(status) @evaluation_bp.route("/api/results") def api_results(): """API endpoint for latest evaluation results.""" results = load_latest_results() if results: return jsonify(results) else: return jsonify({"error": "No evaluation results found"}), 404 @evaluation_bp.route("/api/history") def api_history(): """API endpoint for evaluation history.""" history = get_evaluation_history() return jsonify(history) @evaluation_bp.route("/api/run-evaluation", methods=["POST"]) def api_run_evaluation(): """API endpoint to run a new evaluation.""" try: data = request.get_json() or {} evaluation_type = data.get("type", "enhanced") # 'basic' or 'enhanced' target_url = data.get("target_url") # Set target URL if provided if target_url: os.environ["EVAL_TARGET_URL"] = target_url # Run the appropriate evaluation if evaluation_type == "enhanced": results = run_enhanced_evaluation() else: results = run_eval() return jsonify( { "status": "success", "message": f"{evaluation_type.title()} evaluation completed", "results": results, } ) except Exception as e: return ( jsonify({"status": "error", "message": f"Evaluation failed: {str(e)}"}), 500, ) @evaluation_bp.route("/api/metrics-summary") def api_metrics_summary(): """API endpoint for metrics summary with trends.""" history = get_evaluation_history() if not history: return jsonify({"error": "No evaluation history found"}), 404 # Calculate trends over last 5 evaluations recent_history = history[:5] metrics = { "latency_trend": [], "groundedness_trend": [], "citation_trend": [], "success_rate_trend": [], "timestamps": [], } for eval_data in reversed(recent_history): # Reverse to get chronological order summary = eval_data.get("summary", {}) timestamp = eval_data.get("timestamp", 0) metrics["timestamps"].append(timestamp) metrics["latency_trend"].append(summary.get("avg_latency_s", 0)) metrics["groundedness_trend"].append(summary.get("avg_groundedness_score", 0)) metrics["citation_trend"].append(summary.get("avg_citation_accuracy", 0)) metrics["success_rate_trend"].append(summary.get("success_rate", 0)) # Calculate averages and trends def calc_trend(values): if len(values) < 2: return 0 return (values[-1] - values[0]) / len(values) if values[0] != 0 else 0 summary_metrics = { "current_metrics": recent_history[0]["summary"] if recent_history else {}, "trends": { "latency": calc_trend(metrics["latency_trend"]), "groundedness": calc_trend(metrics["groundedness_trend"]), "citation_accuracy": calc_trend(metrics["citation_trend"]), "success_rate": calc_trend(metrics["success_rate_trend"]), }, "historical_data": metrics, } return jsonify(summary_metrics) @evaluation_bp.route("/detailed/") def detailed_results(filename): """Detailed view of a specific evaluation run.""" filepath = os.path.join(RESULTS_DIR, filename) if not os.path.exists(filepath): # Also check main evaluation directory filepath = os.path.join(EVAL_DIR, filename) if not os.path.exists(filepath): return "Evaluation results not found", 404 try: with open(filepath, "r") as f: results = json.load(f) return render_template("evaluation/detailed.html", results=results, filename=filename) except Exception as e: return f"Error loading results: {str(e)}", 500 # Standalone Flask app for testing if __name__ == "__main__": from flask import Flask app = Flask(__name__) app.register_blueprint(evaluation_bp) # Add templates directory app.template_folder = os.path.join(os.path.dirname(__file__), "..", "..", "templates") print("Starting evaluation dashboard on http://localhost:8080") print("Available endpoints:") print(" - GET /evaluation/ - Main dashboard") print(" - GET /evaluation/api/status - Dashboard status") print(" - GET /evaluation/api/results - Latest results") print(" - POST /evaluation/api/run-evaluation - Run new evaluation") app.run(debug=True, port=8080)