| """ | |
| Evaluation Dashboard | |
| A web-based dashboard for running evaluations, viewing results, and monitoring | |
| RAG system performance metrics in real-time. | |
| """ | |
| import json | |
| import logging | |
| import os | |
| # Import evaluation modules | |
| import sys | |
| import time | |
| from typing import Any, Dict, List, Optional | |
| from flask import Blueprint, jsonify, render_template, request | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
| logger = logging.getLogger(__name__) | |
# Helper to load a module from a specific filepath if it exists
def _load_module_from_path(module_name: str, path: str):
    import importlib.util

    if not os.path.exists(path):
        return None
    spec = importlib.util.spec_from_file_location(module_name, path)
    if spec is None or spec.loader is None:
        return None
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod
# Safe fallback imports to avoid recursion during HF deployment
run_enhanced_evaluation = None
run_eval = None


def safe_run_enhanced_evaluation(*args, **kwargs):
    """Safe wrapper for enhanced evaluation that returns a fallback response."""
    return {"status": "error", "message": "Enhanced evaluation not available"}


def safe_run_eval(*args, **kwargs):
    """Safe wrapper for basic evaluation that returns a fallback response."""
    return {"status": "error", "message": "Basic evaluation not available"}


# Use the safe wrappers by default to prevent recursion
run_enhanced_evaluation = safe_run_enhanced_evaluation
run_eval = safe_run_eval
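
# A minimal sketch (commented out) of how the real evaluation functions could be
# wired in via _load_module_from_path. The module filename and attribute below are
# assumptions for illustration, not this repository's confirmed layout.
#
#   _enhanced_mod = _load_module_from_path(
#       "enhanced_eval",
#       os.path.join(os.path.dirname(os.path.abspath(__file__)), "enhanced_eval.py"),
#   )
#   if _enhanced_mod is not None and hasattr(_enhanced_mod, "run_enhanced_evaluation"):
#       run_enhanced_evaluation = _enhanced_mod.run_enhanced_evaluation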
evaluation_bp = Blueprint("evaluation", __name__, url_prefix="/evaluation")

RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation_results")
EVAL_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation")


def ensure_results_dir():
    """Ensure the results directory exists."""
    os.makedirs(RESULTS_DIR, exist_ok=True)
def load_latest_results() -> Optional[Dict[str, Any]]:
    """Load the most recent evaluation results."""
    ensure_results_dir()

    # Check for enhanced results first, then fall back to basic results
    enhanced_results_file = os.path.join(EVAL_DIR, "enhanced_results.json")
    basic_results_file = os.path.join(EVAL_DIR, "results.json")

    latest_file = None
    latest_time = 0
    for results_file in [enhanced_results_file, basic_results_file]:
        if os.path.exists(results_file):
            mtime = os.path.getmtime(results_file)
            if mtime > latest_time:
                latest_time = mtime
                latest_file = results_file

    if latest_file:
        with open(latest_file, "r") as f:
            return json.load(f)
    return None
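
# For reference, the only fields this dashboard reads from a results file are the
# ones below (inferred from how the JSON is accessed elsewhere in this module;
# other fields may exist and are passed through untouched):
#
#   {
#       "metadata": {"evaluation_timestamp": 1700000000.0},
#       "summary": {
#           "n_questions": 0,
#           "success_rate": 0.0,
#           "avg_latency_s": 0.0,
#           "avg_groundedness_score": 0.0,
#           "avg_citation_accuracy": 0.0
#       }
#   }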
def get_evaluation_history() -> List[Dict[str, Any]]:
    """Get the history of all evaluation runs."""
    ensure_results_dir()

    history = []
    # Check the evaluation_results directory for timestamped result files
    if os.path.exists(RESULTS_DIR):
        for filename in os.listdir(RESULTS_DIR):
            if filename.endswith("_results.json"):
                filepath = os.path.join(RESULTS_DIR, filename)
                try:
                    with open(filepath, "r") as f:
                        data = json.load(f)
                    history.append(
                        {
                            "filename": filename,
                            "timestamp": os.path.getmtime(filepath),
                            "summary": data.get("summary", {}),
                            "filepath": filepath,
                        }
                    )
                except Exception as e:
                    logger.warning("Error loading %s: %s", filename, e)

    # Sort by timestamp, newest first
    history.sort(key=lambda x: x["timestamp"], reverse=True)
    return history
@evaluation_bp.route("/")
def dashboard():
    """Main evaluation dashboard."""
    return render_template("evaluation/dashboard.html")
@evaluation_bp.route("/api/status")
def api_status():
    """API endpoint for dashboard status."""
    latest_results = load_latest_results()
    history = get_evaluation_history()

    status = {
        "has_results": latest_results is not None,
        "last_evaluation": None,
        "total_evaluations": len(history),
        "evaluation_available": True,
    }

    if latest_results:
        summary = latest_results.get("summary", {})
        metadata = latest_results.get("metadata", {})
        status.update(
            {
                "last_evaluation": {
                    "timestamp": metadata.get("evaluation_timestamp", time.time()),
                    "n_questions": summary.get("n_questions", 0),
                    "success_rate": summary.get("success_rate", 0),
                    "avg_latency": summary.get("avg_latency_s", 0),
                    "groundedness_score": summary.get("avg_groundedness_score", 0),
                    "citation_accuracy": summary.get("avg_citation_accuracy", 0),
                }
            }
        )

    return jsonify(status)
@evaluation_bp.route("/api/results")
def api_results():
    """API endpoint for the latest evaluation results."""
    results = load_latest_results()
    if results:
        return jsonify(results)
    else:
        return jsonify({"error": "No evaluation results found"}), 404


@evaluation_bp.route("/api/history")  # route path assumed; not listed in the startup summary below
def api_history():
    """API endpoint for evaluation history."""
    history = get_evaluation_history()
    return jsonify(history)
@evaluation_bp.route("/api/run-evaluation", methods=["POST"])
def api_run_evaluation():
    """API endpoint to run a new evaluation."""
    try:
        data = request.get_json() or {}
        evaluation_type = data.get("type", "enhanced")  # 'basic' or 'enhanced'
        target_url = data.get("target_url")

        # Set the target URL if provided
        if target_url:
            os.environ["EVAL_TARGET_URL"] = target_url

        # Run the appropriate evaluation
        if evaluation_type == "enhanced":
            results = run_enhanced_evaluation()
        else:
            results = run_eval()

        return jsonify(
            {
                "status": "success",
                "message": f"{evaluation_type.title()} evaluation completed",
                "results": results,
            }
        )
    except Exception as e:
        return (
            jsonify({"status": "error", "message": f"Evaluation failed: {str(e)}"}),
            500,
        )
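
# Example request against the endpoint above, assuming the standalone app at the
# bottom of this file and the `requests` library; the target_url value is purely
# illustrative:
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8080/evaluation/api/run-evaluation",
#       json={"type": "basic", "target_url": "http://localhost:7860"},
#   )
#   print(resp.json())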
@evaluation_bp.route("/api/metrics-summary")  # route path assumed; not listed in the startup summary below
def api_metrics_summary():
    """API endpoint for a metrics summary with trends."""
    history = get_evaluation_history()
    if not history:
        return jsonify({"error": "No evaluation history found"}), 404

    # Calculate trends over the last 5 evaluations
    recent_history = history[:5]

    metrics = {
        "latency_trend": [],
        "groundedness_trend": [],
        "citation_trend": [],
        "success_rate_trend": [],
        "timestamps": [],
    }

    for eval_data in reversed(recent_history):  # Reverse to get chronological order
        summary = eval_data.get("summary", {})
        timestamp = eval_data.get("timestamp", 0)

        metrics["timestamps"].append(timestamp)
        metrics["latency_trend"].append(summary.get("avg_latency_s", 0))
        metrics["groundedness_trend"].append(summary.get("avg_groundedness_score", 0))
        metrics["citation_trend"].append(summary.get("avg_citation_accuracy", 0))
        metrics["success_rate_trend"].append(summary.get("success_rate", 0))

    # Average change per evaluation between the oldest and newest values
    def calc_trend(values):
        if len(values) < 2:
            return 0
        return (values[-1] - values[0]) / len(values) if values[0] != 0 else 0

    summary_metrics = {
        "current_metrics": recent_history[0]["summary"] if recent_history else {},
        "trends": {
            "latency": calc_trend(metrics["latency_trend"]),
            "groundedness": calc_trend(metrics["groundedness_trend"]),
            "citation_accuracy": calc_trend(metrics["citation_trend"]),
            "success_rate": calc_trend(metrics["success_rate_trend"]),
        },
        "historical_data": metrics,
    }

    return jsonify(summary_metrics)
@evaluation_bp.route("/results/<filename>")  # route path assumed; not listed in the startup summary below
def detailed_results(filename):
    """Detailed view of a specific evaluation run."""
    filepath = os.path.join(RESULTS_DIR, filename)
    if not os.path.exists(filepath):
        # Also check the main evaluation directory
        filepath = os.path.join(EVAL_DIR, filename)

    if not os.path.exists(filepath):
        return "Evaluation results not found", 404

    try:
        with open(filepath, "r") as f:
            results = json.load(f)
        return render_template("evaluation/detailed.html", results=results, filename=filename)
    except Exception as e:
        return f"Error loading results: {str(e)}", 500
# Standalone Flask app for testing
if __name__ == "__main__":
    from flask import Flask

    app = Flask(__name__)
    app.register_blueprint(evaluation_bp)

    # Point the app at the shared templates directory
    app.template_folder = os.path.join(os.path.dirname(__file__), "..", "..", "templates")

    print("Starting evaluation dashboard on http://localhost:8080")
    print("Available endpoints:")
    print("  - GET  /evaluation/ - Main dashboard")
    print("  - GET  /evaluation/api/status - Dashboard status")
    print("  - GET  /evaluation/api/results - Latest results")
    print("  - POST /evaluation/api/run-evaluation - Run new evaluation")

    app.run(debug=True, port=8080)
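
# Example requests against the standalone app started above (illustrative; assumes
# the `requests` library is installed):
#
#   import requests
#   print(requests.get("http://localhost:8080/evaluation/api/status").json())
#   print(requests.get("http://localhost:8080/evaluation/api/results").json())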