"""
Evaluation Dashboard
A web-based dashboard for running evaluations, viewing results, and monitoring
RAG system performance metrics in real-time.
"""
import json
import logging
import os
import sys
import time
from typing import Any, Dict, List, Optional
from flask import Blueprint, jsonify, render_template, request
# Make sibling evaluation modules importable when this file is run directly
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
logger = logging.getLogger(__name__)
# Helper to load a module from a specific filepath if it exists
def _load_module_from_path(module_name: str, path: str):
import importlib.util
if not os.path.exists(path):
return None
spec = importlib.util.spec_from_file_location(module_name, path)
if spec is None or spec.loader is None:
return None
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
return mod
# Safe fallbacks to avoid import recursion during HF deployment
def safe_run_enhanced_evaluation(*args, **kwargs):
    """Safe wrapper for enhanced evaluation that returns a fallback response."""
    return {"status": "error", "message": "Enhanced evaluation not available"}
def safe_run_eval(*args, **kwargs):
    """Safe wrapper for basic evaluation that returns a fallback response."""
    return {"status": "error", "message": "Basic evaluation not available"}
# Use the safe wrappers by default to prevent recursion
run_enhanced_evaluation = safe_run_enhanced_evaluation
run_eval = safe_run_eval
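# If the real evaluation modules are present, they could be wired in here
# (e.g. via _load_module_from_path) to replace these fallbacks.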
evaluation_bp = Blueprint("evaluation", __name__, url_prefix="/evaluation")
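# Results live in two places: timestamped runs under evaluation_results/ and the
# latest single-run files (results.json, enhanced_results.json) under evaluation/.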
RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation_results")
EVAL_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation")
def ensure_results_dir():
"""Ensure results directory exists."""
os.makedirs(RESULTS_DIR, exist_ok=True)
def load_latest_results() -> Optional[Dict[str, Any]]:
"""Load the most recent evaluation results."""
ensure_results_dir()
# Check for enhanced results first
enhanced_results_file = os.path.join(EVAL_DIR, "enhanced_results.json")
basic_results_file = os.path.join(EVAL_DIR, "results.json")
latest_file = None
latest_time = 0
for results_file in [enhanced_results_file, basic_results_file]:
if os.path.exists(results_file):
mtime = os.path.getmtime(results_file)
if mtime > latest_time:
latest_time = mtime
latest_file = results_file
if latest_file:
with open(latest_file, "r") as f:
return json.load(f)
return None
def get_evaluation_history() -> List[Dict[str, Any]]:
"""Get history of all evaluation runs."""
ensure_results_dir()
history = []
# Check evaluation_results directory for timestamped files
if os.path.exists(RESULTS_DIR):
for filename in os.listdir(RESULTS_DIR):
if filename.endswith("_results.json"):
filepath = os.path.join(RESULTS_DIR, filename)
try:
with open(filepath, "r") as f:
data = json.load(f)
history.append(
{
"filename": filename,
"timestamp": os.path.getmtime(filepath),
"summary": data.get("summary", {}),
"filepath": filepath,
}
)
                except Exception as e:
                    logger.warning(f"Error loading {filename}: {e}")
# Sort by timestamp, newest first
history.sort(key=lambda x: x["timestamp"], reverse=True)
return history
@evaluation_bp.route("/")
def dashboard():
"""Main evaluation dashboard."""
return render_template("evaluation/dashboard.html")
@evaluation_bp.route("/api/status")
def api_status():
"""API endpoint for dashboard status."""
latest_results = load_latest_results()
history = get_evaluation_history()
status = {
"has_results": latest_results is not None,
"last_evaluation": None,
"total_evaluations": len(history),
"evaluation_available": True,
}
if latest_results:
summary = latest_results.get("summary", {})
metadata = latest_results.get("metadata", {})
status.update(
{
"last_evaluation": {
"timestamp": metadata.get("evaluation_timestamp", time.time()),
"n_questions": summary.get("n_questions", 0),
"success_rate": summary.get("success_rate", 0),
"avg_latency": summary.get("avg_latency_s", 0),
"groundedness_score": summary.get("avg_groundedness_score", 0),
"citation_accuracy": summary.get("avg_citation_accuracy", 0),
}
}
)
return jsonify(status)
@evaluation_bp.route("/api/results")
def api_results():
"""API endpoint for latest evaluation results."""
results = load_latest_results()
if results:
return jsonify(results)
else:
return jsonify({"error": "No evaluation results found"}), 404
@evaluation_bp.route("/api/history")
def api_history():
"""API endpoint for evaluation history."""
history = get_evaluation_history()
return jsonify(history)
@evaluation_bp.route("/api/run-evaluation", methods=["POST"])
def api_run_evaluation():
"""API endpoint to run a new evaluation."""
try:
data = request.get_json() or {}
evaluation_type = data.get("type", "enhanced") # 'basic' or 'enhanced'
target_url = data.get("target_url")
# Set target URL if provided
if target_url:
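            # Assumes the evaluation runners read EVAL_TARGET_URL from the environment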
os.environ["EVAL_TARGET_URL"] = target_url
# Run the appropriate evaluation
if evaluation_type == "enhanced":
results = run_enhanced_evaluation()
else:
results = run_eval()
return jsonify(
{
"status": "success",
"message": f"{evaluation_type.title()} evaluation completed",
"results": results,
}
)
except Exception as e:
return (
jsonify({"status": "error", "message": f"Evaluation failed: {str(e)}"}),
500,
)
@evaluation_bp.route("/api/metrics-summary")
def api_metrics_summary():
"""API endpoint for metrics summary with trends."""
history = get_evaluation_history()
if not history:
return jsonify({"error": "No evaluation history found"}), 404
# Calculate trends over last 5 evaluations
recent_history = history[:5]
metrics = {
"latency_trend": [],
"groundedness_trend": [],
"citation_trend": [],
"success_rate_trend": [],
"timestamps": [],
}
for eval_data in reversed(recent_history): # Reverse to get chronological order
summary = eval_data.get("summary", {})
timestamp = eval_data.get("timestamp", 0)
metrics["timestamps"].append(timestamp)
metrics["latency_trend"].append(summary.get("avg_latency_s", 0))
metrics["groundedness_trend"].append(summary.get("avg_groundedness_score", 0))
metrics["citation_trend"].append(summary.get("avg_citation_accuracy", 0))
metrics["success_rate_trend"].append(summary.get("success_rate", 0))
    # Trend = average change per evaluation across the recent window
    def calc_trend(values):
        if len(values) < 2:
            return 0
        return (values[-1] - values[0]) / (len(values) - 1)
summary_metrics = {
"current_metrics": recent_history[0]["summary"] if recent_history else {},
"trends": {
"latency": calc_trend(metrics["latency_trend"]),
"groundedness": calc_trend(metrics["groundedness_trend"]),
"citation_accuracy": calc_trend(metrics["citation_trend"]),
"success_rate": calc_trend(metrics["success_rate_trend"]),
},
"historical_data": metrics,
}
return jsonify(summary_metrics)
@evaluation_bp.route("/detailed/<filename>")
def detailed_results(filename):
"""Detailed view of a specific evaluation run."""
filepath = os.path.join(RESULTS_DIR, filename)
if not os.path.exists(filepath):
# Also check main evaluation directory
filepath = os.path.join(EVAL_DIR, filename)
if not os.path.exists(filepath):
return "Evaluation results not found", 404
try:
with open(filepath, "r") as f:
results = json.load(f)
return render_template("evaluation/detailed.html", results=results, filename=filename)
except Exception as e:
return f"Error loading results: {str(e)}", 500
# Standalone Flask app for testing
if __name__ == "__main__":
from flask import Flask
app = Flask(__name__)
app.register_blueprint(evaluation_bp)
# Add templates directory
app.template_folder = os.path.join(os.path.dirname(__file__), "..", "..", "templates")
print("Starting evaluation dashboard on http://localhost:8080")
print("Available endpoints:")
print(" - GET /evaluation/ - Main dashboard")
print(" - GET /evaluation/api/status - Dashboard status")
print(" - GET /evaluation/api/results - Latest results")
print(" - POST /evaluation/api/run-evaluation - Run new evaluation")
app.run(debug=True, port=8080)