"""
Evaluation Dashboard

A web-based dashboard for running evaluations, viewing results, and monitoring
RAG system performance metrics in real time.
"""

import json
import logging
import os
import sys
import time
from typing import Any, Dict, List, Optional

from flask import Blueprint, jsonify, render_template, request

# Make sibling evaluation modules importable regardless of the working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

logger = logging.getLogger(__name__)


# Helper to load a module from a specific filepath if it exists
def _load_module_from_path(module_name: str, path: str):
    import importlib.util

    if not os.path.exists(path):
        return None

    spec = importlib.util.spec_from_file_location(module_name, path)
    if spec is None or spec.loader is None:
        return None
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    return mod


# Safe fallbacks to avoid import recursion during Hugging Face (HF) deployment.


def safe_run_enhanced_evaluation(*args, **kwargs):
    """Safe wrapper for enhanced evaluation that returns fallback response"""
    return {"status": "error", "message": "Enhanced evaluation not available"}


def safe_run_eval(*args, **kwargs):
    """Safe wrapper for basic evaluation that returns fallback response"""
    return {"status": "error", "message": "Basic evaluation not available"}


# Use safe wrappers by default to prevent recursion
run_enhanced_evaluation = safe_run_enhanced_evaluation
run_eval = safe_run_eval

evaluation_bp = Blueprint("evaluation", __name__, url_prefix="/evaluation")

RESULTS_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation_results")
EVAL_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "evaluation")
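
# The real implementations could be wired in via _load_module_from_path, replacing
# the safe fallbacks above. A sketch only; the module filenames below are
# assumptions about the repository layout, not confirmed by this codebase:
#
#   _enhanced = _load_module_from_path(
#       "enhanced_evaluation", os.path.join(EVAL_DIR, "enhanced_evaluation.py"))
#   if _enhanced is not None and hasattr(_enhanced, "run_enhanced_evaluation"):
#       run_enhanced_evaluation = _enhanced.run_enhanced_evaluation
#
#   _basic = _load_module_from_path("evaluate", os.path.join(EVAL_DIR, "evaluate.py"))
#   if _basic is not None and hasattr(_basic, "run_eval"):
#       run_eval = _basic.run_eval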


def ensure_results_dir():
    """Ensure results directory exists."""
    os.makedirs(RESULTS_DIR, exist_ok=True)


def load_latest_results() -> Optional[Dict[str, Any]]:
    """Load the most recent evaluation results."""
    ensure_results_dir()

    # Check for enhanced results first
    enhanced_results_file = os.path.join(EVAL_DIR, "enhanced_results.json")
    basic_results_file = os.path.join(EVAL_DIR, "results.json")

    latest_file = None
    latest_time = 0

    for results_file in [enhanced_results_file, basic_results_file]:
        if os.path.exists(results_file):
            mtime = os.path.getmtime(results_file)
            if mtime > latest_time:
                latest_time = mtime
                latest_file = results_file

    if latest_file:
        with open(latest_file, "r") as f:
            return json.load(f)

    return None
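
# Shape of a results file as read by this module (only the keys the dashboard
# actually consumes are listed; producers may include additional fields):
#
#   {
#     "metadata": {"evaluation_timestamp": <unix timestamp>},
#     "summary": {
#       "n_questions": <int>,
#       "success_rate": <float>,
#       "avg_latency_s": <float>,
#       "avg_groundedness_score": <float>,
#       "avg_citation_accuracy": <float>
#     }
#   }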


def get_evaluation_history() -> List[Dict[str, Any]]:
    """Get history of all evaluation runs."""
    ensure_results_dir()

    history = []

    # Check evaluation_results directory for timestamped files
    if os.path.exists(RESULTS_DIR):
        for filename in os.listdir(RESULTS_DIR):
            if filename.endswith("_results.json"):
                filepath = os.path.join(RESULTS_DIR, filename)
                try:
                    with open(filepath, "r") as f:
                        data = json.load(f)
                        history.append(
                            {
                                "filename": filename,
                                "timestamp": os.path.getmtime(filepath),
                                "summary": data.get("summary", {}),
                                "filepath": filepath,
                            }
                        )
                except Exception as e:
                    print(f"Error loading {filename}: {e}")

    # Sort by timestamp, newest first
    history.sort(key=lambda x: x["timestamp"], reverse=True)
    return history


@evaluation_bp.route("/")
def dashboard():
    """Main evaluation dashboard."""
    return render_template("evaluation/dashboard.html")


@evaluation_bp.route("/api/status")
def api_status():
    """API endpoint for dashboard status."""
    latest_results = load_latest_results()
    history = get_evaluation_history()

    status = {
        "has_results": latest_results is not None,
        "last_evaluation": None,
        "total_evaluations": len(history),
        "evaluation_available": True,
    }

    if latest_results:
        summary = latest_results.get("summary", {})
        metadata = latest_results.get("metadata", {})

        status.update(
            {
                "last_evaluation": {
                    "timestamp": metadata.get("evaluation_timestamp", time.time()),
                    "n_questions": summary.get("n_questions", 0),
                    "success_rate": summary.get("success_rate", 0),
                    "avg_latency": summary.get("avg_latency_s", 0),
                    "groundedness_score": summary.get("avg_groundedness_score", 0),
                    "citation_accuracy": summary.get("avg_citation_accuracy", 0),
                }
            }
        )

    return jsonify(status)


@evaluation_bp.route("/api/results")
def api_results():
    """API endpoint for latest evaluation results."""
    results = load_latest_results()
    if results:
        return jsonify(results)
    else:
        return jsonify({"error": "No evaluation results found"}), 404


@evaluation_bp.route("/api/history")
def api_history():
    """API endpoint for evaluation history."""
    history = get_evaluation_history()
    return jsonify(history)


@evaluation_bp.route("/api/run-evaluation", methods=["POST"])
def api_run_evaluation():
    """API endpoint to run a new evaluation."""
    try:
        data = request.get_json() or {}
        evaluation_type = data.get("type", "enhanced")  # 'basic' or 'enhanced'
        target_url = data.get("target_url")

        # Set target URL if provided
        if target_url:
            os.environ["EVAL_TARGET_URL"] = target_url

        # Run the appropriate evaluation
        if evaluation_type == "enhanced":
            results = run_enhanced_evaluation()
        else:
            results = run_eval()

        return jsonify(
            {
                "status": "success",
                "message": f"{evaluation_type.title()} evaluation completed",
                "results": results,
            }
        )

    except Exception as e:
        return (
            jsonify({"status": "error", "message": f"Evaluation failed: {str(e)}"}),
            500,
        )
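
# Example request against the standalone test server defined at the bottom of this
# file (the target_url value is an illustrative assumption):
#
#   curl -X POST http://localhost:8080/evaluation/api/run-evaluation \
#        -H "Content-Type: application/json" \
#        -d '{"type": "enhanced", "target_url": "http://localhost:7860"}'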


@evaluation_bp.route("/api/metrics-summary")
def api_metrics_summary():
    """API endpoint for metrics summary with trends."""
    history = get_evaluation_history()

    if not history:
        return jsonify({"error": "No evaluation history found"}), 404

    # Calculate trends over last 5 evaluations
    recent_history = history[:5]

    metrics = {
        "latency_trend": [],
        "groundedness_trend": [],
        "citation_trend": [],
        "success_rate_trend": [],
        "timestamps": [],
    }

    for eval_data in reversed(recent_history):  # Reverse to get chronological order
        summary = eval_data.get("summary", {})
        timestamp = eval_data.get("timestamp", 0)

        metrics["timestamps"].append(timestamp)
        metrics["latency_trend"].append(summary.get("avg_latency_s", 0))
        metrics["groundedness_trend"].append(summary.get("avg_groundedness_score", 0))
        metrics["citation_trend"].append(summary.get("avg_citation_accuracy", 0))
        metrics["success_rate_trend"].append(summary.get("success_rate", 0))

    # Calculate averages and trends
    def calc_trend(values):
        # Average change per evaluation, measured from the oldest to the newest value.
        if len(values) < 2:
            return 0
        return (values[-1] - values[0]) / (len(values) - 1)

    summary_metrics = {
        "current_metrics": recent_history[0]["summary"] if recent_history else {},
        "trends": {
            "latency": calc_trend(metrics["latency_trend"]),
            "groundedness": calc_trend(metrics["groundedness_trend"]),
            "citation_accuracy": calc_trend(metrics["citation_trend"]),
            "success_rate": calc_trend(metrics["success_rate_trend"]),
        },
        "historical_data": metrics,
    }

    return jsonify(summary_metrics)


@evaluation_bp.route("/detailed/<filename>")
def detailed_results(filename):
    """Detailed view of a specific evaluation run."""
    filepath = os.path.join(RESULTS_DIR, filename)

    if not os.path.exists(filepath):
        # Also check main evaluation directory
        filepath = os.path.join(EVAL_DIR, filename)
        if not os.path.exists(filepath):
            return "Evaluation results not found", 404

    try:
        with open(filepath, "r") as f:
            results = json.load(f)

        return render_template("evaluation/detailed.html", results=results, filename=filename)
    except Exception as e:
        return f"Error loading results: {str(e)}", 500


# Standalone Flask app for testing
if __name__ == "__main__":
    from flask import Flask

    app = Flask(__name__)
    app.register_blueprint(evaluation_bp)

    # Add templates directory
    app.template_folder = os.path.join(os.path.dirname(__file__), "..", "..", "templates")

    print("Starting evaluation dashboard on http://localhost:8080")
    print("Available endpoints:")
    print("  - GET  /evaluation/              - Main dashboard")
    print("  - GET  /evaluation/api/status    - Dashboard status")
    print("  - GET  /evaluation/api/results   - Latest results")
    print("  - POST /evaluation/api/run-evaluation - Run new evaluation")

    app.run(debug=True, port=8080)