"""
Runs evaluation on public benchmarks like MedQA, LegalBench.
"""
import sys
import os
import json
import argparse
from datetime import datetime
from typing import Any, Dict, List, Optional

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from modules.reasoning_engine import create_reasoning_engine
from evaluation.metrics import MetricsRunner


def _extract_query(item: Dict[str, Any]) -> Optional[str]:
    query = item.get("question") or item.get("query")
    if not isinstance(query, str):
        return None
    query = query.strip()
    return query if query else None


def _summarize_scores(scores: List[float]) -> Optional[float]:
    if not scores:
        return None
    return round(sum(scores) / len(scores), 4)


def run_benchmark(
    dataset_path: str,
    agent_name: str,
    max_samples: Optional[int] = None,
    output_path: Optional[str] = None,
) -> Dict[str, Any]:
    engine = create_reasoning_engine()
    metrics = MetricsRunner()

    if not os.path.exists(dataset_path):
        raise FileNotFoundError(f"Dataset not found: {dataset_path}")

    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError("Benchmark dataset must be a JSON array of records")

    items = data if not max_samples else data[:max_samples]

    records: List[Dict[str, Any]] = []
    faithfulness_scores: List[float] = []
    succeeded = 0
    failed = 0
    skipped = 0

    for idx, item in enumerate(items, start=1):
        query = _extract_query(item)
        if not query:
            skipped += 1
            continue

        print(f"\n[{idx}/{len(items)}] Query: {query}")
        row: Dict[str, Any] = {
            "index": idx,
            "query": query,
        }

        try:
            result = engine.reason(agent_name, query)
            faithfulness = metrics.extract_faithfulness(result)
            confidence = metrics.extract_confidence(result)
            answer = result.get("answer", "")

            if isinstance(answer, str) and len(answer) > 120:
                answer_preview = f"{answer[:120]}..."
            else:
                answer_preview = answer

            row.update({
                "status": "ok",
                "in_domain": result.get("in_domain"),
                "confidence": confidence,
                "faithfulness": faithfulness,
                "answer_preview": answer_preview,
            })
            records.append(row)

            if faithfulness is not None:
                faithfulness_scores.append(faithfulness)
            succeeded += 1

            print(f"Answer: {answer_preview}")
            if faithfulness is None:
                print("Faithfulness: N/A")
            else:
                print(f"Faithfulness: {faithfulness:.3f}")
        except Exception as e:
            row.update({
                "status": "error",
                "error": str(e),
            })
            records.append(row)
            failed += 1
            print(f"Failed to process query: {e}")

    summary: Dict[str, Any] = {
        "dataset_path": dataset_path,
        "agent_name": agent_name,
        "total_rows": len(data),
        "attempted_rows": len(items),
        "succeeded": succeeded,
        "failed": failed,
        "skipped": skipped,
        "avg_faithfulness": _summarize_scores(faithfulness_scores),
        "generated_at_utc": datetime.utcnow().isoformat() + "Z",
    }

    print("\n--- Benchmark Summary ---")
    print(f"Attempted: {summary['attempted_rows']}")
    print(f"Succeeded: {summary['succeeded']}")
    print(f"Failed: {summary['failed']}")
    print(f"Skipped: {summary['skipped']}")
    print(f"Avg faithfulness: {summary['avg_faithfulness']}")

    if output_path:
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        payload = {
            "summary": summary,
            "results": records,
        }
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)
        print(f"Saved report to: {output_path}")

    return {
        "summary": summary,
        "results": records,
    }


def _default_dataset_path() -> str:
    return os.path.join(
        os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
        "test_data",
        "medqa_sample.json",
    )


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Run benchmark dataset evaluation")
    parser.add_argument("--dataset-path", default=_default_dataset_path(), help="Path to benchmark JSON file")
    parser.add_argument("--agent-name", default="medical_agent", help="Compiled agent name")
    parser.add_argument(
        "--max-samples",
        type=int,
        default=0,
        help="Limit to first N records (0 means all)",
    )
    parser.add_argument("--output", default="", help="Optional output path for JSON report")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    max_samples = args.max_samples if args.max_samples > 0 else None
    output_path = args.output if args.output else None
    run_benchmark(args.dataset_path, args.agent_name, max_samples=max_samples, output_path=output_path)