"""Run full RAGAS evaluation on generated responses and produce comparison table."""

import json
import os
import sys
from pathlib import Path

import yaml
from loguru import logger

# Put the directory two levels above this file at the front of sys.path so the
# `src.*` imports below resolve when this file is executed directly as a script.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from src.evaluation.ragas_eval import run_ragas_evaluation
from src.evaluation.report import generate_report
from src.evaluation.classifier_eval import generate_comparison_table, measure_inference_time
from src.models import baseline as baseline_mod
from src.models.intent_classifier import IntentClassifier


def _get_model_size_mb(path: str) -> float:
    """Compute the combined size of every regular file beneath ``path``.

    The directory tree is walked recursively. Entries that are not files are
    ignored, and a file whose size cannot be read (``OSError``) is left out of
    the total rather than aborting the walk.

    Args:
        path: Directory whose contents should be measured.

    Returns:
        Total size in MB, where one MB is 1024 * 1024 bytes.
    """
    bytes_per_mb = 1024 * 1024
    size_bytes = 0
    for entry in Path(path).rglob("*"):
        if not entry.is_file():
            continue
        try:
            size_bytes += entry.stat().st_size
        except OSError:
            # Best effort: an unreadable file simply does not count.
            continue
    return size_bytes / bytes_per_mb


def main() -> None:
    """Score generated responses with RAGAS and compare the two classifiers.

    The steps run in this order:

    1. Set up file logging and read ``config/config.yaml``.
    2. Load ``generation_results.json`` from the configured results directory,
       exiting with status 1 if it is missing.
    3. Run RAGAS on the results, sub-sampled to the configured size when there
       are more results than that.
    4. Load whichever classification reports exist, time inference for the
       corresponding models on the test split, and measure model sizes.
    5. Write the comparison table (only when both reports are available) and
       the final report.
    6. Log PASS/FAIL for the configured RAGAS targets and the flagged share.
    """
    Path("logs").mkdir(exist_ok=True)
    logger.add("logs/run_evaluation.log", rotation="10 MB")

    with open("config/config.yaml") as config_file:
        config = yaml.safe_load(config_file)

    paths = config["paths"]
    results_dir = paths["results"]

    # The generation output is a hard prerequisite for everything below.
    generation_file = Path(results_dir) / "generation_results.json"
    if not generation_file.exists():
        logger.error(f"Generation results not found at {generation_file}. Run run_generation.py first.")
        sys.exit(1)

    with open(generation_file) as fh:
        generated = json.load(fh)

    # RAGAS can be slow, so cap the number of evaluated results. The fixed
    # seed keeps the chosen subset the same from run to run.
    eval_cfg = config["evaluation"]
    sample_size = eval_cfg["ragas_sample_size"]
    ragas_inputs = generated
    if len(generated) > sample_size:
        import random
        random.seed(42)
        ragas_inputs = random.sample(generated, sample_size)

    ragas_output = run_ragas_evaluation(
        results=ragas_inputs,
        results_dir=results_dir,
        faithfulness_threshold=eval_cfg["faithfulness_flag_threshold"],
    )

    # Classification reports are optional; a missing file leaves None.
    reports = {}
    for model_key, report_name in (
        ("baseline", "baseline_classification_report.json"),
        ("distilbert", "classification_report.json"),
    ):
        report_file = Path(results_dir) / report_name
        if report_file.exists():
            with open(report_file) as fh:
                reports[model_key] = json.load(fh)
        else:
            reports[model_key] = None
    baseline_report = reports["baseline"]
    distilbert_report = reports["distilbert"]

    # Inference timing uses the texts of the test split.
    from src.data.dataset import load_splits
    _train_df, _val_df, test_df = load_splits(paths["data_processed"])
    texts = test_df["text"].tolist()

    baseline_ms = 0.0
    distilbert_ms = 0.0

    if baseline_report:
        try:
            baseline_pipeline = baseline_mod.load_pipeline(paths["models_baseline"])
            baseline_ms = measure_inference_time(baseline_pipeline.predict, texts)
        except Exception as exc:
            # Timing is best effort; the 0.0 default is kept on failure.
            logger.warning(f"Could not measure baseline inference time: {exc}")

    if distilbert_report:
        try:
            best_dir = str(Path(paths["models_distilbert"]) / "best")
            classifier = IntentClassifier(
                model_dir=best_dir,
                max_length=config["classifier"]["max_length"],
            )

            def _predict(t):
                return classifier.predict_batch(t)

            distilbert_ms = measure_inference_time(_predict, texts)
        except Exception as exc:
            # Timing is best effort; the 0.0 default is kept on failure.
            logger.warning(f"Could not measure DistilBERT inference time: {exc}")

    # Sizes are measured over the configured model directories.
    baseline_mb = _get_model_size_mb(paths["models_baseline"])
    distilbert_mb = _get_model_size_mb(paths["models_distilbert"])

    # The comparison only makes sense when both reports were found.
    if baseline_report and distilbert_report:
        generate_comparison_table(
            baseline_report=baseline_report,
            distilbert_report=distilbert_report,
            baseline_inference_ms=baseline_ms,
            distilbert_inference_ms=distilbert_ms,
            baseline_size_mb=baseline_mb,
            distilbert_size_mb=distilbert_mb,
            results_dir=results_dir,
        )

    generate_report(results_dir=results_dir, ragas_output=ragas_output)

    # Compare aggregate RAGAS means against the configured targets; metrics
    # absent from the aggregate are skipped silently.
    aggregate = ragas_output.get("aggregate", {})
    targets = (
        ("faithfulness", eval_cfg["target_faithfulness"]),
        ("answer_relevancy", eval_cfg["target_answer_relevancy"]),
    )
    for metric, target in targets:
        if metric not in aggregate:
            continue
        mean = aggregate[metric]["mean"]
        if mean >= target:
            status = "PASS"
        else:
            status = "FAIL"
        logger.info(f"[{status}] {metric}: {mean:.4f} (target >= {target})")

    # A missing "pct_flagged" entry defaults to 100.0, which reports FAIL.
    pct_flagged = ragas_output.get("pct_flagged", 100.0)
    if pct_flagged <= 5.0:
        flag_status = "PASS"
    else:
        flag_status = "FAIL"
    logger.info(f"[{flag_status}] Flagged responses: {pct_flagged:.1f}% (target <= 5%)")


# Run the evaluation only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()