from __future__ import annotations

import os

# Quieter logs when TensorFlow/XLA are pulled in indirectly (common on Colab).
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
os.environ.setdefault("ABSL_MIN_LOG_LEVEL", "2")

import argparse
import gc
import json
import sys
from pathlib import Path

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

BASE_DIR = Path(__file__).resolve().parent.parent
if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))

from combined_inference import classify_query
from config import (
    DEFAULT_BENCHMARK_PATH,
    EVALUATION_ARTIFACTS_DIR,
    HEAD_CONFIGS,
    IAB_HEAD_CONFIG,
    IAB_BEHAVIOR_LOCK_CASES_PATH,
    IAB_CROSS_VERTICAL_BEHAVIOR_LOCK_CASES_PATH,
    IAB_CROSS_VERTICAL_QUALITY_TARGET_CASES_PATH,
    IAB_QUALITY_TARGET_CASES_PATH,
    KNOWN_FAILURE_CASES_PATH,
    ensure_artifact_dirs,
)
from evaluation.regression_suite import (
    evaluate_iab_behavior_lock_cases,
    evaluate_iab_cross_vertical_behavior_lock_cases,
    evaluate_iab_cross_vertical_quality_target_cases,
    evaluate_iab_quality_target_cases,
    evaluate_known_failure_cases,
)
from evaluation.iab_quality import compute_path_metrics, evaluate_iab_views, path_from_label
from iab_classifier import predict_iab_content_classifier_batch
from model_runtime import get_head
from schemas import validate_classify_response


def _maybe_free_cuda_memory() -> None:
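    """Release cached CUDA memory between evaluation passes, if torch is available."""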
    try:
        import torch

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception:
        pass


def load_jsonl(path: Path) -> list[dict]:
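    """Read a JSONL file into a list of row dicts."""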
    with path.open("r", encoding="utf-8") as handle:
        return [json.loads(line) for line in handle]


def write_json(path: Path, payload: dict | list) -> None:
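    """Write a JSON payload to disk, creating parent directories as needed."""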
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def evaluate_head_dataset(head_name: str, dataset_path: Path, suite_name: str, output_dir: Path) -> dict:
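    """Evaluate one classifier head on a dataset and write its confusion matrix and JSON report.

    Returns a summary dict with accuracy, macro F1, accepted coverage/accuracy, fallback rate,
    per-class metrics, and a per-difficulty breakdown when every row is tagged with one.
    """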
    head = get_head(head_name)
    config = head.config
    rows = load_jsonl(dataset_path)
    predictions = head.predict_batch([row["text"] for row in rows])

    y_true = [row[config.label_field] for row in rows]
    y_pred = [prediction["label"] for prediction in predictions]
    accepted = [prediction["meets_confidence_threshold"] for prediction in predictions]

    confusion = confusion_matrix(y_true, y_pred, labels=list(config.labels))
    confusion_df = pd.DataFrame(confusion, index=config.labels, columns=config.labels)
    confusion_path = output_dir / f"{head_name}_{suite_name}_confusion_matrix.csv"
    confusion_df.to_csv(confusion_path)

    accepted_total_count = sum(accepted)
    accepted_accuracy = (
        accuracy_score(
            [truth for truth, keep in zip(y_true, accepted) if keep],
            [pred for pred, keep in zip(y_pred, accepted) if keep],
        )
        if accepted_total_count
        else 0.0
    )

    report = classification_report(
        y_true,
        y_pred,
        labels=list(config.labels),
        output_dict=True,
        zero_division=0,
    )
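    # The per-difficulty breakdown is only reported when every row carries a "difficulty" tag.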
    difficulty_breakdown = None
    if rows and all("difficulty" in row for row in rows):
        difficulty_breakdown = {}
        for difficulty in sorted({row["difficulty"] for row in rows}):
            indices = [idx for idx, row in enumerate(rows) if row["difficulty"] == difficulty]
            difficulty_true = [y_true[idx] for idx in indices]
            difficulty_pred = [y_pred[idx] for idx in indices]
            difficulty_accepted = [accepted[idx] for idx in indices]
            difficulty_accepted_count = sum(difficulty_accepted)
            difficulty_accepted_accuracy = (
                accuracy_score(
                    [truth for truth, keep in zip(difficulty_true, difficulty_accepted) if keep],
                    [pred for pred, keep in zip(difficulty_pred, difficulty_accepted) if keep],
                )
                if difficulty_accepted_count
                else 0.0
            )
            difficulty_breakdown[difficulty] = {
                "count": len(indices),
                "accuracy": round(float(accuracy_score(difficulty_true, difficulty_pred)), 4),
                "macro_f1": round(float(f1_score(difficulty_true, difficulty_pred, average="macro")), 4),
                "accepted_coverage": round(float(difficulty_accepted_count / len(indices)), 4),
                "accepted_accuracy": round(float(difficulty_accepted_accuracy), 4),
                "fallback_rate": round(float(1 - (difficulty_accepted_count / len(indices))), 4),
            }
    summary = {
        "head": head_name,
        "suite": suite_name,
        "dataset_path": str(dataset_path),
        "count": len(rows),
        "accuracy": round(float(accuracy_score(y_true, y_pred)), 4),
        "macro_f1": round(float(f1_score(y_true, y_pred, average="macro")), 4),
        "accepted_coverage": round(float(accepted_total_count / len(rows)), 4),
        "accepted_accuracy": round(float(accepted_accuracy), 4),
        "fallback_rate": round(float(1 - (accepted_total_count / len(rows))), 4),
        "per_class_metrics": report,
        "confusion_matrix_path": str(confusion_path),
    }
    if difficulty_breakdown is not None:
        summary["difficulty_breakdown"] = difficulty_breakdown
    write_json(output_dir / f"{head_name}_{suite_name}_report.json", summary)
    return summary


def evaluate_iab_dataset(dataset_path: Path, suite_name: str, output_dir: Path) -> dict:
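    """Evaluate the IAB content classifier on a dataset and write its JSON report.

    Raises RuntimeError when the supervised IAB artifacts have not been trained and calibrated yet.
    """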
    rows = load_jsonl(dataset_path)
    true_paths = [path_from_label(row["iab_path"]) for row in rows]
    true_labels = [row["iab_path"] for row in rows]
    predictions = predict_iab_content_classifier_batch([row["text"] for row in rows])
    if not any(output is not None for output in predictions):
        raise RuntimeError(
            "IAB classifier artifacts are unavailable. Run `python3 training/train_iab.py` "
            "and `python3 training/calibrate_confidence.py --head iab_content` "
            "from the `agentic-intent-classifier` directory first."
        )

    pred_paths = [
        tuple(output["path"]) if output is not None else tuple()
        for output in predictions
    ]
    accepted = [bool(output and output["meets_confidence_threshold"]) for output in predictions]
    source = next((output["source"] for output in predictions if output is not None), "supervised_classifier")
    pred_labels = [" > ".join(path) if path else "__no_prediction__" for path in pred_paths]

    accepted_total_count = sum(accepted)
    accepted_accuracy = (
        sum(1 for truth, pred, keep in zip(true_paths, pred_paths, accepted) if keep and truth == pred) / accepted_total_count
        if accepted_total_count
        else 0.0
    )
    difficulty_breakdown = None
    if rows and all("difficulty" in row for row in rows):
        difficulty_breakdown = {}
        for difficulty in sorted({row["difficulty"] for row in rows}):
            indices = [idx for idx, row in enumerate(rows) if row["difficulty"] == difficulty]
            difficulty_true_paths = [true_paths[idx] for idx in indices]
            difficulty_pred_paths = [pred_paths[idx] for idx in indices]
            difficulty_true_labels = [true_labels[idx] for idx in indices]
            difficulty_pred_labels = [pred_labels[idx] for idx in indices]
            difficulty_accepted = [accepted[idx] for idx in indices]
            difficulty_accepted_count = sum(difficulty_accepted)
            difficulty_accepted_accuracy = (
                sum(
                    1
                    for truth, pred, keep in zip(difficulty_true_paths, difficulty_pred_paths, difficulty_accepted)
                    if keep and truth == pred
                )
                / difficulty_accepted_count
                if difficulty_accepted_count
                else 0.0
            )
            difficulty_breakdown[difficulty] = {
                "count": len(indices),
                "accuracy": round(
                    float(sum(1 for truth, pred in zip(difficulty_true_paths, difficulty_pred_paths) if truth == pred) / max(len(indices), 1)),
                    4,
                ),
                "macro_f1": round(float(f1_score(difficulty_true_labels, difficulty_pred_labels, average="macro")), 4),
                "accepted_coverage": round(float(difficulty_accepted_count / max(len(indices), 1)), 4),
                "accepted_accuracy": round(float(difficulty_accepted_accuracy), 4),
                "fallback_rate": round(float(1 - (difficulty_accepted_count / max(len(indices), 1))), 4),
            }
    summary = {
        "head": "iab_content",
        "suite": suite_name,
        "dataset_path": str(dataset_path),
        "count": len(rows),
        "accuracy": round(float(sum(1 for truth, pred in zip(true_paths, pred_paths) if truth == pred) / max(len(rows), 1)), 4),
        "macro_f1": round(float(f1_score(true_labels, pred_labels, average="macro")), 4),
        "accepted_coverage": round(float(accepted_total_count / max(len(rows), 1)), 4),
        "accepted_accuracy": round(float(accepted_accuracy), 4),
        "fallback_rate": round(float(1 - (accepted_total_count / max(len(rows), 1))), 4),
        "primary_source": source,
        "tier_metrics": compute_path_metrics(true_paths, pred_paths),
        "view_metrics": evaluate_iab_views(rows),
    }
    if difficulty_breakdown is not None:
        summary["difficulty_breakdown"] = difficulty_breakdown
    write_json(output_dir / f"iab_content_{suite_name}_report.json", summary)
    return summary


def evaluate_combined_benchmark(path: Path, output_dir: Path) -> dict:
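    """Run the combined classifier over the demo benchmark and record each validated response."""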
    benchmark = json.loads(path.read_text(encoding="utf-8"))
    outputs = []
    fallback_applied = 0
    for item in benchmark:
        payload = validate_classify_response(classify_query(item["input"]))
        if payload["model_output"].get("fallback"):
            fallback_applied += 1
        outputs.append(
            {
                "input": item["input"],
                "expected_behavior": item["expected_behavior"],
                "response": payload,
            }
        )
    write_json(output_dir / "combined_demo_benchmark.json", outputs)
    return {
        "benchmark_path": str(path),
        "count": len(outputs),
        "fallback_rate": round(fallback_applied / len(outputs), 4) if outputs else 0.0,
        "output_path": str(output_dir / "combined_demo_benchmark.json"),
    }


def main() -> None:
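    """Run all head, IAB, and combined evaluations, write artifacts, and print a compact summary."""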
    parser = argparse.ArgumentParser(description="Run repeatable evaluation for classifier heads and combined output.")
    parser.add_argument(
        "--output-dir",
        default=str(EVALUATION_ARTIFACTS_DIR / "latest"),
        help="Directory to write evaluation artifacts into.",
    )
    parser.add_argument(
        "--skip-iab-train-eval",
        action="store_true",
        help="Skip the IAB train split (largest JSONL). Use on low-RAM hosts (e.g. Colab free tier).",
    )
    args = parser.parse_args()

    ensure_artifact_dirs()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    summary = {"heads": {}, "combined": {}}
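    # Evaluate each lightweight head on its splits and stress suites; the IAB head is handled separately below.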
    for head_name, config in HEAD_CONFIGS.items():
        if head_name == "iab_content":
            continue
        head_summary = {}
        for split_name, split_path in config.split_paths.items():
            head_summary[split_name] = evaluate_head_dataset(head_name, split_path, split_name, output_dir)
        for suite_name, suite_path in config.stress_suite_paths.items():
            head_summary[suite_name] = evaluate_head_dataset(head_name, suite_path, suite_name, output_dir)
        summary["heads"][head_name] = head_summary
        gc.collect()
        _maybe_free_cuda_memory()

    iab_summary = {}
    for split_name, split_path in IAB_HEAD_CONFIG.split_paths.items():
        if args.skip_iab_train_eval and split_name == "train":
            continue
        iab_summary[split_name] = evaluate_iab_dataset(split_path, split_name, output_dir)
        gc.collect()
        _maybe_free_cuda_memory()
    for suite_name, suite_path in IAB_HEAD_CONFIG.stress_suite_paths.items():
        iab_summary[suite_name] = evaluate_iab_dataset(suite_path, suite_name, output_dir)
        gc.collect()
        _maybe_free_cuda_memory()
    summary["heads"]["iab_content"] = iab_summary

    summary["combined"]["demo_benchmark"] = evaluate_combined_benchmark(DEFAULT_BENCHMARK_PATH, output_dir)
    summary["combined"]["known_failure_regression"] = evaluate_known_failure_cases(KNOWN_FAILURE_CASES_PATH, output_dir)
    summary["combined"]["iab_behavior_lock_regression"] = evaluate_iab_behavior_lock_cases(
        IAB_BEHAVIOR_LOCK_CASES_PATH,
        output_dir,
    )
    summary["combined"]["iab_cross_vertical_behavior_lock_regression"] = evaluate_iab_cross_vertical_behavior_lock_cases(
        IAB_CROSS_VERTICAL_BEHAVIOR_LOCK_CASES_PATH,
        output_dir,
    )
    summary["combined"]["iab_quality_target_eval"] = evaluate_iab_quality_target_cases(
        IAB_QUALITY_TARGET_CASES_PATH,
        output_dir,
    )
    summary["combined"]["iab_cross_vertical_quality_target_eval"] = evaluate_iab_cross_vertical_quality_target_cases(
        IAB_CROSS_VERTICAL_QUALITY_TARGET_CASES_PATH,
        output_dir,
    )
    write_json(output_dir / "summary.json", summary)
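    # Print a condensed view to stdout: test-split metrics per head plus regression pass/fail counts.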
    compact_summary = {
        "heads": {
            head_name: {
                "test": {
                    key: head_summary["test"][key]
                    for key in (
                        "count",
                        "accuracy",
                        "macro_f1",
                        "accepted_accuracy",
                        "accepted_coverage",
                        "fallback_rate",
                    )
                }
                | (
                    {"tier_metrics": head_summary["test"]["tier_metrics"]}
                    if "tier_metrics" in head_summary["test"]
                    else {}
                )
            }
            for head_name, head_summary in summary["heads"].items()
        },
        "combined": {
            "demo_benchmark": summary["combined"]["demo_benchmark"],
            "known_failure_regression": {
                "count": summary["combined"]["known_failure_regression"]["count"],
                "passed": summary["combined"]["known_failure_regression"]["passed"],
                "failed": summary["combined"]["known_failure_regression"]["failed"],
                "by_status": summary["combined"]["known_failure_regression"]["by_status"],
            },
            "iab_behavior_lock_regression": {
                "count": summary["combined"]["iab_behavior_lock_regression"]["count"],
                "passed": summary["combined"]["iab_behavior_lock_regression"]["passed"],
                "failed": summary["combined"]["iab_behavior_lock_regression"]["failed"],
                "by_status": summary["combined"]["iab_behavior_lock_regression"]["by_status"],
            },
            "iab_cross_vertical_behavior_lock_regression": {
                "count": summary["combined"]["iab_cross_vertical_behavior_lock_regression"]["count"],
                "passed": summary["combined"]["iab_cross_vertical_behavior_lock_regression"]["passed"],
                "failed": summary["combined"]["iab_cross_vertical_behavior_lock_regression"]["failed"],
                "by_status": summary["combined"]["iab_cross_vertical_behavior_lock_regression"]["by_status"],
            },
            "iab_quality_target_eval": {
                "count": summary["combined"]["iab_quality_target_eval"]["count"],
                "passed": summary["combined"]["iab_quality_target_eval"]["passed"],
                "failed": summary["combined"]["iab_quality_target_eval"]["failed"],
                "by_status": summary["combined"]["iab_quality_target_eval"]["by_status"],
            },
            "iab_cross_vertical_quality_target_eval": {
                "count": summary["combined"]["iab_cross_vertical_quality_target_eval"]["count"],
                "passed": summary["combined"]["iab_cross_vertical_quality_target_eval"]["passed"],
                "failed": summary["combined"]["iab_cross_vertical_quality_target_eval"]["failed"],
                "by_status": summary["combined"]["iab_cross_vertical_quality_target_eval"]["by_status"],
            },
        },
        "summary_path": str(output_dir / "summary.json"),
    }
    print(json.dumps(compact_summary, indent=2))


if __name__ == "__main__":
    main()