File size: 4,817 Bytes

e74a796

#!/usr/bin/env python3
import argparse
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix


# Metric names collected for runs whose task type is "struct" and plotted
# together on the structural score bar chart (see main()).
STRUCT_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "current_behavior_accuracy",
    "current_behavior_macro_f1",
    "next_possible_behavior_accuracy",
    "next_possible_behavior_macro_f1",
    "is_transition_accuracy",
    "is_transition_macro_f1",
    "stage_index_accuracy",
    "total_stages_accuracy",
    "sequence_exact_match",
    "sequence_last_label_accuracy",
    "sequence_prefix_label_match",
]
# MAE metric names for "struct" runs; plotted on their own chart, separate
# from the STRUCT_KEYS chart.
MAE_KEYS = [
    "elapsed_seconds_in_current_behavior_mae",
    "estimated_remaining_seconds_mae",
    "full_remaining_seconds_mae",
    "expected_end_time_mae",
]
# Metric names collected for every non-"struct" run and plotted on the QA
# score chart for runs whose task type is "qa".
QA_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "occupied_accuracy",
    "occupied_f1",
    "is_abnormal_accuracy",
    "is_abnormal_f1",
    "used_areas_micro_f1",
]


def load_metric(path):
    """Read one metrics JSON file.

    Args:
        path: Location of a UTF-8 encoded JSON file holding an object with
            the keys ``run_name``, ``task_type`` and ``metrics``.

    Returns:
        A ``(run_name, task_type, metrics)`` tuple taken from that object.

    Raises:
        KeyError: If any of the three keys is absent.
    """
    with Path(path).open(encoding="utf-8") as handle:
        data = json.load(handle)
    run_name = data["run_name"]
    task_type = data["task_type"]
    metrics = data["metrics"]
    return run_name, task_type, metrics


def save_bar(df, x, y, hue, title, path):
    """Draw a grouped bar chart from ``df`` and write it to ``path``.

    Args:
        df: DataFrame holding the data to plot.
        x: Column used for the categories on the x axis.
        y: Column used for the bar heights.
        hue: Column used to split each category into coloured bars.
        title: Figure title.
        path: Destination passed to ``plt.savefig``.
    """
    # Widen the figure with the number of categories, never below 9 inches.
    category_count = len(df[x].unique())
    fig_width = max(9, category_count * 0.8)
    plt.figure(figsize=(fig_width, 5))

    sns.barplot(data=df, x=x, y=y, hue=hue)
    plt.title(title)
    plt.xticks(rotation=35, ha="right")

    # Keep at least the 0..1 range, and leave 15% headroom above the tallest
    # bar when values exceed it.
    if len(df):
        y_top = max(1.0, df[y].max() * 1.15)
    else:
        y_top = 1.0
    plt.ylim(0, y_top)

    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


def prediction_rows(path):
    """Load ``(target, prediction)`` pairs from a JSON Lines file.

    Each non-blank line is parsed as one JSON record. The record's
    ``"target"`` and ``"prediction"`` values are kept only when they are
    dicts; anything else (missing, null, string, list, ...) is replaced by an
    empty dict so callers can always use ``.get`` on both halves.

    Args:
        path: Location of a UTF-8 encoded JSONL file.

    Returns:
        A list of ``(target, prediction)`` tuples of dicts, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; json.loads would raise on them.
                continue
            rec = json.loads(line)
            if not isinstance(rec, dict):
                # A non-object record has neither field; keep the row so the
                # count still matches the number of records in the file.
                rec = {}
            pred = rec.get("prediction")
            target = rec.get("target")
            rows.append(
                (
                    target if isinstance(target, dict) else {},
                    pred if isinstance(pred, dict) else {},
                )
            )
    return rows


def save_confusion(prediction_path, title, out_path, top_n=20):
    """Plot a ``current_behavior`` confusion matrix for one predictions file.

    Only records where both the target and the prediction carry a
    ``current_behavior`` value are used. The matrix is restricted to the
    ``top_n`` most frequent true labels. Nothing is written when no usable
    record exists.

    Args:
        prediction_path: JSONL predictions file, read via ``prediction_rows``.
        title: Figure title.
        out_path: Destination passed to ``plt.savefig``.
        top_n: Maximum number of true labels shown on each axis.
    """
    # Require a label on BOTH sides. A None target would otherwise end up in
    # y_true next to real labels (or leave the label list empty, because
    # value_counts drops missing values), and confusion_matrix cannot handle
    # either case.
    pairs = [
        (true_label, pred_label)
        for true_label, pred_label in (
            (t.get("current_behavior"), p.get("current_behavior"))
            for t, p in prediction_rows(prediction_path)
        )
        if true_label is not None and pred_label is not None
    ]
    if not pairs:
        return
    y_true = [true_label for true_label, _ in pairs]
    y_pred = [pred_label for _, pred_label in pairs]

    labels = pd.Series(y_true).value_counts().head(top_n).index.tolist()
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    plt.figure(figsize=(11, 9))
    try:
        sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels)
        plt.title(title)
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.xticks(rotation=45, ha="right")
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig(out_path, dpi=180)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close()


def main():
    """Command-line entry point: collect metrics, write the CSV and figures.

    Reads every ``--metrics`` file, writes the selected metrics to
    ``metric_comparison.csv`` in ``--out-dir``, saves one bar chart per
    non-empty metric group, and saves a confusion matrix for every
    ``--predictions`` file whose name contains ``"struct"``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--metrics", nargs="+", required=True)
    cli.add_argument("--predictions", nargs="*", default=[])
    cli.add_argument("--out-dir", default="outputs/figures")
    args = cli.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    qa_mae_keys = ["time_to_free_minutes_mae"]

    records = []
    for metric_path in args.metrics:
        run_name, task_type, metrics = load_metric(metric_path)
        # Any task type other than "struct" is collected with the QA key set.
        if task_type == "struct":
            wanted = STRUCT_KEYS + MAE_KEYS
        else:
            wanted = QA_KEYS + qa_mae_keys
        records.extend(
            {"run": run_name, "task": task_type, "metric": name, "value": metrics[name]}
            for name in wanted
            if name in metrics and metrics[name] is not None
        )

    frame = pd.DataFrame(records)
    frame.to_csv(out_dir / "metric_comparison.csv", index=False)

    # (task, metric names, figure title, output file name), in plotting order.
    charts = (
        ("struct", STRUCT_KEYS, "Structural Metrics: Base vs Fine-tuned", "struct_scores.png"),
        ("struct", MAE_KEYS, "Time MAE: Base vs Fine-tuned", "struct_time_mae.png"),
        ("qa", QA_KEYS, "QA Metrics: Base vs Fine-tuned", "qa_scores.png"),
        ("qa", qa_mae_keys, "QA Time-to-Free MAE", "qa_time_mae.png"),
    )
    # An empty frame has no columns, so the filters below must be skipped.
    if not frame.empty:
        for task, metric_names, title, file_name in charts:
            subset = frame[(frame["task"] == task) & (frame["metric"].isin(metric_names))]
            if subset.empty:
                continue
            save_bar(subset, "metric", "value", "run", title, out_dir / file_name)

    for pred_file in map(Path, args.predictions):
        if "struct" not in pred_file.name:
            continue
        save_confusion(
            pred_file,
            pred_file.stem,
            out_dir / f"{pred_file.stem}_current_behavior_confusion.png",
        )

    print(f"wrote figures and comparison CSV to {out_dir}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()