#!/usr/bin/env python3 import argparse import json from pathlib import Path import matplotlib.pyplot as plt import pandas as pd import seaborn as sns from sklearn.metrics import confusion_matrix STRUCT_KEYS = [ "json_parse_rate", "required_field_complete_rate", "current_behavior_accuracy", "current_behavior_macro_f1", "next_possible_behavior_accuracy", "next_possible_behavior_macro_f1", "is_transition_accuracy", "is_transition_macro_f1", "stage_index_accuracy", "total_stages_accuracy", "sequence_exact_match", "sequence_last_label_accuracy", "sequence_prefix_label_match", ] MAE_KEYS = [ "elapsed_seconds_in_current_behavior_mae", "estimated_remaining_seconds_mae", "full_remaining_seconds_mae", "expected_end_time_mae", ] QA_KEYS = [ "json_parse_rate", "required_field_complete_rate", "occupied_accuracy", "occupied_f1", "is_abnormal_accuracy", "is_abnormal_f1", "used_areas_micro_f1", ] def load_metric(path): payload = json.loads(Path(path).read_text(encoding="utf-8")) return payload["run_name"], payload["task_type"], payload["metrics"] def save_bar(df, x, y, hue, title, path): plt.figure(figsize=(max(9, len(df[x].unique()) * 0.8), 5)) sns.barplot(data=df, x=x, y=y, hue=hue) plt.title(title) plt.xticks(rotation=35, ha="right") plt.ylim(0, max(1.0, df[y].max() * 1.15 if len(df) else 1.0)) plt.tight_layout() plt.savefig(path, dpi=180) plt.close() def prediction_rows(path): rows = [] with open(path, encoding="utf-8") as f: for line in f: rec = json.loads(line) pred = rec.get("prediction") if isinstance(rec.get("prediction"), dict) else {} target = rec.get("target") if isinstance(rec.get("target"), dict) else {} rows.append((target, pred)) return rows def save_confusion(prediction_path, title, out_path, top_n=20): rows = prediction_rows(prediction_path) y_true = [t.get("current_behavior") for t, p in rows if p.get("current_behavior") is not None] y_pred = [p.get("current_behavior") for t, p in rows if p.get("current_behavior") is not None] if not y_true: return labels = pd.Series(y_true).value_counts().head(top_n).index.tolist() cm = confusion_matrix(y_true, y_pred, labels=labels) plt.figure(figsize=(11, 9)) sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels) plt.title(title) plt.xlabel("Predicted") plt.ylabel("True") plt.xticks(rotation=45, ha="right") plt.yticks(rotation=0) plt.tight_layout() plt.savefig(out_path, dpi=180) plt.close() def main(): parser = argparse.ArgumentParser() parser.add_argument("--metrics", nargs="+", required=True) parser.add_argument("--predictions", nargs="*", default=[]) parser.add_argument("--out-dir", default="outputs/figures") args = parser.parse_args() out_dir = Path(args.out_dir) out_dir.mkdir(parents=True, exist_ok=True) rows = [] for metric_path in args.metrics: run_name, task_type, metrics = load_metric(metric_path) keys = STRUCT_KEYS + MAE_KEYS if task_type == "struct" else QA_KEYS + ["time_to_free_minutes_mae"] for key in keys: if key in metrics and metrics[key] is not None: rows.append({"run": run_name, "task": task_type, "metric": key, "value": metrics[key]}) df = pd.DataFrame(rows) df.to_csv(out_dir / "metric_comparison.csv", index=False) if not df.empty: struct_score = df[(df["task"] == "struct") & (df["metric"].isin(STRUCT_KEYS))] if not struct_score.empty: save_bar(struct_score, "metric", "value", "run", "Structural Metrics: Base vs Fine-tuned", out_dir / "struct_scores.png") struct_mae = df[(df["task"] == "struct") & (df["metric"].isin(MAE_KEYS))] if not struct_mae.empty: save_bar(struct_mae, "metric", "value", "run", "Time MAE: Base vs Fine-tuned", out_dir / "struct_time_mae.png") qa_score = df[(df["task"] == "qa") & (df["metric"].isin(QA_KEYS))] if not qa_score.empty: save_bar(qa_score, "metric", "value", "run", "QA Metrics: Base vs Fine-tuned", out_dir / "qa_scores.png") qa_mae = df[(df["task"] == "qa") & (df["metric"] == "time_to_free_minutes_mae")] if not qa_mae.empty: save_bar(qa_mae, "metric", "value", "run", "QA Time-to-Free MAE", out_dir / "qa_time_mae.png") for pred_path in args.predictions: path = Path(pred_path) if "struct" in path.name: save_confusion(path, path.stem, out_dir / f"{path.stem}_current_behavior_confusion.png") print(f"wrote figures and comparison CSV to {out_dir}") if __name__ == "__main__": main()