import argparse
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
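
# Metric keys grouped by family: score-style metrics and time-error (MAE)
# metrics for the structured task, plus score metrics for the QA task.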
STRUCT_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "current_behavior_accuracy",
    "current_behavior_macro_f1",
    "next_possible_behavior_accuracy",
    "next_possible_behavior_macro_f1",
    "is_transition_accuracy",
    "is_transition_macro_f1",
    "stage_index_accuracy",
    "total_stages_accuracy",
    "sequence_exact_match",
    "sequence_last_label_accuracy",
    "sequence_prefix_label_match",
]
MAE_KEYS = [
    "elapsed_seconds_in_current_behavior_mae",
    "estimated_remaining_seconds_mae",
    "full_remaining_seconds_mae",
    "expected_end_time_mae",
]
QA_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "occupied_accuracy",
    "occupied_f1",
    "is_abnormal_accuracy",
    "is_abnormal_f1",
    "used_areas_micro_f1",
]


def load_metric(path):
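    """Load one metrics JSON file and return (run_name, task_type, metrics)."""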
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    return payload["run_name"], payload["task_type"], payload["metrics"]


def save_bar(df, x, y, hue, title, path):
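    """Draw a grouped bar chart of metric values per run and save it to path."""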
    plt.figure(figsize=(max(9, len(df[x].unique()) * 0.8), 5))
    sns.barplot(data=df, x=x, y=y, hue=hue)
    plt.title(title)
    plt.xticks(rotation=35, ha="right")
    plt.ylim(0, max(1.0, df[y].max() * 1.15 if len(df) else 1.0))
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


def prediction_rows(path):
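    """Read a JSONL predictions file into a list of (target, prediction) dict pairs."""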
    rows = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            rec = json.loads(line)
            pred = rec.get("prediction") if isinstance(rec.get("prediction"), dict) else {}
            target = rec.get("target") if isinstance(rec.get("target"), dict) else {}
            rows.append((target, pred))
    return rows


def save_confusion(prediction_path, title, out_path, top_n=20):
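    """Plot a confusion matrix for current_behavior over the top-N most frequent true labels."""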
    rows = prediction_rows(prediction_path)
    # Keep only pairs where both target and prediction carry a label, so the two
    # lists stay aligned and no None values reach confusion_matrix.
    pairs = [
        (t.get("current_behavior"), p.get("current_behavior"))
        for t, p in rows
        if t.get("current_behavior") is not None and p.get("current_behavior") is not None
    ]
    if not pairs:
        return
    y_true = [t for t, _ in pairs]
    y_pred = [p for _, p in pairs]
    labels = pd.Series(y_true).value_counts().head(top_n).index.tolist()
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(11, 9))
    sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def main():
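    """Collect metrics into a comparison CSV and render bar charts and confusion matrices."""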
    parser = argparse.ArgumentParser(description="Compare evaluation runs: plot metrics and confusion matrices.")
    parser.add_argument("--metrics", nargs="+", required=True, help="metrics JSON files, one per run")
    parser.add_argument("--predictions", nargs="*", default=[], help="optional prediction JSONL files")
    parser.add_argument("--out-dir", default="outputs/figures", help="directory for figures and the comparison CSV")
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    rows = []
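    # Flatten every metrics file into long-format rows: one (run, task, metric, value) per key.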
    for metric_path in args.metrics:
        run_name, task_type, metrics = load_metric(metric_path)
        keys = STRUCT_KEYS + MAE_KEYS if task_type == "struct" else QA_KEYS + ["time_to_free_minutes_mae"]
        for key in keys:
            if key in metrics and metrics[key] is not None:
                rows.append({"run": run_name, "task": task_type, "metric": key, "value": metrics[key]})
    df = pd.DataFrame(rows)
    df.to_csv(out_dir / "metric_comparison.csv", index=False)

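    # One figure per metric family; families absent from the loaded runs are skipped.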
    if not df.empty:
        struct_score = df[(df["task"] == "struct") & (df["metric"].isin(STRUCT_KEYS))]
        if not struct_score.empty:
            save_bar(struct_score, "metric", "value", "run", "Structural Metrics: Base vs Fine-tuned", out_dir / "struct_scores.png")
        struct_mae = df[(df["task"] == "struct") & (df["metric"].isin(MAE_KEYS))]
        if not struct_mae.empty:
            save_bar(struct_mae, "metric", "value", "run", "Time MAE: Base vs Fine-tuned", out_dir / "struct_time_mae.png")
        qa_score = df[(df["task"] == "qa") & (df["metric"].isin(QA_KEYS))]
        if not qa_score.empty:
            save_bar(qa_score, "metric", "value", "run", "QA Metrics: Base vs Fine-tuned", out_dir / "qa_scores.png")
        qa_mae = df[(df["task"] == "qa") & (df["metric"] == "time_to_free_minutes_mae")]
        if not qa_mae.empty:
            save_bar(qa_mae, "metric", "value", "run", "QA Time-to-Free MAE", out_dir / "qa_time_mae.png")

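    # Confusion matrices are only meaningful for the structured-task predictions.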
    for pred_path in args.predictions:
        path = Path(pred_path)
        if "struct" in path.name:
            save_confusion(path, path.stem, out_dir / f"{path.stem}_current_behavior_confusion.png")

    print(f"wrote figures and comparison CSV to {out_dir}")


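# Example invocation (script and file names are illustrative):
#   python plot_comparison.py --metrics outputs/base_metrics.json outputs/ft_metrics.json \
#       --predictions outputs/struct_predictions.jsonl --out-dir outputs/figures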
if __name__ == "__main__":
    main()