# CabinLavatoryPrediction / code / visualize_results.py
# Uploaded by sutama: CabinLavatoryPrediction LoRA adapter, checkpoint, code,
# and evaluation artifacts. Commit: e74a796 (verified)
#!/usr/bin/env python3
import argparse
import json
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Accuracy/F1-style metrics (expected in [0, 1]) reported for the
# structured-prediction ("struct") task runs.
STRUCT_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "current_behavior_accuracy",
    "current_behavior_macro_f1",
    "next_possible_behavior_accuracy",
    "next_possible_behavior_macro_f1",
    "is_transition_accuracy",
    "is_transition_macro_f1",
    "stage_index_accuracy",
    "total_stages_accuracy",
    "sequence_exact_match",
    "sequence_last_label_accuracy",
    "sequence_prefix_label_match",
]
# Mean-absolute-error metrics for the struct task's time-related fields
# (presumably seconds, per the field names — confirm against the evaluator).
MAE_KEYS = [
    "elapsed_seconds_in_current_behavior_mae",
    "estimated_remaining_seconds_mae",
    "full_remaining_seconds_mae",
    "expected_end_time_mae",
]
# Accuracy/F1-style metrics reported for the QA task runs.
QA_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "occupied_accuracy",
    "occupied_f1",
    "is_abnormal_accuracy",
    "is_abnormal_f1",
    "used_areas_micro_f1",
]
def load_metric(path):
    """Read an evaluation-metrics JSON file and unpack its key fields.

    Returns a ``(run_name, task_type, metrics)`` tuple taken from the
    payload's top-level keys; raises KeyError if any of them is missing.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    return data["run_name"], data["task_type"], data["metrics"]
def save_bar(df, x, y, hue, title, path):
    """Render a grouped bar chart of *df* and save it to *path* as PNG.

    The figure width grows with the number of distinct *x* categories,
    and the y-axis always spans at least [0, 1].
    """
    n_categories = len(df[x].unique())
    plt.figure(figsize=(max(9, n_categories * 0.8), 5))
    sns.barplot(data=df, x=x, y=y, hue=hue)
    plt.title(title)
    plt.xticks(rotation=35, ha="right")
    # Leave 15% headroom above the tallest bar, but never shrink below 1.0.
    upper = df[y].max() * 1.15 if len(df) else 1.0
    plt.ylim(0, max(1.0, upper))
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()
def prediction_rows(path):
    """Load a JSONL predictions file into a list of (target, pred) pairs.

    Each line is parsed as JSON; the record's "target" and "prediction"
    fields are returned as dicts, substituting an empty dict whenever a
    field is missing or not a dict.
    """
    pairs = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            pred = record.get("prediction")
            if not isinstance(pred, dict):
                pred = {}
            target = record.get("target")
            if not isinstance(target, dict):
                target = {}
            pairs.append((target, pred))
    return pairs
def save_confusion(prediction_path, title, out_path, top_n=20):
    """Plot a confusion matrix for the "current_behavior" field.

    Fix over the original: the old filter checked only that the
    *prediction* carried a non-None "current_behavior", so records whose
    target lacked the field leaked ``None`` into ``y_true`` — mixing
    ``None`` with string labels can make sklearn's label validation raise
    and skews the matrix. Now a record counts only when BOTH sides have a
    non-None label. The matrix is restricted to the *top_n* most frequent
    true labels; anything rarer is ignored by ``confusion_matrix``.
    """
    rows = prediction_rows(prediction_path)
    pairs = [
        (t["current_behavior"], p["current_behavior"])
        for t, p in rows
        if t.get("current_behavior") is not None
        and p.get("current_behavior") is not None
    ]
    if not pairs:
        return  # nothing usable to plot
    y_true = [true for true, _ in pairs]
    y_pred = [pred for _, pred in pairs]
    labels = pd.Series(y_true).value_counts().head(top_n).index.tolist()
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(11, 9))
    sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()
def main():
    """CLI entry point: aggregate metric JSONs into a comparison CSV and figures."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--metrics", nargs="+", required=True)
    parser.add_argument("--predictions", nargs="*", default=[])
    parser.add_argument("--out-dir", default="outputs/figures")
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Flatten every metrics file into long-form rows: one row per metric value.
    records = []
    for metric_path in args.metrics:
        run_name, task_type, metrics = load_metric(metric_path)
        if task_type == "struct":
            wanted = STRUCT_KEYS + MAE_KEYS
        else:
            wanted = QA_KEYS + ["time_to_free_minutes_mae"]
        for key in wanted:
            value = metrics.get(key)
            if value is not None:
                records.append({"run": run_name, "task": task_type, "metric": key, "value": value})

    df = pd.DataFrame(records)
    df.to_csv(out_dir / "metric_comparison.csv", index=False)

    if not df.empty:
        is_struct = df["task"] == "struct"
        is_qa = df["task"] == "qa"

        subset = df[is_struct & df["metric"].isin(STRUCT_KEYS)]
        if not subset.empty:
            save_bar(subset, "metric", "value", "run", "Structural Metrics: Base vs Fine-tuned", out_dir / "struct_scores.png")

        subset = df[is_struct & df["metric"].isin(MAE_KEYS)]
        if not subset.empty:
            save_bar(subset, "metric", "value", "run", "Time MAE: Base vs Fine-tuned", out_dir / "struct_time_mae.png")

        subset = df[is_qa & df["metric"].isin(QA_KEYS)]
        if not subset.empty:
            save_bar(subset, "metric", "value", "run", "QA Metrics: Base vs Fine-tuned", out_dir / "qa_scores.png")

        subset = df[is_qa & (df["metric"] == "time_to_free_minutes_mae")]
        if not subset.empty:
            save_bar(subset, "metric", "value", "run", "QA Time-to-Free MAE", out_dir / "qa_time_mae.png")

    # Confusion matrices only make sense for struct-task prediction dumps.
    for pred_path in args.predictions:
        path = Path(pred_path)
        if "struct" in path.name:
            save_confusion(path, path.stem, out_dir / f"{path.stem}_current_behavior_confusion.png")

    print(f"wrote figures and comparison CSV to {out_dir}")
# Standard entry-point guard: lets the module be imported without running the CLI.
if __name__ == "__main__":
    main()