import argparse
import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
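
# Metric keys grouped by family: score-style metrics and time-error (MAE)
# metrics for the structured task, plus score metrics for the QA task.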
STRUCT_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "current_behavior_accuracy",
    "current_behavior_macro_f1",
    "next_possible_behavior_accuracy",
    "next_possible_behavior_macro_f1",
    "is_transition_accuracy",
    "is_transition_macro_f1",
    "stage_index_accuracy",
    "total_stages_accuracy",
    "sequence_exact_match",
    "sequence_last_label_accuracy",
    "sequence_prefix_label_match",
]
MAE_KEYS = [
    "elapsed_seconds_in_current_behavior_mae",
    "estimated_remaining_seconds_mae",
    "full_remaining_seconds_mae",
    "expected_end_time_mae",
]
QA_KEYS = [
    "json_parse_rate",
    "required_field_complete_rate",
    "occupied_accuracy",
    "occupied_f1",
    "is_abnormal_accuracy",
    "is_abnormal_f1",
    "used_areas_micro_f1",
]


def load_metric(path):
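    """Load one metrics JSON file and return (run_name, task_type, metrics)."""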
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    return payload["run_name"], payload["task_type"], payload["metrics"]


def save_bar(df, x, y, hue, title, path):
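    """Draw a grouped bar chart of metric values per run and save it to path."""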
    plt.figure(figsize=(max(9, len(df[x].unique()) * 0.8), 5))
    sns.barplot(data=df, x=x, y=y, hue=hue)
    plt.title(title)
    plt.xticks(rotation=35, ha="right")
    plt.ylim(0, max(1.0, df[y].max() * 1.15 if len(df) else 1.0))
    plt.tight_layout()
    plt.savefig(path, dpi=180)
    plt.close()


def prediction_rows(path):
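    """Read a JSONL predictions file into a list of (target, prediction) dict pairs."""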
    rows = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            rec = json.loads(line)
            pred = rec.get("prediction") if isinstance(rec.get("prediction"), dict) else {}
            target = rec.get("target") if isinstance(rec.get("target"), dict) else {}
            rows.append((target, pred))
    return rows


def save_confusion(prediction_path, title, out_path, top_n=20):
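    """Plot a confusion matrix for current_behavior over the top-N most frequent true labels."""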
    rows = prediction_rows(prediction_path)
    # Keep only pairs where both target and prediction carry a label, so the two
    # lists stay aligned and no None values reach confusion_matrix.
    pairs = [
        (t.get("current_behavior"), p.get("current_behavior"))
        for t, p in rows
        if t.get("current_behavior") is not None and p.get("current_behavior") is not None
    ]
    if not pairs:
        return
    y_true = [t for t, _ in pairs]
    y_pred = [p for _, p in pairs]
    labels = pd.Series(y_true).value_counts().head(top_n).index.tolist()
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(11, 9))
    sns.heatmap(cm, annot=False, cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(out_path, dpi=180)
    plt.close()


def main():
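    """Collect metrics into a comparison CSV and render bar charts and confusion matrices."""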
    parser = argparse.ArgumentParser(description="Compare evaluation runs: plot metrics and confusion matrices.")
    parser.add_argument("--metrics", nargs="+", required=True, help="metrics JSON files, one per run")
    parser.add_argument("--predictions", nargs="*", default=[], help="optional prediction JSONL files")
    parser.add_argument("--out-dir", default="outputs/figures", help="directory for figures and the comparison CSV")
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    rows = []
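    # Flatten every metrics file into long-format rows: one (run, task, metric, value) per key.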
    for metric_path in args.metrics:
        run_name, task_type, metrics = load_metric(metric_path)
        keys = STRUCT_KEYS + MAE_KEYS if task_type == "struct" else QA_KEYS + ["time_to_free_minutes_mae"]
        for key in keys:
            if key in metrics and metrics[key] is not None:
                rows.append({"run": run_name, "task": task_type, "metric": key, "value": metrics[key]})
    df = pd.DataFrame(rows)
    df.to_csv(out_dir / "metric_comparison.csv", index=False)

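    # One figure per metric family; families absent from the loaded runs are skipped.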
    if not df.empty:
        struct_score = df[(df["task"] == "struct") & (df["metric"].isin(STRUCT_KEYS))]
        if not struct_score.empty:
            save_bar(struct_score, "metric", "value", "run", "Structural Metrics: Base vs Fine-tuned", out_dir / "struct_scores.png")
        struct_mae = df[(df["task"] == "struct") & (df["metric"].isin(MAE_KEYS))]
        if not struct_mae.empty:
            save_bar(struct_mae, "metric", "value", "run", "Time MAE: Base vs Fine-tuned", out_dir / "struct_time_mae.png")
        qa_score = df[(df["task"] == "qa") & (df["metric"].isin(QA_KEYS))]
        if not qa_score.empty:
            save_bar(qa_score, "metric", "value", "run", "QA Metrics: Base vs Fine-tuned", out_dir / "qa_scores.png")
        qa_mae = df[(df["task"] == "qa") & (df["metric"] == "time_to_free_minutes_mae")]
        if not qa_mae.empty:
            save_bar(qa_mae, "metric", "value", "run", "QA Time-to-Free MAE", out_dir / "qa_time_mae.png")

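    # Confusion matrices are only meaningful for the structured-task predictions.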
    for pred_path in args.predictions:
        path = Path(pred_path)
        if "struct" in path.name:
            save_confusion(path, path.stem, out_dir / f"{path.stem}_current_behavior_confusion.png")

    print(f"wrote figures and comparison CSV to {out_dir}")


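# Example invocation (script and file names are illustrative):
#   python plot_comparison.py --metrics outputs/base_metrics.json outputs/ft_metrics.json \
#       --predictions outputs/struct_predictions.jsonl --out-dir outputs/figures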
if __name__ == "__main__":
    main()