# eshwar-gz2-api β€” src/evaluate_full.py
# Uploaded by sreshwarprasad via huggingface_hub (revision e36eee4, verified).
"""
src/evaluate_full.py
--------------------
Full evaluation of all trained models on the held-out test set.
Generates all paper figures and tables:
Tables
------
table_metrics_proposed.csv β€” MAE / RMSE / bias / ECE for proposed model
table_reached_branch_mae.csv β€” reached-branch MAE across all 5 models
table_simplex_violation.csv β€” simplex validity for sigmoid baseline
Figures (PDF + PNG, IEEE naming convention)
-------------------------------------------
fig_scatter_predicted_vs_true.pdf β€” predicted vs true vote fractions (proposed)
fig_calibration_reliability.pdf β€” reliability diagrams, all models
fig_ece_comparison.pdf β€” ECE bar chart, all models
fig_attention_rollout_gallery.pdf β€” full 12-layer attention rollout gallery
fig_attention_entropy_depth.pdf β€” CLS attention entropy vs. layer depth
Usage
-----
cd ~/galaxy
nohup python -m src.evaluate_full --config configs/full_train.yaml \
> outputs/logs/evaluate.log 2>&1 &
echo "PID: $!"
"""
import argparse
import logging
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from torch.amp import autocast
from omegaconf import OmegaConf
from tqdm import tqdm
from src.dataset import build_dataloaders, QUESTION_GROUPS
from src.model import build_model, build_dirichlet_model
from src.metrics import (compute_metrics, predictions_to_numpy,
compute_reached_branch_mae_table,
dirichlet_predictions_to_numpy,
simplex_violation_rate, _compute_ece)
from src.attention_viz import plot_attention_grid, plot_attention_entropy
from src.baselines import ResNet18Baseline
# Root logger writes timestamped lines to stdout so the nohup log file
# (see module docstring) interleaves cleanly with tqdm/torch output.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
    datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout,
)
log = logging.getLogger("evaluate_full")

# ── Global matplotlib style ────────────────────────────────────────────────────
# Shared serif / white-background style applied to every figure in this module,
# matching IEEE journal figure conventions.
plt.rcParams.update({
    "figure.dpi"       : 150,
    "savefig.dpi"      : 300,
    "font.family"      : "serif",
    "font.size"        : 11,
    "axes.titlesize"   : 11,
    "axes.labelsize"   : 11,
    "xtick.labelsize"  : 9,
    "ytick.labelsize"  : 9,
    "legend.fontsize"  : 9,
    "figure.facecolor" : "white",
    "axes.facecolor"   : "white",
    "axes.grid"        : True,
    "grid.alpha"       : 0.3,
    "pdf.fonttype"     : 42,  # editable text in PDF
    "ps.fonttype"      : 42,
})

# Human-readable captions for the 11 Galaxy Zoo 2 decision-tree questions;
# keys match the question keys used by QUESTION_GROUPS.
QUESTION_LABELS = {
    "t01": "Smooth or features",
    "t02": "Edge-on disk",
    "t03": "Bar",
    "t04": "Spiral arms",
    "t05": "Bulge prominence",
    "t06": "Odd feature",
    "t07": "Roundedness",
    "t08": "Odd feature type",
    "t09": "Bulge shape",
    "t10": "Arms winding",
    "t11": "Arms number",
}

# Consistent colours and line styles for all models across all figures
MODEL_COLORS = {
    "ResNet-18 + MSE (sigmoid)"          : "#c0392b",
    "ResNet-18 + KL+MSE"                 : "#e67e22",
    "ViT-Base + MSE only"                : "#2980b9",
    "ViT-Base + KL+MSE (proposed)"       : "#27ae60",
    "ViT-Base + Dirichlet (Zoobot-style)": "#8e44ad",
}
# Line styles are keyed identically so figures can zip colour + dash per model.
MODEL_STYLES = {
    "ResNet-18 + MSE (sigmoid)"          : "-",
    "ResNet-18 + KL+MSE"                 : "-.",
    "ViT-Base + MSE only"                : "--",
    "ViT-Base + KL+MSE (proposed)"       : "-",
    "ViT-Base + Dirichlet (Zoobot-style)": ":",
}
# ─────────────────────────────────────────────────────────────
# Inference helpers
# ─────────────────────────────────────────────────────────────
def _infer_vit(model, loader, device, cfg,
               collect_attn=True, n_attn=16):
    """Run ViT inference over *loader*, optionally harvesting attention maps.

    Parameters
    ----------
    model : ViT exposing ``get_all_attention_weights()`` (may return None).
    loader : DataLoader yielding (images, targets, weights, image_ids).
    device : torch device for inference.
    cfg : config; only ``cfg.training.mixed_precision`` is read here.
    collect_attn : keep attention tensors for the first ``n_attn`` samples.
    n_attn : maximum number of samples whose attention maps are retained.

    Returns
    -------
    (preds, targets, weights, attn_imgs, layer_attns, attn_ids) β€”
    the first three are numpy arrays concatenated over the whole loader;
    the last three are None / empty when no attention was collected.
    """
    model.eval()
    all_preds, all_targets, all_weights = [], [], []
    attn_images, all_layer_attns, attn_ids = [], [], []
    # When attention is not requested, treat collection as already finished.
    attn_done = not collect_attn
    with torch.no_grad():
        for images, targets, weights, image_ids in tqdm(loader, desc="ViT inference"):
            images = images.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            weights = weights.to(device, non_blocking=True)
            with autocast("cuda", enabled=cfg.training.mixed_precision):
                logits = model(images)
            p, t, w = predictions_to_numpy(logits, targets, weights)
            all_preds.append(p)
            all_targets.append(t)
            all_weights.append(w)
            if not attn_done:
                layers = model.get_all_attention_weights()
                if layers is not None:
                    # Bug fix: take only as many samples as are still needed.
                    # Previously every qualifying batch contributed up to
                    # min(n_attn, batch_size) ids, so with batches smaller
                    # than n_attn the attn_ids list could grow past n_attn
                    # while the image/attention tensors were later truncated
                    # to n_attn β€” leaving ids misaligned with images.
                    n = min(n_attn - len(attn_ids), images.shape[0])
                    if n > 0:
                        attn_images.append(images[:n].cpu())
                        all_layer_attns.append([l[:n].cpu() for l in layers])
                        attn_ids.extend(int(i) for i in image_ids[:n])
                    if len(attn_ids) >= n_attn:
                        attn_done = True
    preds = np.concatenate(all_preds)
    targets = np.concatenate(all_targets)
    weights = np.concatenate(all_weights)
    attn_imgs_t = torch.cat(attn_images, dim=0)[:n_attn] if attn_images else None
    merged_layers = None
    if all_layer_attns:
        # Re-assemble one (≀ n_attn)-sample tensor per transformer layer.
        merged_layers = [
            torch.cat([b[li] for b in all_layer_attns], dim=0)[:n_attn]
            for li in range(len(all_layer_attns[0]))
        ]
    return preds, targets, weights, attn_imgs_t, merged_layers, attn_ids
def _infer_resnet(model, loader, device, cfg, use_sigmoid: bool):
    """Run a ResNet baseline over *loader* and stack the outputs.

    use_sigmoid=True  -> independent per-answer sigmoid probabilities
    (the scientifically-invalid baseline); use_sigmoid=False -> softmax
    over each question's answer slice. Returns (preds, targets, weights)
    as concatenated numpy arrays.
    """
    model.eval()
    pred_chunks, target_chunks, weight_chunks = [], [], []
    with torch.no_grad():
        for images, targets, weights, _ in tqdm(loader, desc="ResNet inference"):
            images = images.to(device, non_blocking=True)
            with autocast("cuda", enabled=cfg.training.mixed_precision):
                logits = model(images)
            if use_sigmoid:
                batch_pred = torch.sigmoid(logits).cpu().numpy()
            else:
                # Normalise each question's answer slice onto the simplex.
                buf = logits.detach().cpu().clone()
                for q, (s, e) in QUESTION_GROUPS.items():
                    buf[:, s:e] = F.softmax(buf[:, s:e], dim=-1)
                batch_pred = buf.numpy()
            pred_chunks.append(batch_pred)
            target_chunks.append(targets.numpy())
            weight_chunks.append(weights.numpy())
    return (np.concatenate(pred_chunks),
            np.concatenate(target_chunks),
            np.concatenate(weight_chunks))
def _infer_dirichlet(model, loader, device, cfg):
    """Dirichlet-head inference: the model emits concentration parameters
    Ξ±, converted to expected vote fractions by dirichlet_predictions_to_numpy.
    Returns (preds, targets, weights) concatenated over the loader.
    """
    model.eval()
    chunks = {"p": [], "t": [], "w": []}
    with torch.no_grad():
        for images, targets, weights, _ in tqdm(loader, desc="Dirichlet inference"):
            images = images.to(device, non_blocking=True)
            with autocast("cuda", enabled=cfg.training.mixed_precision):
                alpha = model(images)
            p, t, w = dirichlet_predictions_to_numpy(alpha, targets, weights)
            chunks["p"].append(p)
            chunks["t"].append(t)
            chunks["w"].append(w)
    return tuple(np.concatenate(chunks[k]) for k in ("p", "t", "w"))
# ─────────────────────────────────────────────────────────────
# Figure 1: Predicted vs true scatter (proposed model)
# ─────────────────────────────────────────────────────────────
def fig_scatter_predicted_vs_true(preds, targets, weights, save_dir):
    """Figure 1: per-question scatter of predicted vs. true vote fractions
    for the proposed model, restricted to reached branches (w β‰₯ 0.05).
    Saves PDF + PNG; skips entirely if both already exist.
    """
    path_pdf = save_dir / "fig_scatter_predicted_vs_true.pdf"
    path_png = save_dir / "fig_scatter_predicted_vs_true.png"
    if path_pdf.exists() and path_png.exists():
        log.info("Skip (exists): fig_scatter_predicted_vs_true")
        return
    fig, axes = plt.subplots(3, 4, figsize=(16, 12))
    panels = axes.flatten()
    for idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
        ax = panels[idx]
        # Keep only samples where enough voters reached this branch.
        reached = weights[:, idx] >= 0.05
        p_flat = preds[reached, start:end].flatten()
        t_flat = targets[reached, start:end].flatten()
        # Rasterize the dense cloud so the PDF stays small.
        ax.scatter(t_flat, p_flat, alpha=0.06, s=1, color="#2563eb", rasterized=True)
        ax.plot([0, 1], [0, 1], "r--", linewidth=1, alpha=0.8)
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.set_xlabel("True vote fraction")
        ax.set_ylabel("Predicted vote fraction")
        ax.set_title(
            f"{q_name}: {QUESTION_LABELS[q_name]}\n"
            f"$n$ = {reached.sum():,} (w β‰₯ 0.05)",
            fontsize=9,
        )
        ax.set_aspect("equal")
        panel_mae = np.abs(p_flat - t_flat).mean()
        ax.text(0.05, 0.92, f"MAE = {panel_mae:.3f}",
                transform=ax.transAxes, fontsize=8,
                bbox=dict(boxstyle="round,pad=0.2", facecolor="white",
                          edgecolor="grey", alpha=0.85))
    # 11 questions on a 3Γ—4 grid β€” blank the unused final panel.
    panels[-1].axis("off")
    plt.suptitle(
        "Predicted vs. true vote fractions β€” reached branches (w β‰₯ 0.05)\n"
        "ViT-Base/16 + hierarchical KL+MSE (proposed model, test set)",
        fontsize=12,
    )
    plt.tight_layout()
    fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
    fig.savefig(path_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: fig_scatter_predicted_vs_true")
# ─────────────────────────────────────────────────────────────
# Figure 2: Calibration reliability diagrams
# ─────────────────────────────────────────────────────────────
def fig_calibration_reliability(model_results, save_dir, n_bins=15):
    """Figure 2: reliability diagrams (mean predicted vs. mean true fraction)
    for all models on 8 representative questions, using adaptive
    equal-frequency bins so every bin holds roughly the same number of
    predictions.

    model_results : dict mapping model name -> (preds, targets, weights)
                    numpy arrays as produced by the _infer_* helpers.
    n_bins        : target number of adaptive bins per curve.
    """
    path_pdf = save_dir / "fig_calibration_reliability.pdf"
    path_png = save_dir / "fig_calibration_reliability.png"
    if path_pdf.exists() and path_png.exists():
        log.info("Skip (exists): fig_calibration_reliability"); return
    # Show 8 representative questions (skip t02 β€” bimodal, shown separately)
    q_show = ["t01", "t03", "t04", "t06", "t07", "t09", "t10", "t11"]
    fig, axes = plt.subplots(2, 4, figsize=(16, 8))
    axes = axes.flatten()
    for ax_idx, q_name in enumerate(q_show):
        ax = axes[ax_idx]
        start, end = QUESTION_GROUPS[q_name]
        # Position of this question in the per-question weight matrix.
        q_idx = list(QUESTION_GROUPS.keys()).index(q_name)
        for model_name, (preds, targets, weights) in model_results.items():
            # Reached branches only β€” same threshold used everywhere else.
            mask = weights[:, q_idx] >= 0.05
            if mask.sum() < 50:
                continue  # too few samples for a meaningful curve
            pf = preds[mask, start:end].flatten()
            tf = targets[mask, start:end].flatten()
            # Adaptive bins (equal-frequency) β€” consistent with ECE computation
            percentiles = np.linspace(0, 100, n_bins + 1)
            # np.unique collapses duplicate edges when predictions cluster.
            bin_edges = np.unique(np.percentile(pf, percentiles))
            if len(bin_edges) < 2:
                continue  # degenerate case: (nearly) constant predictions
            # digitize against interior edges; clip keeps extremes in range.
            bin_ids = np.clip(
                np.digitize(pf, bin_edges[1:-1]), 0, len(bin_edges) - 2
            )
            # Per-bin mean predicted / mean true values (NaN for empty bins).
            mp = np.array([
                pf[bin_ids == b].mean() if (bin_ids == b).any() else np.nan
                for b in range(len(bin_edges) - 1)
            ])
            mt = np.array([
                tf[bin_ids == b].mean() if (bin_ids == b).any() else np.nan
                for b in range(len(bin_edges) - 1)
            ])
            valid = ~np.isnan(mp) & ~np.isnan(mt)
            ax.plot(
                mp[valid], mt[valid],
                MODEL_STYLES.get(model_name, "-"),
                color=MODEL_COLORS.get(model_name, "#888888"),
                linewidth=1.8, marker="o", markersize=3.5,
                label=model_name, alpha=0.9,
            )
        # Perfect-calibration diagonal for reference.
        ax.plot([0, 1], [0, 1], "k--", linewidth=1, alpha=0.5, label="Perfect")
        ax.set_xlim(0, 1); ax.set_ylim(0, 1)
        ax.set_xlabel("Mean predicted", fontsize=8)
        ax.set_ylabel("Mean true", fontsize=8)
        ax.set_title(f"{q_name}: {QUESTION_LABELS[q_name]}", fontsize=9)
        ax.set_aspect("equal")
        # Legend once, on the first panel only, to avoid clutter.
        if ax_idx == 0:
            ax.legend(fontsize=6.5, loc="upper left")
    plt.suptitle(
        "Calibration reliability diagrams β€” all models (test set)\n"
        "Reached branches only (w β‰₯ 0.05). Adaptive equal-frequency bins. "
        "Closer to diagonal = better calibrated.",
        fontsize=11,
    )
    plt.tight_layout()
    fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
    fig.savefig(path_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: fig_calibration_reliability")
# ─────────────────────────────────────────────────────────────
# Figure 3: ECE bar chart
# ─────────────────────────────────────────────────────────────
def fig_ece_comparison(model_results, save_dir):
    """Figure 3: grouped bar chart of per-question ECE for every model.

    Also writes table_ece_comparison.csv with the underlying numbers
    before plotting. Skips when both figure files already exist.
    """
    path_pdf = save_dir / "fig_ece_comparison.pdf"
    path_png = save_dir / "fig_ece_comparison.png"
    if path_pdf.exists() and path_png.exists():
        log.info("Skip (exists): fig_ece_comparison")
        return
    q_names = list(QUESTION_GROUPS.keys())

    # Per-model, per-question ECE (NaN where too few reached samples).
    ece_rows = []
    for model_name, (preds, targets, weights) in model_results.items():
        row = {"model": model_name}
        for q_idx, (q_name, (start, end)) in enumerate(QUESTION_GROUPS.items()):
            mask = weights[:, q_idx] >= 0.05
            if mask.sum() < 50:
                row[q_name] = float("nan")
            else:
                row[q_name] = _compute_ece(
                    preds[mask, start:end].flatten(),
                    targets[mask, start:end].flatten(),
                    n_bins=15,
                )
        row["mean_ece"] = float(np.nanmean([row[q] for q in q_names]))
        ece_rows.append(row)
    df_ece = pd.DataFrame(ece_rows)
    df_ece.to_csv(save_dir / "table_ece_comparison.csv", index=False)

    # Grouped bars: one cluster per question, one bar per model.
    x = np.arange(len(q_names))
    width = 0.80 / len(model_results)
    palette = list(MODEL_COLORS.values())
    fig, ax = plt.subplots(figsize=(14, 5))
    # ece_rows is in model_results order, so the legend order matches.
    for i, row in enumerate(ece_rows):
        model_name = row["model"]
        vals = [float(row[q]) for q in q_names]
        ax.bar(
            x + i * width, vals, width,
            label=model_name,
            color=MODEL_COLORS.get(model_name, palette[i % len(palette)]),
            alpha=0.85, edgecolor="white", linewidth=0.5,
        )
    # Centre the tick under each cluster of bars.
    ax.set_xticks(x + width * (len(model_results) - 1) / 2)
    ax.set_xticklabels(
        [f"{q}\n({QUESTION_LABELS[q][:12]})" for q in q_names],
        rotation=30, ha="right", fontsize=8,
    )
    ax.set_ylabel("Expected Calibration Error (ECE)", fontsize=11)
    ax.set_title(
        "Expected Calibration Error β€” all models (test set)\n"
        "Reached branches (w β‰₯ 0.05). Adaptive equal-frequency binning. "
        "Lower is better.",
        fontsize=11,
    )
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3, axis="y")
    ax.set_axisbelow(True)
    plt.tight_layout()
    fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
    fig.savefig(path_png, dpi=300, bbox_inches="tight")
    plt.close(fig)
    log.info("Saved: fig_ece_comparison")
# ─────────────────────────────────────────────────────────────
# Figure 4: Attention rollout gallery
# ─────────────────────────────────────────────────────────────
def fig_attention_rollout_gallery(attn_imgs, all_layers, attn_ids, save_dir):
    """Figure 4: attention-rollout gallery, saved twice β€” a 300 dpi
    PDF/PNG pair and a 600 dpi PNG for journal submission. Each output
    is skipped independently if it already exists.
    """
    if attn_imgs is None or all_layers is None:
        log.warning("No attention data β€” skipping gallery.")
        return
    path_pdf = save_dir / "fig_attention_rollout_gallery.pdf"
    path_png = save_dir / "fig_attention_rollout_gallery.png"
    if not path_pdf.exists():
        # plot_attention_grid writes the PNG itself via save_path.
        grid_fig = plot_attention_grid(
            attn_imgs, all_layers, attn_ids,
            save_path=str(path_png),
            n_cols=4, rollout_mode="full",
        )
        grid_fig.savefig(path_pdf, dpi=300, bbox_inches="tight", facecolor="black")
        plt.close(grid_fig)
        log.info("Saved: fig_attention_rollout_gallery")
    # High-resolution PNG for journal submission
    path_hq = save_dir / "fig_attention_rollout_gallery_HQ.png"
    if not path_hq.exists():
        hq_fig = plot_attention_grid(
            attn_imgs, all_layers, attn_ids,
            n_cols=4, rollout_mode="full",
        )
        hq_fig.savefig(path_hq, dpi=600, bbox_inches="tight", facecolor="black")
        plt.close(hq_fig)
        log.info("Saved: fig_attention_rollout_gallery_HQ (600 dpi)")
# ─────────────────────────────────────────────────────────────
# Figure 5: Attention entropy vs. depth
# ─────────────────────────────────────────────────────────────
def fig_attention_entropy_depth(all_layers, save_dir):
    """Figure 5: CLS-attention entropy as a function of transformer depth.
    Thin wrapper around plot_attention_entropy; saves PDF + PNG.
    """
    if all_layers is None:
        log.warning("No attention layers β€” skipping entropy plot.")
        return
    path_pdf = save_dir / "fig_attention_entropy_depth.pdf"
    path_png = save_dir / "fig_attention_entropy_depth.png"
    if path_pdf.exists() and path_png.exists():
        log.info("Skip (exists): fig_attention_entropy_depth")
        return
    # The helper writes the PNG; we add the PDF alongside it.
    entropy_fig = plot_attention_entropy(all_layers, save_path=str(path_png))
    entropy_fig.savefig(path_pdf, dpi=300, bbox_inches="tight")
    plt.close(entropy_fig)
    log.info("Saved: fig_attention_entropy_depth")
# ─────────────────────────────────────────────────────────────
# Table: metrics for proposed model
# ─────────────────────────────────────────────────────────────
def table_metrics_proposed(preds, targets, weights, save_dir):
    """Write table_metrics_proposed.csv: per-question MAE / RMSE / bias /
    ECE for the proposed model plus a weighted-average summary row.
    Returns the full metrics dict from compute_metrics.
    """
    metrics = compute_metrics(preds, targets, weights)
    # One row per question, values rounded for the paper table.
    rows = [
        {
            "question"   : q_name,
            "description": QUESTION_LABELS[q_name],
            "MAE"        : round(metrics[f"mae/{q_name}"], 5),
            "RMSE"       : round(metrics[f"rmse/{q_name}"], 5),
            "bias"       : round(metrics[f"bias/{q_name}"], 5),
            "ECE"        : round(metrics[f"ece/{q_name}"], 5),
        }
        for q_name in QUESTION_GROUPS
    ]
    # Summary row (bias has no weighted average, so it is left blank).
    rows.append({
        "question": "weighted_avg", "description": "Weighted average",
        "MAE" : round(metrics["mae/weighted_avg"], 5),
        "RMSE": round(metrics["rmse/weighted_avg"], 5),
        "bias": "",
        "ECE" : round(metrics["ece/mean"], 5),
    })
    df = pd.DataFrame(rows)
    df.to_csv(save_dir / "table_metrics_proposed.csv", index=False)
    log.info("\n%s\n", df.to_string(index=False))
    return metrics
# ─────────────────────────────────────────────────────────────
# Table: simplex violation for sigmoid baseline
# ─────────────────────────────────────────────────────────────
def table_simplex_violation(model_results, save_dir):
    """Report how often per-question predictions leave the probability simplex.

    For each model, computes the fraction of test samples whose per-question
    predictions fail to sum to 1 Β± 0.02. Softmax / Dirichlet heads should sit
    near zero; the sigmoid baseline will not β€” which explains why it can post
    a lower raw per-answer MAE while being scientifically invalid: its
    unconstrained outputs fit each marginal independently.
    """
    rows = []
    for model_name, (preds, _, _) in model_results.items():
        svr = simplex_violation_rate(preds, tolerance=0.02)
        entry = {"model": model_name}
        for q in QUESTION_GROUPS:
            entry[q] = round(svr[q], 4)
        entry["mean"] = round(svr["mean"], 4)
        rows.append(entry)
    df = pd.DataFrame(rows)
    df.to_csv(save_dir / "table_simplex_violation.csv", index=False)
    log.info("Saved: table_simplex_violation.csv")
    log.info("\n%s\n", df[["model", "mean"]].to_string(index=False))
    return df
# ─────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────
def main():
    """Entry point: load config + checkpoints, run test-set inference for
    all five models, then emit every paper table and figure.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True)
    args = parser.parse_args()
    # Experiment config overrides the shared base config.
    base_cfg = OmegaConf.load("configs/base.yaml")
    exp_cfg = OmegaConf.load(args.config)
    cfg = OmegaConf.merge(base_cfg, exp_cfg)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    save_dir = Path(cfg.outputs.figures_dir) / "evaluation"
    save_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir = Path(cfg.outputs.checkpoint_dir)
    # Only the held-out test split is used here.
    _, _, test_loader = build_dataloaders(cfg)
    # ── Load all models ────────────────────────────────────────
    log.info("Loading models from: %s", ckpt_dir)
    def _load(path, model):
        # weights_only=True avoids unpickling arbitrary objects from ckpts.
        ckpt = torch.load(path, map_location="cpu", weights_only=True)
        model.load_state_dict(ckpt["model_state"])
        return model
    vit_proposed = _load(
        ckpt_dir / "best_full_train.pt", build_model(cfg)
    ).to(device)
    vit_mse = _load(
        ckpt_dir / "baseline_vit_mse.pt", build_model(cfg)
    ).to(device)
    rn_mse = _load(
        ckpt_dir / "baseline_resnet18_mse.pt",
        ResNet18Baseline(dropout=cfg.model.dropout)
    ).to(device)
    rn_kl = _load(
        ckpt_dir / "baseline_resnet18_klmse.pt",
        ResNet18Baseline(dropout=cfg.model.dropout)
    ).to(device)
    # The Dirichlet baseline is optional β€” evaluate it only if trained.
    vit_dirichlet = None
    dp = ckpt_dir / "baseline_vit_dirichlet.pt"
    if dp.exists():
        vit_dirichlet = _load(dp, build_dirichlet_model(cfg)).to(device)
        log.info("Loaded: ViT-Base + Dirichlet")
    # ── Run inference ──────────────────────────────────────────
    log.info("Running inference on test set...")
    # Attention maps are harvested only from the proposed model.
    (p_proposed, t_proposed, w_proposed,
     attn_imgs, all_layers, attn_ids) = _infer_vit(
        vit_proposed, test_loader, device, cfg,
        collect_attn=True, n_attn=16,
    )
    p_vit_mse, t_vit_mse, w_vit_mse = _infer_vit(
        vit_mse, test_loader, device, cfg, collect_attn=False
    )[:3]
    p_rn_mse, t_rn_mse, w_rn_mse = _infer_resnet(
        rn_mse, test_loader, device, cfg, use_sigmoid=True
    )
    p_rn_kl, t_rn_kl, w_rn_kl = _infer_resnet(
        rn_kl, test_loader, device, cfg, use_sigmoid=False
    )
    # Build model_results dict (order determines legend order in figures)
    model_results = {
        "ResNet-18 + MSE (sigmoid)"    : (p_rn_mse, t_rn_mse, w_rn_mse),
        "ResNet-18 + KL+MSE"           : (p_rn_kl, t_rn_kl, w_rn_kl),
        "ViT-Base + MSE only"          : (p_vit_mse, t_vit_mse, w_vit_mse),
        "ViT-Base + KL+MSE (proposed)" : (p_proposed, t_proposed, w_proposed),
    }
    if vit_dirichlet is not None:
        p_dir, t_dir, w_dir = _infer_dirichlet(
            vit_dirichlet, test_loader, device, cfg
        )
        model_results["ViT-Base + Dirichlet (Zoobot-style)"] = (p_dir, t_dir, w_dir)
    # ── Tables ─────────────────────────────────────────────────
    log.info("Computing metrics...")
    table_metrics_proposed(p_proposed, t_proposed, w_proposed, save_dir)
    log.info("Computing reached-branch MAE table...")
    df_r = compute_reached_branch_mae_table(model_results)
    df_r.to_csv(save_dir / "table_reached_branch_mae.csv", index=False)
    log.info("Saved: table_reached_branch_mae.csv")
    log.info("Computing simplex violation table...")
    table_simplex_violation(model_results, save_dir)
    # ── Figures ────────────────────────────────────────────────
    log.info("Generating figures...")
    fig_scatter_predicted_vs_true(p_proposed, t_proposed, w_proposed, save_dir)
    fig_calibration_reliability(model_results, save_dir)
    fig_ece_comparison(model_results, save_dir)
    fig_attention_rollout_gallery(attn_imgs, all_layers, attn_ids, save_dir)
    fig_attention_entropy_depth(all_layers, save_dir)
    log.info("=" * 60)
    log.info("ALL OUTPUTS SAVED TO: %s", save_dir)
    log.info("=" * 60)
    # Final console summary of the headline numbers for the proposed model.
    metrics = compute_metrics(p_proposed, t_proposed, w_proposed)
    log.info("Proposed model β€” test set results:")
    log.info("  Weighted MAE  = %.5f", metrics["mae/weighted_avg"])
    log.info("  Weighted RMSE = %.5f", metrics["rmse/weighted_avg"])
    log.info("  Mean ECE      = %.5f", metrics["ece/mean"])
# Script entry point (intended to be run as `python -m src.evaluate_full`).
if __name__ == "__main__":
    main()