#!/usr/bin/env python3
"""Benchmark multiple backbones on modulation classification across dataset sizes.
For each desired training size (samples per MCS class) and repetition, the script:
1. Randomly samples spectrograms from distinct (modulation, rate, SNR, mobility) configs.
2. Builds train/val/test splits (val/test sizes are configurable).
3. Fine-tunes several backbones (LWM, ResNet18, EfficientNet-B0, MobileNet-V3-Small,
   and two small CNN baselines) using the same splits.
4. Reports accuracy statistics and stores checkpoints/metrics per experiment.
Input spectrograms are globally normalized using the dataset mean/std stored with
the specified pretrained checkpoint (defaults to the latest run under `models/`).
Usage example (defaults cover city_1_losangeles/LTE with all available SNR·mobility combos):
python task1/train_mcs_models.py --train-sizes 128 --models resnet18 mobilenet_v3_small
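A fuller sweep might look like this (all flags are defined in parse_args; values are illustrative):
    python task1/train_mcs_models.py --models lwm simple_cnn ieee_cnn --train-sizes 32 128 \
        --repetitions 3 --epochs 100 --val-per-class 256 --test-per-class 256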
"""
from __future__ import annotations
import argparse
import copy
import csv
import glob
import json
import os
import pickle
import random
import re
import sys
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
from contextlib import nullcontext
from datetime import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torch.amp import autocast, GradScaler
try:
from tqdm import tqdm
except ImportError: # pragma: no cover - optional dependency
def tqdm(iterable, *args, **kwargs):
return iterable
PROJECT_ROOT = Path(__file__).resolve().parent.parent
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from pretraining.pretrained_model import lwm as lwm_model
from utils import count_parameters
COMM_CANONICAL = {
"lte": "LTE",
"wifi": "WiFi",
"5g": "5G",
}
COMM_LOWER = {v: k for k, v in COMM_CANONICAL.items()}
try:
from sklearn.metrics import f1_score as sklearn_f1_score
HAVE_SKLEARN = True
except ImportError:
HAVE_SKLEARN = False
try:
import matplotlib.pyplot as plt
HAVE_MPL = True
except ImportError:
HAVE_MPL = False
try:
from task2.mobility_utils import LWMClassifierMinimal # type: ignore
except ImportError: # pragma: no cover - optional dependency
LWMClassifierMinimal = None # type: ignore[misc]
# HPU support detection
HPU_AVAILABLE = False
try:
import habana_frameworks.torch.core as htcore # type: ignore[import-not-found]
HPU_AVAILABLE = hasattr(torch, "hpu") and torch.hpu.is_available()
except (ImportError, AttributeError):
pass
def compute_f1(y_true: np.ndarray, y_pred: np.ndarray) -> float:
if HAVE_SKLEARN:
return float(sklearn_f1_score(y_true, y_pred, average="macro"))
classes = np.unique(np.concatenate([y_true, y_pred]))
scores = []
for cls in classes:
tp = np.sum((y_true == cls) & (y_pred == cls))
fp = np.sum((y_true != cls) & (y_pred == cls))
fn = np.sum((y_true == cls) & (y_pred != cls))
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
denom = precision + recall
f1 = (2 * precision * recall / denom) if denom > 0 else 0.0
scores.append(f1)
return float(np.mean(scores))
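# Worked sanity check for the macro-F1 fallback above (illustrative values, not dataset output):
#   y_true = [0, 0, 1, 1], y_pred = [0, 1, 1, 1]
#   class 0: precision = 1.0, recall = 0.5, F1 = 2/3; class 1: precision = 2/3, recall = 1.0, F1 = 0.8
#   macro-F1 = (2/3 + 0.8) / 2 ≈ 0.733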
MODULATION_LABELS = {
"BPSK": 0,
"QPSK": 1,
"QAM16": 2,
"QAM64": 3,
"QAM256": 4,
}
LABEL_NAMES = {idx: name for name, idx in MODULATION_LABELS.items()}
DEFAULT_LWM_TRAINABLE_LAYERS = 2 # fine-tune the last two transformer blocks
_SAMPLE_COUNT_CACHE: Dict[str, int] = {}
def normalize_per_sample(specs: np.ndarray, eps: float = 1e-6) -> np.ndarray:
if specs.size == 0:
return specs.astype(np.float32, copy=False)
means = specs.mean(axis=(1, 2), keepdims=True)
stds = specs.std(axis=(1, 2), keepdims=True)
stds = np.maximum(stds, eps)
normalized = (specs - means) / stds
return normalized.astype(np.float32, copy=False)
def apply_normalization(specs: np.ndarray, stats: Dict[str, object]) -> np.ndarray:
mode = str(stats.get("normalization", "dataset")).lower()
mean = float(stats.get("mean", 0.0))
std = float(stats.get("std", 1.0))
if abs(std) < 1e-6:
std = 1e-6
if mode == "dataset":
return ((specs.astype(np.float32, copy=False) - mean) / std).astype(np.float32, copy=False)
return normalize_per_sample(specs)
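# Usage sketch (synthetic array; in practice `stats` comes from dataset_stats.json via
# resolve_checkpoint_and_stats below):
#   specs = np.random.rand(4, 128, 128).astype(np.float32)
#   z_ds = apply_normalization(specs, {"mean": 0.5, "std": 0.1, "normalization": "dataset"})
#   z_ps = apply_normalization(specs, {"normalization": "per_sample"})  # per-sample zero-mean/unit-std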
def _unique_parameters(params: Iterable[nn.Parameter]) -> List[nn.Parameter]:
seen: set[int] = set()
unique: List[nn.Parameter] = []
for param in params:
pid = id(param)
if pid not in seen:
unique.append(param)
seen.add(pid)
return unique
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--data-root",
default=str(PROJECT_ROOT / "spectrograms"),
help="Root directory containing city folders (default: project_root/spectrograms)",
)
parser.add_argument(
"--cities",
nargs="*",
default=["city_1_losangeles"],
help="City directories to include (default: %(default)s)",
)
parser.add_argument("--comm-types", nargs="*", default=["LTE"], help="Communication standards to include (default: %(default)s)")
parser.add_argument("--LTE", dest="select_lte", action="store_true", help="Shortcut for --comm-types LTE")
parser.add_argument("--WiFi", dest="select_wifi", action="store_true", help="Shortcut for --comm-types WiFi")
parser.add_argument("--5G", dest="select_5g", action="store_true", help="Shortcut for --comm-types 5G")
parser.add_argument("--snrs", nargs="*", default=None, help="SNR folders to include for training (default: all available)")
parser.add_argument("--val-snrs", nargs="*", default=None, help="SNR folders for validation/test (default: all available)")
parser.add_argument(
"--mobilities",
nargs="*",
default=None,
help="Mobility folders to include for training (default: all available)",
)
parser.add_argument(
"--val-mobilities",
nargs="*",
default=None,
help="Mobility folders for validation/test (default: all available)",
)
parser.add_argument("--fft-folder", default="512FFT", help="FFT folder name (default: %(default)s)")
parser.add_argument(
"--device",
type=str,
default="auto",
choices=["auto", "cuda", "hpu", "cpu"],
help="Device to use for training (default: auto - detects HPU, then CUDA, then CPU)",
)
parser.add_argument(
"--gpu-ids",
type=int,
nargs="*",
default=None,
help="Specific GPU device IDs to use (only for CUDA, default: all visible GPUs)",
)
parser.add_argument(
"--train-sizes",
type=int,
nargs="*",
default=[2, 4, 8, 16, 32, 64, 128, 256],
help="Training samples per class to benchmark",
)
parser.add_argument("--val-per-class", type=int, default=512, help="Validation samples per class")
parser.add_argument("--test-per-class", type=int, default=512, help="Test samples per class")
parser.add_argument("--repetitions", type=int, default=1, help="Repetitions per train size")
parser.add_argument("--epochs", type=int, default=200, help="Epochs per run")
parser.add_argument("--batch-size", type=int, default=32, help="Mini-batch size")
parser.add_argument("--lr", type=float, default=8e-4, help="Learning rate for fine-tuning")
parser.add_argument("--weight-decay", type=float, default=3e-2, help="Weight decay")
parser.add_argument(
"--no-epoch-history",
action="store_true",
help="Disable aggregated per-epoch history tracking",
)
parser.add_argument(
"--no-epoch-plot",
action="store_true",
help="Disable per-repetition metric plots",
)
parser.add_argument(
"--save-epoch-checkpoints",
action="store_true",
help="Persist per-epoch checkpoints (default: disabled)",
)
parser.add_argument(
"--backbone-lr-factor",
type=float,
default=0.3,
help="Relative LR multiplier applied to unfrozen backbone parameters (default: %(default)s)",
)
parser.add_argument(
"--early-patience",
type=int,
default=5,
help="Early stopping patience based on validation F1 (default: %(default)s)",
)
parser.add_argument(
"--early-min-epochs",
type=int,
default=10,
help="Minimum number of epochs to run before early stopping can trigger (default: %(default)s)",
)
parser.add_argument(
"--finetune-epochs",
type=int,
default=0,
help="Additional fine-tuning epochs to run after the main schedule (default: %(default)s)",
)
parser.add_argument(
"--finetune-lr-factor",
type=float,
default=0.1,
help="Multiplier applied to the base learning rate during fine-tuning (default: %(default)s)",
)
parser.add_argument(
"--finetune-patience",
type=int,
default=3,
help="Early stopping patience for the fine-tuning phase (default: %(default)s)",
)
parser.add_argument(
"--finetune-min-epochs",
type=int,
default=0,
help="Minimum epochs to execute in the fine-tuning phase before early stopping is considered (default: %(default)s)",
)
parser.add_argument(
"--debug-eval-batches",
type=int,
default=0,
help="Log detailed stats for the first N evaluation batches (0 disables logging)",
)
parser.add_argument(
"--debug-eval-interval",
type=int,
default=1,
        help="Interval, in batches, between logged evaluation batches when debug logging is enabled (default: %(default)s)",
)
parser.add_argument(
"--debug-eval-softmax",
action="store_true",
help="When debugging evaluations, also log softmax statistics per batch",
)
parser.add_argument(
"--models",
nargs="*",
default=["lwm", "resnet18", "efficientnet_b0", "mobilenet_v3_small", "simple_cnn", "ieee_cnn"],
help="Models to benchmark",
)
parser.add_argument(
"--raw-input-models",
nargs="*",
default=None,
help=(
"Models that should receive raw spectrograms without additional normalization "
"(default: all non-LWM models)"
),
)
parser.add_argument(
"--lwm-trainable-layers",
type=int,
default=2,
help="Number of transformer layers (from the end) to fine-tune in LWM (default: %(default)s)",
)
parser.add_argument(
"--lwm-classifier-dim",
type=int,
default=64,
help="Hidden width for the LWM classifier MLP head (default: %(default)s; ignored for linear head)",
)
parser.add_argument(
"--lwm-head-dropout",
type=float,
default=0.0,
help="Dropout applied inside the LWM classifier head (default: %(default)s)",
)
parser.add_argument(
"--lwm-head-type",
choices=("linear", "mlp", "res1dcnn"),
default="res1dcnn",
help="Classifier head architecture for LWM (default: %(default)s)",
)
parser.add_argument(
"--lwm-backbone-lr-factor",
type=float,
default=0.2,
help="LR multiplier for unfrozen LWM backbone layers (default: %(default)s)",
)
parser.add_argument(
"--resnet-head-width",
type=int,
default=512,
help="Hidden width for the ResNet18 classifier head (default: %(default)s)",
)
parser.add_argument(
"--efficientnet-head-width",
type=int,
default=296,
help="Hidden width for the EfficientNet-B0 classifier head (default: %(default)s)",
)
parser.add_argument(
"--mobilenet-head-width",
type=int,
default=576,
help="Hidden width for the MobileNetV3-Small classifier head (default: %(default)s)",
)
parser.add_argument(
"--imagenet-head-dropout",
type=float,
default=0.6,
help="Dropout probability used inside ImageNet backbone classifier heads (default: %(default)s)",
)
parser.add_argument(
"--imagenet-weight-decay-scale",
type=float,
default=2.0,
help="Multiplier applied to weight decay for ImageNet backbone trainable parameters (default: %(default)s)",
)
parser.add_argument(
"--simple-cnn-hidden-dims",
type=int,
nargs="*",
default=[272, 128],
help="Hidden layer widths for the Simple CNN classifier (default: %(default)s)",
)
parser.add_argument(
"--ieee-cnn-hidden-dims",
type=int,
nargs="*",
default=[512, 256],
help="Hidden layer widths for the IEEE CNN classifier (default: %(default)s)",
)
parser.add_argument(
"--ieee-cnn-dropout",
type=float,
default=0.3,
help="Dropout rate for the IEEE CNN model (default: %(default)s)",
)
parser.add_argument("--checkpoint", type=Path, default=None, help="Path to pretrained LWM checkpoint (.pth)")
parser.add_argument("--stats", type=Path, default=None, help="dataset_stats.json path")
parser.add_argument(
"--models-root",
type=Path,
default=PROJECT_ROOT / "models",
help="Root with pretrained runs (default: project_root/models)",
)
parser.add_argument(
"--output-dir",
type=Path,
default=PROJECT_ROOT / "task1" / "mcs_benchmarks",
help="Results root directory (per-run subfolder created automatically)",
)
parser.add_argument(
"--export-full-model",
type=Path,
default=None,
help="Directory where best full-model checkpoints (backbone + head) will be exported per run",
)
parser.add_argument("--seed", type=int, default=42, help="Base random seed")
args = parser.parse_args()
args.output_root = args.output_dir
quick_comm: List[str] = []
if getattr(args, "select_lte", False):
quick_comm.append("LTE")
if getattr(args, "select_wifi", False):
quick_comm.append("WiFi")
if getattr(args, "select_5g", False):
quick_comm.append("5G")
if quick_comm:
args.comm_types = quick_comm
normalized: List[str] = []
for comm in args.comm_types:
upper = comm.upper()
if upper == "WIFI":
normalized.append("WiFi")
elif upper == "LTE":
normalized.append("LTE")
elif upper == "5G":
normalized.append("5G")
else:
normalized.append(comm)
args.comm_types = normalized
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
comm_tokens: List[str] = []
for comm in args.comm_types:
canonical = COMM_LOWER.get(comm, comm.lower())
token = re.sub(r"[^a-z0-9]+", "-", canonical.lower()).strip("-")
comm_tokens.append(token or "unknown")
comm_suffix = "-".join(comm_tokens) if comm_tokens else "unknown"
args.run_timestamp = timestamp
args.output_dir = args.output_root / comm_suffix / timestamp
args.comm_suffix = comm_suffix
if args.gpu_ids is not None and len(args.gpu_ids) == 0:
args.gpu_ids = None
if not args.simple_cnn_hidden_dims:
args.simple_cnn_hidden_dims = [512, 256]
if not args.ieee_cnn_hidden_dims:
args.ieee_cnn_hidden_dims = [512, 256]
if args.raw_input_models is None:
args.raw_input_models = [
model.lower()
for model in args.models
if model.lower() not in {"lwm"}
]
else:
args.raw_input_models = [model.lower() for model in args.raw_input_models]
args.models = [model for model in args.models]
args.save_epoch_history = not args.no_epoch_history
args.plot_epoch_history = not args.no_epoch_plot
args.imagenet_head_dropout = float(max(0.0, min(args.imagenet_head_dropout, 0.95)))
args.imagenet_weight_decay_scale = float(max(0.0, args.imagenet_weight_decay_scale))
return args
def find_latest_run(models_root: Path) -> Path:
run_dirs = [p for p in models_root.iterdir() if p.is_dir()]
run_dirs = [p for p in run_dirs if not p.name.lower().endswith("_models")]
valid_runs = [p for p in run_dirs if any(p.glob("*.pth"))]
if valid_runs:
return max(valid_runs, key=lambda p: p.stat().st_mtime)
checkpoints = list(models_root.glob("*.pth"))
if checkpoints:
print(f"[INFO] No checkpoint-bearing subdirectories under {models_root}; using root as run directory.")
return models_root
raise FileNotFoundError(f"No checkpoints found under {models_root}")
def find_best_checkpoint(run_dir: Path) -> Path:
candidates = list(run_dir.glob("*.pth"))
if not candidates:
raise FileNotFoundError(f"No checkpoints in {run_dir}")
def metric(path: Path) -> float:
match = re.search(r"_val([0-9]+(?:\.[0-9]+)?)", path.name)
if match:
try:
return float(match.group(1))
except ValueError:
pass
return float("inf")
best = min(candidates, key=metric)
return best
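# Selection note: the checkpoint whose parsed "_val<number>" suffix is smallest wins (the value
# is treated as lower-is-better, e.g. a validation loss); files without the pattern sort last.
# Example (hypothetical filenames): "epoch20_val0.3987.pth" beats "epoch10_val0.4321.pth".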
def resolve_models_directory(args: argparse.Namespace) -> Path:
base = args.models_root.expanduser().resolve()
if not base.exists():
raise FileNotFoundError(f"Models root not found: {base}")
matches: List[Path] = []
for comm in args.comm_types:
subdir = base / f"{comm}_models"
if subdir.exists():
matches.append(subdir)
else:
print(f"[WARN] Models directory for {comm} not found at {subdir}")
if len(matches) == 1:
print(f"[INFO] Using models directory for {args.comm_types[0]}: {matches[0]}")
return matches[0]
if len(matches) > 1:
raise ValueError(
"Multiple communication-specific model directories detected; please provide --checkpoint explicitly."
)
print(f"[INFO] Using shared models directory: {base}")
return base
def resolve_checkpoint_and_stats(args: argparse.Namespace, require_checkpoint: bool) -> Tuple[Path | None, Dict[str, object]]:
checkpoint: Path | None = None
models_dir = resolve_models_directory(args)
user_provided_stats = args.stats is not None
if args.checkpoint is not None:
checkpoint = args.checkpoint.expanduser().resolve()
if not checkpoint.exists():
raise FileNotFoundError(f"Checkpoint not found: {checkpoint}")
stats_path = args.stats.expanduser().resolve() if user_provided_stats else checkpoint.parent / "dataset_stats.json"
else:
run_dir = find_latest_run(models_dir)
stats_path = run_dir / "dataset_stats.json"
if require_checkpoint:
checkpoint = find_best_checkpoint(run_dir)
else:
checkpoint = None
if stats_path.exists():
try:
with open(stats_path, "r", encoding="utf-8") as f:
stats = json.load(f)
except json.JSONDecodeError as exc:
if user_provided_stats:
raise ValueError(
f"Failed to parse dataset_stats.json at {stats_path}: {exc}"
) from exc
print(
f"[WARN] Corrupt dataset_stats.json at {stats_path}; "
"falling back to mean=0/std=1 per-sample normalization."
)
stats = {"mean": 0.0, "std": 1.0, "normalization": "per_sample"}
else:
if "mean" not in stats or "std" not in stats:
raise ValueError("dataset_stats.json must contain 'mean' and 'std'")
stats.setdefault("normalization", stats.get("mode", "dataset"))
else:
if user_provided_stats:
raise FileNotFoundError(f"dataset_stats.json not found: {stats_path}")
stats = {"mean": 0.0, "std": 1.0, "normalization": "per_sample"}
print(f"[WARN] dataset_stats.json not found at {stats_path}. Falling back to per-sample normalization.")
if checkpoint is not None:
print(f"[INFO] Using checkpoint: {checkpoint}")
elif require_checkpoint:
raise FileNotFoundError("LWM requested but no checkpoint available")
else:
print("[INFO] No LWM checkpoint required for selected models")
norm_mode = str(stats.get("normalization", "dataset"))
if norm_mode.lower() == "dataset":
print(f"[INFO] Dataset stats -> mean={stats['mean']:.4f}, std={stats['std']:.4f}")
else:
print("[INFO] Normalization mode: per_sample")
return checkpoint, {
"mean": float(stats.get("mean", 0.0)),
"std": float(stats.get("std", 1.0)),
"normalization": norm_mode,
}
def identify_modulation(path: str) -> tuple[int | None, str | None]:
for mod_name, label in MODULATION_LABELS.items():
if mod_name in path:
return label, mod_name
return None, None
def _extract_metadata(parts: Sequence[str]) -> Tuple[str, str, str]:
rate = next((part for part in parts if part.startswith("rate")), "rate_unknown")
snr = next((part for part in parts if part.startswith("SNR")), "SNR_unknown")
mobility = next((part for part in parts if part in {"static", "pedestrian", "vehicular"}), "mobility_unknown")
return rate, snr, mobility
def discover_snr_mobility(
data_root: Path,
cities: Sequence[str],
comm_types: Sequence[str],
fft_folder: str,
) -> Tuple[List[str], List[str]]:
snrs: set[str] = set()
mobilities: set[str] = set()
for city in cities:
for comm in comm_types:
base = data_root / city / comm
if not base.exists():
continue
for root, dirs, _ in os.walk(base):
parts = Path(root).parts
for part in parts:
if part.startswith("SNR") and part.endswith("dB"):
snrs.add(part)
elif part in {"static", "pedestrian", "vehicular"}:
mobilities.add(part)
if not snrs:
snrs.add("SNR20dB")
if not mobilities:
mobilities.add("static")
return sorted(snrs), sorted(mobilities)
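# Expected on-disk layout, inferred from the glob patterns used below (actual trees may differ):
#   <data_root>/<city>/<comm>/.../<SNR..dB>/<static|pedestrian|vehicular>/.../<fft_folder>/.../spectrograms/*.pkl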
def build_config_map(
data_root: Path,
cities: Sequence[str],
comm_types: Sequence[str],
snrs: Sequence[str],
mobilities: Sequence[str],
fft_folder: str,
) -> Dict[int, Dict[str, List[str]]]:
class_configs: Dict[int, Dict[str, List[str]]] = defaultdict(lambda: defaultdict(list))
for city in cities:
for comm in comm_types:
base = data_root / city / comm
for snr in snrs:
for mobility in mobilities:
pattern = str(base / "**" / snr / mobility / "**" / fft_folder / "**" / "spectrograms" / "*.pkl")
for path_str in glob.glob(pattern, recursive=True):
cls, modulation_name = identify_modulation(path_str)
if cls is None:
continue
rate, _, _ = _extract_metadata(Path(path_str).parts)
config_name = f"{modulation_name}_{rate}_{snr}_{mobility}"
class_configs[cls][config_name].append(path_str)
return class_configs
def build_global_config_map(
data_root: Path,
cities: Sequence[str],
comm_types: Sequence[str],
fft_folder: str,
) -> Dict[int, Dict[str, List[str]]]:
class_configs: Dict[int, Dict[str, List[str]]] = defaultdict(lambda: defaultdict(list))
for city in cities:
for comm in comm_types:
base = data_root / city / comm
pattern = str(base / "**" / fft_folder / "**" / "spectrograms" / "*.pkl")
for path_str in glob.glob(pattern, recursive=True):
cls, modulation_name = identify_modulation(path_str)
if cls is None:
continue
rate, snr_part, mobility_part = _extract_metadata(Path(path_str).parts)
config_name = f"{modulation_name}_{rate}_{snr_part}_{mobility_part}"
class_configs[cls][config_name].append(path_str)
return class_configs
def _count_samples_in_path(path: str) -> int:
cached = _SAMPLE_COUNT_CACHE.get(path)
if cached is not None:
return cached
arr = load_all_samples(path)
count = int(arr.shape[0])
_SAMPLE_COUNT_CACHE[path] = count
return count
class LazyConfigArray:
"""Lazily views spectrograms spread across multiple pickled files."""
__slots__ = ("paths", "_counts", "_offsets", "_total", "shape", "dtype", "ndim")
def __init__(self, paths: Sequence[str]) -> None:
filtered_paths: List[str] = []
counts: List[int] = []
for path in sorted(paths):
count = _count_samples_in_path(path)
if count <= 0:
continue
filtered_paths.append(path)
counts.append(count)
self.paths: Tuple[str, ...] = tuple(filtered_paths)
if counts:
self._counts = np.array(counts, dtype=np.int64)
self._offsets = np.concatenate(([0], np.cumsum(self._counts)))
self._total = int(self._offsets[-1])
else:
self._counts = np.empty(0, dtype=np.int64)
self._offsets = np.array([0], dtype=np.int64)
self._total = 0
self.shape = (self._total, 128, 128)
self.dtype = np.float32
self.ndim = 3
def __len__(self) -> int:
return self._total
def _resolve_index(self, index: int) -> Tuple[int, int]:
if self._total == 0:
raise IndexError("attempting to index empty LazyConfigArray")
if index < 0:
index += self._total
if index < 0 or index >= self._total:
raise IndexError("index out of range for LazyConfigArray")
path_idx = int(np.searchsorted(self._offsets[1:], index, side="right"))
start = int(self._offsets[path_idx])
return path_idx, int(index - start)
def _load_path(self, path_idx: int) -> np.ndarray:
path = self.paths[path_idx]
return load_all_samples(path)
def __getitem__(self, item: Any) -> np.ndarray:
if isinstance(item, (int, np.integer)):
path_idx, local_idx = self._resolve_index(int(item))
data = self._load_path(path_idx)
sample = data[local_idx].copy()
return sample
indices = np.asarray(item, dtype=np.int64)
if indices.ndim == 0:
indices = indices.reshape(1)
else:
indices = indices.reshape(-1)
if indices.size == 0:
return np.empty((0, 128, 128), dtype=np.float32)
resolved: Dict[int, List[Tuple[int, int]]] = {}
for pos, raw_idx in enumerate(indices):
path_idx, local_idx = self._resolve_index(int(raw_idx))
resolved.setdefault(path_idx, []).append((pos, local_idx))
result = np.empty((indices.size, 128, 128), dtype=np.float32)
for path_idx, items in resolved.items():
data = self._load_path(path_idx)
local_positions = [loc for _, loc in items]
chunk = data[local_positions]
for offset, (pos, _) in enumerate(items):
result[pos] = chunk[offset]
return result
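# Usage sketch (paths are placeholders): a LazyConfigArray looks like a read-only (N, 128, 128)
# array but only unpickles the file(s) backing the requested indices.
#   arr = LazyConfigArray(["/data/.../spectrograms/a.pkl", "/data/.../spectrograms/b.pkl"])
#   batch = arr[np.array([0, 5, 17])]  # shape (3, 128, 128); loads at most two files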
def load_config_arrays(class_configs: Dict[int, Dict[str, List[str]]]) -> Dict[int, Dict[str, LazyConfigArray]]:
loaded: Dict[int, Dict[str, LazyConfigArray]] = {}
for cls, configs in class_configs.items():
arrays_for_cls: Dict[str, LazyConfigArray] = {}
for config_name, paths in configs.items():
lazy_array = LazyConfigArray(paths)
if len(lazy_array) == 0:
continue
arrays_for_cls[config_name] = lazy_array
if arrays_for_cls:
loaded[cls] = arrays_for_cls
return loaded
def load_all_samples(path: str) -> np.ndarray:
with open(path, "rb") as f:
data = pickle.load(f)
if isinstance(data, dict) and "spectrograms" in data:
arr = data["spectrograms"]
elif isinstance(data, np.ndarray):
arr = data
else:
return np.empty((0, 128, 128), dtype=np.float32)
arr = np.asarray(arr, dtype=np.float32)
if arr.ndim == 2:
arr = arr[None, ...]
if arr.shape[1:] != (128, 128):
return np.empty((0, 128, 128), dtype=np.float32)
return arr
def sample_from_paths(
paths: Sequence[str],
n_samples: int,
rng: np.random.Generator,
used_map: Dict[str, set[int]],
) -> Tuple[np.ndarray, List[Tuple[str, np.ndarray]]]:
if not paths:
raise RuntimeError("No files available for sampling")
paths_array = np.array(paths, dtype=object)
order = rng.permutation(len(paths_array))
remaining = n_samples
collected: List[np.ndarray] = []
info: List[Tuple[str, np.ndarray]] = []
for idx in order:
if remaining <= 0:
break
path = str(paths_array[idx])
samples = load_all_samples(path)
total = samples.shape[0]
used = used_map[path]
if used:
used_idx = np.fromiter(used, dtype=np.int64, count=len(used))
available = np.setdiff1d(np.arange(total), used_idx, assume_unique=True)
else:
available = np.arange(total)
if available.size == 0:
continue
take = min(remaining, available.size)
chosen = rng.choice(available, size=take, replace=False)
collected.append(samples[chosen])
used_map[path].update(int(i) for i in chosen)
info.append((path, chosen))
remaining -= take
if remaining > 0:
raise RuntimeError("Insufficient samples remaining to satisfy request")
result = np.concatenate(collected, axis=0) if len(collected) > 1 else collected[0]
return result, info
def _ensure_available(total_needed: int, availability: Dict[str, set]) -> None:
remaining = sum(len(indices) for indices in availability.values())
if remaining < total_needed:
raise RuntimeError(
f"Insufficient samples: need {total_needed}, only {remaining} available across configs"
)
def _sample_from_availability(
arrays_map: Dict[str, LazyConfigArray],
availability: Dict[str, set[int]],
total_needed: int,
rng: np.random.Generator,
) -> Tuple[np.ndarray, Dict[str, set[int]]]:
if total_needed <= 0:
return np.empty((0, 128, 128), dtype=np.float32), {cfg: set() for cfg in arrays_map}
_ensure_available(total_needed, availability)
remaining = total_needed
configs = [cfg for cfg, indices in availability.items() if indices]
used: Dict[str, set[int]] = {cfg: set() for cfg in arrays_map}
collected: List[np.ndarray] = []
while remaining > 0 and configs:
cfg = rng.choice(configs)
available_indices = np.array(list(availability[cfg]), dtype=np.int64)
if available_indices.size == 0:
configs = [c for c in configs if c != cfg]
continue
take = min(max(1, remaining // max(len(configs), 1)), remaining, available_indices.size)
chosen = rng.choice(available_indices, size=take, replace=False)
collected.append(arrays_map[cfg][chosen])
chosen_set = {int(idx) for idx in chosen}
used[cfg].update(chosen_set)
availability[cfg].difference_update(chosen_set)
remaining -= take
configs = [c for c in configs if availability[c]]
if remaining > 0:
raise RuntimeError("Sampling failed to collect the requested number of samples")
stacked = np.concatenate(collected, axis=0) if collected else np.empty((0, 128, 128), dtype=np.float32)
return stacked.astype(np.float32, copy=False), used
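# Sampling note: configs are drawn at random, and each draw takes roughly remaining/len(configs)
# indices without replacement, so a request is spread across (modulation, rate, SNR, mobility)
# configs while `availability` tracks which indices have already been consumed.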
def sample_train_arrays(
arrays_map: Dict[str, LazyConfigArray],
availability: Dict[str, set[int]],
train_size: int,
rng: np.random.Generator,
) -> Tuple[np.ndarray, Dict[str, set[int]]]:
return _sample_from_availability(arrays_map, availability, train_size, rng)
def sample_global_arrays(
arrays_map: Dict[str, LazyConfigArray],
availability: Dict[str, set[int]],
per_class: int,
rng: np.random.Generator,
) -> Tuple[np.ndarray, Dict[str, set[int]]]:
return _sample_from_availability(arrays_map, availability, per_class, rng)
class SpectrogramDataset(Dataset):
def __init__(self, specs: np.ndarray, labels: np.ndarray):
self.specs = specs.astype(np.float32, copy=False)
self.labels = labels.astype(np.int64, copy=False)
def __len__(self) -> int:
return len(self.labels)
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
return torch.from_numpy(self.specs[idx]), int(self.labels[idx])
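# Usage sketch: wrap sampled arrays for a DataLoader (batch size is illustrative):
#   ds = SpectrogramDataset(specs, labels)               # specs: (N, 128, 128) float32, labels: (N,)
#   loader = DataLoader(ds, batch_size=32, shuffle=True)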
def normalize_batch(specs: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
mean = specs.mean()
std = specs.std(unbiased=False)
std = torch.clamp(std, min=eps)
return (specs - mean) / std
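# Example: z-score an entire mini-batch with one global mean/std (used when batch_normalize=True
# in the train/eval loops below; values are illustrative):
#   batch = normalize_batch(torch.rand(16, 128, 128))  # mean ~0, std ~1 over the whole batch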
def apply_spec_augment(
specs: torch.Tensor,
*,
freq_mask_width: int = 12,
time_mask_width: int = 16,
freq_masks: int = 2,
time_masks: int = 2,
mask_prob: float = 0.5,
noise_std: float = 0.0,
) -> torch.Tensor:
"""Apply light-weight SpecAugment-style masking to a batch of spectrograms.
The function accepts tensors shaped ``[B, H, W]`` or ``[B, 1, H, W]`` and
returns an augmented tensor with the same shape. Masks use the sample mean
to avoid introducing large bias and are applied per-sample with the given
probability. Gaussian noise (if requested) is injected before masking.
"""
if mask_prob <= 0.0 and noise_std <= 0.0:
return specs
if specs.dim() not in (3, 4):
raise ValueError(f"Spectrograms must be rank-3 or rank-4, got shape {tuple(specs.shape)}")
needs_squeeze = specs.dim() == 3
augmented = specs.unsqueeze(1) if needs_squeeze else specs
batch_size, _, freq_dim, time_dim = augmented.shape
if mask_prob < 1.0:
apply_mask = torch.rand(batch_size, device=augmented.device) < mask_prob
else:
apply_mask = torch.ones(batch_size, dtype=torch.bool, device=augmented.device)
freq_mask_width = max(0, int(freq_mask_width))
time_mask_width = max(0, int(time_mask_width))
freq_masks = max(0, int(freq_masks))
time_masks = max(0, int(time_masks))
for idx in range(batch_size):
if not apply_mask[idx]:
continue
sample = augmented[idx]
if noise_std > 0.0:
sample = sample + noise_std * torch.randn_like(sample)
fill_value = sample.mean()
if freq_mask_width > 0 and freq_masks > 0:
max_width = min(freq_mask_width, freq_dim)
for _ in range(freq_masks):
width = int(torch.randint(0, max_width + 1, (1,), device=augmented.device).item())
if width == 0 or width > freq_dim:
continue
start = 0 if freq_dim == width else int(torch.randint(0, freq_dim - width + 1, (1,), device=augmented.device).item())
sample[:, start:start + width, :] = fill_value
if time_mask_width > 0 and time_masks > 0:
max_width = min(time_mask_width, time_dim)
for _ in range(time_masks):
width = int(torch.randint(0, max_width + 1, (1,), device=augmented.device).item())
if width == 0 or width > time_dim:
continue
start = 0 if time_dim == width else int(torch.randint(0, time_dim - width + 1, (1,), device=augmented.device).item())
sample[:, :, start:start + width] = fill_value
augmented[idx] = sample
return augmented.squeeze(1) if needs_squeeze else augmented
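# Usage sketch (illustrative parameters; accepts [B, H, W] or [B, 1, H, W]):
#   specs = torch.randn(8, 128, 128)
#   specs_aug = apply_spec_augment(specs, freq_mask_width=8, time_mask_width=8, mask_prob=0.7)
#   assert specs_aug.shape == specs.shape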
def _write_epoch_history(
rep_root: Path,
records: Sequence[Dict[str, object]],
enable_csv: bool,
enable_plot: bool,
) -> None:
if not records:
return
rep_root.mkdir(parents=True, exist_ok=True)
if enable_csv:
base_fields = [
"model",
"epoch",
"phase",
"train_loss",
"val_loss",
"val_acc",
"val_f1",
"lr",
"train_size_requested",
"train_size_effective",
]
extra_fields = sorted(
{key for rec in records for key in rec.keys() if key not in base_fields}
)
fieldnames = base_fields + extra_fields
sorted_records = sorted(records, key=lambda r: (r["epoch"], r["model"], r.get("phase", "")))
history_path = rep_root / "epoch_history.csv"
with open(history_path, "w", newline="", encoding="utf-8") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction="ignore")
writer.writeheader()
writer.writerows(sorted_records)
if enable_plot and HAVE_MPL:
models_in_run = sorted({rec["model"] for rec in records})
fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True)
for ax in axes:
ax.grid(True, linestyle='--', alpha=0.3)
for model_name_plot in models_in_run:
model_records = [rec for rec in records if rec["model"] == model_name_plot]
if not model_records:
continue
epochs = [rec["epoch"] for rec in model_records]
val_loss_values = [rec["val_loss"] for rec in model_records]
val_f1_values = [rec["val_f1"] for rec in model_records]
axes[0].plot(epochs, val_loss_values, marker='o', label=model_name_plot)
axes[1].plot(epochs, val_f1_values, marker='o', label=model_name_plot)
axes[0].set_ylabel('Val Loss')
axes[1].set_ylabel('Val F1')
axes[1].set_xlabel('Epoch')
axes[0].legend(loc='best')
axes[0].set_title('Per-epoch validation metrics')
fig.tight_layout()
fig.savefig(rep_root / 'epoch_history.png', dpi=150)
plt.close(fig)
class ResidualBlock1D(nn.Module):
"""Lightweight residual block used by the res1dcnn head."""
def __init__(self, in_channels: int, out_channels: int) -> None:
super().__init__()
self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1)
self.bn1 = nn.BatchNorm1d(out_channels)
self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1)
self.bn2 = nn.BatchNorm1d(out_channels)
self.shortcut = nn.Identity()
if in_channels != out_channels:
self.shortcut = nn.Sequential(
nn.Conv1d(in_channels, out_channels, kernel_size=1),
nn.BatchNorm1d(out_channels),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
residual = x
x = F.relu(self.bn1(self.conv1(x)))
x = self.bn2(self.conv2(x))
x = x + self.shortcut(residual)
x = F.relu(x)
return x
class Res1DCNNHead(nn.Module):
"""Residual 1D CNN classifier head that operates on 128-d LWM features."""
def __init__(self, input_dim: int, num_classes: int, dropout: float = 0.1) -> None:
super().__init__()
self.input_dim = int(input_dim)
hidden_dim = 64
self.conv1 = nn.Conv1d(1, hidden_dim, kernel_size=3, padding=1)
self.bn1 = nn.BatchNorm1d(hidden_dim)
self.res_block = ResidualBlock1D(hidden_dim, hidden_dim)
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_dim, num_classes)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = x.unsqueeze(1)
x = F.relu(self.bn1(self.conv1(x)))
x = self.res_block(x)
x = F.adaptive_avg_pool1d(x, 1).squeeze(-1)
x = self.dropout(x)
return self.fc(x)
class LWMClassifier(nn.Module):
def __init__(
self,
backbone: nn.Module,
trainable_layers: int,
num_classes: int,
classifier_dim: int = 128,
head_dropout: float = 0.1,
head_type: str = "mlp",
):
super().__init__()
self.backbone = backbone
self.patch_size = 4
self.unfold = nn.Unfold(kernel_size=self.patch_size, stride=self.patch_size)
head_dropout = max(0.0, float(head_dropout))
head_type = head_type.lower().strip()
if head_type == "linear":
head_layers: List[nn.Module] = [nn.LayerNorm(128)]
if head_dropout > 0:
head_layers.append(nn.Dropout(head_dropout))
head_layers.append(nn.Linear(128, num_classes))
self.classifier = nn.Sequential(*head_layers)
elif head_type == "res1dcnn":
self.classifier = nn.Sequential(
nn.LayerNorm(128),
Res1DCNNHead(128, num_classes, dropout=head_dropout),
)
else:
head_layers = [
nn.LayerNorm(128),
nn.Linear(128, classifier_dim),
nn.GELU(),
]
if head_dropout > 0:
head_layers.append(nn.Dropout(head_dropout))
head_layers.append(nn.Linear(classifier_dim, num_classes))
self.classifier = nn.Sequential(*head_layers)
for param in self.backbone.parameters():
param.requires_grad = False
if trainable_layers > 0:
for layer in self.backbone.layers[-trainable_layers:]:
for param in layer.parameters():
param.requires_grad = True
# Enable gradient checkpointing for memory efficiency
if hasattr(layer, 'gradient_checkpointing'):
layer.gradient_checkpointing = True
def spectrogram_to_tokens(self, x: torch.Tensor) -> torch.Tensor:
x = x.unsqueeze(1)
patches = self.unfold(x).transpose(1, 2)
cls = torch.full(
(patches.size(0), 1, patches.size(-1)), 0.2, dtype=patches.dtype, device=patches.device
)
return torch.cat([cls, patches], dim=1)
def forward_features(self, x: torch.Tensor) -> torch.Tensor:
tokens = self.spectrogram_to_tokens(x)
outputs = self.backbone(tokens)
if outputs.size(1) <= 1:
return outputs[:, 0, :]
return outputs[:, 1:, :].mean(dim=1)
def forward(self, x: torch.Tensor) -> torch.Tensor:
cls = self.forward_features(x)
return self.classifier(cls)
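# Token geometry note (derived from the defaults in build_model below): a 128x128 spectrogram
# unfolded into non-overlapping 4x4 patches yields (128 // 4) ** 2 = 1024 patch tokens of length
# 16, plus one CLS token = 1025 tokens, matching lwm_model(element_length=16, ..., max_len=1025).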
def create_simple_cnn(
num_classes: int,
hidden_dims: Tuple[int, ...] = (192,),
dropout: float = 0.3,
) -> nn.Module:
"""Create baseline CNN with configurable classifier width."""
if not hidden_dims:
raise ValueError("hidden_dims must contain at least one value")
layers: List[nn.Module] = [
nn.Conv2d(1, 16, 5, padding=2), nn.ReLU(), nn.MaxPool2d(2),
nn.Conv2d(16, 32, 5, padding=2), nn.ReLU(), nn.MaxPool2d(2),
nn.Conv2d(32, 64, 5, padding=2), nn.ReLU(), nn.AdaptiveAvgPool2d((4, 4)),
nn.Flatten(),
nn.Dropout(dropout),
]
in_dim = 4 * 4 * 64
fc_layers: List[nn.Module] = []
for idx, hidden_dim in enumerate(hidden_dims):
fc_layers.append(nn.Linear(in_dim, hidden_dim))
fc_layers.append(nn.ReLU())
fc_layers.append(nn.Dropout(dropout))
in_dim = hidden_dim
fc_layers.append(nn.Linear(in_dim, num_classes))
return nn.Sequential(*layers, *fc_layers)
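# Instantiation sketch (input shape follows the 1-channel 128x128 spectrograms used throughout):
#   model = create_simple_cnn(num_classes=5, hidden_dims=(272, 128))
#   logits = model(torch.randn(2, 1, 128, 128))  # -> shape (2, 5)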
def create_ieee_cnn(
num_classes: int,
hidden_dims: Tuple[int, ...] = (512, 256),
dropout: float = 0.3,
) -> nn.Module:
"""CNN inspired by IEEE 2021 joint SNR/mobility classifier."""
if not hidden_dims:
raise ValueError("hidden_dims must contain at least one value")
layers: List[nn.Module] = [
nn.Conv2d(1, 32, kernel_size=3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(inplace=True),
nn.MaxPool2d(2, 2),
nn.Dropout2d(p=dropout),
nn.Conv2d(32, 64, kernel_size=3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(inplace=True),
nn.MaxPool2d(2, 2),
nn.Dropout2d(p=dropout),
nn.Conv2d(64, 128, kernel_size=3, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.MaxPool2d(2, 2),
nn.Dropout2d(p=dropout),
nn.Conv2d(128, 256, kernel_size=3, padding=1),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.MaxPool2d(2, 2),
nn.Dropout2d(p=dropout),
nn.Conv2d(256, 256, kernel_size=3, padding=1),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.AdaptiveAvgPool2d((4, 4)),
nn.Flatten(),
nn.Dropout(dropout),
]
in_dim = 4 * 4 * 256
fc_layers: List[nn.Module] = []
for hidden_dim in hidden_dims:
fc_layers.append(nn.Linear(in_dim, hidden_dim))
fc_layers.append(nn.BatchNorm1d(hidden_dim))
fc_layers.append(nn.ReLU(inplace=True))
fc_layers.append(nn.Dropout(dropout))
in_dim = hidden_dim
fc_layers.append(nn.Linear(in_dim, num_classes))
return nn.Sequential(*layers, *fc_layers)
def build_model(
name: str,
num_classes: int,
    checkpoint: Path | None,
device: torch.device,
trainable_layers: int,
backbone_lr_factor: float,
overrides: Dict[str, object] | None = None,
) -> Tuple[nn.Module, List[Dict[str, object]]]:
name = name.lower()
param_groups: List[Dict[str, object]] = []
overrides = overrides or {}
if name == "lwm":
backbone = lwm_model(element_length=16, d_model=128, n_layers=12, max_len=1025, n_heads=8, dropout=0.1)
if checkpoint is None:
raise FileNotFoundError("Checkpoint is required for LWM-based models")
try:
state = torch.load(checkpoint, map_location="cpu", weights_only=True)
except TypeError:
# Older torch versions do not support weights_only
state = torch.load(checkpoint, map_location="cpu")
if any(k.startswith("module.") for k in state):
state = {k.replace("module.", ""): v for k, v in state.items()}
if any(k.startswith("backbone.") for k in state):
backbone_state = {
k.split("backbone.", 1)[1]: v
for k, v in state.items()
if k.startswith("backbone.")
}
else:
backbone_state = {
k: v
for k, v in state.items()
if not k.startswith("classifier.") and not k.startswith("projection_head.")
}
backbone.load_state_dict(backbone_state, strict=False)
classifier_dim = int(overrides.get("lwm_classifier_dim", 96))
head_dropout = float(overrides.get("lwm_head_dropout", 0.1))
head_type = str(overrides.get("lwm_head_type", "mlp")).lower()
model = LWMClassifier(
backbone,
trainable_layers=trainable_layers,
num_classes=num_classes,
classifier_dim=classifier_dim,
head_dropout=head_dropout,
head_type=head_type,
)
head_params = list(model.classifier.parameters())
param_groups.append({"params": head_params, "scale": 1.0})
if trainable_layers > 0:
backbone_params: List[nn.Parameter] = []
for layer in model.backbone.layers[-trainable_layers:]:
backbone_params.extend(layer.parameters())
backbone_params = _unique_parameters(backbone_params)
if backbone_params:
param_groups.append({"params": backbone_params, "scale": backbone_lr_factor})
elif name == "resnet18":
from torchvision import models
backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
backbone.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
nn.init.kaiming_normal_(backbone.conv1.weight, mode='fan_out', nonlinearity='relu')
in_features = backbone.fc.in_features
head_width = int(overrides.get("resnet_head_width", 384))
imagenet_head_dropout = float(overrides.get("imagenet_head_dropout", 0.45))
imagenet_head_dropout = max(0.0, min(imagenet_head_dropout, 0.9))
pre_fc_dropout = max(0.0, min(imagenet_head_dropout * 0.5, 0.9))
backbone.fc = nn.Sequential(
nn.Dropout(p=pre_fc_dropout),
nn.Linear(in_features, head_width),
nn.LayerNorm(head_width),
nn.ReLU(inplace=True),
nn.Dropout(p=imagenet_head_dropout),
nn.Linear(head_width, num_classes),
)
for param in backbone.parameters():
param.requires_grad = False
head_params = list(backbone.fc.parameters())
for param in head_params:
param.requires_grad = True
imagenet_weight_decay = overrides.get("imagenet_weight_decay", None)
head_group: Dict[str, object] = {"params": head_params, "scale": 1.0}
if imagenet_weight_decay is not None:
head_group["weight_decay"] = float(imagenet_weight_decay)
param_groups.append(head_group)
adapt_params: List[nn.Parameter] = []
if hasattr(backbone.layer4[0], "downsample") and backbone.layer4[0].downsample is not None:
adapt_params.extend(backbone.layer4[0].downsample[0].parameters())
if len(backbone.layer4[0].downsample) > 1:
adapt_params.extend(backbone.layer4[0].downsample[1].parameters())
for module in backbone.layer4[-1].modules():
if isinstance(module, nn.BatchNorm2d):
adapt_params.extend(module.parameters())
adapt_params = _unique_parameters(adapt_params)
for param in adapt_params:
param.requires_grad = True
if adapt_params:
adapt_group: Dict[str, object] = {"params": adapt_params, "scale": backbone_lr_factor}
if imagenet_weight_decay is not None:
adapt_group["weight_decay"] = float(imagenet_weight_decay)
param_groups.append(adapt_group)
model = backbone
elif name == "efficientnet_b0":
from torchvision import models
backbone = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
first_conv = backbone.features[0][0]
backbone.features[0][0] = nn.Conv2d(1, first_conv.out_channels, kernel_size=3, stride=2, padding=1, bias=False)
nn.init.kaiming_normal_(backbone.features[0][0].weight, mode='fan_out', nonlinearity='relu')
in_features = backbone.classifier[-1].in_features
head_width = int(overrides.get("efficientnet_head_width", 192))
imagenet_head_dropout = float(overrides.get("imagenet_head_dropout", 0.45))
imagenet_head_dropout = max(0.0, min(imagenet_head_dropout, 0.9))
pre_fc_dropout = max(0.0, min(imagenet_head_dropout * 0.5, 0.9))
backbone.classifier = nn.Sequential(
nn.Dropout(p=pre_fc_dropout),
nn.Linear(in_features, head_width),
nn.LayerNorm(head_width),
nn.ReLU(inplace=True),
nn.Dropout(p=imagenet_head_dropout),
nn.Linear(head_width, num_classes),
)
for param in backbone.parameters():
param.requires_grad = False
head_params = list(backbone.classifier.parameters())
for param in head_params:
param.requires_grad = True
imagenet_weight_decay = overrides.get("imagenet_weight_decay", None)
head_group = {"params": head_params, "scale": 1.0}
if imagenet_weight_decay is not None:
head_group["weight_decay"] = float(imagenet_weight_decay)
param_groups.append(head_group)
adapt_params: List[nn.Parameter] = []
final_block = backbone.features[7][0]
# Depthwise conv + associated norms for the last MBConv block
depthwise = final_block.block[1][0]
adapt_params.extend(depthwise.parameters())
for idx in (0, 1, 3):
for module in final_block.block[idx].modules():
if isinstance(module, nn.BatchNorm2d):
adapt_params.extend(module.parameters())
adapt_params = _unique_parameters(adapt_params)
for param in adapt_params:
param.requires_grad = True
if adapt_params:
adapt_group = {"params": adapt_params, "scale": backbone_lr_factor}
if imagenet_weight_decay is not None:
adapt_group["weight_decay"] = float(imagenet_weight_decay)
param_groups.append(adapt_group)
model = backbone
elif name == "mobilenet_v3_small":
from torchvision import models
backbone = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.DEFAULT)
backbone.features[0][0] = nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1, bias=False)
nn.init.kaiming_normal_(backbone.features[0][0].weight, mode='fan_out', nonlinearity='relu')
with torch.no_grad():
dummy = torch.zeros(1, 1, 128, 128)
features = backbone.features(dummy)
pooled = backbone.avgpool(features)
flattened = torch.flatten(pooled, 1)
in_features = flattened.shape[1]
head_width = int(overrides.get("mobilenet_head_width", 320))
imagenet_head_dropout = float(overrides.get("imagenet_head_dropout", 0.45))
imagenet_head_dropout = max(0.0, min(imagenet_head_dropout, 0.9))
pre_fc_dropout = max(0.0, min(imagenet_head_dropout * 0.5, 0.9))
backbone.classifier = nn.Sequential(
nn.Dropout(p=pre_fc_dropout),
nn.Linear(in_features, head_width),
nn.LayerNorm(head_width),
nn.Hardswish(),
nn.Dropout(p=imagenet_head_dropout),
nn.Linear(head_width, num_classes),
)
for param in backbone.parameters():
param.requires_grad = False
head_params = list(backbone.classifier.parameters())
for param in head_params:
param.requires_grad = True
imagenet_weight_decay = overrides.get("imagenet_weight_decay", None)
head_group = {"params": head_params, "scale": 1.0}
if imagenet_weight_decay is not None:
head_group["weight_decay"] = float(imagenet_weight_decay)
param_groups.append(head_group)
adapt_params: List[nn.Parameter] = []
adapt_params.extend(backbone.features[-1][0].parameters())
for module in backbone.features[-2].modules():
if isinstance(module, nn.BatchNorm2d):
adapt_params.extend(module.parameters())
adapt_params = _unique_parameters(adapt_params)
for param in adapt_params:
param.requires_grad = True
if adapt_params:
adapt_group = {"params": adapt_params, "scale": backbone_lr_factor}
if imagenet_weight_decay is not None:
adapt_group["weight_decay"] = float(imagenet_weight_decay)
param_groups.append(adapt_group)
model = backbone
elif name in {"simple_cnn", "simplecnn"}:
hidden_dims = overrides.get("simple_cnn_hidden_dims", (192,))
if isinstance(hidden_dims, Sequence) and not isinstance(hidden_dims, str):
simple_dims = tuple(int(dim) for dim in hidden_dims)
else:
simple_dims = (int(hidden_dims),)
model = create_simple_cnn(num_classes, hidden_dims=simple_dims)
head_params = list(model.parameters())
param_groups.append({"params": head_params, "scale": 1.0})
elif name in {"ieee_cnn", "ieeecnn"}:
hidden_dims = overrides.get("ieee_cnn_hidden_dims", (512, 256))
if isinstance(hidden_dims, Sequence) and not isinstance(hidden_dims, str):
ieee_dims = tuple(int(dim) for dim in hidden_dims)
else:
ieee_dims = (int(hidden_dims),)
dropout = float(overrides.get("ieee_cnn_dropout", 0.3))
model = create_ieee_cnn(num_classes, hidden_dims=ieee_dims, dropout=dropout)
head_params = list(model.parameters())
param_groups.append({"params": head_params, "scale": 1.0})
else:
raise ValueError(f"Unknown model: {name}")
return model.to(device), param_groups
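# Call sketch (arguments are illustrative; checkpoint may be None for non-LWM models):
#   model, param_groups = build_model("resnet18", num_classes=5, checkpoint=None,
#                                     device=torch.device("cpu"), trainable_layers=0,
#                                     backbone_lr_factor=0.3, overrides=None)
#   # each group's "scale" is a relative LR multiplier (see --backbone-lr-factor)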
def _unwrap_module(model: nn.Module) -> nn.Module:
return model.module if isinstance(model, nn.DataParallel) else model
def _strip_module_prefix(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
if not state_dict:
return state_dict
needs_strip = any(key.startswith("module.") for key in state_dict)
if not needs_strip:
return state_dict
stripped = state_dict.__class__() if hasattr(state_dict, "__class__") else {}
for key, value in state_dict.items():
new_key = key.split("module.", 1)[1] if key.startswith("module.") else key
stripped[new_key] = value
return stripped
def _model_forward(
model: nn.Module,
specs: torch.Tensor,
input_stats: Optional[torch.Tensor] = None,
) -> torch.Tensor:
base_model = _unwrap_module(model)
is_lwm_like = isinstance(base_model, LWMClassifier)
if not is_lwm_like and LWMClassifierMinimal is not None:
is_lwm_like = isinstance(base_model, LWMClassifierMinimal)
if not is_lwm_like and hasattr(base_model, "spectrogram_to_tokens"):
is_lwm_like = True
if is_lwm_like:
while specs.dim() > 3 and specs.size(1) == 1:
specs = specs.squeeze(1)
if specs.dim() != 3:
specs = specs.view(specs.size(0), specs.size(-2), specs.size(-1))
if not is_lwm_like and specs.dim() == 3:
specs = specs.unsqueeze(1)
if input_stats is not None:
input_stats = input_stats.to(specs.device, non_blocking=True)
supports_stats = bool(
is_lwm_like and hasattr(base_model, "append_input_stats") and getattr(base_model, "append_input_stats")
)
if supports_stats and input_stats is not None:
return model(specs, input_stats=input_stats)
return model(specs)
def train_one_epoch(model, loader, optimizer, device, scaler=None, batch_normalize: bool = False):
criterion = nn.CrossEntropyLoss(reduction='mean')
model.train()
total_loss = 0.0
total = 0
for specs, labels in loader:
specs = specs.to(device, non_blocking=True)
if batch_normalize:
specs = normalize_batch(specs)
labels = labels.to(device, non_blocking=True)
optimizer.zero_grad(set_to_none=True)
# Use autocast only for CUDA
if scaler is not None and device.type == 'cuda':
with autocast(device_type='cuda'):
logits = _model_forward(model, specs)
loss = criterion(logits, labels)
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()
else:
# HPU and CPU use standard forward/backward
logits = _model_forward(model, specs)
loss = criterion(logits, labels)
loss.backward()
optimizer.step()
total_loss += loss.item() * labels.size(0)
total += labels.size(0)
# Clear cache periodically to prevent memory fragmentation
if device.type == 'cuda':
torch.cuda.empty_cache()
return total_loss / max(total, 1)
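# Wiring sketch (the scaler only matters on CUDA; see evaluate() below for the matching eval call):
#   scaler = GradScaler() if device.type == "cuda" else None
#   train_loss = train_one_epoch(model, train_loader, optimizer, device, scaler=scaler)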
@torch.no_grad()
def evaluate(
model,
loader,
device,
debug: Optional[Dict[str, object]] = None,
batch_normalize: bool = False,
) -> Tuple[float, float, float]:
criterion = nn.CrossEntropyLoss(reduction='mean')
model.eval()
total_loss = 0.0
correct = 0
total = 0
all_preds: List[np.ndarray] = []
all_labels: List[np.ndarray] = []
debug_batches = int(debug.get("log_batches", 0)) if debug else 0
debug_every = max(1, int(debug.get("log_every", 1))) if debug else 1
log_softmax = bool(debug.get("log_softmax", False)) if debug else False
debug_logged = 0
for batch_idx, batch in enumerate(loader, start=1):
stats_batch: Optional[torch.Tensor]
if isinstance(batch, (list, tuple)) and len(batch) == 3:
specs, stats_batch, labels = batch
stats_batch = stats_batch.to(device, non_blocking=True)
else:
specs, labels = batch # type: ignore[misc]
stats_batch = None
specs = specs.to(device, non_blocking=True)
if batch_normalize:
specs = normalize_batch(specs)
labels = labels.to(device, non_blocking=True)
# Use autocast only for CUDA, not for HPU or CPU
if device.type == 'cuda':
context = autocast(device_type='cuda')
else:
context = nullcontext()
with context:
logits = _model_forward(model, specs, stats_batch)
loss = criterion(logits, labels)
preds = logits.argmax(dim=1)
total_loss += loss.item() * labels.size(0)
correct += (preds == labels).sum().item()
total += labels.size(0)
all_preds.append(preds.detach().cpu().numpy())
all_labels.append(labels.detach().cpu().numpy())
should_log = (
debug_batches > 0
and debug_logged < debug_batches
and ((batch_idx - 1) % debug_every == 0)
)
if should_log:
specs_cpu = specs.detach().cpu()
logits_cpu = logits.detach().cpu()
loss_scalar = float(loss.detach().cpu().item())
finite_specs = torch.isfinite(specs).all().item()
finite_logits = torch.isfinite(logits).all().item()
print(
f" [DEBUG][eval][batch {batch_idx}] loss={loss_scalar:.6f} "
f"reduction={criterion.reduction} labels_shape={tuple(labels.shape)}"
)
print(
f" specs dtype={specs.dtype} mean={specs_cpu.mean():.4f} std={specs_cpu.std():.4f} "
f"min={specs_cpu.min():.4f} max={specs_cpu.max():.4f} finite={bool(finite_specs)}"
)
print(
f" logits dtype={logits.dtype} mean={logits_cpu.mean():.4f} std={logits_cpu.std():.4f} "
f"min={logits_cpu.min():.4f} max={logits_cpu.max():.4f} finite={bool(finite_logits)}"
)
unique_labels, counts = torch.unique(labels.detach().cpu(), return_counts=True)
label_info = ", ".join(
f"{int(lbl)}:{int(cnt)}" for lbl, cnt in zip(unique_labels, counts)
)
print(f" label distribution -> {label_info}")
if log_softmax:
probs = torch.softmax(logits_cpu, dim=1)
print(
f" softmax mean={probs.mean():.4f} std={probs.std():.4f} "
f"min={probs.min():.4f} max={probs.max():.4f}"
)
debug_logged += 1
# Clear cache periodically
if device.type == 'cuda':
torch.cuda.empty_cache()
y_true = np.concatenate(all_labels) if all_labels else np.empty(0)
y_pred = np.concatenate(all_preds) if all_preds else np.empty(0)
f1 = compute_f1(y_true, y_pred) if y_true.size > 0 else 0.0
return total_loss / max(total, 1), correct / max(total, 1), f1
def set_seed(seed: int) -> None:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# HPU seed setting (if available)
if HPU_AVAILABLE and hasattr(torch.hpu, "manual_seed"):
torch.hpu.manual_seed(seed)
def main() -> None:
# Set CUDA memory allocation configuration to reduce fragmentation
    if torch.cuda.is_available():
        os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
args = parse_args()
if args.early_min_epochs < 10:
print(
f"[INFO] Requested early_min_epochs={args.early_min_epochs} < 10; enforcing minimum of 10"
)
args.early_min_epochs = 10
set_seed(args.seed)
require_checkpoint = any(model.lower() == "lwm" for model in args.models)
checkpoint, stats = resolve_checkpoint_and_stats(args, require_checkpoint=require_checkpoint)
normalization_mode = str(stats.get("normalization", "dataset")).lower()
print(f"[INFO] Normalization mode from stats: {normalization_mode}")
data_root = Path(args.data_root)
available_snrs, available_mobilities = discover_snr_mobility(
data_root, args.cities, args.comm_types, args.fft_folder
)
train_snrs = args.snrs if args.snrs else available_snrs
train_mobilities = args.mobilities if args.mobilities else available_mobilities
val_snrs = args.val_snrs if args.val_snrs else available_snrs
val_mobilities = args.val_mobilities if args.val_mobilities else available_mobilities
class_configs = build_config_map(
data_root, args.cities, args.comm_types, train_snrs, train_mobilities, args.fft_folder
)
active_labels = [cls for cls, configs in class_configs.items() if any(configs.values())]
if not active_labels:
raise RuntimeError("No modulation classes found with the provided filters.")
class_configs = {cls: class_configs[cls] for cls in active_labels}
label_to_local = {cls: idx for idx, cls in enumerate(sorted(active_labels))}
num_classes = len(active_labels)
print("[INFO] Active modulation classes:", ", ".join(LABEL_NAMES.get(cls, str(cls)) for cls in sorted(active_labels)))
config_arrays = load_config_arrays(class_configs)
global_config_map = build_config_map(
Path(args.data_root), args.cities, args.comm_types, val_snrs, val_mobilities, args.fft_folder
)
global_config_arrays = load_config_arrays(global_config_map)
print("[INFO] Training SNRs:", ", ".join(train_snrs))
print("[INFO] Training mobilities:", ", ".join(train_mobilities))
print("[INFO] Validation/Test SNRs:", ", ".join(val_snrs))
print("[INFO] Validation/Test mobilities:", ", ".join(val_mobilities))
per_class_totals: Dict[int, int] = {}
for cls in sorted(active_labels):
configs = config_arrays[cls]
total_samples = sum(arr.shape[0] for arr in configs.values())
per_class_totals[cls] = total_samples
print(f"[INFO] Class {LABEL_NAMES.get(cls, str(cls))}: {len(configs)} configs, {total_samples} samples")
if cls not in global_config_arrays or not global_config_arrays[cls]:
raise RuntimeError(f"No global data found for modulation {LABEL_NAMES.get(cls, str(cls))}")
min_class_total = min(per_class_totals.values())
max_train_per_class = min_class_total - args.val_per_class - args.test_per_class
if max_train_per_class <= 0:
raise RuntimeError(
"Requested val/test splits leave no data for training. "
f"Minimum class has {min_class_total} samples; "
f"val={args.val_per_class}, test={args.test_per_class}."
)
if any(size > max_train_per_class for size in args.train_sizes):
adjusted: List[int] = []
for size in args.train_sizes:
if size > max_train_per_class:
print(
f"[WARN] Requested train size {size} exceeds available "
f"{max_train_per_class} per class after val/test splits; capping."
)
capped = min(size, max_train_per_class)
if capped not in adjusted:
adjusted.append(capped)
args.train_sizes = adjusted if adjusted else [max_train_per_class]
print(f"[INFO] Effective train sizes per class: {args.train_sizes}")
# Device selection: auto, cuda, hpu, or cpu
requested_device = args.device.lower()
if requested_device == "auto":
if HPU_AVAILABLE:
requested_device = "hpu"
elif torch.cuda.is_available():
requested_device = "cuda"
else:
requested_device = "cpu"
# Setup device based on selection
if requested_device == "hpu":
if not HPU_AVAILABLE:
raise RuntimeError(
"HPU device requested but not available. "
"Install Habana PyTorch or select --device cuda/cpu."
)
device = torch.device("hpu")
# Set HPU device (typically device 0 for single-process)
if hasattr(torch.hpu, "set_device"):
torch.hpu.set_device(0)
print(f"[INFO] Using HPU device")
active_gpu_ids = [] # Not applicable for HPU
multi_gpu = False
elif requested_device == "cuda":
cuda_available = torch.cuda.is_available()
if not cuda_available:
raise RuntimeError("CUDA device requested but not available.")
available_gpu_ids = list(range(torch.cuda.device_count()))
if args.gpu_ids is not None:
invalid_ids = [gpu_id for gpu_id in args.gpu_ids if gpu_id not in available_gpu_ids]
if invalid_ids:
raise ValueError(
f"Requested GPU IDs not available: {invalid_ids}; available: {available_gpu_ids}"
)
active_gpu_ids = list(dict.fromkeys(args.gpu_ids))
else:
active_gpu_ids = available_gpu_ids
if active_gpu_ids:
primary_gpu = active_gpu_ids[0]
torch.cuda.set_device(primary_gpu)
device = torch.device(f"cuda:{primary_gpu}")
print(f"[INFO] Using CUDA device(s): {', '.join(str(i) for i in active_gpu_ids)}")
else:
device = torch.device("cpu")
print("[INFO] CUDA requested but no GPUs available, using CPU")
multi_gpu = len(active_gpu_ids) > 1
if multi_gpu:
print(f"[INFO] Enabling DataParallel across GPUs: {', '.join(str(i) for i in active_gpu_ids)}")
else: # cpu
device = torch.device("cpu")
if args.gpu_ids is not None:
print("[WARN] GPU IDs specified but using CPU")
print("[INFO] Using CPU")
active_gpu_ids = []
multi_gpu = False
print(f"[INFO] Using device: {device}")
args.output_dir.mkdir(parents=True, exist_ok=True)
print(f"[INFO] Saving outputs under: {args.output_dir}")
eval_debug_config: Optional[Dict[str, object]] = None
if args.debug_eval_batches > 0:
eval_debug_config = {
"log_batches": int(args.debug_eval_batches),
"log_every": max(1, int(args.debug_eval_interval)),
"log_softmax": bool(args.debug_eval_softmax),
}
print(
"[INFO] Evaluation debug logging enabled -> batches:"
f" {eval_debug_config['log_batches']}, interval: {eval_debug_config['log_every']}"
)
summary_device = torch.device("cpu") if device.type == "cuda" else device
model_overrides: Dict[str, object] = {
"resnet_head_width": args.resnet_head_width,
"efficientnet_head_width": args.efficientnet_head_width,
"mobilenet_head_width": args.mobilenet_head_width,
"simple_cnn_hidden_dims": tuple(args.simple_cnn_hidden_dims),
"ieee_cnn_hidden_dims": tuple(args.ieee_cnn_hidden_dims),
"ieee_cnn_dropout": args.ieee_cnn_dropout,
"lwm_classifier_dim": args.lwm_classifier_dim,
"lwm_head_dropout": args.lwm_head_dropout,
"lwm_head_type": args.lwm_head_type,
"imagenet_head_dropout": args.imagenet_head_dropout,
"imagenet_weight_decay": args.weight_decay * args.imagenet_weight_decay_scale,
}
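# Dry run: instantiate each requested backbone once on summary_device (CPU when
# training on CUDA) purely to report total/trainable parameter counts, then
# discard it before the real training runs.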
print("\n[INFO] Parameter counts per model (total/trainable):")
for model_name in args.models:
lower_name = model_name.lower()
trainable_layers = args.lwm_trainable_layers if lower_name == "lwm" else 0
model_checkpoint = checkpoint
backbone_lr_factor = args.backbone_lr_factor
if lower_name == "lwm" and args.lwm_backbone_lr_factor is not None:
backbone_lr_factor = args.lwm_backbone_lr_factor
model, _ = build_model(
model_name,
num_classes,
model_checkpoint,
summary_device,
trainable_layers=trainable_layers,
backbone_lr_factor=backbone_lr_factor,
overrides=model_overrides,
)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f" {model_name}: {total_params:,} / {trainable_params:,}")
del model
if device.type == 'cuda':
torch.cuda.empty_cache()
raw_input_models = set(args.raw_input_models)
active_raw_models = [model for model in args.models if model.lower() in raw_input_models]
if active_raw_models:
print(
"[INFO] Raw spectrogram input (per-batch normalization) for models: "
+ ", ".join(active_raw_models)
)
normalized_models = [model for model in args.models if model.lower() not in raw_input_models]
requires_normalized_inputs = len(normalized_models) > 0
if requires_normalized_inputs:
print(
"[INFO] Applying normalization for models: "
+ ", ".join(normalized_models)
)
else:
print("[INFO] All selected models consume raw spectrograms; normalization skipped")
summary: Dict[str, Dict[int, Dict[str, List[float]]]] = {
model: {size: {"acc": [], "f1": [], "val_f1": [], "val_loss": []} for size in args.train_sizes}
for model in args.models
}
train_sizes_sorted = sorted(args.train_sizes)
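# Train sizes are processed in ascending order within each repetition so that
# the per-class index selections can be reused and extended as the size grows.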
for repetition in range(1, args.repetitions + 1):
selection_for_repetition: Dict[int, Dict[str, set[int]]] = {}
val_rng_seed = args.seed + repetition * 100000
val_rng = np.random.default_rng(val_rng_seed)
fixed_val_samples: Dict[int, np.ndarray] = {}
fixed_test_samples: Dict[int, np.ndarray] = {}
val_reserved_indices: Dict[int, Dict[str, set[int]]] = {}
test_reserved_indices: Dict[int, Dict[str, set[int]]] = {}
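# Reserve fixed validation/test samples per class for this repetition, drawn
# from the global (val-filter) arrays with a repetition-seeded RNG; the used
# indices are recorded so they can be excluded from the training pool below.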
for cls in sorted(config_arrays.keys()):
global_arrays = global_config_arrays[cls]
global_avail = {cfg: set(range(arr.shape[0])) for cfg, arr in global_arrays.items()}
val_samples, val_used = sample_global_arrays(global_arrays, global_avail, args.val_per_class, val_rng)
test_samples, test_used = sample_global_arrays(global_arrays, global_avail, args.test_per_class, val_rng)
fixed_val_samples[cls] = val_samples
fixed_test_samples[cls] = test_samples
val_reserved_indices[cls] = {cfg: set(indices) for cfg, indices in val_used.items()}
test_reserved_indices[cls] = {cfg: set(indices) for cfg, indices in test_used.items()}
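# Each train size extends the indices already selected for smaller sizes in this
# repetition, so smaller training sets are nested subsets of larger ones.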
for train_size in train_sizes_sorted:
rep_seed = args.seed + train_size * 1000 + repetition
rng = np.random.default_rng(rep_seed)
repetition_records: List[Dict[str, object]] = []
per_size_val_metrics: List[Tuple[str, float, float, float]] = []
train_specs, train_labels = [], []
val_specs, val_labels = [], []
test_specs, test_labels = [], []
class_contexts: Dict[int, Dict[str, Any]] = {}
class_capacities: Dict[int, int] = {}
for cls in sorted(config_arrays.keys()):
arrays_map = config_arrays[cls]
if not arrays_map:
raise RuntimeError(f"No data for class {LABEL_NAMES[cls]}")
if cls not in selection_for_repetition:
selection_for_repetition[cls] = defaultdict(set)
prev_config_indices = selection_for_repetition[cls]
prev_total = sum(len(sel_indices) for sel_indices in prev_config_indices.values())
if prev_total > train_size:
raise ValueError(
f"Requested train size {train_size} is smaller than previously selected {prev_total} "
f"for class {LABEL_NAMES[cls]}"
)
train_avail = {config: set(range(arr.shape[0])) for config, arr in arrays_map.items()}
for config, sel_indices in prev_config_indices.items():
if sel_indices and config in train_avail:
train_avail[config].difference_update(sel_indices)
val_reserved = val_reserved_indices.get(cls, {})
test_reserved = test_reserved_indices.get(cls, {})
for config, reserved in val_reserved.items():
if reserved and config in train_avail:
train_avail[config].difference_update(reserved)
for config, reserved in test_reserved.items():
if reserved and config in train_avail:
train_avail[config].difference_update(reserved)
available_now = sum(len(indices) for indices in train_avail.values())
capacity = prev_total + available_now
class_contexts[cls] = {
"arrays_map": arrays_map,
"prev_indices": prev_config_indices,
"train_avail": train_avail,
}
class_capacities[cls] = capacity
if not class_capacities:
raise RuntimeError("No modulation classes available for training")
min_capacity = min(class_capacities.values())
limiting_classes = sorted(cls for cls, cap in class_capacities.items() if cap == min_capacity)
effective_train_size = min(train_size, min_capacity)
if effective_train_size < train_size:
limiting_labels = ", ".join(LABEL_NAMES.get(cls, str(cls)) for cls in limiting_classes)
if not limiting_labels:
limiting_labels = "unknown"
print(
f"[WARN] Requested train size {train_size} exceeds available "
f"{min_capacity} after reserving val/test samples; using {effective_train_size} "
f"(limited by {limiting_labels})"
)
if effective_train_size <= 0:
raise RuntimeError("No training samples available after reserving val/test splits")
for cls in sorted(config_arrays.keys()):
ctx = class_contexts[cls]
arrays_map = ctx["arrays_map"]
prev_config_indices = ctx["prev_indices"]
train_avail = ctx["train_avail"]
selected_arrays: List[np.ndarray] = []
prev_total = 0
for config, sel_indices in prev_config_indices.items():
if sel_indices:
idx_sorted = sorted(sel_indices)
selected_arrays.append(arrays_map[config][idx_sorted])
prev_total += len(sel_indices)
needed = max(effective_train_size - prev_total, 0)
if needed > 0:
additional_samples, train_used = sample_train_arrays(arrays_map, train_avail, needed, rng)
if additional_samples.size == 0:
raise RuntimeError("Failed to collect additional training samples")
selected_arrays.append(additional_samples)
for config, indices in train_used.items():
prev_config_indices[config].update(int(idx) for idx in indices)
if not selected_arrays:
raise RuntimeError("No training samples collected")
train_samples = np.concatenate(selected_arrays, axis=0)
if train_samples.shape[0] != effective_train_size:
print(
f"[WARN] Collected {train_samples.shape[0]} training samples for "
f"{LABEL_NAMES.get(cls, str(cls))}, expected {effective_train_size}"
)
val_samples = fixed_val_samples[cls]
test_samples = fixed_test_samples[cls]
train_specs.append(train_samples)
val_specs.append(val_samples)
test_specs.append(test_samples)
local_label = label_to_local[cls]
train_labels.append(np.full(train_samples.shape[0], local_label, dtype=np.int64))
val_labels.append(np.full(val_samples.shape[0], local_label, dtype=np.int64))
test_labels.append(np.full(test_samples.shape[0], local_label, dtype=np.int64))
train_specs_raw = np.concatenate(train_specs)
val_specs_raw = np.concatenate(val_specs)
test_specs_raw = np.concatenate(test_specs)
train_labels = np.concatenate(train_labels)
val_labels = np.concatenate(val_labels)
test_labels = np.concatenate(test_labels)
# Sanity-check the splits: report sizes and per-class counts. Overlap between
# splits is avoided by construction (reserved val/test indices are removed from
# the training availability sets), so only the counts are checked here.
print(
f"[INFO] Verifying data splits for train_size={train_size} "
f"(effective {effective_train_size}), rep={repetition}..."
)
print(
f" Train: {len(train_labels)} samples "
f"(~{effective_train_size} per class expected)"
)
print(f" Val: {len(val_labels)} samples ({args.val_per_class} per class)")
print(f" Test: {len(test_labels)} samples ({args.test_per_class} per class)")
# Check class balance
train_class_counts = Counter(train_labels)
val_class_counts = Counter(val_labels)
test_class_counts = Counter(test_labels)
print(f"[INFO] Train class distribution: {dict(train_class_counts)}")
print(f"[INFO] Val class distribution: {dict(val_class_counts)}")
print(f"[INFO] Test class distribution: {dict(test_class_counts)}")
# Verify all classes have expected counts
expected_train_per_class = effective_train_size
for cls_idx in range(num_classes):
if train_class_counts[cls_idx] != expected_train_per_class:
print(f"[WARN] Class {cls_idx} has {train_class_counts[cls_idx]} train samples, expected {expected_train_per_class}")
if val_class_counts[cls_idx] != args.val_per_class:
print(f"[WARN] Class {cls_idx} has {val_class_counts[cls_idx]} val samples, expected {args.val_per_class}")
if test_class_counts[cls_idx] != args.test_per_class:
print(f"[WARN] Class {cls_idx} has {test_class_counts[cls_idx]} test samples, expected {args.test_per_class}")
print(f"[INFO] ✓ All splits have balanced class distribution")
train_ds_raw = SpectrogramDataset(train_specs_raw, train_labels)
val_ds_raw = SpectrogramDataset(val_specs_raw, val_labels)
test_ds_raw = SpectrogramDataset(test_specs_raw, test_labels)
train_loader_raw = DataLoader(
train_ds_raw,
batch_size=args.batch_size,
shuffle=True,
num_workers=2,
pin_memory=False,
)
val_loader_raw = DataLoader(
val_ds_raw,
batch_size=args.batch_size,
shuffle=False,
num_workers=2,
pin_memory=False,
)
test_loader_raw = DataLoader(
test_ds_raw,
batch_size=args.batch_size,
shuffle=False,
num_workers=2,
pin_memory=False,
)
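# Dataset-normalized copies of the three splits are only materialized when at
# least one selected model expects them; raw-input models reuse the raw loaders
# and normalize per batch inside train_one_epoch/evaluate.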
train_loader_normalized: Optional[DataLoader] = None
val_loader_normalized: Optional[DataLoader] = None
test_loader_normalized: Optional[DataLoader] = None
if requires_normalized_inputs:
train_specs_normalized = apply_normalization(train_specs_raw, stats)
val_specs_normalized = apply_normalization(val_specs_raw, stats)
test_specs_normalized = apply_normalization(test_specs_raw, stats)
train_ds_normalized = SpectrogramDataset(train_specs_normalized, train_labels)
val_ds_normalized = SpectrogramDataset(val_specs_normalized, val_labels)
test_ds_normalized = SpectrogramDataset(test_specs_normalized, test_labels)
train_loader_normalized = DataLoader(
train_ds_normalized,
batch_size=args.batch_size,
shuffle=True,
num_workers=2,
pin_memory=False,
)
val_loader_normalized = DataLoader(
val_ds_normalized,
batch_size=args.batch_size,
shuffle=False,
num_workers=2,
pin_memory=False,
)
test_loader_normalized = DataLoader(
test_ds_normalized,
batch_size=args.batch_size,
shuffle=False,
num_workers=2,
pin_memory=False,
)
rep_root = args.output_dir / f"size_{train_size}" / f"rep_{repetition}"
rep_root.mkdir(parents=True, exist_ok=True)
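# Per-model artifacts (checkpoint, metrics.json, optional per-epoch checkpoints)
# are written under output_dir/size_{N}/rep_{M}/{model_name}/.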
for model_name in args.models:
model_root = rep_root / model_name
model_root.mkdir(parents=True, exist_ok=True)
epoch_ckpt_dir: Optional[Path] = None
if args.save_epoch_checkpoints:
epoch_ckpt_dir = model_root / "epoch_checkpoints"
epoch_ckpt_dir.mkdir(parents=True, exist_ok=True)
for old_path in epoch_ckpt_dir.glob("epoch_*.pth"):
if old_path.is_file():
old_path.unlink()
print(
f"\n[INFO] Size {train_size} (effective {effective_train_size}), "
f"repetition {repetition}, model {model_name}"
)
# Derive the per-model seed offset deterministically: Python's built-in hash()
# of a string varies across interpreter runs unless PYTHONHASHSEED is fixed.
model_seed_offset = sum(ord(ch) for ch in model_name) % 1000
set_seed(rep_seed + model_seed_offset)
lower_name = model_name.lower()
use_raw_input = lower_name in raw_input_models
if use_raw_input:
train_loader = train_loader_raw
val_loader = val_loader_raw
test_loader = test_loader_raw
print(" [INFO] Feeding raw spectrograms with per-batch normalization")
else:
if (
train_loader_normalized is None
or val_loader_normalized is None
or test_loader_normalized is None
):
raise RuntimeError(
"Normalized loaders were requested but could not be constructed."
)
train_loader = train_loader_normalized
val_loader = val_loader_normalized
test_loader = test_loader_normalized
trainable_layers = args.lwm_trainable_layers if lower_name == "lwm" else 0
backbone_lr_factor = args.backbone_lr_factor
if lower_name == "lwm" and args.lwm_backbone_lr_factor is not None:
backbone_lr_factor = args.lwm_backbone_lr_factor
model_checkpoint = checkpoint
model, param_groups = build_model(
model_name,
num_classes,
model_checkpoint,
device,
trainable_layers=trainable_layers,
backbone_lr_factor=backbone_lr_factor,
overrides=model_overrides,
)
if multi_gpu:
model = nn.DataParallel(model, device_ids=active_gpu_ids)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(
f"[INFO] Parameters (total/trainable): {total_params:,} / {trainable_params:,}"
)
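# AdamW with per-group learning rates: build_model may return parameter groups
# carrying a "scale" factor (e.g. a reduced backbone LR) and an optional
# per-group weight decay; otherwise a single group of all trainable params is used.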
def make_optimizer(base_lr: float) -> torch.optim.Optimizer:
optim_groups: List[Dict[str, object]] = []
if param_groups:
for group in param_groups:
scale = float(group.get("scale", 1.0))
params = [p for p in group.get("params", []) if p.requires_grad]
if params:
group_cfg: Dict[str, object] = {
"params": list(params),
"lr": base_lr * scale,
}
if "weight_decay" in group:
group_cfg["weight_decay"] = float(group["weight_decay"])
optim_groups.append(group_cfg)
if not optim_groups:
optim_groups.append({
"params": [p for p in model.parameters() if p.requires_grad],
"lr": base_lr,
})
return torch.optim.AdamW(optim_groups, lr=base_lr, weight_decay=args.weight_decay)
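# ReduceLROnPlateau on validation loss: halve the LR after roughly half the
# early-stopping patience without improvement, down to 1% of the phase's base LR.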
def make_scheduler(optimizer: torch.optim.Optimizer, base_lr: float, patience_limit: int):
plateau_patience = max(2, patience_limit // 2)
return torch.optim.lr_scheduler.ReduceLROnPlateau(
optimizer,
mode="min",
factor=0.5,
patience=plateau_patience,
min_lr=base_lr * 0.01,
)
# Mixed-precision GradScaler is used only on CUDA (not on HPU or CPU)
scaler = GradScaler('cuda') if device.type == 'cuda' else None
best_val_loss = float("inf")
best_val_acc = 0.0
best_state = None
epoch_history: List[Dict[str, object]] = []
best_val_f1 = 0.0
best_epoch = 0
total_epochs_ran = 0
overall_early_stopped = False
phase_configs = [
{
"name": "main",
"max_epochs": args.epochs,
"base_lr": args.lr,
"patience": max(1, args.early_patience),
"min_epochs": max(0, args.early_min_epochs),
}
]
ft_epochs = max(0, args.finetune_epochs)
ft_lr_factor = args.finetune_lr_factor
ft_patience = max(1, args.finetune_patience)
ft_min_epochs = max(0, args.finetune_min_epochs)
if ft_epochs > 0:
phase_configs.append(
{
"name": "finetune",
"max_epochs": ft_epochs,
"base_lr": args.lr * ft_lr_factor,
"patience": ft_patience,
"min_epochs": ft_min_epochs,
}
)
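# Training runs in phases: the "main" phase at args.lr, optionally followed by a
# low-LR "finetune" phase that restarts from the best checkpoint seen so far.
# Each phase gets its own optimizer, scheduler, and early-stopping counter.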
for phase_idx, phase in enumerate(phase_configs):
if phase["max_epochs"] <= 0:
continue
if phase_idx > 0:
print(
f"\n [INFO] Starting {phase['name']} phase: lr={phase['base_lr']:.2e}, "
f"max_epochs={phase['max_epochs']}"
)
if best_state is not None:
model.load_state_dict(best_state["model"])
optimizer = make_optimizer(phase["base_lr"])
scheduler = make_scheduler(optimizer, phase["base_lr"], phase["patience"])
patience_counter = 0
phase_early_stopped = False
phase_epochs_completed = 0
phase_min_epochs = max(0, phase["min_epochs"])
for local_epoch in range(1, phase["max_epochs"] + 1):
overall_epoch = total_epochs_ran + local_epoch
train_loss = train_one_epoch(
model,
train_loader,
optimizer,
device,
scaler,
batch_normalize=use_raw_input,
)
val_loss, val_acc, val_f1 = evaluate(
model,
val_loader,
device,
eval_debug_config,
batch_normalize=use_raw_input,
)
scheduler.step(val_loss)
current_lr = optimizer.param_groups[0]["lr"]
print(
f" [{phase['name']}] Epoch {overall_epoch:02d}: "
f"train_loss={train_loss:.4f} val_loss={val_loss:.4f} "
f"val_acc={val_acc:.4%} val_f1={val_f1:.4f}"
)
epoch_history.append(
{
"epoch": int(overall_epoch),
"train_loss": float(train_loss),
"val_loss": float(val_loss),
"val_acc": float(val_acc),
"val_f1": float(val_f1),
"lr": float(current_lr),
"phase": phase["name"],
}
)
repetition_records.append(
{
"model": model_name,
"epoch": int(overall_epoch),
"phase": phase["name"],
"train_loss": float(train_loss),
"val_loss": float(val_loss),
"val_acc": float(val_acc),
"val_f1": float(val_f1),
"lr": float(current_lr),
"train_size_requested": int(train_size),
"train_size_effective": int(effective_train_size),
}
)
_write_epoch_history(rep_root, repetition_records, args.save_epoch_history, args.plot_epoch_history)
raw_epoch_state = _strip_module_prefix(model.state_dict())
if epoch_ckpt_dir is not None:
epoch_state = raw_epoch_state.__class__()
for key, value in raw_epoch_state.items():
epoch_state[key] = value.detach().cpu()
epoch_ckpt_path = epoch_ckpt_dir / f"epoch_{overall_epoch:03d}.pth"
torch.save(epoch_state, epoch_ckpt_path)
if val_loss < best_val_loss:
best_val_loss = val_loss
best_val_acc = val_acc
best_val_f1 = val_f1
best_model_state = {
key: value.detach().cpu()
for key, value in model.state_dict().items()
}
best_state = {
"model": best_model_state,
"val_loss": val_loss,
"val_acc": val_acc,
"val_f1": val_f1,
"epoch": int(overall_epoch),
"lr": current_lr,
"phase": phase["name"],
}
best_epoch = int(overall_epoch)
patience_counter = 0
else:
if local_epoch >= phase_min_epochs:
patience_counter += 1
if patience_counter >= phase["patience"]:
print(
f" [INFO] Early stopping ({phase['name']}) at epoch {overall_epoch:02d} "
f"after {patience_counter} epochs without val loss improvement"
)
overall_early_stopped = True
phase_early_stopped = True
phase_epochs_completed = local_epoch
break
phase_epochs_completed = local_epoch
total_epochs_ran += phase_epochs_completed
if not phase_early_stopped and phase_epochs_completed < phase["max_epochs"]:
# Loop exited early via break without setting the flag (should not happen)
phase_early_stopped = True
if best_state is None:
raise RuntimeError("Training finished without recording a validation improvement")
model.load_state_dict(best_state["model"])
test_loss, test_acc, test_f1 = evaluate(
model,
test_loader,
device,
eval_debug_config,
batch_normalize=use_raw_input,
)
print(
f" -> Test loss={test_loss:.4f} Test acc={test_acc:.4%} Test f1={test_f1:.4f}"
)
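# Optionally export the full model (backbone + head) when an export directory is
# configured via args.export_full_model.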
export_dir = getattr(args, "export_full_model", None)
if export_dir is not None:
export_dir = export_dir.expanduser().resolve()
export_dir.mkdir(parents=True, exist_ok=True)
comm_token = getattr(args, "comm_suffix", "multi")
filename = f"{comm_token}_{model_name}_size{train_size}_rep{repetition}.pth"
export_path = export_dir / filename
full_state = {k: v.detach().cpu() for k, v in model.state_dict().items()}
torch.save(full_state, export_path)
print(f" [INFO] Saved full model (backbone + head) to {export_path}")
summary[model_name][train_size]["acc"].append(test_acc)
summary[model_name][train_size]["f1"].append(test_f1)
summary[model_name][train_size]["val_f1"].append(best_state["val_f1"])
summary[model_name][train_size]["val_loss"].append(best_state["val_loss"])
per_size_val_metrics.append(
(model_name, best_state["val_f1"], best_state["val_loss"], test_f1)
)
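# Persist the best checkpoint (with any DataParallel "module." prefix stripped)
# and a metrics.json containing the final scores plus the per-epoch history.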
result_dir = args.output_dir / f"size_{train_size}" / f"rep_{repetition}" / model_name
result_dir.mkdir(parents=True, exist_ok=True)
state_to_save = copy.deepcopy(best_state)
state_to_save["model"] = _strip_module_prefix(state_to_save["model"])
torch.save(state_to_save, result_dir / "checkpoint.pt")
with open(result_dir / "metrics.json", "w", encoding="utf-8") as f:
json.dump(
{
"train_size_per_class": effective_train_size,
"train_size_per_class_requested": train_size,
"repetition": repetition,
"model": model_name,
"best_val_loss": best_state.get("val_loss", None),
"best_val_acc": best_val_acc,
"best_val_f1": best_state["val_f1"],
"test_loss": test_loss,
"test_acc": test_acc,
"test_f1": test_f1,
"best_epoch": best_epoch,
"epochs_ran": total_epochs_ran,
"early_stopped": overall_early_stopped,
"history": epoch_history,
},
f,
indent=2,
)
rep_root = args.output_dir / f"size_{train_size}" / f"rep_{repetition}"
rep_root.mkdir(parents=True, exist_ok=True)
if args.plot_epoch_history and HAVE_MPL and repetition_records:
models_in_run = sorted({rec["model"] for rec in repetition_records})
fig, axes = plt.subplots(2, 1, figsize=(8, 6), sharex=True)
for ax in axes:
ax.grid(True, linestyle='--', alpha=0.3)
for model_name_plot in models_in_run:
model_records = [rec for rec in repetition_records if rec["model"] == model_name_plot]
if not model_records:
continue
epochs = [rec["epoch"] for rec in model_records]
val_loss_values = [rec["val_loss"] for rec in model_records]
val_f1_values = [rec["val_f1"] for rec in model_records]
axes[0].plot(epochs, val_loss_values, marker='o', label=model_name_plot)
axes[1].plot(epochs, val_f1_values, marker='o', label=model_name_plot)
axes[0].set_ylabel('Val Loss')
axes[1].set_ylabel('Val F1')
axes[1].set_xlabel('Epoch')
axes[0].legend(loc='best')
axes[0].set_title(f'Size {train_size} / Rep {repetition} per-epoch metrics')
fig.tight_layout()
fig.savefig(rep_root / 'epoch_history.png', dpi=150)
plt.close(fig)
# Clean up memory after each model
del model, optimizer, scheduler, best_state
if scaler is not None:
del scaler
if device.type == 'cuda':
torch.cuda.empty_cache()
torch.cuda.synchronize()
if per_size_val_metrics:
print(f"\n[INFO] Validation summary for train_size={train_size}, rep={repetition}:")
for model_name, val_f1, val_loss, test_f1 in sorted(
per_size_val_metrics, key=lambda item: item[1], reverse=True
):
print(
f" {model_name:<16} val_f1={val_f1:.4f} "
f"val_loss={val_loss:.4f} test_f1={test_f1:.4f}"
)
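# Aggregate all repetitions into a single summary.json under the output
# directory (note: json.dump serializes the integer train-size keys as strings).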
summary_path = args.output_dir / "summary.json"
summary_path.parent.mkdir(parents=True, exist_ok=True)
serializable_summary = {
model_name: {
size: {
"acc": metrics["acc"],
"f1": metrics["f1"],
"val_f1": metrics["val_f1"],
"val_loss": metrics["val_loss"],
}
for size, metrics in size_dict.items()
}
for model_name, size_dict in summary.items()
}
with open(summary_path, "w", encoding="utf-8") as f:
json.dump(serializable_summary, f, indent=2)
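# Illustrative only (not executed here): the summary can be inspected offline, e.g.
#   with open(args.output_dir / "summary.json") as fh:
#       data = json.load(fh)
#   accs = data["<model_name>"]["<train_size>"]["acc"]  # size keys are strings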
print("\n[INFO] Final accuracy summary:")
for model_name, results in summary.items():
for size, metrics in results.items():
if metrics["acc"]:
acc_mean = float(np.mean(metrics["acc"]))
acc_std = float(np.std(metrics["acc"]))
f1_mean = float(np.mean(metrics["f1"]))
f1_std = float(np.std(metrics["f1"]))
n = len(metrics["acc"])
print(
f" {model_name} @ {size:4d}/class -> "
f"acc={acc_mean:.4%} ± {acc_std:.4%}, f1={f1_mean:.4f} ± {f1_std:.4f} (n={n})"
)
print("\n[INFO] Final validation F1 summary:")
for model_name, results in summary.items():
for size, metrics in results.items():
if metrics["val_f1"]:
val_mean = float(np.mean(metrics["val_f1"]))
val_std = float(np.std(metrics["val_f1"]))
n = len(metrics["val_f1"])
print(
f" {model_name} @ {size:4d}/class -> "
f"val_f1={val_mean:.4f} ± {val_std:.4f} (n={n})"
)
print("\n[INFO] Final validation loss summary:")
for model_name, results in summary.items():
for size, metrics in results.items():
if metrics["val_loss"]:
loss_mean = float(np.mean(metrics["val_loss"]))
loss_std = float(np.std(metrics["val_loss"]))
n = len(metrics["val_loss"])
print(
f" {model_name} @ {size:4d}/class -> "
f"val_loss={loss_mean:.4f} ± {loss_std:.4f} (n={n})"
)
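# Final plot: mean validation F1 versus training samples per class, one curve
# per model, saved as val_f1_summary.png.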
if HAVE_MPL:
train_sizes_sorted = sorted(args.train_sizes)
plt.figure(figsize=(8, 5))
plotted = False
for model_name in args.models:
model_results = summary.get(model_name, {})
means: List[float] = []
for size in train_sizes_sorted:
val_list = model_results.get(size, {}).get("val_f1", [])
means.append(float(np.mean(val_list)) if val_list else float("nan"))
if not any(np.isfinite(means)):
continue
plt.plot(train_sizes_sorted, means, marker="o", linewidth=2, label=model_name)
plotted = True
if plotted:
plt.title("Validation F1 vs. Training Size")
plt.xlabel("Training samples per class")
plt.ylabel("Validation F1 (macro)")
plt.xticks(train_sizes_sorted)
plt.ylim(0.0, 1.0)
plt.grid(True, which="both", linestyle="--", alpha=0.4)
plt.legend(title="Model", frameon=False)
plt.tight_layout()
plot_path = args.output_dir / "val_f1_summary.png"
plt.savefig(plot_path, dpi=200)
plt.close()
print(f"[INFO] Saved validation F1 plot to {plot_path}")
else:
plt.close()
print("[WARN] No validation F1 data available to plot.")
else:
print("[WARN] Matplotlib not available; skipping validation F1 plot.")
if __name__ == "__main__":
main()