Upload LPatchTST checkpoint and source

64fca1b verified 10 days ago

40.6 kB

	# data_loader.py (Production — integrated with features.py)

	from __future__ import annotations

	import math
	import torch.distributed as dist
	from torch.utils.data.distributed import DistributedSampler
	import numpy as np
	import torch
	import random
	from torch.utils.data import Dataset, DataLoader, ConcatDataset
	from sklearn.preprocessing import RobustScaler
	from model import InputMode


	# ... (rest of the file before _make_loader)


	# ─────────────────────────────────────────────────────────────────────────────
	# Normalization routing
	# ─────────────────────────────────────────────────────────────────────────────
	#
	# features.py produces exactly 13 columns per asset (close-only path):
	#
	#   Column                   Range / Distribution         Routing
	#   ──────────────────────   ──────────────────────────   ────────
	#   ewma_vol_span{N}         ~0.003, tight band           NO_SCALE
	#   ret_norm_{h}d    (×8)    p1/p99 ≈ [-2.5, +2.5]        NO_SCALE
	#   macd_{s}_{l}     (×3)    std ≈ 1.05, [-3, +3]         NO_SCALE
	#   vs_factor_span{N}        mean ~346, skew ~24          ROBUST
	#
	# NO_SCALE rationale:
	# ewma_vol — already a tiny dimensionless fraction (~0.003).
	# Centering to zero destroys its absolute meaning (σ=0
	# is the natural origin; shifting it breaks the signal).
	# ret_norm_* — volatility-scaled returns ≈ z-score by construction.
	# Applying RobustScaler re-centers an already-centered signal.
	# macd_* — three-step normalised (paper Eqs. 19–21), empirical std ≈ 1.05.
	# Already unit-variance; re-scaling adds noise.
	#
	# ROBUST rationale:
	# vs_factor — 1/σ. Mean ~346, skew ~24, can spike to 3000+ in low-vol
	# regimes. RobustScaler (median+IQR) centres it without being
	# destroyed by the right-tail outliers.
	#
	# Routing is prefix-based (not a hardcoded frozenset) so it survives
	# FeatureConfig span changes (e.g. ewma_span=63 → ewma_vol_span63).
	#
	# "Unknown" columns (e.g. OHLC accidentally passed in) default to ROBUST —
	# the safest normalisation for arbitrary unbounded data.


	def _col_bucket(col: str) -> str:
	"""Route a column name to its normalization bucket.

	Returns
	-------
	"no_scale" \| "robust"
	"""
	if col.startswith("ewma_vol_span"): return "no_scale"
	if col.startswith("ret_norm_"): return "no_scale"
	if col.startswith("macd_"): return "no_scale"
	if col.startswith("vs_factor_span"): return "robust"
	if col.startswith("feat_session_"): return "no_scale"
	if col == "feat_vol_squeeze": return "robust"
	if col.startswith("feat_"): return "no_scale"
	if col.startswith("talib_"): return "no_scale"
	# Safe default for any unexpected column
	return "robust"


	# ─────────────────────────────────────────────────────────────────────────────
	# ColumnSelectiveScaler
	# ─────────────────────────────────────────────────────────────────────────────

	class ColumnSelectiveScaler:
	    """Apply a per-column normalization policy chosen from the column name.

	    Buckets
	    -------
	    no_scale : identity — the column is passed through (cast to float32).
	    robust   : RobustScaler, followed by a symmetric per-column clip at
	               ±bound applied to the scaled values.

	    Routing is delegated to _col_bucket() prefix rules, so it follows
	    column renames that only change a numeric suffix.

	    Parameters
	    ----------
	    feature_cols
	        Column names, in the same order as the columns of the matrices
	        later passed to fit() / transform().
	    clip_bounds
	        Optional mapping ``name-or-prefix -> bound``. An exact column name
	        wins; otherwise the first key (in dict order) that is a prefix of
	        the column name is used.
	    default_clip_bound
	        Bound for robust columns that ``clip_bounds`` does not match.

	    Usage
	    -----
	    Fit ONLY on training data and pass the same fitted instance to
	    val/test. Fitting on val/test data is a data-leakage bug.
	    """

	    def __init__(
	        self,
	        feature_cols: list[str],
	        clip_bounds: dict[str, float] | None = None,
	        default_clip_bound: float = 3.0,
	    ) -> None:
	        self.feature_cols = list(feature_cols)
	        self._default_clip = default_clip_bound
	        self._no_scale_idx: list[int] = []
	        self._robust_idx: list[int] = []
	        # Per-column clip bounds, parallel to _robust_idx.
	        self._robust_clip_bounds: list[float] = []

	        # Iterate the stored copy so a one-shot iterable passed as
	        # feature_cols is not consumed twice.
	        for i, col in enumerate(self.feature_cols):
	            if _col_bucket(col) == "no_scale":
	                self._no_scale_idx.append(i)
	                continue
	            self._robust_idx.append(i)
	            self._robust_clip_bounds.append(
	                self._resolve_clip_bound(col, clip_bounds, default_clip_bound)
	            )

	        self._robust_scaler = RobustScaler()
	        self._fitted = False

	    @staticmethod
	    def _resolve_clip_bound(col, clip_bounds, default):
	        """Return the clip bound for ``col``: exact match, then prefix, then default."""
	        if not clip_bounds:
	            return default
	        if col in clip_bounds:
	            return clip_bounds[col]
	        for prefix, bound in clip_bounds.items():
	            if col.startswith(prefix):
	                return bound
	        return default

	    def _check_width(self, caller: str, X: np.ndarray) -> None:
	        """Raise ValueError when X does not have one column per feature name."""
	        if X.shape[1] != len(self.feature_cols):
	            raise ValueError(
	                f"{caller}(): X has {X.shape[1]} columns, "
	                f"expected {len(self.feature_cols)}."
	            )

	    def fit(self, X: np.ndarray) -> "ColumnSelectiveScaler":
	        """Fit the robust scaler on the robust-routed columns of X.

	        Raises
	        ------
	        ValueError
	            If X does not have ``len(feature_cols)`` columns.
	        """
	        self._check_width("fit", X)
	        if self._robust_idx:
	            self._robust_scaler.fit(X[:, self._robust_idx])
	        self._fitted = True
	        return self

	    def transform(self, X: np.ndarray) -> np.ndarray:
	        """Return a float32 copy of X with robust columns scaled and clipped.

	        A warning line is printed for every robust column whose share of
	        scaled values outside ±bound exceeds 2 %.

	        Raises
	        ------
	        RuntimeError
	            If fit() has not been called.
	        ValueError
	            If X does not have ``len(feature_cols)`` columns. Without this
	            check a mis-shaped matrix would be scaled with the statistics
	            of the wrong columns.
	        """
	        if not self._fitted:
	            raise RuntimeError("Call fit() before transform().")
	        self._check_width("transform", X)

	        # astype() already returns a new array, so the input is not mutated.
	        X = X.astype(np.float32)
	        if self._robust_idx:
	            transformed = self._robust_scaler.transform(
	                X[:, self._robust_idx]
	            ).astype(np.float32)

	            # Per-column clip with diagnostic
	            for local_j, (global_i, bound) in enumerate(
	                zip(self._robust_idx, self._robust_clip_bounds)
	            ):
	                col_data = transformed[:, local_j]
	                clip_rate = (np.abs(col_data) > bound).mean()
	                if clip_rate > 0.02:
	                    col_name = self.feature_cols[global_i]
	                    print(
	                        f"[ColumnSelectiveScaler] WARNING: '{col_name}' clip rate "
	                        f"{clip_rate:.2%} > 2% at bound=±{bound} IQR. "
	                        f"Rerun clip_audit.py — distribution may have shifted."
	                    )
	                transformed[:, local_j] = np.clip(col_data, -bound, bound)

	            X[:, self._robust_idx] = transformed
	        return X

	    def fit_transform(self, X: np.ndarray) -> np.ndarray:
	        """Fit on X, then return the transformed copy of X."""
	        return self.fit(X).transform(X)

	    def summary(self) -> str:
	        """Return a multi-line description of the column routing and clip bounds."""
	        no_scale = [self.feature_cols[i] for i in self._no_scale_idx]
	        robust_info = [
	            f"{self.feature_cols[i]} (clip=±{b})"
	            for i, b in zip(self._robust_idx, self._robust_clip_bounds)
	        ]
	        return (
	            f"ColumnSelectiveScaler — {len(self.feature_cols)} cols:\n"
	            f" NO_SCALE ({len(no_scale)}): {no_scale}\n"
	            f" ROBUST ({len(robust_info)}): {robust_info}"
	        )


	# ─────────────────────────────────────────────────────────────────────────────
	# Scaler factory
	# ─────────────────────────────────────────────────────────────────────────────

	def fit_scaler(
	    features_train: np.ndarray,
	    feature_cols: list[str],
	    config=None,
	) -> ColumnSelectiveScaler:
	    """Build and fit a ColumnSelectiveScaler from training rows.

	    Only the training split may be passed here; the returned instance is
	    then reused for every split. Clip bounds are read from the optional
	    ``ROBUST_CLIP_BOUNDS`` / ``ROBUST_CLIP_BOUND_DEFAULT`` attributes of
	    ``config`` (defaults: ``None`` and ``3.0``). The scaler summary is
	    printed after fitting.

	    Raises
	    ------
	    ValueError
	        If the column count differs from ``len(feature_cols)`` or the
	        matrix contains NaN / Inf.
	    """
	    n_cols = features_train.shape[1]
	    if n_cols != len(feature_cols):
	        raise ValueError(
	            f"fit_scaler: features_train has {n_cols} cols "
	            f"but feature_cols has {len(feature_cols)} entries."
	        )

	    all_finite = np.isfinite(features_train).all()
	    if not all_finite:
	        n_inf = np.isinf(features_train).sum()
	        n_nan = np.isnan(features_train).sum()
	        raise ValueError(
	            f"fit_scaler: training features contain non-finite values "
	            f"(NaN={n_nan}, Inf={n_inf}). "
	            "RobustScaler fitted on Inf values produces a corrupted scaler. "
	            "Strip warmup rows before calling create_dataloaders."
	        )

	    fitted = ColumnSelectiveScaler(
	        feature_cols,
	        clip_bounds=getattr(config, "ROBUST_CLIP_BOUNDS", None),
	        default_clip_bound=getattr(config, "ROBUST_CLIP_BOUND_DEFAULT", 3.0),
	    )
	    fitted.fit(features_train)
	    print(fitted.summary())
	    return fitted


	# ─────────────────────────────────────────────────────────────────────────────
	# Global tokenization helper — call ONCE per asset, then slice
	# ─────────────────────────────────────────────────────────────────────────────

	def tokenize_full_series(
	    ohlc_returns: np.ndarray,
	    tokenizer,
	    config,
	) -> tuple["torch.Tensor", "torch.Tensor"]:
	    """Tokenize an entire series in one causal sliding-window pass.

	    Call this once per asset and slice the returned tensors for
	    train/val/test rather than re-tokenizing each split.

	    For every row ``t`` a window of the 64 rows ending at ``t`` is built
	    (the front is padded with copies of the first row), standardised with
	    that window's own per-channel mean/std, clamped to [-5, 5] and passed
	    to ``tokenizer.encode(batch, half=True)``. The last position of each
	    returned index sequence is kept as the token for row ``t``.

	    Parameters
	    ----------
	    ohlc_returns
	        2-D array, rows are time steps (``shape[1]`` is read as the
	        channel count).
	    tokenizer
	        Object providing ``eval()``, ``to(device)`` and
	        ``encode(batch, half=True) -> (idx_coarse, idx_fine)``; the two
	        results are indexed as ``[:, -1]`` — presumably (batch, seq)
	        integer tensors, confirm against the tokenizer implementation.
	    config
	        Only ``TOKENIZER_CHUNK_SIZE`` (default 1024) is read.

	    Returns
	    -------
	    (coarse, fine)
	        Two 1-D CPU tensors with one entry per input row; two empty long
	        tensors for an empty input.

	    Side effects
	    ------------
	    The tokenizer is put in eval mode. For non-empty input it is moved to
	    CUDA when available and is always moved back to CPU afterwards, even
	    if encoding raises.
	    """
	    tokenizer.eval()

	    T = len(ohlc_returns)
	    if T == 0:
	        # Return before any device transfer so the tokenizer is not left
	        # sitting on the accelerator when there is nothing to encode.
	        return torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

	    from numpy.lib.stride_tricks import as_strided

	    S = 64  # TOKENIZER_SEQ_LEN
	    pad = np.tile(ohlc_returns[0:1], (S - 1, 1))
	    padded = np.concatenate([pad, ohlc_returns], axis=0)

	    C = ohlc_returns.shape[1]
	    row_stride, col_stride = padded.strides
	    # Overlapping (T, S, C) view onto `padded`; read-only because the
	    # windows alias each other's memory.
	    windows = as_strided(
	        padded,
	        shape=(T, S, C),
	        strides=(row_stride, row_stride, col_stride),
	        writeable=False,
	    )

	    chunk_size = getattr(config, "TOKENIZER_CHUNK_SIZE", 1024)
	    c_list, f_list = [], []

	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    print(f"[tokenize_full_series] Tokenizing {T} bars in chunks of {chunk_size}…")
	    tokenizer.to(device)
	    try:
	        with torch.no_grad():
	            for i in range(0, T, chunk_size):
	                batch = torch.from_numpy(
	                    np.array(windows[i: i + chunk_size])  # materialise a real copy
	                ).to(device).float()

	                # Per-window normalization: each window is normalised
	                # independently (dim=1 is the sequence dim; reducing over
	                # dim=(0, 1) would mix windows).
	                w_mean = batch.mean(dim=1, keepdim=True)  # (B, 1, C)
	                w_std = batch.std(dim=1, keepdim=True) + 1e-5  # (B, 1, C)
	                batch = (batch - w_mean) / w_std
	                batch = torch.clamp(batch, -5.0, 5.0)

	                idx_c, idx_f = tokenizer.encode(batch, half=True)
	                c_list.append(idx_c[:, -1].cpu())
	                f_list.append(idx_f[:, -1].cpu())
	    finally:
	        # Always release the accelerator, including on encode failure.
	        tokenizer.to("cpu")

	    coarse = torch.cat(c_list)  # (T,)
	    fine = torch.cat(f_list)  # (T,)
	    print(f"[tokenize_full_series] Done. coarse={coarse.shape}, fine={fine.shape}")
	    return coarse, fine


	class FittedTokenizer:
	    """Fit/transform-style facade over a pre-trained tokenizer.

	    ``fit`` performs no computation: it only flips a flag that
	    ``transform`` checks. ``transform`` forwards the series it is given
	    to ``tokenize_full_series`` together with the wrapped tokenizer and
	    config.
	    """

	    def __init__(self, tokenizer, config):
	        self._fitted = False
	        self.config = config
	        self.tokenizer = tokenizer

	    def fit(self, ohlc_train: np.ndarray) -> "FittedTokenizer":
	        """Mark the wrapper as ready and return it.

	        ``ohlc_train`` is accepted for API symmetry and is not read; the
	        wrapped tokenizer is left untouched.
	        """
	        self._fitted = True
	        return self

	    def transform(self, ohlc_full: np.ndarray) -> tuple[torch.Tensor, torch.Tensor]:
	        """Tokenize ``ohlc_full``; raises RuntimeError if fit() was never called."""
	        if self._fitted:
	            return tokenize_full_series(ohlc_full, self.tokenizer, self.config)
	        raise RuntimeError("FittedTokenizer must be fit before transform is called.")


	def tokenize_split_slices(
	    ohlc_returns: np.ndarray,
	    tokenizer,
	    config,
	    slices: list[tuple[int, int]],
	) -> list[tuple["torch.Tensor", "torch.Tensor"]]:
	    """Tokenize one or more ``(start, end)`` index slices of a series.

	    Two modes, selected by ``config.TOKENIZE_STRICT_TRAIN_ONLY``
	    (default ``False``):

	    * strict — every slice is tokenized on its own, so each slice is
	      padded from its own first row.
	    * default — the whole series is tokenized once through a
	      FittedTokenizer and the result is cut at the requested slices. The
	      first slice is treated as the training slice: its end index is
	      what gets handed to ``FittedTokenizer.fit``.

	    Parameters
	    ----------
	    slices
	        ``(start, end)`` pairs into ``ohlc_returns``; the first one is
	        the training slice. An empty list yields an empty result in both
	        modes.

	    Returns
	    -------
	    One ``(coarse, fine)`` tensor pair per requested slice, in order.
	    """
	    if not slices:
	        # Nothing requested: skip tokenization entirely instead of failing
	        # on slices[0] below.
	        return []

	    strict = getattr(config, "TOKENIZE_STRICT_TRAIN_ONLY", False)
	    if strict:
	        return [
	            tokenize_full_series(ohlc_returns[start:end], tokenizer, config)
	            for start, end in slices
	        ]

	    # The first slice is the training slice (e.g. [ts:te] or [0:train_end]).
	    train_end = slices[0][1]

	    tok = FittedTokenizer(tokenizer, config)
	    tok.fit(ohlc_returns[:train_end])
	    coarse_full, fine_full = tok.transform(ohlc_returns)

	    return [
	        (coarse_full[start:end], fine_full[start:end])
	        for start, end in slices
	    ]


	# ─────────────────────────────────────────────────────────────────────────────
	# Dataset
	# ─────────────────────────────────────────────────────────────────────────────

	class FinancialDataset(Dataset):
	    """Sliding-window dataset over features and/or token indices.

	    Item ``i`` covers rows ``[i, i + seq_len)`` and is returned as
	    ``(tokens, features, target)``:

	    * ``tokens``   — ``(coarse[i:i+seq_len], fine[i:i+seq_len])`` in
	      TOKENS_ONLY / COMBINED mode, else ``None``.
	    * ``features`` — ``features[i:i+seq_len]`` in FEATURES_ONLY /
	      COMBINED mode, else ``None``.
	    * ``target``   — ``targets[i + seq_len - 1]`` (last row of the window).

	    Parameters
	    ----------
	    features, targets
	        Row-aligned arrays; converted to float32 tensors.
	    seq_len
	        Window length.
	    ohlc_returns
	        Only used by the inline-tokenization fallback.
	    scaler
	        Optional fitted scaler; ``scaler.transform(features)`` is applied
	        before the tensor conversion.
	    tokenizer
	        Used by the fallback, and for the vocabulary-size diagnostics
	        (``s1_bits`` / ``s2_bits``, default 10 each).
	    config
	        ``INPUT_MODE`` is read (default ``"features_only"``).
	    precomputed_coarse, precomputed_fine
	        Token tensors row-aligned with ``features``. When both are given
	        no tokenization happens here.

	    Raises
	    ------
	    ValueError
	        In a token mode, when neither precomputed tokens nor
	        ``tokenizer`` + ``ohlc_returns`` are supplied, or when a token
	        tensor has fewer entries than there are feature rows.
	    """

	    def __init__(
	        self,
	        features: np.ndarray,
	        targets: np.ndarray,
	        seq_len: int,
	        ohlc_returns: np.ndarray | None = None,
	        scaler: ColumnSelectiveScaler | None = None,
	        tokenizer=None,
	        config=None,
	        # Accept pre-tokenized arrays instead of re-tokenizing.
	        precomputed_coarse: "torch.Tensor | None" = None,
	        precomputed_fine: "torch.Tensor | None" = None,
	    ) -> None:
	        # NOTE(review): the mode is stored as str(...) and compared with
	        # InputMode members below; this presumably relies on InputMode
	        # being a str-valued enum and INPUT_MODE being a plain string —
	        # confirm against model.InputMode.
	        self.input_mode = str(getattr(config, "INPUT_MODE", "features_only"))
	        self.seq_len = seq_len

	        if scaler is not None:
	            features = scaler.transform(features).astype(np.float32)
	        self.features = torch.from_numpy(np.asarray(features, dtype=np.float32))
	        self.targets = torch.from_numpy(np.asarray(targets, dtype=np.float32))

	        self.idx_coarse = None
	        self.idx_fine = None

	        if self.input_mode in (InputMode.TOKENS_ONLY, InputMode.COMBINED):
	            if precomputed_coarse is not None and precomputed_fine is not None:
	                # Pre-computed tokens: no re-tokenization.
	                self.idx_coarse = precomputed_coarse
	                self.idx_fine = precomputed_fine
	                self._print_token_diagnostics(tokenizer)
	            else:
	                # Fallback: inline tokenization (kept for compatibility).
	                # Calling this per split tokenizes each split on its own;
	                # prefer pre-computed tokens.
	                if tokenizer is None or ohlc_returns is None:
	                    raise ValueError(
	                        "Either precomputed_coarse/fine tensors or "
	                        "tokenizer+ohlc_returns are required for token modes."
	                    )
	                print(f"[FinancialDataset] WARNING: falling back to inline tokenization "
	                      f"for {len(ohlc_returns)} bars. "
	                      f"Use tokenize_full_series() + precomputed_coarse/fine instead.")
	                coarse, fine = tokenize_full_series(ohlc_returns, tokenizer, config)
	                self.idx_coarse = coarse
	                self.idx_fine = fine
	            self._check_token_coverage()

	    def _print_token_diagnostics(self, tokenizer) -> None:
	        """Print vocabulary usage, most frequent tokens and a short sample."""
	        v_coarse = 2 ** getattr(tokenizer, 's1_bits', 10)
	        v_fine = 2 ** getattr(tokenizer, 's2_bits', 10)
	        n_coarse = self.idx_coarse.unique().numel()
	        n_fine = self.idx_fine.unique().numel()
	        print(f"Token vocab usage — coarse: {n_coarse}/{v_coarse}, fine: {n_fine}/{v_fine}")
	        hist_c = torch.bincount(self.idx_coarse, minlength=v_coarse)
	        hist_f = torch.bincount(self.idx_fine, minlength=v_fine)
	        # topk(k) raises when k exceeds the histogram size (tiny vocabularies).
	        print(f"Top-5 coarse tokens: {hist_c.topk(min(5, hist_c.numel()))}")
	        print(f"Top-5 fine tokens: {hist_f.topk(min(5, hist_f.numel()))}")
	        print(f" coarse sample: {self.idx_coarse[:10]}")
	        print(f" fine sample: {self.idx_fine[:10]}")

	    def _check_token_coverage(self) -> None:
	        """Fail early when a token tensor is shorter than the feature rows.

	        Tensor slicing truncates silently, so a short token tensor would
	        otherwise yield windows shorter than ``seq_len`` that only fail
	        later, when a batch is stacked.
	        """
	        if len(self) == 0:
	            return
	        n_rows = len(self.features)
	        for label, idx in (("coarse", self.idx_coarse), ("fine", self.idx_fine)):
	            if len(idx) < n_rows:
	                raise ValueError(
	                    f"FinancialDataset: {label} tokens cover {len(idx)} rows "
	                    f"but features have {n_rows} rows; tokens must be "
	                    f"row-aligned with features."
	                )

	    def __len__(self) -> int:
	        # Number of complete windows; 0 when there are fewer rows than seq_len.
	        return max(0, len(self.features) - self.seq_len + 1)

	    def __getitem__(self, i: int):
	        seq = slice(i, i + self.seq_len)

	        tokens = None
	        features = None

	        if self.input_mode in (InputMode.TOKENS_ONLY, InputMode.COMBINED):
	            tokens = (self.idx_coarse[seq], self.idx_fine[seq])

	        if self.input_mode in (InputMode.FEATURES_ONLY, InputMode.COMBINED):
	            features = self.features[seq]

	        # Target at the end of the window
	        target = self.targets[i + self.seq_len - 1]

	        return tokens, features, target


	# MultiStreamDataset removed (deprecated in favor of multi-modal FinancialDataset)



	# ─────────────────────────────────────────────────────────────────────────────
	# Sample weighting
	# ─────────────────────────────────────────────────────────────────────────────


	def _compute_sample_weights(targets: np.ndarray, thresh: float, config=None, use_sqrt: bool = True) -> torch.Tensor:
	"""
	Compute sample weights to balance Short, Flat, and Long classes holistically.
	Short: target < -thresh
	Flat: \|target\| < thresh
	Long: target > thresh
	"""
	# Assign class labels
	# 0: Short, 1: Flat, 2: Long
	classes = np.ones_like(targets, dtype=np.int32) # Default to Flat
	classes[targets < -thresh] = 0
	classes[targets > thresh] = 2

	# Count samples per class
	counts = np.bincount(classes, minlength=3)
	# Avoid division by zero
	counts = np.maximum(counts, 1)

	# Basic inverse frequency weights
	weights_per_class = 1.0 / counts

	if use_sqrt:
	weights_per_class = np.sqrt(weights_per_class)

	# Holistic Bias Correction
	# Boost all classes relative to the majority class to equalize probability mass.
	# Formula: Weight_i = Weight_base_i * (Count_i / Count_maj) ** CorrectionPower
	if config is not None and hasattr(config, "BIAS_CORRECTION_POWER"):
	power = config.BIAS_CORRECTION_POWER
	if power != 0:
	maj_idx = np.argmax(counts)
	count_maj = counts[maj_idx]
	for i in range(3):
	if i != maj_idx:
	weights_per_class[i] = (counts[i] / count_maj) * power

	# Map weights back to each sample
	sample_weights = weights_per_class[classes]

	return torch.from_numpy(sample_weights).float()


	class DistributedWeightedSampler(torch.utils.data.Sampler):
	    """Weighted random sampling partitioned across DDP ranks.

	    Every rank builds the same global draw (same seed, same epoch) and
	    keeps its own contiguous slice of it, so the ranks consume
	    non-overlapping positions of that draw. With ``replacement=True`` the
	    same dataset index can still occur more than once, within a rank or
	    across ranks — that is inherent to sampling with replacement.

	    The global draw is padded to ``ceil(num_samples / num_replicas) *
	    num_replicas`` so that every rank yields the same number of indices.
	    With ``replacement=False`` at most ``len(weights)`` distinct indices
	    can be drawn; any remaining positions are filled by repeating the
	    draw from its start.

	    ``set_epoch(epoch)`` must be called before each epoch to get a
	    different draw; the generator is seeded with ``seed + epoch``.

	    Parameters
	    ----------
	    weights
	        1-D tensor of non-negative sampling weights, one per dataset item.
	    num_samples
	        Size of the global draw before padding.
	    num_replicas, rank
	        World size and this process's rank.
	    replacement
	        Passed to ``torch.multinomial``.
	    seed
	        Base seed shared by all ranks.
	    """

	    def __init__(self, weights, num_samples, num_replicas=1,
	                 rank=0, replacement=True, seed=42):
	        self.weights = weights.double()
	        self.num_samples = num_samples
	        self.num_replicas = num_replicas
	        self.rank = rank
	        self.replacement = replacement
	        self.seed = seed
	        self.epoch = 0
	        # Pad the total so every rank gets an equally sized slice.
	        self.num_samples_per_rank = math.ceil(num_samples / num_replicas)
	        self.total_size = self.num_samples_per_rank * num_replicas

	    def set_epoch(self, epoch):
	        """Select the epoch whose draw the next ``__iter__`` produces."""
	        self.epoch = epoch

	    def __iter__(self):
	        g = torch.Generator()
	        g.manual_seed(self.seed + self.epoch)  # reproducible, epoch-varied

	        n_draw = self.total_size
	        if not self.replacement:
	            # multinomial cannot draw more distinct items than exist.
	            n_draw = min(n_draw, self.weights.numel())

	        indices = torch.multinomial(
	            self.weights, n_draw,
	            replacement=self.replacement, generator=g
	        )
	        if 0 < n_draw < self.total_size:
	            # Fill the padding by cycling through the draw again.
	            reps = math.ceil(self.total_size / n_draw)
	            indices = indices.repeat(reps)[: self.total_size]

	        # Each rank keeps its own slice of the shared draw.
	        start = self.rank * self.num_samples_per_rank
	        return iter(indices[start : start + self.num_samples_per_rank].tolist())

	    def __len__(self):
	        return self.num_samples_per_rank



	def collate_with_none(batch):
	    """Collate ``(tokens, features, target)`` samples, tolerating ``None``.

	    ``tokens`` is either ``None`` or a ``(idx_c, idx_f)`` pair and
	    ``features`` is either ``None`` or a tensor. Whether a modality is
	    present is decided from the first sample of the batch.

	    Returns ``(tokens, features, targets)`` where every present part is
	    stacked along a new leading batch dimension and an absent part stays
	    ``None``.
	    """
	    targets = torch.stack([sample[2] for sample in batch])
	    head_tokens, head_features = batch[0][0], batch[0][1]

	    features = None
	    if head_features is not None:
	        features = torch.stack([sample[1] for sample in batch])

	    tokens = None
	    if head_tokens is not None:
	        # Stack coarse (position 0) first, then fine (position 1).
	        tokens = tuple(
	            torch.stack([sample[0][part] for sample in batch])
	            for part in (0, 1)
	        )

	    return tokens, features, targets


	# ─────────────────────────────────────────────────────────────────────────────
	# DataLoader factory helpers
	# ─────────────────────────────────────────────────────────────────────────────

	def _worker_init_fn(worker_id):
	seed = torch.initial_seed() % (2**32)
	np.random.seed(seed)
	random.seed(seed)


	def _make_loader(
	ds,
	config,
	sampler=None,
	shuffle: bool = False,
	drop_last: bool = False,
	) -> DataLoader:
	"""Single factory so prefetch_factor / persistent_workers are consistent.

	drop_last
	---------
	Set True for training loaders with a sampler (avoids a partial batch
	that would corrupt WeightedRandomSampler weight normalization).
	Always False for val/test loaders — every sample must be evaluated.
	"""
	nw = config.NUM_WORKERS
	pf = getattr(config, "PREFETCH_FACTOR", 2) if nw > 0 else None
	cuda = torch.cuda.is_available()

	if sampler is not None and shuffle:
	raise ValueError(
	"_make_loader: shuffle=True and sampler are mutually exclusive. "
	"The sampler controls draw order — do not pass shuffle=True."
	)

	generator = torch.Generator()
	generator.manual_seed(getattr(config, "SEED", 42))

	loader_kwargs = {
	"batch_size": config.BATCH_SIZE,
	"sampler": sampler,
	"shuffle": shuffle if sampler is None else False,
	"drop_last": drop_last,
	"num_workers": nw,
	"prefetch_factor": pf,
	"persistent_workers": (nw > 0),
	"pin_memory": cuda,
	"multiprocessing_context": "spawn" if nw > 0 else None,
	"collate_fn": collate_with_none,
	"worker_init_fn": _worker_init_fn,
	"generator": generator,
	}

	return DataLoader(ds, **loader_kwargs)


	# ─────────────────────────────────────────────────────────────────────────────
	# Public API
	# ─────────────────────────────────────────────────────────────────────────────

	def create_dataloaders(
	    features,
	    targets,
	    config,
	    feature_cols: list[str],
	    tokenizer=None,
	    ohlc_returns: np.ndarray | None = None,
	    rank=0,
	    world_size=1,
	):
	    """Single-asset train/val/test loaders with a gap between the splits.

	    Layout (row indices into ``features`` / ``targets``)::

	        train: [0, train_end)
	        val:   [val_start, val_end)    val_start  = train_end + gap
	        test:  [test_start, total_len) test_start = val_end + gap

	    with ``gap = FORECAST_HORIZON + 50``, ``train_end = int(total_len *
	    TRAIN_RATIO)`` and ``val_end`` clamped to
	    ``total_len - gap - LOOKBACK_WINDOW``.

	    The scaler is fitted on the training slice only. In token modes the
	    series is tokenized through ``tokenize_split_slices``. The training
	    loader draws through a DistributedWeightedSampler whose weights come
	    from the window-end targets of the training split; val/test use a
	    DistributedSampler only when ``world_size > 1``.

	    Returns
	    -------
	    ``(train_loader, val_loader, test_loader)``

	    Raises
	    ------
	    ValueError
	        If the validation split is empty after clamping, or a token mode
	        is requested without ``ohlc_returns``.
	    AssertionError
	        If any split is shorter than ``LOOKBACK_WINDOW``.
	    """
	    total_len = len(features)
	    lookback = config.LOOKBACK_WINDOW
	    gap = config.FORECAST_HORIZON + 50

	    train_end = int(total_len * config.TRAIN_RATIO)
	    val_start = train_end + gap

	    # Clamp val_end so a gap plus one test window still fit behind it.
	    val_end_raw = val_start + int(total_len * config.VAL_RATIO)
	    val_end = min(val_end_raw, total_len - gap - lookback)

	    # Early, meaningful failure — pinpoints the root cause.
	    if val_end <= val_start:
	        min_rows = (
	            int((config.TRAIN_RATIO + config.VAL_RATIO) * total_len)
	            + 2 * gap
	            + 3 * lookback
	        )
	        raise ValueError(
	            f"Val split is degenerate after clamping: val_start={val_start}, "
	            f"val_end={val_end}. total_len={total_len} is too small for "
	            f"TRAIN_RATIO={config.TRAIN_RATIO}, VAL_RATIO={config.VAL_RATIO}, "
	            f"gap={gap}. Minimum required rows ≈ "
	            f"{min_rows}."
	        )

	    test_start = val_end + gap
	    # Guard: every split needs at least seq_len rows to form one window.
	    assert train_end >= lookback, (
	        f"Train split too short for even one window: "
	        f"train_end={train_end} < LOOKBACK_WINDOW={lookback}. "
	        f"Reduce LOOKBACK_WINDOW or increase total data / TRAIN_RATIO."
	    )
	    assert (val_end - val_start) >= lookback, (
	        f"Val split too short for even one window: "
	        f"{val_end - val_start} rows < LOOKBACK_WINDOW={lookback}"
	    )
	    assert (total_len - test_start) >= lookback, (
	        f"Test split too short for even one window: "
	        f"{total_len - test_start} rows < LOOKBACK_WINDOW={lookback}"
	    )

	    scaler = fit_scaler(features[:train_end], feature_cols, config=config)

	    # Tokens per split; (None, None) when the mode does not use tokens.
	    input_mode = InputMode(getattr(config, "INPUT_MODE", "features_only"))
	    tok_tr = tok_va = tok_te = (None, None)
	    if input_mode in (InputMode.TOKENS_ONLY, InputMode.COMBINED):
	        if ohlc_returns is None:
	            raise ValueError("ohlc_returns required for token modes in create_dataloaders")
	        tok_tr, tok_va, tok_te = tokenize_split_slices(
	            ohlc_returns,
	            tokenizer,
	            config,
	            [(0, train_end), (val_start, val_end), (test_start, total_len)],
	        )

	    def _split_dataset(lo, hi, tokens):
	        # One dataset per split; all splits share the train-fitted scaler.
	        return FinancialDataset(
	            features[lo:hi], targets[lo:hi],
	            lookback,
	            scaler=scaler, tokenizer=tokenizer, config=config,
	            precomputed_coarse=tokens[0],
	            precomputed_fine=tokens[1],
	        )

	    train_ds = _split_dataset(None, train_end, tok_tr)
	    val_ds = _split_dataset(val_start, val_end, tok_va)
	    test_ds = _split_dataset(test_start, None, tok_te)

	    # Window i of the train dataset is labelled by targets[i + lookback - 1].
	    start_idx = lookback - 1
	    weight_end = start_idx + len(train_ds)
	    assert weight_end <= train_end, (
	        f"y_train_aligned would read beyond train boundary: "
	        f"weight_end={weight_end} > train_end={train_end}."
	    )
	    y_train_aligned = targets[start_idx : weight_end]
	    sample_weights = _compute_sample_weights(y_train_aligned, config.SAMPLER_THRESHOLD, config=config)

	    sampler = DistributedWeightedSampler(
	        sample_weights, len(sample_weights),
	        num_replicas=world_size, rank=rank,
	        seed=getattr(config, "SEED", 42)
	    )

	    if world_size > 1:
	        val_sampler = DistributedSampler(val_ds, num_replicas=world_size, rank=rank, shuffle=False)
	        test_sampler = DistributedSampler(test_ds, num_replicas=world_size, rank=rank, shuffle=False)
	    else:
	        val_sampler = test_sampler = None
	    return (
	        _make_loader(train_ds, config, sampler=sampler, drop_last=True),
	        _make_loader(val_ds, config, sampler=val_sampler, drop_last=False),
	        _make_loader(test_ds, config, sampler=test_sampler, drop_last=False),
	    )


	def create_multi_index_dataloaders(
	    asset_data_list: list[tuple],  # 5-tuple (asset_id, feat, targ, ohlc, train_end)
	                                   # or 7-tuple (..., precomputed_coarse, precomputed_fine)
	    config,
	    feature_cols: list[str],
	    tokenizer=None,
	    is_train: bool = False,
	    scalers: dict[str, ColumnSelectiveScaler] | None = None,
	    rank: int = 0,
	    world_size: int = 1,
	) -> tuple[DataLoader | None, dict[str, ColumnSelectiveScaler]]:
	    """Multi-asset DataLoader — every asset is scaled independently.

	    Training call (``is_train=True``)
	        Each asset is cut at its ``train_end``; a scaler is fitted on
	        that slice (skipped in ``tokens_only`` mode) and stored under the
	        asset id. The loader draws through a DistributedWeightedSampler
	        whose weights come from the window-end targets of all assets.

	    Evaluation call (``is_train=False``)
	        The whole arrays are used and the scaler is looked up in
	        ``scalers``. A DistributedSampler is used only when
	        ``world_size > 1``.

	    Assets with fewer rows than ``LOOKBACK_WINDOW`` are skipped.

	    Returns
	    -------
	    ``(loader, fitted_scalers)``; ``loader`` is ``None`` when no asset
	    produced a dataset, and ``fitted_scalers`` is only populated by a
	    training call.

	    Raises
	    ------
	    ValueError
	        Wrong tuple arity, feature/target length mismatch, missing
	        ``train_end`` in a training call, or missing scaler in an
	        evaluation call outside ``tokens_only`` mode.
	    """
	    datasets: list[FinancialDataset] = []
	    all_targets: list[float] = []
	    fitted_scalers: dict[str, ColumnSelectiveScaler] = {}

	    # The mode is constant for the whole call; read it once.
	    input_mode = getattr(config, "INPUT_MODE", "features_only")
	    uses_tokens = input_mode in ("tokens_only", "combined")

	    for _entry in asset_data_list:
	        # Support both 5-tuple (legacy) and 7-tuple (with precomputed tokens)
	        if len(_entry) == 7:
	            asset_id, feat, targ, ohlc, train_end, pre_coarse, pre_fine = _entry
	        elif len(_entry) == 5:
	            asset_id, feat, targ, ohlc, train_end = _entry
	            pre_coarse = pre_fine = None
	        else:
	            raise ValueError(
	                f"asset_data_list entries must be 5-tuples or 7-tuples, "
	                f"got a {len(_entry)}-tuple."
	            )

	        if len(feat) != len(targ):
	            raise ValueError(
	                f"Asset '{asset_id}': feature/target length mismatch — "
	                f"len(feat)={len(feat)}, len(targ)={len(targ)}. "
	                f"Both arrays must cover the same row indices.")

	        if len(feat) < config.LOOKBACK_WINDOW:
	            continue

	        if is_train:
	            if train_end is None:
	                raise ValueError(
	                    f"Asset '{asset_id}': train_end is None but is_train=True. "
	                    "Pass the actual train boundary index in the asset tuple for training data.")

	            # tokens_only mode never reads the features, so no scaler is fitted.
	            if input_mode == "tokens_only":
	                scaler = None
	            else:
	                scaler = fit_scaler(feat[:train_end], feature_cols, config=config)
	                fitted_scalers[asset_id] = scaler

	            # Use precomputed tokens if provided; otherwise tokenize from ohlc
	            tok_c, tok_f = pre_coarse, pre_fine
	            if tok_c is None and uses_tokens and ohlc is not None:
	                tok_c, tok_f = tokenize_full_series(ohlc[:train_end], tokenizer, config)

	            ds = FinancialDataset(
	                feat[:train_end], targ[:train_end], config.LOOKBACK_WINDOW,
	                scaler=scaler, tokenizer=tokenizer, config=config,
	                precomputed_coarse=tok_c, precomputed_fine=tok_f,
	            )
	        else:
	            # For val/test, reuse the scaler fitted during the training call.
	            scaler = None
	            if scalers is not None and asset_id in scalers:
	                scaler = scalers[asset_id]

	            # Features are consumed in every mode except tokens_only.
	            if input_mode != "tokens_only" and scaler is None:
	                raise ValueError(
	                    f"No fitted scaler for asset '{asset_id}'. "
	                    f"Pass scalers returned from the training run "
	                    f"(is_train=True call). Available keys: "
	                    f"{list(scalers.keys()) if scalers is not None else 'scalers=None'}")

	            # Use precomputed tokens if provided; otherwise tokenize from ohlc
	            tok_c, tok_f = pre_coarse, pre_fine
	            if tok_c is None and uses_tokens and ohlc is not None:
	                tok_c, tok_f = tokenize_full_series(ohlc, tokenizer, config)

	            ds = FinancialDataset(
	                feat, targ, config.LOOKBACK_WINDOW,
	                scaler=scaler, tokenizer=tokenizer, config=config,
	                precomputed_coarse=tok_c, precomputed_fine=tok_f,
	            )

	        datasets.append(ds)

	        if is_train:
	            # Window i of ds is labelled by targ[i + LOOKBACK_WINDOW - 1];
	            # len(ds) keeps the slice inside the training range.
	            start = config.LOOKBACK_WINDOW - 1
	            all_targets.extend(targ[start : start + len(ds)].tolist())

	    if not datasets:
	        return None, fitted_scalers

	    full_ds = ConcatDataset(datasets)

	    if is_train:
	        all_targets_arr = np.array(all_targets, dtype=np.float32)
	        sample_weights = _compute_sample_weights(
	            all_targets_arr, config.SAMPLER_THRESHOLD, config=config
	        )
	        assert len(sample_weights) == len(full_ds), (
	            f"Multi-index weight mismatch: "
	            f"weights={len(sample_weights)}, ds={len(full_ds)}")

	        sampler = DistributedWeightedSampler(
	            weights=sample_weights,
	            num_samples=len(sample_weights),
	            num_replicas=world_size,
	            rank=rank,
	            replacement=True,
	            seed=getattr(config, "SEED", 42),
	        )
	        return _make_loader(full_ds, config, sampler=sampler, drop_last=True), fitted_scalers

	    sampler = None
	    if world_size > 1:
	        sampler = DistributedSampler(
	            full_ds,
	            num_replicas=world_size,
	            rank=rank,
	            shuffle=False
	        )
	    return _make_loader(full_ds, config, sampler=sampler, drop_last=False), fitted_scalers



	def create_fold_dataloaders(
	    features,
	    targets,
	    train_indices: tuple[int, int],
	    val_indices: tuple[int, int],
	    test_indices: tuple[int, int],
	    config,
	    feature_cols: list[str],
	    tokenizer=None,
	    ohlc_returns: np.ndarray | None = None,
	    rank=0,
	    world_size=1,
	):
	    """Build the train/val/test loaders of one walk-forward fold.

	    ``features`` and ``targets`` must be the GLOBAL (unsliced) arrays;
	    the three index pairs are absolute ``(start, end)`` rows into them.
	    Do not pre-slice before calling.

	    The scaler is fitted on the training rows only. In token modes with
	    ``ohlc_returns`` given, the full series is tokenized once and cut at
	    the three index ranges.

	    Returns
	    -------
	    ``(train_loader, val_loader, test_loader)``
	    """
	    # Guard: the training range has to lie inside the arrays.
	    assert train_indices[0] >= 0 and train_indices[1] <= len(features), (
	        f"train_indices {train_indices} out of bounds for features of length {len(features)}"
	    )

	    train_feat = features[train_indices[0] : train_indices[1]]
	    scaler = fit_scaler(train_feat, feature_cols, config=config)

	    ts, te = train_indices
	    vs, ve = val_indices
	    xs, xe = test_indices

	    # (coarse, fine) per split; stays (None, None) outside token modes.
	    train_tok = val_tok = test_tok = (None, None)
	    mode = getattr(config, "INPUT_MODE", "features_only")
	    if mode in ("tokens_only", "combined") and ohlc_returns is not None:
	        wrapped = FittedTokenizer(tokenizer, config)
	        wrapped.fit(ohlc_returns[:te])
	        coarse_full, fine_full = wrapped.transform(ohlc_returns)

	        train_tok = (coarse_full[ts:te], fine_full[ts:te])
	        val_tok = (coarse_full[vs:ve], fine_full[vs:ve])
	        test_tok = (coarse_full[xs:xe], fine_full[xs:xe])

	    def _fold_dataset(feat_rows, lo, hi, tokens):
	        # Every split shares the train-fitted scaler and the same window length.
	        return FinancialDataset(
	            feat_rows,
	            targets[lo:hi],
	            config.LOOKBACK_WINDOW,
	            scaler=scaler, tokenizer=tokenizer, config=config,
	            precomputed_coarse=tokens[0], precomputed_fine=tokens[1],
	        )

	    train_ds = _fold_dataset(train_feat, ts, te, train_tok)
	    val_ds = _fold_dataset(features[vs:ve], vs, ve, val_tok)
	    test_ds = _fold_dataset(features[xs:xe], xs, xe, test_tok)

	    # Window i of train_ds ends at global row ts + i + LOOKBACK_WINDOW - 1,
	    # so that is where its target (and therefore its weight) comes from.
	    # This offset is only right for global arrays.
	    first_target = train_indices[0] + config.LOOKBACK_WINDOW - 1
	    y_train_aligned = targets[first_target : first_target + len(train_ds)]

	    assert len(y_train_aligned) == len(train_ds), (
	        f"Weight misalignment: got {len(y_train_aligned)} targets "
	        f"for {len(train_ds)} samples. Check that features/targets are "
	        f"global (unsliced) arrays and train_indices are absolute."
	    )

	    sample_weights = _compute_sample_weights(
	        y_train_aligned, config.SAMPLER_THRESHOLD, config=config
	    )
	    train_sampler = DistributedWeightedSampler(
	        sample_weights, len(sample_weights),
	        num_replicas=world_size, rank=rank,
	        seed=getattr(config, "SEED", 42)
	    )

	    val_sampler = test_sampler = None
	    if world_size > 1:
	        val_sampler = DistributedSampler(val_ds, num_replicas=world_size, rank=rank, shuffle=False)
	        test_sampler = DistributedSampler(test_ds, num_replicas=world_size, rank=rank, shuffle=False)

	    train_loader = _make_loader(train_ds, config, sampler=train_sampler, drop_last=True)
	    val_loader = _make_loader(val_ds, config, sampler=val_sampler, drop_last=False)
	    test_loader = _make_loader(test_ds, config, sampler=test_sampler, drop_last=False)
	    return (train_loader, val_loader, test_loader)