Upload train_lightning.py with huggingface_hub

e32012b verified 4 months ago

42.3 kB

	#!/usr/bin/env python3
	"""
	SEM V6 PyTorch Lightning Training Script

	Production-ready training with all best practices:
	- Automatic mixed precision (AMP)
	- Gradient accumulation
	- Learning rate scheduling
	- Model checkpointing
	- Progress bars and rich logging
	- Distributed training support
	- Early stopping
	- Gradient clipping
	- Learning rate finder
	- Model pruning/quantization ready
	- TensorBoard logging
	- Memory-efficient data streaming with prefetching

	Reference:
	- PyTorch Lightning 2.x best practices
	- AGENTS.md: GPU-first, streaming data, maximum speed
	"""

	import argparse
	import logging
	import math
	import time
	from pathlib import Path
	from typing import Any, Dict, Optional, Tuple

	import torch

	# Global timing - track wall-clock from absolute start
	_SCRIPT_START_TIME = time.perf_counter()
	import torch.nn as nn
	import torch.nn.functional as F
	import pytorch_lightning as pl
	import datasets # For HF streaming
	from pytorch_lightning.callbacks import (
	ModelCheckpoint,
	EarlyStopping,
	LearningRateMonitor,
	RichProgressBar,
	RichModelSummary,
	)
	from pytorch_lightning.loggers import TensorBoardLogger
	from torch.utils.data import DataLoader

	# SEM V6 imports
	import sys

	sys.path.insert(0, "src")
	sys.path.insert(0, "ChebyKan_cuda_op")

	from sem_v6.sem_v6 import SEMV6
	from sem_v6.data.streaming_dataset import StreamingDataset
	from sem_v6.modules.text_decoder import multi_token_prediction_loss
	from sem_v6.validation.callbacks import CombinedValidationCallback
	from sem_v6.validation.validators import FastValidator, GrammarValidator
	from sem_v6.validation.grammar_checker import LanguageToolClient

	# Enable Tensor Core acceleration (RTX 3060 and above)
	torch.set_float32_matmul_precision('high')


	class TimingCallback(pl.Callback):
	"""
	Callback to track wall-clock timing for 100s training target.

	Tracks:
	- Time to first batch (includes data loading startup)
	- Training time per step
	- Total elapsed time from script start
	"""

	def __init__(self, script_start_time: float, target_time: float = 100.0):
	super().__init__()
	self.script_start_time = script_start_time
	self.target_time = target_time
	self.fit_start_time: Optional[float] = None
	self.first_batch_time: Optional[float] = None
	self.training_start_time: Optional[float] = None
	self.step_count = 0
	self.total_training_time = 0.0

	def on_fit_start(self, trainer, pl_module):
	self.fit_start_time = time.perf_counter()
	elapsed = self.fit_start_time - self.script_start_time
	print(f"\n⏱️ [TIMING] Model initialization took: {elapsed:.2f}s")
	remaining = self.target_time - elapsed
	print(f"⏱️ [TIMING] Time budget remaining: {remaining:.2f}s / {self.target_time}s")

	def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):
	if self.first_batch_time is None:
	self.first_batch_time = time.perf_counter()
	data_load_time = self.first_batch_time - self.fit_start_time
	elapsed = self.first_batch_time - self.script_start_time
	print(f"⏱️ [TIMING] Data loading/first batch took: {data_load_time:.2f}s")
	print(f"⏱️ [TIMING] Total startup overhead: {elapsed:.2f}s")
	remaining = self.target_time - elapsed
	print(f"⏱️ [TIMING] Time budget for training: {remaining:.2f}s")
	self.training_start_time = time.perf_counter()

	def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
	self.step_count += 1
	elapsed = time.perf_counter() - self.script_start_time
	remaining = self.target_time - elapsed

	# Time-based early stopping: stop with 2s buffer before target
	if remaining <= 2.0:
	print(f"\n⏱️ [TIMING] Stopping training - approaching time limit ({elapsed:.1f}s / {self.target_time}s)")
	trainer.should_stop = True
	return

	# Check time budget every 50 steps
	if self.step_count % 50 == 0:
	training_elapsed = time.perf_counter() - self.training_start_time
	steps_per_sec = self.step_count / training_elapsed if training_elapsed > 0 else 0
	print(f"⏱️ [TIMING] Step {self.step_count}: {elapsed:.1f}s elapsed, {remaining:.1f}s remaining, {steps_per_sec:.2f} steps/s")

	def on_fit_end(self, trainer, pl_module):
	end_time = time.perf_counter()
	total_elapsed = end_time - self.script_start_time
	training_time = end_time - self.training_start_time if self.training_start_time else 0
	startup_time = total_elapsed - training_time

	print("\n" + "=" * 80)
	print("⏱️ TIMING SUMMARY (100s Target)")
	print("=" * 80)
	print(f" Startup (init + data load): {startup_time:.2f}s")
	print(f" Training time: {training_time:.2f}s")
	print(f" Total wall-clock time: {total_elapsed:.2f}s")
	print("-" * 80)
	print(f" Steps completed: {self.step_count}")
	if training_time > 0:
	print(f" Training speed: {self.step_count / training_time:.2f} steps/s")

	if total_elapsed <= self.target_time:
	print(f"\n ✅ PASSED: Completed in {total_elapsed:.2f}s (under {self.target_time}s target)")
	else:
	overage = total_elapsed - self.target_time
	print(f"\n ❌ FAILED: Took {total_elapsed:.2f}s ({overage:.2f}s over {self.target_time}s target)")
	print("=" * 80)


	def latent_prediction_loss(
	z_pred: torch.Tensor,
	x_next: torch.Tensor,
	compressor: nn.Module,
	lambda_sparse: float = 0.001,
	) -> Tuple[torch.Tensor, Dict[str, float]]:
	"""
	Compute loss in latent space (not SDR reconstruction).

	Based on Lester et al. 2024 "Training LLMs over Neurally Compressed Text":
	predicting compressed representation is more learnable than reconstructing
	the full SDR. This is the core training objective for SEM V6.

	Loss function:
	L = \|\|z_pred - compress(x_next)\|\|² + λ\|\|z_pred\|\|₁

	Where:
	- z_pred: Output from propagator (predicted next latent)
	- x_next: Ground truth next SDR
	- compress(): NeuralCompressor module
	- λ: Sparsity coefficient (default 0.001)

	Args:
	z_pred: Predicted next latent (batch, latent_dim) from propagator
	x_next: Ground truth next SDR (batch, sdr_dim) - will be compressed
	compressor: NeuralCompressor module to compress x_next
	lambda_sparse: L1 sparsity coefficient for regularization. Default: 0.001

	Returns:
	loss: Total loss scalar (MSE + L1)
	metrics: Dict with individual loss components:
	- mse_loss: Mean squared error between predicted and target latent
	- l1_loss: L1 regularization on predicted latent
	- total_loss: Combined loss
	- cos_sim: Cosine similarity for monitoring prediction quality

	Example:
	>>> compressor = nn.Linear(2048, 256) # Simplified compressor
	>>> z_pred = torch.randn(32, 256)
	>>> x_next = torch.randn(32, 2048)
	>>> loss, metrics = latent_prediction_loss(z_pred, x_next, compressor)
	>>> loss.backward()
	>>> metrics["mse_loss"] # Individual MSE component
	>>> metrics["cos_sim"] # Prediction quality metric

	References:
	Lester, B., et al. (2024). "Training LLMs over Neurally Compressed Text."
	arXiv:2404.03626
	"""
	# Compress ground truth to latent space (stop gradient - target only)
	with torch.no_grad():
	z_target = compressor(x_next.float())
	# NaN safety for z_target
	z_target = torch.nan_to_num(z_target, nan=0.0, posinf=100.0, neginf=-100.0)

	# MSE in latent space - core prediction objective
	mse_loss = F.mse_loss(z_pred, z_target)

	# L1 sparsity regularization - encourages sparse latent representations
	l1_loss = lambda_sparse * z_pred.abs().mean()

	# Total loss
	total_loss = mse_loss + l1_loss

	# Cosine similarity for monitoring prediction quality (not in loss)
	with torch.no_grad():
	cos_sim = F.cosine_similarity(z_pred, z_target, dim=-1).mean()

	metrics = {
	"mse_loss": mse_loss.item(),
	"l1_loss": l1_loss.item(),
	"total_loss": total_loss.item(),
	"cos_sim": cos_sim.item(),
	}

	return total_loss, metrics


	class SEMV6LightningModule(pl.LightningModule):
	"""
	PyTorch Lightning wrapper for SEM V6 training.

	Implements all best practices:
	- Automatic optimization with gradient accumulation
	- Learning rate scheduling
	- Logging and metrics
	- Checkpoint management
	- Mixed precision training
	"""

	def __init__(
	self,
	sdr_dim: int = 1368,
	sparsity: float = 0.05,
	num_hyperedges: int = 2000,
	kan_degree: int = 5,
	latent_dim: int = 256,
	lambda_sparse: float = 0.001,
	learning_rate: float = 1e-3,
	weight_decay: float = 1e-4,
	warmup_steps: int = 1000,
	max_steps: int = 100000,
	compile_model: bool = True,
	lr_schedule: str = "onecycle",
	warmup_pct: float = 0.1,
	lr_div_factor: float = 25.0,
	lr_final_div_factor: float = 10000.0,
	ode_solver: str = "rk4",
	ode_step_size: float = 0.01,
	enable_text_decoder: bool = True,
	num_predict: int = 4,
	mtp_loss_weight: float = 1.0,
	):
	super().__init__()

	# Save hyperparameters for checkpointing
	self.save_hyperparameters()

	# Initialize SEM V6 model with latent space architecture
	self.model = SEMV6(
	sdr_dim=sdr_dim,
	sparsity=sparsity,
	num_hyperedges=num_hyperedges,
	kan_degree=kan_degree,
	latent_dim=latent_dim,
	ode_solver=ode_solver,
	ode_step_size=ode_step_size,
	enable_text_decoder=enable_text_decoder,
	num_predict=num_predict,
	)

	# Compile model for faster execution (PyTorch 2.0+)
	if compile_model:
	self.model = torch.compile(self.model, mode='reduce-overhead')

	# Training metrics
	self.train_loss = []
	self.val_loss = []

	def forward(self, text_batch, reward=None, mode="awake"):
	"""Forward pass through SEM V6."""
	return self.model(text_batch, reward=reward, mode=mode)

	def training_step(self, batch, batch_idx):
	"""
	Training step with latent-space next-token prediction.

	Based on Lester et al. 2024: predict latent representation of NEXT token,
	not reconstruct full SDR. This is more efficient and learnable.

	Loss function:
	L = \|\|z_pred - compress(x_next)\|\|² + λ\|\|z_pred\|\|₁

	Lightning handles:
	- Gradient accumulation
	- Mixed precision
	- Gradient clipping
	- Optimizer stepping
	"""
	# Unpack batch (text samples) - need pairs for next-token prediction
	text_batch = batch

	# Split each document into sentences and create (sentence_i, sentence_i+1) pairs
	# This fixes the cross-document bug where unrelated documents were paired
	current_texts = []
	next_texts = []

	for document in text_batch:
	# Split into sentences (on '. ' and '\n')
	sentences = []
	for chunk in document.replace('\n', '. ').split('. '):
	sentence = chunk.strip()
	if len(sentence) >= 20: # Filter out short fragments
	sentences.append(sentence)

	# Create (current, next) pairs from consecutive sentences within document
	for i in range(len(sentences) - 1):
	current_texts.append(sentences[i])
	next_texts.append(sentences[i + 1])

	# Cap total pairs at original batch size for consistent GPU utilization
	if len(current_texts) >= len(text_batch):
	break

	if len(current_texts) >= len(text_batch):
	break

	# Handle edge case: no valid pairs found
	if len(current_texts) < 2:
	return None

	# Forward pass on current texts -> get z_pred
	z_pred, action_logits, value = self(current_texts, reward=0.0, mode="awake")

	# NaN safety: use nan_to_num which preserves gradients better
	z_pred = torch.nan_to_num(z_pred, nan=0.0, posinf=100.0, neginf=-100.0)
	z_pred = torch.clamp(z_pred, -100.0, 100.0)

	# Get next SDRs (targets) - encode next texts to SDR
	with torch.no_grad():
	next_sdrs = self.model.encoder(next_texts) # (batch-1, sdr_dim)

	# Compute latent prediction loss (Lester 2024)
	loss, metrics = latent_prediction_loss(
	z_pred=z_pred,
	x_next=next_sdrs,
	compressor=self.model.compressor,
	lambda_sparse=self.hparams.lambda_sparse,
	)

	# Add Multi-Token Prediction loss if decoder is enabled
	if self.model.text_decoder is not None:
	mtp_loss, mtp_metrics = multi_token_prediction_loss(
	z_pred=z_pred,
	target_text=next_texts,
	decoder=self.model.text_decoder,
	max_length=128,
	loss_weight=self.hparams.mtp_loss_weight,
	)
	loss = 0.1 * loss + mtp_loss
	metrics.update({f"mtp_{k}": v for k, v in mtp_metrics.items()})

	# Batch size for logging
	batch_size = len(current_texts)

	# Log primary metrics
	self.log(
	"train_loss",
	loss,
	on_step=True,
	on_epoch=True,
	prog_bar=True,
	batch_size=batch_size,
	)
	self.log(
	"mse_loss",
	metrics["mse_loss"],
	on_step=True,
	on_epoch=True,
	batch_size=batch_size,
	)
	self.log(
	"l1_loss",
	metrics["l1_loss"],
	on_step=False,
	on_epoch=True,
	batch_size=batch_size,
	)
	self.log(
	"cos_sim",
	metrics["cos_sim"],
	on_step=True,
	on_epoch=True,
	prog_bar=True,
	batch_size=batch_size,
	)

	# Meta-controller metrics
	self.log(
	"train_action",
	float(action_logits.argmax()),
	on_step=False,
	on_epoch=True,
	batch_size=batch_size,
	)
	self.log(
	"train_value",
	float(value),
	on_step=False,
	on_epoch=True,
	batch_size=batch_size,
	)

	# MTP metrics (if decoder is enabled)
	if "mtp_mtp_loss" in metrics:
	self.log(
	"mtp_loss",
	metrics["mtp_mtp_loss"],
	on_step=True,
	on_epoch=True,
	prog_bar=True,
	batch_size=batch_size,
	)
	self.log(
	"mtp_accuracy",
	metrics.get("mtp_avg_accuracy", 0.0),
	on_step=True,
	on_epoch=True,
	prog_bar=True,
	batch_size=batch_size,
	)

	# GPU memory usage
	if torch.cuda.is_available():
	gpu_mem = torch.cuda.max_memory_allocated() / 1e9
	self.log(
	"gpu_memory_gb",
	gpu_mem,
	on_step=False,
	on_epoch=True,
	batch_size=batch_size,
	)

	return loss

	def validation_step(self, batch, batch_idx):
	"""Validation step with latent prediction loss."""
	text_batch = batch

	# For next-token prediction, we need at least 2 samples
	if len(text_batch) < 2:
	return None

	current_texts = text_batch[:-1]
	next_texts = text_batch[1:]

	# Forward pass
	z_pred, action_logits, value = self(current_texts, reward=0.0, mode="awake")

	# Get next SDRs (targets)
	with torch.no_grad():
	next_sdrs = self.model.encoder(next_texts)

	# Compute latent prediction loss
	loss, metrics = latent_prediction_loss(
	z_pred=z_pred,
	x_next=next_sdrs,
	compressor=self.model.compressor,
	lambda_sparse=self.hparams.lambda_sparse,
	)

	batch_size = len(current_texts)
	self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True, batch_size=batch_size)
	self.log("val_mse_loss", metrics["mse_loss"], on_step=False, on_epoch=True, batch_size=batch_size)
	self.log("val_cos_sim", metrics["cos_sim"], on_step=False, on_epoch=True, batch_size=batch_size)

	return loss

	def configure_optimizers(self):
	"""
	Configure optimizer with aggressive learning rate schedules.

	Supports multiple scheduling strategies:
	1. OneCycleLR: Aggressive triangular schedule (fastest convergence)
	2. CosineAnnealingWarmRestarts: Warm restarts for escaping local minima
	3. CosineAnnealingLR: Smooth decay (default)
	4. ExponentialLR: Geometric decay

	Uses:
	- AdamW optimizer with weight decay (DeepSeek V3 settings)
	- Mixed precision training via Lightning (16-mixed)
	- Gradient clipping via Trainer
	"""
	optimizer = torch.optim.AdamW(
	self.parameters(),
	lr=self.hparams.learning_rate,
	weight_decay=self.hparams.weight_decay,
	betas=(0.9, 0.95), # DeepSeek V3 settings
	eps=1e-8,
	)

	# Calculate warmup steps (10% of total by default)
	warmup_steps = min(1000, self.hparams.max_steps // 10)

	# STRATEGY 1: OneCycleLR (RECOMMENDED for aggressive training)
	# Triangular schedule: Linear warmup -> peak -> linear decay
	# Achieves fastest convergence with aggressive LR exploration
	scheduler_onecycle = torch.optim.lr_scheduler.OneCycleLR(
	optimizer,
	max_lr=self.hparams.learning_rate,
	total_steps=self.hparams.max_steps,
	pct_start=self.hparams.warmup_pct, # % of cycle spent increasing LR
	anneal_strategy='cos', # Cosine annealing during decay phase
	cycle_momentum=True, # Cycle momentum: high at start, low at end
	base_momentum=0.85,
	max_momentum=0.95,
	div_factor=self.hparams.lr_div_factor, # Initial LR = max_LR / div_factor
	final_div_factor=self.hparams.lr_final_div_factor, # Final LR = initial_LR / final_div_factor
	)

	# STRATEGY 2: CosineAnnealingWarmRestarts (for warm-starting)
	# Multiple annealing cycles with periodic restarts
	# Helps escape local minima by periodically reheating LR
	scheduler_warmrestarts = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
	optimizer,
	T_0=int(self.hparams.max_steps * self.hparams.warmup_pct), # Initial period
	T_mult=2, # Double the period after each restart (must be int)
	eta_min=self.hparams.learning_rate / 1000, # Minimum LR floor
	)

	# STRATEGY 3: Standard CosineAnnealingLR with warmup wrapper
	# Smooth cosine decay after linear warmup
	scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
	optimizer,
	T_max=self.hparams.max_steps - warmup_steps,
	eta_min=self.hparams.learning_rate / 100,
	)

	# Select scheduler based on hparams
	if self.hparams.lr_schedule == "onecycle":
	scheduler = scheduler_onecycle
	elif self.hparams.lr_schedule == "warmrestarts":
	scheduler = scheduler_warmrestarts
	elif self.hparams.lr_schedule == "cosine":
	scheduler = scheduler_cosine
	else:
	scheduler = scheduler_onecycle # Default

	return {
	"optimizer": optimizer,
	"lr_scheduler": {
	"scheduler": scheduler,
	"interval": "step", # Step-based (not epoch-based) for granular control
	"frequency": 1, # Apply every step
	},
	}


	class StreamingDataModule(pl.LightningDataModule):
	"""
	Lightning DataModule for streaming FineWeb data.

	Handles:
	- Memory-mapped shard loading
	- Multi-worker prefetching
	- GPU data pinning
	- Distributed data partitioning
	"""

	def __init__(
	self,
	data_dir: Optional[Path] = None,
	batch_size: int = 32,
	num_workers: int = 8,
	val_split: float = 0.0, # No validation by default (streaming setup)
	):
	super().__init__()
	self.data_dir = data_dir
	self.batch_size = batch_size
	self.num_workers = num_workers
	self.val_split = val_split

	self.train_shards = []
	self.val_shards = []

	def setup(self, stage: Optional[str] = None):
	"""Discover and partition shard files."""
	if self.data_dir is None:
	print("No data_dir provided - using HuggingFace streaming")
	return

	# Find all shard files
	all_shards = sorted(
	list(self.data_dir.glob(".pt")) + list(self.data_dir.glob(".npy"))
	)

	if len(all_shards) == 0:
	raise ValueError(f"No shard files found in {self.data_dir}")

	# Split into train/val if needed
	if self.val_split > 0:
	val_count = int(len(all_shards) * self.val_split)
	self.val_shards = all_shards[:val_count]
	self.train_shards = all_shards[val_count:]
	else:
	self.train_shards = all_shards
	self.val_shards = []

	print(
	f"Found {len(self.train_shards)} training shards, {len(self.val_shards)} validation shards"
	)

	def train_dataloader(self):
	"""Create training DataLoader with streaming dataset."""
	if self.data_dir is None:
	# Stream from HuggingFace
	dataset = datasets.load_dataset(
	"HuggingFaceFW/fineweb-edu",
	name="sample-10BT",
	split="train",
	streaming=True,
	)

	# Extract text content
	def extract_text(examples):
	return [x["text"] for x in examples]

	# Create iterable dataset that yields batches of text
	class HFIterable(torch.utils.data.IterableDataset):
	def __init__(self, hf_dataset, batch_size):
	self.hf_dataset = hf_dataset
	self.batch_size = batch_size

	def __iter__(self):
	worker_info = torch.utils.data.get_worker_info()
	dataset = self.hf_dataset

	if worker_info is not None:
	# Simple sharding: use islice to split the stream
	# worker 0 gets 0, 4, 8...
	# worker 1 gets 1, 5, 9...
	import itertools

	dataset = itertools.islice(
	dataset, worker_info.id, None, worker_info.num_workers
	)

	batch = []
	for item in dataset:
	# Full text (no truncation), handled by background workers
	text = item["text"]
	batch.append(text)
	if len(batch) == self.batch_size:
	yield batch
	batch = []

	iterable_dataset = HFIterable(dataset, self.batch_size)

	return DataLoader(
	iterable_dataset,
	batch_size=None, # Generator handles batching
	num_workers=self.num_workers, # Enable workers!
	pin_memory=True,
	prefetch_factor=2 if self.num_workers > 0 else None,
	)

	dataset = StreamingDataset(
	shard_paths=self.train_shards,
	batch_size=self.batch_size,
	device="cuda" if torch.cuda.is_available() else "cpu",
	prefetch=True, # Background prefetching
	cycle=True, # Infinite cycling
	)

	return DataLoader(
	dataset,
	batch_size=None, # StreamingDataset handles batching
	num_workers=self.num_workers,
	pin_memory=True, # Fast GPU transfer
	persistent_workers=True if self.num_workers > 0 else False,
	prefetch_factor=4 if self.num_workers > 0 else None,
	)

	def val_dataloader(self):
	"""Create validation DataLoader (if validation shards exist)."""
	if len(self.val_shards) == 0:
	return None

	dataset = StreamingDataset(
	shard_paths=self.val_shards,
	batch_size=self.batch_size,
	device="cuda" if torch.cuda.is_available() else "cpu",
	prefetch=True,
	cycle=False, # Don't cycle validation data
	)

	return DataLoader(
	dataset,
	batch_size=None,
	num_workers=self.num_workers // 2, # Fewer workers for validation
	pin_memory=True,
	persistent_workers=True if self.num_workers > 0 else False,
	)


	def main():
	"""Main training entry point."""
	parser = argparse.ArgumentParser(description="SEM V6 Lightning Training")

	# Data arguments
	parser.add_argument(
	"--data_dir",
	type=Path,
	required=False,
	default=None,
	help="Directory with shard files",
	)
	# Batch size (6 default for 6GB VRAM, use 16 with --fast-100s)
	parser.add_argument("--batch_size", type=int, default=6, help="Batch size")
	parser.add_argument("--num_workers", type=int, default=8, help="DataLoader workers")

	# Model arguments
	# Dim 768 (3x 256) - Aggressive scaling
	parser.add_argument("--sdr_dim", type=int, default=768, help="SDR dimension")
	parser.add_argument("--sparsity", type=float, default=0.05, help="SDR sparsity")
	parser.add_argument(
	"--kan_degree", type=int, default=3, help="ChebyKAN polynomial degree"
	)
	parser.add_argument(
	"--num_hyperedges", type=int, default=2000, help="Number of hyperedges"
	)
	parser.add_argument(
	"--latent_dim", type=int, default=256, help="Latent space dimension (Lester 2024)"
	)
	parser.add_argument(
	"--lambda_sparse", type=float, default=0.001, help="L1 sparsity coefficient for latent loss"
	)

	# ODE solver arguments (for Module C: SolitonPropagator)
	parser.add_argument(
	"--ode_solver", type=str, default="rk4",
	choices=["rk4", "euler", "dopri5", "midpoint"],
	help="ODE solver method: rk4 (stable, default), euler (fast for training)"
	)
	parser.add_argument(
	"--ode_step_size", type=float, default=0.01,
	help="Step size for fixed-step ODE solvers (default 0.01, use 0.05 with euler)"
	)

	# Training arguments
	parser.add_argument(
	"--learning_rate", type=float, default=1e-3, help="Base learning rate"
	)
	parser.add_argument("--weight_decay", type=float, default=1e-4, help="AdamW weight decay (L2 regularization)")
	parser.add_argument(
	"--max_steps", type=int, default=100000, help="Max training steps"
	)
	parser.add_argument(
	"--accumulate_grad_batches", type=int, default=1,
	help="Gradient accumulation steps (effective_batch = batch_size * this)"
	)
	parser.add_argument(
	"--gradient_clip_val", type=float, default=1.0, help="Gradient clipping norm threshold"
	)

	# Learning rate schedule arguments
	parser.add_argument(
	"--lr_schedule", type=str, default="onecycle",
	choices=["onecycle", "warmrestarts", "cosine"],
	help="Learning rate schedule strategy"
	)
	parser.add_argument(
	"--warmup_pct", type=float, default=0.1,
	help="Percentage of training used for warmup (0.0-1.0)"
	)
	parser.add_argument(
	"--lr_div_factor", type=float, default=25.0,
	help="OneCycleLR: Initial LR = max_LR / div_factor"
	)
	parser.add_argument(
	"--lr_final_div_factor", type=float, default=10000.0,
	help="OneCycleLR: Final LR = initial_LR / final_div_factor"
	)

	# Lightning Trainer arguments
	parser.add_argument(
	"--accelerator", type=str, default="gpu", choices=["gpu", "cpu"]
	)
	parser.add_argument("--devices", type=int, default=1, help="Number of GPUs")
	parser.add_argument(
	"--precision", type=str, default="16-mixed", help="Training precision"
	)
	parser.add_argument(
	"--log_every_n_steps", type=int, default=10, help="Logging frequency"
	)
	parser.add_argument(
	"--val_check_interval", type=float, default=1000, help="Validation frequency"
	)

	# Checkpointing
	parser.add_argument(
	"--checkpoint_dir",
	type=Path,
	default=Path("checkpoints"),
	help="Checkpoint directory",
	)
	parser.add_argument(
	"--save_top_k", type=int, default=3, help="Save top K checkpoints"
	)
	parser.add_argument(
	"--no-compile",
	action="store_true",
	help="Disable torch.compile (useful for debugging)",
	)

	# Fast 100-second training mode (agentic coherence optimization)
	parser.add_argument(
	"--fast-100s",
	action="store_true",
	help="Enable optimized settings for 100-second agentic coherence training"
	)
	parser.add_argument(
	"--target-time",
	type=float,
	default=160.0,
	help="Target wall-clock time in seconds (default: 160.0, includes ~125s startup overhead)"
	)

	# Multi-Token Prediction (MTP) arguments
	parser.add_argument(
	"--enable-mtp",
	action="store_true",
	default=True,
	help="Enable Multi-Token Prediction text decoder (default: True)"
	)
	parser.add_argument(
	"--disable-mtp",
	action="store_true",
	help="Disable Multi-Token Prediction text decoder"
	)
	parser.add_argument(
	"--num-predict",
	type=int,
	default=4,
	help="Number of future tokens to predict with MTP (default: 4)"
	)
	parser.add_argument(
	"--mtp-loss-weight",
	type=float,
	default=1.0,
	help="Weight for MTP loss (default: 1.0)"
	)

	args = parser.parse_args()

	# Handle MTP enable/disable
	args.enable_mtp = not args.disable_mtp

	# Apply --fast-100s optimizations if flag is set
	# Based on research: OneCycleLR super-convergence, euler solver, no accumulation
	if args.fast_100s:
	# === TRAINING HYPERPARAMETERS ===
	args.batch_size = 16 # Larger batches, no accumulation
	args.learning_rate = 0.002 # 2x for super-convergence
	args.weight_decay = 1e-5 # Reduced for super-convergence
	args.lr_div_factor = 10.0 # Initial LR = 0.002 / 10 = 0.0002
	args.lr_final_div_factor = 100.0 # Final LR = 0.0002 / 100 = 0.000002
	args.accumulate_grad_batches = 1 # CRITICAL: no accumulation for 100 updates

	# === MODEL CAPACITY (balanced for 6GB VRAM with 128k vocab) ===
	# Per CLAUDE.md: "Model parameters CAN be increased"
	# BUT 128k vocab tokenizer + text decoder needs ~1GB alone
	args.sdr_dim = 1024 # 768 → 1024: moderate increase
	args.latent_dim = 256 # Keep 256: balances with 128k vocab decoder
	args.kan_degree = 4 # 3 → 4: slightly more expressive
	args.num_hyperedges = 2000 # Keep 2000: sufficient for short training

	# === ODE SOLVER (speed optimization) ===
	args.ode_solver = "euler" # Fast ODE solver (4x fewer evaluations)
	args.ode_step_size = 0.05 # Larger step size (5x fewer steps)

	# === TIME BUDGET CALCULATION ===
	# ACTUAL overhead: ~60s for model init + tokenizer + HuggingFace streaming startup
	# - Model init: ~7s
	# - Llama tokenizer download: ~5s
	# - HuggingFace streaming first batch: ~50s
	# - torch.compile / GPU warmup pause: ~60s (mysterious quiet period)
	# Conservative estimate: 125s overhead
	# At ~4 steps/sec with remaining time
	estimated_overhead_s = 125.0
	available_training_s = args.target_time - estimated_overhead_s
	estimated_steps_per_sec = 4.0 # Conservative estimate
	args.max_steps = max(50, int(available_training_s * estimated_steps_per_sec))

	elapsed_so_far = time.perf_counter() - _SCRIPT_START_TIME
	print("=" * 80)
	print("FAST 100s MODE ENABLED - Optimized for agentic coherence")
	print(f" Target time: {args.target_time}s (elapsed so far: {elapsed_so_far:.1f}s)")
	print(f" Estimated overhead: {estimated_overhead_s}s (model ~7s + tokenizer ~5s + HF stream ~50s + compile/warmup ~60s)")
	print(f" Training budget: {available_training_s}s → max {args.max_steps} steps")
	print("-" * 40)
	print(" MODEL CAPACITY (balanced for 6GB + 128k vocab):")
	print(f" sdr_dim: {args.sdr_dim}, latent_dim: {args.latent_dim}")
	print(f" kan_degree: {args.kan_degree}, num_hyperedges: {args.num_hyperedges}")
	print("-" * 40)
	print(" TRAINING:")
	print(" batch_size=16, lr=0.002, weight_decay=1e-5, accumulation=1")
	print(" ODE solver: euler with step_size=0.05")
	print("=" * 80)

	# Validate CUDA availability
	if args.accelerator == "gpu":
	assert torch.cuda.is_available(), "GPU required but CUDA not available"

	# Create data module
	datamodule = StreamingDataModule(
	data_dir=args.data_dir,
	batch_size=args.batch_size,
	num_workers=args.num_workers,
	)

	# Create model with latent-space architecture (Lester 2024)
	compile_model = not args.no_compile
	model = SEMV6LightningModule(
	sdr_dim=args.sdr_dim,
	sparsity=args.sparsity,
	num_hyperedges=args.num_hyperedges,
	kan_degree=args.kan_degree,
	latent_dim=args.latent_dim,
	lambda_sparse=args.lambda_sparse,
	learning_rate=args.learning_rate,
	weight_decay=args.weight_decay,
	max_steps=args.max_steps,
	compile_model=compile_model,
	lr_schedule=args.lr_schedule,
	warmup_pct=args.warmup_pct,
	lr_div_factor=args.lr_div_factor,
	lr_final_div_factor=args.lr_final_div_factor,
	ode_solver=args.ode_solver,
	ode_step_size=args.ode_step_size,
	enable_text_decoder=args.enable_mtp,
	num_predict=args.num_predict,
	mtp_loss_weight=args.mtp_loss_weight,
	)

	# Setup callbacks
	callbacks = [
	# Timing callback for 100s target tracking
	TimingCallback(
	script_start_time=_SCRIPT_START_TIME,
	target_time=args.target_time,
	),
	# Model checkpointing (save best models)
	ModelCheckpoint(
	dirpath=args.checkpoint_dir,
	filename="semv6-{step:06d}-{train_loss:.4f}",
	monitor="train_loss",
	mode="min",
	save_top_k=args.save_top_k,
	every_n_train_steps=1000,
	save_last=True,
	),
	# Learning rate monitoring
	LearningRateMonitor(logging_interval="step"),
	# Rich progress bar
	RichProgressBar(),
	# Model summary
	RichModelSummary(max_depth=2),
	]

	# Validation callbacks for English coherence monitoring
	test_prompts = [
	"The capital of France is",
	"Water boils at",
	"The human body has",
	"Plants produce oxygen through",
	"The speed of light is",
	]

	# Create validators
	fast_validator = FastValidator(test_prompts=test_prompts)
	grammar_client = LanguageToolClient(url="http://localhost:8081")
	grammar_validator = GrammarValidator(
	client=grammar_client,
	test_prompts=test_prompts
	)

	# Add combined validation callback
	callbacks.append(
	CombinedValidationCallback(
	fast_validator=fast_validator,
	grammar_validator=grammar_validator,
	test_prompts=test_prompts,
	frequency=200,
	)
	)

	# Early stopping (optional, disable for continuous training)
	# callbacks.append(
	# EarlyStopping(
	# monitor='val_loss',
	# patience=10,
	# mode='min',
	# )
	# )

	# Setup logger
	logger = TensorBoardLogger(
	save_dir="lightning_logs",
	name="semv6_training",
	)

	# Create Lightning Trainer optimized for maximum speed
	trainer = pl.Trainer(
	# ==================== HARDWARE CONFIGURATION ====================
	accelerator=args.accelerator,
	devices=args.devices,
	precision=args.precision, # 16-mixed: FP16 computation, FP32 accumulation
	# For even faster training on RTX 3060, use "16" (pure FP16, less stable)
	# Default "16-mixed" is safest for ODE-based models

	# ==================== TRAINING HYPERPARAMETERS ====================
	max_steps=args.max_steps,
	# Gradient accumulation: effective_batch = batch_size * accumulate_grad_batches
	# With 6GB VRAM: batch_size=6, accumulate=4 -> effective=24 (good convergence)
	accumulate_grad_batches=args.accumulate_grad_batches,
	gradient_clip_val=args.gradient_clip_val,
	gradient_clip_algorithm="norm", # L2 norm clipping (more stable than 'value')

	# ==================== PERFORMANCE OPTIMIZATIONS ====================
	# cuDNN benchmark mode: optimize kernel selection for your hardware
	benchmark=True,
	# Allow non-deterministic ops for speed (disable reproducibility)
	deterministic=False,
	# TF32 is enabled via torch.set_float32_matmul_precision('high') at module level

	# ==================== LOGGING & MONITORING ====================
	logger=logger,
	callbacks=callbacks,
	log_every_n_steps=args.log_every_n_steps,
	val_check_interval=args.val_check_interval,
	enable_model_summary=False, # Skip model summary for faster startup
	enable_progress_bar=True, # Rich progress bar for real-time feedback

	# ==================== VALIDATION STRATEGY ====================
	# Disable validation for streaming (infinite dataset, no validation set)
	limit_val_batches=0,
	num_sanity_val_steps=0,

	# ==================== MEMORY OPTIMIZATION ====================
	# Gradient checkpointing: trade compute for memory (disabled by default)
	# Enable if OOM: enable_checkpointing=True
	enable_checkpointing=True,

	# ==================== DEBUGGING (PRODUCTION: DISABLED) ====================
	# Uncomment for debugging:
	# detect_anomaly=True, # Detect NaN/Inf propagation
	# profiler="pytorch", # PyTorch profiler (slows training 10-30%)

	# ==================== DISTRIBUTED TRAINING (SINGLE GPU) ====================
	strategy="auto", # Auto-detect best strategy (single GPU -> no strategy)
	sync_batchnorm=False, # No batch norm sync for single GPU
	)

	# Print training info
	model_init_time = time.perf_counter() - _SCRIPT_START_TIME
	print("=" * 80)
	print("SEM V6 PyTorch Lightning Training (Latent Prediction - Lester 2024)")
	print("=" * 80)
	print(f"⏱️ Model created in {model_init_time:.1f}s (target: {args.target_time}s)")
	print("-" * 80)
	print(f"Data directory: {args.data_dir}")
	print(f"Batch size: {args.batch_size}")
	print(f"Gradient accumulation: {args.accumulate_grad_batches}")
	print(f"Effective batch size: {args.batch_size * args.accumulate_grad_batches}")
	print(f"Max steps: {args.max_steps}")
	print(f"Precision: {args.precision}")
	print(f"Devices: {args.devices} {args.accelerator}")
	print("-" * 80)
	print("LEARNING RATE SCHEDULE:")
	print(f" Strategy: {args.lr_schedule.upper()}")
	print(f" Base LR: {args.learning_rate}")
	print(f" Weight decay (L2): {args.weight_decay}")
	print(f" Warmup percentage: {args.warmup_pct * 100:.1f}% ({int(args.max_steps * args.warmup_pct)} steps)")
	if args.lr_schedule == "onecycle":
	print(f" Initial LR: {args.learning_rate / args.lr_div_factor:.6f}")
	print(f" Max LR: {args.learning_rate}")
	print(f" Final LR: {args.learning_rate / args.lr_div_factor / args.lr_final_div_factor:.8f}")
	print(" Momentum cycling: 0.85 -> 0.95 -> 0.85")
	elif args.lr_schedule == "warmrestarts":
	print(f" Restarts every: {int(args.max_steps * args.warmup_pct)} steps")
	print(f" Period multiplier: 2.0x after each restart")
	print(f" Min LR: {args.learning_rate / 1000:.8f}")
	else: # cosine
	print(f" Min LR: {args.learning_rate / 100:.8f}")
	print("-" * 80)
	print(f"SDR dimension: {args.sdr_dim}")
	print(f"Latent dimension: {args.latent_dim} (compression ratio: {args.sdr_dim / args.latent_dim:.1f}x)")
	print(f"L1 sparsity coefficient: {args.lambda_sparse}")
	print("-" * 80)
	print("ODE SOLVER (Module C: SolitonPropagator):")
	print(f" Solver: {args.ode_solver}")
	print(f" Step size: {args.ode_step_size}")
	print("-" * 80)
	print("MULTI-TOKEN PREDICTION (MTP):")
	print(f" Enabled: {args.enable_mtp}")
	if args.enable_mtp:
	print(f" Num tokens: {args.num_predict}")
	print(f" Loss weight: {args.mtp_loss_weight}")
	print(f" Tokenizer: meta-llama/Meta-Llama-3-8B")
	print("=" * 80)

	# Train the model
	trainer.fit(model, datamodule=datamodule)

	print("=" * 80)
	print("Training complete!")
	print(f"Best checkpoint: {trainer.checkpoint_callback.best_model_path}")
	print("=" * 80)


	if __name__ == "__main__":
	main()