anthonym21 commited on
Commit
dd9d5c4
·
verified ·
1 Parent(s): fd5b405

Upload folder using huggingface_hub

Browse files
Files changed (6) hide show
  1. config.json +14 -0
  2. generate.py +85 -0
  3. modeling_eve.py +286 -0
  4. pytorch_model.bin +3 -0
  5. requirements.txt +5 -0
  6. train.py +482 -0
config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architecture": "Eve-2-MoE",
3
+ "vocab_size": 50304,
4
+ "n_layer": 12,
5
+ "n_embd": 512,
6
+ "n_head": 8,
7
+ "head_dim": 64,
8
+ "block_size": 2048,
9
+ "num_experts": 8,
10
+ "top_k": 2,
11
+ "expert_intermediate_size": 1408,
12
+ "shared_expert_intermediate_size": 1408,
13
+ "rope_theta": 10000.0
14
+ }
generate.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Eve-2-MoE Inference
3
+ ===================
4
+ Quick generation script. Works with local weights or HuggingFace download.
5
+
6
+ Usage:
7
+ python generate.py --prompt "The future of AI is"
8
+ python generate.py --prompt "The future of AI is" --model_path ./model_final/pytorch_model.bin
9
+ python generate.py --prompt "The future of AI is" --hf_repo anthonym21/Eve-2-MoE-250M
10
+ """
11
+
12
+ import argparse
13
+ import torch
14
+ import tiktoken
15
+ from modeling_eve import ModelConfig, DeepSeekMoE
16
+
17
+
18
def load_model(model_path: str | None = None, hf_repo: str | None = None, device: str = "cuda"):
    """Build an Eve-2-MoE model and optionally load pretrained weights.

    Args:
        model_path: Local path to a ``pytorch_model.bin`` state dict.
        hf_repo: HuggingFace repo id. When given, the weights are downloaded
            and override any locally supplied ``model_path``.
        device: Target device string ("cuda" or "cpu").

    Returns:
        The model moved to ``device`` and switched to eval mode.
        NOTE: if both ``model_path`` and ``hf_repo`` are None, the model is
        returned with its random initialization — no warning is emitted.
    """
    config = ModelConfig()
    model = DeepSeekMoE(config)

    if hf_repo:
        # The downloaded checkpoint takes precedence over a local path.
        from huggingface_hub import hf_hub_download
        model_path = hf_hub_download(repo_id=hf_repo, filename="pytorch_model.bin")

    if model_path:
        # weights_only=True refuses arbitrary pickled objects in the checkpoint.
        state_dict = torch.load(model_path, map_location=device, weights_only=True)
        model.load_state_dict(state_dict)

    return model.to(device).eval()
31
+
32
+
33
def generate_streaming(model, prompt: str, max_tokens: int = 200,
                       temperature: float = 0.8, top_k: int = 50, device: str = "cuda"):
    """Sample up to ``max_tokens`` continuation tokens of ``prompt``, printing
    each token as it is produced (streaming effect).

    Temperature divides the logits before sampling; ``top_k`` truncates the
    distribution to the k most likely tokens. Uses the GPT-2 BPE vocabulary
    via tiktoken, matching the model's training tokenizer.
    """
    enc = tiktoken.get_encoding("gpt2")
    tokens = torch.tensor(enc.encode(prompt), dtype=torch.long, device=device).unsqueeze(0)

    print(prompt, end="", flush=True)

    # Fix: derive autocast's device_type from the actual device instead of
    # hard-coding "cuda". The old `enabled=(device == "cuda")` also disabled
    # bf16 for explicit device strings like "cuda:0".
    autocast_device = "cuda" if str(device).startswith("cuda") else "cpu"

    with torch.no_grad():
        for _ in range(max_tokens):
            # Crop the running context to the model's maximum sequence length.
            idx_cond = tokens[:, -model.config.block_size:]

            with torch.amp.autocast(device_type=autocast_device, dtype=torch.bfloat16,
                                    enabled=(autocast_device == "cuda")):
                logits, _ = model(idx_cond)

            # Only the final position predicts the next token.
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # Mask everything below the k-th largest logit.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("Inf")

            probs = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            tokens = torch.cat((tokens, idx_next), dim=1)

            print(enc.decode([idx_next.item()]), end="", flush=True)

    print("\n")
60
+
61
+
62
def main():
    """CLI entry point: parse flags, load weights, and stream one completion."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", type=str, default="The future of artificial intelligence is")
    parser.add_argument("--model_path", type=str, default=None)
    parser.add_argument("--hf_repo", type=str, default=None)
    parser.add_argument("--max_tokens", type=int, default=200)
    parser.add_argument("--temperature", type=float, default=0.8)
    parser.add_argument("--top_k", type=int, default=50)
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu")
    args = parser.parse_args()

    # With no explicit weight source, fall back to the published checkpoint.
    if not (args.model_path or args.hf_repo):
        args.hf_repo = "anthonym21/Eve-2-MoE-250M"

    print(f"Loading model on {args.device}...")
    model = load_model(args.model_path, args.hf_repo, args.device)
    param_count = sum(p.numel() for p in model.parameters())
    print(f"Parameters: {param_count / 1e6:.2f}M\n")

    generate_streaming(model, args.prompt, args.max_tokens, args.temperature, args.top_k, args.device)


if __name__ == "__main__":
    main()
modeling_eve.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Eve-2-MoE — Custom Mixture of Experts Language Model
3
+ Architecture: DeepSeek-V3 style Shared Expert + Top-K Routed Experts + RoPE
4
+ Author: Anthony Maio / Making Minds AI Research
5
+ License: MIT
6
+ """
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ import math
12
+ from dataclasses import dataclass
13
+
14
+
15
@dataclass
class ModelConfig:
    """Configuration for Eve-2-MoE.

    Defaults match the shipped config.json for the ~250M release; train.py
    overrides a subset from the CLI and serializes these fields on save.
    """

    # Model dimensions
    vocab_size: int = 50304
    n_layer: int = 12
    n_embd: int = 512          # must equal n_head * head_dim (attention reshapes rely on it)
    n_head: int = 8
    head_dim: int = 64
    block_size: int = 2048     # max sequence length; also sizes the RoPE table

    # MoE settings
    num_experts: int = 8       # routed experts per layer
    top_k: int = 2             # experts selected per token by the router
    expert_intermediate_size: int = 1408
    shared_expert_intermediate_size: int = 1408
    router_aux_loss_coef: float = 0.01  # weight of the load-balancing loss added in DeepSeekMoE.forward

    # Training settings
    use_checkpointing: bool = False  # Gradient checkpointing (saves VRAM, costs speed)

    # RoPE settings
    rope_theta: float = 10000.0  # base of the rotary frequency spectrum
39
+
40
+
41
class RMSNorm(nn.Module):
    """Root-mean-square normalization: scale features by 1/RMS, then by a learned gain."""

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        # Per-feature gain, initialized to identity.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Mean of squares over the feature axis, stabilized by eps.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_sq + self.eps)
        return self.weight * (x * inv_rms)
51
+
52
+
53
def precompute_rope_freqs(head_dim: int, max_seq_len: int, theta: float = 10000.0,
                          device: torch.device = None) -> torch.Tensor:
    """Build the complex rotation table for RoPE.

    Entry [t, j] is exp(i * t * theta^(-2j / head_dim)), so position 0 maps
    to the identity rotation.

    Returns:
        Complex64 tensor of shape (max_seq_len, head_dim // 2).
    """
    exponents = torch.arange(0, head_dim, 2, device=device).float() / head_dim
    inv_freq = 1.0 / (theta ** exponents)
    positions = torch.arange(max_seq_len, device=device).float()
    angles = torch.outer(positions, inv_freq)
    # Unit-magnitude complex numbers carrying only the phase.
    return torch.polar(torch.ones_like(angles), angles)
63
+
64
+
65
def apply_rope(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Rotate features with precomputed rotary position embeddings.

    Args:
        x: real tensor of shape (B, n_head, T, head_dim).
        freqs_cis: complex tensor of shape (>=T, head_dim // 2).
    Returns:
        Tensor with the same shape and dtype as ``x``.
    """
    batch, heads, seq_len, dim = x.shape
    # Pair adjacent channels and reinterpret each pair as one complex number.
    as_complex = torch.view_as_complex(x.float().reshape(batch, heads, seq_len, dim // 2, 2))
    # Trim the table to the current length and add broadcast axes for (B, H).
    rotation = freqs_cis[:seq_len].unsqueeze(0).unsqueeze(0)
    rotated = as_complex * rotation
    # Unpack complex back into interleaved real channels, restoring dtype.
    return torch.view_as_real(rotated).reshape(batch, heads, seq_len, dim).type_as(x)
82
+
83
+
84
class MLP(nn.Module):
    """SwiGLU feed-forward block: down( silu(gate(x)) * up(x) ), all bias-free."""

    def __init__(self, config: ModelConfig, intermediate_size: int = None):
        super().__init__()
        # Fall back to the config's expert width when no override is given.
        width = intermediate_size or config.expert_intermediate_size
        self.w1 = nn.Linear(config.n_embd, width, bias=False)      # gate projection
        self.w2 = nn.Linear(config.n_embd, width, bias=False)      # up projection
        self.c_proj = nn.Linear(width, config.n_embd, bias=False)  # down projection

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gated = F.silu(self.w1(x)) * self.w2(x)
        return self.c_proj(gated)
96
+
97
+
98
class SharedMoE(nn.Module):
    """Mixture of Experts with one shared expert and K routed experts.

    DeepSeek-V3 style: a shared expert processes all tokens while a top-k
    router selects from a pool of specialized experts per token.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config
        self.top_k = config.top_k

        # Shared expert (always active)
        self.shared_expert = MLP(config, config.shared_expert_intermediate_size)

        # Routed experts
        self.experts = nn.ModuleList([MLP(config) for _ in range(config.num_experts)])
        self.router = nn.Linear(config.n_embd, config.num_experts, bias=False)

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # Returns (shared + routed expert output, router load-balancing loss).
        B, T, C = x.shape

        # Shared path
        shared_out = self.shared_expert(x)

        # Router
        logits = self.router(x)
        probs = F.softmax(logits, dim=-1)

        # Top-K selection with normalized weights
        top_k_weights, top_k_indices = torch.topk(probs, self.top_k, dim=-1)
        # Renormalize so the k selected gate values sum to 1 per token.
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)

        # Load balancing auxiliary loss
        # Sum of squared mean gate probabilities scaled by num_experts; since
        # expert_usage sums to 1, this is minimized when usage is uniform.
        flat_probs = probs.view(-1, self.config.num_experts)
        expert_usage = flat_probs.mean(dim=0)
        aux_loss = torch.sum(expert_usage * expert_usage) * self.config.num_experts

        # Route tokens to experts
        routed_out = torch.zeros_like(x)
        flat_x = x.view(-1, C)
        flat_indices = top_k_indices.view(-1, self.top_k)
        flat_weights = top_k_weights.view(-1, self.top_k)

        for i, expert in enumerate(self.experts):
            # Rows (tokens) routed to expert i, and at which of their k slots.
            mask = flat_indices == i
            batch_idx, rank_idx = torch.where(mask)

            if batch_idx.numel() > 0:
                expert_input = flat_x[batch_idx]
                expert_output = expert(expert_input)
                weight = flat_weights[batch_idx, rank_idx].unsqueeze(-1)
                # Scatter-accumulate the gated expert output into token rows.
                routed_out.view(-1, C).index_add_(0, batch_idx, expert_output * weight)

        return shared_out + routed_out, aux_loss
153
+
154
+
155
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention; queries and keys are rotated with RoPE."""

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.n_head = config.n_head
        self.head_dim = config.head_dim
        self.n_embd = config.n_embd

        # Fused QKV projection plus an output projection, both bias-free.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=False)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
        batch, seq_len, width = x.shape

        # One matmul produces Q, K, V; split along the channel axis.
        query, key, value = self.c_attn(x).split(self.n_embd, dim=2)

        def to_heads(t: torch.Tensor) -> torch.Tensor:
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(batch, seq_len, self.n_head, self.head_dim).transpose(1, 2)

        query, key, value = to_heads(query), to_heads(key), to_heads(value)

        # Rotary position embeddings on queries and keys only.
        query = apply_rope(query, freqs_cis)
        key = apply_rope(key, freqs_cis)

        # SDPA dispatches to the fastest available kernel (flash/cuDNN/math).
        attended = F.scaled_dot_product_attention(query, key, value, is_causal=True)

        # Re-merge the heads and project back to the residual stream width.
        attended = attended.transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.c_proj(attended)
186
+
187
+
188
class Block(nn.Module):
    """Pre-norm transformer layer: attention sublayer, then MoE feed-forward sublayer."""

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.ln_1 = RMSNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = RMSNorm(config.n_embd)
        self.mlp = SharedMoE(config)

    def forward(self, x: torch.Tensor, freqs_cis: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        # Residual connection around attention.
        x = x + self.attn(self.ln_1(x), freqs_cis)
        # Residual connection around the MoE FFN; the router's balance loss
        # is handed upward for accumulation across layers.
        moe_out, balance_loss = self.mlp(self.ln_2(x))
        return x + moe_out, balance_loss
203
+
204
+
205
class DeepSeekMoE(nn.Module):
    """Eve-2-MoE: DeepSeek-V3 style Mixture of Experts language model.

    Architecture:
    - Token embeddings (no learned position embeddings — uses RoPE)
    - N transformer blocks with RoPE attention + shared MoE FFN
    - RMSNorm + tied linear head
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=RMSNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # Weight tying
        self.transformer.wte.weight = self.lm_head.weight

        # Precompute RoPE frequencies (registered as buffer so they move with .to(device))
        freqs_cis = precompute_rope_freqs(config.head_dim, config.block_size, config.rope_theta)
        self.register_buffer("freqs_cis", freqs_cis, persistent=False)

        # Initialize weights
        # NOTE(review): apply() runs after tying, so the shared embedding/head
        # tensor is re-drawn twice (once per module) — harmless but redundant.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # GPT-style init: N(0, 0.02) weights, zero biases.
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx: torch.Tensor, targets: torch.Tensor = None) -> tuple[torch.Tensor, torch.Tensor]:
        """Run the transformer.

        Args:
            idx: (B, T) token ids with T <= block_size.
            targets: optional (B, T) next-token labels.
        Returns:
            (logits, loss) — loss is None without targets; otherwise cross
            entropy plus the router auxiliary loss summed over all layers.
        """
        B, T = idx.shape
        assert T <= self.config.block_size, f"Sequence length {T} exceeds block_size {self.config.block_size}"

        x = self.transformer.wte(idx)

        total_aux_loss = 0.0
        for block in self.transformer.h:
            if self.config.use_checkpointing and self.training:
                # Recompute activations in backward to trade compute for VRAM.
                x, aux_loss = torch.utils.checkpoint.checkpoint(
                    block, x, self.freqs_cis, use_reentrant=False
                )
            else:
                x, aux_loss = block(x, self.freqs_cis)
            total_aux_loss += aux_loss

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            # Encourage balanced expert usage across the whole stack.
            loss = loss + self.config.router_aux_loss_coef * total_aux_loss

        return logits, loss

    @torch.no_grad()
    def generate(self, idx: torch.Tensor, max_new_tokens: int,
                 temperature: float = 0.8, top_k: int = 50) -> torch.Tensor:
        """Autoregressive generation with temperature and top-k sampling."""
        for _ in range(max_new_tokens):
            # Keep only the most recent block_size tokens as context.
            idx_cond = idx[:, -self.config.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # Mask everything below the k-th largest logit.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float("Inf")

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68b3a3b00732a4977ef4c27d6dfbcc5ca70f73d47047103c108baac3a5d2108a
3
+ size 1088054098
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch>=2.2.0
2
+ tiktoken
3
+ datasets
4
+ huggingface_hub
5
+ wandb
train.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Eve-2-MoE Training Script — Multi-GPU DDP
3
+ ==========================================
4
+ Usage:
5
+ Single GPU: python train.py
6
+ Multi-GPU: torchrun --nproc_per_node=2 train.py
7
+ 4x GPU: torchrun --nproc_per_node=4 train.py
8
+
9
+ Override config: torchrun --nproc_per_node=2 train.py --max_steps 15000 --batch_size 48
10
+
11
+ Author: Anthony Maio / Making Minds AI Research
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import math
17
+ import time
18
+ import json
19
+ import argparse
20
+ import logging
21
+ from pathlib import Path
22
+ from contextlib import nullcontext
23
+
24
+ import torch
25
+ import torch.nn as nn
26
+ import torch.nn.functional as F
27
+ import torch.distributed as dist
28
+ from torch.nn.parallel import DistributedDataParallel as DDP
29
+
30
+ import tiktoken
31
+ from datasets import load_dataset
32
+
33
+ from modeling_eve import ModelConfig, DeepSeekMoE
34
+
35
+ # ---------------------------------------------------------------------------
36
+ # Distributed setup
37
+ # ---------------------------------------------------------------------------
38
+
39
def setup_distributed():
    """Detect torchrun vs. plain launch and initialize the process group.

    Under torchrun (RANK present in the environment) this joins the NCCL
    process group and pins the local CUDA device; otherwise it falls back to
    single-process mode on whichever device is available.

    Returns:
        (rank, world_size, local_rank, device, is_master)
    """
    launched_by_torchrun = "RANK" in os.environ
    if launched_by_torchrun:
        dist.init_process_group(backend="nccl")
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(local_rank)
        device = torch.device(f"cuda:{local_rank}")
    else:
        rank, world_size, local_rank = 0, 1, 0
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    return rank, world_size, local_rank, device, rank == 0
56
+
57
+
58
def cleanup_distributed():
    """Tear down the DDP process group if one exists; safe to call unconditionally."""
    if not dist.is_initialized():
        return
    dist.destroy_process_group()
61
+
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Data loading
65
+ # ---------------------------------------------------------------------------
66
+
67
class StreamingDataLoader:
    """Streams tokenized batches from FineWeb-Edu.

    Each DDP rank skips interleaved samples so no two GPUs see the same data.
    """

    def __init__(self, batch_size: int, block_size: int, rank: int = 0,
                 world_size: int = 1, dataset_name: str = "sample-10BT"):
        self.batch_size = batch_size
        self.block_size = block_size
        self.rank = rank
        self.world_size = world_size
        self.dataset_name = dataset_name
        # GPT-2 BPE tokenizer — matches the vocabulary the model is trained on.
        self.enc = tiktoken.get_encoding("gpt2")
        self._init_stream()

    def _init_stream(self):
        # (Re)open the streaming dataset; also used to restart after exhaustion.
        ds = load_dataset("HuggingFaceFW/fineweb-edu", name=self.dataset_name,
                          split="train", streaming=True)
        # Shard the stream across DDP ranks
        if self.world_size > 1:
            ds = ds.shard(num_shards=self.world_size, index=self.rank)
        self.iter_dataset = iter(ds)

    def get_batch(self) -> tuple[torch.Tensor, torch.Tensor]:
        """Return an (inputs, targets) pair of shape (batch_size, block_size).

        Documents are concatenated until batch_size * block_size + 1 tokens
        are collected; targets are inputs shifted right by one token.
        NOTE(review): tokens gathered past the cutoff are discarded (some
        data is skipped each batch), and no end-of-text separator is inserted
        between concatenated documents — confirm both are intentional.
        """
        total_tokens = self.batch_size * self.block_size

        batch_tokens = []
        while len(batch_tokens) < total_tokens + 1:
            try:
                text = next(self.iter_dataset)["text"]
                tokens = self.enc.encode(text, allowed_special={"<|endoftext|>"})
                batch_tokens.extend(tokens)
            except StopIteration:
                # Wrap around: reopen the stream and keep filling the batch.
                print(f"[Rank {self.rank}] Dataset exhausted, restarting stream...")
                self._init_stream()

        data = torch.tensor(batch_tokens[:total_tokens + 1], dtype=torch.long)
        x = data[:total_tokens].view(self.batch_size, self.block_size)
        y = data[1:total_tokens + 1].view(self.batch_size, self.block_size)
        return x, y
108
+
109
+
110
class ValidationLoader:
    """WikiText-2 validation set."""

    def __init__(self, block_size: int, device: torch.device):
        self.block_size = block_size
        self.device = device
        enc = tiktoken.get_encoding("gpt2")

        # Tokenize the entire test split once and keep it resident on `device`.
        ds = load_dataset("wikitext", "wikitext-2-v1", split="test")
        text = "\n\n".join(ds["text"])
        tokens = enc.encode(text, allowed_special={"<|endoftext|>"})
        self.data = torch.tensor(tokens, dtype=torch.long, device=device)

    @torch.no_grad()
    def estimate_loss(self, model, eval_iters: int = 50, batch_size: int = 32) -> float:
        """Average LM loss over `eval_iters` random windows; restores train mode.

        NOTE(review): autocast is hard-coded to device_type="cuda" — confirm
        this path is only ever reached on CUDA devices.
        """
        model.eval()
        losses = torch.zeros(eval_iters, device=self.device)

        for k in range(eval_iters):
            # Sample random window start positions from the token stream.
            ix = torch.randint(len(self.data) - self.block_size, (batch_size,))
            x = torch.stack([self.data[i:i + self.block_size] for i in ix])
            y = torch.stack([self.data[i + 1:i + self.block_size + 1] for i in ix])

            with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                _, loss = model(x, y)
            losses[k] = loss.item()

        model.train()
        return losses.mean().item()
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Learning rate schedule
143
+ # ---------------------------------------------------------------------------
144
+
145
def get_lr(step: int, max_steps: int, warmup_steps: int, peak_lr: float, min_lr_ratio: float = 0.1) -> float:
    """Linear-warmup + cosine-decay learning-rate schedule.

    Ramps linearly up to ``peak_lr`` over ``warmup_steps``, then follows a
    half cosine down to ``peak_lr * min_lr_ratio`` at ``max_steps``; past
    that it stays clamped at the floor.
    """
    floor = peak_lr * min_lr_ratio

    if step < warmup_steps:
        # Warmup: step 0 already receives a non-zero learning rate.
        return peak_lr * (step + 1) / (warmup_steps + 1)
    if step > max_steps:
        # Past the schedule (shouldn't normally happen) — clamp to the floor.
        return floor

    # Cosine decay from peak down to the floor.
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
    return floor + cosine * (peak_lr - floor)
161
+
162
+
163
+ # ---------------------------------------------------------------------------
164
+ # Checkpointing
165
+ # ---------------------------------------------------------------------------
166
+
167
def save_checkpoint(model, optimizer, step: int, loss: float, val_loss: float,
                    config: ModelConfig, checkpoint_dir: Path, is_ddp: bool):
    """Write a resumable checkpoint: weights, optimizer state, and a config snapshot."""
    # Unwrap the DDP container so keys match the bare module.
    target = model.module if is_ddp else model
    arch_fields = (
        "vocab_size", "n_layer", "n_embd", "n_head", "head_dim", "block_size",
        "num_experts", "top_k", "expert_intermediate_size",
        "shared_expert_intermediate_size", "rope_theta",
    )
    payload = {
        "step": step,
        "model_state_dict": target.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "train_loss": loss,
        "val_loss": val_loss,
        # Snapshot the architecture so the checkpoint is self-describing.
        "config": {name: getattr(config, name) for name in arch_fields},
    }
    step_path = checkpoint_dir / f"step_{step}.pt"
    torch.save(payload, step_path)
    print(f" Checkpoint saved: {step_path}")

    # Duplicate under a stable name so --resume can always point at latest.pt.
    torch.save(payload, checkpoint_dir / "latest.pt")
198
+
199
+
200
def save_final_model(model, config: ModelConfig, output_dir: Path, is_ddp: bool):
    """Export bare weights plus a config.json, laid out for HuggingFace upload."""
    # Unwrap the DDP container so keys match the bare module.
    target = model.module if is_ddp else model
    output_dir.mkdir(parents=True, exist_ok=True)

    torch.save(target.state_dict(), output_dir / "pytorch_model.bin")

    arch_fields = (
        "vocab_size", "n_layer", "n_embd", "n_head", "head_dim", "block_size",
        "num_experts", "top_k", "expert_intermediate_size",
        "shared_expert_intermediate_size", "rope_theta",
    )
    manifest = {"architecture": "Eve-2-MoE"}
    manifest.update({name: getattr(config, name) for name in arch_fields})
    with open(output_dir / "config.json", "w") as f:
        json.dump(manifest, f, indent=2)

    print(f" Final model saved to {output_dir}")
225
+
226
+
227
+ # ---------------------------------------------------------------------------
228
+ # Main training loop
229
+ # ---------------------------------------------------------------------------
230
+
231
def parse_args():
    """Build and parse the training CLI; defaults reproduce the 250M run."""
    parser = argparse.ArgumentParser(description="Eve-2-MoE Training")

    # Architecture (defaults match 250M config)
    for flag, default in (("--n_layer", 12), ("--n_embd", 512), ("--n_head", 8),
                          ("--num_experts", 8), ("--block_size", 2048)):
        parser.add_argument(flag, type=int, default=default)

    # Training
    parser.add_argument("--max_steps", type=int, default=7500,
                        help="Total training steps. 7500 steps ≈ 500M tokens (1hr single B200)")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="Per-GPU batch size")
    parser.add_argument("--learning_rate", type=float, default=5e-4)
    parser.add_argument("--warmup_steps", type=int, default=200)
    parser.add_argument("--weight_decay", type=float, default=0.1)
    parser.add_argument("--grad_clip", type=float, default=1.0)
    parser.add_argument("--min_lr_ratio", type=float, default=0.1,
                        help="Minimum LR as fraction of peak (cosine decay floor)")

    # Data
    parser.add_argument("--dataset", type=str, default="sample-10BT",
                        help="FineWeb-Edu subset name")

    # Checkpointing
    parser.add_argument("--save_every", type=int, default=500)
    parser.add_argument("--val_every", type=int, default=500)
    parser.add_argument("--checkpoint_dir", type=str, default="checkpoints")
    parser.add_argument("--output_dir", type=str, default="model_final")

    # Misc
    parser.add_argument("--compile", action="store_true", default=True,
                        help="Use torch.compile (recommended for B200/H100)")
    parser.add_argument("--no_compile", action="store_true",
                        help="Disable torch.compile")
    parser.add_argument("--wandb_project", type=str, default="Eve-2-MoE",
                        help="WandB project name (empty to disable)")
    parser.add_argument("--wandb_run", type=str, default=None,
                        help="WandB run name")
    parser.add_argument("--resume", type=str, default=None,
                        help="Path to checkpoint to resume from")
    parser.add_argument("--use_checkpointing", action="store_true",
                        help="Enable gradient checkpointing (saves VRAM)")

    return parser.parse_args()
278
+
279
+
280
def main():
    """Full training run: DDP setup, model build, streaming data, train loop,
    periodic validation/checkpointing, final export for HuggingFace upload."""
    args = parse_args()

    # --- Distributed setup ---
    rank, world_size, local_rank, device, is_master = setup_distributed()

    if is_master:
        print(f"{'=' * 60}")
        print(f" Eve-2-MoE Training")
        print(f" GPUs: {world_size} | Device: {torch.cuda.get_device_name(device)}")
        print(f" Steps: {args.max_steps} | Batch/GPU: {args.batch_size}")
        print(f" Global batch: {args.batch_size * world_size} × {args.block_size} = "
              f"{args.batch_size * world_size * args.block_size:,} tokens/step")
        print(f" Total tokens: ~{args.max_steps * args.batch_size * world_size * args.block_size / 1e9:.1f}B")
        print(f"{'=' * 60}")

    # --- Model ---
    config = ModelConfig(
        n_layer=args.n_layer,
        n_embd=args.n_embd,
        n_head=args.n_head,
        num_experts=args.num_experts,
        block_size=args.block_size,
        use_checkpointing=args.use_checkpointing,
    )

    model = DeepSeekMoE(config).to(device)

    if is_master:
        param_count = sum(p.numel() for p in model.parameters())
        print(f" Parameters: {param_count / 1e6:.2f}M")

    # Compile
    # NOTE(review): --compile defaults to True, so compilation is only skipped
    # via --no_compile. Checkpoints saved from a compiled model may carry
    # "_orig_mod." key prefixes — confirm --resume round-trips.
    if args.compile and not args.no_compile:
        if is_master:
            print(" Compiling model with torch.compile...")
        model = torch.compile(model)

    # DDP wrapper
    # find_unused_parameters=True: with top-k routing, some experts may get no
    # tokens in a step and thus receive no gradient.
    is_ddp = world_size > 1
    if is_ddp:
        model = DDP(model, device_ids=[local_rank], find_unused_parameters=True)

    raw_model = model.module if is_ddp else model

    # --- Optimizer ---
    optimizer = torch.optim.AdamW(
        raw_model.parameters(),
        lr=args.learning_rate,
        betas=(0.9, 0.95),
        weight_decay=args.weight_decay,
    )

    # --- Resume from checkpoint ---
    start_step = 0
    if args.resume:
        if is_master:
            print(f" Resuming from {args.resume}...")
        ckpt = torch.load(args.resume, map_location=device)
        raw_model.load_state_dict(ckpt["model_state_dict"])
        optimizer.load_state_dict(ckpt["optimizer_state_dict"])
        start_step = ckpt["step"] + 1
        if is_master:
            print(f" Resumed at step {start_step}")

    # --- Data ---
    train_loader = StreamingDataLoader(
        batch_size=args.batch_size,
        block_size=config.block_size,
        rank=rank,
        world_size=world_size,
        dataset_name=args.dataset,
    )

    # Validation runs only on rank 0.
    val_loader = None
    if is_master:
        val_loader = ValidationLoader(config.block_size, device)

    # --- Checkpoint directory ---
    checkpoint_dir = Path(args.checkpoint_dir)
    if is_master:
        checkpoint_dir.mkdir(parents=True, exist_ok=True)

    # --- WandB ---
    wandb_enabled = False
    if is_master and args.wandb_project:
        try:
            import wandb
            wandb.init(
                project=args.wandb_project,
                name=args.wandb_run or f"eve2-{world_size}gpu-{args.max_steps}steps",
                config=vars(args),
            )
            wandb_enabled = True
        except ImportError:
            print(" WandB not installed, skipping.")

    # --- Training loop ---
    model.train()
    tokens_per_step = args.batch_size * world_size * config.block_size

    if is_master:
        print(f"\n Starting training from step {start_step}...\n")

    for step in range(start_step, args.max_steps):
        t0 = time.time()

        # Learning rate schedule
        lr = get_lr(step, args.max_steps, args.warmup_steps, args.learning_rate, args.min_lr_ratio)
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr

        # Get batch
        x, y = train_loader.get_batch()
        x, y = x.to(device), y.to(device)

        # Forward
        with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
            logits, loss = model(x, y)

        # Backward
        optimizer.zero_grad(set_to_none=True)
        loss.backward()

        # Gradient clipping
        if args.grad_clip > 0:
            grad_norm = torch.nn.utils.clip_grad_norm_(raw_model.parameters(), args.grad_clip)
        else:
            grad_norm = None

        optimizer.step()

        # Timing
        # Synchronize so wall-clock time reflects finished GPU work.
        torch.cuda.synchronize()
        t1 = time.time()
        dt_ms = (t1 - t0) * 1000
        tok_per_sec = tokens_per_step / (t1 - t0)

        # --- Logging ---
        if is_master and step % 10 == 0:
            grad_str = f" | Grad: {grad_norm:.2f}" if grad_norm is not None else ""
            print(f" Step {step:>6d}/{args.max_steps} | Loss: {loss.item():.4f} | "
                  f"LR: {lr:.2e} | {tok_per_sec:,.0f} tok/s | {dt_ms:.0f}ms{grad_str}")

            if wandb_enabled:
                import wandb
                log = {
                    "train_loss": loss.item(),
                    "lr": lr,
                    "tokens_per_sec": tok_per_sec,
                    "step_time_ms": dt_ms,
                }
                if grad_norm is not None:
                    log["grad_norm"] = grad_norm.item() if isinstance(grad_norm, torch.Tensor) else grad_norm
                wandb.log(log, step=step)

        # --- Validation ---
        if is_master and val_loader and step > 0 and step % args.val_every == 0:
            val_loss = val_loader.estimate_loss(raw_model)
            print(f" >>> Validation Loss: {val_loss:.4f}")
            if wandb_enabled:
                wandb.log({"val_loss": val_loss}, step=step)

            # Save checkpoint
            save_checkpoint(model, optimizer, step, loss.item(), val_loss,
                            config, checkpoint_dir, is_ddp)

        # --- Periodic save (no val) ---
        elif is_master and step > 0 and step % args.save_every == 0 and step % args.val_every != 0:
            save_checkpoint(model, optimizer, step, loss.item(), -1.0,
                            config, checkpoint_dir, is_ddp)

    # --- Final validation & save ---
    # NOTE(review): `loss` is unbound here if start_step >= max_steps (e.g.
    # resuming a finished run) — the final save would raise NameError.
    if is_master:
        print(f"\n{'=' * 60}")
        print(" Training complete!")

        if val_loader:
            final_val = val_loader.estimate_loss(raw_model)
            print(f" Final Val Loss: {final_val:.4f}")

        # Save final model for HF upload
        output_dir = Path(args.output_dir)
        save_final_model(model, config, output_dir, is_ddp)

        # Save final checkpoint too
        save_checkpoint(model, optimizer, args.max_steps, loss.item(),
                        final_val if val_loader else -1.0,
                        config, checkpoint_dir, is_ddp)

        print(f"\n Upload to HuggingFace:")
        print(f" huggingface-cli upload anthonym21/Eve-2-MoE-250M {output_dir}/")
        print(f"{'=' * 60}")

    if wandb_enabled:
        import wandb
        wandb.finish()

    cleanup_distributed()


if __name__ == "__main__":
    main()