Upload config.py with huggingface_hub
config.py
ADDED
@@ -0,0 +1,79 @@
"""
Nano-GPT Configuration
L4-SAFE: Reduced memory usage
"""

import torch
from dataclasses import dataclass

@dataclass
class NanoGPTConfig:
    # Model Architecture
    vocab_size: int = 32000
    n_layers: int = 8      # REDUCED from 12
    n_heads: int = 8       # REDUCED from 12
    n_embd: int = 512      # REDUCED from 768
    block_size: int = 512  # REDUCED from 1024 (KEY!)
    dropout: float = 0.1
    bias: bool = True

    # Training Hyperparameters
    batch_size: int = 16   # REDUCED from 32 (KEY!)
    gradient_accumulation_steps: int = 8  # INCREASED from 4
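    # Effective batch: 16 * 8 = 128 sequences per optimizer step, matching the
    # old 32 * 4 = 128; tokens per step still halve (128 * 512 = 65,536 vs.
    # 128 * 1024 = 131,072) because block_size dropped from 1024 to 512.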
    learning_rate: float = 3e-4
    max_iters: int = 100000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # Learning Rate Scheduling
    decay_lr: bool = True
    warmup_iters: int = 2000
    lr_decay_iters: int = 100000
    min_lr: float = 3e-5

    # Evaluation & Logging
    eval_interval: int = 1000
    eval_iters: int = 100  # REDUCED from 200
    log_interval: int = 100

    # Checkpointing
    save_interval: int = 5000
    checkpoint_dir: str = "checkpoints"

    # Data
    dataset_mix: dict | None = None  # populated in __post_init__

    # Hardware
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    dtype: str = 'bfloat16'
    compile: bool = False  # DISABLED torch.compile (uses more memory!)
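    # Note: native bfloat16 needs an Ampere-or-newer GPU; check with
    # torch.cuda.is_bf16_supported() before relying on dtype='bfloat16'.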

    # Reproducibility
    seed: int = 42

    def __post_init__(self):
        if self.dataset_mix is None:
            self.dataset_mix = {
                'fineweb': 1.0
            }

    @property
    def n_params(self):
        return (2 * self.vocab_size * self.n_embd +
                12 * self.n_layers * self.n_embd * self.n_embd) / 1e6
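    # At the defaults: 2 * 32000 * 512 = 32,768,000 (embedding/output matrices)
    # plus 12 * 8 * 512**2 = 25,165,824 (12*d^2 per block: 4*d^2 attention,
    # 8*d^2 MLP), so ~57.9M total; biases and LayerNorm weights are ignored.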

config = NanoGPTConfig()

if __name__ == "__main__":
    print(f"Model size: ~{config.n_params:.1f}M parameters")
    print(f"Sequence length: {config.block_size}")
    print(f"Batch size: {config.batch_size}")
    print(f"Effective batch: {config.batch_size * config.gradient_accumulation_steps}")

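# Running python config.py with the defaults prints:
#   Model size: ~57.9M parameters
#   Sequence length: 512
#   Batch size: 16
#   Effective batch: 128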
@dataclass
class FinetuneConfig(NanoGPTConfig):
    """Config for instruction-tuned models"""
    finetune_lr: float = 1e-4
    finetune_epochs: int = 3
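A minimal usage sketch, assuming the file above is importable as config (the override values are illustrative, not part of the upload):

    from config import NanoGPTConfig, FinetuneConfig

    # Any field can be overridden per run; dataclass defaults fill in the rest.
    cfg = NanoGPTConfig(block_size=256, batch_size=8)
    print(f"~{cfg.n_params:.1f}M params")  # still ~57.9M: n_params ignores block_size

    # FinetuneConfig inherits every base field and adds the finetune_* knobs.
    ft = FinetuneConfig(finetune_lr=5e-5)
    assert ft.dataset_mix == {'fineweb': 1.0}  # __post_init__ runs for subclasses too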