| """ | |
| Nano-GPT Configuration | |
| L4-SAFE: Reduced memory usage | |
| """ | |
| import torch | |
| from dataclasses import dataclass | |
@dataclass
class NanoGPTConfig:
    # Model Architecture
    vocab_size: int = 32000
    n_layers: int = 8        # REDUCED from 12
    n_heads: int = 8         # REDUCED from 12
    n_embd: int = 512        # REDUCED from 768
    block_size: int = 512    # REDUCED from 1024 (KEY!)
    dropout: float = 0.1
    bias: bool = True
    # Training Hyperparameters
    batch_size: int = 16                  # REDUCED from 32 (KEY!)
    gradient_accumulation_steps: int = 8  # INCREASED from 4
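    # With these defaults the effective batch size is 16 * 8 = 128 sequences per optimizer step.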
    learning_rate: float = 3e-4
    max_iters: int = 100000
    weight_decay: float = 0.1
    beta1: float = 0.9
    beta2: float = 0.95
    grad_clip: float = 1.0

    # Learning Rate Scheduling (see the illustrative get_lr sketch below)
    decay_lr: bool = True
    warmup_iters: int = 2000
    lr_decay_iters: int = 100000
    min_lr: float = 3e-5
    # Evaluation & Logging
    eval_interval: int = 1000
    eval_iters: int = 100    # REDUCED from 200
    log_interval: int = 100

    # Checkpointing
    save_interval: int = 5000
    checkpoint_dir: str = "checkpoints"

    # Data
    dataset_mix: dict = None

    # Hardware
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    dtype: str = 'bfloat16'
    compile: bool = False    # DISABLED torch.compile (uses more memory!)

    # Reproducibility
    seed: int = 42
    def __post_init__(self):
        if self.dataset_mix is None:
            self.dataset_mix = {
                'fineweb': 1.0
            }
    @property
    def n_params(self):
        """Rough parameter count in millions: embedding/output matrices
        (2 * vocab_size * n_embd) plus ~12 * n_embd**2 per transformer block."""
        return (2 * self.vocab_size * self.n_embd +
                12 * self.n_layers * self.n_embd * self.n_embd) / 1e6
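# Illustrative only: one way the scheduling fields above (decay_lr, warmup_iters,
# lr_decay_iters, min_lr) could drive a linear-warmup + cosine-decay schedule.
# The actual training loop may implement its scheduling differently.
import math

def get_lr(it: int, cfg: NanoGPTConfig) -> float:
    """Learning rate at iteration `it` under a warmup + cosine-decay schedule (sketch)."""
    if not cfg.decay_lr:
        return cfg.learning_rate
    if it < cfg.warmup_iters:
        # linear warmup from 0 up to learning_rate
        return cfg.learning_rate * (it + 1) / cfg.warmup_iters
    if it > cfg.lr_decay_iters:
        return cfg.min_lr
    # cosine decay from learning_rate down to min_lr
    decay_ratio = (it - cfg.warmup_iters) / (cfg.lr_decay_iters - cfg.warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return cfg.min_lr + coeff * (cfg.learning_rate - cfg.min_lr)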
config = NanoGPTConfig()
if __name__ == "__main__":
    print(f"Model size: ~{config.n_params:.1f}M parameters")
    print(f"Sequence length: {config.block_size}")
    print(f"Batch size: {config.batch_size}")
    print(f"Effective batch: {config.batch_size * config.gradient_accumulation_steps}")