Arko007 committed
Commit eae9a7e · verified · 1 Parent(s): f55bc2e

Upload config.py with huggingface_hub

Files changed (1)
  1. config.py +79 -0
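
For context, a minimal sketch of how a file like this is typically pushed, since the commit message says it was uploaded with huggingface_hub. The repo_id below is a placeholder (the target repo is not shown on this page); upload_file is the library's standard single-file upload call.

from huggingface_hub import upload_file

upload_file(
    path_or_fileobj="config.py",    # local file to push
    path_in_repo="config.py",       # destination path inside the repo
    repo_id="Arko007/<repo-name>",  # placeholder, not taken from this page
    commit_message="Upload config.py with huggingface_hub",
)
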
config.py ADDED
@@ -0,0 +1,79 @@
+ """
+ Nano-GPT Configuration
+ L4-SAFE: Reduced memory usage
+ """
+
+ import torch
+ from dataclasses import dataclass
+
+ @dataclass
+ class NanoGPTConfig:
+     # Model Architecture
+     vocab_size: int = 32000
+     n_layers: int = 8      # REDUCED from 12
+     n_heads: int = 8       # REDUCED from 12
+     n_embd: int = 512      # REDUCED from 768
+     block_size: int = 512  # REDUCED from 1024 (KEY!)
+     dropout: float = 0.1
+     bias: bool = True
+
+     # Training Hyperparameters
+     batch_size: int = 16                  # REDUCED from 32 (KEY!)
+     gradient_accumulation_steps: int = 8  # INCREASED from 4
+     learning_rate: float = 3e-4
+     max_iters: int = 100000
+     weight_decay: float = 0.1
+     beta1: float = 0.9
+     beta2: float = 0.95
+     grad_clip: float = 1.0
+
+     # Learning Rate Scheduling
+     decay_lr: bool = True
+     warmup_iters: int = 2000
+     lr_decay_iters: int = 100000
+     min_lr: float = 3e-5
+
+     # Evaluation & Logging
+     eval_interval: int = 1000
+     eval_iters: int = 100  # REDUCED from 200
+     log_interval: int = 100
+
+     # Checkpointing
+     save_interval: int = 5000
+     checkpoint_dir: str = "checkpoints"
+
+     # Data
+     dataset_mix: dict = None
+
+     # Hardware
+     device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
+     dtype: str = 'bfloat16'
+     compile: bool = False  # DISABLED torch.compile (uses more memory!)
+
+     # Reproducibility
+     seed: int = 42
+
+     def __post_init__(self):
+         if self.dataset_mix is None:
+             self.dataset_mix = {
+                 'fineweb': 1.0
+             }
+
+     @property
+     def n_params(self):
+         return (2 * self.vocab_size * self.n_embd +
+                 12 * self.n_layers * self.n_embd * self.n_embd) / 1e6
+
+ config = NanoGPTConfig()
+
+ if __name__ == "__main__":
+     print(f"Model size: ~{config.n_params:.1f}M parameters")
+     print(f"Sequence length: {config.block_size}")
+     print(f"Batch size: {config.batch_size}")
+     print(f"Effective batch: {config.batch_size * config.gradient_accumulation_steps}")
+
+ @dataclass
+ class FinetuneConfig(NanoGPTConfig):
+     """Config for instruction-tuned models"""
+     finetune_lr: float = 1e-4
+     finetune_epochs: int = 3
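
A short usage sketch, assuming config.py is importable from the training code (all names below come from the file above). With these defaults, n_params evaluates to 2·32000·512 + 12·8·512² = 57,933,824, i.e. roughly 57.9M parameters, and one optimizer step sees 16 × 8 = 128 sequences (128 × 512 = 65,536 tokens). block_size and batch_size are the fields flagged KEY! because activation memory grows with their product.

from config import config, FinetuneConfig

print(f"~{config.n_params:.1f}M parameters")  # ~57.9M with the defaults
print(config.dataset_mix)                     # {'fineweb': 1.0}, set in __post_init__

# Any field can be overridden at construction time:
ft = FinetuneConfig(block_size=256, batch_size=8)
print(ft.finetune_lr, ft.finetune_epochs)     # 0.0001 3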