Delete configs
- configs/examples/demo.yaml +0 -48
- configs/examples/pico-decoder-large.yaml +0 -35
- configs/examples/pico-decoder-medium.yaml +0 -35
- configs/examples/pico-decoder-small.yaml +0 -35
- configs/examples/pico-decoder-tiny.yaml +0 -35
- configs/pico-decoder-tiny-dolma10M-v1.yaml +0 -78
- configs/pico-decoder-tiny-dolma20M-v1.yaml +0 -78
- configs/pico-decoder-tiny-dolma5M-v1.yaml +0 -78
configs/examples/demo.yaml
DELETED
```diff
@@ -1,48 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-data:
-  dataloader:
-    batch_size: 32
-
-checkpointing:
-  run_name: "pico-decoder-demo-1"
-  save_every_n_steps: 50
-
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/demo"
-
-  learning_dynamics:
-    batch_size: 16
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-
-evaluation:
-  paloma:
-    batch_size: 32
-
-monitoring:
-
-  save_to_wandb: true
-  wandb:
-    project: "pico-demo"
-    entity: "pico-lm"
-
-  logging:
-    log_every_n_steps: 10
-
-training:
-  max_steps: 100
-
-  optimization:
-    lr: 0.001
-    lr_warmup_steps: 30
-
-    gradient_accumulation_steps: 2
-
-  fabric:
-    num_devices: 1
```
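A side note for anyone recreating these files: demo.yaml is plain YAML, so it can be loaded and inspected directly. Below is a minimal sketch assuming PyYAML and the key nesting shown above; the project's actual config loader is not part of this diff.

```python
# Minimal sketch: inspect a config like the deleted demo.yaml.
# Assumes PyYAML and the key nesting shown in the diff above;
# this is not the project's actual config loader.
import yaml

with open("configs/examples/demo.yaml") as f:
    config = yaml.safe_load(f)

# Nested YAML sections come back as plain dicts.
print(config["data"]["dataloader"]["batch_size"])    # 32
print(config["training"]["optimization"]["lr"])      # 0.001
print(config["checkpointing"]["run_name"])           # pico-decoder-demo-1
```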
configs/examples/pico-decoder-large.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-large-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-large"
-
-  learning_dynamics:
-    batch_size: 128
-
-model:
-  d_model: 1536
-  activation_hidden_dim: 6144
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 8
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 16
-
```
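The fabric block above (4 nodes x 4 devices) combined with gradient_accumulation_steps: 8 determines the effective global batch size. A back-of-envelope check follows; the per-device batch size is an assumption, since these example configs inherit it from defaults not shown in this diff.

```python
# Effective global batch size implied by pico-decoder-large.yaml.
# per_device_batch is an assumption: the example config inherits it
# from defaults that are not part of this diff.
per_device_batch = 32   # assumed default, not set in the file above
grad_accum = 8          # training.optimization.gradient_accumulation_steps
world_size = 4 * 4      # fabric.num_nodes * fabric.num_devices

effective_batch = per_device_batch * grad_accum * world_size
print(effective_batch)  # 4096 sequences per optimizer step
```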
configs/examples/pico-decoder-medium.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-medium-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-medium"
-
-  learning_dynamics:
-    batch_size: 128
-
-model:
-  d_model: 768
-  activation_hidden_dim: 3072
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 8
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 16
-
```
configs/examples/pico-decoder-small.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-small-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-small"
-
-  learning_dynamics:
-    batch_size: 128
-
-model:
-  d_model: 384
-  activation_hidden_dim: 1536
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 8
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 16
-
```
configs/examples/pico-decoder-tiny.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-tiny-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-tiny"
-
-  learning_dynamics:
-    batch_size: 256
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 4
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 32
-
```
configs/pico-decoder-tiny-dolma10M-v1.yaml
DELETED
```diff
@@ -1,78 +0,0 @@
-# High Quality Training Config - Optimized for H100 80GB Performance
-# Fast training configuration maintaining identical model quality
-# Optimized for H100 80GB with maximum throughput while preserving stability
-# Updated for efficient training on Dolma 10M tokens with H100-optimized hyperparameters
-
-checkpointing:
-  run_name: "pico-decoder-tiny-dolma10M-v1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "ThomasTheMaker/pico-decoder-tiny"
-  save_every_n_steps: 2000 # Reduced checkpoint frequency for faster training
-
-  learning_dynamics:
-    batch_size: 1 # Minimal batch size for learning dynamics
-    eval_data: null # Disable learning dynamics to save memory
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-  dropout: 0.15 # Increased dropout for stronger regularization
-  attention_dropout: 0.15 # Increased attention dropout
-  layer_norm_eps: 1e-5 # Tighter normalization for stability
-  weight_init_type: "truncated_normal" # Truncated normal for stability
-  layer_norm_type: "rms_norm" # RMSNorm for better stability
-  use_qk_norm: true # Query-Key normalization for attention stability
-
-monitoring:
-  save_to_wandb: false
-  wandb:
-    project: "pico-decoder-tiny"
-    entity: "boymyc"
-  logging:
-    log_every_n_steps: 100 # Reduced logging frequency for faster training
-
-training:
-  max_steps: 100000 # Longer training for better convergence
-  optimization:
-    lr: 0.0002 # Scaled learning rate for larger batch size (4x increase)
-    lr_warmup_steps: 2000 # Reduced warmup for faster convergence
-    lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
-    weight_decay: 0.02 # Increased weight decay for stronger regularization
-    max_grad_norm: 0.5 # Tighter gradient clipping for stability
-    gradient_accumulation_steps: 1 # Reduced for faster training with larger batches
-    optimizer: "adamw"
-    adam_beta1: 0.9 # Standard AdamW beta1
-    adam_beta2: 0.999 # Standard AdamW beta2
-    adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
-
-  fabric:
-    num_nodes: 1
-    num_devices: 1
-    precision: "bf16-mixed" # BF16 for Tensor Core optimization
-
-evaluation:
-  paloma:
-    batch_size: 1 # Minimal evaluation batch size
-  eval_every_n_steps: 1000 # Reduced evaluation frequency for faster training
-
-data:
-  dataset:
-    name: "ThomasTheMaker/pretokenized-dolma-10M" # Updated to 5M token dataset
-  dataloader:
-    batch_size: 16 # Conservative H100 optimization - 4x larger for stable fast training
-  tokenizer:
-    name: "allenai/OLMo-7B-0724-hf"
-    vocab_size: 50304
-
-# H100-optimized training strategy for fast, memory-safe training:
-# 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
-# 2. Reduced gradient accumulation (1 step) for faster optimization cycles
-# 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
-# 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
-# 5. Reduced checkpoint/logging frequency to minimize I/O overhead
-# 6. Same model architecture and regularization for identical final performance
-# 7. Expected 4-6x training speedup while maintaining model quality and memory safety
-# 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
-# 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
-# 10. Same convergence quality with significant speedup and no memory issues
```
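The "scaled learning rate (4x increase)" comment above is consistent with linear LR scaling from the dolma5M config later in this commit (batch size 4, lr 0.00005). That linear scaling was the rule actually used is an assumption, but the arithmetic checks out:

```python
# Linear LR scaling check for the "(4x increase)" comment above.
# Reference values come from pico-decoder-tiny-dolma5M-v1.yaml below;
# that linear scaling was the rule used here is an assumption.
base_lr, base_batch = 5e-5, 4   # dolma5M: lr and dataloader batch_size
new_batch = 16                  # dolma10M: data.dataloader.batch_size

scaled_lr = base_lr * (new_batch / base_batch)
print(scaled_lr)                # 0.0002 -- matches training.optimization.lr
```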
configs/pico-decoder-tiny-dolma20M-v1.yaml
DELETED
```diff
@@ -1,78 +0,0 @@
-# High Quality Training Config - Optimized for H100 80GB Performance
-# Fast training configuration maintaining identical model quality
-# Optimized for H100 80GB with maximum throughput while preserving stability
-# Updated for efficient training on Dolma 10M tokens with H100-optimized hyperparameters
-
-checkpointing:
-  run_name: "pico-decoder-tiny-dolma20M-v1"
-  save_to_hf: false
-  hf_checkpoint:
-    repo_id: "ThomasTheMaker/pico-decoder-tiny"
-  save_every_n_steps: 1000 # Reduced checkpoint frequency for faster training
-
-  learning_dynamics:
-    batch_size: 1 # Minimal batch size for learning dynamics
-    eval_data: null # Disable learning dynamics to save memory
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-  dropout: 0.15 # Increased dropout for stronger regularization
-  attention_dropout: 0.15 # Increased attention dropout
-  layer_norm_eps: 1e-5 # Tighter normalization for stability
-  weight_init_type: "truncated_normal" # Truncated normal for stability
-  layer_norm_type: "rms_norm" # RMSNorm for better stability
-  use_qk_norm: true # Query-Key normalization for attention stability
-
-monitoring:
-  save_to_wandb: false
-  wandb:
-    project: "pico-decoder-tiny"
-    entity: "boymyc"
-  logging:
-    log_every_n_steps: 100 # Reduced logging frequency for faster training
-
-training:
-  max_steps: 100000 # Longer training for better convergence
-  optimization:
-    lr: 0.0002 # Scaled learning rate for larger batch size (4x increase)
-    lr_warmup_steps: 2000 # Reduced warmup for faster convergence
-    lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
-    weight_decay: 0.02 # Increased weight decay for stronger regularization
-    max_grad_norm: 0.5 # Tighter gradient clipping for stability
-    gradient_accumulation_steps: 1 # Reduced for faster training with larger batches
-    optimizer: "adamw"
-    adam_beta1: 0.9 # Standard AdamW beta1
-    adam_beta2: 0.999 # Standard AdamW beta2
-    adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
-
-  fabric:
-    num_nodes: 1
-    num_devices: 1
-    precision: "bf16-mixed" # BF16 for Tensor Core optimization
-
-evaluation:
-  paloma:
-    batch_size: 1 # Minimal evaluation batch size
-  eval_every_n_steps: 1000 # Reduced evaluation frequency for faster training
-
-data:
-  dataset:
-    name: "ThomasTheMaker/pretokenized-dolma-20M" # Updated to 5M token dataset
-  dataloader:
-    batch_size: 16 # Conservative H100 optimization - 4x larger for stable fast training
-  tokenizer:
-    name: "allenai/OLMo-7B-0724-hf"
-    vocab_size: 50304
-
-# H100-optimized training strategy for fast, memory-safe training:
-# 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
-# 2. Reduced gradient accumulation (1 step) for faster optimization cycles
-# 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
-# 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
-# 5. Reduced checkpoint/logging frequency to minimize I/O overhead
-# 6. Same model architecture and regularization for identical final performance
-# 7. Expected 4-6x training speedup while maintaining model quality and memory safety
-# 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
-# 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
-# 10. Same convergence quality with significant speedup and no memory issues
```
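The fabric block in these configs (num_nodes, num_devices, precision: "bf16-mixed") mirrors Lightning Fabric's launcher arguments. Below is a hedged sketch of how those keys could map onto Fabric, assuming that library is the backend; the diff shows only the YAML, not the launcher code.

```python
# Sketch: mapping the fabric block above onto Lightning Fabric.
# That the project uses lightning.fabric is an assumption; the diff
# contains only the YAML keys, not the launcher code.
from lightning.fabric import Fabric

fabric = Fabric(
    num_nodes=1,             # fabric.num_nodes
    devices=1,               # fabric.num_devices
    precision="bf16-mixed",  # BF16 compute with FP32 master weights
)
fabric.launch()
```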
configs/pico-decoder-tiny-dolma5M-v1.yaml
DELETED
```diff
@@ -1,78 +0,0 @@
-# High Quality Training Config - Optimized for superior model performance
-# This configuration prioritizes model quality over training speed
-# Designed for RTX 5090 with focus on preventing overfitting and maximizing generalization
-# Updated for scaling training on Dolma 5M tokens with stability-focused hyperparameters
-
-checkpointing:
-  run_name: "pico-decoder-tiny-dolma5M-v1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "ThomasTheMaker/pico-decoder-tiny"
-  save_every_n_steps: 500 # Frequent checkpoints for quality monitoring
-
-  learning_dynamics:
-    batch_size: 1 # Minimal batch size for learning dynamics
-    eval_data: null # Disable learning dynamics to save memory
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-  dropout: 0.15 # Increased dropout for stronger regularization
-  attention_dropout: 0.15 # Increased attention dropout
-  layer_norm_eps: 1e-5 # Tighter normalization for stability
-  weight_init_type: "truncated_normal" # Truncated normal for stability
-  layer_norm_type: "rms_norm" # RMSNorm for better stability
-  use_qk_norm: true # Query-Key normalization for attention stability
-
-monitoring:
-  save_to_wandb: false
-  wandb:
-    project: "pico-decoder-tiny"
-    entity: "boymyc"
-  logging:
-    log_every_n_steps: 25 # Very frequent logging for quality monitoring
-
-training:
-  max_steps: 100000 # Longer training for better convergence
-  optimization:
-    lr: 0.00005 # Even lower learning rate for precision training
-    lr_warmup_steps: 8000 # Extended warmup for stability
-    lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
-    weight_decay: 0.02 # Increased weight decay for stronger regularization
-    max_grad_norm: 0.5 # Tighter gradient clipping for stability
-    gradient_accumulation_steps: 4 # Increased for better gradient estimates
-    optimizer: "adamw"
-    adam_beta1: 0.9 # Standard AdamW beta1
-    adam_beta2: 0.999 # Standard AdamW beta2
-    adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
-
-  fabric:
-    num_nodes: 1
-    num_devices: 1
-    precision: "bf16-mixed" # BF16 for Tensor Core optimization
-
-evaluation:
-  paloma:
-    batch_size: 1 # Minimal evaluation batch size
-  eval_every_n_steps: 250 # Very frequent evaluation for quality monitoring
-
-data:
-  dataset:
-    name: "ThomasTheMaker/pretokenized-dolma-5M" # Updated to 5M token dataset
-  dataloader:
-    batch_size: 4 # Reduced for more stable training
-  tokenizer:
-    name: "allenai/OLMo-7B-0724-hf"
-    vocab_size: 50304
-
-# Stability-focused training strategy for large-scale Dolma training:
-# 1. Cosine learning rate schedule for sustained learning over full dataset
-# 2. Truncated normal weight initialization to prevent extreme outliers
-# 3. RMSNorm for better gradient stability during long training runs
-# 4. Query-Key normalization (QK-Norm) to prevent attention logit overflow
-# 5. AdamW epsilon 1e-8 for improved training stability and convergence
-# 6. Extended warmup (8000 steps) for stable foundation
-# 7. Stronger regularization (dropout 0.15, weight decay 0.02)
-# 8. Tighter gradient clipping (0.5) for stability
-# 9. More frequent evaluation (every 250 steps) for quality monitoring
-# 10. Longer training (40000 steps) for full convergence on 5M tokens
```
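All three dolma configs pair lr_warmup_steps with lr_scheduler: "cosine". Below is a minimal sketch of what such a schedule typically computes, using the dolma5M values; the exact shape (linear warmup, decay to zero) is an assumption, and the repo's scheduler may differ, e.g. with a nonzero final LR.

```python
# Warmup + cosine decay, with dolma5M values: lr 0.00005,
# lr_warmup_steps 8000, max_steps 100000. Linear warmup and decay
# to zero are assumptions about the scheduler's exact shape.
import math

def lr_at(step: int, base_lr: float = 5e-5,
          warmup_steps: int = 8000, max_steps: int = 100_000) -> float:
    if step < warmup_steps:
        return base_lr * step / warmup_steps  # linear warmup from 0
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))  # cosine decay

print(lr_at(4_000))    # mid-warmup: 2.5e-05
print(lr_at(100_000))  # end of training: ~0.0
```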