ThomasTheMaker committed on
Commit
a1f100c
·
verified ·
1 Parent(s): feba2ad

Delete configs

Browse files
configs/examples/demo.yaml DELETED
@@ -1,48 +0,0 @@
1
- # Demo config file
2
- # You can follow this template to create your own config file
3
- # Refer to the config files in the configs/ directory to see all the available options
4
-
5
- data:
6
- dataloader:
7
- batch_size: 32
8
-
9
- checkpointing:
10
- run_name: "pico-decoder-demo-1"
11
- save_every_n_steps: 50
12
-
13
- save_to_hf: true
14
- hf_checkpoint:
15
- repo_id: "pico-lm/demo"
16
-
17
- learning_dynamics:
18
- batch_size: 16
19
-
20
- model:
21
- d_model: 96
22
- activation_hidden_dim: 384
23
-
24
- evaluation:
25
- paloma:
26
- batch_size: 32
27
-
28
- monitoring:
29
-
30
- save_to_wandb: true
31
- wandb:
32
- project: "pico-demo"
33
- entity: "pico-lm"
34
-
35
- logging:
36
- log_every_n_steps: 10
37
-
38
- training:
39
- max_steps: 100
40
-
41
- optimization:
42
- lr: 0.001
43
- lr_warmup_steps: 30
44
-
45
- gradient_accumulation_steps: 2
46
-
47
- fabric:
48
- num_devices: 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/examples/pico-decoder-large.yaml DELETED
@@ -1,35 +0,0 @@
1
- # Demo config file
2
- # You can follow this template to create your own config file
3
- # Refer to the config files in the configs/ directory to see all the available options
4
-
5
- checkpointing:
6
- run_name: "pico-decoder-large-1"
7
- save_to_hf: true
8
- hf_checkpoint:
9
- repo_id: "pico-lm/pico-decoder-large"
10
-
11
- learning_dynamics:
12
- batch_size: 128
13
-
14
- model:
15
- d_model: 1536
16
- activation_hidden_dim: 6144
17
-
18
- monitoring:
19
- save_to_wandb: true
20
- wandb:
21
- project: "pico-decoder"
22
- entity: "pico-lm"
23
-
24
- training:
25
- optimization:
26
- gradient_accumulation_steps: 8
27
-
28
- fabric:
29
- num_nodes: 4
30
- num_devices: 4
31
-
32
- evaluation:
33
- paloma:
34
- batch_size: 16
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/examples/pico-decoder-medium.yaml DELETED
@@ -1,35 +0,0 @@
1
- # Demo config file
2
- # You can follow this template to create your own config file
3
- # Refer to the config files in the configs/ directory to see all the available options
4
-
5
- checkpointing:
6
- run_name: "pico-decoder-medium-1"
7
- save_to_hf: true
8
- hf_checkpoint:
9
- repo_id: "pico-lm/pico-decoder-medium"
10
-
11
- learning_dynamics:
12
- batch_size: 128
13
-
14
- model:
15
- d_model: 768
16
- activation_hidden_dim: 3072
17
-
18
- monitoring:
19
- save_to_wandb: true
20
- wandb:
21
- project: "pico-decoder"
22
- entity: "pico-lm"
23
-
24
- training:
25
- optimization:
26
- gradient_accumulation_steps: 8
27
-
28
- fabric:
29
- num_nodes: 4
30
- num_devices: 4
31
-
32
- evaluation:
33
- paloma:
34
- batch_size: 16
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/examples/pico-decoder-small.yaml DELETED
@@ -1,35 +0,0 @@
1
- # Demo config file
2
- # You can follow this template to create your own config file
3
- # Refer to the config files in the configs/ directory to see all the available options
4
-
5
- checkpointing:
6
- run_name: "pico-decoder-small-1"
7
- save_to_hf: true
8
- hf_checkpoint:
9
- repo_id: "pico-lm/pico-decoder-small"
10
-
11
- learning_dynamics:
12
- batch_size: 128
13
-
14
- model:
15
- d_model: 384
16
- activation_hidden_dim: 1536
17
-
18
- monitoring:
19
- save_to_wandb: true
20
- wandb:
21
- project: "pico-decoder"
22
- entity: "pico-lm"
23
-
24
- training:
25
- optimization:
26
- gradient_accumulation_steps: 8
27
-
28
- fabric:
29
- num_nodes: 4
30
- num_devices: 4
31
-
32
- evaluation:
33
- paloma:
34
- batch_size: 16
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/examples/pico-decoder-tiny.yaml DELETED
@@ -1,35 +0,0 @@
1
- # Demo config file
2
- # You can follow this template to create your own config file
3
- # Refer to the config files in the configs/ directory to see all the available options
4
-
5
- checkpointing:
6
- run_name: "pico-decoder-tiny-1"
7
- save_to_hf: true
8
- hf_checkpoint:
9
- repo_id: "pico-lm/pico-decoder-tiny"
10
-
11
- learning_dynamics:
12
- batch_size: 256
13
-
14
- model:
15
- d_model: 96
16
- activation_hidden_dim: 384
17
-
18
- monitoring:
19
- save_to_wandb: true
20
- wandb:
21
- project: "pico-decoder"
22
- entity: "pico-lm"
23
-
24
- training:
25
- optimization:
26
- gradient_accumulation_steps: 4
27
-
28
- fabric:
29
- num_nodes: 4
30
- num_devices: 4
31
-
32
- evaluation:
33
- paloma:
34
- batch_size: 32
35
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/pico-decoder-tiny-dolma10M-v1.yaml DELETED
@@ -1,78 +0,0 @@
1
- # High Quality Training Config - Optimized for H100 80GB Performance
2
- # Fast training configuration maintaining identical model quality
3
- # Optimized for H100 80GB with maximum throughput while preserving stability
4
- # Updated for efficient training on Dolma 10M tokens with H100-optimized hyperparameters
5
-
6
- checkpointing:
7
- run_name: "pico-decoder-tiny-dolma10M-v1"
8
- save_to_hf: true
9
- hf_checkpoint:
10
- repo_id: "ThomasTheMaker/pico-decoder-tiny"
11
- save_every_n_steps: 2000 # Reduced checkpoint frequency for faster training
12
-
13
- learning_dynamics:
14
- batch_size: 1 # Minimal batch size for learning dynamics
15
- eval_data: null # Disable learning dynamics to save memory
16
-
17
- model:
18
- d_model: 96
19
- activation_hidden_dim: 384
20
- dropout: 0.15 # Increased dropout for stronger regularization
21
- attention_dropout: 0.15 # Increased attention dropout
22
- layer_norm_eps: 1e-5 # Tighter normalization for stability
23
- weight_init_type: "truncated_normal" # Truncated normal for stability
24
- layer_norm_type: "rms_norm" # RMSNorm for better stability
25
- use_qk_norm: true # Query-Key normalization for attention stability
26
-
27
- monitoring:
28
- save_to_wandb: false
29
- wandb:
30
- project: "pico-decoder-tiny"
31
- entity: "boymyc"
32
- logging:
33
- log_every_n_steps: 100 # Reduced logging frequency for faster training
34
-
35
- training:
36
- max_steps: 100000 # Longer training for better convergence
37
- optimization:
38
- lr: 0.0002 # Scaled learning rate for larger batch size (4x increase)
39
- lr_warmup_steps: 2000 # Reduced warmup for faster convergence
40
- lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
41
- weight_decay: 0.02 # Increased weight decay for stronger regularization
42
- max_grad_norm: 0.5 # Tighter gradient clipping for stability
43
- gradient_accumulation_steps: 1 # Reduced for faster training with larger batches
44
- optimizer: "adamw"
45
- adam_beta1: 0.9 # Standard AdamW beta1
46
- adam_beta2: 0.999 # Standard AdamW beta2
47
- adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
48
-
49
- fabric:
50
- num_nodes: 1
51
- num_devices: 1
52
- precision: "bf16-mixed" # BF16 for Tensor Core optimization
53
-
54
- evaluation:
55
- paloma:
56
- batch_size: 1 # Minimal evaluation batch size
57
- eval_every_n_steps: 1000 # Reduced evaluation frequency for faster training
58
-
59
- data:
60
- dataset:
61
- name: "ThomasTheMaker/pretokenized-dolma-10M" # Updated to 10M token dataset
62
- dataloader:
63
- batch_size: 16 # Conservative H100 optimization - 4x larger for stable fast training
64
- tokenizer:
65
- name: "allenai/OLMo-7B-0724-hf"
66
- vocab_size: 50304
67
-
68
- # H100-optimized training strategy for fast, memory-safe training:
69
- # 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
70
- # 2. Reduced gradient accumulation (1 step) for faster optimization cycles
71
- # 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
72
- # 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
73
- # 5. Reduced checkpoint/logging frequency to minimize I/O overhead
74
- # 6. Same model architecture and regularization for identical final performance
75
- # 7. Expected 4-6x training speedup while maintaining model quality and memory safety
76
- # 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
77
- # 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
78
- # 10. Same convergence quality with significant speedup and no memory issues
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/pico-decoder-tiny-dolma20M-v1.yaml DELETED
@@ -1,78 +0,0 @@
1
- # High Quality Training Config - Optimized for H100 80GB Performance
2
- # Fast training configuration maintaining identical model quality
3
- # Optimized for H100 80GB with maximum throughput while preserving stability
4
- # Updated for efficient training on Dolma 20M tokens with H100-optimized hyperparameters
5
-
6
- checkpointing:
7
- run_name: "pico-decoder-tiny-dolma20M-v1"
8
- save_to_hf: false
9
- hf_checkpoint:
10
- repo_id: "ThomasTheMaker/pico-decoder-tiny"
11
- save_every_n_steps: 1000 # Reduced checkpoint frequency for faster training
12
-
13
- learning_dynamics:
14
- batch_size: 1 # Minimal batch size for learning dynamics
15
- eval_data: null # Disable learning dynamics to save memory
16
-
17
- model:
18
- d_model: 96
19
- activation_hidden_dim: 384
20
- dropout: 0.15 # Increased dropout for stronger regularization
21
- attention_dropout: 0.15 # Increased attention dropout
22
- layer_norm_eps: 1e-5 # Tighter normalization for stability
23
- weight_init_type: "truncated_normal" # Truncated normal for stability
24
- layer_norm_type: "rms_norm" # RMSNorm for better stability
25
- use_qk_norm: true # Query-Key normalization for attention stability
26
-
27
- monitoring:
28
- save_to_wandb: false
29
- wandb:
30
- project: "pico-decoder-tiny"
31
- entity: "boymyc"
32
- logging:
33
- log_every_n_steps: 100 # Reduced logging frequency for faster training
34
-
35
- training:
36
- max_steps: 100000 # Longer training for better convergence
37
- optimization:
38
- lr: 0.0002 # Scaled learning rate for larger batch size (4x increase)
39
- lr_warmup_steps: 2000 # Reduced warmup for faster convergence
40
- lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
41
- weight_decay: 0.02 # Increased weight decay for stronger regularization
42
- max_grad_norm: 0.5 # Tighter gradient clipping for stability
43
- gradient_accumulation_steps: 1 # Reduced for faster training with larger batches
44
- optimizer: "adamw"
45
- adam_beta1: 0.9 # Standard AdamW beta1
46
- adam_beta2: 0.999 # Standard AdamW beta2
47
- adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
48
-
49
- fabric:
50
- num_nodes: 1
51
- num_devices: 1
52
- precision: "bf16-mixed" # BF16 for Tensor Core optimization
53
-
54
- evaluation:
55
- paloma:
56
- batch_size: 1 # Minimal evaluation batch size
57
- eval_every_n_steps: 1000 # Reduced evaluation frequency for faster training
58
-
59
- data:
60
- dataset:
61
- name: "ThomasTheMaker/pretokenized-dolma-20M" # Updated to 20M token dataset
62
- dataloader:
63
- batch_size: 16 # Conservative H100 optimization - 4x larger for stable fast training
64
- tokenizer:
65
- name: "allenai/OLMo-7B-0724-hf"
66
- vocab_size: 50304
67
-
68
- # H100-optimized training strategy for fast, memory-safe training:
69
- # 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
70
- # 2. Reduced gradient accumulation (1 step) for faster optimization cycles
71
- # 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
72
- # 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
73
- # 5. Reduced checkpoint/logging frequency to minimize I/O overhead
74
- # 6. Same model architecture and regularization for identical final performance
75
- # 7. Expected 4-6x training speedup while maintaining model quality and memory safety
76
- # 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
77
- # 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
78
- # 10. Same convergence quality with significant speedup and no memory issues
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
configs/pico-decoder-tiny-dolma5M-v1.yaml DELETED
@@ -1,78 +0,0 @@
1
- # High Quality Training Config - Optimized for superior model performance
2
- # This configuration prioritizes model quality over training speed
3
- # Designed for RTX 5090 with focus on preventing overfitting and maximizing generalization
4
- # Updated for scaling training on Dolma 5M tokens with stability-focused hyperparameters
5
-
6
- checkpointing:
7
- run_name: "pico-decoder-tiny-dolma5M-v1"
8
- save_to_hf: true
9
- hf_checkpoint:
10
- repo_id: "ThomasTheMaker/pico-decoder-tiny"
11
- save_every_n_steps: 500 # Frequent checkpoints for quality monitoring
12
-
13
- learning_dynamics:
14
- batch_size: 1 # Minimal batch size for learning dynamics
15
- eval_data: null # Disable learning dynamics to save memory
16
-
17
- model:
18
- d_model: 96
19
- activation_hidden_dim: 384
20
- dropout: 0.15 # Increased dropout for stronger regularization
21
- attention_dropout: 0.15 # Increased attention dropout
22
- layer_norm_eps: 1e-5 # Tighter normalization for stability
23
- weight_init_type: "truncated_normal" # Truncated normal for stability
24
- layer_norm_type: "rms_norm" # RMSNorm for better stability
25
- use_qk_norm: true # Query-Key normalization for attention stability
26
-
27
- monitoring:
28
- save_to_wandb: false
29
- wandb:
30
- project: "pico-decoder-tiny"
31
- entity: "boymyc"
32
- logging:
33
- log_every_n_steps: 25 # Very frequent logging for quality monitoring
34
-
35
- training:
36
- max_steps: 100000 # Longer training for better convergence
37
- optimization:
38
- lr: 0.00005 # Even lower learning rate for precision training
39
- lr_warmup_steps: 8000 # Extended warmup for stability
40
- lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
41
- weight_decay: 0.02 # Increased weight decay for stronger regularization
42
- max_grad_norm: 0.5 # Tighter gradient clipping for stability
43
- gradient_accumulation_steps: 4 # Increased for better gradient estimates
44
- optimizer: "adamw"
45
- adam_beta1: 0.9 # Standard AdamW beta1
46
- adam_beta2: 0.999 # Standard AdamW beta2
47
- adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
48
-
49
- fabric:
50
- num_nodes: 1
51
- num_devices: 1
52
- precision: "bf16-mixed" # BF16 for Tensor Core optimization
53
-
54
- evaluation:
55
- paloma:
56
- batch_size: 1 # Minimal evaluation batch size
57
- eval_every_n_steps: 250 # Very frequent evaluation for quality monitoring
58
-
59
- data:
60
- dataset:
61
- name: "ThomasTheMaker/pretokenized-dolma-5M" # Updated to 5M token dataset
62
- dataloader:
63
- batch_size: 4 # Reduced for more stable training
64
- tokenizer:
65
- name: "allenai/OLMo-7B-0724-hf"
66
- vocab_size: 50304
67
-
68
- # Stability-focused training strategy for large-scale Dolma training:
69
- # 1. Cosine learning rate schedule for sustained learning over full dataset
70
- # 2. Truncated normal weight initialization to prevent extreme outliers
71
- # 3. RMSNorm for better gradient stability during long training runs
72
- # 4. Query-Key normalization (QK-Norm) to prevent attention logit overflow
73
- # 5. AdamW epsilon 1e-8 for improved training stability and convergence
74
- # 6. Extended warmup (8000 steps) for stable foundation
75
- # 7. Stronger regularization (dropout 0.15, weight decay 0.02)
76
- # 8. Tighter gradient clipping (0.5) for stability
77
- # 9. More frequent evaluation (every 250 steps) for quality monitoring
78
- # 10. Longer training (100000 steps) for full convergence on 5M tokens