Delete configs
- configs/examples/demo.yaml +0 -48
- configs/examples/pico-decoder-large.yaml +0 -35
- configs/examples/pico-decoder-medium.yaml +0 -35
- configs/examples/pico-decoder-small.yaml +0 -35
- configs/examples/pico-decoder-tiny.yaml +0 -35
- configs/pico-decoder-tiny-dolma10M-v1.yaml +0 -78
- configs/pico-decoder-tiny-dolma20M-v1.yaml +0 -78
- configs/pico-decoder-tiny-dolma5M-v1.yaml +0 -78
configs/examples/demo.yaml
DELETED
```diff
@@ -1,48 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-data:
-  dataloader:
-    batch_size: 32
-
-checkpointing:
-  run_name: "pico-decoder-demo-1"
-  save_every_n_steps: 50
-
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/demo"
-
-  learning_dynamics:
-    batch_size: 16
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-
-evaluation:
-  paloma:
-    batch_size: 32
-
-monitoring:
-
-  save_to_wandb: true
-  wandb:
-    project: "pico-demo"
-    entity: "pico-lm"
-
-  logging:
-    log_every_n_steps: 10
-
-training:
-  max_steps: 100
-
-  optimization:
-    lr: 0.001
-    lr_warmup_steps: 30
-
-    gradient_accumulation_steps: 2
-
-  fabric:
-    num_devices: 1
```
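A side note for anyone recreating these files: demo.yaml is plain YAML, so it can be loaded and inspected directly. Below is a minimal sketch assuming PyYAML and the key nesting shown above; the project's actual config loader is not part of this diff.

```python
# Minimal sketch: inspect a config like the deleted demo.yaml.
# Assumes PyYAML and the key nesting shown in the diff above;
# this is not the project's actual config loader.
import yaml

with open("configs/examples/demo.yaml") as f:
    config = yaml.safe_load(f)

# Nested YAML sections come back as plain dicts.
print(config["data"]["dataloader"]["batch_size"])    # 32
print(config["training"]["optimization"]["lr"])      # 0.001
print(config["checkpointing"]["run_name"])           # pico-decoder-demo-1
```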
configs/examples/pico-decoder-large.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-large-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-large"
-
-  learning_dynamics:
-    batch_size: 128
-
-model:
-  d_model: 1536
-  activation_hidden_dim: 6144
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 8
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 16
-
```
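The fabric block above (4 nodes x 4 devices) combined with gradient_accumulation_steps: 8 determines the effective global batch size. A back-of-envelope check follows; the per-device batch size is an assumption, since these example configs inherit it from defaults not shown in this diff.

```python
# Effective global batch size implied by pico-decoder-large.yaml.
# per_device_batch is an assumption: the example config inherits it
# from defaults that are not part of this diff.
per_device_batch = 32   # assumed default, not set in the file above
grad_accum = 8          # training.optimization.gradient_accumulation_steps
world_size = 4 * 4      # fabric.num_nodes * fabric.num_devices

effective_batch = per_device_batch * grad_accum * world_size
print(effective_batch)  # 4096 sequences per optimizer step
```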
configs/examples/pico-decoder-medium.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-medium-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-medium"
-
-  learning_dynamics:
-    batch_size: 128
-
-model:
-  d_model: 768
-  activation_hidden_dim: 3072
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 8
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 16
-
```
configs/examples/pico-decoder-small.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-small-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-small"
-
-  learning_dynamics:
-    batch_size: 128
-
-model:
-  d_model: 384
-  activation_hidden_dim: 1536
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 8
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 16
-
```
configs/examples/pico-decoder-tiny.yaml
DELETED
```diff
@@ -1,35 +0,0 @@
-# Demo config file
-# You can follow this template to create your own config file
-# Refer to the config files in the configs/ directory to see all the available options
-
-checkpointing:
-  run_name: "pico-decoder-tiny-1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "pico-lm/pico-decoder-tiny"
-
-  learning_dynamics:
-    batch_size: 256
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-
-monitoring:
-  save_to_wandb: true
-  wandb:
-    project: "pico-decoder"
-    entity: "pico-lm"
-
-training:
-  optimization:
-    gradient_accumulation_steps: 4
-
-  fabric:
-    num_nodes: 4
-    num_devices: 4
-
-evaluation:
-  paloma:
-    batch_size: 32
-
```
configs/pico-decoder-tiny-dolma10M-v1.yaml
DELETED
```diff
@@ -1,78 +0,0 @@
-# High Quality Training Config - Optimized for H100 80GB Performance
-# Fast training configuration maintaining identical model quality
-# Optimized for H100 80GB with maximum throughput while preserving stability
-# Updated for efficient training on Dolma 10M tokens with H100-optimized hyperparameters
-
-checkpointing:
-  run_name: "pico-decoder-tiny-dolma10M-v1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "ThomasTheMaker/pico-decoder-tiny"
-  save_every_n_steps: 2000 # Reduced checkpoint frequency for faster training
-
-  learning_dynamics:
-    batch_size: 1 # Minimal batch size for learning dynamics
-    eval_data: null # Disable learning dynamics to save memory
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-  dropout: 0.15 # Increased dropout for stronger regularization
-  attention_dropout: 0.15 # Increased attention dropout
-  layer_norm_eps: 1e-5 # Tighter normalization for stability
-  weight_init_type: "truncated_normal" # Truncated normal for stability
-  layer_norm_type: "rms_norm" # RMSNorm for better stability
-  use_qk_norm: true # Query-Key normalization for attention stability
-
-monitoring:
-  save_to_wandb: false
-  wandb:
-    project: "pico-decoder-tiny"
-    entity: "boymyc"
-  logging:
-    log_every_n_steps: 100 # Reduced logging frequency for faster training
-
-training:
-  max_steps: 100000 # Longer training for better convergence
-  optimization:
-    lr: 0.0002 # Scaled learning rate for larger batch size (4x increase)
-    lr_warmup_steps: 2000 # Reduced warmup for faster convergence
-    lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
-    weight_decay: 0.02 # Increased weight decay for stronger regularization
-    max_grad_norm: 0.5 # Tighter gradient clipping for stability
-    gradient_accumulation_steps: 1 # Reduced for faster training with larger batches
-    optimizer: "adamw"
-    adam_beta1: 0.9 # Standard AdamW beta1
-    adam_beta2: 0.999 # Standard AdamW beta2
-    adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
-
-  fabric:
-    num_nodes: 1
-    num_devices: 1
-    precision: "bf16-mixed" # BF16 for Tensor Core optimization
-
-evaluation:
-  paloma:
-    batch_size: 1 # Minimal evaluation batch size
-  eval_every_n_steps: 1000 # Reduced evaluation frequency for faster training
-
-data:
-  dataset:
-    name: "ThomasTheMaker/pretokenized-dolma-10M" # Updated to 5M token dataset
-  dataloader:
-    batch_size: 16 # Conservative H100 optimization - 4x larger for stable fast training
-  tokenizer:
-    name: "allenai/OLMo-7B-0724-hf"
-    vocab_size: 50304
-
-# H100-optimized training strategy for fast, memory-safe training:
-# 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
-# 2. Reduced gradient accumulation (1 step) for faster optimization cycles
-# 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
-# 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
-# 5. Reduced checkpoint/logging frequency to minimize I/O overhead
-# 6. Same model architecture and regularization for identical final performance
-# 7. Expected 4-6x training speedup while maintaining model quality and memory safety
-# 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
-# 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
-# 10. Same convergence quality with significant speedup and no memory issues
```
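The "scaled learning rate (4x increase)" comment above is consistent with linear LR scaling from the dolma5M config later in this commit (batch size 4, lr 0.00005). That linear scaling was the rule actually used is an assumption, but the arithmetic checks out:

```python
# Linear LR scaling check for the "(4x increase)" comment above.
# Reference values come from pico-decoder-tiny-dolma5M-v1.yaml below;
# that linear scaling was the rule used here is an assumption.
base_lr, base_batch = 5e-5, 4   # dolma5M: lr and dataloader batch_size
new_batch = 16                  # dolma10M: data.dataloader.batch_size

scaled_lr = base_lr * (new_batch / base_batch)
print(scaled_lr)                # 0.0002 -- matches training.optimization.lr
```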
configs/pico-decoder-tiny-dolma20M-v1.yaml
DELETED
```diff
@@ -1,78 +0,0 @@
-# High Quality Training Config - Optimized for H100 80GB Performance
-# Fast training configuration maintaining identical model quality
-# Optimized for H100 80GB with maximum throughput while preserving stability
-# Updated for efficient training on Dolma 10M tokens with H100-optimized hyperparameters
-
-checkpointing:
-  run_name: "pico-decoder-tiny-dolma20M-v1"
-  save_to_hf: false
-  hf_checkpoint:
-    repo_id: "ThomasTheMaker/pico-decoder-tiny"
-  save_every_n_steps: 1000 # Reduced checkpoint frequency for faster training
-
-  learning_dynamics:
-    batch_size: 1 # Minimal batch size for learning dynamics
-    eval_data: null # Disable learning dynamics to save memory
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-  dropout: 0.15 # Increased dropout for stronger regularization
-  attention_dropout: 0.15 # Increased attention dropout
-  layer_norm_eps: 1e-5 # Tighter normalization for stability
-  weight_init_type: "truncated_normal" # Truncated normal for stability
-  layer_norm_type: "rms_norm" # RMSNorm for better stability
-  use_qk_norm: true # Query-Key normalization for attention stability
-
-monitoring:
-  save_to_wandb: false
-  wandb:
-    project: "pico-decoder-tiny"
-    entity: "boymyc"
-  logging:
-    log_every_n_steps: 100 # Reduced logging frequency for faster training
-
-training:
-  max_steps: 100000 # Longer training for better convergence
-  optimization:
-    lr: 0.0002 # Scaled learning rate for larger batch size (4x increase)
-    lr_warmup_steps: 2000 # Reduced warmup for faster convergence
-    lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
-    weight_decay: 0.02 # Increased weight decay for stronger regularization
-    max_grad_norm: 0.5 # Tighter gradient clipping for stability
-    gradient_accumulation_steps: 1 # Reduced for faster training with larger batches
-    optimizer: "adamw"
-    adam_beta1: 0.9 # Standard AdamW beta1
-    adam_beta2: 0.999 # Standard AdamW beta2
-    adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
-
-  fabric:
-    num_nodes: 1
-    num_devices: 1
-    precision: "bf16-mixed" # BF16 for Tensor Core optimization
-
-evaluation:
-  paloma:
-    batch_size: 1 # Minimal evaluation batch size
-  eval_every_n_steps: 1000 # Reduced evaluation frequency for faster training
-
-data:
-  dataset:
-    name: "ThomasTheMaker/pretokenized-dolma-20M" # Updated to 5M token dataset
-  dataloader:
-    batch_size: 16 # Conservative H100 optimization - 4x larger for stable fast training
-  tokenizer:
-    name: "allenai/OLMo-7B-0724-hf"
-    vocab_size: 50304
-
-# H100-optimized training strategy for fast, memory-safe training:
-# 1. Conservative batch size (16) with scaled learning rate (0.0002) for stable H100 utilization
-# 2. Reduced gradient accumulation (1 step) for faster optimization cycles
-# 3. Shorter warmup (2000 steps) for quicker convergence with larger batches
-# 4. Reduced evaluation frequency (1000 steps) to minimize training interruptions
-# 5. Reduced checkpoint/logging frequency to minimize I/O overhead
-# 6. Same model architecture and regularization for identical final performance
-# 7. Expected 4-6x training speedup while maintaining model quality and memory safety
-# 8. Memory usage: ~15-25GB of 80GB H100 VRAM (safe utilization avoiding OOM)
-# 9. Maintains all stability features: RMSNorm, QK-Norm, dropout, weight decay
-# 10. Same convergence quality with significant speedup and no memory issues
```
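The fabric block in these configs (num_nodes, num_devices, precision: "bf16-mixed") mirrors Lightning Fabric's launcher arguments. Below is a hedged sketch of how those keys could map onto Fabric, assuming that library is the backend; the diff shows only the YAML, not the launcher code.

```python
# Sketch: mapping the fabric block above onto Lightning Fabric.
# That the project uses lightning.fabric is an assumption; the diff
# contains only the YAML keys, not the launcher code.
from lightning.fabric import Fabric

fabric = Fabric(
    num_nodes=1,             # fabric.num_nodes
    devices=1,               # fabric.num_devices
    precision="bf16-mixed",  # BF16 compute with FP32 master weights
)
fabric.launch()
```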
configs/pico-decoder-tiny-dolma5M-v1.yaml
DELETED
```diff
@@ -1,78 +0,0 @@
-# High Quality Training Config - Optimized for superior model performance
-# This configuration prioritizes model quality over training speed
-# Designed for RTX 5090 with focus on preventing overfitting and maximizing generalization
-# Updated for scaling training on Dolma 5M tokens with stability-focused hyperparameters
-
-checkpointing:
-  run_name: "pico-decoder-tiny-dolma5M-v1"
-  save_to_hf: true
-  hf_checkpoint:
-    repo_id: "ThomasTheMaker/pico-decoder-tiny"
-  save_every_n_steps: 500 # Frequent checkpoints for quality monitoring
-
-  learning_dynamics:
-    batch_size: 1 # Minimal batch size for learning dynamics
-    eval_data: null # Disable learning dynamics to save memory
-
-model:
-  d_model: 96
-  activation_hidden_dim: 384
-  dropout: 0.15 # Increased dropout for stronger regularization
-  attention_dropout: 0.15 # Increased attention dropout
-  layer_norm_eps: 1e-5 # Tighter normalization for stability
-  weight_init_type: "truncated_normal" # Truncated normal for stability
-  layer_norm_type: "rms_norm" # RMSNorm for better stability
-  use_qk_norm: true # Query-Key normalization for attention stability
-
-monitoring:
-  save_to_wandb: false
-  wandb:
-    project: "pico-decoder-tiny"
-    entity: "boymyc"
-  logging:
-    log_every_n_steps: 25 # Very frequent logging for quality monitoring
-
-training:
-  max_steps: 100000 # Longer training for better convergence
-  optimization:
-    lr: 0.00005 # Even lower learning rate for precision training
-    lr_warmup_steps: 8000 # Extended warmup for stability
-    lr_scheduler: "cosine" # Cosine decay over full dataset for sustained learning
-    weight_decay: 0.02 # Increased weight decay for stronger regularization
-    max_grad_norm: 0.5 # Tighter gradient clipping for stability
-    gradient_accumulation_steps: 4 # Increased for better gradient estimates
-    optimizer: "adamw"
-    adam_beta1: 0.9 # Standard AdamW beta1
-    adam_beta2: 0.999 # Standard AdamW beta2
-    adam_epsilon: 1e-8 # Tighter epsilon for numerical stability and convergence
-
-  fabric:
-    num_nodes: 1
-    num_devices: 1
-    precision: "bf16-mixed" # BF16 for Tensor Core optimization
-
-evaluation:
-  paloma:
-    batch_size: 1 # Minimal evaluation batch size
-  eval_every_n_steps: 250 # Very frequent evaluation for quality monitoring
-
-data:
-  dataset:
-    name: "ThomasTheMaker/pretokenized-dolma-5M" # Updated to 5M token dataset
-  dataloader:
-    batch_size: 4 # Reduced for more stable training
-  tokenizer:
-    name: "allenai/OLMo-7B-0724-hf"
-    vocab_size: 50304
-
-# Stability-focused training strategy for large-scale Dolma training:
-# 1. Cosine learning rate schedule for sustained learning over full dataset
-# 2. Truncated normal weight initialization to prevent extreme outliers
-# 3. RMSNorm for better gradient stability during long training runs
-# 4. Query-Key normalization (QK-Norm) to prevent attention logit overflow
-# 5. AdamW epsilon 1e-8 for improved training stability and convergence
-# 6. Extended warmup (8000 steps) for stable foundation
-# 7. Stronger regularization (dropout 0.15, weight decay 0.02)
-# 8. Tighter gradient clipping (0.5) for stability
-# 9. More frequent evaluation (every 250 steps) for quality monitoring
-# 10. Longer training (40000 steps) for full convergence on 5M tokens
```
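All three dolma configs pair lr_warmup_steps with lr_scheduler: "cosine". Below is a minimal sketch of what such a schedule typically computes, using the dolma5M values; the exact shape (linear warmup, decay to zero) is an assumption, and the repo's scheduler may differ, e.g. with a nonzero final LR.

```python
# Warmup + cosine decay, with dolma5M values: lr 0.00005,
# lr_warmup_steps 8000, max_steps 100000. Linear warmup and decay
# to zero are assumptions about the scheduler's exact shape.
import math

def lr_at(step: int, base_lr: float = 5e-5,
          warmup_steps: int = 8000, max_steps: int = 100_000) -> float:
    if step < warmup_steps:
        return base_lr * step / warmup_steps  # linear warmup from 0
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    return base_lr * 0.5 * (1.0 + math.cos(math.pi * progress))  # cosine decay

print(lr_at(4_000))    # mid-warmup: 2.5e-05
print(lr_at(100_000))  # end of training: ~0.0
```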