# Source: UltraThinking-LLM-Training / configs / train_large.yaml
# Uploaded by Vedisasi via huggingface_hub (commit 54c5666, verified)
# Large Model Configuration - Full Scale Production
# Suitable for: Multi-node clusters, A100/H100 GPUs (40-80GB VRAM)

# Core transformer architecture.
model:
  vocab_size: 100352
  hidden_size: 4096
  num_layers: 32
  num_heads: 32
  num_kv_heads: 8  # GQA: 8 shared KV heads across the 32 query heads
  intermediate_size: 14336
  max_seq_length: 8192
  activation: swiglu
  dropout: 0.0
  attention_dropout: 0.0
  use_flash_attention: true
  gradient_checkpointing: true  # trade recompute for memory at this scale
# Advanced Features - ALL ENABLED
advanced:
  enable_moe: true
  enable_dre: true
  enable_constitutional: true
  enable_rlhf: false  # Enable after pretraining
  enable_multimodal: true
  dre_warmup_steps: 10000
# MoE Settings - Full Configuration
moe:
  num_knowledge_experts: 64
  num_skill_experts: 32
  num_meta_experts: 16
  num_safety_experts: 8
  moe_top_k: 2  # experts routed per token
  expert_capacity: 1.25  # capacity factor (fraction of mean tokens/expert)
# Multimodal Settings
multimodal:
  image_size: 224
  patch_size: 14
  audio_sample_rate: 16000  # Hz
# Training Configuration
training:
  batch_size: 32  # per-device micro-batch
  gradient_accumulation_steps: 4
  # NOTE: written with an explicit decimal point — bare "3e-5" is resolved
  # as a *string*, not a float, by YAML 1.1 loaders such as PyYAML.
  learning_rate: 3.0e-5
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  warmup_steps: 10000
  max_steps: 1000000
  num_epochs: 3
  gradient_clipping: 1.0
  use_amp: true  # mixed-precision training
# Distributed Training - 4D Parallelism
# (data x tensor x pipeline x expert = 4 * 2 * 2 * 2 = 32-way layout)
distributed:
  enabled: true
  use_4d_parallelism: true
  data_parallel_size: 4
  tensor_parallel_size: 2
  pipeline_parallel_size: 2
  expert_parallel_size: 2
  zero_stage: 3  # ZeRO-3: partition optimizer state, gradients, and parameters
  # NOTE(review): directory is "./config/" but this file lives under
  # "configs/" — confirm the path is intentional.
  deepspeed_config: ./config/deepspeed_z3.json
  launcher: deepspeed
# Data Configuration
data:
  dataset: pile
  # Sampling mix as "name:weight" pairs; weights sum to 1.0.
  mix_datasets: "wikitext:0.2,openwebtext:0.3,pile:0.4,c4:0.1"
  tokenizer_name: gpt2
  max_samples: null  # no cap
  train_samples: 10000000
  val_samples: 50000
  num_workers: 16
  streaming: true  # avoid materializing the full corpus on disk
# Evaluation
evaluation:
  # Units not stated here — presumably epochs or N-thousand steps; confirm
  # against the trainer's eval loop.
  eval_frequency: 5
# RLHF Configuration (for fine-tuning phase)
# Inactive until advanced.enable_rlhf is flipped to true.
rlhf:
  rlhf_frequency: 5
  rlhf_iterations: 100
  rlhf_steps_per_iteration: 1000
  ppo_epochs: 4
  ppo_batch_size: 32
# Logging
logging:
  use_mlflow: true
  # Quoted: the value contains a ":" URI scheme separator.
  mlflow_tracking_uri: "file:./mlruns"
  mlflow_experiment: UltraThink-Large
  run_name: large_model_training
# Output
output:
  output_dir: ./outputs/large_model