# Helion-V1.5-XL Training Configuration
# Model Architecture
model:
  model_type: helion
  vocab_size: 100000
  hidden_size: 6144
  intermediate_size: 24576
  num_hidden_layers: 48
  num_attention_heads: 32
  num_key_value_heads: 8
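  # 32 query heads sharing 8 key/value heads is grouped-query attention
  # (4 query heads per KV head), cutting KV-cache size 4x vs. full multi-head attention.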
  max_position_embeddings: 16384
  rope_theta: 10000.0
  rope_scaling:
    type: linear
    factor: 2.0
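  # Linear RoPE scaling with factor 2.0 interpolates position indices by 1/2,
  # roughly doubling the context the base rotary embedding covers.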
  hidden_act: silu
  initializer_range: 0.02
  rms_norm_eps: 1.0e-6
  use_cache: true
  tie_word_embeddings: false
  attention_bias: false
  attention_dropout: 0.0

# Training Configuration
training:
  # Optimization
  optimizer: adamw
  learning_rate: 3.0e-4
  weight_decay: 0.1
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning Rate Schedule
  lr_scheduler_type: cosine
  warmup_steps: 2000
  min_learning_rate: 3.0e-5

  # Batch Configuration
  per_device_train_batch_size: 32
  gradient_accumulation_steps: 8
  global_batch_size: 4194304  # in tokens
  max_sequence_length: 4096
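  # 4,194,304 tokens per optimizer step = 1,024 sequences of 4,096 tokens each.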
  # Training Steps
  max_steps: 875000
  save_steps: 5000
  eval_steps: 1000
  logging_steps: 100

  # Mixed Precision
  fp16: false
  bf16: true
  tf32: true

  # Distributed Training
  distributed_strategy: fsdp
  fsdp_config:
    fsdp_transformer_layer_cls_to_wrap: HelionDecoderLayer
    fsdp_backward_prefetch: backward_pre
    fsdp_state_dict_type: FULL_STATE_DICT
    fsdp_cpu_offload: false

  # Gradient Checkpointing
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
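  # use_reentrant: false selects PyTorch's non-reentrant activation checkpointing,
  # the implementation the PyTorch docs recommend for new code.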
  # Compilation
  torch_compile: true
  torch_compile_backend: inductor
  torch_compile_mode: max-autotune

# Data Configuration
data:
  # Dataset Mixing Ratios
  datasets:
    - name: web_text
      weight: 0.45
      sources:
        - common_crawl_filtered
        - c4
        - redpajama_web
    - name: books
      weight: 0.20
      sources:
        - books3
        - gutenberg
        - bookcorpus
    - name: code
      weight: 0.15
      sources:
        - github_code
        - stack_overflow
        - starcoder_data
    - name: scientific
      weight: 0.10
      sources:
        - arxiv
        - pubmed
        - semantic_scholar
    - name: instruction
      weight: 0.08
      sources:
        - openorca
        - ultrachat
        - wizardlm
        - alpaca
    - name: multilingual
      weight: 0.02
      sources:
        - mc4_multilingual
        - wikipedia_multilingual
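  # Mixture weights sum to 1.0 (0.45 + 0.20 + 0.15 + 0.10 + 0.08 + 0.02).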
  # Data Processing
  preprocessing:
    tokenizer: helion_tokenizer
    max_length: 4096
    padding: false
    truncation: true

  # Data Quality
  quality_filters:
    - deduplication: true
      dedup_threshold: 0.85
    - min_token_length: 50
    - max_token_length: 8192
    - perplexity_filter: true
      perplexity_threshold: 1500
    - toxicity_filter: true
      toxicity_threshold: 0.5
    - pii_removal: true

# Infrastructure
infrastructure:
  # Compute
  num_gpus: 512
  gpu_type: A100-80GB
  num_nodes: 64
  gpus_per_node: 8
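  # 64 nodes x 8 GPUs per node = 512 GPUs, matching num_gpus above.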
  # Networking
  interconnect: infiniband
  bandwidth_per_gpu: 400  # Gbps
  communication_backend: nccl

  # Storage
  checkpoint_dir: /mnt/checkpoints/helion-v15-xl
  data_dir: /mnt/data/training_corpus
  tensorboard_dir: /mnt/logs/tensorboard

  # Monitoring
  wandb_project: helion-v15-xl
  wandb_entity: deepxr-research
  log_level: info

# Evaluation
evaluation:
  eval_datasets:
    - mmlu
    - hellaswag
    - arc_challenge
    - arc_easy
    - truthfulqa
    - gsm8k
    - humaneval
    - mbpp
  eval_batch_size: 16
  eval_accumulation_steps: 4

  # Few-shot Configuration
  few_shot_examples:
    mmlu: 5
    hellaswag: 10
    arc_challenge: 25
    arc_easy: 25
    gsm8k: 8
    humaneval: 0
    mbpp: 0
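  # Shot counts mirror common eval-harness defaults (5-shot MMLU, 10-shot HellaSwag,
  # 25-shot ARC, 8-shot GSM8K); HumanEval and MBPP are evaluated zero-shot.
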
# Fine-tuning Stages
stages:
  # Stage 1: Pre-training
  - name: pretraining
    steps: 750000
    data_mix: [web_text, books, code, scientific]
    learning_rate: 3.0e-4

  # Stage 2: Domain Adaptation
  - name: domain_adaptation
    steps: 80000
    data_mix: [code, scientific]
    learning_rate: 1.0e-4

  # Stage 3: Instruction Tuning
  - name: instruction_tuning
    steps: 45000
    data_mix: [instruction]
    learning_rate: 5.0e-5
    lr_scheduler_type: linear
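  # Stage steps total 750,000 + 80,000 + 45,000 = 875,000, matching training.max_steps.
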
# Checkpointing
checkpointing:
  save_total_limit: 10
  save_strategy: steps
  load_best_model_at_end: true
  metric_for_best_model: eval_loss
  greater_is_better: false

  # Resume Training
  resume_from_checkpoint: null
  auto_resume: true

# Hardware Optimization
optimization:
  # Memory Optimization
  activation_checkpointing: true
  cpu_offload: false
  zero_stage: 2

  # Flash Attention
  use_flash_attention: true
  flash_attention_version: 2

  # Kernel Fusion
  fused_adam: true
  fused_lamb: false

  # Communication
  overlap_communication: true
  bucket_size_mb: 25

# Safety and Alignment
safety:
  # Content Filtering
  content_filters:
    - toxicity_classifier
    - bias_detector
    - pii_detector

  # Constitutional AI
  constitutional_principles:
    - harmlessness
    - helpfulness
    - honesty

# RLHF Configuration
rlhf:
  enabled: false
  reward_model: null
  ppo_epochs: 4
  kl_coefficient: 0.1