---
# Helion v15-XL training configuration
# Model architecture
model:
  model_type: helion
  vocab_size: 100000
  hidden_size: 6144
  intermediate_size: 24576  # 4x hidden_size
  num_hidden_layers: 48
  num_attention_heads: 32
  num_key_value_heads: 8  # grouped-query attention: 32 / 8 = 4 query heads per KV head
  max_position_embeddings: 16384
  rope_theta: 10000.0
  # NOTE(review): linear RoPE scaling with factor 2.0 — presumably extends an
  # 8192 base context to the 16384 max_position_embeddings above; confirm.
  rope_scaling:
    type: linear
    factor: 2.0
  hidden_act: silu
  initializer_range: 0.02
  rms_norm_eps: 1.0e-6
  use_cache: true
  tie_word_embeddings: false
  attention_bias: false
  attention_dropout: 0.0

# Training hyperparameters
training:
  # Optimizer
  optimizer: adamw
  learning_rate: 3.0e-4
  weight_decay: 0.1
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning-rate schedule
  lr_scheduler_type: cosine
  warmup_steps: 2000
  min_learning_rate: 3.0e-5  # 10% of peak learning_rate

  # Batch sizing
  per_device_train_batch_size: 32
  gradient_accumulation_steps: 8
  # NOTE(review): units unclear — 32 seqs/device x 8 accum x 512 GPUs =
  # 131072 sequences per step, and 4194304 = 131072 x 32. Confirm whether
  # this value is tokens, sequences, or derived differently by the trainer.
  global_batch_size: 4194304
  max_sequence_length: 4096

  # Step schedule (875000 = sum of stage steps: 750000 + 80000 + 45000)
  max_steps: 875000
  save_steps: 5000
  eval_steps: 1000
  logging_steps: 100

  # Numeric precision
  fp16: false
  bf16: true
  tf32: true

  # Distributed training via FSDP
  distributed_strategy: fsdp
  fsdp_config:
    fsdp_transformer_layer_cls_to_wrap: HelionDecoderLayer
    fsdp_backward_prefetch: backward_pre
    fsdp_state_dict_type: FULL_STATE_DICT
    fsdp_cpu_offload: false

  # Activation recomputation to save memory
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false

  # Graph compilation
  torch_compile: true
  torch_compile_backend: inductor
  torch_compile_mode: max-autotune

# Data mixture and preprocessing (dataset weights sum to 1.00)
data:
  datasets:
    - name: web_text
      weight: 0.45
      sources:
        - common_crawl_filtered
        - c4
        - redpajama_web

    - name: books
      weight: 0.20
      sources:
        - books3
        - gutenberg
        - bookcorpus

    - name: code
      weight: 0.15
      sources:
        - github_code
        - stack_overflow
        - starcoder_data

    - name: scientific
      weight: 0.10
      sources:
        - arxiv
        - pubmed
        - semantic_scholar

    - name: instruction
      weight: 0.08
      sources:
        - openorca
        - ultrachat
        - wizardlm
        - alpaca

    - name: multilingual
      weight: 0.02
      sources:
        - mc4_multilingual
        - wikipedia_multilingual

  # Tokenization
  preprocessing:
    tokenizer: helion_tokenizer
    max_length: 4096  # matches training.max_sequence_length
    padding: false
    truncation: true

  # Corpus-level quality filtering.
  # NOTE(review): list items mix two-key mappings (flag + threshold) with
  # bare single-key items — confirm the consumer's schema expects this shape.
  quality_filters:
    - deduplication: true
      dedup_threshold: 0.85
    - min_token_length: 50
    - max_token_length: 8192
    - perplexity_filter: true
      perplexity_threshold: 1500
    - toxicity_filter: true
      toxicity_threshold: 0.5
    - pii_removal: true

# Hardware, storage, and logging
infrastructure:
  # Compute (512 = 64 nodes x 8 GPUs/node)
  num_gpus: 512
  gpu_type: A100-80GB
  num_nodes: 64
  gpus_per_node: 8

  # Networking
  interconnect: infiniband
  bandwidth_per_gpu: 400  # NOTE(review): units not stated — presumably Gb/s; confirm
  communication_backend: nccl

  # Storage paths
  checkpoint_dir: /mnt/checkpoints/helion-v15-xl
  data_dir: /mnt/data/training_corpus
  tensorboard_dir: /mnt/logs/tensorboard

  # Experiment tracking
  wandb_project: helion-v15-xl
  wandb_entity: deepxr-research
  log_level: info

# Evaluation benchmarks
evaluation:
  eval_datasets:
    - mmlu
    - hellaswag
    - arc_challenge
    - arc_easy
    - truthfulqa
    - gsm8k
    - humaneval
    - mbpp

  eval_batch_size: 16
  eval_accumulation_steps: 4

  # Few-shot prompt counts per benchmark.
  # NOTE(review): truthfulqa has no entry here — confirm the harness default
  # (commonly 0-shot) is intended for it.
  few_shot_examples:
    mmlu: 5
    hellaswag: 10
    arc_challenge: 25
    arc_easy: 25
    gsm8k: 8
    humaneval: 0
    mbpp: 0

# Sequential training stages (step counts sum to training.max_steps = 875000).
# NOTE(review): data.datasets defines a multilingual mixture (weight 0.02)
# that no stage's data_mix references — confirm whether it is consumed
# elsewhere or should appear in a stage.
stages:
  # Stage 1: broad pretraining on the general corpus
  - name: pretraining
    steps: 750000
    data_mix: [web_text, books, code, scientific]
    learning_rate: 3.0e-4

  # Stage 2: continued training focused on code and scientific text
  - name: domain_adaptation
    steps: 80000
    data_mix: [code, scientific]
    learning_rate: 1.0e-4

  # Stage 3: instruction tuning at reduced LR
  - name: instruction_tuning
    steps: 45000
    data_mix: [instruction]
    learning_rate: 5.0e-5
    lr_scheduler_type: linear  # per-stage override of the global cosine schedule

# Checkpoint retention and resumption
checkpointing:
  save_total_limit: 10
  save_strategy: steps
  load_best_model_at_end: true
  metric_for_best_model: eval_loss
  greater_is_better: false  # lower eval_loss is better

  resume_from_checkpoint: null  # set to a checkpoint path to resume a specific run
  auto_resume: true

# Memory and throughput optimizations
optimization:
  # NOTE(review): duplicates training.gradient_checkpointing — confirm which
  # key the trainer actually reads so the two cannot drift apart.
  activation_checkpointing: true
  cpu_offload: false
  # NOTE(review): zero_stage is a DeepSpeed ZeRO setting, but
  # training.distributed_strategy is fsdp — confirm this key is not dead
  # config, or that the trainer maps it onto FSDP sharding.
  zero_stage: 2

  # Attention kernel
  use_flash_attention: true
  flash_attention_version: 2

  # Fused optimizer kernels
  fused_adam: true
  fused_lamb: false

  # Overlap gradient communication with computation
  overlap_communication: true
  bucket_size_mb: 25

# Safety and alignment settings
safety:
  # Classifiers applied to content
  content_filters:
    - toxicity_classifier
    - bias_detector
    - pii_detector

  constitutional_principles:
    - harmlessness
    - helpfulness
    - honesty

  # RLHF disabled for this run; PPO parameters retained for a later phase
  rlhf:
    enabled: false
    reward_model: null
    ppo_epochs: 4
    kl_coefficient: 0.1