---
# Helion v15-XL training configuration
# Model architecture
model:
  model_type: helion
  vocab_size: 100000
  hidden_size: 6144
  intermediate_size: 24576  # 4x hidden_size
  num_hidden_layers: 48
  num_attention_heads: 32
  num_key_value_heads: 8  # grouped-query attention: 32 / 8 = 4 query heads per KV head
  max_position_embeddings: 16384
  rope_theta: 10000.0
  # NOTE(review): linear RoPE scaling with factor 2.0 — presumably extends an
  # 8192 base context to the 16384 max_position_embeddings above; confirm.
  rope_scaling:
    type: linear
    factor: 2.0
  hidden_act: silu
  initializer_range: 0.02
  rms_norm_eps: 1.0e-6
  use_cache: true
  tie_word_embeddings: false
  attention_bias: false
  attention_dropout: 0.0

# Training hyperparameters
training:
  # Optimizer
  optimizer: adamw
  learning_rate: 3.0e-4
  weight_decay: 0.1
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-8
  max_grad_norm: 1.0

  # Learning-rate schedule
  lr_scheduler_type: cosine
  warmup_steps: 2000
  min_learning_rate: 3.0e-5  # 10% of peak learning_rate

  # Batch sizing
  per_device_train_batch_size: 32
  gradient_accumulation_steps: 8
  # NOTE(review): units unclear — 32 seqs/device x 8 accum x 512 GPUs =
  # 131072 sequences per step, and 4194304 = 131072 x 32. Confirm whether
  # this value is tokens, sequences, or derived differently by the trainer.
  global_batch_size: 4194304
  max_sequence_length: 4096

  # Step schedule (875000 = sum of stage steps: 750000 + 80000 + 45000)
  max_steps: 875000
  save_steps: 5000
  eval_steps: 1000
  logging_steps: 100

  # Numeric precision
  fp16: false
  bf16: true
  tf32: true

  # Distributed training via FSDP
  distributed_strategy: fsdp
  fsdp_config:
    fsdp_transformer_layer_cls_to_wrap: HelionDecoderLayer
    fsdp_backward_prefetch: backward_pre
    fsdp_state_dict_type: FULL_STATE_DICT
    fsdp_cpu_offload: false

  # Activation recomputation to save memory
  gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false

  # Graph compilation
  torch_compile: true
  torch_compile_backend: inductor
  torch_compile_mode: max-autotune

# Data mixture and preprocessing (dataset weights sum to 1.00)
data:
  datasets:
    - name: web_text
      weight: 0.45
      sources:
        - common_crawl_filtered
        - c4
        - redpajama_web

    - name: books
      weight: 0.20
      sources:
        - books3
        - gutenberg
        - bookcorpus

    - name: code
      weight: 0.15
      sources:
        - github_code
        - stack_overflow
        - starcoder_data

    - name: scientific
      weight: 0.10
      sources:
        - arxiv
        - pubmed
        - semantic_scholar

    - name: instruction
      weight: 0.08
      sources:
        - openorca
        - ultrachat
        - wizardlm
        - alpaca

    - name: multilingual
      weight: 0.02
      sources:
        - mc4_multilingual
        - wikipedia_multilingual

  # Tokenization
  preprocessing:
    tokenizer: helion_tokenizer
    max_length: 4096  # matches training.max_sequence_length
    padding: false
    truncation: true

  # Corpus-level quality filtering.
  # NOTE(review): list items mix two-key mappings (flag + threshold) with
  # bare single-key items — confirm the consumer's schema expects this shape.
  quality_filters:
    - deduplication: true
      dedup_threshold: 0.85
    - min_token_length: 50
    - max_token_length: 8192
    - perplexity_filter: true
      perplexity_threshold: 1500
    - toxicity_filter: true
      toxicity_threshold: 0.5
    - pii_removal: true

# Hardware, storage, and logging
infrastructure:
  # Compute (512 = 64 nodes x 8 GPUs/node)
  num_gpus: 512
  gpu_type: A100-80GB
  num_nodes: 64
  gpus_per_node: 8

  # Networking
  interconnect: infiniband
  bandwidth_per_gpu: 400  # NOTE(review): units not stated — presumably Gb/s; confirm
  communication_backend: nccl

  # Storage paths
  checkpoint_dir: /mnt/checkpoints/helion-v15-xl
  data_dir: /mnt/data/training_corpus
  tensorboard_dir: /mnt/logs/tensorboard

  # Experiment tracking
  wandb_project: helion-v15-xl
  wandb_entity: deepxr-research
  log_level: info

# Evaluation benchmarks
evaluation:
  eval_datasets:
    - mmlu
    - hellaswag
    - arc_challenge
    - arc_easy
    - truthfulqa
    - gsm8k
    - humaneval
    - mbpp

  eval_batch_size: 16
  eval_accumulation_steps: 4

  # Few-shot prompt counts per benchmark.
  # NOTE(review): truthfulqa has no entry here — confirm the harness default
  # (commonly 0-shot) is intended for it.
  few_shot_examples:
    mmlu: 5
    hellaswag: 10
    arc_challenge: 25
    arc_easy: 25
    gsm8k: 8
    humaneval: 0
    mbpp: 0

# Sequential training stages (step counts sum to training.max_steps = 875000).
# NOTE(review): data.datasets defines a multilingual mixture (weight 0.02)
# that no stage's data_mix references — confirm whether it is consumed
# elsewhere or should appear in a stage.
stages:
  # Stage 1: broad pretraining on the general corpus
  - name: pretraining
    steps: 750000
    data_mix: [web_text, books, code, scientific]
    learning_rate: 3.0e-4

  # Stage 2: continued training focused on code and scientific text
  - name: domain_adaptation
    steps: 80000
    data_mix: [code, scientific]
    learning_rate: 1.0e-4

  # Stage 3: instruction tuning at reduced LR
  - name: instruction_tuning
    steps: 45000
    data_mix: [instruction]
    learning_rate: 5.0e-5
    lr_scheduler_type: linear  # per-stage override of the global cosine schedule

# Checkpoint retention and resumption
checkpointing:
  save_total_limit: 10
  save_strategy: steps
  load_best_model_at_end: true
  metric_for_best_model: eval_loss
  greater_is_better: false  # lower eval_loss is better

  resume_from_checkpoint: null  # set to a checkpoint path to resume a specific run
  auto_resume: true

# Memory and throughput optimizations
optimization:
  # NOTE(review): duplicates training.gradient_checkpointing — confirm which
  # key the trainer actually reads so the two cannot drift apart.
  activation_checkpointing: true
  cpu_offload: false
  # NOTE(review): zero_stage is a DeepSpeed ZeRO setting, but
  # training.distributed_strategy is fsdp — confirm this key is not dead
  # config, or that the trainer maps it onto FSDP sharding.
  zero_stage: 2

  # Attention kernel
  use_flash_attention: true
  flash_attention_version: 2

  # Fused optimizer kernels
  fused_adam: true
  fused_lamb: false

  # Overlap gradient communication with computation
  overlap_communication: true
  bucket_size_mb: 25

# Safety and alignment settings
safety:
  # Classifiers applied to content
  content_filters:
    - toxicity_classifier
    - bias_detector
    - pii_detector

  constitutional_principles:
    - harmlessness
    - helpfulness
    - honesty

  # RLHF disabled for this run; PPO parameters retained for a later phase
  rlhf:
    enabled: false
    reward_model: null
    ppo_epochs: 4
    kl_coefficient: 0.1