# Large Model Configuration - Full-Scale Production
# Suitable for: multi-node clusters, A100/H100 GPUs (40-80 GB VRAM)

model:
  vocab_size: 100352
  hidden_size: 4096
  num_layers: 32
  num_heads: 32
  num_kv_heads: 8  # grouped-query attention (GQA)
  intermediate_size: 14336
  max_seq_length: 8192
  activation: swiglu
  dropout: 0.0
  attention_dropout: 0.0
  use_flash_attention: true
  gradient_checkpointing: true
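  # Derived geometry (arithmetic from the values above, for sanity-checking):
  #   head_dim = hidden_size / num_heads = 4096 / 32 = 128
  #   GQA ratio = num_heads / num_kv_heads = 32 / 8 = 4 query heads per KV head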

# Advanced Features (all enabled except RLHF, which is deferred until after pretraining)
advanced:
  enable_moe: true
  enable_dre: true
  enable_constitutional: true
  enable_rlhf: false  # Enable after pretraining
  enable_multimodal: true
  dre_warmup_steps: 10000

# MoE Settings - Full Configuration
moe:
  num_knowledge_experts: 64
  num_skill_experts: 32
  num_meta_experts: 16
  num_safety_experts: 8
  moe_top_k: 2
  expert_capacity: 1.25
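  # Note: 64 + 32 + 16 + 8 = 120 experts in total. With moe_top_k: 2, each token
  # is routed to 2 experts; expert_capacity: 1.25 gives each expert ~25% headroom
  # over a perfectly uniform token distribution before tokens are dropped.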

# Multimodal Settings
multimodal:
  image_size: 224
  patch_size: 14
  audio_sample_rate: 16000
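  # Note: 224 / 14 = 16 patches per side, i.e. 16 * 16 = 256 image patches per
  # 224x224 input (plus any class/register tokens the vision encoder may add).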

# Training Configuration
training:
  batch_size: 32
  gradient_accumulation_steps: 4
  learning_rate: 3e-5
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  warmup_steps: 10000
  max_steps: 1000000
  num_epochs: 3
  gradient_clipping: 1.0
  use_amp: true
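  # Note: assuming batch_size is per-device, the effective global batch is
  # batch_size * gradient_accumulation_steps * data_parallel_size
  # = 32 * 4 * 4 = 512 sequences per optimizer step, i.e. up to
  # 512 * 8192 ≈ 4.2M tokens at full sequence length.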

# Distributed Training - 4D Parallelism (data x tensor x pipeline x expert)
distributed:
  enabled: true
  use_4d_parallelism: true
  data_parallel_size: 4
  tensor_parallel_size: 2
  pipeline_parallel_size: 2
  expert_parallel_size: 2
  zero_stage: 3
  deepspeed_config: ./config/deepspeed_z3.json
  launcher: deepspeed
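  # Note: world size = data * tensor * pipeline parallel = 4 * 2 * 2 = 16 GPUs;
  # expert parallelism (size 2) shards MoE experts across existing ranks rather
  # than adding GPUs.
  # Caution: DeepSpeed's pipeline engine is not compatible with ZeRO stages 2-3,
  # so zero_stage: 3 alongside pipeline_parallel_size: 2 would normally need one
  # of the two dropped (zero_stage: 1 with pipelining, or ZeRO-3 without it).
  # Verify against ./config/deepspeed_z3.json before launch.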

# Data Configuration
data:
  dataset: pile
  mix_datasets: "wikitext:0.2,openwebtext:0.3,pile:0.4,c4:0.1"
  tokenizer_name: gpt2
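  # Note: the gpt2 tokenizer has a 50257-token vocabulary, while model.vocab_size
  # is 100352 (a multiple of 128, consistent with a padded ~100k vocabulary such
  # as cl100k); one of the two likely needs to change so the embedding table
  # matches the tokenizer.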
  max_samples: null
  train_samples: 10000000
  val_samples: 50000
  num_workers: 16
  streaming: true

# Evaluation
evaluation:
  eval_frequency: 5
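  # Note: the unit of eval_frequency (steps vs. epochs) is trainer-defined;
  # confirm it before launch, since evaluating every 5 optimizer steps over a
  # 1M-step run would dominate wall-clock time.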

# RLHF Configuration (for the fine-tuning phase)
rlhf:
  rlhf_frequency: 5
  rlhf_iterations: 100
  rlhf_steps_per_iteration: 1000
  ppo_epochs: 4
  ppo_batch_size: 32
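  # Note: 100 iterations * 1000 steps = 100k RLHF steps in total; each collected
  # batch of 32 rollouts is reused for 4 PPO epochs before fresh rollouts.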

# Logging
logging:
  use_mlflow: true
  mlflow_tracking_uri: file:./mlruns
  mlflow_experiment: UltraThink-Large
  run_name: large_model_training
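  # Note: a file: tracking URI writes runs to the local ./mlruns directory; on a
  # multi-node cluster this requires a shared filesystem, or an http:// MLflow
  # tracking server instead.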

# Output
output:
  output_dir: ./outputs/large_model