# Training Configuration for Helion-V2
# Model Configuration
model_name_or_path: "DeepXR/Helion-V2-base"
model_type: "helion"
tokenizer_name: "DeepXR/Helion-V2-tokenizer"
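# NOTE: these are assumed to be Hub repo IDs resolved via from_pretrained() by the training script.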
# Data Configuration
dataset_name: "DeepXR/helion-training-data"
dataset_config_name: null
train_file: null
validation_file: null
test_file: null
max_seq_length: 8192
preprocessing_num_workers: 64
overwrite_cache: false
validation_split_percentage: 1
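# With no explicit validation_file, HF example scripts typically hold out
# validation_split_percentage (here 1%) of the train split for evaluation.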
# Training Arguments
output_dir: "./helion-v2-checkpoints"
overwrite_output_dir: true
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "steps"
eval_steps: 500
per_device_train_batch_size: 4
per_device_eval_batch_size: 8
gradient_accumulation_steps: 32
eval_accumulation_steps: 1
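# Effective per-device batch = 4 * 32 = 128 sequences per optimizer step;
# the global batch scales with world size (e.g., 8 GPUs -> 1024).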
learning_rate: 3.0e-4
weight_decay: 0.01
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1.0e-8
max_grad_norm: 1.0
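# betas (0.9, 0.95) and eps 1e-8 follow common large-LM pretraining practice (e.g., GPT-3-style runs).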
num_train_epochs: 3
max_steps: -1
lr_scheduler_type: "cosine"
warmup_ratio: 0.03
warmup_steps: 2000
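# NOTE: the HF Trainer ignores warmup_ratio whenever warmup_steps > 0,
# so warmup_steps: 2000 takes precedence here.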
log_level: "info"
logging_dir: "./logs"
logging_strategy: "steps"
logging_steps: 10
save_strategy: "steps"
save_steps: 1000
save_total_limit: 3
seed: 42
data_seed: 42
bf16: true
fp16: false
tf32: true
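# bf16 and tf32 both require NVIDIA Ampere (compute capability 8.0) or newer;
# keep fp16 off when bf16 is enabled.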
dataloader_num_workers: 8
dataloader_pin_memory: true
remove_unused_columns: false
label_names: ["labels"]
load_best_model_at_end: true
metric_for_best_model: "eval_loss"
greater_is_better: false
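# load_best_model_at_end requires matching eval/save strategies and
# save_steps (1000) to be a round multiple of eval_steps (500), as here.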
ignore_data_skip: false
ddp_find_unused_parameters: false
ddp_bucket_cap_mb: 25
dataloader_drop_last: false
save_safetensors: true
push_to_hub: false
hub_private_repo: true
gradient_checkpointing: true
include_inputs_for_metrics: false
auto_find_batch_size: false
full_determinism: false
report_to: ["tensorboard", "wandb"]
# DeepSpeed Configuration
deepspeed: "./ds_config_zero3.json"
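# The referenced ZeRO-3 config must exist alongside this file; batch size,
# gradient accumulation, and dtype fields there should be set to "auto"
# (or match the values above) to avoid conflicts with the Trainer.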
# Optimization
optim: "adamw_torch"
group_by_length: true
length_column_name: "length"
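# group_by_length buckets samples of similar length to cut padding waste; the sampler
# reads the "length" column if present, otherwise it computes lengths on the fly.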
# Model-specific Settings
torch_dtype: "bfloat16"
low_cpu_mem_usage: true
use_flash_attention_2: true
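# NOTE: use_flash_attention_2 is deprecated in newer transformers releases
# in favor of attn_implementation: "flash_attention_2".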
attention_dropout: 0.0
residual_dropout: 0.0
# Resume Training
resume_from_checkpoint: null
ignore_mismatched_sizes: false
# Distributed Training
local_rank: -1
ddp_backend: "nccl"
sharded_ddp: []
fsdp: []
fsdp_config: null
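# fsdp is left disabled since DeepSpeed is driving sharding; the Trainer does not
# support both at once. sharded_ddp is deprecated and removed in recent transformers.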
# Evaluation
prediction_loss_only: false
eval_delay: 0
# Callbacks
early_stopping_patience: null
early_stopping_threshold: 0.0
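# NOTE: early_stopping_* are not native TrainingArguments; presumably the
# launch script maps them onto an EarlyStoppingCallback.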
# Tokenization
padding: "max_length"
truncation: true
return_overflowing_tokens: false
return_length: false
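# NOTE: these are tokenizer kwargs consumed during preprocessing, not TrainingArguments;
# padding: "max_length" pads every example to max_seq_length (8192).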
# Additional Training Settings
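# NOTE: max_steps_per_epoch is not a standard TrainingArguments field;
# assumed to be read by a custom training loop.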
max_steps_per_epoch: null
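# Non-reentrant activation checkpointing (use_reentrant: false) is the
# recommended mode on PyTorch >= 2.0.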
gradient_checkpointing_kwargs:
  use_reentrant: false