---
# ============================================================
# Helion-V2 training configuration
# ============================================================

# --- Model ---
model_name_or_path: "DeepXR/Helion-V2-base"
model_type: "helion"
tokenizer_name: "DeepXR/Helion-V2-tokenizer"
# --- Data ---
dataset_name: "DeepXR/helion-training-data"
dataset_config_name: null
train_file: null
validation_file: null
test_file: null
max_seq_length: 8192
preprocessing_num_workers: 64
overwrite_cache: false
# Percent of the train split held out for validation when no explicit
# validation_file is provided.
validation_split_percentage: 1
# --- Training ---
output_dir: "./helion-v2-checkpoints"
overwrite_output_dir: true
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "steps"
eval_steps: 500
per_device_train_batch_size: 4
per_device_eval_batch_size: 8
gradient_accumulation_steps: 32
eval_accumulation_steps: 1
learning_rate: 3.0e-4
weight_decay: 0.01
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1.0e-8
max_grad_norm: 1.0
num_train_epochs: 3
max_steps: -1  # -1 → total steps derived from num_train_epochs
lr_scheduler_type: "cosine"
# NOTE(review): in HF TrainingArguments a warmup_steps > 0 overrides
# warmup_ratio, so warmup_ratio is effectively ignored here — confirm
# which of the two is intended and drop the other.
warmup_ratio: 0.03
warmup_steps: 2000
log_level: "info"
logging_dir: "./logs"
logging_strategy: "steps"
logging_steps: 10
save_strategy: "steps"
save_steps: 1000  # must remain a multiple of eval_steps for load_best_model_at_end
save_total_limit: 3
seed: 42
data_seed: 42
bf16: true
fp16: false
tf32: true
dataloader_num_workers: 8
dataloader_pin_memory: true
remove_unused_columns: false
label_names: ["labels"]
load_best_model_at_end: true
metric_for_best_model: "eval_loss"
greater_is_better: false
ignore_data_skip: false
ddp_find_unused_parameters: false
ddp_bucket_cap_mb: 25
dataloader_drop_last: false
# (duplicate "eval_steps: 500" removed — key is defined once above;
# duplicate keys are invalid per YAML 1.2 and silently last-win in
# most parsers)
save_safetensors: true
push_to_hub: false
hub_private_repo: true
gradient_checkpointing: true
include_inputs_for_metrics: false
auto_find_batch_size: false
full_determinism: false
report_to: ["tensorboard", "wandb"]
# --- DeepSpeed ---
# Path to the ZeRO stage-3 DeepSpeed config consumed by the Trainer.
deepspeed: "./ds_config_zero3.json"
# --- Optimizer ---
optim: "adamw_torch"
group_by_length: true
length_column_name: "length"
# --- Model loading ---
torch_dtype: "bfloat16"
low_cpu_mem_usage: true
use_flash_attention_2: true
attention_dropout: 0.0
residual_dropout: 0.0
# --- Checkpointing / resume ---
resume_from_checkpoint: null
ignore_mismatched_sizes: false
# --- Distributed training ---
local_rank: -1  # -1 → not launched by a distributed launcher
ddp_backend: "nccl"
# NOTE(review): sharded_ddp is deprecated in recent transformers releases
# in favor of fsdp — confirm the target library version accepts it.
sharded_ddp: []
fsdp: []
fsdp_config: null
# --- Evaluation ---
prediction_loss_only: false
# (duplicate "per_device_eval_batch_size: 8" removed — already set once
# in the training section; duplicate keys are invalid per YAML 1.2)
eval_delay: 0
# --- Early stopping ---
early_stopping_patience: null  # null → early stopping disabled
early_stopping_threshold: 0.0
# --- Tokenization ---
padding: "max_length"
truncation: true
return_overflowing_tokens: false
return_length: false
# --- Misc ---
max_steps_per_epoch: null
gradient_checkpointing_kwargs:
  # Non-reentrant checkpointing is required for use with DeepSpeed ZeRO-3.
  use_reentrant: false