Trouter-Library committed
Commit 4ba76d2 · verified · 1 Parent(s): 0c9c45e

Create training_args.yaml

Files changed (1)
  1. training_args.yaml +119 -0
training_args.yaml ADDED
@@ -0,0 +1,119 @@
+ # Training Configuration for Helion-V2
+
+ # Model Configuration
+ model_name_or_path: "DeepXR/Helion-V2-base"
+ model_type: "helion"
+ tokenizer_name: "DeepXR/Helion-V2-tokenizer"
+
+ # Data Configuration
+ dataset_name: "DeepXR/helion-training-data"
+ dataset_config_name: null
+ train_file: null
+ validation_file: null
+ test_file: null
+ max_seq_length: 8192
+ preprocessing_num_workers: 64
+ overwrite_cache: false
+ validation_split_percentage: 1
+
+ # Training Arguments
+ output_dir: "./helion-v2-checkpoints"
+ overwrite_output_dir: true
+ do_train: true
+ do_eval: true
+ do_predict: false
+ evaluation_strategy: "steps"
+ eval_steps: 500
+ per_device_train_batch_size: 4
+ per_device_eval_batch_size: 8
+ gradient_accumulation_steps: 32
+ eval_accumulation_steps: 1
+ learning_rate: 3.0e-4
+ weight_decay: 0.01
+ adam_beta1: 0.9
+ adam_beta2: 0.95
+ adam_epsilon: 1.0e-8
+ max_grad_norm: 1.0
+ num_train_epochs: 3
+ max_steps: -1
+ lr_scheduler_type: "cosine"
+ warmup_ratio: 0.03
+ warmup_steps: 2000  # a non-zero warmup_steps overrides warmup_ratio in transformers
+ log_level: "info"
+ logging_dir: "./logs"
+ logging_strategy: "steps"
+ logging_steps: 10
+ save_strategy: "steps"
+ save_steps: 1000
+ save_total_limit: 3
+ seed: 42
+ data_seed: 42
+ bf16: true
+ fp16: false
+ tf32: true
+ dataloader_num_workers: 8
+ dataloader_pin_memory: true
+ remove_unused_columns: false
+ label_names: ["labels"]
+ load_best_model_at_end: true
+ metric_for_best_model: "eval_loss"
+ greater_is_better: false
+ ignore_data_skip: false
+ ddp_find_unused_parameters: false
+ ddp_bucket_cap_mb: 25
+ dataloader_drop_last: false
+ # eval_steps: 500  # duplicate key (already set above); commented out
+ save_safetensors: true
+ push_to_hub: false
+ hub_private_repo: true
+ gradient_checkpointing: true
+ include_inputs_for_metrics: false
+ auto_find_batch_size: false
+ full_determinism: false
+ report_to: ["tensorboard", "wandb"]
+
+ # DeepSpeed Configuration
+ deepspeed: "./ds_config_zero3.json"
+
+ # Optimization
+ optim: "adamw_torch"
+ group_by_length: true
+ length_column_name: "length"
+
+ # Model-specific Settings
+ torch_dtype: "bfloat16"
+ low_cpu_mem_usage: true
+ use_flash_attention_2: true
+ attention_dropout: 0.0
+ residual_dropout: 0.0
+
+ # Resume Training
+ resume_from_checkpoint: null
+ ignore_mismatched_sizes: false
+
+ # Distributed Training
+ local_rank: -1
+ ddp_backend: "nccl"
+ sharded_ddp: []
+ fsdp: []
+ fsdp_config: null
+
+ # Evaluation
+ prediction_loss_only: false
+ # per_device_eval_batch_size: 8  # duplicate key (already set above); commented out
+ eval_delay: 0
+
+ # Callbacks
+ early_stopping_patience: null
+ early_stopping_threshold: 0.0
+
+ # Tokenization
+ padding: "max_length"
+ truncation: true
+ return_overflowing_tokens: false
+ return_length: false
+
+ # Additional Training Settings
+ max_steps_per_epoch: null
+ gradient_checkpointing_kwargs:
+   use_reentrant: false
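
For context on how a file like this is typically consumed: it mixes `transformers.TrainingArguments` fields with model-, data-, and tokenization-level settings, so a training script has to split them before building the trainer. Below is a minimal, hypothetical sketch of one way to do that split, assuming PyYAML and transformers are installed; the filtering logic is an illustration, not code from this repository.

```python
# Hypothetical loader for training_args.yaml (not part of this commit).
import dataclasses

import yaml
from transformers import TrainingArguments

with open("training_args.yaml") as fh:
    config = yaml.safe_load(fh)

# TrainingArguments rejects unknown keyword arguments, so keep only the
# fields its dataclass defines; the rest (model_name_or_path, padding, ...)
# belongs to the model/data/tokenizer parts of the script.
trainer_fields = {f.name for f in dataclasses.fields(TrainingArguments) if f.init}
trainer_config = {k: v for k, v in config.items() if k in trainer_fields}

# The deepspeed entry points at ./ds_config_zero3.json, which transformers
# reads at construction time; drop it so this sketch runs standalone.
trainer_config.pop("deepspeed", None)

training_args = TrainingArguments(**trainer_config)
print(training_args.learning_rate)  # 0.0003
print(training_args.warmup_steps)   # 2000 (overrides warmup_ratio)
```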
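The referenced `./ds_config_zero3.json` is not included in this commit. A plausible minimal ZeRO-3 configuration, written here as a Python dict since transformers also accepts a dict in place of the JSON path, might look like the following; the `"auto"` values defer to the matching TrainingArguments fields.

```python
# Hypothetical stand-in for ds_config_zero3.json (not part of this commit);
# pass it as TrainingArguments(deepspeed=ds_config_zero3, ...) or dump it
# to JSON at the path the YAML expects.
ds_config_zero3 = {
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": True,
        "contiguous_gradients": True,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    "bf16": {"enabled": "auto"},
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}
```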