Trouter-Library committed on
Commit
a4013c5
·
verified ·
1 Parent(s): 06cb8fc

Create training_config.json

Browse files
Files changed (1) hide show
  1. training_config.json +143 -0
training_config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name_or_path": "meta-llama/Meta-Llama-3.1-70B",
3
+ "output_dir": "./checkpoints/helion-2.5-rnd",
4
+ "overwrite_output_dir": true,
5
+ "do_train": true,
6
+ "do_eval": true,
7
+ "evaluation_strategy": "steps",
8
+ "eval_steps": 500,
9
+ "per_device_train_batch_size": 4,
10
+ "per_device_eval_batch_size": 4,
11
+ "gradient_accumulation_steps": 8,
12
+ "learning_rate": 2e-05,
13
+ "weight_decay": 0.01,
14
+ "adam_beta1": 0.9,
15
+ "adam_beta2": 0.999,
16
+ "adam_epsilon": 1e-08,
17
+ "max_grad_norm": 1.0,
18
+ "num_train_epochs": 3,
19
+ "max_steps": 150000,
20
+ "lr_scheduler_type": "cosine_with_restarts",
21
+ "warmup_steps": 2000,
22
+ "logging_dir": "./logs",
23
+ "logging_strategy": "steps",
24
+ "logging_steps": 10,
25
+ "save_strategy": "steps",
26
+ "save_steps": 1000,
27
+ "save_total_limit": 5,
28
+ "fp16": false,
29
+ "bf16": true,
30
+ "dataloader_num_workers": 8,
31
+ "dataloader_pin_memory": true,
32
+ "gradient_checkpointing": true,
33
+ "gradient_checkpointing_kwargs": {
34
+ "use_reentrant": false
35
+ },
36
+ "deepspeed": {
37
+ "train_batch_size": "auto",
38
+ "train_micro_batch_size_per_gpu": "auto",
39
+ "gradient_accumulation_steps": "auto",
40
+ "gradient_clipping": 1.0,
41
+ "zero_optimization": {
42
+ "stage": 2,
43
+ "offload_optimizer": {
44
+ "device": "cpu",
45
+ "pin_memory": true
46
+ },
47
+ "offload_param": {
48
+ "device": "cpu",
49
+ "pin_memory": true
50
+ },
51
+ "overlap_comm": true,
52
+ "contiguous_gradients": true,
53
+ "reduce_bucket_size": 5e7,
54
+ "stage3_prefetch_bucket_size": 5e7,
55
+ "stage3_param_persistence_threshold": 1e5
56
+ },
57
+ "fp16": {
58
+ "enabled": false
59
+ },
60
+ "bf16": {
61
+ "enabled": true
62
+ },
63
+ "optimizer": {
64
+ "type": "AdamW",
65
+ "params": {
66
+ "lr": "auto",
67
+ "betas": "auto",
68
+ "eps": "auto",
69
+ "weight_decay": "auto"
70
+ }
71
+ },
72
+ "scheduler": {
73
+ "type": "WarmupDecayLR",
74
+ "params": {
75
+ "warmup_min_lr": "auto",
76
+ "warmup_max_lr": "auto",
77
+ "warmup_num_steps": "auto",
78
+ "total_num_steps": "auto"
79
+ }
80
+ },
81
+ "zero_allow_untested_optimizer": true,
82
+ "wall_clock_breakdown": false
83
+ },
84
+ "fsdp": "",
85
+ "fsdp_config": {},
86
+ "report_to": ["tensorboard", "wandb"],
87
+ "run_name": "helion-2.5-rnd",
88
+ "disable_tqdm": false,
89
+ "remove_unused_columns": false,
90
+ "label_names": ["labels"],
91
+ "load_best_model_at_end": true,
92
+ "metric_for_best_model": "eval_loss",
93
+ "greater_is_better": false,
94
+ "ignore_data_skip": false,
95
+ "ddp_timeout": 1800,
96
+ "torch_compile": false,
97
+ "torch_compile_backend": "inductor",
98
+ "torch_compile_mode": null,
99
+ "optim": "adamw_torch_fused",
100
+ "group_by_length": false,
101
+ "length_column_name": "length",
102
+ "ddp_find_unused_parameters": false,
103
+ "ddp_bucket_cap_mb": null,
104
+ "ddp_broadcast_buffers": null,
105
+ "dataset_config": {
106
+ "scientific_papers": {
107
+ "path": "arxiv_papers",
108
+ "split": "train",
109
+ "weight": 0.25
110
+ },
111
+ "code_repositories": {
112
+ "path": "the-stack-dedup",
113
+ "split": "train",
114
+ "weight": 0.25
115
+ },
116
+ "mathematical_proofs": {
117
+ "path": "math_qa",
118
+ "split": "train",
119
+ "weight": 0.15
120
+ },
121
+ "conversational_data": {
122
+ "path": "sharegpt",
123
+ "split": "train",
124
+ "weight": 0.20
125
+ },
126
+ "multilingual_corpus": {
127
+ "path": "mc4",
128
+ "split": "train",
129
+ "weight": 0.15
130
+ }
131
+ },
132
+ "data_preprocessing": {
133
+ "max_seq_length": 131072,
134
+ "truncation": true,
135
+ "padding": "max_length",
136
+ "return_tensors": "pt"
137
+ },
138
+ "model_config_updates": {
139
+ "use_cache": false,
140
+ "attention_dropout": 0.0,
141
+ "hidden_dropout": 0.0
142
+ }
143
+ }