HectorHe commited on
Commit
57388aa
·
verified ·
1 Parent(s): 43dbce4

Training in progress, step 20

Browse files
config.json CHANGED
@@ -55,7 +55,7 @@
55
  "topk_method": "greedy",
56
  "torch_dtype": "bfloat16",
57
  "transformers_version": "4.49.0",
58
- "use_cache": false,
59
  "v_head_dim": 128,
60
  "vocab_size": 102400
61
  }
 
55
  "topk_method": "greedy",
56
  "torch_dtype": "bfloat16",
57
  "transformers_version": "4.49.0",
58
+ "use_cache": true,
59
  "v_head_dim": 128,
60
  "vocab_size": 102400
61
  }
expert_selection.log ADDED
@@ -0,0 +1,555 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-04-24 23:33:57 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
2
+ 2025-04-24 23:33:57 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
3
+ 2025-04-24 23:33:57 - INFO - __main__ - Training parameters EfficientDistillationConfig(
4
+ _n_gpu=1,
5
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
6
+ adafactor=False,
7
+ adam_beta1=0.9,
8
+ adam_beta2=0.999,
9
+ adam_epsilon=1e-08,
10
+ auto_find_batch_size=False,
11
+ average_tokens_across_devices=False,
12
+ batch_eval_metrics=False,
13
+ benchmarks=[],
14
+ bf16=True,
15
+ bf16_full_eval=False,
16
+ callbacks=[],
17
+ chars_per_token=<CHARS_PER_TOKEN>,
18
+ chat_template=None,
19
+ data_seed=None,
20
+ dataloader_drop_last=False,
21
+ dataloader_num_workers=0,
22
+ dataloader_persistent_workers=False,
23
+ dataloader_pin_memory=True,
24
+ dataloader_prefetch_factor=None,
25
+ dataset_batch_size=None,
26
+ dataset_kwargs=None,
27
+ dataset_num_proc=None,
28
+ dataset_text_field=text,
29
+ ddp_backend=None,
30
+ ddp_broadcast_buffers=None,
31
+ ddp_bucket_cap_mb=None,
32
+ ddp_find_unused_parameters=None,
33
+ ddp_timeout=180000000,
34
+ debug=[],
35
+ deepspeed=None,
36
+ disable_dropout=True,
37
+ disable_tqdm=False,
38
+ dispatch_batches=None,
39
+ do_eval=True,
40
+ do_predict=False,
41
+ do_train=False,
42
+ eval_accumulation_steps=None,
43
+ eval_delay=0,
44
+ eval_do_concat_batches=True,
45
+ eval_on_start=False,
46
+ eval_packing=None,
47
+ eval_steps=None,
48
+ eval_strategy=IntervalStrategy.NO,
49
+ eval_use_gather_object=False,
50
+ evaluation_strategy=None,
51
+ fp16=False,
52
+ fp16_backend=auto,
53
+ fp16_full_eval=False,
54
+ fp16_opt_level=O1,
55
+ fsdp=[],
56
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
57
+ fsdp_min_num_params=0,
58
+ fsdp_transformer_layer_cls_to_wrap=None,
59
+ full_determinism=False,
60
+ gradient_accumulation_steps=4,
61
+ gradient_checkpointing=True,
62
+ gradient_checkpointing_kwargs={'use_reentrant': False},
63
+ greater_is_better=None,
64
+ group_by_length=False,
65
+ half_precision_backend=auto,
66
+ hub_always_push=False,
67
+ hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
68
+ hub_model_revision=main,
69
+ hub_private_repo=None,
70
+ hub_strategy=HubStrategy.EVERY_SAVE,
71
+ hub_token=<HUB_TOKEN>,
72
+ ignore_data_skip=False,
73
+ include_for_metrics=[],
74
+ include_inputs_for_metrics=False,
75
+ include_num_input_tokens_seen=False,
76
+ include_tokens_per_second=False,
77
+ jit_mode_eval=False,
78
+ label_names=None,
79
+ label_smoothing_factor=0.0,
80
+ learning_rate=5e-05,
81
+ length_column_name=length,
82
+ lmbda=0.0,
83
+ load_best_model_at_end=False,
84
+ local_rank=0,
85
+ log_level=info,
86
+ log_level_replica=warning,
87
+ log_on_each_node=True,
88
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr24_23-33-57_q-h100,
89
+ logging_first_step=False,
90
+ logging_nan_inf_filter=True,
91
+ logging_steps=1,
92
+ logging_strategy=IntervalStrategy.STEPS,
93
+ loss_type=forward_kl,
94
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
95
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
96
+ max_grad_norm=1.0,
97
+ max_length=2048,
98
+ max_new_tokens=128,
99
+ max_seq_length=None,
100
+ max_steps=-1,
101
+ metric_for_best_model=None,
102
+ model_init_kwargs=None,
103
+ mp_parameters=,
104
+ neftune_noise_alpha=None,
105
+ no_cuda=False,
106
+ num_of_sequences=None,
107
+ num_train_epochs=3,
108
+ optim=OptimizerNames.ADAMW_TORCH,
109
+ optim_args=None,
110
+ optim_target_modules=None,
111
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
112
+ overwrite_hub_revision=False,
113
+ overwrite_output_dir=True,
114
+ packing=False,
115
+ past_index=-1,
116
+ per_device_eval_batch_size=16,
117
+ per_device_train_batch_size=4,
118
+ prediction_loss_only=False,
119
+ push_to_hub=True,
120
+ push_to_hub_model_id=None,
121
+ push_to_hub_organization=None,
122
+ push_to_hub_revision=False,
123
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
124
+ ray_scope=last,
125
+ reduction=sum,
126
+ remove_unused_columns=True,
127
+ report_to=['wandb'],
128
+ restore_callback_states_from_checkpoint=False,
129
+ resume_from_checkpoint=None,
130
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
131
+ save_on_each_node=False,
132
+ save_only_model=False,
133
+ save_safetensors=True,
134
+ save_steps=200,
135
+ save_strategy=SaveStrategy.STEPS,
136
+ save_total_limit=1,
137
+ seed=42,
138
+ skip_memory_metrics=True,
139
+ split_batches=None,
140
+ system_prompt=None,
141
+ teacher_model_init_kwargs=None,
142
+ teacher_model_name_or_path=None,
143
+ temperature=0.9,
144
+ tf32=None,
145
+ torch_compile=False,
146
+ torch_compile_backend=None,
147
+ torch_compile_mode=None,
148
+ torch_empty_cache_steps=None,
149
+ torchdynamo=None,
150
+ tpu_metrics_debug=False,
151
+ tpu_num_cores=None,
152
+ use_cpu=False,
153
+ use_ipex=False,
154
+ use_legacy_prediction_loop=False,
155
+ use_liger=False,
156
+ use_liger_kernel=False,
157
+ use_mps_device=False,
158
+ wandb_entity=None,
159
+ wandb_project=None,
160
+ warmup_ratio=0.1,
161
+ warmup_steps=0,
162
+ weight_decay=0.0,
163
+ )
164
+ 2025-04-24 23:34:00 - INFO - __main__ - *** Initializing model kwargs ***
165
+ 2025-04-24 23:34:00 - INFO - __main__ - Model memory in step 1, before model initialization (0):Memory allocated: 0.0
166
+ Memory reserved: 0.0
167
+ 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 1, after model initialization:Memory allocated: 8743.642578125
168
+ Memory reserved: 9596.0
169
+ 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 2, before data collator initialization:Memory allocated: 8743.642578125
170
+ Memory reserved: 8782.0
171
+ 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 2, after data collator initialization:Memory allocated: 8743.642578125
172
+ Memory reserved: 8782.0
173
+ 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 4, before trainer initialization:Memory allocated: 8743.642578125
174
+ Memory reserved: 8782.0
175
+ 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 4, after trainer initialization:Memory allocated: 8743.64306640625
176
+ Memory reserved: 8782.0
177
+ 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 5, before prediction:Memory allocated: 8743.64306640625
178
+ Memory reserved: 8782.0
179
+ 2025-04-24 23:34:19 - INFO - __main__ - Running prediction on test subset to record expert activations...
180
+ 2025-04-24 23:34:27 - INFO - __main__ - Model memory in step 5, after prediction:Memory allocated: 7696.69775390625
181
+ Memory reserved: 15308.0
182
+ 2025-04-24 23:34:27 - INFO - __main__ - Top k experts selected: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]}
183
+ 2025-04-24 23:34:27 - INFO - __main__ - Top k experts saved to: data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json
184
+ 2025-04-24 23:34:27 - INFO - __main__ - Model memory before cleanup:Memory allocated: 7696.69775390625
185
+ Memory reserved: 15308.0
186
+ 2025-04-24 23:34:30 - INFO - __main__ - Model memory after cleanup:Memory allocated: 7695.6298828125
187
+ Memory reserved: 7814.0
188
+ 2025-04-24 23:34:30 - INFO - __main__ - Expert selection completed successfully. Run part 2 for training.
189
+ 2025-04-28 20:35:37 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
190
+ 2025-04-28 20:35:37 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
191
+ 2025-04-28 20:35:37 - INFO - __main__ - Training parameters EfficientDistillationConfig(
192
+ _n_gpu=1,
193
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
194
+ adafactor=False,
195
+ adam_beta1=0.9,
196
+ adam_beta2=0.999,
197
+ adam_epsilon=1e-08,
198
+ auto_find_batch_size=False,
199
+ average_tokens_across_devices=False,
200
+ batch_eval_metrics=False,
201
+ benchmarks=[],
202
+ bf16=True,
203
+ bf16_full_eval=False,
204
+ callbacks=[],
205
+ chars_per_token=<CHARS_PER_TOKEN>,
206
+ chat_template=None,
207
+ data_seed=None,
208
+ dataloader_drop_last=False,
209
+ dataloader_num_workers=0,
210
+ dataloader_persistent_workers=False,
211
+ dataloader_pin_memory=True,
212
+ dataloader_prefetch_factor=None,
213
+ dataset_batch_size=None,
214
+ dataset_kwargs=None,
215
+ dataset_num_proc=None,
216
+ dataset_text_field=text,
217
+ ddp_backend=None,
218
+ ddp_broadcast_buffers=None,
219
+ ddp_bucket_cap_mb=None,
220
+ ddp_find_unused_parameters=None,
221
+ ddp_timeout=1800000000,
222
+ debug=[],
223
+ deepspeed=None,
224
+ disable_dropout=True,
225
+ disable_tqdm=False,
226
+ dispatch_batches=None,
227
+ do_eval=True,
228
+ do_predict=False,
229
+ do_train=False,
230
+ eval_accumulation_steps=None,
231
+ eval_delay=0,
232
+ eval_do_concat_batches=True,
233
+ eval_on_start=False,
234
+ eval_packing=None,
235
+ eval_steps=None,
236
+ eval_strategy=IntervalStrategy.NO,
237
+ eval_use_gather_object=False,
238
+ evaluation_strategy=None,
239
+ fp16=False,
240
+ fp16_backend=auto,
241
+ fp16_full_eval=False,
242
+ fp16_opt_level=O1,
243
+ fsdp=[],
244
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
245
+ fsdp_min_num_params=0,
246
+ fsdp_transformer_layer_cls_to_wrap=None,
247
+ full_determinism=False,
248
+ gradient_accumulation_steps=4,
249
+ gradient_checkpointing=False,
250
+ gradient_checkpointing_kwargs={'use_reentrant': False},
251
+ greater_is_better=None,
252
+ group_by_length=False,
253
+ half_precision_backend=auto,
254
+ hub_always_push=False,
255
+ hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
256
+ hub_model_revision=main,
257
+ hub_private_repo=None,
258
+ hub_strategy=HubStrategy.EVERY_SAVE,
259
+ hub_token=<HUB_TOKEN>,
260
+ ignore_data_skip=False,
261
+ include_for_metrics=[],
262
+ include_inputs_for_metrics=False,
263
+ include_num_input_tokens_seen=False,
264
+ include_tokens_per_second=False,
265
+ jit_mode_eval=False,
266
+ label_names=None,
267
+ label_smoothing_factor=0.0,
268
+ learning_rate=5e-05,
269
+ length_column_name=length,
270
+ lmbda=0.0,
271
+ load_best_model_at_end=False,
272
+ local_rank=0,
273
+ log_level=info,
274
+ log_level_replica=warning,
275
+ log_on_each_node=True,
276
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-35-36_q-h100,
277
+ logging_first_step=False,
278
+ logging_nan_inf_filter=True,
279
+ logging_steps=1,
280
+ logging_strategy=IntervalStrategy.STEPS,
281
+ loss_type=forward_kl,
282
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
283
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
284
+ max_grad_norm=1.0,
285
+ max_length=4096,
286
+ max_new_tokens=1024,
287
+ max_seq_length=None,
288
+ max_steps=-1,
289
+ metric_for_best_model=None,
290
+ model_init_kwargs=None,
291
+ mp_parameters=,
292
+ neftune_noise_alpha=None,
293
+ no_cuda=False,
294
+ num_of_sequences=None,
295
+ num_train_epochs=3,
296
+ optim=OptimizerNames.ADAMW_TORCH,
297
+ optim_args=None,
298
+ optim_target_modules=None,
299
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
300
+ overwrite_hub_revision=False,
301
+ overwrite_output_dir=True,
302
+ packing=False,
303
+ past_index=-1,
304
+ per_device_eval_batch_size=16,
305
+ per_device_train_batch_size=4,
306
+ prediction_loss_only=False,
307
+ push_to_hub=True,
308
+ push_to_hub_model_id=None,
309
+ push_to_hub_organization=None,
310
+ push_to_hub_revision=False,
311
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
312
+ ray_scope=last,
313
+ reduction=sum,
314
+ remove_unused_columns=True,
315
+ report_to=['wandb'],
316
+ restore_callback_states_from_checkpoint=False,
317
+ resume_from_checkpoint=None,
318
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
319
+ save_on_each_node=False,
320
+ save_only_model=False,
321
+ save_safetensors=True,
322
+ save_steps=200,
323
+ save_strategy=SaveStrategy.STEPS,
324
+ save_total_limit=1,
325
+ seed=42,
326
+ skip_memory_metrics=True,
327
+ split_batches=None,
328
+ system_prompt=None,
329
+ teacher_model_init_kwargs=None,
330
+ teacher_model_name_or_path=None,
331
+ temperature=0.9,
332
+ tf32=None,
333
+ torch_compile=False,
334
+ torch_compile_backend=None,
335
+ torch_compile_mode=None,
336
+ torch_empty_cache_steps=None,
337
+ torchdynamo=None,
338
+ tpu_metrics_debug=False,
339
+ tpu_num_cores=None,
340
+ use_cpu=False,
341
+ use_ipex=False,
342
+ use_legacy_prediction_loop=False,
343
+ use_liger=False,
344
+ use_liger_kernel=False,
345
+ use_mps_device=False,
346
+ wandb_entity=None,
347
+ wandb_project=None,
348
+ warmup_ratio=0.1,
349
+ warmup_steps=0,
350
+ weight_decay=0.0,
351
+ )
352
+ 2025-04-28 20:35:39 - INFO - __main__ - *** Initializing model kwargs ***
353
+ 2025-04-28 20:35:39 - INFO - __main__ - Model memory in step 1, before model initialization (0):Memory allocated: 0.0
354
+ Memory reserved: 0.0
355
+ 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 1, after model initialization:Memory allocated: 0.0
356
+ Memory reserved: 0.0
357
+ 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 2, before data collator initialization:Memory allocated: 0.0
358
+ Memory reserved: 0.0
359
+ 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 2, after data collator initialization:Memory allocated: 0.0
360
+ Memory reserved: 0.0
361
+ 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 4, before trainer initialization:Memory allocated: 0.0
362
+ Memory reserved: 0.0
363
+ 2025-04-28 20:35:52 - INFO - __main__ - Model memory in step 4, after trainer initialization:Memory allocated: 0.00048828125
364
+ Memory reserved: 2.0
365
+ 2025-04-28 20:35:52 - INFO - __main__ - Model memory in step 5, before prediction:Memory allocated: 0.00048828125
366
+ Memory reserved: 2.0
367
+ 2025-04-28 20:35:52 - INFO - __main__ - Running prediction on test subset to record expert activations...
368
+ 2025-04-28 20:38:13 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
369
+ 2025-04-28 20:38:13 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
370
+ 2025-04-28 20:38:13 - INFO - __main__ - Training parameters EfficientDistillationConfig(
371
+ _n_gpu=1,
372
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
373
+ adafactor=False,
374
+ adam_beta1=0.9,
375
+ adam_beta2=0.999,
376
+ adam_epsilon=1e-08,
377
+ auto_find_batch_size=False,
378
+ average_tokens_across_devices=False,
379
+ batch_eval_metrics=False,
380
+ benchmarks=[],
381
+ bf16=True,
382
+ bf16_full_eval=False,
383
+ callbacks=[],
384
+ chars_per_token=<CHARS_PER_TOKEN>,
385
+ chat_template=None,
386
+ data_seed=None,
387
+ dataloader_drop_last=False,
388
+ dataloader_num_workers=0,
389
+ dataloader_persistent_workers=False,
390
+ dataloader_pin_memory=True,
391
+ dataloader_prefetch_factor=None,
392
+ dataset_batch_size=None,
393
+ dataset_kwargs=None,
394
+ dataset_num_proc=None,
395
+ dataset_text_field=text,
396
+ ddp_backend=None,
397
+ ddp_broadcast_buffers=None,
398
+ ddp_bucket_cap_mb=None,
399
+ ddp_find_unused_parameters=None,
400
+ ddp_timeout=1800000000,
401
+ debug=[],
402
+ deepspeed=None,
403
+ disable_dropout=True,
404
+ disable_tqdm=False,
405
+ dispatch_batches=None,
406
+ do_eval=True,
407
+ do_predict=False,
408
+ do_train=False,
409
+ eval_accumulation_steps=None,
410
+ eval_delay=0,
411
+ eval_do_concat_batches=True,
412
+ eval_on_start=False,
413
+ eval_packing=None,
414
+ eval_steps=None,
415
+ eval_strategy=IntervalStrategy.NO,
416
+ eval_use_gather_object=False,
417
+ evaluation_strategy=None,
418
+ fp16=False,
419
+ fp16_backend=auto,
420
+ fp16_full_eval=False,
421
+ fp16_opt_level=O1,
422
+ fsdp=[],
423
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
424
+ fsdp_min_num_params=0,
425
+ fsdp_transformer_layer_cls_to_wrap=None,
426
+ full_determinism=False,
427
+ gradient_accumulation_steps=4,
428
+ gradient_checkpointing=False,
429
+ gradient_checkpointing_kwargs={'use_reentrant': False},
430
+ greater_is_better=None,
431
+ group_by_length=False,
432
+ half_precision_backend=auto,
433
+ hub_always_push=False,
434
+ hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
435
+ hub_model_revision=main,
436
+ hub_private_repo=None,
437
+ hub_strategy=HubStrategy.EVERY_SAVE,
438
+ hub_token=<HUB_TOKEN>,
439
+ ignore_data_skip=False,
440
+ include_for_metrics=[],
441
+ include_inputs_for_metrics=False,
442
+ include_num_input_tokens_seen=False,
443
+ include_tokens_per_second=False,
444
+ jit_mode_eval=False,
445
+ label_names=None,
446
+ label_smoothing_factor=0.0,
447
+ learning_rate=5e-05,
448
+ length_column_name=length,
449
+ lmbda=0.0,
450
+ load_best_model_at_end=False,
451
+ local_rank=0,
452
+ log_level=info,
453
+ log_level_replica=warning,
454
+ log_on_each_node=True,
455
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-38-13_q-h100,
456
+ logging_first_step=False,
457
+ logging_nan_inf_filter=True,
458
+ logging_steps=1,
459
+ logging_strategy=IntervalStrategy.STEPS,
460
+ loss_type=forward_kl,
461
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
462
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
463
+ max_grad_norm=1.0,
464
+ max_length=4096,
465
+ max_new_tokens=1024,
466
+ max_seq_length=None,
467
+ max_steps=-1,
468
+ metric_for_best_model=None,
469
+ model_init_kwargs=None,
470
+ mp_parameters=,
471
+ neftune_noise_alpha=None,
472
+ no_cuda=False,
473
+ num_of_sequences=None,
474
+ num_train_epochs=3,
475
+ optim=OptimizerNames.ADAMW_TORCH,
476
+ optim_args=None,
477
+ optim_target_modules=None,
478
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
479
+ overwrite_hub_revision=False,
480
+ overwrite_output_dir=True,
481
+ packing=False,
482
+ past_index=-1,
483
+ per_device_eval_batch_size=16,
484
+ per_device_train_batch_size=4,
485
+ prediction_loss_only=False,
486
+ push_to_hub=True,
487
+ push_to_hub_model_id=None,
488
+ push_to_hub_organization=None,
489
+ push_to_hub_revision=False,
490
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
491
+ ray_scope=last,
492
+ reduction=sum,
493
+ remove_unused_columns=True,
494
+ report_to=['wandb'],
495
+ restore_callback_states_from_checkpoint=False,
496
+ resume_from_checkpoint=None,
497
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
498
+ save_on_each_node=False,
499
+ save_only_model=False,
500
+ save_safetensors=True,
501
+ save_steps=200,
502
+ save_strategy=SaveStrategy.STEPS,
503
+ save_total_limit=1,
504
+ seed=42,
505
+ skip_memory_metrics=True,
506
+ split_batches=None,
507
+ system_prompt=None,
508
+ teacher_model_init_kwargs=None,
509
+ teacher_model_name_or_path=None,
510
+ temperature=0.9,
511
+ tf32=None,
512
+ torch_compile=False,
513
+ torch_compile_backend=None,
514
+ torch_compile_mode=None,
515
+ torch_empty_cache_steps=None,
516
+ torchdynamo=None,
517
+ tpu_metrics_debug=False,
518
+ tpu_num_cores=None,
519
+ use_cpu=False,
520
+ use_ipex=False,
521
+ use_legacy_prediction_loop=False,
522
+ use_liger=False,
523
+ use_liger_kernel=False,
524
+ use_mps_device=False,
525
+ wandb_entity=None,
526
+ wandb_project=None,
527
+ warmup_ratio=0.1,
528
+ warmup_steps=0,
529
+ weight_decay=0.0,
530
+ )
531
+ 2025-04-28 20:38:15 - INFO - __main__ - *** Initializing model kwargs ***
532
+ 2025-04-28 20:38:15 - INFO - __main__ - Model memory in step 1, before model initialization (0):Memory allocated: 0.0
533
+ Memory reserved: 0.0
534
+ 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 1, after model initialization:Memory allocated: 6091.439453125
535
+ Memory reserved: 7478.0
536
+ 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 2, before data collator initialization:Memory allocated: 6091.439453125
537
+ Memory reserved: 6556.0
538
+ 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 2, after data collator initialization:Memory allocated: 6091.439453125
539
+ Memory reserved: 6556.0
540
+ 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 4, before trainer initialization:Memory allocated: 6091.439453125
541
+ Memory reserved: 6556.0
542
+ 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 4, after trainer initialization:Memory allocated: 6091.43994140625
543
+ Memory reserved: 6556.0
544
+ 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 5, before prediction:Memory allocated: 6091.43994140625
545
+ Memory reserved: 6556.0
546
+ 2025-04-28 20:38:39 - INFO - __main__ - Running prediction on test subset to record expert activations...
547
+ 2025-04-28 20:38:47 - INFO - __main__ - Model memory in step 5, after prediction:Memory allocated: 5044.49462890625
548
+ Memory reserved: 15056.0
549
+ 2025-04-28 20:38:47 - INFO - __main__ - Top k experts selected: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]}
550
+ 2025-04-28 20:38:47 - INFO - __main__ - Top k experts saved to: data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json
551
+ 2025-04-28 20:38:47 - INFO - __main__ - Model memory before cleanup:Memory allocated: 5044.49462890625
552
+ Memory reserved: 15056.0
553
+ 2025-04-28 20:38:50 - INFO - __main__ - Model memory after cleanup:Memory allocated: 5043.4267578125
554
+ Memory reserved: 5948.0
555
+ 2025-04-28 20:38:50 - INFO - __main__ - Expert selection completed successfully. Run part 2 for training.
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6708ea601efb4f587b288726e2d5a65eb4a7a4c404bf4e0e06af1f7d213b42aa
3
+ size 4902968072
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2824f10339f6804c027eb8c08240cd840beb37ac66ebf73cd53f8907acbaf324
3
+ size 419430528
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer.json CHANGED
@@ -1,6 +1,11 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
 
 
 
 
 
4
  "padding": null,
5
  "added_tokens": [
6
  {
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 7000,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
  "padding": null,
10
  "added_tokens": [
11
  {
tokenizer_config.json CHANGED
@@ -153,6 +153,7 @@
153
  "clean_up_tokenization_spaces": false,
154
  "eos_token": "<|end▁of▁sentence|>",
155
  "extra_special_tokens": {},
 
156
  "legacy": true,
157
  "model_max_length": 16384,
158
  "pad_token": "<|end▁of▁sentence|>",
 
153
  "clean_up_tokenization_spaces": false,
154
  "eos_token": "<|end▁of▁sentence|>",
155
  "extra_special_tokens": {},
156
+ "fast_tokenizer": true,
157
  "legacy": true,
158
  "model_max_length": 16384,
159
  "pad_token": "<|end▁of▁sentence|>",
top_6_experts.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model.layers.1.mlp": [45, 51, 44, 61, 22, 14], "model.layers.2.mlp": [25, 18, 27, 13, 23, 3], "model.layers.3.mlp": [54, 28, 25, 41, 23, 57], "model.layers.4.mlp": [11, 21, 49, 33, 14, 37], "model.layers.5.mlp": [35, 54, 20, 9, 47, 52], "model.layers.6.mlp": [45, 22, 1, 42, 47, 13], "model.layers.7.mlp": [58, 24, 43, 62, 18, 44], "model.layers.8.mlp": [47, 39, 54, 58, 30, 56], "model.layers.9.mlp": [31, 22, 32, 13, 12, 24], "model.layers.10.mlp": [22, 47, 42, 19, 2, 13], "model.layers.11.mlp": [11, 17, 29, 10, 22, 59], "model.layers.12.mlp": [4, 3, 59, 56, 5, 26], "model.layers.13.mlp": [17, 10, 47, 14, 42, 58], "model.layers.14.mlp": [51, 7, 27, 31, 61, 18], "model.layers.15.mlp": [24, 14, 17, 55, 41, 5], "model.layers.16.mlp": [61, 33, 19, 49, 9, 63], "model.layers.17.mlp": [32, 29, 26, 43, 0, 27], "model.layers.18.mlp": [56, 5, 2, 36, 1, 42], "model.layers.19.mlp": [24, 36, 40, 0, 23, 2], "model.layers.20.mlp": [1, 56, 38, 48, 58, 20], "model.layers.21.mlp": [19, 5, 28, 15, 13, 10], "model.layers.22.mlp": [32, 14, 58, 31, 3, 45], "model.layers.23.mlp": [20, 58, 0, 42, 33, 45], "model.layers.24.mlp": [7, 63, 47, 42, 10, 62], "model.layers.25.mlp": [45, 39, 46, 11, 38, 48], "model.layers.26.mlp": [6, 46, 49, 13, 57, 11]}
top_k_experts.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model.layers.1.mlp": [45, 51, 44], "model.layers.2.mlp": [25, 18, 27], "model.layers.3.mlp": [54, 28, 25], "model.layers.4.mlp": [11, 21, 49], "model.layers.5.mlp": [35, 54, 20], "model.layers.6.mlp": [45, 22, 1], "model.layers.7.mlp": [58, 24, 43], "model.layers.8.mlp": [47, 39, 54], "model.layers.9.mlp": [31, 22, 32], "model.layers.10.mlp": [22, 47, 42], "model.layers.11.mlp": [11, 17, 29], "model.layers.12.mlp": [4, 3, 59], "model.layers.13.mlp": [17, 10, 47], "model.layers.14.mlp": [51, 7, 27], "model.layers.15.mlp": [24, 14, 17], "model.layers.16.mlp": [61, 33, 19], "model.layers.17.mlp": [32, 29, 26], "model.layers.18.mlp": [56, 5, 2], "model.layers.19.mlp": [24, 36, 40], "model.layers.20.mlp": [1, 56, 38], "model.layers.21.mlp": [19, 5, 28], "model.layers.22.mlp": [32, 14, 58], "model.layers.23.mlp": [20, 58, 0], "model.layers.24.mlp": [7, 63, 47], "model.layers.25.mlp": [45, 39, 46], "model.layers.26.mlp": [6, 46, 49]}
training.log CHANGED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:784c8ddede97b768ac7df72d26cc44295ab6c84ab394fde376d273efc9af5e4c
3
- size 7544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f1ee949d3c02c1e467580bf3863307c1318d6d8f14e61d099b4d6190e4e75be
3
+ size 7864
training_distill.log ADDED
The diff for this file is too large to render. See raw diff