HectorHe committed
Commit 4b9f571 · verified · 1 Parent(s): 3d045c1

Training in progress, step 100
expert_selection.log CHANGED
@@ -1,194 +1,6 @@
- 2025-04-24 23:33:57 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
- 2025-04-24 23:33:57 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
- 2025-04-24 23:33:57 - INFO - __main__ - Training parameters EfficientDistillationConfig(
- _n_gpu=1,
- accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
- adafactor=False,
- adam_beta1=0.9,
- adam_beta2=0.999,
- adam_epsilon=1e-08,
- auto_find_batch_size=False,
- average_tokens_across_devices=False,
- batch_eval_metrics=False,
- benchmarks=[],
- bf16=True,
- bf16_full_eval=False,
- callbacks=[],
- chars_per_token=<CHARS_PER_TOKEN>,
- chat_template=None,
- data_seed=None,
- dataloader_drop_last=False,
- dataloader_num_workers=0,
- dataloader_persistent_workers=False,
- dataloader_pin_memory=True,
- dataloader_prefetch_factor=None,
- dataset_batch_size=None,
- dataset_kwargs=None,
- dataset_num_proc=None,
- dataset_text_field=text,
- ddp_backend=None,
- ddp_broadcast_buffers=None,
- ddp_bucket_cap_mb=None,
- ddp_find_unused_parameters=None,
- ddp_timeout=180000000,
- debug=[],
- deepspeed=None,
- disable_dropout=True,
- disable_tqdm=False,
- dispatch_batches=None,
- do_eval=True,
- do_predict=False,
- do_train=False,
- eval_accumulation_steps=None,
- eval_delay=0,
- eval_do_concat_batches=True,
- eval_on_start=False,
- eval_packing=None,
- eval_steps=None,
- eval_strategy=IntervalStrategy.NO,
- eval_use_gather_object=False,
- evaluation_strategy=None,
- fp16=False,
- fp16_backend=auto,
- fp16_full_eval=False,
- fp16_opt_level=O1,
- fsdp=[],
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
- fsdp_min_num_params=0,
- fsdp_transformer_layer_cls_to_wrap=None,
- full_determinism=False,
- gradient_accumulation_steps=4,
- gradient_checkpointing=True,
- gradient_checkpointing_kwargs={'use_reentrant': False},
- greater_is_better=None,
- group_by_length=False,
- half_precision_backend=auto,
- hub_always_push=False,
- hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
- hub_model_revision=main,
- hub_private_repo=None,
- hub_strategy=HubStrategy.EVERY_SAVE,
- hub_token=<HUB_TOKEN>,
- ignore_data_skip=False,
- include_for_metrics=[],
- include_inputs_for_metrics=False,
- include_num_input_tokens_seen=False,
- include_tokens_per_second=False,
- jit_mode_eval=False,
- label_names=None,
- label_smoothing_factor=0.0,
- learning_rate=5e-05,
- length_column_name=length,
- lmbda=0.0,
- load_best_model_at_end=False,
- local_rank=0,
- log_level=info,
- log_level_replica=warning,
- log_on_each_node=True,
- logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr24_23-33-57_q-h100,
- logging_first_step=False,
- logging_nan_inf_filter=True,
- logging_steps=1,
- logging_strategy=IntervalStrategy.STEPS,
- loss_type=forward_kl,
- lr_scheduler_kwargs={'min_lr_rate': 0.1},
- lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
- max_grad_norm=1.0,
- max_length=2048,
- max_new_tokens=128,
- max_seq_length=None,
- max_steps=-1,
- metric_for_best_model=None,
- model_init_kwargs=None,
- mp_parameters=,
- neftune_noise_alpha=None,
- no_cuda=False,
- num_of_sequences=None,
- num_train_epochs=3,
- optim=OptimizerNames.ADAMW_TORCH,
- optim_args=None,
- optim_target_modules=None,
- output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
- overwrite_hub_revision=False,
- overwrite_output_dir=True,
- packing=False,
- past_index=-1,
- per_device_eval_batch_size=16,
- per_device_train_batch_size=4,
- prediction_loss_only=False,
- push_to_hub=True,
- push_to_hub_model_id=None,
- push_to_hub_organization=None,
- push_to_hub_revision=False,
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
- ray_scope=last,
- reduction=sum,
- remove_unused_columns=True,
- report_to=['wandb'],
- restore_callback_states_from_checkpoint=False,
- resume_from_checkpoint=None,
- run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
- save_on_each_node=False,
- save_only_model=False,
- save_safetensors=True,
- save_steps=200,
- save_strategy=SaveStrategy.STEPS,
- save_total_limit=1,
- seed=42,
- skip_memory_metrics=True,
- split_batches=None,
- system_prompt=None,
- teacher_model_init_kwargs=None,
- teacher_model_name_or_path=None,
- temperature=0.9,
- tf32=None,
- torch_compile=False,
- torch_compile_backend=None,
- torch_compile_mode=None,
- torch_empty_cache_steps=None,
- torchdynamo=None,
- tpu_metrics_debug=False,
- tpu_num_cores=None,
- use_cpu=False,
- use_ipex=False,
- use_legacy_prediction_loop=False,
- use_liger=False,
- use_liger_kernel=False,
- use_mps_device=False,
- wandb_entity=None,
- wandb_project=None,
- warmup_ratio=0.1,
- warmup_steps=0,
- weight_decay=0.0,
- )
- 2025-04-24 23:34:00 - INFO - __main__ - *** Initializing model kwargs ***
- 2025-04-24 23:34:00 - INFO - __main__ - Model memory in step 1, before model initialization (0):Memory allocated: 0.0
- Memory reserved: 0.0
- 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 1, after model initialization:Memory allocated: 8743.642578125
- Memory reserved: 9596.0
- 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 2, before data collator initialization:Memory allocated: 8743.642578125
- Memory reserved: 8782.0
- 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 2, after data collator initialization:Memory allocated: 8743.642578125
- Memory reserved: 8782.0
- 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 4, before trainer initialization:Memory allocated: 8743.642578125
- Memory reserved: 8782.0
- 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 4, after trainer initialization:Memory allocated: 8743.64306640625
- Memory reserved: 8782.0
- 2025-04-24 23:34:19 - INFO - __main__ - Model memory in step 5, before prediction:Memory allocated: 8743.64306640625
- Memory reserved: 8782.0
- 2025-04-24 23:34:19 - INFO - __main__ - Running prediction on test subset to record expert activations...
- 2025-04-24 23:34:27 - INFO - __main__ - Model memory in step 5, after prediction:Memory allocated: 7696.69775390625
- Memory reserved: 15308.0
- 2025-04-24 23:34:27 - INFO - __main__ - Top k experts selected: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]}
- 2025-04-24 23:34:27 - INFO - __main__ - Top k experts saved to: data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json
- 2025-04-24 23:34:27 - INFO - __main__ - Model memory before cleanup:Memory allocated: 7696.69775390625
- Memory reserved: 15308.0
- 2025-04-24 23:34:30 - INFO - __main__ - Model memory after cleanup:Memory allocated: 7695.6298828125
- Memory reserved: 7814.0
- 2025-04-24 23:34:30 - INFO - __main__ - Expert selection completed successfully. Run part 2 for training.
- 2025-04-28 20:35:37 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
- 2025-04-28 20:35:37 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
- 2025-04-28 20:35:37 - INFO - __main__ - Training parameters EfficientDistillationConfig(
+ 2025-05-02 01:30:45 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
+ 2025-05-02 01:30:45 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='lmms-lab/Math10K', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
+ 2025-05-02 01:30:45 - INFO - __main__ - Training parameters EfficientDistillationConfig(
  _n_gpu=1,
  accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
  adafactor=False,
@@ -245,7 +57,7 @@ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_
  fsdp_min_num_params=0,
  fsdp_transformer_layer_cls_to_wrap=None,
  full_determinism=False,
- gradient_accumulation_steps=4,
+ gradient_accumulation_steps=1,
  gradient_checkpointing=False,
  gradient_checkpointing_kwargs={'use_reentrant': False},
  greater_is_better=None,
@@ -273,7 +85,7 @@ local_rank=0,
  log_level=info,
  log_level_replica=warning,
  log_on_each_node=True,
- logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-35-36_q-h100,
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K/runs/May02_01-30-44_q-h100,
  logging_first_step=False,
  logging_nan_inf_filter=True,
  logging_steps=1,
@@ -282,186 +94,7 @@ loss_type=forward_kl,
  lr_scheduler_kwargs={'min_lr_rate': 0.1},
  lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
  max_grad_norm=1.0,
- max_length=4096,
- max_new_tokens=1024,
- max_seq_length=None,
- max_steps=-1,
- metric_for_best_model=None,
- model_init_kwargs=None,
- mp_parameters=,
- neftune_noise_alpha=None,
- no_cuda=False,
- num_of_sequences=None,
- num_train_epochs=3,
- optim=OptimizerNames.ADAMW_TORCH,
- optim_args=None,
- optim_target_modules=None,
- output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
- overwrite_hub_revision=False,
- overwrite_output_dir=True,
- packing=False,
- past_index=-1,
- per_device_eval_batch_size=16,
- per_device_train_batch_size=4,
- prediction_loss_only=False,
- push_to_hub=True,
- push_to_hub_model_id=None,
- push_to_hub_organization=None,
- push_to_hub_revision=False,
- push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
- ray_scope=last,
- reduction=sum,
- remove_unused_columns=True,
- report_to=['wandb'],
- restore_callback_states_from_checkpoint=False,
- resume_from_checkpoint=None,
- run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
- save_on_each_node=False,
- save_only_model=False,
- save_safetensors=True,
- save_steps=200,
- save_strategy=SaveStrategy.STEPS,
- save_total_limit=1,
- seed=42,
- skip_memory_metrics=True,
- split_batches=None,
- system_prompt=None,
- teacher_model_init_kwargs=None,
- teacher_model_name_or_path=None,
- temperature=0.9,
- tf32=None,
- torch_compile=False,
- torch_compile_backend=None,
- torch_compile_mode=None,
- torch_empty_cache_steps=None,
- torchdynamo=None,
- tpu_metrics_debug=False,
- tpu_num_cores=None,
- use_cpu=False,
- use_ipex=False,
- use_legacy_prediction_loop=False,
- use_liger=False,
- use_liger_kernel=False,
- use_mps_device=False,
- wandb_entity=None,
- wandb_project=None,
- warmup_ratio=0.1,
- warmup_steps=0,
- weight_decay=0.0,
- )
- 2025-04-28 20:35:39 - INFO - __main__ - *** Initializing model kwargs ***
- 2025-04-28 20:35:39 - INFO - __main__ - Model memory in step 1, before model initialization (0):Memory allocated: 0.0
- Memory reserved: 0.0
- 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 1, after model initialization:Memory allocated: 0.0
- Memory reserved: 0.0
- 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 2, before data collator initialization:Memory allocated: 0.0
- Memory reserved: 0.0
- 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 2, after data collator initialization:Memory allocated: 0.0
- Memory reserved: 0.0
- 2025-04-28 20:35:49 - INFO - __main__ - Model memory in step 4, before trainer initialization:Memory allocated: 0.0
- Memory reserved: 0.0
- 2025-04-28 20:35:52 - INFO - __main__ - Model memory in step 4, after trainer initialization:Memory allocated: 0.00048828125
- Memory reserved: 2.0
- 2025-04-28 20:35:52 - INFO - __main__ - Model memory in step 5, before prediction:Memory allocated: 0.00048828125
- Memory reserved: 2.0
- 2025-04-28 20:35:52 - INFO - __main__ - Running prediction on test subset to record expert activations...
- 2025-04-28 20:38:13 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
- 2025-04-28 20:38:13 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
- 2025-04-28 20:38:13 - INFO - __main__ - Training parameters EfficientDistillationConfig(
- _n_gpu=1,
- accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
- adafactor=False,
- adam_beta1=0.9,
- adam_beta2=0.999,
- adam_epsilon=1e-08,
- auto_find_batch_size=False,
- average_tokens_across_devices=False,
- batch_eval_metrics=False,
- benchmarks=[],
- bf16=True,
- bf16_full_eval=False,
- callbacks=[],
- chars_per_token=<CHARS_PER_TOKEN>,
- chat_template=None,
- data_seed=None,
- dataloader_drop_last=False,
- dataloader_num_workers=0,
- dataloader_persistent_workers=False,
- dataloader_pin_memory=True,
- dataloader_prefetch_factor=None,
- dataset_batch_size=None,
- dataset_kwargs=None,
- dataset_num_proc=None,
- dataset_text_field=text,
- ddp_backend=None,
- ddp_broadcast_buffers=None,
- ddp_bucket_cap_mb=None,
- ddp_find_unused_parameters=None,
- ddp_timeout=1800000000,
- debug=[],
- deepspeed=None,
- disable_dropout=True,
- disable_tqdm=False,
- dispatch_batches=None,
- do_eval=True,
- do_predict=False,
- do_train=False,
- eval_accumulation_steps=None,
- eval_delay=0,
- eval_do_concat_batches=True,
- eval_on_start=False,
- eval_packing=None,
- eval_steps=None,
- eval_strategy=IntervalStrategy.NO,
- eval_use_gather_object=False,
- evaluation_strategy=None,
- fp16=False,
- fp16_backend=auto,
- fp16_full_eval=False,
- fp16_opt_level=O1,
- fsdp=[],
- fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
- fsdp_min_num_params=0,
- fsdp_transformer_layer_cls_to_wrap=None,
- full_determinism=False,
- gradient_accumulation_steps=4,
- gradient_checkpointing=False,
- gradient_checkpointing_kwargs={'use_reentrant': False},
- greater_is_better=None,
- group_by_length=False,
- half_precision_backend=auto,
- hub_always_push=False,
- hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
- hub_model_revision=main,
- hub_private_repo=None,
- hub_strategy=HubStrategy.EVERY_SAVE,
- hub_token=<HUB_TOKEN>,
- ignore_data_skip=False,
- include_for_metrics=[],
- include_inputs_for_metrics=False,
- include_num_input_tokens_seen=False,
- include_tokens_per_second=False,
- jit_mode_eval=False,
- label_names=None,
- label_smoothing_factor=0.0,
- learning_rate=5e-05,
- length_column_name=length,
- lmbda=0.0,
- load_best_model_at_end=False,
- local_rank=0,
- log_level=info,
- log_level_replica=warning,
- log_on_each_node=True,
- logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-38-13_q-h100,
- logging_first_step=False,
- logging_nan_inf_filter=True,
- logging_steps=1,
- logging_strategy=IntervalStrategy.STEPS,
- loss_type=forward_kl,
- lr_scheduler_kwargs={'min_lr_rate': 0.1},
- lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
- max_grad_norm=1.0,
- max_length=4096,
+ max_length=2048,
  max_new_tokens=1024,
  max_seq_length=None,
  max_steps=-1,
@@ -471,17 +104,17 @@ mp_parameters=,
  neftune_noise_alpha=None,
  no_cuda=False,
  num_of_sequences=None,
- num_train_epochs=3,
+ num_train_epochs=1,
  optim=OptimizerNames.ADAMW_TORCH,
  optim_args=None,
  optim_target_modules=None,
- output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K,
  overwrite_hub_revision=False,
  overwrite_output_dir=True,
  packing=False,
  past_index=-1,
  per_device_eval_batch_size=16,
- per_device_train_batch_size=4,
+ per_device_train_batch_size=16,
  prediction_loss_only=False,
  push_to_hub=True,
  push_to_hub_model_id=None,
@@ -494,14 +127,14 @@ remove_unused_columns=True,
  report_to=['wandb'],
  restore_callback_states_from_checkpoint=False,
  resume_from_checkpoint=None,
- run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill,
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K,
  save_on_each_node=False,
  save_only_model=False,
  save_safetensors=True,
- save_steps=200,
+ save_steps=100,
  save_strategy=SaveStrategy.STEPS,
  save_total_limit=1,
- seed=42,
+ seed=1234,
  skip_memory_metrics=True,
  split_batches=None,
  system_prompt=None,
@@ -528,28 +161,28 @@ warmup_ratio=0.1,
  warmup_steps=0,
  weight_decay=0.0,
  )
- 2025-04-28 20:38:15 - INFO - __main__ - *** Initializing model kwargs ***
- 2025-04-28 20:38:15 - INFO - __main__ - Model memory in step 1, before model initialization (0):Memory allocated: 0.0
+ 2025-05-02 01:30:48 - INFO - __main__ - *** Initializing model kwargs ***
+ 2025-05-02 01:30:48 - INFO - __main__ - Model memory in step 1, before model initialization (0):Memory allocated: 0.0
  Memory reserved: 0.0
- 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 1, after model initialization:Memory allocated: 6091.439453125
- Memory reserved: 7478.0
- 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 2, before data collator initialization:Memory allocated: 6091.439453125
- Memory reserved: 6556.0
- 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 2, after data collator initialization:Memory allocated: 6091.439453125
- Memory reserved: 6556.0
- 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 4, before trainer initialization:Memory allocated: 6091.439453125
- Memory reserved: 6556.0
- 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 4, after trainer initialization:Memory allocated: 6091.43994140625
- Memory reserved: 6556.0
- 2025-04-28 20:38:39 - INFO - __main__ - Model memory in step 5, before prediction:Memory allocated: 6091.43994140625
- Memory reserved: 6556.0
- 2025-04-28 20:38:39 - INFO - __main__ - Running prediction on test subset to record expert activations...
- 2025-04-28 20:38:47 - INFO - __main__ - Model memory in step 5, after prediction:Memory allocated: 5044.49462890625
- Memory reserved: 15056.0
- 2025-04-28 20:38:47 - INFO - __main__ - Top k experts selected: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]}
- 2025-04-28 20:38:47 - INFO - __main__ - Top k experts saved to: data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json
- 2025-04-28 20:38:47 - INFO - __main__ - Model memory before cleanup:Memory allocated: 5044.49462890625
- Memory reserved: 15056.0
- 2025-04-28 20:38:50 - INFO - __main__ - Model memory after cleanup:Memory allocated: 5043.4267578125
- Memory reserved: 5948.0
- 2025-04-28 20:38:50 - INFO - __main__ - Expert selection completed successfully. Run part 2 for training.
+ 2025-05-02 01:31:13 - INFO - __main__ - Model memory in step 1, after model initialization:Memory allocated: 4836.39697265625
+ Memory reserved: 7322.0
+ 2025-05-02 01:31:13 - INFO - __main__ - Model memory in step 2, before data collator initialization:Memory allocated: 4836.39697265625
+ Memory reserved: 6442.0
+ 2025-05-02 01:31:13 - INFO - __main__ - Model memory in step 2, after data collator initialization:Memory allocated: 4836.39697265625
+ Memory reserved: 6442.0
+ 2025-05-02 01:31:13 - INFO - __main__ - Model memory in step 4, before trainer initialization:Memory allocated: 4836.39697265625
+ Memory reserved: 6442.0
+ 2025-05-02 01:31:13 - INFO - __main__ - Model memory in step 4, after trainer initialization:Memory allocated: 4836.3974609375
+ Memory reserved: 6442.0
+ 2025-05-02 01:31:13 - INFO - __main__ - Model memory in step 5, before prediction:Memory allocated: 4836.3974609375
+ Memory reserved: 6442.0
+ 2025-05-02 01:31:13 - INFO - __main__ - Running prediction on test subset to record expert activations...
+ 2025-05-02 01:31:21 - INFO - __main__ - Model memory in step 5, after prediction:Memory allocated: 3789.373046875
+ Memory reserved: 16034.0
+ 2025-05-02 01:31:21 - INFO - __main__ - Top k experts selected: {'model.layers.1.mlp': [51, 61, 44, 45, 14, 22], 'model.layers.2.mlp': [27, 25, 18, 13, 3, 23], 'model.layers.3.mlp': [54, 25, 41, 23, 28, 57], 'model.layers.4.mlp': [37, 21, 33, 49, 11, 14], 'model.layers.5.mlp': [54, 47, 35, 20, 52, 9], 'model.layers.6.mlp': [22, 1, 13, 45, 42, 47], 'model.layers.7.mlp': [58, 43, 24, 18, 44, 62], 'model.layers.8.mlp': [47, 39, 56, 30, 54, 58], 'model.layers.9.mlp': [31, 13, 22, 24, 12, 32], 'model.layers.10.mlp': [47, 19, 42, 2, 13, 22], 'model.layers.11.mlp': [29, 11, 17, 10, 59, 22], 'model.layers.12.mlp': [5, 56, 3, 59, 4, 26], 'model.layers.13.mlp': [10, 42, 58, 14, 47, 17], 'model.layers.14.mlp': [51, 7, 27, 18, 31, 61], 'model.layers.15.mlp': [24, 55, 5, 17, 14, 41], 'model.layers.16.mlp': [61, 33, 63, 49, 19, 9], 'model.layers.17.mlp': [0, 26, 43, 32, 27, 29], 'model.layers.18.mlp': [5, 56, 42, 36, 2, 1], 'model.layers.19.mlp': [2, 23, 24, 36, 40, 0], 'model.layers.20.mlp': [1, 56, 38, 20, 48, 58], 'model.layers.21.mlp': [5, 13, 15, 28, 19, 10], 'model.layers.22.mlp': [58, 32, 31, 3, 45, 14], 'model.layers.23.mlp': [20, 0, 58, 45, 33, 42], 'model.layers.24.mlp': [62, 7, 42, 47, 10, 63], 'model.layers.25.mlp': [45, 48, 39, 11, 46, 38], 'model.layers.26.mlp': [46, 49, 6, 13, 11, 57]}
+ 2025-05-02 01:31:21 - INFO - __main__ - Top k experts saved to: data/DeepSeek-Coder-V2-Lite-Instruct/distill_math10K/top_6_experts_lmms-lab_Math10K.json
+ 2025-05-02 01:31:21 - INFO - __main__ - Model memory before cleanup:Memory allocated: 3789.373046875
+ Memory reserved: 16034.0
+ 2025-05-02 01:31:24 - INFO - __main__ - Model memory after cleanup:Memory allocated: 3788.38427734375
+ Memory reserved: 5794.0
+ 2025-05-02 01:31:24 - INFO - __main__ - Expert selection completed successfully. Run part 2 for training.
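Note: the log above describes a single prediction pass over the test split that records which routed experts fire in each MoE layer, then keeps the top-k per layer and writes them to JSON. A minimal sketch of that selection step, assuming a DeepSeek-V2-style router module at model.layers.N.mlp.gate with 64 routed experts; the function and variable names are illustrative, not this repo's actual code:

import json
from collections import defaultdict

import torch

def select_top_k_experts(model, dataloader, k=6, num_experts=64, out_path="top_k_experts.json"):
    """Count routed-expert hits per MoE layer during a prediction pass, keep the top-k."""
    counts = defaultdict(lambda: torch.zeros(num_experts, dtype=torch.long))
    hooks = []

    def make_hook(layer_name):
        def hook(module, inputs, output):
            # Assumes the gate returns the selected expert indices first;
            # the real DeepSeek-V2 MoEGate output layout may differ.
            topk_idx = output[0]
            ids, freq = topk_idx.reshape(-1).unique(return_counts=True)
            counts[layer_name][ids.cpu()] += freq.cpu()
        return hook

    for name, module in model.named_modules():
        if name.endswith(".mlp.gate"):  # per-layer MoE router
            hooks.append(module.register_forward_hook(make_hook(name.rsplit(".", 1)[0])))

    model.eval()
    with torch.no_grad():
        for batch in dataloader:  # batches assumed to already sit on the model's device
            model(**batch)
    for h in hooks:
        h.remove()

    top_k = {layer: c.topk(k).indices.tolist() for layer, c in counts.items()}
    with open(out_path, "w") as f:
        json.dump(top_k, f)
    return top_k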
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0e1c552e865528723d5b54b4cedc423a49d3d10eccf0bbd05e2f41fa8d66d70a
+ oid sha256:d62155bafa336c7e377773eda20d19306bf2d96c81f4bac036350db4808897fd
  size 4902968072
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:baa71965742a1ad79abdab5450c95ced97a599f508de4276ce0530ed47893c16
+ oid sha256:88c16a22d62088d0b882af5ee79fc4631e789190605d0fc52b54a7daf3b62952
  size 419430528
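Note: both .safetensors entries are Git LFS pointer files, so the diff only changes the sha256 oid and byte size, not the tensors themselves. A small sketch for checking a downloaded blob against such a pointer (the file paths are placeholders):

import hashlib

def verify_lfs_pointer(pointer_text: str, blob_path: str) -> bool:
    # Parse the "key value" lines of the LFS pointer (version, oid, size).
    fields = dict(line.split(" ", 1) for line in pointer_text.strip().splitlines())
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])
    # Hash in 1 MiB chunks so multi-GB weight shards never sit in memory at once.
    digest, size = hashlib.sha256(), 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
            size += len(chunk)
    return size == expected_size and digest.hexdigest() == expected_oid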
tokenizer.json CHANGED
@@ -2,7 +2,7 @@
  "version": "1.0",
  "truncation": {
  "direction": "Right",
- "max_length": 6000,
+ "max_length": 4096,
  "strategy": "LongestFirst",
  "stride": 0
  },
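Note: this change lowers the tokenizer's serialized truncation cap from 6000 to 4096 tokens while leaving direction, strategy, and stride unchanged. The same setting expressed through the `tokenizers` API (a sketch, assuming tokenizer.json sits in the working directory):

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
# Mirrors the diff: right-side truncation, LongestFirst strategy, stride 0.
tok.enable_truncation(max_length=4096, strategy="longest_first", direction="right")
tok.save("tokenizer.json")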
top_6_experts_lmms-lab_Math10K.json ADDED
@@ -0,0 +1 @@
+ {"model.layers.1.mlp": [51, 61, 44, 45, 14, 22], "model.layers.2.mlp": [27, 25, 18, 13, 3, 23], "model.layers.3.mlp": [54, 25, 41, 23, 28, 57], "model.layers.4.mlp": [37, 21, 33, 49, 11, 14], "model.layers.5.mlp": [54, 47, 35, 20, 52, 9], "model.layers.6.mlp": [22, 1, 13, 45, 42, 47], "model.layers.7.mlp": [58, 43, 24, 18, 44, 62], "model.layers.8.mlp": [47, 39, 56, 30, 54, 58], "model.layers.9.mlp": [31, 13, 22, 24, 12, 32], "model.layers.10.mlp": [47, 19, 42, 2, 13, 22], "model.layers.11.mlp": [29, 11, 17, 10, 59, 22], "model.layers.12.mlp": [5, 56, 3, 59, 4, 26], "model.layers.13.mlp": [10, 42, 58, 14, 47, 17], "model.layers.14.mlp": [51, 7, 27, 18, 31, 61], "model.layers.15.mlp": [24, 55, 5, 17, 14, 41], "model.layers.16.mlp": [61, 33, 63, 49, 19, 9], "model.layers.17.mlp": [0, 26, 43, 32, 27, 29], "model.layers.18.mlp": [5, 56, 42, 36, 2, 1], "model.layers.19.mlp": [2, 23, 24, 36, 40, 0], "model.layers.20.mlp": [1, 56, 38, 20, 48, 58], "model.layers.21.mlp": [5, 13, 15, 28, 19, 10], "model.layers.22.mlp": [58, 32, 31, 3, 45, 14], "model.layers.23.mlp": [20, 0, 58, 45, 33, 42], "model.layers.24.mlp": [62, 7, 42, 47, 10, 63], "model.layers.25.mlp": [45, 48, 39, 11, 46, 38], "model.layers.26.mlp": [46, 49, 6, 13, 11, 57]}
training.log CHANGED
The diff for this file is too large to render; see the raw diff.
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ced090b665c197f01c79a114114b4285eb2b605531936de5663e0dbc63c65dd6
- size 7928
+ oid sha256:704190d4216fcaaa319f4fdcbc24ff07ace3d70a8844da8e10f26947ad156c65
+ size 7864
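Note: training_args.bin is the TrainingArguments object that the transformers Trainer serializes with torch.save, which is why its hash and size change whenever the run configuration does. A quick way to inspect it (a sketch; weights_only=False is required because the file is a pickled dataclass, so only load files you trust):

import torch

args = torch.load("training_args.bin", weights_only=False)
# E.g. confirm the values logged above for the Math10K run.
print(args.output_dir, args.per_device_train_batch_size, args.save_steps, args.seed)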