diff --git "a/training.log" "b/training.log" --- "a/training.log" +++ "b/training.log" @@ -1,67 +1,1333 @@ - Downloading shards: 0%| | 0/4 [00:00 -[rank1]: main(script_args, training_args, model_args) -[rank1]: File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 188, in main -[rank1]: train_result = trainer.train(resume_from_checkpoint=checkpoint) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2232, in train -[rank1]: return inner_training_loop( -[rank1]: ^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop -[rank1]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3698, in training_step -[rank1]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 469, in compute_loss -[rank1]: (loss, outputs) = super().compute_loss( -[rank1]: ^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3759, in compute_loss -[rank1]: outputs = model(**inputs) -[rank1]: ^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl -[rank1]: return self._call_impl(*args, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl -[rank1]: return forward_call(*args, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn -[rank1]: ret_val = func(*args, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1899, in forward -[rank1]: loss = self.module(*inputs, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl -[rank1]: return self._call_impl(*args, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl -[rank1]: return inner() -[rank1]: ^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner -[rank1]: result = forward_call(*args, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/ocean/projects/cis240137p/hhe4/hf_cache/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1702, in forward -[rank1]: loss = loss_fct(shift_logits, shift_labels) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl -[rank1]: return self._call_impl(*args, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl -[rank1]: return forward_call(*args, **kwargs) -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1293, in forward -[rank1]: return F.cross_entropy( -[rank1]: ^^^^^^^^^^^^^^^^ -[rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/functional.py", line 3479, in cross_entropy -[rank1]: return torch._C._nn.cross_entropy_loss( -[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.24 GiB. GPU 1 has a total capacity of 79.21 GiB of which 604.75 MiB is free. Including non-PyTorch memory, this process has 78.61 GiB memory in use. Of the allocated memory 73.91 GiB is allocated by PyTorch, and 3.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -king train dataset: 2%|▏ | 2000/93733 [00:38<20:32, 74.46 examples/s] Packing train dataset: 3%|▎ | 3000/93733 [00:39<19:14, 78.58 examples/s] Packing train dataset: 3%|▎ | 3000/93733 [00:51<19:14, 78.58 examples/s] Packing train dataset: 4%|▍ | 4000/93733 [00:51<19:08, 78.10 examples/s] Packing train dataset: 4%|▍ | 4000/93733 [01:03<19:08, 78.10 examples/s] Packing train dataset: 5%|▌ | 5000/93733 [01:04<18:50, 78.50 examples/s] Packing train dataset: 5%|▌ | 5000/93733 [01:16<18:50, 78.50 examples/s] Packing train dataset: 6%|▋ | 6000/93733 [01:16<18:18, 79.87 examples/s] Packing train dataset: 6%|▋ | 6000/93733 [01:28<18:18, 79.87 examples/s] Packing train dataset: 7%|▋ | 7000/93733 [01:29<18:13, 79.31 examples/s] Packing train dataset: 7%|▋ | 7000/93733 [01:42<18:13, 79.31 examples/s] Packing train dataset: 9%|▊ | 8000/93733 [01:43<18:31, 77.11 examples/s] Packing train dataset: 9%|▊ | 8000/93733 [01:55<18:31, 77.11 examples/s] Packing train dataset: 10%|▉ | 9000/93733 [01:55<18:09, 77.76 examples/s] Packing train dataset: 10%|▉ | 9000/93733 [02:08<18:09, 77.76 examples/s] Packing train dataset: 11%|█ | 10000/93733 [02:09<18:11, 76.69 examples/s] Packing train dataset: 11%|█ | 10000/93733 [02:21<18:11, 76.69 examples/s] Packing train dataset: 12%|█▏ | 11000/93733 [02:22<17:51, 77.18 examples/s] Packing train dataset: 12%|█▏ | 11000/93733 [02:34<17:51, 77.18 examples/s] Packing train dataset: 13%|█▎ | 12000/93733 [02:35<17:44, 76.77 examples/s] Packing train dataset: 13%|█▎ | 12000/93733 [02:47<17:44, 76.77 examples/s] Packing train dataset: 14%|█▍ | 13000/93733 [02:48<17:28, 77.00 examples/s] Packing train dataset: 14%|█▍ | 13000/93733 [03:00<17:28, 77.00 examples/s] Packing train dataset: 15%|█▍ | 14000/93733 [03:00<17:04, 77.81 examples/s] Packing train dataset: 15%|█▍ | 14000/93733 [03:12<17:04, 77.81 examples/s] Packing train dataset: 16%|█▌ | 15000/93733 [03:13<16:52, 77.76 examples/s] Packing train dataset: 16%|█▌ | 15000/93733 [03:26<16:52, 77.76 examples/s] Packing train dataset: 17%|█▋ | 16000/93733 [03:26<16:53, 76.73 examples/s] Packing train dataset: 17%|█▋ | 16000/93733 [03:38<16:53, 76.73 examples/s] Packing train dataset: 18%|█▊ | 17000/93733 [03:39<16:28, 77.63 examples/s] Packing train dataset: 18%|█▊ | 17000/93733 [03:52<16:28, 77.63 examples/s] Packing train dataset: 19%|█▉ | 18000/93733 [03:53<16:31, 76.37 examples/s] Packing train dataset: 19%|█▉ | 18000/93733 [04:05<16:31, 76.37 examples/s] Packing train dataset: 20%|██ | 19000/93733 [04:05<16:10, 76.99 examples/s] Packing train dataset: 20%|██ | 19000/93733 [04:17<16:10, 76.99 examples/s] Packing train dataset: 21%|██▏ | 20000/93733 [04:18<15:40, 78.36 examples/s] Packing train dataset: 21%|██▏ | 20000/93733 [04:29<15:40, 78.36 examples/s] Packing train dataset: 22%|██▏ | 21000/93733 [04:30<15:15, 79.41 examples/s] Packing train dataset: 22%|██▏ | 21000/93733 [04:42<15:15, 79.41 examples/s] Packing train dataset: 23%|██▎ | 22000/93733 [04:43<15:08, 78.97 examples/s] Packing train dataset: 23%|██▎ | 22000/93733 [04:55<15:08, 78.97 examples/s] Packing train dataset: 25%|██▍ | 23000/93733 [04:56<15:10, 77.69 examples/s] Packing train dataset: 25%|██▍ | 23000/93733 [05:08<15:10, 77.69 examples/s] Packing train dataset: 26%|██▌ | 24000/93733 [05:09<14:59, 77.55 examples/s] Packing train dataset: 26%|██▌ | 24000/93733 [05:21<14:59, 77.55 examples/s] Packing train dataset: 27%|██▋ | 25000/93733 [05:22<14:43, 77.82 examples/s] Packing train dataset: 27%|██▋ | 25000/93733 [05:34<14:43, 77.82 examples/s] Packing train dataset: 28%|██▊ | 26000/93733 [05:34<14:27, 78.10 examples/s] Packing train dataset: 28%|██▊ | 26000/93733 [05:46<14:27, 78.10 examples/s] Packing train dataset: 29%|██▉ | 27000/93733 [05:47<14:09, 78.51 examples/s] Packing train dataset: 29%|██▉ | 27000/93733 [05:59<14:09, 78.51 examples/s] Packing train dataset: 30%|██▉ | 28000/93733 [06:00<14:00, 78.25 examples/s] Packing train dataset: 30%|██▉ | 28000/93733 [06:13<14:00, 78.25 examples/s] Packing train dataset: 31%|███ | 29000/93733 [06:13<14:03, 76.77 examples/s] Packing train dataset: 31%|███ | 29000/93733 [06:26<14:03, 76.77 examples/s] Packing train dataset: 32%|███▏ | 30000/93733 [06:27<14:00, 75.81 examples/s] Packing train dataset: 32%|███▏ | 30000/93733 [06:40<14:00, 75.81 examples/s] Packing train dataset: 33%|███▎ | 31000/93733 [06:40<13:48, 75.69 examples/s] Packing train dataset: 33%|███▎ | 31000/93733 [06:53<13:48, 75.69 examples/s] Packing train dataset: 34%|███▍ | 32000/93733 [06:54<13:45, 74.74 examples/s] Packing train dataset: 34%|███▍ | 32000/93733 [07:06<13:45, 74.74 examples/s] Packing train dataset: 35%|███▌ | 33000/93733 [07:07<13:17, 76.16 examples/s] Packing train dataset: 35%|███▌ | 33000/93733 [07:19<13:17, 76.16 examples/s] Packing train dataset: 36%|███▋ | 34000/93733 [07:19<12:59, 76.63 examples/s] Packing train dataset: 36%|███▋ | 34000/93733 [07:31<12:59, 76.63 examples/s] Packing train dataset: 37%|███▋ | 35000/93733 [07:32<12:35, 77.73 examples/s] Packing train dataset: 37%|███▋ | 35000/93733 [07:45<12:35, 77.73 examples/s] Packing train dataset: 38%|███▊ | 36000/93733 [07:45<12:36, 76.35 examples/s] Packing train dataset: 38%|███▊ | 36000/93733 [07:58<12:36, 76.35 examples/s] Packing train dataset: 39%|███▉ | 37000/93733 [07:59<12:30, 75.60 examples/s] Packing train dataset: 39%|███▉ | 37000/93733 [08:11<12:30, 75.60 examples/s] Packing train dataset: 41%|████ | 38000/93733 [08:12<12:15, 75.79 examples/s] Packing train dataset: 41%|████ | 38000/93733 [08:24<12:15, 75.79 examples/s] Packing train dataset: 42%|████▏ | 39000/93733 [08:25<11:56, 76.41 examples/s] Packing train dataset: 42%|████▏ | 39000/93733 [08:38<11:56, 76.41 examples/s] Packing train dataset: 43%|████▎ | 40000/93733 [08:38<11:49, 75.70 examples/s] Packing train dataset: 43%|████▎ | 40000/93733 [08:52<11:49, 75.70 examples/s] Packing train dataset: 44%|████▎ | 41000/93733 [08:52<11:46, 74.62 examples/s] Packing train dataset: 44%|████▎ | 41000/93733 [09:05<11:46, 74.62 examples/s] Packing train dataset: 45%|████▍ | 42000/93733 [09:05<11:26, 75.41 examples/s] Packing train dataset: 45%|████▍ | 42000/93733 [09:19<11:26, 75.41 examples/s] Packing train dataset: 46%|████▌ | 43000/93733 [09:19<11:24, 74.11 examples/s] Packing train dataset: 46%|████▌ | 43000/93733 [09:33<11:24, 74.11 examples/s] Packing train dataset: 47%|████▋ | 44000/93733 [09:33<11:15, 73.57 examples/s] Packing train dataset: 47%|████▋ | 44000/93733 [09:46<11:15, 73.57 examples/s] Packing train dataset: 48%|████▊ | 45000/93733 [09:46<10:54, 74.41 examples/s] Packing train dataset: 48%|████▊ | 45000/93733 [09:59<10:54, 74.41 examples/s] Packing train dataset: 49%|████▉ | 46000/93733 [10:00<10:40, 74.54 examples/s] Packing train dataset: 49%|████▉ | 46000/93733 [10:12<10:40, 74.54 examples/s] Packing train dataset: 50%|█████ | 47000/93733 [10:13<10:27, 74.50 examples/s] Packing train dataset: 50%|█████ | 47000/93733 [10:26<10:27, 74.50 examples/s] Packing train dataset: 51%|█████ | 48000/93733 [10:27<10:16, 74.12 examples/s] Packing train dataset: 51%|█████ | 48000/93733 [10:39<10:16, 74.12 examples/s] Packing train dataset: 52%|█████▏ | 49000/93733 [10:40<09:59, 74.64 examples/s] Packing train dataset: 52%|█████▏ | 49000/93733 [10:53<09:59, 74.64 examples/s] Packing train dataset: 53%|█████▎ | 50000/93733 [10:53<09:46, 74.53 examples/s] Packing train dataset: 53%|█████▎ | 50000/93733 [11:06<09:46, 74.53 examples/s] Packing train dataset: 54%|█████▍ | 51000/93733 [11:06<09:30, 74.90 examples/s] Packing train dataset: 54%|█████▍ | 51000/93733 [11:19<09:30, 74.90 examples/s] Packing train dataset: 55%|█████▌ | 52000/93733 [11:20<09:21, 74.32 examples/s] Packing train dataset: 55%|█████▌ | 52000/93733 [11:33<09:21, 74.32 examples/s] Packing train dataset: 57%|█████▋ | 53000/93733 [11:34<09:09, 74.14 examples/s] Packing train dataset: 57%|█████▋ | 53000/93733 [11:47<09:09, 74.14 examples/s] Packing train dataset: 58%|█████▊ | 54000/93733 [11:48<09:00, 73.50 examples/s] Packing train dataset: 58%|█████▊ | 54000/93733 [12:00<09:00, 73.50 examples/s] Packing train dataset: 59%|█████▊ | 55000/93733 [12:01<08:42, 74.09 examples/s] Packing train dataset: 59%|█████▊ | 55000/93733 [12:15<08:42, 74.09 examples/s] Packing train dataset: 60%|█████▉ | 56000/93733 [12:15<08:37, 72.92 examples/s] Packing train dataset: 60%|█████▉ | 56000/93733 [12:28<08:37, 72.92 examples/s] Packing train dataset: 61%|██████ | 57000/93733 [12:28<08:18, 73.65 examples/s] Packing train dataset: 61%|██████ | 57000/93733 [12:41<08:18, 73.65 examples/s] Packing train dataset: 62%|██████▏ | 58000/93733 [12:42<08:02, 74.07 examples/s] Packing train dataset: 62%|██████▏ | 58000/93733 [12:54<08:02, 74.07 examples/s] Packing train dataset: 63%|██████▎ | 59000/93733 [12:55<07:42, 75.11 examples/s] Packing train dataset: 63%|██████▎ | 59000/93733 [13:07<07:42, 75.11 examples/s] Packing train dataset: 64%|██████▍ | 60000/93733 [13:08<07:31, 74.77 examples/s] Packing train dataset: 64%|██████▍ | 60000/93733 [13:22<07:31, 74.77 examples/s] Packing train dataset: 65%|██████▌ | 61000/93733 [13:22<07:24, 73.68 examples/s] Packing train dataset: 65%|██████▌ | 61000/93733 [13:34<07:24, 73.68 examples/s] Packing train dataset: 66%|██████▌ | 62000/93733 [13:35<07:03, 75.00 examples/s] Packing train dataset: 66%|██████▌ | 62000/93733 [13:46<07:03, 75.00 examples/s] Packing train dataset: 67%|██████▋ | 63000/93733 [13:47<06:36, 77.56 examples/s] Packing train dataset: 67%|██████▋ | 63000/93733 [13:59<06:36, 77.56 examples/s] Packing train dataset: 68%|██████▊ | 64000/93733 [14:00<06:26, 77.00 examples/s] Packing train dataset: 68%|██████▊ | 64000/93733 [14:13<06:26, 77.00 examples/s] Packing train dataset: 69%|██████▉ | 65000/93733 [14:14<06:18, 75.93 examples/s] Packing train dataset: 69%|██████▉ | 65000/93733 [14:27<06:18, 75.93 examples/s] Packing train dataset: 70%|███████ | 66000/93733 [14:28<06:12, 74.45 examples/s] Packing train dataset: 70%|███████ | 66000/93733 [14:41<06:12, 74.45 examples/s] Packing train dataset: 71%|███████▏ | 67000/93733 [14:42<06:03, 73.55 examples/s] Packing train dataset: 71%|███████▏ | 67000/93733 [14:54<06:03, 73.55 examples/s] Packing train dataset: 73%|███████▎ | 68000/93733 [14:55<05:49, 73.68 examples/s] Packing train dataset: 73%|███████▎ | 68000/93733 [15:08<05:49, 73.68 examples/s] Packing train dataset: 74%|███████▎ | 69000/93733 [15:09<05:37, 73.18 examples/s] Packing train dataset: 74%|███████▎ | 69000/93733 [15:22<05:37, 73.18 examples/s] Packing train dataset: 75%|███████▍ | 70000/93733 [15:22<05:21, 73.76 examples/s] Packing train dataset: 75%|███████▍ | 70000/93733 [15:35<05:21, 73.76 examples/s] Packing train dataset: 76%|███████▌ | 71000/93733 [15:36<05:08, 73.71 examples/s] Packing train dataset: 76%|███████▌ | 71000/93733 [15:48<05:08, 73.71 examples/s] Packing train dataset: 77%|███████▋ | 72000/93733 [15:49<04:49, 75.15 examples/s] Packing train dataset: 77%|███████▋ | 72000/93733 [16:01<04:49, 75.15 examples/s] Packing train dataset: 78%|███████▊ | 73000/93733 [16:02<04:36, 75.03 examples/s] Packing train dataset: 78%|███████▊ | 73000/93733 [16:14<04:36, 75.03 examples/s] Packing train dataset: 79%|███████▉ | 74000/93733 [16:15<04:21, 75.36 examples/s] Packing train dataset: 79%|███████▉ | 74000/93733 [16:27<04:21, 75.36 examples/s] Packing train dataset: 80%|████████ | 75000/93733 [16:28<04:06, 75.84 examples/s] Packing train dataset: 80%|████████ | 75000/93733 [16:40<04:06, 75.84 examples/s] Packing train dataset: 81%|████████ | 76000/93733 [16:41<03:51, 76.72 examples/s] Packing train dataset: 81%|████████ | 76000/93733 [16:53<03:51, 76.72 examples/s] Packing train dataset: 82%|████████▏ | 77000/93733 [16:53<03:35, 77.67 examples/s] Packing train dataset: 82%|████████▏ | 77000/93733 [17:05<03:35, 77.67 examples/s] Packing train dataset: 83%|████████▎ | 78000/93733 [17:06<03:22, 77.83 examples/s] Packing train dataset: 83%|████████▎ | 78000/93733 [17:19<03:22, 77.83 examples/s] Packing train dataset: 84%|████████▍ | 79000/93733 [17:19<03:11, 77.05 examples/s] Packing train dataset: 84%|████████▍ | 79000/93733 [17:32<03:11, 77.05 examples/s] Packing train dataset: 85%|████████▌ | 80000/93733 [17:33<03:00, 76.11 examples/s] Packing train dataset: 85%|████████▌ | 80000/93733 [17:44<03:00, 76.11 examples/s] Packing train dataset: 86%|████████▋ | 81000/93733 [17:45<02:43, 77.72 examples/s] Packing train dataset: 86%|████████▋ | 81000/93733 [17:57<02:43, 77.72 examples/s] Packing train dataset: 87%|████████▋ | 82000/93733 [17:58<02:31, 77.53 examples/s] Packing train dataset: 87%|████████▋ | 82000/93733 [18:11<02:31, 77.53 examples/s] Packing train dataset: 89%|████████▊ | 83000/93733 [18:11<02:20, 76.58 examples/s] Packing train dataset: 89%|████████▊ | 83000/93733 [18:24<02:20, 76.58 examples/s] Packing train dataset: 90%|████████▉ | 84000/93733 [18:25<02:07, 76.53 examples/s] Packing train dataset: 90%|████████▉ | 84000/93733 [18:38<02:07, 76.53 examples/s] Packing train dataset: 91%|█████████ | 85000/93733 [18:38<01:55, 75.51 examples/s] Packing train dataset: 91%|█████████ | 85000/93733 [18:52<01:55, 75.51 examples/s] Packing train dataset: 92%|█████████▏| 86000/93733 [18:52<01:44, 74.26 examples/s] Packing train dataset: 92%|█████████▏| 86000/93733 [19:04<01:44, 74.26 examples/s] Packing train dataset: 93%|█████████▎| 87000/93733 [19:04<01:27, 76.52 examples/s] Packing train dataset: 93%|█████████▎| 87000/93733 [19:17<01:27, 76.52 examples/s] Packing train dataset: 94%|█████████▍| 88000/93733 [19:17<01:14, 76.75 examples/s] Packing train dataset: 94%|█████████▍| 88000/93733 [19:29<01:14, 76.75 examples/s] Packing train dataset: 95%|█████████▍| 89000/93733 [19:30<01:00, 78.05 examples/s] Packing train dataset: 95%|█████████▍| 89000/93733 [19:42<01:00, 78.05 examples/s] Packing train dataset: 96%|█████████▌| 90000/93733 [19:43<00:48, 77.47 examples/s] Packing train dataset: 96%|█████████▌| 90000/93733 [19:55<00:48, 77.47 examples/s] Packing train dataset: 97%|█████████▋| 91000/93733 [19:56<00:35, 77.16 examples/s] Packing train dataset: 97%|█████████▋| 91000/93733 [20:08<00:35, 77.16 examples/s] Packing train dataset: 98%|█████████▊| 92000/93733 [20:08<00:22, 77.93 examples/s] Packing train dataset: 98%|█████████▊| 92000/93733 [20:22<00:22, 77.93 examples/s] Packing train dataset: 99%|█████████▉| 93000/93733 [20:22<00:09, 76.18 examples/s] Packing train dataset: 100%|██████████| 93733/93733 [20:28<00:00, 83.02 examples/s] Packing train dataset: 100%|██████████| 93733/93733 [20:29<00:00, 76.27 examples/s] -Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. -2025-04-12 00:20:30 - INFO - __main__ - *** Train *** -2025-04-12 00:20:30 - INFO - __main__ - DeepseekV2ForCausalLM( +2025-04-20 03:40:15 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-20 03:40:15 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-20 03:40:15 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr20_03-40-15_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-20 03:40:21 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-20 19:56:05 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-20 19:56:05 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-20 19:56:05 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr20_19-56-05_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-20 19:56:08 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-20 21:50:18 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-20 21:50:18 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-20 21:50:18 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr20_21-50-18_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-20 21:50:20 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-20 22:29:43 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-20 22:29:43 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-20 22:29:43 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr20_22-29-43_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-20 22:29:46 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-20 23:19:05 - INFO - __main__ - *** Train *** +2025-04-20 23:19:05 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-20 23:20:07 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-20 23:20:07 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-20 23:20:07 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr20_23-20-06_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-20 23:20:09 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-20 23:22:40 - INFO - __main__ - *** Train *** +2025-04-20 23:22:40 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 01:20:42 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 01:20:42 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 01:20:42 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_01-20-42_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 01:20:44 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 01:23:18 - INFO - __main__ - *** Train *** +2025-04-21 01:23:18 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 01:53:56 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 01:53:56 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 01:53:56 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_01-53-56_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 01:53:58 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 01:56:30 - INFO - __main__ - *** Train *** +2025-04-21 01:56:30 - INFO - __main__ - DeepseekV2ForCausalLM( (model): DeepseekV2Model( (embed_tokens): Embedding(102400, 2048) (layers): ModuleList( @@ -94,13 +1360,232 @@ Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; ) (mlp): DeepseekV2MoE( (experts): ModuleList( - (0-63): 64 x DeepseekV2MLP( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) (up_proj): Linear(in_features=2048, out_features=1408, bias=False) (down_proj): Linear(in_features=1408, out_features=2048, bias=False) (act_fn): SiLU() ) ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 02:34:50 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 02:34:50 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 02:34:50 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_02-34-49_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=True, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 02:34:53 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 02:37:25 - INFO - __main__ - *** Train *** +2025-04-21 02:37:25 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) (gate): MoEGate() (shared_experts): DeepseekV2MLP( (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) @@ -108,6 +1593,26493 @@ Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; (down_proj): Linear(in_features=2816, out_features=2048, bias=False) (act_fn): SiLU() ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 02:41:23 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 02:41:23 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 02:41:23 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_02-41-22_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 02:41:25 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 02:49:34 - INFO - __main__ - *** Train *** +2025-04-21 02:49:34 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 02:52:10 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 02:52:10 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 02:52:10 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_02-52-10_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 02:52:13 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 02:54:44 - INFO - __main__ - *** Train *** +2025-04-21 02:54:44 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 02:56:06 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 02:56:06 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 02:56:06 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_02-56-06_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 02:56:08 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 02:58:40 - INFO - __main__ - *** Train *** +2025-04-21 02:58:40 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 03:01:28 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 03:01:28 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 03:01:28 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_03-01-27_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 03:01:33 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 03:04:04 - INFO - __main__ - *** Train *** +2025-04-21 03:04:04 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 03:08:12 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 03:08:12 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 03:08:12 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_03-08-12_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=4, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 03:08:15 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 03:24:16 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 03:24:16 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 03:24:16 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_03-24-16_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=4, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 03:24:20 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 03:51:31 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 03:51:31 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 03:51:31 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_03-51-31_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 03:51:43 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 03:54:10 - INFO - __main__ - *** Train *** +2025-04-21 03:54:10 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 03:54:58 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 03:54:58 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 03:54:58 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_03-54-58_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 03:55:01 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 03:57:18 - INFO - __main__ - *** Train *** +2025-04-21 03:57:18 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 03:59:37 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 03:59:37 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 03:59:37 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_03-59-37_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 03:59:41 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 04:30:16 - INFO - __main__ - *** Train *** +2025-04-21 04:30:16 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 05:10:12 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, use_dora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:10:12 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:10:12 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +do_eval=True, +do_predict=False, +do_train=False, +eos_token=, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-10-11_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +pad_token=, +padding_free=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tp_size=0, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:11:01 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, use_dora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:11:01 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:11:01 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +do_eval=True, +do_predict=False, +do_train=False, +eos_token=, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-11-01_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +pad_token=, +padding_free=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tp_size=0, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:12:30 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, use_dora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:12:30 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:12:30 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +do_eval=True, +do_predict=False, +do_train=False, +eos_token=, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-12-30_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +pad_token=, +padding_free=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tp_size=0, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:14:12 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, use_dora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:14:12 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:14:12 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +do_eval=True, +do_predict=False, +do_train=False, +eos_token=, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-14-11_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +pad_token=, +padding_free=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tp_size=0, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:16:02 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:16:02 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:16:02 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-16-02_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:16:05 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 05:18:19 - INFO - __main__ - *** Train *** +2025-04-21 05:18:19 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 05:22:22 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:22:22 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:22:22 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-22-21_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:22:24 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 05:24:40 - INFO - __main__ - *** Train *** +2025-04-21 05:24:40 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 05:31:11 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:31:11 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:31:11 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-31-11_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:31:14 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 05:33:32 - INFO - __main__ - *** Train *** +2025-04-21 05:33:32 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 05:53:05 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 05:53:05 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 05:53:05 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_05-53-05_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 05:53:08 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 14:30:03 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 14:30:03 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 14:30:03 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_14-30-03_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 14:30:05 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 14:32:20 - INFO - __main__ - *** Train *** +2025-04-21 14:32:20 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 14:44:11 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 14:44:11 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 14:44:11 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_14-44-10_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 14:44:13 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 14:46:31 - INFO - __main__ - *** Train *** +2025-04-21 14:46:31 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 15:00:23 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 15:00:23 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 15:00:23 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_15-00-23_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 15:00:28 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 15:03:33 - INFO - __main__ - *** Train *** +2025-04-21 15:03:33 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-21 15:10:38 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-21 15:10:38 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-21 15:10:38 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr21_15-10-38_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=batchmean, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-21 15:10:40 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-21 15:13:32 - INFO - __main__ - *** Train *** +2025-04-21 15:13:32 - INFO - __main__ - DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-24 23:38:11 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-24 23:38:11 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-24 23:38:11 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr24_23-38-10_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-24 23:38:13 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-24 23:38:13 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-24 23:38:13 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-24 23:38:33 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-24 23:38:33 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-24 23:38:33 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-24 23:38:33 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 8743.642578125 +Memory reserved: 8782.0 +2025-04-24 23:38:33 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 00:08:03 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10797.78564453125 +Memory reserved: 11686.0 +2025-04-25 00:08:03 - INFO - __main__ - *** Starting training *** +2025-04-25 00:08:03 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (experts): ModuleList( + (0-63): 64 x Identity() + ) + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 00:21:03 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 00:21:03 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 00:21:03 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_00-21-02_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 00:21:05 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 00:21:05 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 00:21:05 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 00:21:25 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-25 00:21:25 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 00:21:36 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 00:21:36 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-25 00:21:36 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 00:22:04 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 11370.0 +2025-04-25 00:22:04 - INFO - __main__ - *** Starting training *** +2025-04-25 00:22:04 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 01:34:50 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 01:34:50 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 01:34:50 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=180000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_01-34-50_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 01:34:54 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 01:34:54 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 01:34:54 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 01:35:15 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-25 01:35:15 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 01:35:26 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 01:35:26 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-25 01:35:26 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 01:35:55 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 11370.0 +2025-04-25 01:35:55 - INFO - __main__ - *** Starting training *** +2025-04-25 01:35:55 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 14:12:58 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 14:12:58 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 14:12:58 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_14-12-58_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=128, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 14:13:01 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 14:13:01 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 14:13:01 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 14:15:09 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-25 14:15:09 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 14:15:22 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 14:15:22 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-25 14:15:22 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 14:22:39 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 11370.0 +2025-04-25 14:22:39 - INFO - __main__ - *** Starting training *** +2025-04-25 14:22:39 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 17:38:14 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 17:38:14 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 17:38:14 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_17-38-14_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 17:38:17 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 17:38:17 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 17:38:17 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 17:40:36 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-25 17:40:36 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 17:40:44 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 17:40:44 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-25 17:40:44 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 17:43:00 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 11370.0 +2025-04-25 17:43:00 - INFO - __main__ - *** Starting training *** +2025-04-25 17:43:00 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 18:11:06 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 18:11:06 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 18:11:06 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_18-11-06_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 18:11:08 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 18:11:08 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 18:11:08 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 18:13:19 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-25 18:13:19 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 18:13:28 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 18:13:28 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-25 18:13:28 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 18:15:50 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 11370.0 +2025-04-25 18:15:50 - INFO - __main__ - *** Starting training *** +2025-04-25 18:15:50 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 19:56:05 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 19:56:05 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 19:56:05 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_19-56-04_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 19:56:07 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 19:56:07 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 19:56:07 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 19:58:12 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-25 19:58:12 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 19:58:25 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 19:58:25 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-25 19:58:25 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:00:42 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 11370.0 +2025-04-25 20:00:42 - INFO - __main__ - *** Starting training *** +2025-04-25 20:00:42 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 20:06:53 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:06:53 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:06:53 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-06-52_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:06:55 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:06:55 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:06:55 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:08:24 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-25 20:08:24 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:08:36 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 20:08:36 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-25 20:08:36 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:10:22 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 10556.0 +2025-04-25 20:10:22 - INFO - __main__ - *** Starting training *** +2025-04-25 20:10:22 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 20:13:13 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:13:13 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:13:13 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-13-13_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:13:15 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:13:15 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:13:15 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:13:33 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:13:33 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:13:42 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 20:13:42 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2373.0087890625 +Memory reserved: 4324.0 +2025-04-25 20:13:42 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:14:11 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18826.64306640625 +Memory reserved: 19044.0 +2025-04-25 20:14:11 - INFO - __main__ - *** Starting training *** +2025-04-25 20:14:11 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 20:20:11 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:20:11 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:20:11 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-20-10_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:20:13 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:20:13 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:20:13 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:20:31 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:20:31 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:20:40 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 20:20:40 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2373.0087890625 +Memory reserved: 4324.0 +2025-04-25 20:20:40 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:21:13 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18826.64306640625 +Memory reserved: 19044.0 +2025-04-25 20:21:13 - INFO - __main__ - *** Starting training *** +2025-04-25 20:21:13 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 20:25:40 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:25:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:25:40 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-25-40_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:25:42 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:25:42 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:25:42 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:26:00 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:26:00 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:26:12 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 20:26:12 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2373.0087890625 +Memory reserved: 4324.0 +2025-04-25 20:26:12 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:26:41 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18826.64306640625 +Memory reserved: 19044.0 +2025-04-25 20:26:41 - INFO - __main__ - *** Starting training *** +2025-04-25 20:26:41 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 20:31:11 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:31:11 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:31:11 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-31-11_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:31:14 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:31:14 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:31:14 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:31:31 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:31:31 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:31:42 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 20:31:42 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2373.0087890625 +Memory reserved: 4324.0 +2025-04-25 20:31:42 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:32:11 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18826.64306640625 +Memory reserved: 19044.0 +2025-04-25 20:32:11 - INFO - __main__ - *** Starting training *** +2025-04-25 20:32:11 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 20:35:42 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:35:42 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:35:42 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-35-42_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:35:45 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:35:45 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:35:45 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:36:03 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:36:03 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:42:40 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:42:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:42:40 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-42-40_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:42:43 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:42:43 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:42:43 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:43:00 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:43:00 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:49:14 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:49:14 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:49:14 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-49-14_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:49:17 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:49:17 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:49:17 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:50:22 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:50:22 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:50:22 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-50-21_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:50:25 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:50:25 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:50:25 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:52:26 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:52:26 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:52:26 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-52-26_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:52:28 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:52:28 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:52:28 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:53:16 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:53:16 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:53:16 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-53-15_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:53:18 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:53:18 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:53:18 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:53:35 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:53:35 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:53:48 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 20:53:48 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2373.0087890625 +Memory reserved: 4324.0 +2025-04-25 20:53:48 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:54:16 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18826.64306640625 +Memory reserved: 19044.0 +2025-04-25 20:54:16 - INFO - __main__ - *** Starting training *** +2025-04-25 20:54:16 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 20:57:42 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 20:57:42 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 20:57:42 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_20-57-42_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 20:57:45 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 20:57:45 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 20:57:45 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 20:58:02 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 20:58:02 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 20:58:11 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 20:58:11 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2373.0087890625 +Memory reserved: 4324.0 +2025-04-25 20:58:11 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 20:58:38 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18826.64306640625 +Memory reserved: 19044.0 +2025-04-25 20:58:38 - INFO - __main__ - *** Starting training *** +2025-04-25 20:58:38 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 21:03:36 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:03:36 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:03:36 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-03-36_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:03:39 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:03:39 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:03:39 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:07:57 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:07:57 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:07:57 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-07-57_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:07:59 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:07:59 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:07:59 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:12:19 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:12:19 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:12:19 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-12-19_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:12:21 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:12:21 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:12:21 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:14:47 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:14:47 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:14:47 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-14-47_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:14:49 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:14:49 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:14:49 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:15:06 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.5087890625 +Memory reserved: 18266.0 +2025-04-25 21:15:06 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 21:15:13 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 21:15:13 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.2587890625 +Memory reserved: 4724.0 +2025-04-25 21:15:13 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 21:15:43 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19207.39306640625 +Memory reserved: 19424.0 +2025-04-25 21:15:43 - INFO - __main__ - *** Starting training *** +2025-04-25 21:15:43 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-25 21:20:06 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:20:06 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:20:06 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-20-06_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:20:08 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:20:08 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:20:08 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:20:26 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.5087890625 +Memory reserved: 18266.0 +2025-04-25 21:20:26 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 21:20:34 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 21:20:34 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.2587890625 +Memory reserved: 4724.0 +2025-04-25 21:20:34 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 21:21:04 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19198.70556640625 +Memory reserved: 20020.0 +2025-04-25 21:21:04 - INFO - __main__ - *** Starting training *** +2025-04-25 21:21:04 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-25 21:37:46 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:37:46 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:37:46 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-37-45_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:37:49 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:37:49 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:37:49 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:38:06 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.5087890625 +Memory reserved: 18266.0 +2025-04-25 21:38:06 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 21:38:15 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 21:38:15 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.2587890625 +Memory reserved: 4724.0 +2025-04-25 21:38:15 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 21:42:40 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:42:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:42:40 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-42-40_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:42:42 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:42:42 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:42:42 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:42:59 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.5087890625 +Memory reserved: 18266.0 +2025-04-25 21:42:59 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 21:43:08 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 21:43:08 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.2587890625 +Memory reserved: 4724.0 +2025-04-25 21:43:08 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 21:43:36 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19198.70556640625 +Memory reserved: 20020.0 +2025-04-25 21:43:36 - INFO - __main__ - *** Starting training *** +2025-04-25 21:43:36 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-25 21:57:40 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 21:57:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 21:57:40 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_21-57-40_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 21:57:43 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 21:57:43 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 21:57:43 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 21:58:02 - INFO - __main__ - Model memory after loading model:Memory allocated: 16494.2587890625 +Memory reserved: 17466.0 +2025-04-25 21:58:02 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 21:58:10 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 21:58:10 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2373.0087890625 +Memory reserved: 4324.0 +2025-04-25 21:58:10 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 21:58:39 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18817.95556640625 +Memory reserved: 19640.0 +2025-04-25 21:58:39 - INFO - __main__ - *** Starting training *** +2025-04-25 21:58:39 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-25 22:17:55 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 22:17:55 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 22:17:55 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_22-17-55_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 22:17:58 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 22:17:58 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 22:17:58 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 22:18:15 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.6494140625 +Memory reserved: 18266.0 +2025-04-25 22:18:15 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 22:18:24 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 22:18:24 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.3994140625 +Memory reserved: 4724.0 +2025-04-25 22:18:24 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 22:18:51 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 18848.66650390625 +Memory reserved: 19424.0 +2025-04-25 22:18:51 - INFO - __main__ - *** Starting training *** +2025-04-25 22:18:51 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-25 22:25:55 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 22:25:55 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 22:25:55 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_22-25-54_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 22:25:57 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 22:25:57 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 22:25:57 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 22:26:14 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.6494140625 +Memory reserved: 18266.0 +2025-04-25 22:26:14 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 22:26:23 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 22:26:23 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.3994140625 +Memory reserved: 4724.0 +2025-04-25 22:26:23 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 22:26:51 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19200.88134765625 +Memory reserved: 20020.0 +2025-04-25 22:26:51 - INFO - __main__ - *** Starting training *** +2025-04-25 22:26:51 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-25 22:35:17 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 22:35:17 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 22:35:17 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_22-35-17_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 22:35:20 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 22:35:20 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 22:35:20 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 22:35:38 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.6494140625 +Memory reserved: 18266.0 +2025-04-25 22:35:38 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 22:35:49 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 22:35:49 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.3994140625 +Memory reserved: 4724.0 +2025-04-25 22:35:49 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 22:36:16 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19200.88134765625 +Memory reserved: 20020.0 +2025-04-25 22:36:16 - INFO - __main__ - *** Starting training *** +2025-04-25 22:36:16 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-25 22:37:17 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 22:37:17 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 22:37:17 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_22-37-17_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 22:37:20 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 22:37:20 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 22:37:20 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 22:37:53 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 22:37:53 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 22:37:53 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_22-37-52_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 22:37:55 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 22:37:55 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 22:37:55 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 22:38:13 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.6494140625 +Memory reserved: 18266.0 +2025-04-25 22:38:13 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 22:38:19 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 22:38:19 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.3994140625 +Memory reserved: 4724.0 +2025-04-25 22:38:19 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 22:38:50 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19200.88134765625 +Memory reserved: 20020.0 +2025-04-25 22:38:50 - INFO - __main__ - *** Starting training *** +2025-04-25 22:38:50 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-25 22:41:21 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 22:41:21 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 22:41:21 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_22-41-21_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 22:41:23 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 22:41:23 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 22:41:23 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 22:41:40 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.6494140625 +Memory reserved: 18266.0 +2025-04-25 22:41:40 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 22:41:52 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 22:41:52 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.3994140625 +Memory reserved: 4724.0 +2025-04-25 22:41:52 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 22:42:19 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19200.88134765625 +Memory reserved: 20020.0 +2025-04-25 22:42:19 - INFO - __main__ - *** Starting training *** +2025-04-25 22:42:19 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-25 22:55:27 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 22:55:27 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 22:55:27 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_22-55-27_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 22:55:30 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 22:55:30 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 22:55:30 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 22:55:47 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.5087890625 +Memory reserved: 18266.0 +2025-04-25 22:55:47 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 22:55:57 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 22:55:57 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.2587890625 +Memory reserved: 4724.0 +2025-04-25 22:55:57 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 22:56:24 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19198.70556640625 +Memory reserved: 20020.0 +2025-04-25 22:56:24 - INFO - __main__ - *** Starting training *** +2025-04-25 22:56:24 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-25 23:11:40 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 23:11:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 23:11:40 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_23-11-40_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 23:11:43 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 23:11:43 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 23:11:43 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 23:19:45 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 23:19:45 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 23:19:45 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_23-19-45_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 23:19:48 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 23:19:48 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 23:19:48 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 23:20:06 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.5087890625 +Memory reserved: 18266.0 +2025-04-25 23:20:06 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 23:20:16 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 23:20:16 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.2587890625 +Memory reserved: 4724.0 +2025-04-25 23:20:16 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 23:20:44 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19198.70556640625 +Memory reserved: 20020.0 +2025-04-25 23:20:44 - INFO - __main__ - *** Starting training *** +2025-04-25 23:20:44 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-25 23:33:53 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 23:33:53 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 23:33:53 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_23-33-53_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 23:33:56 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 23:33:56 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 23:33:56 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-25 23:34:14 - INFO - __main__ - Model memory after loading model:Memory allocated: 17275.6494140625 +Memory reserved: 18266.0 +2025-04-25 23:34:14 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-25 23:34:25 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-25 23:34:25 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2754.3994140625 +Memory reserved: 4724.0 +2025-04-25 23:34:25 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-25 23:34:52 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 19200.88134765625 +Memory reserved: 20020.0 +2025-04-25 23:34:52 - INFO - __main__ - *** Starting training *** +2025-04-25 23:34:52 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-25 23:45:03 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-25 23:45:03 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-25 23:45:03 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr25_23-45-03_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-25 23:45:06 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-25 23:45:06 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-25 23:45:06 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 04:26:55 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 04:26:55 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 04:26:55 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_04-26-55_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 04:26:58 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 04:26:58 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 04:26:58 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 04:27:17 - INFO - __main__ - Model memory after loading model:Memory allocated: 9525.033203125 +Memory reserved: 10396.0 +2025-04-26 04:27:17 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-26 04:27:27 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-26 04:27:27 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2304.345703125 +Memory reserved: 3126.0 +2025-04-26 04:27:27 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-26 04:27:56 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 11048.39892578125 +Memory reserved: 11844.0 +2025-04-26 04:27:56 - INFO - __main__ - *** Starting training *** +2025-04-26 04:27:56 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-26 16:32:41 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 16:32:41 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 16:32:41 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_16-32-40_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 16:32:43 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 16:32:43 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 16:32:43 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 16:36:39 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 16:36:39 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 16:36:39 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_16-36-39_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 16:36:41 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 16:36:41 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 16:36:41 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 16:37:07 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-26 16:37:07 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-26 16:37:19 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-26 16:37:19 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-26 16:37:55 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 16:37:55 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 16:37:55 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_16-37-54_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 16:37:57 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 16:37:57 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 16:37:57 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 16:38:22 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-26 16:38:22 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-26 16:38:31 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-26 16:38:31 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-26 16:38:31 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-26 16:46:13 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 16:46:13 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 16:46:13 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_16-46-13_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 16:46:15 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 16:46:15 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 16:46:15 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 16:46:39 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-26 16:46:39 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-26 16:46:48 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-26 16:46:48 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-26 16:46:48 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-26 17:52:56 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 17:52:56 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 17:52:56 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_17-52-56_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 17:52:58 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 17:52:58 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 17:52:58 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 17:53:21 - INFO - __main__ - Model memory after loading model:Memory allocated: 8743.642578125 +Memory reserved: 9596.0 +2025-04-26 17:53:21 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-26 17:53:29 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-26 17:53:29 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1722.955078125 +Memory reserved: 2526.0 +2025-04-26 17:53:29 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-26 17:54:03 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 10472.91064453125 +Memory reserved: 11370.0 +2025-04-26 17:54:03 - INFO - __main__ - *** Starting training *** +2025-04-26 17:54:03 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-26 18:01:27 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 18:01:27 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 18:01:27 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_18-01-27_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 18:01:30 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 18:01:30 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 18:01:30 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 18:01:50 - INFO - __main__ - Model memory after loading model:Memory allocated: 9524.892578125 +Memory reserved: 10396.0 +2025-04-26 18:01:50 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-26 18:01:59 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-26 18:01:59 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2304.205078125 +Memory reserved: 3126.0 +2025-04-26 18:01:59 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-26 18:02:29 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 11048.22314453125 +Memory reserved: 11844.0 +2025-04-26 18:02:29 - INFO - __main__ - *** Starting training *** +2025-04-26 18:02:29 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-26 18:30:09 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-26 18:30:09 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-26 18:30:09 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr26_18-30-09_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-26 18:30:12 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-26 18:30:12 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-26 18:30:12 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-26 18:30:30 - INFO - __main__ - Model memory after loading model:Memory allocated: 9524.892578125 +Memory reserved: 10396.0 +2025-04-26 18:30:30 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-26 18:30:39 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-26 18:30:39 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2304.205078125 +Memory reserved: 3126.0 +2025-04-26 18:30:39 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-26 18:31:08 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 11048.22314453125 +Memory reserved: 11844.0 +2025-04-26 18:31:08 - INFO - __main__ - *** Starting training *** +2025-04-26 18:31:08 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-27 02:07:32 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 02:07:32 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 02:07:32 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_02-07-32_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 02:07:35 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 02:07:35 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 02:07:35 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 02:07:59 - INFO - __main__ - Model memory after loading model:Memory allocated: 5617.64697265625 +Memory reserved: 8122.0 +2025-04-27 02:07:59 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 02:08:09 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 02:08:09 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2085.64697265625 +Memory reserved: 2226.0 +2025-04-27 02:08:09 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 02:08:47 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6918.04443359375 +Memory reserved: 9262.0 +2025-04-27 02:08:47 - INFO - __main__ - *** Starting training *** +2025-04-27 02:08:47 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100000, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100000, bias=False) +) +2025-04-27 02:28:12 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 02:28:12 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 02:28:12 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_02-28-11_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 02:28:14 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 02:28:14 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 02:28:14 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 02:28:40 - INFO - __main__ - Model memory after loading model:Memory allocated: 5617.78759765625 +Memory reserved: 8122.0 +2025-04-27 02:28:40 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 02:28:55 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 02:28:55 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2085.78759765625 +Memory reserved: 2226.0 +2025-04-27 02:28:55 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 02:29:30 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6918.20263671875 +Memory reserved: 9262.0 +2025-04-27 02:29:30 - INFO - __main__ - *** Starting training *** +2025-04-27 02:29:30 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-27 02:51:02 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 02:51:02 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 02:51:02 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_02-51-02_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 02:51:04 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 02:51:04 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 02:51:04 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 02:51:29 - INFO - __main__ - Model memory after loading model:Memory allocated: 5617.78759765625 +Memory reserved: 8122.0 +2025-04-27 02:51:29 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 02:51:38 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 02:51:38 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2085.78759765625 +Memory reserved: 2226.0 +2025-04-27 02:51:38 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 02:52:12 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6918.20263671875 +Memory reserved: 9262.0 +2025-04-27 02:52:12 - INFO - __main__ - *** Starting training *** +2025-04-27 02:52:12 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-27 02:59:05 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 02:59:05 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 02:59:05 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_02-59-05_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 02:59:07 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 02:59:07 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 02:59:07 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 02:59:49 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 02:59:49 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 02:59:49 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_02-59-48_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 02:59:51 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 02:59:51 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 02:59:51 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 03:00:15 - INFO - __main__ - Model memory after loading model:Memory allocated: 5617.78759765625 +Memory reserved: 8122.0 +2025-04-27 03:00:15 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 03:00:25 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 03:00:25 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 2085.78759765625 +Memory reserved: 2226.0 +2025-04-27 03:00:25 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 03:00:58 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6918.20263671875 +Memory reserved: 9262.0 +2025-04-27 03:00:58 - INFO - __main__ - *** Starting training *** +2025-04-27 03:00:58 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(100018, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=100018, bias=False) +) +2025-04-27 03:46:30 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 03:46:30 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 03:46:30 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_03-46-29_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 03:46:32 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 03:46:32 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 03:46:32 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 03:46:57 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-27 03:46:57 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 03:47:22 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 03:47:22 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-27 03:47:22 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 03:48:04 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-27 03:48:04 - INFO - __main__ - *** Starting training *** +2025-04-27 03:48:04 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-27 04:24:02 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 04:24:02 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 04:24:02 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=True, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_04-24-02_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 04:24:05 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 04:24:05 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 04:24:05 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 04:24:28 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-27 04:24:28 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 04:24:38 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 04:24:38 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-27 04:24:38 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 04:25:11 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-27 04:25:11 - INFO - __main__ - *** Starting training *** +2025-04-27 04:25:11 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-27 04:36:40 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 04:36:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 04:36:40 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_04-36-39_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=2048, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 04:36:43 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 04:36:43 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 04:36:43 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 04:37:06 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-27 04:37:06 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 04:37:16 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 04:37:16 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-27 04:37:16 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 04:37:48 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-27 04:37:48 - INFO - __main__ - *** Starting training *** +2025-04-27 04:37:48 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-27 04:45:19 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 04:45:19 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 04:45:19 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_04-45-18_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 04:45:21 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 04:45:21 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 04:45:21 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 04:45:46 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-27 04:45:46 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 04:45:55 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 04:45:55 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-27 04:45:55 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 04:51:51 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-27 04:51:51 - INFO - __main__ - *** Starting training *** +2025-04-27 04:51:51 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-27 04:53:09 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 04:53:09 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 04:53:09 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_04-53-09_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 04:53:11 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 04:53:11 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 04:53:11 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 04:53:36 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-27 04:53:36 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 04:53:46 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 04:53:46 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-27 04:53:46 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 04:54:20 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-27 04:54:20 - INFO - __main__ - *** Starting training *** +2025-04-27 04:54:20 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-27 04:58:43 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-27 04:58:43 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-27 04:58:43 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr27_04-58-43_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-27 04:58:46 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-27 04:58:46 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-27 04:58:46 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-27 04:59:10 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-27 04:59:10 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-27 04:59:19 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-27 04:59:19 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-27 04:59:19 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-27 04:59:54 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-27 04:59:54 - INFO - __main__ - *** Starting training *** +2025-04-27 04:59:54 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 19:33:17 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 19:33:17 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 19:33:17 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_19-33-17_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 19:33:20 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 19:33:20 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 19:33:20 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 19:34:19 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-28 19:34:19 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 19:34:33 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 19:34:33 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-28 19:34:33 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 19:35:44 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-28 19:35:44 - INFO - __main__ - *** Starting training *** +2025-04-28 19:35:44 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 19:36:24 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 19:36:24 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 19:36:24 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_19-36-24_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 19:36:27 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 19:36:27 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 19:36:27 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 19:36:49 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 19:36:49 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 19:36:59 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 19:36:59 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 19:36:59 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 19:37:29 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 7601.75 +Memory reserved: 8080.0 +2025-04-28 19:37:29 - INFO - __main__ - *** Starting training *** +2025-04-28 19:37:29 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 19:45:34 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 19:45:34 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 19:45:34 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_19-45-33_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 19:45:36 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 19:45:36 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 19:45:36 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 19:45:58 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 19:45:58 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 19:46:07 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 19:46:07 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 19:46:07 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 19:46:39 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 7601.75 +Memory reserved: 8080.0 +2025-04-28 19:46:39 - INFO - __main__ - *** Starting training *** +2025-04-28 19:46:39 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 19:54:58 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 19:54:58 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 19:54:58 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_19-54-58_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 19:55:00 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 19:55:00 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 19:55:00 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 19:55:21 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 19:55:21 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 19:55:31 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 19:55:31 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 19:55:31 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 19:56:49 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 19:56:49 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 19:56:49 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_19-56-49_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 19:56:52 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 19:56:52 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 19:56:52 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 19:57:14 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 19:57:14 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 19:57:24 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 19:57:24 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 19:57:24 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:03:05 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:03:05 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:03:05 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-03-05_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:03:08 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:03:08 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 20:03:08 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:03:29 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 20:03:29 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:03:37 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:03:37 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 20:03:37 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:11:04 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:11:04 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:11:04 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-11-04_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:11:07 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:11:07 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 20:11:07 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:11:30 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 20:11:30 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:11:43 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:11:43 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 20:11:43 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:14:09 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:14:09 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:14:09 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-14-09_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:14:12 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:14:12 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 20:14:12 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:14:35 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 20:14:35 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:14:47 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:14:47 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 20:14:47 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:16:24 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:16:24 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:16:24 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-16-24_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:16:26 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:16:26 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 20:16:26 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:16:49 - INFO - __main__ - Model memory after loading model:Memory allocated: 6091.439453125 +Memory reserved: 7478.0 +2025-04-28 20:16:49 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:17:02 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:17:02 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1513.814453125 +Memory reserved: 1606.0 +2025-04-28 20:17:02 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:19:49 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:19:49 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:19:49 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-19-48_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:19:51 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:19:51 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 20:19:51 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:20:00 - INFO - __main__ - Model memory after loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:20:00 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:20:08 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:20:08 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:20:08 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:21:32 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 31126.0048828125 +Memory reserved: 36896.0 +2025-04-28 20:21:32 - INFO - __main__ - *** Starting training *** +2025-04-28 20:21:32 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-2): 3 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 20:32:21 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:32:21 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:32:21 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-32-21_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:32:24 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:32:24 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_k_experts.json: {'model.layers.1.mlp': [45, 51, 44], 'model.layers.2.mlp': [25, 18, 27], 'model.layers.3.mlp': [54, 28, 25], 'model.layers.4.mlp': [11, 21, 49], 'model.layers.5.mlp': [35, 54, 20], 'model.layers.6.mlp': [45, 22, 1], 'model.layers.7.mlp': [58, 24, 43], 'model.layers.8.mlp': [47, 39, 54], 'model.layers.9.mlp': [31, 22, 32], 'model.layers.10.mlp': [22, 47, 42], 'model.layers.11.mlp': [11, 17, 29], 'model.layers.12.mlp': [4, 3, 59], 'model.layers.13.mlp': [17, 10, 47], 'model.layers.14.mlp': [51, 7, 27], 'model.layers.15.mlp': [24, 14, 17], 'model.layers.16.mlp': [61, 33, 19], 'model.layers.17.mlp': [32, 29, 26], 'model.layers.18.mlp': [56, 5, 2], 'model.layers.19.mlp': [24, 36, 40], 'model.layers.20.mlp': [1, 56, 38], 'model.layers.21.mlp': [19, 5, 28], 'model.layers.22.mlp': [32, 14, 58], 'model.layers.23.mlp': [20, 58, 0], 'model.layers.24.mlp': [7, 63, 47], 'model.layers.25.mlp': [45, 39, 46], 'model.layers.26.mlp': [6, 46, 49]} +2025-04-28 20:32:24 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:32:33 - INFO - __main__ - Model memory after loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:32:33 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:32:40 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:32:40 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:32:40 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:43:00 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:43:00 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:43:00 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-42-59_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:43:02 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:43:02 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 20:43:02 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:43:11 - INFO - __main__ - Model memory after loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:43:11 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:43:22 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:43:22 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:43:22 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 20:43:58 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 31126.0048828125 +Memory reserved: 36896.0 +2025-04-28 20:43:58 - INFO - __main__ - *** Starting training *** +2025-04-28 20:43:58 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 20:59:08 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 20:59:08 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 20:59:08 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_20-59-07_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 20:59:11 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 20:59:11 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 20:59:11 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:59:21 - INFO - __main__ - Model memory after loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:59:21 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 20:59:34 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 20:59:34 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 20:59:34 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 21:00:15 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 31126.0048828125 +Memory reserved: 36896.0 +2025-04-28 21:00:15 - INFO - __main__ - *** Starting training *** +2025-04-28 21:00:15 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 21:01:11 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 21:01:11 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 21:01:11 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=1, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_21-01-11_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=16, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 21:01:14 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 21:01:14 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 21:01:14 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 21:01:23 - INFO - __main__ - Model memory after loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 21:01:23 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 21:01:38 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 21:01:38 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 21:01:38 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 21:37:08 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 21:37:08 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 21:37:08 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=2, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_21-37-07_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=8, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 21:37:10 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 21:37:10 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 21:37:10 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 21:37:34 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-28 21:37:34 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 21:37:46 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 21:37:46 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-28 21:37:46 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 21:38:18 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-28 21:38:18 - INFO - __main__ - *** Starting training *** +2025-04-28 21:38:18 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 21:51:17 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 21:51:17 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 21:51:17 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=2, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_21-51-16_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=8, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 21:51:19 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 21:51:19 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 21:51:19 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 21:51:42 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-28 21:51:42 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 21:51:57 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 21:51:57 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-28 21:51:57 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 21:52:31 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-28 21:52:31 - INFO - __main__ - *** Starting training *** +2025-04-28 21:52:31 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 21:54:43 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 21:54:43 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 21:54:43 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=2, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_21-54-43_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=8, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 21:54:46 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 21:54:46 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 21:54:46 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 21:55:10 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-28 21:55:10 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 21:55:21 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 21:55:21 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-28 21:55:21 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 21:55:59 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-28 21:55:59 - INFO - __main__ - *** Starting training *** +2025-04-28 21:55:59 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 22:27:51 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 22:27:51 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 22:27:51 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=2, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_22-27-50_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=4096, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=3, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=8, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 22:27:53 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 22:27:53 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 22:27:53 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 22:28:17 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-28 22:28:17 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 22:28:33 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 22:28:34 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-28 22:28:34 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 22:29:07 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-28 22:29:07 - INFO - __main__ - *** Starting training *** +2025-04-28 22:29:07 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-28 22:54:15 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-28 22:54:15 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-28 22:54:15 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr28_22-54-14_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-28 22:54:17 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-28 22:54:17 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-28 22:54:17 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-28 22:54:40 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-28 22:54:40 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-28 22:54:53 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-28 22:54:53 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-28 22:54:53 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-28 22:55:26 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-28 22:55:26 - INFO - __main__ - *** Starting training *** +2025-04-28 22:55:26 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 03:25:00 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 03:25:00 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 03:25:00 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_03-24-59_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 03:25:05 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 03:25:05 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 03:25:05 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 03:25:30 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 03:25:30 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 03:25:43 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 03:25:43 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 03:25:43 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 03:26:21 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 03:26:21 - INFO - __main__ - *** Starting training *** +2025-04-29 03:26:21 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 03:31:03 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 03:31:03 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 03:31:03 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_03-31-03_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 03:31:06 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 03:31:06 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 03:31:06 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 03:31:30 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 03:31:30 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 03:31:42 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 03:31:42 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 03:31:42 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 03:32:18 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 03:32:18 - INFO - __main__ - *** Starting training *** +2025-04-29 03:32:18 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 03:41:55 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 03:41:55 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 03:41:55 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_03-41-55_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 03:41:59 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 03:41:59 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 03:41:59 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 03:42:22 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 03:42:22 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 03:42:37 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 03:42:37 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 03:42:37 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 03:43:11 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 03:43:11 - INFO - __main__ - *** Starting training *** +2025-04-29 03:43:11 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 03:46:01 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 03:46:01 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 03:46:01 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_03-46-01_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 03:46:04 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 03:46:04 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 03:46:04 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 03:46:27 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 03:46:27 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 03:46:39 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 03:46:39 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 03:46:39 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 03:47:15 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 03:47:15 - INFO - __main__ - *** Starting training *** +2025-04-29 03:47:15 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 04:02:33 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 04:02:33 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 04:02:33 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_04-02-33_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 04:02:37 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 04:02:37 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 04:02:37 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 04:03:01 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 04:03:01 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 04:03:16 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 04:03:16 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 04:03:16 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 04:03:50 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 04:03:50 - INFO - __main__ - *** Starting training *** +2025-04-29 04:03:50 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 04:11:21 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 04:11:21 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 04:11:21 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_04-11-21_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 04:11:24 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 04:11:24 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 04:11:24 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 04:11:49 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 04:11:49 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 04:12:01 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 04:12:01 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 04:12:01 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 04:12:34 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 04:12:34 - INFO - __main__ - *** Starting training *** +2025-04-29 04:12:34 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 04:22:00 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 04:22:00 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 04:22:00 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_04-21-59_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 04:22:02 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 04:22:02 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 04:22:02 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 04:22:26 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 04:22:26 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 04:22:39 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 04:22:39 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 04:22:39 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 04:23:11 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 04:23:11 - INFO - __main__ - *** Starting training *** +2025-04-29 04:23:11 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 04:37:22 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 04:37:22 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 04:37:22 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_04-37-21_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 04:37:24 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 04:37:24 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 04:37:24 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 04:37:48 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 04:37:48 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 04:38:00 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 04:38:00 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 04:38:00 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 04:38:33 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 04:38:33 - INFO - __main__ - *** Starting training *** +2025-04-29 04:38:33 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 04:59:19 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 04:59:19 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 04:59:19 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_04-59-19_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 04:59:22 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 04:59:22 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 04:59:22 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 04:59:46 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 04:59:46 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 04:59:57 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 04:59:57 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 04:59:57 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 05:00:35 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 05:00:35 - INFO - __main__ - *** Starting training *** +2025-04-29 05:00:35 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 05:16:27 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:16:27 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:16:27 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-16-27_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:16:30 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 05:16:30 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 05:16:30 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 05:16:53 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 05:16:53 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 05:17:07 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 05:17:07 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 05:17:07 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 05:17:38 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 05:17:38 - INFO - __main__ - *** Starting training *** +2025-04-29 05:17:38 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 05:23:27 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:23:27 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:23:27 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-23-26_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:23:30 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 05:23:30 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 05:23:30 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 05:23:53 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 05:23:53 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 05:24:06 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 05:24:06 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 05:24:06 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 05:24:39 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 05:24:39 - INFO - __main__ - *** Starting training *** +2025-04-29 05:24:39 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 05:28:53 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:28:53 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:28:53 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-28-52_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:28:56 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 05:28:56 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 05:28:56 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 05:29:20 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 05:29:20 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 05:29:35 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 05:29:35 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 05:29:35 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 05:30:07 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 05:30:07 - INFO - __main__ - *** Starting training *** +2025-04-29 05:30:07 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 05:47:47 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:47:47 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:47:47 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-47-47_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:48:25 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:48:25 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:48:25 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-48-25_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:53:47 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:53:47 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:53:47 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-53-46_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:55:17 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:55:17 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:55:17 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-55-17_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:57:47 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:57:47 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:57:47 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-57-46_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 05:58:17 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 05:58:17 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 05:58:17 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 05:58:40 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 05:58:40 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 05:58:56 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 05:58:56 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 05:58:56 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 05:59:40 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 05:59:40 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 05:59:40 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_05-59-40_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 06:01:28 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 06:01:28 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 06:01:28 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_06-01-27_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 06:02:37 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 06:02:37 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 06:02:37 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_06-02-37_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 06:02:40 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 06:02:40 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 06:02:40 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 06:03:06 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 06:03:06 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 06:03:23 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 06:03:23 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 06:03:23 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 06:11:40 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 06:11:40 - INFO - __main__ - *** Starting training *** +2025-04-29 06:11:40 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 06:17:20 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 06:17:20 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 06:17:20 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_06-17-20_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 06:17:23 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 06:17:23 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 06:17:23 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 06:17:49 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 06:17:49 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 06:18:01 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 06:18:01 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 06:18:01 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 06:18:29 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 06:18:29 - INFO - __main__ - *** Starting training *** +2025-04-29 06:18:29 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 06:49:56 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 06:49:56 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 06:49:56 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_06-49-55_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=8192, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 06:49:58 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 06:49:58 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 06:49:58 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 06:50:28 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 06:50:28 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 06:50:39 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 06:50:39 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 06:50:39 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 06:57:31 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 06:57:31 - INFO - __main__ - *** Starting training *** +2025-04-29 06:57:31 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 15:49:10 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 15:49:10 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 15:49:10 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_15-49-10_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=7000, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 15:49:12 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 15:49:12 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 15:49:12 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 15:49:38 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 15:49:38 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 15:49:50 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 15:49:50 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 15:49:50 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 15:51:21 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 15:51:21 - INFO - __main__ - *** Starting training *** +2025-04-29 15:51:21 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 17:02:13 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 17:02:13 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 17:02:13 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_17-02-13_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=7400, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 17:02:16 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 17:02:16 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 17:02:16 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 17:05:26 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 17:05:26 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 17:05:39 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 17:05:39 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 17:05:39 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 17:10:12 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 17:10:12 - INFO - __main__ - *** Starting training *** +2025-04-29 17:10:12 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-29 17:11:23 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-29 17:11:23 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-29 17:11:23 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr29_17-11-22_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=7400, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=100, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-29 17:11:26 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-29 17:11:26 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-29 17:11:26 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-29 17:11:49 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-29 17:11:49 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-29 17:12:02 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-29 17:12:02 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-29 17:12:02 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-29 17:12:28 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-29 17:12:28 - INFO - __main__ - *** Starting training *** +2025-04-29 17:12:28 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-30 18:12:23 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-30 18:12:23 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='lmms-lab/Math10K', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-30 18:12:23 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr30_18-12-22_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=7000, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=20, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-30 18:18:44 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-30 18:18:44 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-30 18:18:44 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr30_18-18-43_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=7000, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=20, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-30 18:18:46 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-30 18:18:46 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-30 18:18:46 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-30 18:19:11 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-30 18:19:11 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-30 18:19:23 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-30 18:19:23 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-30 18:19:23 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-30 18:19:51 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-30 18:19:51 - INFO - __main__ - *** Starting training *** +2025-04-30 18:19:51 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + ) + (norm): DeepseekV2RMSNorm() + ) + (lm_head): Linear(in_features=2048, out_features=102400, bias=False) +) +2025-04-30 18:54:19 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False) +2025-04-30 18:54:19 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False) +2025-04-30 18:54:19 - INFO - __main__ - Training parameters EfficientDistillationConfig( +_n_gpu=1, +accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False}, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +average_tokens_across_devices=False, +batch_eval_metrics=False, +benchmarks=[], +bf16=True, +bf16_full_eval=False, +callbacks=[], +chars_per_token=, +chat_template=None, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=0, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +dataloader_prefetch_factor=None, +dataset_batch_size=None, +dataset_kwargs=None, +dataset_num_proc=None, +dataset_text_field=text, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800000000, +debug=[], +deepspeed=None, +disable_dropout=True, +disable_tqdm=False, +dispatch_batches=None, +do_eval=True, +do_predict=False, +do_train=False, +eval_accumulation_steps=None, +eval_delay=0, +eval_do_concat_batches=True, +eval_on_start=False, +eval_packing=None, +eval_steps=None, +eval_strategy=IntervalStrategy.NO, +eval_use_gather_object=False, +evaluation_strategy=None, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=4, +gradient_checkpointing=False, +gradient_checkpointing_kwargs={'use_reentrant': False}, +greater_is_better=None, +group_by_length=False, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill, +hub_model_revision=main, +hub_private_repo=None, +hub_strategy=HubStrategy.EVERY_SAVE, +hub_token=, +ignore_data_skip=False, +include_for_metrics=[], +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=5e-05, +length_column_name=length, +lmbda=0.0, +load_best_model_at_end=False, +local_rank=0, +log_level=info, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill/runs/Apr30_18-54-18_q-h100, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1, +logging_strategy=IntervalStrategy.STEPS, +loss_type=forward_kl, +lr_scheduler_kwargs={'min_lr_rate': 0.1}, +lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR, +max_grad_norm=1.0, +max_length=7000, +max_new_tokens=1024, +max_seq_length=None, +max_steps=-1, +metric_for_best_model=None, +model_init_kwargs=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_of_sequences=None, +num_train_epochs=1, +optim=OptimizerNames.ADAMW_TORCH, +optim_args=None, +optim_target_modules=None, +output_dir=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +overwrite_hub_revision=False, +overwrite_output_dir=True, +packing=False, +past_index=-1, +per_device_eval_batch_size=16, +per_device_train_batch_size=4, +prediction_loss_only=False, +push_to_hub=True, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_revision=False, +push_to_hub_token=, +ray_scope=last, +reduction=sum, +remove_unused_columns=True, +report_to=['wandb'], +restore_callback_states_from_checkpoint=False, +resume_from_checkpoint=None, +run_name=data/DeepSeek-Coder-V2-Lite-Instruct/distill, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=20, +save_strategy=SaveStrategy.STEPS, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=None, +system_prompt=None, +teacher_model_init_kwargs=None, +teacher_model_name_or_path=None, +temperature=0.9, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torch_empty_cache_steps=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_liger=False, +use_liger_kernel=False, +use_mps_device=False, +wandb_entity=None, +wandb_project=None, +warmup_ratio=0.1, +warmup_steps=0, +weight_decay=0.0, +) +2025-04-30 18:54:21 - INFO - __main__ - *** Initializing model kwargs *** +2025-04-30 18:54:21 - INFO - __main__ - Loaded top k experts from data/DeepSeek-Coder-V2-Lite-Instruct/distill/top_6_experts.json: {'model.layers.1.mlp': [45, 51, 44, 61, 22, 14], 'model.layers.2.mlp': [25, 18, 27, 13, 23, 3], 'model.layers.3.mlp': [54, 28, 25, 41, 23, 57], 'model.layers.4.mlp': [11, 21, 49, 33, 14, 37], 'model.layers.5.mlp': [35, 54, 20, 9, 47, 52], 'model.layers.6.mlp': [45, 22, 1, 42, 47, 13], 'model.layers.7.mlp': [58, 24, 43, 62, 18, 44], 'model.layers.8.mlp': [47, 39, 54, 58, 30, 56], 'model.layers.9.mlp': [31, 22, 32, 13, 12, 24], 'model.layers.10.mlp': [22, 47, 42, 19, 2, 13], 'model.layers.11.mlp': [11, 17, 29, 10, 22, 59], 'model.layers.12.mlp': [4, 3, 59, 56, 5, 26], 'model.layers.13.mlp': [17, 10, 47, 14, 42, 58], 'model.layers.14.mlp': [51, 7, 27, 31, 61, 18], 'model.layers.15.mlp': [24, 14, 17, 55, 41, 5], 'model.layers.16.mlp': [61, 33, 19, 49, 9, 63], 'model.layers.17.mlp': [32, 29, 26, 43, 0, 27], 'model.layers.18.mlp': [56, 5, 2, 36, 1, 42], 'model.layers.19.mlp': [24, 36, 40, 0, 23, 2], 'model.layers.20.mlp': [1, 56, 38, 48, 58, 20], 'model.layers.21.mlp': [19, 5, 28, 15, 13, 10], 'model.layers.22.mlp': [32, 14, 58, 31, 3, 45], 'model.layers.23.mlp': [20, 58, 0, 42, 33, 45], 'model.layers.24.mlp': [7, 63, 47, 42, 10, 62], 'model.layers.25.mlp': [45, 39, 46, 11, 38, 48], 'model.layers.26.mlp': [6, 46, 49, 13, 57, 11]} +2025-04-30 18:54:21 - INFO - __main__ - Model memory before loading model:Memory allocated: 0.0 +Memory reserved: 0.0 +2025-04-30 18:54:45 - INFO - __main__ - Model memory after loading model:Memory allocated: 4836.39697265625 +Memory reserved: 7322.0 +2025-04-30 18:54:45 - INFO - __main__ - Replacing MoE layers with dense layers using selected experts... +2025-04-30 18:54:57 - INFO - __main__ - MoE layers replaced with Dense MLP layers +2025-04-30 18:54:57 - INFO - __main__ - Model memory after replacing MoE with dense:Memory allocated: 1404.39697265625 +Memory reserved: 1526.0 +2025-04-30 18:54:57 - INFO - __main__ - Initializing EfficientDistillationTrainer... +2025-04-30 18:55:23 - INFO - __main__ - Model memory after trainer initialization:Memory allocated: 6238.91943359375 +Memory reserved: 7812.0 +2025-04-30 18:55:23 - INFO - __main__ - *** Starting training *** +2025-04-30 18:55:23 - INFO - __main__ - Model architecture: DeepseekV2ForCausalLM( + (model): DeepseekV2Model( + (embed_tokens): Embedding(102400, 2048) + (layers): ModuleList( + (0): DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=10944, bias=False) + (up_proj): Linear(in_features=2048, out_features=10944, bias=False) + (down_proj): Linear(in_features=10944, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): DeepseekV2RMSNorm() + (post_attention_layernorm): DeepseekV2RMSNorm() + ) + (1-26): 26 x DeepseekV2DecoderLayer( + (self_attn): DeepseekV2FlashAttention2( + (q_proj): Linear(in_features=2048, out_features=3072, bias=False) + (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False) + (kv_a_layernorm): DeepseekV2RMSNorm() + (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (rotary_emb): DeepseekV2YarnRotaryEmbedding() + ) + (mlp): DeepseekV2MoE( + (gate): MoEGate() + (shared_experts): DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=2816, bias=False) + (up_proj): Linear(in_features=2048, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=2048, bias=False) + (act_fn): SiLU() + ) + (selected_experts): ModuleList( + (0-5): 6 x DeepseekV2MLP( + (gate_proj): Linear(in_features=2048, out_features=1408, bias=False) + (up_proj): Linear(in_features=2048, out_features=1408, bias=False) + (down_proj): Linear(in_features=1408, out_features=2048, bias=False) + (act_fn): SiLU() + ) + ) + (experts): ModuleList() ) (input_layernorm): DeepseekV2RMSNorm() (post_attention_layernorm): DeepseekV2RMSNorm() @@ -117,129 +28089,3 @@ Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; ) (lm_head): Linear(in_features=2048, out_features=102400, bias=False) ) -Parameter Offload: Total persistent parameters: 126464 in 82 params -wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. -wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. -wandb: Currently logged in as: hector_ (hector_-carnegie-mellon-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin -wandb: Tracking run with wandb version 0.19.8 -wandb: Run data is saved locally in /ocean/projects/cis240137p/hhe4/deepseek/open-r1/wandb/run-20250412_002043-nwpje7dw -wandb: Run `wandb offline` to turn off syncing. -wandb: Syncing run data/DeepSeek-Coder-V2-Lite-Instruct -wandb: ⭐️ View project at https://wandb.ai/hector_-carnegie-mellon-university/huggingface -wandb: 🚀 View run at https://wandb.ai/hector_-carnegie-mellon-university/huggingface/runs/nwpje7dw - 0%| | 0/54042 [00:00 - main(script_args, training_args, model_args) - File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 188, in main - train_result = trainer.train(resume_from_checkpoint=checkpoint) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2232, in train - return inner_training_loop( - ^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop - tr_loss_step = self.training_step(model, inputs, num_items_in_batch) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3698, in training_step - loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 469, in compute_loss - (loss, outputs) = super().compute_loss( - ^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3759, in compute_loss - outputs = model(**inputs) - ^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl - return forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn - ret_val = func(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1899, in forward - loss = self.module(*inputs, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl - return inner() - ^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner - result = forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/ocean/projects/cis240137p/hhe4/hf_cache/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1702, in forward - loss = loss_fct(shift_logits, shift_labels) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl - return self._call_impl(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl - return forward_call(*args, **kwargs) - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1293, in forward - return F.cross_entropy( - ^^^^^^^^^^^^^^^^ - File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/functional.py", line 3479, in cross_entropy - return torch._C._nn.cross_entropy_loss( - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.24 GiB. GPU 0 has a total capacity of 79.21 GiB of which 604.75 MiB is free. Including non-PyTorch memory, this process has 78.61 GiB memory in use. Of the allocated memory 73.91 GiB is allocated by PyTorch, and 3.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) -[rank0]: Traceback (most recent call last): -[rank0]: File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 234, in -[rank0]: main(script_args, training_args, model_args) -[rank0]: File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 188, in main -[rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2232, in train -[rank0]: return inner_training_loop( -[rank0]: ^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop -[rank0]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3698, in training_step -[rank0]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 469, in compute_loss -[rank0]: (loss, outputs) = super().compute_loss( -[rank0]: ^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3759, in compute_loss -[rank0]: outputs = model(**inputs) -[rank0]: ^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl -[rank0]: return self._call_impl(*args, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl -[rank0]: return forward_call(*args, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn -[rank0]: ret_val = func(*args, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1899, in forward -[rank0]: loss = self.module(*inputs, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl -[rank0]: return self._call_impl(*args, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl -[rank0]: return inner() -[rank0]: ^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner -[rank0]: result = forward_call(*args, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/ocean/projects/cis240137p/hhe4/hf_cache/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1702, in forward -[rank0]: loss = loss_fct(shift_logits, shift_labels) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl -[rank0]: return self._call_impl(*args, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl -[rank0]: return forward_call(*args, **kwargs) -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1293, in forward -[rank0]: return F.cross_entropy( -[rank0]: ^^^^^^^^^^^^^^^^ -[rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/functional.py", line 3479, in cross_entropy -[rank0]: return torch._C._nn.cross_entropy_loss( -[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -[rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.24 GiB. GPU 0 has a total capacity of 79.21 GiB of which 604.75 MiB is free. Including non-PyTorch memory, this process has 78.61 GiB memory in use. Of the allocated memory 73.91 GiB is allocated by PyTorch, and 3.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)