HectorHe commited on
Commit
43dbce4
·
verified ·
1 Parent(s): 2e5b970

Training in progress, step 200

Browse files
config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
3
+ "architectures": [
4
+ "DeepseekV2ForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct--configuration_deepseek.DeepseekV2Config",
10
+ "AutoModel": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct--modeling_deepseek.DeepseekV2Model",
11
+ "AutoModelForCausalLM": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct--modeling_deepseek.DeepseekV2ForCausalLM"
12
+ },
13
+ "aux_loss_alpha": 0.001,
14
+ "bos_token_id": 100000,
15
+ "eos_token_id": 100001,
16
+ "ep_size": 1,
17
+ "first_k_dense_replace": 1,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 2048,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 10944,
22
+ "kv_lora_rank": 512,
23
+ "max_position_embeddings": 163840,
24
+ "model_type": "deepseek_v2",
25
+ "moe_intermediate_size": 1408,
26
+ "moe_layer_freq": 1,
27
+ "n_group": 1,
28
+ "n_routed_experts": 64,
29
+ "n_shared_experts": 2,
30
+ "norm_topk_prob": false,
31
+ "num_attention_heads": 16,
32
+ "num_experts_per_tok": 6,
33
+ "num_hidden_layers": 27,
34
+ "num_key_value_heads": 16,
35
+ "pretraining_tp": 1,
36
+ "q_lora_rank": null,
37
+ "qk_nope_head_dim": 128,
38
+ "qk_rope_head_dim": 64,
39
+ "rms_norm_eps": 1e-06,
40
+ "rope_scaling": {
41
+ "beta_fast": 32,
42
+ "beta_slow": 1,
43
+ "factor": 40,
44
+ "mscale": 0.707,
45
+ "mscale_all_dim": 0.707,
46
+ "original_max_position_embeddings": 4096,
47
+ "type": "yarn"
48
+ },
49
+ "rope_theta": 10000,
50
+ "routed_scaling_factor": 1.0,
51
+ "scoring_func": "softmax",
52
+ "seq_aux": true,
53
+ "tie_word_embeddings": false,
54
+ "topk_group": 1,
55
+ "topk_method": "greedy",
56
+ "torch_dtype": "bfloat16",
57
+ "transformers_version": "4.49.0",
58
+ "use_cache": false,
59
+ "v_head_dim": 128,
60
+ "vocab_size": 102400
61
+ }
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bd11ec189c4fc2d3e263e6b08d22d56477f714a5feaec0a2814c2155e479c62
3
+ size 4994763632
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b964946be8fcd9a6048863a608e3161e168df8645afd86b718fabd507cb0047
3
+ size 4995044944
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194dce20b66593687045682a27dbb75f4a330aacdfe269e87f7db23f8bb11dc1
3
+ size 4996085000
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85ff732f5434755d27030f035c13162746f8946d61cb85f32a3dc00392eb60a5
3
+ size 4996085224
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49d63a031694e5948296dcdc59ee44fde4bda82cc233acd93c0fc6748ab2cbba
3
+ size 4996085224
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7e49f85b04cf6f833538c0e6269854e282a2f61bae173221e335a0661f24100
3
+ size 4995045792
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ad418e1a2daffee0912b482509aacc0f7df237efbd6942e00c3ae1e183cc219
3
+ size 1440515736
model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|end▁of▁sentence|>"
17
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "100000": {
7
+ "content": "<|begin▁of▁sentence|>",
8
+ "lstrip": false,
9
+ "normalized": true,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "100001": {
15
+ "content": "<|end▁of▁sentence|>",
16
+ "lstrip": false,
17
+ "normalized": true,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "100002": {
23
+ "content": "<|fim▁hole|>",
24
+ "lstrip": false,
25
+ "normalized": true,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "100003": {
31
+ "content": "<|fim▁begin|>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": false
37
+ },
38
+ "100004": {
39
+ "content": "<|fim▁end|>",
40
+ "lstrip": false,
41
+ "normalized": true,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "100005": {
47
+ "content": "<|completion|>",
48
+ "lstrip": false,
49
+ "normalized": true,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "100006": {
55
+ "content": "<|User|>",
56
+ "lstrip": false,
57
+ "normalized": true,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "100007": {
63
+ "content": "<|Assistant|>",
64
+ "lstrip": false,
65
+ "normalized": true,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": false
69
+ },
70
+ "100008": {
71
+ "content": "<|EOT|>",
72
+ "lstrip": false,
73
+ "normalized": true,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "100009": {
79
+ "content": "<|tool▁calls▁begin|>",
80
+ "lstrip": false,
81
+ "normalized": true,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": false
85
+ },
86
+ "100010": {
87
+ "content": "<|tool▁calls▁end|>",
88
+ "lstrip": false,
89
+ "normalized": true,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": false
93
+ },
94
+ "100011": {
95
+ "content": "<|tool▁call▁begin|>",
96
+ "lstrip": false,
97
+ "normalized": true,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": false
101
+ },
102
+ "100012": {
103
+ "content": "<|tool▁call▁end|>",
104
+ "lstrip": false,
105
+ "normalized": true,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": false
109
+ },
110
+ "100013": {
111
+ "content": "<|tool▁outputs▁begin|>",
112
+ "lstrip": false,
113
+ "normalized": true,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": false
117
+ },
118
+ "100014": {
119
+ "content": "<|tool▁outputs▁end|>",
120
+ "lstrip": false,
121
+ "normalized": true,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "100015": {
127
+ "content": "<|tool▁output▁begin|>",
128
+ "lstrip": false,
129
+ "normalized": true,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "100016": {
135
+ "content": "<|tool▁output▁end|>",
136
+ "lstrip": false,
137
+ "normalized": true,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "100017": {
143
+ "content": "<|tool▁sep|>",
144
+ "lstrip": false,
145
+ "normalized": true,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ }
150
+ },
151
+ "bos_token": "<|begin▁of▁sentence|>",
152
+ "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
153
+ "clean_up_tokenization_spaces": false,
154
+ "eos_token": "<|end▁of▁sentence|>",
155
+ "extra_special_tokens": {},
156
+ "legacy": true,
157
+ "model_max_length": 16384,
158
+ "pad_token": "<|end▁of▁sentence|>",
159
+ "sp_model_kwargs": {},
160
+ "tokenizer_class": "LlamaTokenizerFast",
161
+ "unk_token": null,
162
+ "use_default_system_prompt": false
163
+ }
training.log ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/54042 [00:00<?, ?it/s]Traceback (most recent call last):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ [rank1]: Traceback (most recent call last):
4
+ [rank1]: File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 234, in <module>
5
+ [rank1]: main(script_args, training_args, model_args)
6
+ [rank1]: File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 188, in main
7
+ [rank1]: train_result = trainer.train(resume_from_checkpoint=checkpoint)
8
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
9
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2232, in train
10
+ [rank1]: return inner_training_loop(
11
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^
12
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop
13
+ [rank1]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
14
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
15
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3698, in training_step
16
+ [rank1]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
17
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
18
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 469, in compute_loss
19
+ [rank1]: (loss, outputs) = super().compute_loss(
20
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
21
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3759, in compute_loss
22
+ [rank1]: outputs = model(**inputs)
23
+ [rank1]: ^^^^^^^^^^^^^^^
24
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
25
+ [rank1]: return self._call_impl(*args, **kwargs)
26
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
27
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
28
+ [rank1]: return forward_call(*args, **kwargs)
29
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
30
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
31
+ [rank1]: ret_val = func(*args, **kwargs)
32
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^
33
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1899, in forward
34
+ [rank1]: loss = self.module(*inputs, **kwargs)
35
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
36
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
37
+ [rank1]: return self._call_impl(*args, **kwargs)
38
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
39
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
40
+ [rank1]: return inner()
41
+ [rank1]: ^^^^^^^
42
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner
43
+ [rank1]: result = forward_call(*args, **kwargs)
44
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
45
+ [rank1]: File "/ocean/projects/cis240137p/hhe4/hf_cache/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1702, in forward
46
+ [rank1]: loss = loss_fct(shift_logits, shift_labels)
47
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
48
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
49
+ [rank1]: return self._call_impl(*args, **kwargs)
50
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
51
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
52
+ [rank1]: return forward_call(*args, **kwargs)
53
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
54
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1293, in forward
55
+ [rank1]: return F.cross_entropy(
56
+ [rank1]: ^^^^^^^^^^^^^^^^
57
+ [rank1]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/functional.py", line 3479, in cross_entropy
58
+ [rank1]: return torch._C._nn.cross_entropy_loss(
59
+ [rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
60
+ [rank1]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.24 GiB. GPU 1 has a total capacity of 79.21 GiB of which 604.75 MiB is free. Including non-PyTorch memory, this process has 78.61 GiB memory in use. Of the allocated memory 73.91 GiB is allocated by PyTorch, and 3.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
61
+ king train dataset: 2%|▏ | 2000/93733 [00:38<20:32, 74.46 examples/s]
62
+ Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
63
+ 2025-04-12 00:20:30 - INFO - __main__ - *** Train ***
64
+ 2025-04-12 00:20:30 - INFO - __main__ - DeepseekV2ForCausalLM(
65
+ (model): DeepseekV2Model(
66
+ (embed_tokens): Embedding(102400, 2048)
67
+ (layers): ModuleList(
68
+ (0): DeepseekV2DecoderLayer(
69
+ (self_attn): DeepseekV2FlashAttention2(
70
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
71
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
72
+ (kv_a_layernorm): DeepseekV2RMSNorm()
73
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
74
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
75
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
76
+ )
77
+ (mlp): DeepseekV2MLP(
78
+ (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
79
+ (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
80
+ (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
81
+ (act_fn): SiLU()
82
+ )
83
+ (input_layernorm): DeepseekV2RMSNorm()
84
+ (post_attention_layernorm): DeepseekV2RMSNorm()
85
+ )
86
+ (1-26): 26 x DeepseekV2DecoderLayer(
87
+ (self_attn): DeepseekV2FlashAttention2(
88
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
89
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
90
+ (kv_a_layernorm): DeepseekV2RMSNorm()
91
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
92
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
93
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
94
+ )
95
+ (mlp): DeepseekV2MoE(
96
+ (experts): ModuleList(
97
+ (0-63): 64 x DeepseekV2MLP(
98
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
99
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
100
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
101
+ (act_fn): SiLU()
102
+ )
103
+ )
104
+ (gate): MoEGate()
105
+ (shared_experts): DeepseekV2MLP(
106
+ (gate_proj): Linear(in_features=2048, out_features=2816, bias=False)
107
+ (up_proj): Linear(in_features=2048, out_features=2816, bias=False)
108
+ (down_proj): Linear(in_features=2816, out_features=2048, bias=False)
109
+ (act_fn): SiLU()
110
+ )
111
+ )
112
+ (input_layernorm): DeepseekV2RMSNorm()
113
+ (post_attention_layernorm): DeepseekV2RMSNorm()
114
+ )
115
+ )
116
+ (norm): DeepseekV2RMSNorm()
117
+ )
118
+ (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
119
+ )
120
+ Parameter Offload: Total persistent parameters: 126464 in 82 params
121
+ wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
122
+ wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
123
+ wandb: Currently logged in as: hector_ (hector_-carnegie-mellon-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
124
+ wandb: Tracking run with wandb version 0.19.8
125
+ wandb: Run data is saved locally in /ocean/projects/cis240137p/hhe4/deepseek/open-r1/wandb/run-20250412_002043-nwpje7dw
126
+ wandb: Run `wandb offline` to turn off syncing.
127
+ wandb: Syncing run data/DeepSeek-Coder-V2-Lite-Instruct
128
+ wandb: ⭐️ View project at https://wandb.ai/hector_-carnegie-mellon-university/huggingface
129
+ wandb: 🚀 View run at https://wandb.ai/hector_-carnegie-mellon-university/huggingface/runs/nwpje7dw
130
+
131
  0%| | 0/54042 [00:00<?, ?it/s]Traceback (most recent call last):
132
+ File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 234, in <module>
133
+ main(script_args, training_args, model_args)
134
+ File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 188, in main
135
+ train_result = trainer.train(resume_from_checkpoint=checkpoint)
136
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
137
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2232, in train
138
+ return inner_training_loop(
139
+ ^^^^^^^^^^^^^^^^^^^^
140
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop
141
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
142
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
143
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3698, in training_step
144
+ loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
145
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
146
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 469, in compute_loss
147
+ (loss, outputs) = super().compute_loss(
148
+ ^^^^^^^^^^^^^^^^^^^^^
149
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3759, in compute_loss
150
+ outputs = model(**inputs)
151
+ ^^^^^^^^^^^^^^^
152
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
153
+ return self._call_impl(*args, **kwargs)
154
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
155
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
156
+ return forward_call(*args, **kwargs)
157
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
158
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
159
+ ret_val = func(*args, **kwargs)
160
+ ^^^^^^^^^^^^^^^^^^^^^
161
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1899, in forward
162
+ loss = self.module(*inputs, **kwargs)
163
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
164
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
165
+ return self._call_impl(*args, **kwargs)
166
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
167
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
168
+ return inner()
169
+ ^^^^^^^
170
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner
171
+ result = forward_call(*args, **kwargs)
172
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
173
+ File "/ocean/projects/cis240137p/hhe4/hf_cache/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1702, in forward
174
+ loss = loss_fct(shift_logits, shift_labels)
175
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
176
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
177
+ return self._call_impl(*args, **kwargs)
178
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
179
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
180
+ return forward_call(*args, **kwargs)
181
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
182
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1293, in forward
183
+ return F.cross_entropy(
184
+ ^^^^^^^^^^^^^^^^
185
+ File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/functional.py", line 3479, in cross_entropy
186
+ return torch._C._nn.cross_entropy_loss(
187
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
188
+ torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.24 GiB. GPU 0 has a total capacity of 79.21 GiB of which 604.75 MiB is free. Including non-PyTorch memory, this process has 78.61 GiB memory in use. Of the allocated memory 73.91 GiB is allocated by PyTorch, and 3.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
189
+ [rank0]: Traceback (most recent call last):
190
+ [rank0]: File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 234, in <module>
191
+ [rank0]: main(script_args, training_args, model_args)
192
+ [rank0]: File "/ocean/projects/cis240137p/hhe4/deepseek/open-r1/src/open_r1/sft.py", line 188, in main
193
+ [rank0]: train_result = trainer.train(resume_from_checkpoint=checkpoint)
194
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
195
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2232, in train
196
+ [rank0]: return inner_training_loop(
197
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^
198
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 2548, in _inner_training_loop
199
+ [rank0]: tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
200
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
201
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3698, in training_step
202
+ [rank0]: loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
203
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
204
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/trl/trainer/sft_trainer.py", line 469, in compute_loss
205
+ [rank0]: (loss, outputs) = super().compute_loss(
206
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
207
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/transformers/trainer.py", line 3759, in compute_loss
208
+ [rank0]: outputs = model(**inputs)
209
+ [rank0]: ^^^^^^^^^^^^^^^
210
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
211
+ [rank0]: return self._call_impl(*args, **kwargs)
212
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
213
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
214
+ [rank0]: return forward_call(*args, **kwargs)
215
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
216
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
217
+ [rank0]: ret_val = func(*args, **kwargs)
218
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^
219
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1899, in forward
220
+ [rank0]: loss = self.module(*inputs, **kwargs)
221
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
222
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
223
+ [rank0]: return self._call_impl(*args, **kwargs)
224
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
225
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
226
+ [rank0]: return inner()
227
+ [rank0]: ^^^^^^^
228
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1790, in inner
229
+ [rank0]: result = forward_call(*args, **kwargs)
230
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
231
+ [rank0]: File "/ocean/projects/cis240137p/hhe4/hf_cache/modules/transformers_modules/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct/e434a23f91ba5b4923cf6c9d9a238eb4a08e3a11/modeling_deepseek.py", line 1702, in forward
232
+ [rank0]: loss = loss_fct(shift_logits, shift_labels)
233
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
234
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
235
+ [rank0]: return self._call_impl(*args, **kwargs)
236
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
237
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
238
+ [rank0]: return forward_call(*args, **kwargs)
239
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
240
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/modules/loss.py", line 1293, in forward
241
+ [rank0]: return F.cross_entropy(
242
+ [rank0]: ^^^^^^^^^^^^^^^^
243
+ [rank0]: File "/jet/home/hhe4/deepseek/open-r1/openr1/lib/python3.11/site-packages/torch/nn/functional.py", line 3479, in cross_entropy
244
+ [rank0]: return torch._C._nn.cross_entropy_loss(
245
+ [rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
246
+ [rank0]: torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.24 GiB. GPU 0 has a total capacity of 79.21 GiB of which 604.75 MiB is free. Including non-PyTorch memory, this process has 78.61 GiB memory in use. Of the allocated memory 73.91 GiB is allocated by PyTorch, and 3.09 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
training_4pus.log ADDED
Binary file (75.1 kB). View file
 
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:784c8ddede97b768ac7df72d26cc44295ab6c84ab394fde376d273efc9af5e4c
3
+ size 7544
training_backup.log ADDED
@@ -0,0 +1,525 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-03-31 16:45:54 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=False, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
2
+ 2025-03-31 16:45:54 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
3
+ 2025-03-31 16:45:54 - INFO - __main__ - Training parameters SFTConfig(
4
+ _n_gpu=1,
5
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
6
+ adafactor=False,
7
+ adam_beta1=0.9,
8
+ adam_beta2=0.999,
9
+ adam_epsilon=1e-08,
10
+ auto_find_batch_size=False,
11
+ average_tokens_across_devices=False,
12
+ batch_eval_metrics=False,
13
+ benchmarks=[],
14
+ bf16=True,
15
+ bf16_full_eval=False,
16
+ callbacks=[],
17
+ chars_per_token=<CHARS_PER_TOKEN>,
18
+ chat_template=None,
19
+ data_seed=None,
20
+ dataloader_drop_last=False,
21
+ dataloader_num_workers=0,
22
+ dataloader_persistent_workers=False,
23
+ dataloader_pin_memory=True,
24
+ dataloader_prefetch_factor=None,
25
+ dataset_batch_size=None,
26
+ dataset_kwargs=None,
27
+ dataset_num_proc=None,
28
+ dataset_text_field=text,
29
+ ddp_backend=None,
30
+ ddp_broadcast_buffers=None,
31
+ ddp_bucket_cap_mb=None,
32
+ ddp_find_unused_parameters=None,
33
+ ddp_timeout=1800,
34
+ debug=[],
35
+ deepspeed=None,
36
+ disable_tqdm=False,
37
+ dispatch_batches=None,
38
+ do_eval=True,
39
+ do_predict=False,
40
+ do_train=False,
41
+ eval_accumulation_steps=None,
42
+ eval_delay=0,
43
+ eval_do_concat_batches=True,
44
+ eval_on_start=False,
45
+ eval_packing=None,
46
+ eval_steps=None,
47
+ eval_strategy=IntervalStrategy.NO,
48
+ eval_use_gather_object=False,
49
+ evaluation_strategy=None,
50
+ fp16=False,
51
+ fp16_backend=auto,
52
+ fp16_full_eval=False,
53
+ fp16_opt_level=O1,
54
+ fsdp=[],
55
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
56
+ fsdp_min_num_params=0,
57
+ fsdp_transformer_layer_cls_to_wrap=None,
58
+ full_determinism=False,
59
+ gradient_accumulation_steps=1,
60
+ gradient_checkpointing=True,
61
+ gradient_checkpointing_kwargs={'use_reentrant': False},
62
+ greater_is_better=None,
63
+ group_by_length=False,
64
+ half_precision_backend=auto,
65
+ hub_always_push=False,
66
+ hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
67
+ hub_model_revision=main,
68
+ hub_private_repo=None,
69
+ hub_strategy=HubStrategy.EVERY_SAVE,
70
+ hub_token=<HUB_TOKEN>,
71
+ ignore_data_skip=False,
72
+ include_for_metrics=[],
73
+ include_inputs_for_metrics=False,
74
+ include_num_input_tokens_seen=False,
75
+ include_tokens_per_second=False,
76
+ jit_mode_eval=False,
77
+ label_names=None,
78
+ label_smoothing_factor=0.0,
79
+ learning_rate=5e-05,
80
+ length_column_name=length,
81
+ load_best_model_at_end=False,
82
+ local_rank=0,
83
+ log_level=info,
84
+ log_level_replica=warning,
85
+ log_on_each_node=True,
86
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/runs/Mar31_16-45-54_w002.ib.bridges2.psc.edu,
87
+ logging_first_step=False,
88
+ logging_nan_inf_filter=True,
89
+ logging_steps=1,
90
+ logging_strategy=IntervalStrategy.STEPS,
91
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
92
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
93
+ max_grad_norm=1.0,
94
+ max_length=32768,
95
+ max_seq_length=None,
96
+ max_steps=-1,
97
+ metric_for_best_model=None,
98
+ model_init_kwargs=None,
99
+ mp_parameters=,
100
+ neftune_noise_alpha=None,
101
+ no_cuda=False,
102
+ num_of_sequences=None,
103
+ num_train_epochs=5,
104
+ optim=OptimizerNames.ADAMW_TORCH,
105
+ optim_args=None,
106
+ optim_target_modules=None,
107
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct,
108
+ overwrite_hub_revision=False,
109
+ overwrite_output_dir=True,
110
+ packing=True,
111
+ past_index=-1,
112
+ per_device_eval_batch_size=16,
113
+ per_device_train_batch_size=16,
114
+ prediction_loss_only=False,
115
+ push_to_hub=True,
116
+ push_to_hub_model_id=None,
117
+ push_to_hub_organization=None,
118
+ push_to_hub_revision=False,
119
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
120
+ ray_scope=last,
121
+ remove_unused_columns=True,
122
+ report_to=['wandb'],
123
+ restore_callback_states_from_checkpoint=False,
124
+ resume_from_checkpoint=None,
125
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct,
126
+ save_on_each_node=False,
127
+ save_only_model=False,
128
+ save_safetensors=True,
129
+ save_steps=200,
130
+ save_strategy=SaveStrategy.STEPS,
131
+ save_total_limit=1,
132
+ seed=42,
133
+ skip_memory_metrics=True,
134
+ split_batches=None,
135
+ system_prompt=None,
136
+ tf32=None,
137
+ torch_compile=False,
138
+ torch_compile_backend=None,
139
+ torch_compile_mode=None,
140
+ torch_empty_cache_steps=None,
141
+ torchdynamo=None,
142
+ tpu_metrics_debug=False,
143
+ tpu_num_cores=None,
144
+ use_cpu=False,
145
+ use_ipex=False,
146
+ use_legacy_prediction_loop=False,
147
+ use_liger=True,
148
+ use_liger_kernel=False,
149
+ use_mps_device=False,
150
+ wandb_entity=None,
151
+ wandb_project=None,
152
+ warmup_ratio=0.1,
153
+ warmup_steps=0,
154
+ weight_decay=0.0,
155
+ )
156
+ 2025-03-31 16:45:57 - INFO - __main__ - *** Initializing model kwargs ***
157
+ 2025-03-31 16:48:46 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
158
+ 2025-03-31 16:48:46 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
159
+ 2025-03-31 16:48:46 - INFO - __main__ - Training parameters SFTConfig(
160
+ _n_gpu=1,
161
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
162
+ adafactor=False,
163
+ adam_beta1=0.9,
164
+ adam_beta2=0.999,
165
+ adam_epsilon=1e-08,
166
+ auto_find_batch_size=False,
167
+ average_tokens_across_devices=False,
168
+ batch_eval_metrics=False,
169
+ benchmarks=[],
170
+ bf16=True,
171
+ bf16_full_eval=False,
172
+ callbacks=[],
173
+ chars_per_token=<CHARS_PER_TOKEN>,
174
+ chat_template=None,
175
+ data_seed=None,
176
+ dataloader_drop_last=False,
177
+ dataloader_num_workers=0,
178
+ dataloader_persistent_workers=False,
179
+ dataloader_pin_memory=True,
180
+ dataloader_prefetch_factor=None,
181
+ dataset_batch_size=None,
182
+ dataset_kwargs=None,
183
+ dataset_num_proc=None,
184
+ dataset_text_field=text,
185
+ ddp_backend=None,
186
+ ddp_broadcast_buffers=None,
187
+ ddp_bucket_cap_mb=None,
188
+ ddp_find_unused_parameters=None,
189
+ ddp_timeout=1800,
190
+ debug=[],
191
+ deepspeed=None,
192
+ disable_tqdm=False,
193
+ dispatch_batches=None,
194
+ do_eval=True,
195
+ do_predict=False,
196
+ do_train=False,
197
+ eval_accumulation_steps=None,
198
+ eval_delay=0,
199
+ eval_do_concat_batches=True,
200
+ eval_on_start=False,
201
+ eval_packing=None,
202
+ eval_steps=None,
203
+ eval_strategy=IntervalStrategy.NO,
204
+ eval_use_gather_object=False,
205
+ evaluation_strategy=None,
206
+ fp16=False,
207
+ fp16_backend=auto,
208
+ fp16_full_eval=False,
209
+ fp16_opt_level=O1,
210
+ fsdp=[],
211
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
212
+ fsdp_min_num_params=0,
213
+ fsdp_transformer_layer_cls_to_wrap=None,
214
+ full_determinism=False,
215
+ gradient_accumulation_steps=1,
216
+ gradient_checkpointing=True,
217
+ gradient_checkpointing_kwargs={'use_reentrant': False},
218
+ greater_is_better=None,
219
+ group_by_length=False,
220
+ half_precision_backend=auto,
221
+ hub_always_push=False,
222
+ hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
223
+ hub_model_revision=main,
224
+ hub_private_repo=None,
225
+ hub_strategy=HubStrategy.EVERY_SAVE,
226
+ hub_token=<HUB_TOKEN>,
227
+ ignore_data_skip=False,
228
+ include_for_metrics=[],
229
+ include_inputs_for_metrics=False,
230
+ include_num_input_tokens_seen=False,
231
+ include_tokens_per_second=False,
232
+ jit_mode_eval=False,
233
+ label_names=None,
234
+ label_smoothing_factor=0.0,
235
+ learning_rate=5e-05,
236
+ length_column_name=length,
237
+ load_best_model_at_end=False,
238
+ local_rank=0,
239
+ log_level=info,
240
+ log_level_replica=warning,
241
+ log_on_each_node=True,
242
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/runs/Mar31_16-48-46_w002.ib.bridges2.psc.edu,
243
+ logging_first_step=False,
244
+ logging_nan_inf_filter=True,
245
+ logging_steps=1,
246
+ logging_strategy=IntervalStrategy.STEPS,
247
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
248
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
249
+ max_grad_norm=1.0,
250
+ max_length=32768,
251
+ max_seq_length=None,
252
+ max_steps=-1,
253
+ metric_for_best_model=None,
254
+ model_init_kwargs=None,
255
+ mp_parameters=,
256
+ neftune_noise_alpha=None,
257
+ no_cuda=False,
258
+ num_of_sequences=None,
259
+ num_train_epochs=5,
260
+ optim=OptimizerNames.ADAMW_TORCH,
261
+ optim_args=None,
262
+ optim_target_modules=None,
263
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct,
264
+ overwrite_hub_revision=False,
265
+ overwrite_output_dir=True,
266
+ packing=True,
267
+ past_index=-1,
268
+ per_device_eval_batch_size=16,
269
+ per_device_train_batch_size=16,
270
+ prediction_loss_only=False,
271
+ push_to_hub=True,
272
+ push_to_hub_model_id=None,
273
+ push_to_hub_organization=None,
274
+ push_to_hub_revision=False,
275
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
276
+ ray_scope=last,
277
+ remove_unused_columns=True,
278
+ report_to=['wandb'],
279
+ restore_callback_states_from_checkpoint=False,
280
+ resume_from_checkpoint=None,
281
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct,
282
+ save_on_each_node=False,
283
+ save_only_model=False,
284
+ save_safetensors=True,
285
+ save_steps=200,
286
+ save_strategy=SaveStrategy.STEPS,
287
+ save_total_limit=1,
288
+ seed=42,
289
+ skip_memory_metrics=True,
290
+ split_batches=None,
291
+ system_prompt=None,
292
+ tf32=None,
293
+ torch_compile=False,
294
+ torch_compile_backend=None,
295
+ torch_compile_mode=None,
296
+ torch_empty_cache_steps=None,
297
+ torchdynamo=None,
298
+ tpu_metrics_debug=False,
299
+ tpu_num_cores=None,
300
+ use_cpu=False,
301
+ use_ipex=False,
302
+ use_legacy_prediction_loop=False,
303
+ use_liger=True,
304
+ use_liger_kernel=False,
305
+ use_mps_device=False,
306
+ wandb_entity=None,
307
+ wandb_project=None,
308
+ warmup_ratio=0.1,
309
+ warmup_steps=0,
310
+ weight_decay=0.0,
311
+ )
312
+ 2025-03-31 16:48:48 - INFO - __main__ - *** Initializing model kwargs ***
313
+ 2025-03-31 16:54:35 - INFO - __main__ - Model parameters ModelConfig(model_name_or_path='deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', model_revision='main', torch_dtype='bfloat16', trust_remote_code=True, attn_implementation='flash_attention_2', use_peft=False, lora_r=16, lora_alpha=32, lora_dropout=0.05, lora_target_modules=None, lora_modules_to_save=None, lora_task_type='CAUSAL_LM', use_rslora=False, load_in_8bit=False, load_in_4bit=False, bnb_4bit_quant_type='nf4', use_bnb_nested_quant=False)
314
+ 2025-03-31 16:54:35 - INFO - __main__ - Script parameters ScriptArguments(dataset_name='open-r1/OpenR1-Math-220k', dataset_config=None, dataset_train_split='train', dataset_test_split='test', gradient_checkpointing_use_reentrant=False, ignore_bias_buffers=False)
315
+ 2025-03-31 16:54:35 - INFO - __main__ - Training parameters SFTConfig(
316
+ _n_gpu=1,
317
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
318
+ adafactor=False,
319
+ adam_beta1=0.9,
320
+ adam_beta2=0.999,
321
+ adam_epsilon=1e-08,
322
+ auto_find_batch_size=False,
323
+ average_tokens_across_devices=False,
324
+ batch_eval_metrics=False,
325
+ benchmarks=[],
326
+ bf16=True,
327
+ bf16_full_eval=False,
328
+ callbacks=[],
329
+ chars_per_token=<CHARS_PER_TOKEN>,
330
+ chat_template=None,
331
+ data_seed=None,
332
+ dataloader_drop_last=False,
333
+ dataloader_num_workers=0,
334
+ dataloader_persistent_workers=False,
335
+ dataloader_pin_memory=True,
336
+ dataloader_prefetch_factor=None,
337
+ dataset_batch_size=None,
338
+ dataset_kwargs=None,
339
+ dataset_num_proc=None,
340
+ dataset_text_field=text,
341
+ ddp_backend=None,
342
+ ddp_broadcast_buffers=None,
343
+ ddp_bucket_cap_mb=None,
344
+ ddp_find_unused_parameters=None,
345
+ ddp_timeout=1800,
346
+ debug=[],
347
+ deepspeed=None,
348
+ disable_tqdm=False,
349
+ dispatch_batches=None,
350
+ do_eval=True,
351
+ do_predict=False,
352
+ do_train=False,
353
+ eval_accumulation_steps=None,
354
+ eval_delay=0,
355
+ eval_do_concat_batches=True,
356
+ eval_on_start=False,
357
+ eval_packing=None,
358
+ eval_steps=None,
359
+ eval_strategy=IntervalStrategy.NO,
360
+ eval_use_gather_object=False,
361
+ evaluation_strategy=None,
362
+ fp16=False,
363
+ fp16_backend=auto,
364
+ fp16_full_eval=False,
365
+ fp16_opt_level=O1,
366
+ fsdp=[],
367
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
368
+ fsdp_min_num_params=0,
369
+ fsdp_transformer_layer_cls_to_wrap=None,
370
+ full_determinism=False,
371
+ gradient_accumulation_steps=1,
372
+ gradient_checkpointing=True,
373
+ gradient_checkpointing_kwargs={'use_reentrant': False},
374
+ greater_is_better=None,
375
+ group_by_length=False,
376
+ half_precision_backend=auto,
377
+ hub_always_push=False,
378
+ hub_model_id=Deepseek-Coder-V2-Lite-13B-Instruct-Open-R1-Distill,
379
+ hub_model_revision=main,
380
+ hub_private_repo=None,
381
+ hub_strategy=HubStrategy.EVERY_SAVE,
382
+ hub_token=<HUB_TOKEN>,
383
+ ignore_data_skip=False,
384
+ include_for_metrics=[],
385
+ include_inputs_for_metrics=False,
386
+ include_num_input_tokens_seen=False,
387
+ include_tokens_per_second=False,
388
+ jit_mode_eval=False,
389
+ label_names=None,
390
+ label_smoothing_factor=0.0,
391
+ learning_rate=5e-05,
392
+ length_column_name=length,
393
+ load_best_model_at_end=False,
394
+ local_rank=0,
395
+ log_level=info,
396
+ log_level_replica=warning,
397
+ log_on_each_node=True,
398
+ logging_dir=data/DeepSeek-Coder-V2-Lite-Instruct/runs/Mar31_16-54-34_w002.ib.bridges2.psc.edu,
399
+ logging_first_step=False,
400
+ logging_nan_inf_filter=True,
401
+ logging_steps=1,
402
+ logging_strategy=IntervalStrategy.STEPS,
403
+ lr_scheduler_kwargs={'min_lr_rate': 0.1},
404
+ lr_scheduler_type=SchedulerType.COSINE_WITH_MIN_LR,
405
+ max_grad_norm=1.0,
406
+ max_length=32768,
407
+ max_seq_length=None,
408
+ max_steps=-1,
409
+ metric_for_best_model=None,
410
+ model_init_kwargs=None,
411
+ mp_parameters=,
412
+ neftune_noise_alpha=None,
413
+ no_cuda=False,
414
+ num_of_sequences=None,
415
+ num_train_epochs=5,
416
+ optim=OptimizerNames.ADAMW_TORCH,
417
+ optim_args=None,
418
+ optim_target_modules=None,
419
+ output_dir=data/DeepSeek-Coder-V2-Lite-Instruct,
420
+ overwrite_hub_revision=False,
421
+ overwrite_output_dir=True,
422
+ packing=True,
423
+ past_index=-1,
424
+ per_device_eval_batch_size=16,
425
+ per_device_train_batch_size=16,
426
+ prediction_loss_only=False,
427
+ push_to_hub=True,
428
+ push_to_hub_model_id=None,
429
+ push_to_hub_organization=None,
430
+ push_to_hub_revision=False,
431
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
432
+ ray_scope=last,
433
+ remove_unused_columns=True,
434
+ report_to=['wandb'],
435
+ restore_callback_states_from_checkpoint=False,
436
+ resume_from_checkpoint=None,
437
+ run_name=data/DeepSeek-Coder-V2-Lite-Instruct,
438
+ save_on_each_node=False,
439
+ save_only_model=False,
440
+ save_safetensors=True,
441
+ save_steps=200,
442
+ save_strategy=SaveStrategy.STEPS,
443
+ save_total_limit=1,
444
+ seed=42,
445
+ skip_memory_metrics=True,
446
+ split_batches=None,
447
+ system_prompt=None,
448
+ tf32=None,
449
+ torch_compile=False,
450
+ torch_compile_backend=None,
451
+ torch_compile_mode=None,
452
+ torch_empty_cache_steps=None,
453
+ torchdynamo=None,
454
+ tpu_metrics_debug=False,
455
+ tpu_num_cores=None,
456
+ use_cpu=False,
457
+ use_ipex=False,
458
+ use_legacy_prediction_loop=False,
459
+ use_liger=False,
460
+ use_liger_kernel=False,
461
+ use_mps_device=False,
462
+ wandb_entity=None,
463
+ wandb_project=None,
464
+ warmup_ratio=0.1,
465
+ warmup_steps=0,
466
+ weight_decay=0.0,
467
+ )
468
+ 2025-03-31 16:54:36 - INFO - __main__ - *** Initializing model kwargs ***
469
+ 2025-03-31 17:52:07 - INFO - __main__ - *** Train ***
470
+ 2025-03-31 17:52:07 - INFO - __main__ - DeepseekV2ForCausalLM(
471
+ (model): DeepseekV2Model(
472
+ (embed_tokens): Embedding(102400, 2048)
473
+ (layers): ModuleList(
474
+ (0): DeepseekV2DecoderLayer(
475
+ (self_attn): DeepseekV2FlashAttention2(
476
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
477
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
478
+ (kv_a_layernorm): DeepseekV2RMSNorm()
479
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
480
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
481
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
482
+ )
483
+ (mlp): DeepseekV2MLP(
484
+ (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
485
+ (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
486
+ (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
487
+ (act_fn): SiLU()
488
+ )
489
+ (input_layernorm): DeepseekV2RMSNorm()
490
+ (post_attention_layernorm): DeepseekV2RMSNorm()
491
+ )
492
+ (1-26): 26 x DeepseekV2DecoderLayer(
493
+ (self_attn): DeepseekV2FlashAttention2(
494
+ (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
495
+ (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
496
+ (kv_a_layernorm): DeepseekV2RMSNorm()
497
+ (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
498
+ (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
499
+ (rotary_emb): DeepseekV2YarnRotaryEmbedding()
500
+ )
501
+ (mlp): DeepseekV2MoE(
502
+ (experts): ModuleList(
503
+ (0-63): 64 x DeepseekV2MLP(
504
+ (gate_proj): Linear(in_features=2048, out_features=1408, bias=False)
505
+ (up_proj): Linear(in_features=2048, out_features=1408, bias=False)
506
+ (down_proj): Linear(in_features=1408, out_features=2048, bias=False)
507
+ (act_fn): SiLU()
508
+ )
509
+ )
510
+ (gate): MoEGate()
511
+ (shared_experts): DeepseekV2MLP(
512
+ (gate_proj): Linear(in_features=2048, out_features=2816, bias=False)
513
+ (up_proj): Linear(in_features=2048, out_features=2816, bias=False)
514
+ (down_proj): Linear(in_features=2816, out_features=2048, bias=False)
515
+ (act_fn): SiLU()
516
+ )
517
+ )
518
+ (input_layernorm): DeepseekV2RMSNorm()
519
+ (post_attention_layernorm): DeepseekV2RMSNorm()
520
+ )
521
+ )
522
+ (norm): DeepseekV2RMSNorm()
523
+ )
524
+ (lm_head): Linear(in_features=2048, out_features=102400, bias=False)
525
+ )