Jerry999 committed on
Commit c6a568b · verified · 1 Parent(s): e2d07a4

Delete folder checkpoints with huggingface_hub
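For reference, a folder deletion like this one maps to a single Hub API call; the sketch below is illustrative only (the repo id and token handling are placeholders, not taken from this commit):

```python
# Minimal sketch of the kind of call that produces a "Delete folder ... with huggingface_hub" commit.
from huggingface_hub import HfApi

api = HfApi()  # token is taken from a cached login or the HF_TOKEN environment variable
api.delete_folder(
    path_in_repo="checkpoints",            # folder removed in this commit
    repo_id="Jerry999/<repo-name>",        # placeholder repo id
    repo_type="model",
    commit_message="Delete folder checkpoints with huggingface_hub",
)
```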

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. checkpoints/knowledge/atomic_full_sft_50ep/README.md +0 -110
  2. checkpoints/knowledge/atomic_full_sft_50ep/chat_template.jinja +0 -4
  3. checkpoints/knowledge/atomic_full_sft_50ep/config.json +0 -72
  4. checkpoints/knowledge/atomic_full_sft_50ep/eval_results/eval_results.csv +0 -3
  5. checkpoints/knowledge/atomic_full_sft_50ep/eval_results/eval_summary.json +0 -11
  6. checkpoints/knowledge/atomic_full_sft_50ep/eval_results/test_inference_results.jsonl +0 -0
  7. checkpoints/knowledge/atomic_full_sft_50ep/eval_results/test_subset_of_train_inference_results.jsonl +0 -0
  8. checkpoints/knowledge/atomic_full_sft_50ep/generation_config.json +0 -12
  9. checkpoints/knowledge/atomic_full_sft_50ep/model.safetensors +0 -3
  10. checkpoints/knowledge/atomic_full_sft_50ep/tokenizer.json +0 -3
  11. checkpoints/knowledge/atomic_full_sft_50ep/tokenizer_config.json +0 -239
  12. checkpoints/knowledge/atomic_full_then_2step_full_sft/chat_template.jinja +0 -4
  13. checkpoints/knowledge/atomic_full_then_2step_full_sft/config.json +0 -71
  14. checkpoints/knowledge/atomic_full_then_2step_full_sft/tokenizer.json +0 -3
  15. checkpoints/knowledge/atomic_full_then_2step_full_sft/tokenizer_config.json +0 -29
  16. checkpoints/knowledge/atomic_sft_lora_50ep/README.md +0 -122
  17. checkpoints/knowledge/atomic_sft_lora_50ep/adapter_config.json +0 -46
  18. checkpoints/knowledge/atomic_sft_lora_50ep/adapter_model.safetensors +0 -3
  19. checkpoints/knowledge/atomic_sft_lora_50ep/chat_template.jinja +0 -4
  20. checkpoints/knowledge/atomic_sft_lora_50ep/config.json +0 -86
  21. checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/eval_results.csv +0 -3
  22. checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/eval_summary.json +0 -11
  23. checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/test_inference_results.jsonl +0 -0
  24. checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/test_subset_of_train_inference_results.jsonl +0 -0
  25. checkpoints/knowledge/atomic_sft_lora_50ep/merged/added_tokens.json +0 -28
  26. checkpoints/knowledge/atomic_sft_lora_50ep/merged/chat_template.jinja +0 -61
  27. checkpoints/knowledge/atomic_sft_lora_50ep/merged/config.json +0 -68
  28. checkpoints/knowledge/atomic_sft_lora_50ep/merged/generation_config.json +0 -13
  29. checkpoints/knowledge/atomic_sft_lora_50ep/merged/merges.txt +0 -0
  30. checkpoints/knowledge/atomic_sft_lora_50ep/merged/model-00001-of-00002.safetensors +0 -3
  31. checkpoints/knowledge/atomic_sft_lora_50ep/merged/model-00002-of-00002.safetensors +0 -3
  32. checkpoints/knowledge/atomic_sft_lora_50ep/merged/model.safetensors.index.json +0 -406
  33. checkpoints/knowledge/atomic_sft_lora_50ep/merged/special_tokens_map.json +0 -31
  34. checkpoints/knowledge/atomic_sft_lora_50ep/merged/tokenizer.json +0 -3
  35. checkpoints/knowledge/atomic_sft_lora_50ep/merged/tokenizer_config.json +0 -239
  36. checkpoints/knowledge/atomic_sft_lora_50ep/merged/vocab.json +0 -0
  37. checkpoints/knowledge/atomic_sft_lora_50ep/tokenizer.json +0 -3
  38. checkpoints/knowledge/atomic_sft_lora_50ep/tokenizer_config.json +0 -29
  39. checkpoints/math_operations/compositional_full_sft_n_steps_2/chat_template.jinja +0 -4
  40. checkpoints/math_operations/compositional_full_sft_n_steps_2/config.json +0 -71
  41. checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer.json +0 -3
  42. checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer_config.json +0 -29
  43. checkpoints/math_operations/full_sft_50k_lr5e5/README.md +0 -132
  44. checkpoints/math_operations/full_sft_50k_lr5e5/chat_template.jinja +0 -4
  45. checkpoints/math_operations/full_sft_50k_lr5e5/config.json +0 -71
  46. checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_converted.jsonl +0 -0
  47. checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_results.jsonl +0 -0
  48. checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_results.csv +0 -12
  49. checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_summary.json +0 -19
  50. checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_A_test_alpaca_converted.jsonl +0 -0
checkpoints/knowledge/atomic_full_sft_50ep/README.md DELETED
@@ -1,110 +0,0 @@
- ---
- library_name: transformers
- tags:
- - generated_from_trainer
- datasets:
- - /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
- model-index:
- - name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep
- results: []
- ---
-
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
- should probably proofread and complete it, then remove this comment. -->
-
- [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
- <details><summary>See axolotl config</summary>
-
- axolotl version: `0.15.0.dev0`
- ```yaml
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
- load_in_8bit: false
- load_in_4bit: false
- strict: false
-
- datasets:
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
- type: chat_template
- dataset_prepared_path:
- val_set_size: 0
- chat_template: chatml
-
- output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep
-
- sequence_len: 512
- sample_packing: true
- eval_sample_packing: true
-
- gradient_accumulation_steps: 8
- micro_batch_size: 1
- num_epochs: 50
- optimizer: adamw_torch_fused
- lr_scheduler: constant_with_warmup
- learning_rate: 1e-4
-
- bf16: auto
- tf32: true
-
- gradient_checkpointing: true
- gradient_checkpointing_kwargs:
- use_reentrant: false
-
- logging_steps: 10
- flash_attention: true
- warmup_ratio: 0.02
- saves_per_epoch: 1
- save_total_limit: 1
- weight_decay: 0.01
-
- wandb_project: knowledge_sft
- wandb_name: atomic-full-50ep-constlr
- wandb_log_model: "false"
-
- special_tokens:
-
- ```
-
- </details><br>
-
- # home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep
-
- This model was trained from scratch on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl dataset.
-
- ## Model description
-
- More information needed
-
- ## Intended uses & limitations
-
- More information needed
-
- ## Training and evaluation data
-
- More information needed
-
- ## Training procedure
-
- ### Training hyperparameters
-
- The following hyperparameters were used during training:
- - learning_rate: 0.0001
- - train_batch_size: 1
- - eval_batch_size: 1
- - seed: 42
- - gradient_accumulation_steps: 8
- - total_train_batch_size: 8
- - optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
- - lr_scheduler_type: constant_with_warmup
- - lr_scheduler_warmup_steps: 26
- - training_steps: 1300
-
- ### Training results
-
-
-
- ### Framework versions
-
- - Transformers 5.0.0
- - Pytorch 2.8.0+cu128
- - Datasets 4.5.0
- - Tokenizers 0.22.2
checkpoints/knowledge/atomic_full_sft_50ep/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
- ' + message['content'] + '<|im_end|>' + '
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
- ' }}{% endif %}
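This is the standard ChatML wrapper. As a quick sanity check it can be rendered directly with jinja2; the message list below is made up purely for illustration:

```python
# Illustrative only: render the ChatML chat template above with jinja2.
from jinja2 import Template

chatml = (
    "{% if not add_generation_prompt is defined %}"
    "{% set add_generation_prompt = false %}{% endif %}"
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)
messages = [{"role": "user", "content": "Name the capital of France."}]  # made-up example
print(Template(chatml).render(messages=messages, add_generation_prompt=True))
# <|im_start|>user
# Name the capital of France.<|im_end|>
# <|im_start|>assistant
```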
checkpoints/knowledge/atomic_full_sft_50ep/config.json DELETED
@@ -1,72 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151643,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "pad_token_id": 151643,
60
- "rms_norm_eps": 1e-06,
61
- "rope_parameters": {
62
- "rope_theta": 5000000,
63
- "rope_type": "default"
64
- },
65
- "sliding_window": null,
66
- "tie_word_embeddings": true,
67
- "transformers_version": "5.0.0",
68
- "use_cache": true,
69
- "use_sliding_window": false,
70
- "vocab_size": 151936,
71
- "rope_theta": 5000000
72
- }
checkpoints/knowledge/atomic_full_sft_50ep/eval_results/eval_results.csv DELETED
@@ -1,3 +0,0 @@
- category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
- knowledge,test_subset_of_train_inference_results,450,448,99.56,0,0.00,2
- knowledge,test_inference_results,499,13,2.61,0,0.00,486
checkpoints/knowledge/atomic_full_sft_50ep/eval_results/eval_summary.json DELETED
@@ -1,11 +0,0 @@
- {
- "overall": {
- "total": 499,
- "correct": 13,
- "accuracy": 2.61,
- "format_found": 0,
- "format_accuracy": 0.0
- },
- "n_errors": 486,
- "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep/eval_results/test_inference_results.jsonl"
- }
checkpoints/knowledge/atomic_full_sft_50ep/eval_results/test_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_sft_50ep/eval_results/test_subset_of_train_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_sft_50ep/generation_config.json DELETED
@@ -1,12 +0,0 @@
- {
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151643,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "5.0.0"
- }
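When the checkpoint directory is loaded with transformers, these sampling defaults (temperature 0.7, top_p 0.8, top_k 20) are picked up automatically from generation_config.json. A minimal sketch, with the local path as a placeholder:

```python
# Sketch: load the deleted checkpoint locally and generate with its saved sampling defaults.
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "checkpoints/knowledge/atomic_full_sft_50ep"  # placeholder local path
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path)

input_ids = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello"}],
    add_generation_prompt=True,
    return_tensors="pt",
)
# do_sample, temperature, top_p and top_k default to the values stored in generation_config.json
output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```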
checkpoints/knowledge/atomic_full_sft_50ep/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ebe90e40bddad2b73128c35b1656006cc6cf6883bde9148ec3f150a2f9916653
- size 8044982080
checkpoints/knowledge/atomic_full_sft_50ep/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
- size 11422650
checkpoints/knowledge/atomic_full_sft_50ep/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "151643": {
5
- "content": "<|endoftext|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "151644": {
13
- "content": "<|im_start|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "151645": {
21
- "content": "<|im_end|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "151646": {
29
- "content": "<|object_ref_start|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "151647": {
37
- "content": "<|object_ref_end|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": false,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "151648": {
45
- "content": "<|box_start|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": false,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "151649": {
53
- "content": "<|box_end|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": false,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "151650": {
61
- "content": "<|quad_start|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": false,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "151651": {
69
- "content": "<|quad_end|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": false,
73
- "single_word": false,
74
- "special": true
75
- },
76
- "151652": {
77
- "content": "<|vision_start|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": false,
81
- "single_word": false,
82
- "special": true
83
- },
84
- "151653": {
85
- "content": "<|vision_end|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": false,
89
- "single_word": false,
90
- "special": true
91
- },
92
- "151654": {
93
- "content": "<|vision_pad|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": false,
97
- "single_word": false,
98
- "special": true
99
- },
100
- "151655": {
101
- "content": "<|image_pad|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": false,
105
- "single_word": false,
106
- "special": true
107
- },
108
- "151656": {
109
- "content": "<|video_pad|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": false,
113
- "single_word": false,
114
- "special": true
115
- },
116
- "151657": {
117
- "content": "<tool_call>",
118
- "lstrip": false,
119
- "normalized": false,
120
- "rstrip": false,
121
- "single_word": false,
122
- "special": false
123
- },
124
- "151658": {
125
- "content": "</tool_call>",
126
- "lstrip": false,
127
- "normalized": false,
128
- "rstrip": false,
129
- "single_word": false,
130
- "special": false
131
- },
132
- "151659": {
133
- "content": "<|fim_prefix|>",
134
- "lstrip": false,
135
- "normalized": false,
136
- "rstrip": false,
137
- "single_word": false,
138
- "special": false
139
- },
140
- "151660": {
141
- "content": "<|fim_middle|>",
142
- "lstrip": false,
143
- "normalized": false,
144
- "rstrip": false,
145
- "single_word": false,
146
- "special": false
147
- },
148
- "151661": {
149
- "content": "<|fim_suffix|>",
150
- "lstrip": false,
151
- "normalized": false,
152
- "rstrip": false,
153
- "single_word": false,
154
- "special": false
155
- },
156
- "151662": {
157
- "content": "<|fim_pad|>",
158
- "lstrip": false,
159
- "normalized": false,
160
- "rstrip": false,
161
- "single_word": false,
162
- "special": false
163
- },
164
- "151663": {
165
- "content": "<|repo_name|>",
166
- "lstrip": false,
167
- "normalized": false,
168
- "rstrip": false,
169
- "single_word": false,
170
- "special": false
171
- },
172
- "151664": {
173
- "content": "<|file_sep|>",
174
- "lstrip": false,
175
- "normalized": false,
176
- "rstrip": false,
177
- "single_word": false,
178
- "special": false
179
- },
180
- "151665": {
181
- "content": "<tool_response>",
182
- "lstrip": false,
183
- "normalized": false,
184
- "rstrip": false,
185
- "single_word": false,
186
- "special": false
187
- },
188
- "151666": {
189
- "content": "</tool_response>",
190
- "lstrip": false,
191
- "normalized": false,
192
- "rstrip": false,
193
- "single_word": false,
194
- "special": false
195
- },
196
- "151667": {
197
- "content": "<think>",
198
- "lstrip": false,
199
- "normalized": false,
200
- "rstrip": false,
201
- "single_word": false,
202
- "special": false
203
- },
204
- "151668": {
205
- "content": "</think>",
206
- "lstrip": false,
207
- "normalized": false,
208
- "rstrip": false,
209
- "single_word": false,
210
- "special": false
211
- }
212
- },
213
- "additional_special_tokens": [
214
- "<|im_start|>",
215
- "<|im_end|>",
216
- "<|object_ref_start|>",
217
- "<|object_ref_end|>",
218
- "<|box_start|>",
219
- "<|box_end|>",
220
- "<|quad_start|>",
221
- "<|quad_end|>",
222
- "<|vision_start|>",
223
- "<|vision_end|>",
224
- "<|vision_pad|>",
225
- "<|image_pad|>",
226
- "<|video_pad|>"
227
- ],
228
- "bos_token": null,
229
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}",
230
- "clean_up_tokenization_spaces": false,
231
- "eos_token": "<|im_end|>",
232
- "errors": "replace",
233
- "model_max_length": 1010000,
234
- "pad_token": "<|endoftext|>",
235
- "split_special_tokens": false,
236
- "tokenizer_class": "Qwen2Tokenizer",
237
- "unk_token": null,
238
- "add_bos_token": false
239
- }
checkpoints/knowledge/atomic_full_then_2step_full_sft/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
- ' + message['content'] + '<|im_end|>' + '
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
- ' }}{% endif %}
checkpoints/knowledge/atomic_full_then_2step_full_sft/config.json DELETED
@@ -1,71 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": null,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "pad_token_id": 151643,
60
- "rms_norm_eps": 1e-06,
61
- "rope_parameters": {
62
- "rope_theta": 5000000,
63
- "rope_type": "default"
64
- },
65
- "sliding_window": null,
66
- "tie_word_embeddings": true,
67
- "transformers_version": "5.0.0",
68
- "use_cache": false,
69
- "use_sliding_window": false,
70
- "vocab_size": 151936
71
- }
checkpoints/knowledge/atomic_full_then_2step_full_sft/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
- size 11422650
checkpoints/knowledge/atomic_full_then_2step_full_sft/tokenizer_config.json DELETED
@@ -1,29 +0,0 @@
- {
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": true,
- "model_max_length": 1010000,
- "pad_token": "<|endoftext|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
- }
checkpoints/knowledge/atomic_sft_lora_50ep/README.md DELETED
@@ -1,122 +0,0 @@
- ---
- library_name: peft
- tags:
- - axolotl
- - base_model:adapter:/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
- - lora
- - transformers
- datasets:
- - /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
- pipeline_tag: text-generation
- model-index:
- - name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep
- results: []
- ---
-
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
- should probably proofread and complete it, then remove this comment. -->
-
- [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
- <details><summary>See axolotl config</summary>
-
- axolotl version: `0.15.0.dev0`
- ```yaml
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
- load_in_8bit: false
- load_in_4bit: true
- strict: false
-
- datasets:
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
- type: chat_template
- dataset_prepared_path:
- val_set_size: 0
- chat_template: chatml
-
- output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep
-
- sequence_len: 512
- sample_packing: true
- eval_sample_packing: true
-
- adapter: qlora
- lora_r: 64
- lora_alpha: 128
- lora_dropout: 0.0
- lora_target_linear: true
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 50
- optimizer: adamw_torch_fused
- lr_scheduler: cosine
- learning_rate: 5e-4
-
- bf16: auto
- tf32: true
-
- gradient_checkpointing: true
- gradient_checkpointing_kwargs:
- use_reentrant: false
-
- logging_steps: 10
- flash_attention: true
- warmup_ratio: 0.02
- saves_per_epoch: 1
- save_total_limit: 1
- weight_decay: 0.01
-
- wandb_project: knowledge_sft
- wandb_name: atomic-lora-50ep-cosine
- wandb_log_model: "false"
-
- special_tokens:
-
- ```
-
- </details><br>
-
- # home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep
-
- This model was trained from scratch on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl dataset.
-
- ## Model description
-
- More information needed
-
- ## Intended uses & limitations
-
- More information needed
-
- ## Training and evaluation data
-
- More information needed
-
- ## Training procedure
-
- ### Training hyperparameters
-
- The following hyperparameters were used during training:
- - learning_rate: 0.0005
- - train_batch_size: 2
- - eval_batch_size: 2
- - seed: 42
- - gradient_accumulation_steps: 4
- - total_train_batch_size: 8
- - optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
- - lr_scheduler_type: cosine
- - lr_scheduler_warmup_steps: 26
- - training_steps: 1300
-
- ### Training results
-
-
-
- ### Framework versions
-
- - PEFT 0.18.1
- - Transformers 5.0.0
- - Pytorch 2.8.0+cu128
- - Datasets 4.5.0
- - Tokenizers 0.22.2
checkpoints/knowledge/atomic_sft_lora_50ep/adapter_config.json DELETED
@@ -1,46 +0,0 @@
- {
- "alora_invocation_tokens": null,
- "alpha_pattern": {},
- "arrow_config": null,
- "auto_mapping": null,
- "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
- "bias": "none",
- "corda_config": null,
- "ensure_weight_tying": false,
- "eva_config": null,
- "exclude_modules": null,
- "fan_in_fan_out": null,
- "inference_mode": true,
- "init_lora_weights": true,
- "layer_replication": null,
- "layers_pattern": null,
- "layers_to_transform": null,
- "loftq_config": {},
- "lora_alpha": 128,
- "lora_bias": false,
- "lora_dropout": 0.0,
- "megatron_config": null,
- "megatron_core": "megatron.core",
- "modules_to_save": null,
- "peft_type": "LORA",
- "peft_version": "0.18.1",
- "qalora_group_size": 16,
- "r": 64,
- "rank_pattern": {},
- "revision": null,
- "target_modules": [
- "up_proj",
- "q_proj",
- "k_proj",
- "gate_proj",
- "o_proj",
- "v_proj",
- "down_proj"
- ],
- "target_parameters": [],
- "task_type": "CAUSAL_LM",
- "trainable_token_indices": null,
- "use_dora": false,
- "use_qalora": false,
- "use_rslora": false
- }
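This adapter (r=64, alpha=128, all linear projections targeted) attaches to the Qwen3-4B-Instruct-2507 base model. A minimal loading sketch with peft, with paths given only as placeholders:

```python
# Sketch: attach the LoRA adapter described above to its base model with peft.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B-Instruct-2507")  # placeholder base id
model = PeftModel.from_pretrained(base, "checkpoints/knowledge/atomic_sft_lora_50ep")
model = model.merge_and_unload()  # optional: folds the adapter into full weights, as in the merged/ folder
```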
checkpoints/knowledge/atomic_sft_lora_50ep/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e7b81e8b39caf50b763af28fff658e2ec6faa5fb16da237f16c382ef7620b462
- size 528550256
checkpoints/knowledge/atomic_sft_lora_50ep/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
- ' + message['content'] + '<|im_end|>' + '
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
- ' }}{% endif %}
checkpoints/knowledge/atomic_sft_lora_50ep/config.json DELETED
@@ -1,86 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": null,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "pad_token_id": null,
60
- "quantization_config": {
61
- "_load_in_4bit": true,
62
- "_load_in_8bit": false,
63
- "bnb_4bit_compute_dtype": "bfloat16",
64
- "bnb_4bit_quant_storage": "bfloat16",
65
- "bnb_4bit_quant_type": "nf4",
66
- "bnb_4bit_use_double_quant": true,
67
- "llm_int8_enable_fp32_cpu_offload": false,
68
- "llm_int8_has_fp16_weight": false,
69
- "llm_int8_skip_modules": null,
70
- "llm_int8_threshold": 6.0,
71
- "load_in_4bit": true,
72
- "load_in_8bit": false,
73
- "quant_method": "bitsandbytes"
74
- },
75
- "rms_norm_eps": 1e-06,
76
- "rope_parameters": {
77
- "rope_theta": 5000000,
78
- "rope_type": "default"
79
- },
80
- "sliding_window": null,
81
- "tie_word_embeddings": true,
82
- "transformers_version": "5.0.0",
83
- "use_cache": false,
84
- "use_sliding_window": false,
85
- "vocab_size": 151936
86
- }
checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/eval_results.csv DELETED
@@ -1,3 +0,0 @@
- category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
- knowledge,test_subset_of_train_inference_results,450,448,99.56,0,0.00,2
- knowledge,test_inference_results,499,13,2.61,0,0.00,486
checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/eval_summary.json DELETED
@@ -1,11 +0,0 @@
- {
- "overall": {
- "total": 499,
- "correct": 13,
- "accuracy": 2.61,
- "format_found": 0,
- "format_accuracy": 0.0
- },
- "n_errors": 486,
- "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/test_inference_results.jsonl"
- }
checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/test_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_sft_lora_50ep/eval_results/test_subset_of_train_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_sft_lora_50ep/merged/added_tokens.json DELETED
@@ -1,28 +0,0 @@
- {
- "</think>": 151668,
- "</tool_call>": 151658,
- "</tool_response>": 151666,
- "<think>": 151667,
- "<tool_call>": 151657,
- "<tool_response>": 151665,
- "<|box_end|>": 151649,
- "<|box_start|>": 151648,
- "<|endoftext|>": 151643,
- "<|file_sep|>": 151664,
- "<|fim_middle|>": 151660,
- "<|fim_pad|>": 151662,
- "<|fim_prefix|>": 151659,
- "<|fim_suffix|>": 151661,
- "<|im_end|>": 151645,
- "<|im_start|>": 151644,
- "<|image_pad|>": 151655,
- "<|object_ref_end|>": 151647,
- "<|object_ref_start|>": 151646,
- "<|quad_end|>": 151651,
- "<|quad_start|>": 151650,
- "<|repo_name|>": 151663,
- "<|video_pad|>": 151656,
- "<|vision_end|>": 151653,
- "<|vision_pad|>": 151654,
- "<|vision_start|>": 151652
- }
checkpoints/knowledge/atomic_sft_lora_50ep/merged/chat_template.jinja DELETED
@@ -1,61 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0].role == 'system' %}
4
- {{- messages[0].content + '\n\n' }}
5
- {%- endif %}
6
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
- {%- for tool in tools %}
8
- {{- "\n" }}
9
- {{- tool | tojson }}
10
- {%- endfor %}
11
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
- {%- else %}
13
- {%- if messages[0].role == 'system' %}
14
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
- {%- endif %}
16
- {%- endif %}
17
- {%- for message in messages %}
18
- {%- if message.content is string %}
19
- {%- set content = message.content %}
20
- {%- else %}
21
- {%- set content = '' %}
22
- {%- endif %}
23
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
- {%- elif message.role == "assistant" %}
26
- {{- '<|im_start|>' + message.role + '\n' + content }}
27
- {%- if message.tool_calls %}
28
- {%- for tool_call in message.tool_calls %}
29
- {%- if (loop.first and content) or (not loop.first) %}
30
- {{- '\n' }}
31
- {%- endif %}
32
- {%- if tool_call.function %}
33
- {%- set tool_call = tool_call.function %}
34
- {%- endif %}
35
- {{- '<tool_call>\n{"name": "' }}
36
- {{- tool_call.name }}
37
- {{- '", "arguments": ' }}
38
- {%- if tool_call.arguments is string %}
39
- {{- tool_call.arguments }}
40
- {%- else %}
41
- {{- tool_call.arguments | tojson }}
42
- {%- endif %}
43
- {{- '}\n</tool_call>' }}
44
- {%- endfor %}
45
- {%- endif %}
46
- {{- '<|im_end|>\n' }}
47
- {%- elif message.role == "tool" %}
48
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
49
- {{- '<|im_start|>user' }}
50
- {%- endif %}
51
- {{- '\n<tool_response>\n' }}
52
- {{- content }}
53
- {{- '\n</tool_response>' }}
54
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
55
- {{- '<|im_end|>\n' }}
56
- {%- endif %}
57
- {%- endif %}
58
- {%- endfor %}
59
- {%- if add_generation_prompt %}
60
- {{- '<|im_start|>assistant\n' }}
61
- {%- endif %}
checkpoints/knowledge/atomic_sft_lora_50ep/merged/config.json DELETED
@@ -1,68 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151643,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "rms_norm_eps": 1e-06,
60
- "rope_scaling": null,
61
- "rope_theta": 5000000,
62
- "sliding_window": null,
63
- "tie_word_embeddings": true,
64
- "transformers_version": "4.57.1",
65
- "use_cache": true,
66
- "use_sliding_window": false,
67
- "vocab_size": 151936
68
- }
checkpoints/knowledge/atomic_sft_lora_50ep/merged/generation_config.json DELETED
@@ -1,13 +0,0 @@
- {
- "bos_token_id": 151643,
- "do_sample": true,
- "eos_token_id": [
- 151645,
- 151643
- ],
- "pad_token_id": 151643,
- "temperature": 0.7,
- "top_k": 20,
- "top_p": 0.8,
- "transformers_version": "4.57.1"
- }
checkpoints/knowledge/atomic_sft_lora_50ep/merged/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_sft_lora_50ep/merged/model-00001-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3d5b235f390f3f30e218ef9c6fa5ccb15a48f015216bd50473170e887b84c123
- size 4967215360
checkpoints/knowledge/atomic_sft_lora_50ep/merged/model-00002-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3365bad9646e1d666592d69cd37186abc904b0f465c384f4555713a349651e51
- size 3077766632
checkpoints/knowledge/atomic_sft_lora_50ep/merged/model.safetensors.index.json DELETED
@@ -1,406 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_parameters": 4022468096,
4
- "total_size": 8044936192
5
- },
6
- "weight_map": {
7
- "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
- "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
- "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
14
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
- "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
17
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
- "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
- "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
25
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
- "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
28
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
30
- "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
32
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
33
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
34
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
35
- "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
36
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
37
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
- "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
39
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
40
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
41
- "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
42
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
43
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
44
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
45
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
46
- "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
47
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
- "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
50
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
51
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
52
- "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
53
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
54
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
55
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
56
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
57
- "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
58
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
- "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
61
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
62
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
63
- "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
64
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
- "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
67
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
68
- "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
69
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
- "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
72
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
73
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
- "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
- "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
80
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
- "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
- "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
83
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
- "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
85
- "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
- "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
- "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
91
- "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
- "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
- "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
- "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
- "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
- "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
102
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
- "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
105
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
- "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
- "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
- "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
113
- "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
- "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
116
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
117
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
- "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
- "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
- "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
124
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
- "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
127
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
- "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
- "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
- "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
- "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
- "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
- "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
135
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
- "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
138
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
- "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
- "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
- "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
146
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
- "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
149
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
- "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
- "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
- "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
157
- "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
- "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
- "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
160
- "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
- "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
- "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
163
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
167
- "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
168
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
169
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
- "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
172
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
173
- "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
174
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
175
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
176
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
177
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
178
- "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
179
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
180
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
181
- "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
182
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
183
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
184
- "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
185
- "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
186
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
187
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
188
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
189
- "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
190
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
191
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
192
- "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
193
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
194
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
195
- "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
196
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
197
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
198
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
199
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
200
- "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
201
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
202
- "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
203
- "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
204
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
- "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
207
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
- "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
- "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
212
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
213
- "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
214
- "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
215
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
216
- "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
- "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
- "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
- "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
223
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
- "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
- "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
226
- "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
- "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
- "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
- "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
234
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
- "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
237
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
239
- "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
240
- "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
241
- "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
242
- "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
243
- "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
244
- "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
245
- "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
246
- "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
247
- "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
248
- "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
249
- "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
250
- "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
251
- "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
252
- "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
253
- "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
254
- "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
255
- "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
256
- "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
257
- "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
258
- "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
259
- "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
260
- "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
- "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
262
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
265
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
266
- "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
267
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
268
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
269
- "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
270
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
- "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
- "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
- "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
- "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
- "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
- "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
278
- "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
- "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
- "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
281
- "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
- "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
- "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
284
- "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
285
- "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
286
- "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
287
- "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
288
- "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
- "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
- "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
- "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
- "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
- "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
- "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
- "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
- "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
- "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
- "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
- "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
300
- "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
301
- "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
302
- "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
303
- "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
304
- "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
305
- "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
306
- "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
307
- "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
308
- "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
309
- "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
310
- "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
311
- "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
312
- "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
313
- "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
314
- "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
315
- "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
316
- "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
317
- "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
318
- "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
319
- "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
320
- "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
321
- "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
322
- "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
323
- "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
324
- "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
325
- "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
326
- "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
327
- "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
- "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
- "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
- "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
- "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
- "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
- "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
- "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
- "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
- "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
- "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
- "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
339
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
340
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
341
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
342
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
343
- "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
344
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
345
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
- "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
347
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
348
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
- "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
- "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
355
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
356
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
357
- "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
358
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
- "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
361
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
362
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
363
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
364
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
365
- "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
366
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
367
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
368
- "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
369
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
370
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
371
- "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
372
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
373
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
374
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
375
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
376
- "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
377
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
378
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
379
- "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
380
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
381
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
382
- "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
383
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
384
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
385
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
386
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
387
- "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
388
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
389
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
390
- "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
391
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
392
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
393
- "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
394
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
395
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
396
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
397
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
398
- "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
399
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
- "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
402
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
403
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
- "model.norm.weight": "model-00002-of-00002.safetensors"
405
- }
406
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_sft_lora_50ep/merged/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
- {
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "eos_token": {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- }
- }

checkpoints/knowledge/atomic_sft_lora_50ep/merged/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
- size 11422654

checkpoints/knowledge/atomic_sft_lora_50ep/merged/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
- {
- "add_prefix_space": false,
- "added_tokens_decoder": {
- "151643": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151644": {
- "content": "<|im_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151645": {
- "content": "<|im_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151646": {
- "content": "<|object_ref_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151647": {
- "content": "<|object_ref_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151648": {
- "content": "<|box_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151649": {
- "content": "<|box_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151650": {
- "content": "<|quad_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151651": {
- "content": "<|quad_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151652": {
- "content": "<|vision_start|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151653": {
- "content": "<|vision_end|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151654": {
- "content": "<|vision_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151655": {
- "content": "<|image_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151656": {
- "content": "<|video_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "151657": {
- "content": "<tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151658": {
- "content": "</tool_call>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151659": {
- "content": "<|fim_prefix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151660": {
- "content": "<|fim_middle|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151661": {
- "content": "<|fim_suffix|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151662": {
- "content": "<|fim_pad|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151663": {
- "content": "<|repo_name|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151664": {
- "content": "<|file_sep|>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151665": {
- "content": "<tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151666": {
- "content": "</tool_response>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151667": {
- "content": "<think>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- },
- "151668": {
- "content": "</think>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": false
- }
- },
- "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "bos_token": null,
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "model_max_length": 1010000,
- "pad_token": "<|endoftext|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null,
- "add_bos_token": false
- }

checkpoints/knowledge/atomic_sft_lora_50ep/merged/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_sft_lora_50ep/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
- size 11422650

checkpoints/knowledge/atomic_sft_lora_50ep/tokenizer_config.json DELETED
@@ -1,29 +0,0 @@
- {
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": true,
- "model_max_length": 1010000,
- "pad_token": "<|endoftext|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
- }

checkpoints/math_operations/compositional_full_sft_n_steps_2/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
- ' + message['content'] + '<|im_end|>' + '
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
- ' }}{% endif %}

checkpoints/math_operations/compositional_full_sft_n_steps_2/config.json DELETED
@@ -1,71 +0,0 @@
- {
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "bfloat16",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 262144,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 5000000,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.0.0",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
- }

checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer.json DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
- size 11422650

checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer_config.json DELETED
@@ -1,29 +0,0 @@
- {
- "add_prefix_space": false,
- "backend": "tokenizers",
- "bos_token": null,
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|im_end|>",
- "errors": "replace",
- "extra_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|object_ref_start|>",
- "<|object_ref_end|>",
- "<|box_start|>",
- "<|box_end|>",
- "<|quad_start|>",
- "<|quad_end|>",
- "<|vision_start|>",
- "<|vision_end|>",
- "<|vision_pad|>",
- "<|image_pad|>",
- "<|video_pad|>"
- ],
- "is_local": true,
- "model_max_length": 1010000,
- "pad_token": "<|endoftext|>",
- "split_special_tokens": false,
- "tokenizer_class": "Qwen2Tokenizer",
- "unk_token": null
- }

checkpoints/math_operations/full_sft_50k_lr5e5/README.md DELETED
@@ -1,132 +0,0 @@
- ---
- library_name: transformers
- tags:
- - generated_from_trainer
- datasets:
- - /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl
- model-index:
- - name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5
- results: []
- ---
-
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
- should probably proofread and complete it, then remove this comment. -->
-
- [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
- <details><summary>See axolotl config</summary>
-
- axolotl version: `0.15.0.dev0`
- ```yaml
- # Qwen3-4B full fine-tuning SFT — LR 5e-5
-
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
-
- load_in_8bit: false
- load_in_4bit: false
- strict: false
-
- datasets:
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl
- type: alpaca
- dataset_prepared_path:
- val_set_size: 0
- chat_template: chatml
-
- test_datasets:
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_val_alpaca.jsonl
- type: alpaca
-
- output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5
-
- sequence_len: 2048
- sample_packing: true
- eval_sample_packing: true
-
- gradient_accumulation_steps: 8
- micro_batch_size: 1
- num_epochs: 3
- optimizer: adamw_torch_fused
- lr_scheduler: cosine
- learning_rate: 5e-5
-
- bf16: auto
- tf32: true
-
- gradient_checkpointing: true
- gradient_checkpointing_kwargs:
- use_reentrant: false
-
- logging_steps: 10
- flash_attention: true
- warmup_ratio: 0.1
- evals_per_epoch: 2
- saves_per_epoch: 1
- save_total_limit: 1
- weight_decay: 0.01
-
- wandb_project: math_operations_sft
- wandb_name: qwen3-4b-full-sft-50k-lr5e5
- wandb_log_model: "false"
-
- special_tokens:
-
- ```
-
- </details><br>
-
- # home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5
-
- This model was trained from scratch on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl dataset.
- It achieves the following results on the evaluation set:
- - Loss: 0.0001
- - Ppl: 1.0001
- - Memory/max Active (gib): 33.95
- - Memory/max Allocated (gib): 33.95
- - Memory/device Reserved (gib): 35.97
-
- ## Model description
-
- More information needed
-
- ## Intended uses & limitations
-
- More information needed
-
- ## Training and evaluation data
-
- More information needed
-
- ## Training procedure
-
- ### Training hyperparameters
-
- The following hyperparameters were used during training:
- - learning_rate: 5e-05
- - train_batch_size: 1
- - eval_batch_size: 1
- - seed: 42
- - gradient_accumulation_steps: 8
- - total_train_batch_size: 8
- - optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
- - lr_scheduler_type: cosine
- - lr_scheduler_warmup_steps: 312
- - training_steps: 3123
-
- ### Training results
-
- | Training Loss | Epoch | Step | Validation Loss | Ppl | Active (gib) | Allocated (gib) | Reserved (gib) |
- |:-------------:|:------:|:----:|:---------------:|:------:|:------------:|:---------------:|:--------------:|
- | No log | 0 | 0 | 0.8898 | 2.4345 | 10.41 | 10.41 | 10.64 |
- | 0.0029 | 0.5002 | 521 | 0.0023 | 1.0023 | 33.97 | 33.97 | 36.5 |
- | 0.0003 | 1.0 | 1042 | 0.0005 | 1.0005 | 33.95 | 33.95 | 35.97 |
- | 0.0003 | 1.5002 | 1563 | 0.0003 | 1.0003 | 33.95 | 33.95 | 35.97 |
- | 0.0002 | 2.0 | 2084 | 0.0001 | 1.0001 | 33.95 | 33.95 | 35.97 |
- | 0.0001 | 2.5002 | 2605 | 0.0001 | 1.0001 | 33.95 | 33.95 | 35.97 |
-
-
- ### Framework versions
-
- - Transformers 5.0.0
- - Pytorch 2.8.0+cu128
- - Datasets 4.5.0
- - Tokenizers 0.22.2

checkpoints/math_operations/full_sft_50k_lr5e5/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
- ' + message['content'] + '<|im_end|>' + '
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
- ' }}{% endif %}

checkpoints/math_operations/full_sft_50k_lr5e5/config.json DELETED
@@ -1,71 +0,0 @@
- {
- "architectures": [
- "Qwen3ForCausalLM"
- ],
- "attention_bias": false,
- "attention_dropout": 0.0,
- "bos_token_id": null,
- "dtype": "bfloat16",
- "eos_token_id": 151645,
- "head_dim": 128,
- "hidden_act": "silu",
- "hidden_size": 2560,
- "initializer_range": 0.02,
- "intermediate_size": 9728,
- "layer_types": [
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention",
- "full_attention"
- ],
- "max_position_embeddings": 262144,
- "max_window_layers": 36,
- "model_type": "qwen3",
- "num_attention_heads": 32,
- "num_hidden_layers": 36,
- "num_key_value_heads": 8,
- "pad_token_id": 151643,
- "rms_norm_eps": 1e-06,
- "rope_parameters": {
- "rope_theta": 5000000,
- "rope_type": "default"
- },
- "sliding_window": null,
- "tie_word_embeddings": true,
- "transformers_version": "5.0.0",
- "use_cache": false,
- "use_sliding_window": false,
- "vocab_size": 151936
- }

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_converted.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_results.csv DELETED
@@ -1,12 +0,0 @@
- category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
- math_operations,balanced_test_alpaca_results,200,198,99.00,200,100.00,2
- math_operations,balanced_test_alpaca_results,200,47,23.50,200,100.00,153
- math_operations,test_alpaca_results,200,0,0.00,196,98.00,200
- math_operations,op_A_test_alpaca_results,200,0,0.00,161,80.50,200
- math_operations,op_B_test_alpaca_results,200,2,1.00,190,95.00,198
- math_operations,op_C_test_alpaca_results,200,0,0.00,198,99.00,200
- math_operations,op_D_test_alpaca_results,200,2,1.00,174,87.00,198
- math_operations,op_E_test_alpaca_results,200,2,1.00,200,100.00,198
- math_operations,op_F_test_alpaca_results,200,1,0.50,198,99.00,199
- math_operations,op_G_test_alpaca_results,200,0,0.00,200,100.00,200
- math_operations,op_H_test_alpaca_results,200,0,0.00,200,100.00,200

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_summary.json DELETED
@@ -1,19 +0,0 @@
- {
- "overall": {
- "total": 200,
- "correct": 0,
- "accuracy": 0.0,
- "format_found": 200,
- "format_accuracy": 100.0
- },
- "per_operation": {
- "A": {
- "total": 200,
- "correct": 0,
- "accuracy": 0.0,
- "format_found": 200
- }
- },
- "n_errors": 200,
- "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_H_test_alpaca_results.jsonl"
- }

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_A_test_alpaca_converted.jsonl DELETED
The diff for this file is too large to render. See raw diff