Jerry999 committed on
Commit
a31e018
·
verified ·
1 Parent(s): 746214c

Delete folder checkpoints with huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/README.md +0 -225
  2. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/chat_template.jinja +0 -4
  3. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/config.json +0 -72
  4. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/eval_results.csv +0 -5
  5. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/eval_summary.json +0 -11
  6. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_5shot_inference_results.jsonl +0 -0
  7. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_inference_results.jsonl +0 -0
  8. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_subset_of_train_inference_results.jsonl +0 -0
  9. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_subset_of_train_paraphrased_inference_results.jsonl +0 -0
  10. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/generation_config.json +0 -12
  11. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/model.safetensors +0 -3
  12. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/tokenizer.json +0 -3
  13. checkpoints/knowledge/atomic_full_sft_50ep_t20260305/tokenizer_config.json +0 -239
  14. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/README.md +0 -159
  15. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/chat_template.jinja +0 -4
  16. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/config.json +0 -72
  17. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/eval_results.csv +0 -3
  18. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/eval_summary.json +0 -11
  19. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/test_subset_of_train_inference_results.jsonl +0 -0
  20. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/test_subset_of_train_paraphrased_inference_results.jsonl +0 -0
  21. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/eval_results.csv +0 -4
  22. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/eval_summary.json +0 -11
  23. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/test_5shot_inference_results.jsonl +0 -0
  24. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/test_inference_results.jsonl +0 -0
  25. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/test_subset_of_train_inference_results.jsonl +0 -0
  26. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/generation_config.json +0 -12
  27. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/model.safetensors +0 -3
  28. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/tokenizer.json +0 -3
  29. checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/tokenizer_config.json +0 -239
  30. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/README.md +0 -237
  31. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/adapter_config.json +0 -46
  32. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/adapter_model.safetensors +0 -3
  33. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/chat_template.jinja +0 -4
  34. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/config.json +0 -86
  35. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/eval_results.csv +0 -5
  36. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/eval_summary.json +0 -11
  37. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_5shot_inference_results.jsonl +0 -0
  38. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_inference_results.jsonl +0 -0
  39. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_subset_of_train_inference_results.jsonl +0 -0
  40. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_subset_of_train_paraphrased_inference_results.jsonl +0 -0
  41. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/added_tokens.json +0 -28
  42. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/chat_template.jinja +0 -61
  43. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/config.json +0 -68
  44. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/generation_config.json +0 -13
  45. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/merges.txt +0 -0
  46. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/model-00001-of-00002.safetensors +0 -3
  47. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/model-00002-of-00002.safetensors +0 -3
  48. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/model.safetensors.index.json +0 -406
  49. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/special_tokens_map.json +0 -31
  50. checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/tokenizer.json +0 -3
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/README.md DELETED
@@ -1,225 +0,0 @@
1
- ---
2
- library_name: transformers
3
- tags:
4
- - generated_from_trainer
5
- datasets:
6
- - /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
7
- model-index:
8
- - name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep_t20260305
9
- results: []
10
- ---
11
-
12
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
- should probably proofread and complete it, then remove this comment. -->
14
-
15
- [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
16
- <details><summary>See axolotl config</summary>
17
-
18
- axolotl version: `0.15.0.dev0`
19
- ```yaml
20
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
21
- load_in_8bit: false
22
- load_in_4bit: false
23
- strict: false
24
-
25
- datasets:
26
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
27
- type: chat_template
28
- dataset_prepared_path:
29
- val_set_size: 0
30
- chat_template: chatml
31
-
32
- test_datasets:
33
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/val_paraphrased_messages.jsonl
34
- type: chat_template
35
-
36
- output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep_t20260305
37
-
38
- sequence_len: 512
39
- sample_packing: true
40
- eval_sample_packing: true
41
-
42
- gradient_accumulation_steps: 8
43
- micro_batch_size: 1
44
- num_epochs: 50
45
- optimizer: adamw_torch_fused
46
- lr_scheduler: constant_with_warmup
47
- learning_rate: 1e-4
48
-
49
- bf16: auto
50
- tf32: true
51
-
52
- gradient_checkpointing: true
53
- gradient_checkpointing_kwargs:
54
- use_reentrant: false
55
-
56
- logging_steps: 10
57
- flash_attention: true
58
- warmup_ratio: 0.02
59
- evals_per_epoch: 2
60
- saves_per_epoch: 1
61
- save_total_limit: 3
62
- load_best_model_at_end: true
63
- weight_decay: 0.01
64
-
65
- wandb_project: knowledge_sft
66
- wandb_name: full-sft-atomic-t20260305
67
- wandb_log_model: "false"
68
-
69
- special_tokens:
70
-
71
- ```
72
-
73
- </details><br>
74
-
75
- # home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep_t20260305
76
-
77
- This model is a fine-tuned version of /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507 on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl dataset.
78
- It achieves the following results on the evaluation set:
79
- - Loss: 0.0003
80
- - Ppl: 1.0003
81
- - Memory/max Active (gib): 23.32
82
- - Memory/max Allocated (gib): 23.32
83
- - Memory/device Reserved (gib): 33.31
84
-
85
- ## Model description
86
-
87
- More information needed
88
-
89
- ## Intended uses & limitations
90
-
91
- More information needed
92
-
93
- ## Training and evaluation data
94
-
95
- More information needed
96
-
97
- ## Training procedure
98
-
99
- ### Training hyperparameters
100
-
101
- The following hyperparameters were used during training:
102
- - learning_rate: 0.0001
103
- - train_batch_size: 1
104
- - eval_batch_size: 1
105
- - seed: 42
106
- - gradient_accumulation_steps: 8
107
- - total_train_batch_size: 8
108
- - optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
109
- - lr_scheduler_type: constant_with_warmup
110
- - lr_scheduler_warmup_steps: 26
111
- - training_steps: 1300
112
-
113
- ### Training results
114
-
115
- | Training Loss | Epoch | Step | Validation Loss | Ppl | Active (gib) | Allocated (gib) | Reserved (gib) |
116
- |:-------------:|:-------:|:----:|:---------------:|:----------:|:------------:|:---------------:|:--------------:|
117
- | No log | 0 | 0 | 11.0500 | 62945.1072 | 8.23 | 8.23 | 8.94 |
118
- | 7.0494 | 0.4837 | 13 | 2.1581 | 8.6548 | 32.3 | 32.3 | 33.17 |
119
- | 2.0677 | 0.9674 | 26 | 1.0282 | 2.7959 | 32.29 | 32.29 | 33.31 |
120
- | 1.1577 | 1.4465 | 39 | 0.9829 | 2.6721 | 32.3 | 32.3 | 33.1 |
121
- | 0.9896 | 1.9302 | 52 | 0.8636 | 2.3716 | 32.29 | 32.29 | 33.31 |
122
- | 0.9107 | 2.4093 | 65 | 0.8443 | 2.3263 | 32.3 | 32.3 | 33.1 |
123
- | 0.8932 | 2.8930 | 78 | 0.8110 | 2.2501 | 32.29 | 32.29 | 33.31 |
124
- | 0.8151 | 3.3721 | 91 | 0.7195 | 2.0535 | 32.3 | 32.3 | 33.1 |
125
- | 0.8194 | 3.8558 | 104 | 0.6576 | 1.9302 | 32.29 | 32.29 | 33.22 |
126
- | 0.7820 | 4.3349 | 117 | 0.4008 | 1.493 | 32.3 | 32.3 | 33.1 |
127
- | 0.5420 | 4.8186 | 130 | 0.2968 | 1.3455 | 23.32 | 23.32 | 33.31 |
128
- | 0.3731 | 5.2977 | 143 | 0.1417 | 1.1522 | 32.3 | 32.3 | 33.1 |
129
- | 0.2062 | 5.7814 | 156 | 0.1033 | 1.1088 | 32.29 | 32.29 | 33.31 |
130
- | 0.2244 | 6.2605 | 169 | 0.1132 | 1.1199 | 32.3 | 32.3 | 33.1 |
131
- | 0.0950 | 6.7442 | 182 | 0.0687 | 1.0712 | 32.29 | 32.29 | 33.31 |
132
- | 0.1163 | 7.2233 | 195 | 0.0726 | 1.0753 | 32.3 | 32.3 | 33.1 |
133
- | 0.4996 | 7.7070 | 208 | 0.0478 | 1.0490 | 32.29 | 32.29 | 32.61 |
134
- | 0.0645 | 8.1860 | 221 | 0.0634 | 1.0654 | 32.3 | 32.3 | 33.1 |
135
- | 0.0649 | 8.6698 | 234 | 0.0273 | 1.0277 | 32.29 | 32.29 | 33.31 |
136
- | 0.0527 | 9.1488 | 247 | 0.0318 | 1.0323 | 32.31 | 32.31 | 33.1 |
137
- | 0.0456 | 9.6326 | 260 | 0.0321 | 1.0326 | 23.32 | 23.32 | 33.31 |
138
- | 0.0524 | 10.1116 | 273 | 0.0303 | 1.0308 | 32.3 | 32.3 | 33.1 |
139
- | 0.0315 | 10.5953 | 286 | 0.0437 | 1.0446 | 32.29 | 32.29 | 33.31 |
140
- | 0.0593 | 11.0744 | 299 | 0.0299 | 1.0303 | 32.3 | 32.3 | 33.1 |
141
- | 0.0585 | 11.5581 | 312 | 0.0368 | 1.0375 | 32.29 | 32.29 | 33.31 |
142
- | 0.0626 | 12.0372 | 325 | 0.0168 | 1.0170 | 32.3 | 32.3 | 33.1 |
143
- | 0.0275 | 12.5209 | 338 | 0.0124 | 1.0125 | 32.29 | 32.29 | 33.31 |
144
- | 0.0289 | 13.0 | 351 | 0.0166 | 1.0168 | 32.3 | 32.3 | 33.1 |
145
- | 0.0308 | 13.4837 | 364 | 0.0192 | 1.0194 | 32.29 | 32.29 | 33.31 |
146
- | 0.0263 | 13.9674 | 377 | 0.0074 | 1.0074 | 32.3 | 32.3 | 33.1 |
147
- | 0.0142 | 14.4465 | 390 | 0.0347 | 1.0353 | 23.32 | 23.32 | 33.31 |
148
- | 0.0266 | 14.9302 | 403 | 0.0114 | 1.0115 | 32.3 | 32.3 | 33.1 |
149
- | 0.0200 | 15.4093 | 416 | 0.0311 | 1.0315 | 32.29 | 32.29 | 33.31 |
150
- | 0.0198 | 15.8930 | 429 | 0.0414 | 1.0422 | 32.3 | 32.3 | 33.1 |
151
- | 0.0225 | 16.3721 | 442 | 0.0454 | 1.0465 | 32.29 | 32.29 | 33.31 |
152
- | 0.0309 | 16.8558 | 455 | 0.0268 | 1.0272 | 32.3 | 32.3 | 33.1 |
153
- | 0.0411 | 17.3349 | 468 | 0.0395 | 1.0403 | 32.29 | 32.29 | 33.31 |
154
- | 0.0275 | 17.8186 | 481 | 0.0716 | 1.0742 | 32.3 | 32.3 | 33.1 |
155
- | 0.0260 | 18.2977 | 494 | 0.0112 | 1.0113 | 32.29 | 32.29 | 33.31 |
156
- | 0.0177 | 18.7814 | 507 | 0.0131 | 1.0132 | 32.3 | 32.3 | 33.1 |
157
- | 0.0099 | 19.2605 | 520 | 0.0015 | 1.0015 | 23.32 | 23.32 | 33.31 |
158
- | 0.0037 | 19.7442 | 533 | 0.0014 | 1.0014 | 32.3 | 32.3 | 33.1 |
159
- | 0.0011 | 20.2233 | 546 | 0.0017 | 1.0017 | 32.29 | 32.29 | 33.31 |
160
- | 0.0005 | 20.7070 | 559 | 0.0014 | 1.0014 | 32.31 | 32.31 | 33.1 |
161
- | 0.0011 | 21.1860 | 572 | 0.0026 | 1.0026 | 32.29 | 32.29 | 33.31 |
162
- | 0.0021 | 21.6698 | 585 | 0.0016 | 1.0016 | 32.3 | 32.3 | 33.1 |
163
- | 0.0059 | 22.1488 | 598 | 0.0073 | 1.0073 | 32.29 | 32.29 | 33.31 |
164
- | 0.0020 | 22.6326 | 611 | 0.0004 | 1.0004 | 32.3 | 32.3 | 33.1 |
165
- | 0.0002 | 23.1116 | 624 | 0.0004 | 1.0004 | 32.29 | 32.29 | 33.31 |
166
- | 0.0002 | 23.5953 | 637 | 0.0004 | 1.0004 | 32.31 | 32.31 | 33.1 |
167
- | 0.0001 | 24.0744 | 650 | 0.0003 | 1.0003 | 23.32 | 23.32 | 33.31 |
168
- | 0.0001 | 24.5581 | 663 | 0.0003 | 1.0003 | 32.31 | 32.31 | 33.1 |
169
- | 0.0001 | 25.0372 | 676 | 0.0003 | 1.0003 | 32.29 | 32.29 | 32.61 |
170
- | 0.0001 | 25.5209 | 689 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
171
- | 0.0001 | 26.0 | 702 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
172
- | 0.0001 | 26.4837 | 715 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
173
- | 0.0001 | 26.9674 | 728 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
174
- | 0.0001 | 27.4465 | 741 | 0.0003 | 1.0003 | 32.31 | 32.31 | 33.1 |
175
- | 0.0001 | 27.9302 | 754 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
176
- | 0.0001 | 28.4093 | 767 | 0.0003 | 1.0003 | 32.31 | 32.31 | 33.1 |
177
- | 0.0001 | 28.8930 | 780 | 0.0003 | 1.0003 | 23.32 | 23.32 | 33.31 |
178
- | 0.0001 | 29.3721 | 793 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
179
- | 0.0001 | 29.8558 | 806 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
180
- | 0.0001 | 30.3349 | 819 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
181
- | 0.0001 | 30.8186 | 832 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
182
- | 0.0001 | 31.2977 | 845 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
183
- | 0.0001 | 31.7814 | 858 | 0.0003 | 1.0003 | 32.29 | 32.29 | 32.61 |
184
- | 0.0001 | 32.2605 | 871 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
185
- | 0.0001 | 32.7442 | 884 | 0.0003 | 1.0003 | 32.29 | 32.29 | 32.61 |
186
- | 0.0001 | 33.2233 | 897 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
187
- | 0.0001 | 33.7070 | 910 | 0.0003 | 1.0003 | 23.32 | 23.32 | 33.31 |
188
- | 0.0001 | 34.1860 | 923 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
189
- | 0.0001 | 34.6698 | 936 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
190
- | 0.0001 | 35.1488 | 949 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
191
- | 0.0001 | 35.6326 | 962 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
192
- | 0.0001 | 36.1116 | 975 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
193
- | 0.0001 | 36.5953 | 988 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
194
- | 0.0001 | 37.0744 | 1001 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
195
- | 0.0001 | 37.5581 | 1014 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
196
- | 0.0001 | 38.0372 | 1027 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
197
- | 0.0001 | 38.5209 | 1040 | 0.0003 | 1.0003 | 23.32 | 23.32 | 33.31 |
198
- | 0.0001 | 39.0 | 1053 | 0.0003 | 1.0003 | 32.31 | 32.31 | 33.1 |
199
- | 0.0001 | 39.4837 | 1066 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
200
- | 0.0001 | 39.9674 | 1079 | 0.0003 | 1.0003 | 32.31 | 32.31 | 33.1 |
201
- | 0.0001 | 40.4465 | 1092 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
202
- | 0.0001 | 40.9302 | 1105 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
203
- | 0.0001 | 41.4093 | 1118 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
204
- | 0.0001 | 41.8930 | 1131 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
205
- | 0.0001 | 42.3721 | 1144 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
206
- | 0.0001 | 42.8558 | 1157 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
207
- | 0.0001 | 43.3349 | 1170 | 0.0003 | 1.0003 | 23.32 | 23.32 | 33.31 |
208
- | 0.0001 | 43.8186 | 1183 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
209
- | 0.0001 | 44.2977 | 1196 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
210
- | 0.0001 | 44.7814 | 1209 | 0.0003 | 1.0003 | 32.31 | 32.31 | 33.1 |
211
- | 0.0001 | 45.2605 | 1222 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
212
- | 0.0001 | 45.7442 | 1235 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
213
- | 0.0001 | 46.2233 | 1248 | 0.0003 | 1.0003 | 32.29 | 32.29 | 33.31 |
214
- | 0.0001 | 46.7070 | 1261 | 0.0003 | 1.0003 | 32.3 | 32.3 | 33.1 |
215
- | 0.0001 | 47.1860 | 1274 | 0.0003 | 1.0003 | 32.29 | 32.29 | 32.61 |
216
- | 0.0001 | 47.6698 | 1287 | 0.0003 | 1.0003 | 32.31 | 32.31 | 33.1 |
217
- | 0.0001 | 48.1488 | 1300 | 0.0003 | 1.0003 | 23.32 | 23.32 | 33.31 |
218
-
219
-
220
- ### Framework versions
221
-
222
- - Transformers 5.0.0
223
- - Pytorch 2.8.0+cu128
224
- - Datasets 4.5.0
225
- - Tokenizers 0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
1
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
- ' + message['content'] + '<|im_end|>' + '
3
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
- ' }}{% endif %}
 
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/config.json DELETED
@@ -1,72 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151643,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "pad_token_id": 151643,
60
- "rms_norm_eps": 1e-06,
61
- "rope_parameters": {
62
- "rope_theta": 5000000,
63
- "rope_type": "default"
64
- },
65
- "sliding_window": null,
66
- "tie_word_embeddings": true,
67
- "transformers_version": "5.0.0",
68
- "use_cache": true,
69
- "use_sliding_window": false,
70
- "vocab_size": 151936,
71
- "rope_theta": 5000000
72
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/eval_results.csv DELETED
@@ -1,5 +0,0 @@
1
- category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2
- knowledge,test_subset_of_train_inference_results,450,448,99.56,0,0.00,2
3
- knowledge,test_subset_of_train_paraphrased_inference_results,450,448,99.56,0,0.00,2
4
- knowledge,test_inference_results,499,10,2.00,0,0.00,489
5
- knowledge,test_5shot_inference_results,499,12,2.40,0,0.00,487
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/eval_summary.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "overall": {
3
- "total": 499,
4
- "correct": 12,
5
- "accuracy": 2.4,
6
- "format_found": 0,
7
- "format_accuracy": 0.0
8
- },
9
- "n_errors": 487,
10
- "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_5shot_inference_results.jsonl"
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_5shot_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_subset_of_train_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/eval_results/test_subset_of_train_paraphrased_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/generation_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "do_sample": true,
3
- "eos_token_id": [
4
- 151645,
5
- 151643
6
- ],
7
- "pad_token_id": 151643,
8
- "temperature": 0.7,
9
- "top_k": 20,
10
- "top_p": 0.8,
11
- "transformers_version": "5.0.0"
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9f30d6a95726d71bddec43d65c6f7e3bf2c1ec51c4c3a716b1fcfca2aebbeff
3
- size 8044982080
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/tokenizer.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
- size 11422650
 
 
 
 
checkpoints/knowledge/atomic_full_sft_50ep_t20260305/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "151643": {
5
- "content": "<|endoftext|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "151644": {
13
- "content": "<|im_start|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "151645": {
21
- "content": "<|im_end|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "151646": {
29
- "content": "<|object_ref_start|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "151647": {
37
- "content": "<|object_ref_end|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": false,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "151648": {
45
- "content": "<|box_start|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": false,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "151649": {
53
- "content": "<|box_end|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": false,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "151650": {
61
- "content": "<|quad_start|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": false,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "151651": {
69
- "content": "<|quad_end|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": false,
73
- "single_word": false,
74
- "special": true
75
- },
76
- "151652": {
77
- "content": "<|vision_start|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": false,
81
- "single_word": false,
82
- "special": true
83
- },
84
- "151653": {
85
- "content": "<|vision_end|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": false,
89
- "single_word": false,
90
- "special": true
91
- },
92
- "151654": {
93
- "content": "<|vision_pad|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": false,
97
- "single_word": false,
98
- "special": true
99
- },
100
- "151655": {
101
- "content": "<|image_pad|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": false,
105
- "single_word": false,
106
- "special": true
107
- },
108
- "151656": {
109
- "content": "<|video_pad|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": false,
113
- "single_word": false,
114
- "special": true
115
- },
116
- "151657": {
117
- "content": "<tool_call>",
118
- "lstrip": false,
119
- "normalized": false,
120
- "rstrip": false,
121
- "single_word": false,
122
- "special": false
123
- },
124
- "151658": {
125
- "content": "</tool_call>",
126
- "lstrip": false,
127
- "normalized": false,
128
- "rstrip": false,
129
- "single_word": false,
130
- "special": false
131
- },
132
- "151659": {
133
- "content": "<|fim_prefix|>",
134
- "lstrip": false,
135
- "normalized": false,
136
- "rstrip": false,
137
- "single_word": false,
138
- "special": false
139
- },
140
- "151660": {
141
- "content": "<|fim_middle|>",
142
- "lstrip": false,
143
- "normalized": false,
144
- "rstrip": false,
145
- "single_word": false,
146
- "special": false
147
- },
148
- "151661": {
149
- "content": "<|fim_suffix|>",
150
- "lstrip": false,
151
- "normalized": false,
152
- "rstrip": false,
153
- "single_word": false,
154
- "special": false
155
- },
156
- "151662": {
157
- "content": "<|fim_pad|>",
158
- "lstrip": false,
159
- "normalized": false,
160
- "rstrip": false,
161
- "single_word": false,
162
- "special": false
163
- },
164
- "151663": {
165
- "content": "<|repo_name|>",
166
- "lstrip": false,
167
- "normalized": false,
168
- "rstrip": false,
169
- "single_word": false,
170
- "special": false
171
- },
172
- "151664": {
173
- "content": "<|file_sep|>",
174
- "lstrip": false,
175
- "normalized": false,
176
- "rstrip": false,
177
- "single_word": false,
178
- "special": false
179
- },
180
- "151665": {
181
- "content": "<tool_response>",
182
- "lstrip": false,
183
- "normalized": false,
184
- "rstrip": false,
185
- "single_word": false,
186
- "special": false
187
- },
188
- "151666": {
189
- "content": "</tool_response>",
190
- "lstrip": false,
191
- "normalized": false,
192
- "rstrip": false,
193
- "single_word": false,
194
- "special": false
195
- },
196
- "151667": {
197
- "content": "<think>",
198
- "lstrip": false,
199
- "normalized": false,
200
- "rstrip": false,
201
- "single_word": false,
202
- "special": false
203
- },
204
- "151668": {
205
- "content": "</think>",
206
- "lstrip": false,
207
- "normalized": false,
208
- "rstrip": false,
209
- "single_word": false,
210
- "special": false
211
- }
212
- },
213
- "additional_special_tokens": [
214
- "<|im_start|>",
215
- "<|im_end|>",
216
- "<|object_ref_start|>",
217
- "<|object_ref_end|>",
218
- "<|box_start|>",
219
- "<|box_end|>",
220
- "<|quad_start|>",
221
- "<|quad_end|>",
222
- "<|vision_start|>",
223
- "<|vision_end|>",
224
- "<|vision_pad|>",
225
- "<|image_pad|>",
226
- "<|video_pad|>"
227
- ],
228
- "bos_token": null,
229
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- 
endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}",
230
- "clean_up_tokenization_spaces": false,
231
- "eos_token": "<|im_end|>",
232
- "errors": "replace",
233
- "model_max_length": 1010000,
234
- "pad_token": "<|endoftext|>",
235
- "split_special_tokens": false,
236
- "tokenizer_class": "Qwen2Tokenizer",
237
- "unk_token": null,
238
- "add_bos_token": false
239
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/README.md DELETED
@@ -1,159 +0,0 @@
1
- ---
2
- library_name: transformers
3
- tags:
4
- - generated_from_trainer
5
- datasets:
6
- - /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/compositional_2step_sft/train_messages.jsonl
7
- model-index:
8
- - name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305
9
- results: []
10
- ---
11
-
12
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
- should probably proofread and complete it, then remove this comment. -->
14
-
15
- [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
16
- <details><summary>See axolotl config</summary>
17
-
18
- axolotl version: `0.15.0.dev0`
19
- ```yaml
20
- # Qwen3-4B full fine-tuning SFT on 2-step compositional CoT data
21
- # Starts from the atomic full SFT checkpoint (will be overridden by sbatch script)
22
-
23
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep_t20260305/checkpoint-1300
24
-
25
- # Model loading (full precision, no quantization, no adapter)
26
- load_in_8bit: false
27
- load_in_4bit: false
28
- strict: false
29
-
30
- # Training dataset (5002 2-step compositional QA pairs with CoT)
31
- datasets:
32
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/compositional_2step_sft/train_messages.jsonl
33
- type: chat_template
34
- dataset_prepared_path:
35
- val_set_size: 0
36
- chat_template: chatml
37
-
38
- # Validation dataset
39
- test_datasets:
40
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/compositional_2step_sft/val_messages.jsonl
41
- type: chat_template
42
-
43
- output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305
44
-
45
- # Sequence settings (longer for CoT reasoning)
46
- sequence_len: 1024
47
- sample_packing: true
48
- eval_sample_packing: true
49
-
50
- # No adapter — full fine-tuning (all parameters trainable)
51
-
52
- # Training hyperparameters
53
- gradient_accumulation_steps: 8
54
- micro_batch_size: 1
55
- num_epochs: 10
56
- optimizer: adamw_torch_fused
57
- lr_scheduler: cosine
58
- learning_rate: 2e-5
59
-
60
- # Precision
61
- bf16: auto
62
- tf32: true
63
-
64
- # Memory optimization
65
- gradient_checkpointing: true
66
- gradient_checkpointing_kwargs:
67
- use_reentrant: false
68
-
69
- # Logging and saving
70
- logging_steps: 10
71
- flash_attention: true
72
- warmup_ratio: 0.1
73
- evals_per_epoch: 2
74
- saves_per_epoch: 1
75
- save_total_limit: 3
76
- load_best_model_at_end: true
77
- weight_decay: 0.01
78
-
79
- # Wandb logging
80
- wandb_project: knowledge_sft
81
- wandb_name: full-sft-compositional-on-atomic-t20260305
82
- wandb_log_model: "false"
83
-
84
- special_tokens:
85
-
86
- ```
87
-
88
- </details><br>
89
-
90
- # home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305
91
-
92
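- This model is a full-parameter fine-tune of the atomic full-SFT checkpoint (/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_sft_50ep_t20260305/checkpoint-1300, the `base_model` in the config above) on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/compositional_2step_sft/train_messages.jsonl dataset; it was not trained from scratch.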
93
- It achieves the following results on the evaluation set:
94
- - Loss: 0.0052
95
- - Ppl: 1.0052
96
- - Memory/max Active (gib): 32.31
97
- - Memory/max Allocated (gib): 32.31
98
- - Memory/device Reserved (gib): 33.42
99
-
100
- ## Model description
101
-
102
- More information needed
103
-
104
- ## Intended uses & limitations
105
-
106
- More information needed
107
-
108
- ## Training and evaluation data
109
-
110
- More information needed
111
-
112
- ## Training procedure
113
-
114
- ### Training hyperparameters
115
-
116
- The following hyperparameters were used during training:
117
- - learning_rate: 2e-05
118
- - train_batch_size: 1
119
- - eval_batch_size: 1
120
- - seed: 42
121
- - gradient_accumulation_steps: 8
122
- - total_train_batch_size: 8
123
- - optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
124
- - lr_scheduler_type: cosine
125
- - lr_scheduler_warmup_steps: 71
126
- - training_steps: 710
127
-
128
- ### Training results
129
-
130
- | Training Loss | Epoch | Step | Validation Loss | Ppl | Active (gib) | Allocated (gib) | Reserved (gib) |
131
- |:-------------:|:------:|:----:|:---------------:|:--------:|:------------:|:---------------:|:--------------:|
132
- | No log | 0 | 0 | 6.3406 | 567.1246 | 8.96 | 8.96 | 9.68 |
133
- | 3.8203 | 0.5070 | 36 | 1.0917 | 2.9793 | 32.32 | 32.32 | 33.61 |
134
- | 0.0694 | 1.0141 | 72 | 0.0408 | 1.0416 | 32.31 | 32.31 | 33.42 |
135
- | 0.0145 | 1.5211 | 108 | 0.0113 | 1.0114 | 32.31 | 32.31 | 33.42 |
136
- | 0.0069 | 2.0282 | 144 | 0.0067 | 1.0068 | 32.31 | 32.31 | 33.38 |
137
- | 0.0017 | 2.5352 | 180 | 0.0060 | 1.0060 | 24.06 | 24.06 | 33.38 |
138
- | 0.0011 | 3.0423 | 216 | 0.0054 | 1.0054 | 32.31 | 32.31 | 33.42 |
139
- | 0.0007 | 3.5493 | 252 | 0.0053 | 1.0053 | 32.31 | 32.31 | 33.38 |
140
- | 0.0007 | 4.0563 | 288 | 0.0052 | 1.0053 | 32.31 | 32.31 | 33.42 |
141
- | 0.0005 | 4.5634 | 324 | 0.0053 | 1.0053 | 32.31 | 32.31 | 33.42 |
142
- | 0.0006 | 5.0704 | 360 | 0.0053 | 1.0053 | 24.06 | 24.06 | 33.42 |
143
- | 0.0006 | 5.5775 | 396 | 0.0052 | 1.0052 | 32.31 | 32.31 | 33.38 |
144
- | 0.0005 | 6.0845 | 432 | 0.0052 | 1.0052 | 32.31 | 32.31 | 33.42 |
145
- | 0.0005 | 6.5915 | 468 | 0.0052 | 1.0052 | 32.31 | 32.31 | 33.38 |
146
- | 0.0005 | 7.0986 | 504 | 0.0052 | 1.0052 | 32.31 | 32.31 | 33.42 |
147
- | 0.0005 | 7.6056 | 540 | 0.0052 | 1.0052 | 24.06 | 24.06 | 33.38 |
148
- | 0.0005 | 8.1127 | 576 | 0.0052 | 1.0053 | 32.31 | 32.31 | 33.42 |
149
- | 0.0005 | 8.6197 | 612 | 0.0052 | 1.0052 | 32.31 | 32.31 | 33.42 |
150
- | 0.0005 | 9.1268 | 648 | 0.0052 | 1.0052 | 32.31 | 32.31 | 33.38 |
151
- | 0.0006 | 9.6338 | 684 | 0.0052 | 1.0052 | 32.31 | 32.31 | 33.42 |
152
-
153
-
154
- ### Framework versions
155
-
156
- - Transformers 5.0.0
157
- - Pytorch 2.8.0+cu128
158
- - Datasets 4.5.0
159
- - Tokenizers 0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
1
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
- ' + message['content'] + '<|im_end|>' + '
3
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
- ' }}{% endif %}
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/config.json DELETED
@@ -1,72 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151643,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "pad_token_id": 151643,
60
- "rms_norm_eps": 1e-06,
61
- "rope_parameters": {
62
- "rope_theta": 5000000,
63
- "rope_type": "default"
64
- },
65
- "sliding_window": null,
66
- "tie_word_embeddings": true,
67
- "transformers_version": "5.0.0",
68
- "use_cache": true,
69
- "use_sliding_window": false,
70
- "vocab_size": 151936,
71
- "rope_theta": 5000000
72
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/eval_results.csv DELETED
@@ -1,3 +0,0 @@
1
- category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2
- knowledge,test_subset_of_train_inference_results,450,448,99.56,0,0.00,2
3
- knowledge,test_subset_of_train_paraphrased_inference_results,450,444,98.67,0,0.00,6
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/eval_summary.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "overall": {
3
- "total": 450,
4
- "correct": 444,
5
- "accuracy": 98.67,
6
- "format_found": 0,
7
- "format_accuracy": 0.0
8
- },
9
- "n_errors": 6,
10
- "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/test_subset_of_train_paraphrased_inference_results.jsonl"
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/test_subset_of_train_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/atomic/test_subset_of_train_paraphrased_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/eval_results.csv DELETED
@@ -1,4 +0,0 @@
1
- category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2
- knowledge,test_subset_of_train_inference_results,500,500,100.00,500,100.00,0
3
- knowledge,test_inference_results,499,487,97.60,499,100.00,12
4
- knowledge,test_5shot_inference_results,499,63,12.63,499,100.00,436
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/eval_summary.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "overall": {
3
- "total": 499,
4
- "correct": 63,
5
- "accuracy": 12.63,
6
- "format_found": 499,
7
- "format_accuracy": 100.0
8
- },
9
- "n_errors": 436,
10
- "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/test_5shot_inference_results.jsonl"
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/test_5shot_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/test_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/eval_results/compositional_2step/test_subset_of_train_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/generation_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "do_sample": true,
3
- "eos_token_id": [
4
- 151645,
5
- 151643
6
- ],
7
- "pad_token_id": 151643,
8
- "temperature": 0.7,
9
- "top_k": 20,
10
- "top_p": 0.8,
11
- "transformers_version": "5.0.0"
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c5fac0a245d42a4d3fb022ef1f30b7f63ea902bbb54af23095a589543718eba
3
- size 8044982080
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/tokenizer.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
- size 11422650
 
 
 
 
checkpoints/knowledge/atomic_full_then_2step_full_sft_t20260305/tokenizer_config.json DELETED
@@ -1,239 +0,0 @@
1
- {
2
- "add_prefix_space": false,
3
- "added_tokens_decoder": {
4
- "151643": {
5
- "content": "<|endoftext|>",
6
- "lstrip": false,
7
- "normalized": false,
8
- "rstrip": false,
9
- "single_word": false,
10
- "special": true
11
- },
12
- "151644": {
13
- "content": "<|im_start|>",
14
- "lstrip": false,
15
- "normalized": false,
16
- "rstrip": false,
17
- "single_word": false,
18
- "special": true
19
- },
20
- "151645": {
21
- "content": "<|im_end|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false,
26
- "special": true
27
- },
28
- "151646": {
29
- "content": "<|object_ref_start|>",
30
- "lstrip": false,
31
- "normalized": false,
32
- "rstrip": false,
33
- "single_word": false,
34
- "special": true
35
- },
36
- "151647": {
37
- "content": "<|object_ref_end|>",
38
- "lstrip": false,
39
- "normalized": false,
40
- "rstrip": false,
41
- "single_word": false,
42
- "special": true
43
- },
44
- "151648": {
45
- "content": "<|box_start|>",
46
- "lstrip": false,
47
- "normalized": false,
48
- "rstrip": false,
49
- "single_word": false,
50
- "special": true
51
- },
52
- "151649": {
53
- "content": "<|box_end|>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": false,
57
- "single_word": false,
58
- "special": true
59
- },
60
- "151650": {
61
- "content": "<|quad_start|>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": false,
65
- "single_word": false,
66
- "special": true
67
- },
68
- "151651": {
69
- "content": "<|quad_end|>",
70
- "lstrip": false,
71
- "normalized": false,
72
- "rstrip": false,
73
- "single_word": false,
74
- "special": true
75
- },
76
- "151652": {
77
- "content": "<|vision_start|>",
78
- "lstrip": false,
79
- "normalized": false,
80
- "rstrip": false,
81
- "single_word": false,
82
- "special": true
83
- },
84
- "151653": {
85
- "content": "<|vision_end|>",
86
- "lstrip": false,
87
- "normalized": false,
88
- "rstrip": false,
89
- "single_word": false,
90
- "special": true
91
- },
92
- "151654": {
93
- "content": "<|vision_pad|>",
94
- "lstrip": false,
95
- "normalized": false,
96
- "rstrip": false,
97
- "single_word": false,
98
- "special": true
99
- },
100
- "151655": {
101
- "content": "<|image_pad|>",
102
- "lstrip": false,
103
- "normalized": false,
104
- "rstrip": false,
105
- "single_word": false,
106
- "special": true
107
- },
108
- "151656": {
109
- "content": "<|video_pad|>",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": false,
113
- "single_word": false,
114
- "special": true
115
- },
116
- "151657": {
117
- "content": "<tool_call>",
118
- "lstrip": false,
119
- "normalized": false,
120
- "rstrip": false,
121
- "single_word": false,
122
- "special": false
123
- },
124
- "151658": {
125
- "content": "</tool_call>",
126
- "lstrip": false,
127
- "normalized": false,
128
- "rstrip": false,
129
- "single_word": false,
130
- "special": false
131
- },
132
- "151659": {
133
- "content": "<|fim_prefix|>",
134
- "lstrip": false,
135
- "normalized": false,
136
- "rstrip": false,
137
- "single_word": false,
138
- "special": false
139
- },
140
- "151660": {
141
- "content": "<|fim_middle|>",
142
- "lstrip": false,
143
- "normalized": false,
144
- "rstrip": false,
145
- "single_word": false,
146
- "special": false
147
- },
148
- "151661": {
149
- "content": "<|fim_suffix|>",
150
- "lstrip": false,
151
- "normalized": false,
152
- "rstrip": false,
153
- "single_word": false,
154
- "special": false
155
- },
156
- "151662": {
157
- "content": "<|fim_pad|>",
158
- "lstrip": false,
159
- "normalized": false,
160
- "rstrip": false,
161
- "single_word": false,
162
- "special": false
163
- },
164
- "151663": {
165
- "content": "<|repo_name|>",
166
- "lstrip": false,
167
- "normalized": false,
168
- "rstrip": false,
169
- "single_word": false,
170
- "special": false
171
- },
172
- "151664": {
173
- "content": "<|file_sep|>",
174
- "lstrip": false,
175
- "normalized": false,
176
- "rstrip": false,
177
- "single_word": false,
178
- "special": false
179
- },
180
- "151665": {
181
- "content": "<tool_response>",
182
- "lstrip": false,
183
- "normalized": false,
184
- "rstrip": false,
185
- "single_word": false,
186
- "special": false
187
- },
188
- "151666": {
189
- "content": "</tool_response>",
190
- "lstrip": false,
191
- "normalized": false,
192
- "rstrip": false,
193
- "single_word": false,
194
- "special": false
195
- },
196
- "151667": {
197
- "content": "<think>",
198
- "lstrip": false,
199
- "normalized": false,
200
- "rstrip": false,
201
- "single_word": false,
202
- "special": false
203
- },
204
- "151668": {
205
- "content": "</think>",
206
- "lstrip": false,
207
- "normalized": false,
208
- "rstrip": false,
209
- "single_word": false,
210
- "special": false
211
- }
212
- },
213
- "additional_special_tokens": [
214
- "<|im_start|>",
215
- "<|im_end|>",
216
- "<|object_ref_start|>",
217
- "<|object_ref_end|>",
218
- "<|box_start|>",
219
- "<|box_end|>",
220
- "<|quad_start|>",
221
- "<|quad_end|>",
222
- "<|vision_start|>",
223
- "<|vision_end|>",
224
- "<|vision_pad|>",
225
- "<|image_pad|>",
226
- "<|video_pad|>"
227
- ],
228
- "bos_token": null,
229
- "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- 
endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}",
230
- "clean_up_tokenization_spaces": false,
231
- "eos_token": "<|im_end|>",
232
- "errors": "replace",
233
- "model_max_length": 1010000,
234
- "pad_token": "<|endoftext|>",
235
- "split_special_tokens": false,
236
- "tokenizer_class": "Qwen2Tokenizer",
237
- "unk_token": null,
238
- "add_bos_token": false
239
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/README.md DELETED
@@ -1,237 +0,0 @@
1
- ---
2
- library_name: peft
3
- tags:
4
- - axolotl
5
- - base_model:adapter:/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
6
- - lora
7
- - transformers
8
- datasets:
9
- - /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
10
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
11
- pipeline_tag: text-generation
12
- model-index:
13
- - name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep_t20260305
14
- results: []
15
- ---
16
-
17
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
- should probably proofread and complete it, then remove this comment. -->
19
-
20
- [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
21
- <details><summary>See axolotl config</summary>
22
-
23
- axolotl version: `0.15.0.dev0`
24
- ```yaml
25
- base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
26
- load_in_8bit: false
27
- load_in_4bit: true
28
- strict: false
29
-
30
- datasets:
31
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl
32
- type: chat_template
33
- dataset_prepared_path:
34
- val_set_size: 0
35
- chat_template: chatml
36
-
37
- test_datasets:
38
- - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/val_paraphrased_messages.jsonl
39
- type: chat_template
40
-
41
- output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep_t20260305
42
-
43
- sequence_len: 512
44
- sample_packing: true
45
- eval_sample_packing: true
46
-
47
- adapter: qlora
48
- lora_r: 64
49
- lora_alpha: 128
50
- lora_dropout: 0.0
51
- lora_target_linear: true
52
-
53
- gradient_accumulation_steps: 4
54
- micro_batch_size: 2
55
- num_epochs: 50
56
- optimizer: adamw_torch_fused
57
- lr_scheduler: cosine
58
- learning_rate: 5e-4
59
-
60
- bf16: auto
61
- tf32: true
62
-
63
- gradient_checkpointing: true
64
- gradient_checkpointing_kwargs:
65
- use_reentrant: false
66
-
67
- logging_steps: 10
68
- flash_attention: true
69
- warmup_ratio: 0.02
70
- evals_per_epoch: 2
71
- saves_per_epoch: 1
72
- save_total_limit: 3
73
- load_best_model_at_end: true
74
- weight_decay: 0.01
75
-
76
- wandb_project: knowledge_sft
77
- wandb_name: lora-atomic-t20260305
78
- wandb_log_model: "false"
79
-
80
- special_tokens:
81
-
82
- ```
83
-
84
- </details><br>
85
-
86
- # home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep_t20260305
87
-
88
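- This model is a QLoRA adapter fine-tuned from /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507 (the `base_model` in the config above) on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/knowledge/atomic_sft/train_messages.jsonl dataset; it was not trained from scratch.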
89
- It achieves the following results on the evaluation set:
90
- - Loss: 0.0001
91
- - Ppl: 1.0001
92
- - Memory/max Active (gib): 5.47
93
- - Memory/max Allocated (gib): 5.47
94
- - Memory/device Reserved (gib): 6.7
95
-
96
- ## Model description
97
-
98
- More information needed
99
-
100
- ## Intended uses & limitations
101
-
102
- More information needed
103
-
104
- ## Training and evaluation data
105
-
106
- More information needed
107
-
108
- ## Training procedure
109
-
110
- ### Training hyperparameters
111
-
112
- The following hyperparameters were used during training:
113
- - learning_rate: 0.0005
114
- - train_batch_size: 2
115
- - eval_batch_size: 2
116
- - seed: 42
117
- - gradient_accumulation_steps: 4
118
- - total_train_batch_size: 8
119
- - optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
120
- - lr_scheduler_type: cosine
121
- - lr_scheduler_warmup_steps: 26
122
- - training_steps: 1300
123
-
124
- ### Training results
125
-
126
- | Training Loss | Epoch | Step | Validation Loss | Ppl | Active (gib) | Allocated (gib) | Reserved (gib) |
127
- |:-------------:|:-------:|:----:|:---------------:|:-----------:|:------------:|:---------------:|:--------------:|
128
- | No log | 0 | 0 | 11.5764 | 106548.9464 | 4.47 | 4.47 | 8.97 |
129
- | 7.2866 | 0.4860 | 13 | 2.8221 | 16.8118 | 6.45 | 6.45 | 6.7 |
130
- | 2.6126 | 0.9720 | 26 | 1.1444 | 3.1405 | 6.45 | 6.45 | 6.7 |
131
- | 1.3757 | 1.4486 | 39 | 0.9320 | 2.5397 | 6.45 | 6.45 | 8.19 |
132
- | 1.0069 | 1.9346 | 52 | 0.8724 | 2.3927 | 6.45 | 6.45 | 6.7 |
133
- | 0.9308 | 2.4112 | 65 | 0.8703 | 2.3877 | 6.45 | 6.45 | 8.19 |
134
- | 0.9039 | 2.8972 | 78 | 0.8341 | 2.3028 | 6.45 | 6.45 | 6.7 |
135
- | 0.8535 | 3.3738 | 91 | 0.8056 | 2.2381 | 6.45 | 6.45 | 8.19 |
136
- | 0.8485 | 3.8598 | 104 | 0.7935 | 2.2112 | 6.45 | 6.45 | 6.7 |
137
- | 0.8518 | 4.3364 | 117 | 0.7475 | 2.1118 | 6.45 | 6.45 | 8.19 |
138
- | 0.8252 | 4.8224 | 130 | 0.7261 | 2.0670 | 5.47 | 5.47 | 6.7 |
139
- | 0.7677 | 5.2991 | 143 | 0.5942 | 1.8116 | 6.45 | 6.45 | 8.19 |
140
- | 0.6715 | 5.7850 | 156 | 0.5665 | 1.7620 | 6.45 | 6.45 | 6.7 |
141
- | 0.7201 | 6.2617 | 169 | 0.3298 | 1.3907 | 6.45 | 6.45 | 8.19 |
142
- | 0.4599 | 6.7477 | 182 | 0.3798 | 1.4619 | 6.45 | 6.45 | 6.7 |
143
- | 0.5207 | 7.2243 | 195 | 0.1645 | 1.1788 | 6.45 | 6.45 | 8.19 |
144
- | 0.2006 | 7.7103 | 208 | 0.1505 | 1.1624 | 6.45 | 6.45 | 6.7 |
145
- | 0.1855 | 8.1869 | 221 | 0.0746 | 1.0775 | 6.45 | 6.45 | 8.19 |
146
- | 0.1124 | 8.6729 | 234 | 0.0545 | 1.0560 | 6.45 | 6.45 | 6.7 |
147
- | 0.1329 | 9.1495 | 247 | 0.0599 | 1.0617 | 6.45 | 6.45 | 8.19 |
148
- | 0.0721 | 9.6355 | 260 | 0.0632 | 1.0652 | 5.47 | 5.47 | 6.7 |
149
- | 0.0891 | 10.1121 | 273 | 0.0525 | 1.0539 | 6.45 | 6.45 | 8.19 |
150
- | 0.0445 | 10.5981 | 286 | 0.0242 | 1.0245 | 6.45 | 6.45 | 6.7 |
151
- | 0.0559 | 11.0748 | 299 | 0.0398 | 1.0406 | 6.45 | 6.45 | 8.19 |
152
- | 0.0388 | 11.5607 | 312 | 0.0243 | 1.0246 | 6.45 | 6.45 | 6.7 |
153
- | 0.0374 | 12.0374 | 325 | 0.0142 | 1.0143 | 6.45 | 6.45 | 8.19 |
154
- | 0.0223 | 12.5234 | 338 | 0.0126 | 1.0127 | 6.45 | 6.45 | 6.7 |
155
- | 0.0198 | 13.0 | 351 | 0.0127 | 1.0128 | 6.45 | 6.45 | 8.19 |
156
- | 0.0115 | 13.4860 | 364 | 0.0447 | 1.0457 | 6.45 | 6.45 | 6.7 |
157
- | 0.0101 | 13.9720 | 377 | 0.0032 | 1.0032 | 6.45 | 6.45 | 8.19 |
158
- | 0.0020 | 14.4486 | 390 | 0.0039 | 1.0039 | 5.47 | 5.47 | 6.7 |
159
- | 0.0031 | 14.9346 | 403 | 0.0032 | 1.0032 | 6.45 | 6.45 | 8.19 |
160
- | 0.0026 | 15.4112 | 416 | 0.0007 | 1.0007 | 6.45 | 6.45 | 6.7 |
161
- | 0.0025 | 15.8972 | 429 | 0.0005 | 1.0005 | 6.45 | 6.45 | 8.19 |
162
- | 0.0037 | 16.3738 | 442 | 0.0008 | 1.0008 | 6.45 | 6.45 | 6.7 |
163
- | 0.0023 | 16.8598 | 455 | 0.0007 | 1.0007 | 6.45 | 6.45 | 8.19 |
164
- | 0.0010 | 17.3364 | 468 | 0.0005 | 1.0005 | 6.45 | 6.45 | 6.7 |
165
- | 0.0005 | 17.8224 | 481 | 0.0004 | 1.0004 | 6.45 | 6.45 | 8.19 |
166
- | 0.0006 | 18.2991 | 494 | 0.0003 | 1.0003 | 6.45 | 6.45 | 6.7 |
167
- | 0.0002 | 18.7850 | 507 | 0.0002 | 1.0002 | 6.45 | 6.45 | 8.19 |
168
- | 0.0002 | 19.2617 | 520 | 0.0002 | 1.0002 | 5.47 | 5.47 | 6.7 |
169
- | 0.0001 | 19.7477 | 533 | 0.0002 | 1.0002 | 6.45 | 6.45 | 8.19 |
170
- | 0.0001 | 20.2243 | 546 | 0.0001 | 1.0002 | 6.45 | 6.45 | 6.7 |
171
- | 0.0001 | 20.7103 | 559 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
172
- | 0.0001 | 21.1869 | 572 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
173
- | 0.0001 | 21.6729 | 585 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
174
- | 0.0001 | 22.1495 | 598 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
175
- | 0.0001 | 22.6355 | 611 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
176
- | 0.0001 | 23.1121 | 624 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
177
- | 0.0001 | 23.5981 | 637 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
178
- | 0.0001 | 24.0748 | 650 | 0.0001 | 1.0001 | 5.47 | 5.47 | 6.7 |
179
- | 0.0001 | 24.5607 | 663 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
180
- | 0.0001 | 25.0374 | 676 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
181
- | 0.0001 | 25.5234 | 689 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
182
- | 0.0001 | 26.0 | 702 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
183
- | 0.0001 | 26.4860 | 715 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
184
- | 0.0001 | 26.9720 | 728 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
185
- | 0.0001 | 27.4486 | 741 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
186
- | 0.0001 | 27.9346 | 754 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
187
- | 0.0001 | 28.4112 | 767 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
188
- | 0.0001 | 28.8972 | 780 | 0.0001 | 1.0001 | 5.47 | 5.47 | 6.7 |
189
- | 0.0001 | 29.3738 | 793 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
190
- | 0.0001 | 29.8598 | 806 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
191
- | 0.0001 | 30.3364 | 819 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
192
- | 0.0001 | 30.8224 | 832 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
193
- | 0.0000 | 31.2991 | 845 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
194
- | 0.0001 | 31.7850 | 858 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
195
- | 0.0001 | 32.2617 | 871 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
196
- | 0.0000 | 32.7477 | 884 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
197
- | 0.0000 | 33.2243 | 897 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
198
- | 0.0000 | 33.7103 | 910 | 0.0001 | 1.0001 | 5.47 | 5.47 | 6.7 |
199
- | 0.0000 | 34.1869 | 923 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
200
- | 0.0000 | 34.6729 | 936 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
201
- | 0.0000 | 35.1495 | 949 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
202
- | 0.0000 | 35.6355 | 962 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
203
- | 0.0000 | 36.1121 | 975 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
204
- | 0.0000 | 36.5981 | 988 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
205
- | 0.0000 | 37.0748 | 1001 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
206
- | 0.0000 | 37.5607 | 1014 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
207
- | 0.0000 | 38.0374 | 1027 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
208
- | 0.0000 | 38.5234 | 1040 | 0.0001 | 1.0001 | 5.47 | 5.47 | 6.7 |
209
- | 0.0000 | 39.0 | 1053 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
210
- | 0.0000 | 39.4860 | 1066 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
211
- | 0.0000 | 39.9720 | 1079 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
212
- | 0.0000 | 40.4486 | 1092 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
213
- | 0.0000 | 40.9346 | 1105 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
214
- | 0.0000 | 41.4112 | 1118 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
215
- | 0.0000 | 41.8972 | 1131 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
216
- | 0.0000 | 42.3738 | 1144 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
217
- | 0.0000 | 42.8598 | 1157 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
218
- | 0.0000 | 43.3364 | 1170 | 0.0001 | 1.0001 | 5.47 | 5.47 | 6.7 |
219
- | 0.0000 | 43.8224 | 1183 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
220
- | 0.0000 | 44.2991 | 1196 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
221
- | 0.0000 | 44.7850 | 1209 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
222
- | 0.0000 | 45.2617 | 1222 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
223
- | 0.0000 | 45.7477 | 1235 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
224
- | 0.0000 | 46.2243 | 1248 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
225
- | 0.0000 | 46.7103 | 1261 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
226
- | 0.0000 | 47.1869 | 1274 | 0.0001 | 1.0001 | 6.45 | 6.45 | 6.7 |
227
- | 0.0000 | 47.6729 | 1287 | 0.0001 | 1.0001 | 6.45 | 6.45 | 8.19 |
228
- | 0.0000 | 48.1495 | 1300 | 0.0001 | 1.0001 | 5.47 | 5.47 | 6.7 |
229
-
230
-
231
- ### Framework versions
232
-
233
- - PEFT 0.18.1
234
- - Transformers 5.0.0
235
- - Pytorch 2.8.0+cu128
236
- - Datasets 4.5.0
237
- - Tokenizers 0.22.2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/adapter_config.json DELETED
@@ -1,46 +0,0 @@
1
- {
2
- "alora_invocation_tokens": null,
3
- "alpha_pattern": {},
4
- "arrow_config": null,
5
- "auto_mapping": null,
6
- "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
7
- "bias": "none",
8
- "corda_config": null,
9
- "ensure_weight_tying": false,
10
- "eva_config": null,
11
- "exclude_modules": null,
12
- "fan_in_fan_out": null,
13
- "inference_mode": true,
14
- "init_lora_weights": true,
15
- "layer_replication": null,
16
- "layers_pattern": null,
17
- "layers_to_transform": null,
18
- "loftq_config": {},
19
- "lora_alpha": 128,
20
- "lora_bias": false,
21
- "lora_dropout": 0.0,
22
- "megatron_config": null,
23
- "megatron_core": "megatron.core",
24
- "modules_to_save": null,
25
- "peft_type": "LORA",
26
- "peft_version": "0.18.1",
27
- "qalora_group_size": 16,
28
- "r": 64,
29
- "rank_pattern": {},
30
- "revision": null,
31
- "target_modules": [
32
- "o_proj",
33
- "down_proj",
34
- "k_proj",
35
- "q_proj",
36
- "up_proj",
37
- "v_proj",
38
- "gate_proj"
39
- ],
40
- "target_parameters": [],
41
- "task_type": "CAUSAL_LM",
42
- "trainable_token_indices": null,
43
- "use_dora": false,
44
- "use_qalora": false,
45
- "use_rslora": false
46
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/adapter_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0208a88f86e8650d895396fb28d73ce7b7199934f2758a5f11ea54fa0d9cd195
3
- size 528550256
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/chat_template.jinja DELETED
@@ -1,4 +0,0 @@
1
- {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
- ' + message['content'] + '<|im_end|>' + '
3
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
- ' }}{% endif %}
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/config.json DELETED
@@ -1,86 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": null,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "pad_token_id": null,
60
- "quantization_config": {
61
- "_load_in_4bit": true,
62
- "_load_in_8bit": false,
63
- "bnb_4bit_compute_dtype": "bfloat16",
64
- "bnb_4bit_quant_storage": "bfloat16",
65
- "bnb_4bit_quant_type": "nf4",
66
- "bnb_4bit_use_double_quant": true,
67
- "llm_int8_enable_fp32_cpu_offload": false,
68
- "llm_int8_has_fp16_weight": false,
69
- "llm_int8_skip_modules": null,
70
- "llm_int8_threshold": 6.0,
71
- "load_in_4bit": true,
72
- "load_in_8bit": false,
73
- "quant_method": "bitsandbytes"
74
- },
75
- "rms_norm_eps": 1e-06,
76
- "rope_parameters": {
77
- "rope_theta": 5000000,
78
- "rope_type": "default"
79
- },
80
- "sliding_window": null,
81
- "tie_word_embeddings": true,
82
- "transformers_version": "5.0.0",
83
- "use_cache": false,
84
- "use_sliding_window": false,
85
- "vocab_size": 151936
86
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/eval_results.csv DELETED
@@ -1,5 +0,0 @@
1
- category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
2
- knowledge,test_subset_of_train_inference_results,450,448,99.56,0,0.00,2
3
- knowledge,test_subset_of_train_paraphrased_inference_results,450,448,99.56,0,0.00,2
4
- knowledge,test_inference_results,499,12,2.40,0,0.00,487
5
- knowledge,test_5shot_inference_results,499,17,3.41,0,0.00,482
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/eval_summary.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "overall": {
3
- "total": 499,
4
- "correct": 17,
5
- "accuracy": 3.41,
6
- "format_found": 0,
7
- "format_accuracy": 0.0
8
- },
9
- "n_errors": 482,
10
- "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/knowledge/atomic_sft_lora_50ep_t20260305/eval_results/test_5shot_inference_results.jsonl"
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_5shot_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_subset_of_train_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/eval_results/test_subset_of_train_paraphrased_inference_results.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/added_tokens.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "</think>": 151668,
3
- "</tool_call>": 151658,
4
- "</tool_response>": 151666,
5
- "<think>": 151667,
6
- "<tool_call>": 151657,
7
- "<tool_response>": 151665,
8
- "<|box_end|>": 151649,
9
- "<|box_start|>": 151648,
10
- "<|endoftext|>": 151643,
11
- "<|file_sep|>": 151664,
12
- "<|fim_middle|>": 151660,
13
- "<|fim_pad|>": 151662,
14
- "<|fim_prefix|>": 151659,
15
- "<|fim_suffix|>": 151661,
16
- "<|im_end|>": 151645,
17
- "<|im_start|>": 151644,
18
- "<|image_pad|>": 151655,
19
- "<|object_ref_end|>": 151647,
20
- "<|object_ref_start|>": 151646,
21
- "<|quad_end|>": 151651,
22
- "<|quad_start|>": 151650,
23
- "<|repo_name|>": 151663,
24
- "<|video_pad|>": 151656,
25
- "<|vision_end|>": 151653,
26
- "<|vision_pad|>": 151654,
27
- "<|vision_start|>": 151652
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/chat_template.jinja DELETED
@@ -1,61 +0,0 @@
1
- {%- if tools %}
2
- {{- '<|im_start|>system\n' }}
3
- {%- if messages[0].role == 'system' %}
4
- {{- messages[0].content + '\n\n' }}
5
- {%- endif %}
6
- {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
- {%- for tool in tools %}
8
- {{- "\n" }}
9
- {{- tool | tojson }}
10
- {%- endfor %}
11
- {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
- {%- else %}
13
- {%- if messages[0].role == 'system' %}
14
- {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
- {%- endif %}
16
- {%- endif %}
17
- {%- for message in messages %}
18
- {%- if message.content is string %}
19
- {%- set content = message.content %}
20
- {%- else %}
21
- {%- set content = '' %}
22
- {%- endif %}
23
- {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
- {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
- {%- elif message.role == "assistant" %}
26
- {{- '<|im_start|>' + message.role + '\n' + content }}
27
- {%- if message.tool_calls %}
28
- {%- for tool_call in message.tool_calls %}
29
- {%- if (loop.first and content) or (not loop.first) %}
30
- {{- '\n' }}
31
- {%- endif %}
32
- {%- if tool_call.function %}
33
- {%- set tool_call = tool_call.function %}
34
- {%- endif %}
35
- {{- '<tool_call>\n{"name": "' }}
36
- {{- tool_call.name }}
37
- {{- '", "arguments": ' }}
38
- {%- if tool_call.arguments is string %}
39
- {{- tool_call.arguments }}
40
- {%- else %}
41
- {{- tool_call.arguments | tojson }}
42
- {%- endif %}
43
- {{- '}\n</tool_call>' }}
44
- {%- endfor %}
45
- {%- endif %}
46
- {{- '<|im_end|>\n' }}
47
- {%- elif message.role == "tool" %}
48
- {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
49
- {{- '<|im_start|>user' }}
50
- {%- endif %}
51
- {{- '\n<tool_response>\n' }}
52
- {{- content }}
53
- {{- '\n</tool_response>' }}
54
- {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
55
- {{- '<|im_end|>\n' }}
56
- {%- endif %}
57
- {%- endif %}
58
- {%- endfor %}
59
- {%- if add_generation_prompt %}
60
- {{- '<|im_start|>assistant\n' }}
61
- {%- endif %}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/config.json DELETED
@@ -1,68 +0,0 @@
1
- {
2
- "architectures": [
3
- "Qwen3ForCausalLM"
4
- ],
5
- "attention_bias": false,
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151643,
8
- "dtype": "bfloat16",
9
- "eos_token_id": 151645,
10
- "head_dim": 128,
11
- "hidden_act": "silu",
12
- "hidden_size": 2560,
13
- "initializer_range": 0.02,
14
- "intermediate_size": 9728,
15
- "layer_types": [
16
- "full_attention",
17
- "full_attention",
18
- "full_attention",
19
- "full_attention",
20
- "full_attention",
21
- "full_attention",
22
- "full_attention",
23
- "full_attention",
24
- "full_attention",
25
- "full_attention",
26
- "full_attention",
27
- "full_attention",
28
- "full_attention",
29
- "full_attention",
30
- "full_attention",
31
- "full_attention",
32
- "full_attention",
33
- "full_attention",
34
- "full_attention",
35
- "full_attention",
36
- "full_attention",
37
- "full_attention",
38
- "full_attention",
39
- "full_attention",
40
- "full_attention",
41
- "full_attention",
42
- "full_attention",
43
- "full_attention",
44
- "full_attention",
45
- "full_attention",
46
- "full_attention",
47
- "full_attention",
48
- "full_attention",
49
- "full_attention",
50
- "full_attention",
51
- "full_attention"
52
- ],
53
- "max_position_embeddings": 262144,
54
- "max_window_layers": 36,
55
- "model_type": "qwen3",
56
- "num_attention_heads": 32,
57
- "num_hidden_layers": 36,
58
- "num_key_value_heads": 8,
59
- "rms_norm_eps": 1e-06,
60
- "rope_scaling": null,
61
- "rope_theta": 5000000,
62
- "sliding_window": null,
63
- "tie_word_embeddings": true,
64
- "transformers_version": "4.57.1",
65
- "use_cache": true,
66
- "use_sliding_window": false,
67
- "vocab_size": 151936
68
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/generation_config.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "bos_token_id": 151643,
3
- "do_sample": true,
4
- "eos_token_id": [
5
- 151645,
6
- 151643
7
- ],
8
- "pad_token_id": 151643,
9
- "temperature": 0.7,
10
- "top_k": 20,
11
- "top_p": 0.8,
12
- "transformers_version": "4.57.1"
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/model-00001-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:075ef3d88561c94a14aef6de9610046354d6e39100565a84aeefc063dbffd407
3
- size 4967215360
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/model-00002-of-00002.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:33254930412df48f3b04e93f7b7a328e3345fed99bd52da41e3524f7079abbe7
3
- size 3077766632
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/model.safetensors.index.json DELETED
@@ -1,406 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_parameters": 4022468096,
4
- "total_size": 8044936192
5
- },
6
- "weight_map": {
7
- "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
- "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
11
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
12
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
13
- "model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
14
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
15
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
16
- "model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
17
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
18
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
19
- "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
20
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
21
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
22
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
23
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
- "model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
25
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
26
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
27
- "model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
28
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
29
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
30
- "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
31
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
32
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
33
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
34
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
35
- "model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
36
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
37
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
- "model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
39
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
40
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
41
- "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
42
- "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
43
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
44
- "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
45
- "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
46
- "model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
47
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
- "model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
50
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
51
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
52
- "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
53
- "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
54
- "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
55
- "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
56
- "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
57
- "model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
58
- "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
59
- "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
60
- "model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
61
- "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
62
- "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
63
- "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
64
- "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
65
- "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
- "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
67
- "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
68
- "model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
69
- "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
70
- "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
71
- "model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
72
- "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
73
- "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
74
- "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
75
- "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
76
- "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
77
- "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
78
- "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
79
- "model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
80
- "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
81
- "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
- "model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
83
- "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
84
- "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
85
- "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
- "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
- "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
- "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
- "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
- "model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
91
- "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
92
- "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
93
- "model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
- "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
- "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
96
- "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
97
- "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
98
- "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
99
- "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
100
- "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
101
- "model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
102
- "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
103
- "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
104
- "model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
105
- "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
- "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
107
- "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
108
- "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
109
- "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
110
- "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
111
- "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
112
- "model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
113
- "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
114
- "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
115
- "model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
116
- "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
117
- "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
118
- "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
119
- "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
120
- "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
121
- "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
- "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
123
- "model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
124
- "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
125
- "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
126
- "model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
127
- "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
128
- "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
- "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
- "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
- "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
- "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
- "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
- "model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
135
- "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
- "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
- "model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
138
- "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
- "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
140
- "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
141
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
142
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
143
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
144
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
145
- "model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
146
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
147
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
148
- "model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
149
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
150
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
- "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
152
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
153
- "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
154
- "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
155
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
- "model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
157
- "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
158
- "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
159
- "model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
160
- "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
161
- "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
- "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
163
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
164
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
165
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
166
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
167
- "model.layers.21.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
168
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
169
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
170
- "model.layers.21.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
171
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
172
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
173
- "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
174
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
175
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
176
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
177
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
178
- "model.layers.22.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
179
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
180
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
181
- "model.layers.22.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
182
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
183
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
184
- "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
185
- "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
186
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
187
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
188
- "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
189
- "model.layers.23.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
190
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
191
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
192
- "model.layers.23.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
193
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
194
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
195
- "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
196
- "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
197
- "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
198
- "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
199
- "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
200
- "model.layers.24.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
201
- "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
202
- "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
203
- "model.layers.24.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
204
- "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
205
- "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
206
- "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
207
- "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
208
- "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
209
- "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
210
- "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
211
- "model.layers.25.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
212
- "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
213
- "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
214
- "model.layers.25.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
215
- "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
216
- "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
217
- "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
218
- "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
219
- "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
220
- "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
221
- "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
222
- "model.layers.26.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
223
- "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
224
- "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
225
- "model.layers.26.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
226
- "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
227
- "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
228
- "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
229
- "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
230
- "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
231
- "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
232
- "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
233
- "model.layers.27.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
234
- "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
235
- "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
236
- "model.layers.27.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
237
- "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
238
- "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
239
- "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
240
- "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
241
- "model.layers.28.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
242
- "model.layers.28.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
243
- "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
244
- "model.layers.28.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
245
- "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
246
- "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
247
- "model.layers.28.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
248
- "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
249
- "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
250
- "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
251
- "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
252
- "model.layers.29.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
253
- "model.layers.29.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
254
- "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
255
- "model.layers.29.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
256
- "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
257
- "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
258
- "model.layers.29.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
259
- "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
260
- "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
- "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
262
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
265
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
266
- "model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
267
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
268
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
269
- "model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
270
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
272
- "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
273
- "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
274
- "model.layers.30.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
275
- "model.layers.30.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
276
- "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
277
- "model.layers.30.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
278
- "model.layers.30.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
279
- "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
280
- "model.layers.30.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
281
- "model.layers.30.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
282
- "model.layers.30.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
283
- "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
284
- "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
285
- "model.layers.31.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
286
- "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
287
- "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
288
- "model.layers.31.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
289
- "model.layers.31.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
290
- "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
291
- "model.layers.31.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
292
- "model.layers.31.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
293
- "model.layers.31.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
294
- "model.layers.32.input_layernorm.weight": "model-00002-of-00002.safetensors",
295
- "model.layers.32.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
296
- "model.layers.32.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
297
- "model.layers.32.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
298
- "model.layers.32.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
299
- "model.layers.32.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
300
- "model.layers.32.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
301
- "model.layers.32.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
302
- "model.layers.32.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
303
- "model.layers.32.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
304
- "model.layers.32.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
305
- "model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
306
- "model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
307
- "model.layers.33.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
308
- "model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
309
- "model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
310
- "model.layers.33.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
311
- "model.layers.33.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
312
- "model.layers.33.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
313
- "model.layers.33.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
314
- "model.layers.33.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
315
- "model.layers.33.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
316
- "model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
317
- "model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
318
- "model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
319
- "model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
320
- "model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
321
- "model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
322
- "model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
323
- "model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
324
- "model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
325
- "model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
326
- "model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
327
- "model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
328
- "model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
329
- "model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
330
- "model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
331
- "model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
332
- "model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
333
- "model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
334
- "model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
335
- "model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
336
- "model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
337
- "model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
338
- "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
339
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
340
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
341
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
342
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
343
- "model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
344
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
345
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
- "model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
347
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
348
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
- "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
- "model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
355
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
356
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
357
- "model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
358
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
359
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
- "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
361
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
362
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
363
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
364
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
365
- "model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
366
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
367
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
368
- "model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
369
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
370
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
371
- "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
372
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
373
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
374
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
375
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
376
- "model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
377
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
378
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
379
- "model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
380
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
381
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
382
- "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
383
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
384
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
385
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
386
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
387
- "model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
388
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
389
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
390
- "model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
391
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
392
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
393
- "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
394
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
395
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
396
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
397
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
398
- "model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
399
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
400
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
401
- "model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
402
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
403
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
404
- "model.norm.weight": "model-00002-of-00002.safetensors"
405
- }
406
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/special_tokens_map.json DELETED
@@ -1,31 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>",
5
- "<|object_ref_start|>",
6
- "<|object_ref_end|>",
7
- "<|box_start|>",
8
- "<|box_end|>",
9
- "<|quad_start|>",
10
- "<|quad_end|>",
11
- "<|vision_start|>",
12
- "<|vision_end|>",
13
- "<|vision_pad|>",
14
- "<|image_pad|>",
15
- "<|video_pad|>"
16
- ],
17
- "eos_token": {
18
- "content": "<|im_end|>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- "pad_token": {
25
- "content": "<|endoftext|>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- }
31
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints/knowledge/lora_sft_atomic_50ep_t20260305/merged/tokenizer.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
- size 11422654