SirajRLX committed on
Commit
28847d8
·
verified ·
1 Parent(s): f1daf67

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. README.md +68 -0
  3. best_adapter/README.md +4 -2
  4. best_adapter/adapter_config.json +3 -6
  5. best_adapter/rng_state.pth +3 -0
  6. best_adapter/scheduler.pt +3 -0
  7. best_adapter/tokenizer.json +2 -2
  8. best_adapter/tokenizer_config.json +7 -185
  9. best_adapter/trainer_state.json +857 -0
  10. best_adapter/training_args.bin +2 -2
  11. checkpoint-100/README.md +209 -0
  12. checkpoint-100/adapter_config.json +43 -0
  13. checkpoint-100/chat_template.jinja +54 -0
  14. checkpoint-100/scheduler.pt +3 -0
  15. checkpoint-100/tokenizer.json +3 -0
  16. checkpoint-100/tokenizer_config.json +29 -0
  17. checkpoint-100/trainer_state.json +857 -0
  18. checkpoint-100/training_args.bin +3 -0
  19. config_resolved.yaml +58 -77
  20. logs/eval.jsonl +5 -52
  21. logs/train.jsonl +78 -102
  22. wandb/debug-internal.log +11 -12
  23. wandb/debug.log +26 -29
  24. wandb/run-20251226_152332-r9hfat2g/files/config.yaml +661 -0
  25. wandb/run-20251226_152332-r9hfat2g/files/output.log +189 -0
  26. wandb/run-20251226_152332-r9hfat2g/files/requirements.txt +104 -0
  27. wandb/run-20251226_152332-r9hfat2g/files/wandb-metadata.json +47 -0
  28. wandb/run-20251226_152332-r9hfat2g/files/wandb-summary.json +1 -0
  29. wandb/run-20251226_152332-r9hfat2g/logs/debug-core.log +14 -0
  30. wandb/run-20251226_152332-r9hfat2g/logs/debug-internal.log +11 -0
  31. wandb/run-20251226_152332-r9hfat2g/logs/debug.log +26 -0
  32. wandb/run-20251226_152332-r9hfat2g/run-r9hfat2g.wandb +3 -0
  33. wandb/run-20251226_152936-r1nptay8/files/config.yaml +165 -0
  34. wandb/run-20251226_152936-r1nptay8/files/output.log +121 -0
  35. wandb/run-20251226_152936-r1nptay8/files/requirements.txt +104 -0
  36. wandb/run-20251226_152936-r1nptay8/files/wandb-metadata.json +47 -0
  37. wandb/run-20251226_152936-r1nptay8/files/wandb-summary.json +1 -0
  38. wandb/run-20251226_152936-r1nptay8/logs/debug-core.log +14 -0
  39. wandb/run-20251226_152936-r1nptay8/logs/debug-internal.log +11 -0
  40. wandb/run-20251226_152936-r1nptay8/logs/debug.log +23 -0
  41. wandb/run-20251226_152936-r1nptay8/run-r1nptay8.wandb +3 -0
  42. wandb/run-20251226_155650-wbzoafvt/files/config.yaml +661 -0
  43. wandb/run-20251226_155650-wbzoafvt/files/output.log +279 -0
  44. wandb/run-20251226_155650-wbzoafvt/files/requirements.txt +104 -0
  45. wandb/run-20251226_155650-wbzoafvt/files/wandb-metadata.json +47 -0
  46. wandb/run-20251226_155650-wbzoafvt/files/wandb-summary.json +1 -0
  47. wandb/run-20251226_155650-wbzoafvt/logs/debug-core.log +14 -0
  48. wandb/run-20251226_155650-wbzoafvt/logs/debug-internal.log +11 -0
  49. wandb/run-20251226_155650-wbzoafvt/logs/debug.log +26 -0
  50. wandb/run-20251226_155650-wbzoafvt/run-wbzoafvt.wandb +3 -0
.gitattributes CHANGED
@@ -47,3 +47,7 @@ best_adapter/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
  checkpoints/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
  checkpoints/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
  wandb/run-20251227_194423-jz7bptqa/run-jz7bptqa.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
47
  checkpoints/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
  checkpoints/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
  wandb/run-20251227_194423-jz7bptqa/run-jz7bptqa.wandb filter=lfs diff=lfs merge=lfs -text
50
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ wandb/run-20251226_152332-r9hfat2g/run-r9hfat2g.wandb filter=lfs diff=lfs merge=lfs -text
52
+ wandb/run-20251226_152936-r1nptay8/run-r1nptay8.wandb filter=lfs diff=lfs merge=lfs -text
53
+ wandb/run-20251226_155650-wbzoafvt/run-wbzoafvt.wandb filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ model_name: dpo_run_14b_v1
4
+ tags:
5
+ - generated_from_trainer
6
+ - trl
7
+ - dpo
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for dpo_run_14b_v1
12
+
13
+ This model is a fine-tuned version of a local Qwen2.5-Coder-14B-CPT-SFT checkpoint (`../../Models/Qwen2.5-Coder-14B-CPT-SFT`).
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="None", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/sirajuddin-shaik-007/dpo-training/runs/wbzoafvt)
30
+
31
+
32
+ This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.26.2
37
+ - Transformers: 5.0.0.dev0
38
+ - Pytorch: 2.5.1+cu121
39
+ - Datasets: 4.4.2
40
+ - Tokenizers: 0.22.1
41
+
42
+ ## Citations
43
+
44
+ Cite DPO as:
45
+
46
+ ```bibtex
47
+ @inproceedings{rafailov2023direct,
48
+ title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
49
+ author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
50
+ year = 2023,
51
+ booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
52
+ url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
53
+ editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
54
+ }
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
best_adapter/README.md CHANGED
@@ -1,11 +1,13 @@
1
  ---
2
- base_model: /workspace/Models/Qwen2.5-Coder-14B-CPT-SFT_v2
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
- - base_model:adapter:/workspace/Models/Qwen2.5-Coder-14B-CPT-SFT_v2
 
7
  - lora
8
  - transformers
 
9
  ---
10
 
11
  # Model Card for Model ID
 
1
  ---
2
+ base_model: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
3
  library_name: peft
4
  pipeline_tag: text-generation
5
  tags:
6
+ - base_model:adapter:../../Models/Qwen2.5-Coder-14B-CPT-SFT
7
+ - dpo
8
  - lora
9
  - transformers
10
+ - trl
11
  ---
12
 
13
  # Model Card for Model ID
best_adapter/adapter_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
- "base_model_name_or_path": "/workspace/Models/Qwen2.5-Coder-14B-CPT-SFT_v2",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
@@ -29,13 +29,10 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
- "q_proj",
33
  "o_proj",
34
  "v_proj",
35
- "up_proj",
36
- "gate_proj",
37
- "down_proj",
38
- "k_proj"
39
  ],
40
  "target_parameters": null,
41
  "task_type": "CAUSAL_LM",
 
3
  "alpha_pattern": {},
4
  "arrow_config": null,
5
  "auto_mapping": null,
6
+ "base_model_name_or_path": "../../Models/Qwen2.5-Coder-14B-CPT-SFT",
7
  "bias": "none",
8
  "corda_config": null,
9
  "ensure_weight_tying": false,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "k_proj",
33
  "o_proj",
34
  "v_proj",
35
+ "q_proj"
 
 
 
36
  ],
37
  "target_parameters": null,
38
  "task_type": "CAUSAL_LM",
best_adapter/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ecefbb3f17bb76b6655eb0157c98b5287c17fa4b4c72a6b9068b0823ce9fd18d
3
+ size 14244
best_adapter/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f2d3d5485f7a1cfe5d5e69f9e55a45f72f0a8b17e757d0ca412c96a2d472fbf
3
+ size 1064
best_adapter/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
- size 11421896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
best_adapter/tokenizer_config.json CHANGED
@@ -1,185 +1,11 @@
1
  {
2
- "add_bos_token": false,
3
  "add_prefix_space": false,
4
- "added_tokens_decoder": {
5
- "151643": {
6
- "content": "<|endoftext|>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "151644": {
14
- "content": "<|im_start|>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "151645": {
22
- "content": "<|im_end|>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "151646": {
30
- "content": "<|object_ref_start|>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "151647": {
38
- "content": "<|object_ref_end|>",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": true
44
- },
45
- "151648": {
46
- "content": "<|box_start|>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": true
52
- },
53
- "151649": {
54
- "content": "<|box_end|>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": true
60
- },
61
- "151650": {
62
- "content": "<|quad_start|>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": true
68
- },
69
- "151651": {
70
- "content": "<|quad_end|>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": true
76
- },
77
- "151652": {
78
- "content": "<|vision_start|>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": true
84
- },
85
- "151653": {
86
- "content": "<|vision_end|>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": true
92
- },
93
- "151654": {
94
- "content": "<|vision_pad|>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": true
100
- },
101
- "151655": {
102
- "content": "<|image_pad|>",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": true
108
- },
109
- "151656": {
110
- "content": "<|video_pad|>",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": true
116
- },
117
- "151657": {
118
- "content": "<tool_call>",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "151658": {
126
- "content": "</tool_call>",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "151659": {
134
- "content": "<|fim_prefix|>",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "151660": {
142
- "content": "<|fim_middle|>",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "151661": {
150
- "content": "<|fim_suffix|>",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "151662": {
158
- "content": "<|fim_pad|>",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "151663": {
166
- "content": "<|repo_name|>",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "151664": {
174
- "content": "<|file_sep|>",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- }
181
- },
182
- "additional_special_tokens": [
183
  "<|im_start|>",
184
  "<|im_end|>",
185
  "<|object_ref_start|>",
@@ -194,11 +20,7 @@
194
  "<|image_pad|>",
195
  "<|video_pad|>"
196
  ],
197
- "bos_token": null,
198
- "clean_up_tokenization_spaces": false,
199
- "eos_token": "<|endoftext|>",
200
- "errors": "replace",
201
- "extra_special_tokens": {},
202
  "model_max_length": 32768,
203
  "pad_token": "<|endoftext|>",
204
  "split_special_tokens": false,
 
1
  {
 
2
  "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "<|im_start|>",
10
  "<|im_end|>",
11
  "<|object_ref_start|>",
 
20
  "<|image_pad|>",
21
  "<|video_pad|>"
22
  ],
23
+ "is_local": true,
 
 
 
 
24
  "model_max_length": 32768,
25
  "pad_token": "<|endoftext|>",
26
  "split_special_tokens": false,
best_adapter/trainer_state.json ADDED
@@ -0,0 +1,857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 100,
3
+ "best_metric": 0.04428481683135033,
4
+ "best_model_checkpoint": "runs/dpo_run_14b_v1/checkpoint-100",
5
+ "epoch": 0.11678832116788321,
6
+ "eval_steps": 25,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0023357664233576644,
14
+ "grad_norm": 1.242694616317749,
15
+ "learning_rate": 1.9379844961240311e-07,
16
+ "logits/chosen": 5.179401397705078,
17
+ "logits/rejected": 5.192930698394775,
18
+ "logps/chosen": -368.911865234375,
19
+ "logps/rejected": -398.83880615234375,
20
+ "loss": 0.6931473016738892,
21
+ "rewards/accuracies": 0.0,
22
+ "rewards/chosen": 0.0,
23
+ "rewards/margins": 0.0,
24
+ "rewards/rejected": 0.0,
25
+ "step": 2
26
+ },
27
+ {
28
+ "epoch": 0.004671532846715329,
29
+ "grad_norm": 1.392399787902832,
30
+ "learning_rate": 5.813953488372093e-07,
31
+ "logits/chosen": 5.403897762298584,
32
+ "logits/rejected": 5.4565606117248535,
33
+ "logps/chosen": -338.43792724609375,
34
+ "logps/rejected": -367.03057861328125,
35
+ "loss": 0.6949559450149536,
36
+ "rewards/accuracies": 0.625,
37
+ "rewards/chosen": 0.004504585638642311,
38
+ "rewards/margins": -0.003222561441361904,
39
+ "rewards/rejected": 0.007727146148681641,
40
+ "step": 4
41
+ },
42
+ {
43
+ "epoch": 0.0070072992700729924,
44
+ "grad_norm": 1.066603183746338,
45
+ "learning_rate": 9.689922480620155e-07,
46
+ "logits/chosen": 5.291868209838867,
47
+ "logits/rejected": 5.328356742858887,
48
+ "logps/chosen": -362.3431701660156,
49
+ "logps/rejected": -387.5829772949219,
50
+ "loss": 0.689236581325531,
51
+ "rewards/accuracies": 0.5625,
52
+ "rewards/chosen": -0.0034066196531057358,
53
+ "rewards/margins": 0.008255671709775925,
54
+ "rewards/rejected": -0.01166229322552681,
55
+ "step": 6
56
+ },
57
+ {
58
+ "epoch": 0.009343065693430658,
59
+ "grad_norm": 1.0005714893341064,
60
+ "learning_rate": 1.3565891472868218e-06,
61
+ "logits/chosen": 5.323437690734863,
62
+ "logits/rejected": 5.410858631134033,
63
+ "logps/chosen": -379.9283447265625,
64
+ "logps/rejected": -389.0852355957031,
65
+ "loss": 0.6943775415420532,
66
+ "rewards/accuracies": 0.375,
67
+ "rewards/chosen": 0.014657974243164062,
68
+ "rewards/margins": -0.0012350091710686684,
69
+ "rewards/rejected": 0.015892982482910156,
70
+ "step": 8
71
+ },
72
+ {
73
+ "epoch": 0.01167883211678832,
74
+ "grad_norm": 1.2461222410202026,
75
+ "learning_rate": 1.744186046511628e-06,
76
+ "logits/chosen": 5.435908317565918,
77
+ "logits/rejected": 5.494542121887207,
78
+ "logps/chosen": -363.2003479003906,
79
+ "logps/rejected": -389.67376708984375,
80
+ "loss": 0.693260908126831,
81
+ "rewards/accuracies": 0.625,
82
+ "rewards/chosen": -0.028497030958533287,
83
+ "rewards/margins": 0.00012636138126254082,
84
+ "rewards/rejected": -0.028623390942811966,
85
+ "step": 10
86
+ },
87
+ {
88
+ "epoch": 0.014014598540145985,
89
+ "grad_norm": 1.4030137062072754,
90
+ "learning_rate": 2.131782945736434e-06,
91
+ "logits/chosen": 5.3550801277160645,
92
+ "logits/rejected": 5.375768661499023,
93
+ "logps/chosen": -370.96429443359375,
94
+ "logps/rejected": -402.4786071777344,
95
+ "loss": 0.6882913112640381,
96
+ "rewards/accuracies": 0.5,
97
+ "rewards/chosen": 0.01622028276324272,
98
+ "rewards/margins": 0.010086631402373314,
99
+ "rewards/rejected": 0.006133650429546833,
100
+ "step": 12
101
+ },
102
+ {
103
+ "epoch": 0.01635036496350365,
104
+ "grad_norm": 1.1157702207565308,
105
+ "learning_rate": 2.5193798449612402e-06,
106
+ "logits/chosen": 5.515308380126953,
107
+ "logits/rejected": 5.561104774475098,
108
+ "logps/chosen": -336.7254333496094,
109
+ "logps/rejected": -357.52203369140625,
110
+ "loss": 0.6896716356277466,
111
+ "rewards/accuracies": 0.625,
112
+ "rewards/chosen": -0.017319394275546074,
113
+ "rewards/margins": 0.007328510750085115,
114
+ "rewards/rejected": -0.024647902697324753,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.018686131386861315,
119
+ "grad_norm": 0.9470655918121338,
120
+ "learning_rate": 2.9069767441860468e-06,
121
+ "logits/chosen": 5.553088665008545,
122
+ "logits/rejected": 5.582851886749268,
123
+ "logps/chosen": -415.6842041015625,
124
+ "logps/rejected": -441.1054992675781,
125
+ "loss": 0.6904245018959045,
126
+ "rewards/accuracies": 0.5625,
127
+ "rewards/chosen": 0.03270244598388672,
128
+ "rewards/margins": 0.005826758686453104,
129
+ "rewards/rejected": 0.026875685900449753,
130
+ "step": 16
131
+ },
132
+ {
133
+ "epoch": 0.021021897810218976,
134
+ "grad_norm": 1.4397331476211548,
135
+ "learning_rate": 3.2945736434108533e-06,
136
+ "logits/chosen": 5.440742015838623,
137
+ "logits/rejected": 5.489529132843018,
138
+ "logps/chosen": -392.46221923828125,
139
+ "logps/rejected": -420.1712341308594,
140
+ "loss": 0.683630108833313,
141
+ "rewards/accuracies": 0.5625,
142
+ "rewards/chosen": 0.011020278558135033,
143
+ "rewards/margins": 0.01951923407614231,
144
+ "rewards/rejected": -0.008498954586684704,
145
+ "step": 18
146
+ },
147
+ {
148
+ "epoch": 0.02335766423357664,
149
+ "grad_norm": 1.5941083431243896,
150
+ "learning_rate": 3.6821705426356594e-06,
151
+ "logits/chosen": 5.318347930908203,
152
+ "logits/rejected": 5.397945404052734,
153
+ "logps/chosen": -345.2221374511719,
154
+ "logps/rejected": -365.9537048339844,
155
+ "loss": 0.6902388334274292,
156
+ "rewards/accuracies": 0.5625,
157
+ "rewards/chosen": 0.006536484230309725,
158
+ "rewards/margins": 0.006013393402099609,
159
+ "rewards/rejected": 0.0005230908282101154,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.025693430656934305,
164
+ "grad_norm": 1.1363905668258667,
165
+ "learning_rate": 4.0697674418604655e-06,
166
+ "logits/chosen": 5.632981300354004,
167
+ "logits/rejected": 5.7265520095825195,
168
+ "logps/chosen": -347.9439697265625,
169
+ "logps/rejected": -370.65777587890625,
170
+ "loss": 0.691262423992157,
171
+ "rewards/accuracies": 0.5,
172
+ "rewards/chosen": 0.011908342130482197,
173
+ "rewards/margins": 0.004538153763860464,
174
+ "rewards/rejected": 0.007370188366621733,
175
+ "step": 22
176
+ },
177
+ {
178
+ "epoch": 0.02802919708029197,
179
+ "grad_norm": 1.0684627294540405,
180
+ "learning_rate": 4.457364341085272e-06,
181
+ "logits/chosen": 5.35699987411499,
182
+ "logits/rejected": 5.405580520629883,
183
+ "logps/chosen": -347.1539001464844,
184
+ "logps/rejected": -377.6044921875,
185
+ "loss": 0.6769475936889648,
186
+ "rewards/accuracies": 0.875,
187
+ "rewards/chosen": 0.01244144607335329,
188
+ "rewards/margins": 0.03289356082677841,
189
+ "rewards/rejected": -0.020452119410037994,
190
+ "step": 24
191
+ },
192
+ {
193
+ "epoch": 0.029197080291970802,
194
+ "eval_logits/chosen": 5.295141220092773,
195
+ "eval_logits/rejected": 5.345211029052734,
196
+ "eval_logps/chosen": -370.1607666015625,
197
+ "eval_logps/rejected": -395.7251892089844,
198
+ "eval_loss": 0.6836819648742676,
199
+ "eval_rewards/accuracies": 0.665354311466217,
200
+ "eval_rewards/chosen": 0.024636391550302505,
201
+ "eval_rewards/margins": 0.019555427134037018,
202
+ "eval_rewards/rejected": 0.005080964416265488,
203
+ "eval_runtime": 454.4375,
204
+ "eval_samples_per_second": 1.677,
205
+ "eval_steps_per_second": 1.677,
206
+ "step": 25
207
+ },
208
+ {
209
+ "epoch": 0.030364963503649634,
210
+ "grad_norm": 1.592353105545044,
211
+ "learning_rate": 4.844961240310078e-06,
212
+ "logits/chosen": 5.157042026519775,
213
+ "logits/rejected": 5.244912147521973,
214
+ "logps/chosen": -387.54876708984375,
215
+ "logps/rejected": -412.0630187988281,
216
+ "loss": 0.6849788427352905,
217
+ "rewards/accuracies": 0.625,
218
+ "rewards/chosen": 0.026385309174656868,
219
+ "rewards/margins": 0.016966437920928,
220
+ "rewards/rejected": 0.009418869391083717,
221
+ "step": 26
222
+ },
223
+ {
224
+ "epoch": 0.0327007299270073,
225
+ "grad_norm": 1.3181558847427368,
226
+ "learning_rate": 5.232558139534884e-06,
227
+ "logits/chosen": 5.545513153076172,
228
+ "logits/rejected": 5.54400110244751,
229
+ "logps/chosen": -360.41650390625,
230
+ "logps/rejected": -391.2162170410156,
231
+ "loss": 0.675189733505249,
232
+ "rewards/accuracies": 0.8125,
233
+ "rewards/chosen": 0.045946408063173294,
234
+ "rewards/margins": 0.03675585240125656,
235
+ "rewards/rejected": 0.009190557524561882,
236
+ "step": 28
237
+ },
238
+ {
239
+ "epoch": 0.035036496350364967,
240
+ "grad_norm": 1.443650722503662,
241
+ "learning_rate": 5.620155038759691e-06,
242
+ "logits/chosen": 5.136168003082275,
243
+ "logits/rejected": 5.239327907562256,
244
+ "logps/chosen": -378.6293640136719,
245
+ "logps/rejected": -405.3665466308594,
246
+ "loss": 0.6752142310142517,
247
+ "rewards/accuracies": 0.8125,
248
+ "rewards/chosen": 0.04194517061114311,
249
+ "rewards/margins": 0.03668833151459694,
250
+ "rewards/rejected": 0.005256845150142908,
251
+ "step": 30
252
+ },
253
+ {
254
+ "epoch": 0.03737226277372263,
255
+ "grad_norm": 1.379568338394165,
256
+ "learning_rate": 6.007751937984497e-06,
257
+ "logits/chosen": 5.411487579345703,
258
+ "logits/rejected": 5.427243232727051,
259
+ "logps/chosen": -358.5367736816406,
260
+ "logps/rejected": -382.4181213378906,
261
+ "loss": 0.6700581312179565,
262
+ "rewards/accuracies": 0.875,
263
+ "rewards/chosen": 0.06658173352479935,
264
+ "rewards/margins": 0.047193337231874466,
265
+ "rewards/rejected": 0.019388392567634583,
266
+ "step": 32
267
+ },
268
+ {
269
+ "epoch": 0.039708029197080295,
270
+ "grad_norm": 1.3260451555252075,
271
+ "learning_rate": 6.395348837209303e-06,
272
+ "logits/chosen": 5.207217216491699,
273
+ "logits/rejected": 5.254848480224609,
274
+ "logps/chosen": -326.9423828125,
275
+ "logps/rejected": -346.52081298828125,
276
+ "loss": 0.6610866785049438,
277
+ "rewards/accuracies": 0.9375,
278
+ "rewards/chosen": 0.07038869708776474,
279
+ "rewards/margins": 0.06587495654821396,
280
+ "rewards/rejected": 0.0045137410052120686,
281
+ "step": 34
282
+ },
283
+ {
284
+ "epoch": 0.04204379562043795,
285
+ "grad_norm": 1.5776340961456299,
286
+ "learning_rate": 6.782945736434108e-06,
287
+ "logits/chosen": 5.550538063049316,
288
+ "logits/rejected": 5.6374335289001465,
289
+ "logps/chosen": -359.9613952636719,
290
+ "logps/rejected": -384.31683349609375,
291
+ "loss": 0.6281551718711853,
292
+ "rewards/accuracies": 1.0,
293
+ "rewards/chosen": 0.11738375574350357,
294
+ "rewards/margins": 0.1363767683506012,
295
+ "rewards/rejected": -0.018992995843291283,
296
+ "step": 36
297
+ },
298
+ {
299
+ "epoch": 0.04437956204379562,
300
+ "grad_norm": 1.8589071035385132,
301
+ "learning_rate": 7.170542635658915e-06,
302
+ "logits/chosen": 5.39143180847168,
303
+ "logits/rejected": 5.412029266357422,
304
+ "logps/chosen": -325.8544616699219,
305
+ "logps/rejected": -351.9772644042969,
306
+ "loss": 0.6270830631256104,
307
+ "rewards/accuracies": 0.9375,
308
+ "rewards/chosen": 0.1617884635925293,
309
+ "rewards/margins": 0.1388537436723709,
310
+ "rewards/rejected": 0.022934721782803535,
311
+ "step": 38
312
+ },
313
+ {
314
+ "epoch": 0.04671532846715328,
315
+ "grad_norm": 1.3231571912765503,
316
+ "learning_rate": 7.558139534883721e-06,
317
+ "logits/chosen": 5.189720153808594,
318
+ "logits/rejected": 5.203127384185791,
319
+ "logps/chosen": -343.3839111328125,
320
+ "logps/rejected": -374.7848205566406,
321
+ "loss": 0.641180157661438,
322
+ "rewards/accuracies": 0.875,
323
+ "rewards/chosen": 0.15248623490333557,
324
+ "rewards/margins": 0.11158552765846252,
325
+ "rewards/rejected": 0.04090070724487305,
326
+ "step": 40
327
+ },
328
+ {
329
+ "epoch": 0.049051094890510946,
330
+ "grad_norm": 2.5331315994262695,
331
+ "learning_rate": 7.945736434108528e-06,
332
+ "logits/chosen": 5.420182228088379,
333
+ "logits/rejected": 5.45302677154541,
334
+ "logps/chosen": -341.813720703125,
335
+ "logps/rejected": -372.44952392578125,
336
+ "loss": 0.6093671321868896,
337
+ "rewards/accuracies": 0.9375,
338
+ "rewards/chosen": 0.2898235321044922,
339
+ "rewards/margins": 0.18158456683158875,
340
+ "rewards/rejected": 0.10823898762464523,
341
+ "step": 42
342
+ },
343
+ {
344
+ "epoch": 0.05138686131386861,
345
+ "grad_norm": 1.5247384309768677,
346
+ "learning_rate": 8.333333333333334e-06,
347
+ "logits/chosen": 5.383636951446533,
348
+ "logits/rejected": 5.397551536560059,
349
+ "logps/chosen": -354.49627685546875,
350
+ "logps/rejected": -376.88818359375,
351
+ "loss": 0.5815833210945129,
352
+ "rewards/accuracies": 0.8125,
353
+ "rewards/chosen": 0.32459571957588196,
354
+ "rewards/margins": 0.2510552406311035,
355
+ "rewards/rejected": 0.07354050129652023,
356
+ "step": 44
357
+ },
358
+ {
359
+ "epoch": 0.053722627737226275,
360
+ "grad_norm": 2.0814144611358643,
361
+ "learning_rate": 8.72093023255814e-06,
362
+ "logits/chosen": 5.269731044769287,
363
+ "logits/rejected": 5.287116050720215,
364
+ "logps/chosen": -331.1025390625,
365
+ "logps/rejected": -362.90118408203125,
366
+ "loss": 0.5269681215286255,
367
+ "rewards/accuracies": 0.9375,
368
+ "rewards/chosen": 0.6465227603912354,
369
+ "rewards/margins": 0.37582656741142273,
370
+ "rewards/rejected": 0.27069616317749023,
371
+ "step": 46
372
+ },
373
+ {
374
+ "epoch": 0.05605839416058394,
375
+ "grad_norm": 1.769063115119934,
376
+ "learning_rate": 9.108527131782946e-06,
377
+ "logits/chosen": 5.472540855407715,
378
+ "logits/rejected": 5.465417861938477,
379
+ "logps/chosen": -369.40283203125,
380
+ "logps/rejected": -400.18438720703125,
381
+ "loss": 0.5066201686859131,
382
+ "rewards/accuracies": 1.0,
383
+ "rewards/chosen": 0.6377636194229126,
384
+ "rewards/margins": 0.42650213837623596,
385
+ "rewards/rejected": 0.21126146614551544,
386
+ "step": 48
387
+ },
388
+ {
389
+ "epoch": 0.058394160583941604,
390
+ "grad_norm": 2.84169602394104,
391
+ "learning_rate": 9.496124031007753e-06,
392
+ "logits/chosen": 5.050387382507324,
393
+ "logits/rejected": 5.112288951873779,
394
+ "logps/chosen": -363.4556579589844,
395
+ "logps/rejected": -397.8169860839844,
396
+ "loss": 0.529259979724884,
397
+ "rewards/accuracies": 1.0,
398
+ "rewards/chosen": 0.7923164367675781,
399
+ "rewards/margins": 0.3787059783935547,
400
+ "rewards/rejected": 0.4136104881763458,
401
+ "step": 50
402
+ },
403
+ {
404
+ "epoch": 0.058394160583941604,
405
+ "eval_logits/chosen": 5.22359037399292,
406
+ "eval_logits/rejected": 5.286833763122559,
407
+ "eval_logps/chosen": -361.462890625,
408
+ "eval_logps/rejected": -392.5708312988281,
409
+ "eval_loss": 0.4610801041126251,
410
+ "eval_rewards/accuracies": 0.9619422554969788,
411
+ "eval_rewards/chosen": 0.8944254517555237,
412
+ "eval_rewards/margins": 0.5739086270332336,
413
+ "eval_rewards/rejected": 0.3205168545246124,
414
+ "eval_runtime": 454.5598,
415
+ "eval_samples_per_second": 1.676,
416
+ "eval_steps_per_second": 1.676,
417
+ "step": 50
418
+ },
419
+ {
420
+ "epoch": 0.06072992700729927,
421
+ "grad_norm": 1.6907895803451538,
422
+ "learning_rate": 9.883720930232558e-06,
423
+ "logits/chosen": 5.486469268798828,
424
+ "logits/rejected": 5.541717529296875,
425
+ "logps/chosen": -343.4534606933594,
426
+ "logps/rejected": -379.39508056640625,
427
+ "loss": 0.44602835178375244,
428
+ "rewards/accuracies": 0.9375,
429
+ "rewards/chosen": 0.9869746565818787,
430
+ "rewards/margins": 0.6056646108627319,
431
+ "rewards/rejected": 0.3813100755214691,
432
+ "step": 52
433
+ },
434
+ {
435
+ "epoch": 0.06306569343065693,
436
+ "grad_norm": 1.9458682537078857,
437
+ "learning_rate": 1.0271317829457365e-05,
438
+ "logits/chosen": 5.169528961181641,
439
+ "logits/rejected": 5.2688751220703125,
440
+ "logps/chosen": -379.5437316894531,
441
+ "logps/rejected": -401.5587463378906,
442
+ "loss": 0.43609702587127686,
443
+ "rewards/accuracies": 1.0,
444
+ "rewards/chosen": 0.7794930934906006,
445
+ "rewards/margins": 0.6265671253204346,
446
+ "rewards/rejected": 0.15292587876319885,
447
+ "step": 54
448
+ },
449
+ {
450
+ "epoch": 0.0654014598540146,
451
+ "grad_norm": 2.1266520023345947,
452
+ "learning_rate": 1.065891472868217e-05,
453
+ "logits/chosen": 5.097426414489746,
454
+ "logits/rejected": 5.15327262878418,
455
+ "logps/chosen": -378.0788269042969,
456
+ "logps/rejected": -413.27392578125,
457
+ "loss": 0.3928414583206177,
458
+ "rewards/accuracies": 0.9375,
459
+ "rewards/chosen": 1.274291753768921,
460
+ "rewards/margins": 0.7864217758178711,
461
+ "rewards/rejected": 0.4878700375556946,
462
+ "step": 56
463
+ },
464
+ {
465
+ "epoch": 0.06773722627737226,
466
+ "grad_norm": 1.5381489992141724,
467
+ "learning_rate": 1.1046511627906977e-05,
468
+ "logits/chosen": 5.138954162597656,
469
+ "logits/rejected": 5.20254373550415,
470
+ "logps/chosen": -372.93438720703125,
471
+ "logps/rejected": -401.8287658691406,
472
+ "loss": 0.35855019092559814,
473
+ "rewards/accuracies": 0.875,
474
+ "rewards/chosen": 1.2897911071777344,
475
+ "rewards/margins": 0.9354276061058044,
476
+ "rewards/rejected": 0.35436347126960754,
477
+ "step": 58
478
+ },
479
+ {
480
+ "epoch": 0.07007299270072993,
481
+ "grad_norm": 2.358330726623535,
482
+ "learning_rate": 1.1434108527131783e-05,
483
+ "logits/chosen": 5.071888446807861,
484
+ "logits/rejected": 5.187964916229248,
485
+ "logps/chosen": -360.984619140625,
486
+ "logps/rejected": -392.3192138671875,
487
+ "loss": 0.42801612615585327,
488
+ "rewards/accuracies": 0.875,
489
+ "rewards/chosen": 1.3823509216308594,
490
+ "rewards/margins": 0.729066014289856,
491
+ "rewards/rejected": 0.6532848477363586,
492
+ "step": 60
493
+ },
494
+ {
495
+ "epoch": 0.07240875912408759,
496
+ "grad_norm": 2.177586317062378,
497
+ "learning_rate": 1.182170542635659e-05,
498
+ "logits/chosen": 5.264093399047852,
499
+ "logits/rejected": 5.310842990875244,
500
+ "logps/chosen": -364.808349609375,
501
+ "logps/rejected": -401.0321044921875,
502
+ "loss": 0.31365492939949036,
503
+ "rewards/accuracies": 1.0,
504
+ "rewards/chosen": 1.6637591123580933,
505
+ "rewards/margins": 1.0887457132339478,
506
+ "rewards/rejected": 0.5750135183334351,
507
+ "step": 62
508
+ },
509
+ {
510
+ "epoch": 0.07474452554744526,
511
+ "grad_norm": 1.697789192199707,
512
+ "learning_rate": 1.2209302325581395e-05,
513
+ "logits/chosen": 5.191982269287109,
514
+ "logits/rejected": 5.261416912078857,
515
+ "logps/chosen": -359.8249816894531,
516
+ "logps/rejected": -397.2122497558594,
517
+ "loss": 0.3037749230861664,
518
+ "rewards/accuracies": 1.0,
519
+ "rewards/chosen": 1.6470392942428589,
520
+ "rewards/margins": 1.114844799041748,
521
+ "rewards/rejected": 0.5321945548057556,
522
+ "step": 64
523
+ },
524
+ {
525
+ "epoch": 0.07708029197080292,
526
+ "grad_norm": 1.3219914436340332,
527
+ "learning_rate": 1.2596899224806202e-05,
528
+ "logits/chosen": 5.293405532836914,
529
+ "logits/rejected": 5.3094048500061035,
530
+ "logps/chosen": -352.3752136230469,
531
+ "logps/rejected": -392.6779479980469,
532
+ "loss": 0.25026455521583557,
533
+ "rewards/accuracies": 1.0,
534
+ "rewards/chosen": 1.5671364068984985,
535
+ "rewards/margins": 1.4098074436187744,
536
+ "rewards/rejected": 0.15732917189598083,
537
+ "step": 66
538
+ },
539
+ {
540
+ "epoch": 0.07941605839416059,
541
+ "grad_norm": 1.8173967599868774,
542
+ "learning_rate": 1.2984496124031009e-05,
543
+ "logits/chosen": 5.025746822357178,
544
+ "logits/rejected": 5.114965438842773,
545
+ "logps/chosen": -319.99700927734375,
546
+ "logps/rejected": -364.115234375,
547
+ "loss": 0.3108353912830353,
548
+ "rewards/accuracies": 0.9375,
549
+ "rewards/chosen": 1.4788665771484375,
550
+ "rewards/margins": 1.2637410163879395,
551
+ "rewards/rejected": 0.2151254564523697,
552
+ "step": 68
553
+ },
554
+ {
555
+ "epoch": 0.08175182481751825,
556
+ "grad_norm": 1.0658400058746338,
557
+ "learning_rate": 1.3372093023255814e-05,
558
+ "logits/chosen": 4.945235729217529,
559
+ "logits/rejected": 4.959147930145264,
560
+ "logps/chosen": -383.84033203125,
561
+ "logps/rejected": -431.7752685546875,
562
+ "loss": 0.22991834580898285,
563
+ "rewards/accuracies": 1.0,
564
+ "rewards/chosen": 1.3950352668762207,
565
+ "rewards/margins": 1.4965243339538574,
566
+ "rewards/rejected": -0.1014888733625412,
567
+ "step": 70
568
+ },
569
+ {
570
+ "epoch": 0.0840875912408759,
571
+ "grad_norm": 1.0350896120071411,
572
+ "learning_rate": 1.375968992248062e-05,
573
+ "logits/chosen": 5.00426721572876,
574
+ "logits/rejected": 5.120238780975342,
575
+ "logps/chosen": -350.9471435546875,
576
+ "logps/rejected": -382.6837158203125,
577
+ "loss": 0.22603684663772583,
578
+ "rewards/accuracies": 1.0,
579
+ "rewards/chosen": 1.2978975772857666,
580
+ "rewards/margins": 1.644275426864624,
581
+ "rewards/rejected": -0.34637776017189026,
582
+ "step": 72
583
+ },
584
+ {
585
+ "epoch": 0.08642335766423358,
586
+ "grad_norm": 1.1595423221588135,
587
+ "learning_rate": 1.4147286821705426e-05,
588
+ "logits/chosen": 4.890130043029785,
589
+ "logits/rejected": 4.9504714012146,
590
+ "logps/chosen": -352.34967041015625,
591
+ "logps/rejected": -399.23028564453125,
592
+ "loss": 0.18921935558319092,
593
+ "rewards/accuracies": 1.0,
594
+ "rewards/chosen": 1.1984589099884033,
595
+ "rewards/margins": 1.7495291233062744,
596
+ "rewards/rejected": -0.5510700941085815,
597
+ "step": 74
598
+ },
599
+ {
600
+ "epoch": 0.08759124087591241,
601
+ "eval_logits/chosen": 4.930174827575684,
602
+ "eval_logits/rejected": 5.032296657562256,
603
+ "eval_logps/chosen": -359.19647216796875,
604
+ "eval_logps/rejected": -405.1120300292969,
605
+ "eval_loss": 0.16020436584949493,
606
+ "eval_rewards/accuracies": 0.9960629940032959,
607
+ "eval_rewards/chosen": 1.1210675239562988,
608
+ "eval_rewards/margins": 2.0546727180480957,
609
+ "eval_rewards/rejected": -0.9336051344871521,
610
+ "eval_runtime": 454.3435,
611
+ "eval_samples_per_second": 1.677,
612
+ "eval_steps_per_second": 1.677,
613
+ "step": 75
614
+ },
615
+ {
616
+ "epoch": 0.08875912408759123,
617
+ "grad_norm": 1.1433167457580566,
618
+ "learning_rate": 1.4534883720930233e-05,
619
+ "logits/chosen": 5.037275314331055,
620
+ "logits/rejected": 5.1315507888793945,
621
+ "logps/chosen": -313.110595703125,
622
+ "logps/rejected": -356.1000061035156,
623
+ "loss": 0.15998858213424683,
624
+ "rewards/accuracies": 1.0,
625
+ "rewards/chosen": 1.2128857374191284,
626
+ "rewards/margins": 2.0945115089416504,
627
+ "rewards/rejected": -0.8816256523132324,
628
+ "step": 76
629
+ },
630
+ {
631
+ "epoch": 0.0910948905109489,
632
+ "grad_norm": 0.9839214086532593,
633
+ "learning_rate": 1.4922480620155039e-05,
634
+ "logits/chosen": 4.817085266113281,
635
+ "logits/rejected": 4.874035835266113,
636
+ "logps/chosen": -366.2629089355469,
637
+ "logps/rejected": -405.7989196777344,
638
+ "loss": 0.1894684135913849,
639
+ "rewards/accuracies": 1.0,
640
+ "rewards/chosen": 1.0605502128601074,
641
+ "rewards/margins": 1.90762460231781,
642
+ "rewards/rejected": -0.8470743894577026,
643
+ "step": 78
644
+ },
645
+ {
646
+ "epoch": 0.09343065693430656,
647
+ "grad_norm": 0.9212782979011536,
648
+ "learning_rate": 1.5310077519379846e-05,
649
+ "logits/chosen": 5.046716690063477,
650
+ "logits/rejected": 5.157979965209961,
651
+ "logps/chosen": -348.0658264160156,
652
+ "logps/rejected": -395.23870849609375,
653
+ "loss": 0.15948188304901123,
654
+ "rewards/accuracies": 1.0,
655
+ "rewards/chosen": 0.676516056060791,
656
+ "rewards/margins": 2.167430877685547,
657
+ "rewards/rejected": -1.4909145832061768,
658
+ "step": 80
659
+ },
660
+ {
661
+ "epoch": 0.09576642335766423,
662
+ "grad_norm": 0.9820688366889954,
663
+ "learning_rate": 1.569767441860465e-05,
664
+ "logits/chosen": 4.690741539001465,
665
+ "logits/rejected": 4.771791458129883,
666
+ "logps/chosen": -378.8666076660156,
667
+ "logps/rejected": -436.9100036621094,
668
+ "loss": 0.12085139006376266,
669
+ "rewards/accuracies": 1.0,
670
+ "rewards/chosen": 0.8719685077667236,
671
+ "rewards/margins": 2.646538257598877,
672
+ "rewards/rejected": -1.7745698690414429,
673
+ "step": 82
674
+ },
675
+ {
676
+ "epoch": 0.09810218978102189,
677
+ "grad_norm": 0.66785728931427,
678
+ "learning_rate": 1.608527131782946e-05,
679
+ "logits/chosen": 4.880465984344482,
680
+ "logits/rejected": 4.961792945861816,
681
+ "logps/chosen": -346.51214599609375,
682
+ "logps/rejected": -400.1110534667969,
683
+ "loss": 0.08720710873603821,
684
+ "rewards/accuracies": 1.0,
685
+ "rewards/chosen": 1.1337480545043945,
686
+ "rewards/margins": 2.903944253921509,
687
+ "rewards/rejected": -1.7701961994171143,
688
+ "step": 84
689
+ },
690
+ {
691
+ "epoch": 0.10043795620437956,
692
+ "grad_norm": 0.5760660767555237,
693
+ "learning_rate": 1.647286821705426e-05,
694
+ "logits/chosen": 4.464397430419922,
695
+ "logits/rejected": 4.680055618286133,
696
+ "logps/chosen": -341.7489318847656,
697
+ "logps/rejected": -398.322021484375,
698
+ "loss": 0.07942983508110046,
699
+ "rewards/accuracies": 1.0,
700
+ "rewards/chosen": 1.2459325790405273,
701
+ "rewards/margins": 3.0152552127838135,
702
+ "rewards/rejected": -1.7693227529525757,
703
+ "step": 86
704
+ },
705
+ {
706
+ "epoch": 0.10277372262773722,
707
+ "grad_norm": 1.6020294427871704,
708
+ "learning_rate": 1.686046511627907e-05,
709
+ "logits/chosen": 4.563863277435303,
710
+ "logits/rejected": 4.680974960327148,
711
+ "logps/chosen": -344.9147644042969,
712
+ "logps/rejected": -395.4453125,
713
+ "loss": 0.1258174479007721,
714
+ "rewards/accuracies": 0.9375,
715
+ "rewards/chosen": 1.0706769227981567,
716
+ "rewards/margins": 3.118717670440674,
717
+ "rewards/rejected": -2.0480403900146484,
718
+ "step": 88
719
+ },
720
+ {
721
+ "epoch": 0.10510948905109489,
722
+ "grad_norm": 0.46413859724998474,
723
+ "learning_rate": 1.7248062015503875e-05,
724
+ "logits/chosen": 4.4989237785339355,
725
+ "logits/rejected": 4.673248291015625,
726
+ "logps/chosen": -326.9678649902344,
727
+ "logps/rejected": -388.4164123535156,
728
+ "loss": 0.06663060188293457,
729
+ "rewards/accuracies": 1.0,
730
+ "rewards/chosen": 1.4128761291503906,
731
+ "rewards/margins": 3.760685920715332,
732
+ "rewards/rejected": -2.3478102684020996,
733
+ "step": 90
734
+ },
735
+ {
736
+ "epoch": 0.10744525547445255,
737
+ "grad_norm": 0.6699568629264832,
738
+ "learning_rate": 1.7635658914728684e-05,
739
+ "logits/chosen": 4.7294535636901855,
740
+ "logits/rejected": 4.813880920410156,
741
+ "logps/chosen": -362.7267150878906,
742
+ "logps/rejected": -439.2985534667969,
743
+ "loss": 0.04481709748506546,
744
+ "rewards/accuracies": 1.0,
745
+ "rewards/chosen": 1.477597713470459,
746
+ "rewards/margins": 4.37883186340332,
747
+ "rewards/rejected": -2.9012341499328613,
748
+ "step": 92
749
+ },
750
+ {
751
+ "epoch": 0.10978102189781022,
752
+ "grad_norm": 0.4152977168560028,
753
+ "learning_rate": 1.802325581395349e-05,
754
+ "logits/chosen": 4.785149574279785,
755
+ "logits/rejected": 4.891542434692383,
756
+ "logps/chosen": -381.59246826171875,
757
+ "logps/rejected": -444.2817687988281,
758
+ "loss": 0.05632612109184265,
759
+ "rewards/accuracies": 1.0,
760
+ "rewards/chosen": 0.71366286277771,
761
+ "rewards/margins": 3.4584720134735107,
762
+ "rewards/rejected": -2.744809150695801,
763
+ "step": 94
764
+ },
765
+ {
766
+ "epoch": 0.11211678832116788,
767
+ "grad_norm": 0.3152717649936676,
768
+ "learning_rate": 1.8410852713178295e-05,
769
+ "logits/chosen": 4.603940486907959,
770
+ "logits/rejected": 4.804995536804199,
771
+ "logps/chosen": -356.7286376953125,
772
+ "logps/rejected": -414.69635009765625,
773
+ "loss": 0.040920041501522064,
774
+ "rewards/accuracies": 1.0,
775
+ "rewards/chosen": 1.7566397190093994,
776
+ "rewards/margins": 4.020595550537109,
777
+ "rewards/rejected": -2.263956069946289,
778
+ "step": 96
779
+ },
780
+ {
781
+ "epoch": 0.11445255474452555,
782
+ "grad_norm": 0.37698569893836975,
783
+ "learning_rate": 1.8798449612403103e-05,
784
+ "logits/chosen": 4.558542728424072,
785
+ "logits/rejected": 4.690641403198242,
786
+ "logps/chosen": -339.794189453125,
787
+ "logps/rejected": -413.8865966796875,
788
+ "loss": 0.025794224813580513,
789
+ "rewards/accuracies": 1.0,
790
+ "rewards/chosen": 1.3867536783218384,
791
+ "rewards/margins": 4.6542744636535645,
792
+ "rewards/rejected": -3.2675204277038574,
793
+ "step": 98
794
+ },
795
+ {
796
+ "epoch": 0.11678832116788321,
797
+ "grad_norm": 0.15023073554039001,
798
+ "learning_rate": 1.918604651162791e-05,
799
+ "logits/chosen": 4.387497425079346,
800
+ "logits/rejected": 4.494588375091553,
801
+ "logps/chosen": -346.2568054199219,
802
+ "logps/rejected": -418.9315185546875,
803
+ "loss": 0.015155203640460968,
804
+ "rewards/accuracies": 1.0,
805
+ "rewards/chosen": 1.7938623428344727,
806
+ "rewards/margins": 4.942529201507568,
807
+ "rewards/rejected": -3.1486666202545166,
808
+ "step": 100
809
+ },
810
+ {
811
+ "epoch": 0.11678832116788321,
812
+ "eval_logits/chosen": 4.285891056060791,
813
+ "eval_logits/rejected": 4.425926208496094,
814
+ "eval_logps/chosen": -353.15850830078125,
815
+ "eval_logps/rejected": -424.4124755859375,
816
+ "eval_loss": 0.04428481683135033,
817
+ "eval_rewards/accuracies": 0.9921259880065918,
818
+ "eval_rewards/chosen": 1.7248634099960327,
819
+ "eval_rewards/margins": 4.588510513305664,
820
+ "eval_rewards/rejected": -2.863647222518921,
821
+ "eval_runtime": 454.7251,
822
+ "eval_samples_per_second": 1.676,
823
+ "eval_steps_per_second": 1.676,
824
+ "step": 100
825
+ }
826
+ ],
827
+ "logging_steps": 2,
828
+ "max_steps": 2571,
829
+ "num_input_tokens_seen": 0,
830
+ "num_train_epochs": 3,
831
+ "save_steps": 100,
832
+ "stateful_callbacks": {
833
+ "EarlyStoppingCallback": {
834
+ "args": {
835
+ "early_stopping_patience": 5,
836
+ "early_stopping_threshold": 0.001
837
+ },
838
+ "attributes": {
839
+ "early_stopping_patience_counter": 0
840
+ }
841
+ },
842
+ "TrainerControl": {
843
+ "args": {
844
+ "should_epoch_stop": false,
845
+ "should_evaluate": false,
846
+ "should_log": false,
847
+ "should_save": true,
848
+ "should_training_stop": false
849
+ },
850
+ "attributes": {}
851
+ }
852
+ },
853
+ "total_flos": 0.0,
854
+ "train_batch_size": 1,
855
+ "trial_name": null,
856
+ "trial_params": null
857
+ }
best_adapter/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4a4e48ed61b7c96f3bd2836ac828013a311834ab8a9542ea461fe1ff953396b
3
- size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21875ef630d3e8f528dce67596a0d783fd5cf223e6e245a98026996d1f3d3ade
3
+ size 5752
checkpoint-100/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:../../Models/Qwen2.5-Coder-14B-CPT-SFT
7
+ - dpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.0
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "../../Models/Qwen2.5-Coder-14B-CPT-SFT",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.0",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "k_proj",
33
+ "o_proj",
34
+ "v_proj",
35
+ "q_proj"
36
+ ],
37
+ "target_parameters": null,
38
+ "task_type": "CAUSAL_LM",
39
+ "trainable_token_indices": null,
40
+ "use_dora": false,
41
+ "use_qalora": false,
42
+ "use_rslora": false
43
+ }
checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f2d3d5485f7a1cfe5d5e69f9e55a45f72f0a8b17e757d0ca412c96a2d472fbf
3
+ size 1064
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd169731d2cbde95e10bf356d66d5997fd885dd8dbb6fb4684da3f23b2585d8
3
+ size 11421892
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 32768,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,857 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 100,
3
+ "best_metric": 0.04428481683135033,
4
+ "best_model_checkpoint": "runs/dpo_run_14b_v1/checkpoint-100",
5
+ "epoch": 0.11678832116788321,
6
+ "eval_steps": 25,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0023357664233576644,
14
+ "grad_norm": 1.242694616317749,
15
+ "learning_rate": 1.9379844961240311e-07,
16
+ "logits/chosen": 5.179401397705078,
17
+ "logits/rejected": 5.192930698394775,
18
+ "logps/chosen": -368.911865234375,
19
+ "logps/rejected": -398.83880615234375,
20
+ "loss": 0.6931473016738892,
21
+ "rewards/accuracies": 0.0,
22
+ "rewards/chosen": 0.0,
23
+ "rewards/margins": 0.0,
24
+ "rewards/rejected": 0.0,
25
+ "step": 2
26
+ },
27
+ {
28
+ "epoch": 0.004671532846715329,
29
+ "grad_norm": 1.392399787902832,
30
+ "learning_rate": 5.813953488372093e-07,
31
+ "logits/chosen": 5.403897762298584,
32
+ "logits/rejected": 5.4565606117248535,
33
+ "logps/chosen": -338.43792724609375,
34
+ "logps/rejected": -367.03057861328125,
35
+ "loss": 0.6949559450149536,
36
+ "rewards/accuracies": 0.625,
37
+ "rewards/chosen": 0.004504585638642311,
38
+ "rewards/margins": -0.003222561441361904,
39
+ "rewards/rejected": 0.007727146148681641,
40
+ "step": 4
41
+ },
42
+ {
43
+ "epoch": 0.0070072992700729924,
44
+ "grad_norm": 1.066603183746338,
45
+ "learning_rate": 9.689922480620155e-07,
46
+ "logits/chosen": 5.291868209838867,
47
+ "logits/rejected": 5.328356742858887,
48
+ "logps/chosen": -362.3431701660156,
49
+ "logps/rejected": -387.5829772949219,
50
+ "loss": 0.689236581325531,
51
+ "rewards/accuracies": 0.5625,
52
+ "rewards/chosen": -0.0034066196531057358,
53
+ "rewards/margins": 0.008255671709775925,
54
+ "rewards/rejected": -0.01166229322552681,
55
+ "step": 6
56
+ },
57
+ {
58
+ "epoch": 0.009343065693430658,
59
+ "grad_norm": 1.0005714893341064,
60
+ "learning_rate": 1.3565891472868218e-06,
61
+ "logits/chosen": 5.323437690734863,
62
+ "logits/rejected": 5.410858631134033,
63
+ "logps/chosen": -379.9283447265625,
64
+ "logps/rejected": -389.0852355957031,
65
+ "loss": 0.6943775415420532,
66
+ "rewards/accuracies": 0.375,
67
+ "rewards/chosen": 0.014657974243164062,
68
+ "rewards/margins": -0.0012350091710686684,
69
+ "rewards/rejected": 0.015892982482910156,
70
+ "step": 8
71
+ },
72
+ {
73
+ "epoch": 0.01167883211678832,
74
+ "grad_norm": 1.2461222410202026,
75
+ "learning_rate": 1.744186046511628e-06,
76
+ "logits/chosen": 5.435908317565918,
77
+ "logits/rejected": 5.494542121887207,
78
+ "logps/chosen": -363.2003479003906,
79
+ "logps/rejected": -389.67376708984375,
80
+ "loss": 0.693260908126831,
81
+ "rewards/accuracies": 0.625,
82
+ "rewards/chosen": -0.028497030958533287,
83
+ "rewards/margins": 0.00012636138126254082,
84
+ "rewards/rejected": -0.028623390942811966,
85
+ "step": 10
86
+ },
87
+ {
88
+ "epoch": 0.014014598540145985,
89
+ "grad_norm": 1.4030137062072754,
90
+ "learning_rate": 2.131782945736434e-06,
91
+ "logits/chosen": 5.3550801277160645,
92
+ "logits/rejected": 5.375768661499023,
93
+ "logps/chosen": -370.96429443359375,
94
+ "logps/rejected": -402.4786071777344,
95
+ "loss": 0.6882913112640381,
96
+ "rewards/accuracies": 0.5,
97
+ "rewards/chosen": 0.01622028276324272,
98
+ "rewards/margins": 0.010086631402373314,
99
+ "rewards/rejected": 0.006133650429546833,
100
+ "step": 12
101
+ },
102
+ {
103
+ "epoch": 0.01635036496350365,
104
+ "grad_norm": 1.1157702207565308,
105
+ "learning_rate": 2.5193798449612402e-06,
106
+ "logits/chosen": 5.515308380126953,
107
+ "logits/rejected": 5.561104774475098,
108
+ "logps/chosen": -336.7254333496094,
109
+ "logps/rejected": -357.52203369140625,
110
+ "loss": 0.6896716356277466,
111
+ "rewards/accuracies": 0.625,
112
+ "rewards/chosen": -0.017319394275546074,
113
+ "rewards/margins": 0.007328510750085115,
114
+ "rewards/rejected": -0.024647902697324753,
115
+ "step": 14
116
+ },
117
+ {
118
+ "epoch": 0.018686131386861315,
119
+ "grad_norm": 0.9470655918121338,
120
+ "learning_rate": 2.9069767441860468e-06,
121
+ "logits/chosen": 5.553088665008545,
122
+ "logits/rejected": 5.582851886749268,
123
+ "logps/chosen": -415.6842041015625,
124
+ "logps/rejected": -441.1054992675781,
125
+ "loss": 0.6904245018959045,
126
+ "rewards/accuracies": 0.5625,
127
+ "rewards/chosen": 0.03270244598388672,
128
+ "rewards/margins": 0.005826758686453104,
129
+ "rewards/rejected": 0.026875685900449753,
130
+ "step": 16
131
+ },
132
+ {
133
+ "epoch": 0.021021897810218976,
134
+ "grad_norm": 1.4397331476211548,
135
+ "learning_rate": 3.2945736434108533e-06,
136
+ "logits/chosen": 5.440742015838623,
137
+ "logits/rejected": 5.489529132843018,
138
+ "logps/chosen": -392.46221923828125,
139
+ "logps/rejected": -420.1712341308594,
140
+ "loss": 0.683630108833313,
141
+ "rewards/accuracies": 0.5625,
142
+ "rewards/chosen": 0.011020278558135033,
143
+ "rewards/margins": 0.01951923407614231,
144
+ "rewards/rejected": -0.008498954586684704,
145
+ "step": 18
146
+ },
147
+ {
148
+ "epoch": 0.02335766423357664,
149
+ "grad_norm": 1.5941083431243896,
150
+ "learning_rate": 3.6821705426356594e-06,
151
+ "logits/chosen": 5.318347930908203,
152
+ "logits/rejected": 5.397945404052734,
153
+ "logps/chosen": -345.2221374511719,
154
+ "logps/rejected": -365.9537048339844,
155
+ "loss": 0.6902388334274292,
156
+ "rewards/accuracies": 0.5625,
157
+ "rewards/chosen": 0.006536484230309725,
158
+ "rewards/margins": 0.006013393402099609,
159
+ "rewards/rejected": 0.0005230908282101154,
160
+ "step": 20
161
+ },
162
+ {
163
+ "epoch": 0.025693430656934305,
164
+ "grad_norm": 1.1363905668258667,
165
+ "learning_rate": 4.0697674418604655e-06,
166
+ "logits/chosen": 5.632981300354004,
167
+ "logits/rejected": 5.7265520095825195,
168
+ "logps/chosen": -347.9439697265625,
169
+ "logps/rejected": -370.65777587890625,
170
+ "loss": 0.691262423992157,
171
+ "rewards/accuracies": 0.5,
172
+ "rewards/chosen": 0.011908342130482197,
173
+ "rewards/margins": 0.004538153763860464,
174
+ "rewards/rejected": 0.007370188366621733,
175
+ "step": 22
176
+ },
177
+ {
178
+ "epoch": 0.02802919708029197,
179
+ "grad_norm": 1.0684627294540405,
180
+ "learning_rate": 4.457364341085272e-06,
181
+ "logits/chosen": 5.35699987411499,
182
+ "logits/rejected": 5.405580520629883,
183
+ "logps/chosen": -347.1539001464844,
184
+ "logps/rejected": -377.6044921875,
185
+ "loss": 0.6769475936889648,
186
+ "rewards/accuracies": 0.875,
187
+ "rewards/chosen": 0.01244144607335329,
188
+ "rewards/margins": 0.03289356082677841,
189
+ "rewards/rejected": -0.020452119410037994,
190
+ "step": 24
191
+ },
192
+ {
193
+ "epoch": 0.029197080291970802,
194
+ "eval_logits/chosen": 5.295141220092773,
195
+ "eval_logits/rejected": 5.345211029052734,
196
+ "eval_logps/chosen": -370.1607666015625,
197
+ "eval_logps/rejected": -395.7251892089844,
198
+ "eval_loss": 0.6836819648742676,
199
+ "eval_rewards/accuracies": 0.665354311466217,
200
+ "eval_rewards/chosen": 0.024636391550302505,
201
+ "eval_rewards/margins": 0.019555427134037018,
202
+ "eval_rewards/rejected": 0.005080964416265488,
203
+ "eval_runtime": 454.4375,
204
+ "eval_samples_per_second": 1.677,
205
+ "eval_steps_per_second": 1.677,
206
+ "step": 25
207
+ },
208
+ {
209
+ "epoch": 0.030364963503649634,
210
+ "grad_norm": 1.592353105545044,
211
+ "learning_rate": 4.844961240310078e-06,
212
+ "logits/chosen": 5.157042026519775,
213
+ "logits/rejected": 5.244912147521973,
214
+ "logps/chosen": -387.54876708984375,
215
+ "logps/rejected": -412.0630187988281,
216
+ "loss": 0.6849788427352905,
217
+ "rewards/accuracies": 0.625,
218
+ "rewards/chosen": 0.026385309174656868,
219
+ "rewards/margins": 0.016966437920928,
220
+ "rewards/rejected": 0.009418869391083717,
221
+ "step": 26
222
+ },
223
+ {
224
+ "epoch": 0.0327007299270073,
225
+ "grad_norm": 1.3181558847427368,
226
+ "learning_rate": 5.232558139534884e-06,
227
+ "logits/chosen": 5.545513153076172,
228
+ "logits/rejected": 5.54400110244751,
229
+ "logps/chosen": -360.41650390625,
230
+ "logps/rejected": -391.2162170410156,
231
+ "loss": 0.675189733505249,
232
+ "rewards/accuracies": 0.8125,
233
+ "rewards/chosen": 0.045946408063173294,
234
+ "rewards/margins": 0.03675585240125656,
235
+ "rewards/rejected": 0.009190557524561882,
236
+ "step": 28
237
+ },
238
+ {
239
+ "epoch": 0.035036496350364967,
240
+ "grad_norm": 1.443650722503662,
241
+ "learning_rate": 5.620155038759691e-06,
242
+ "logits/chosen": 5.136168003082275,
243
+ "logits/rejected": 5.239327907562256,
244
+ "logps/chosen": -378.6293640136719,
245
+ "logps/rejected": -405.3665466308594,
246
+ "loss": 0.6752142310142517,
247
+ "rewards/accuracies": 0.8125,
248
+ "rewards/chosen": 0.04194517061114311,
249
+ "rewards/margins": 0.03668833151459694,
250
+ "rewards/rejected": 0.005256845150142908,
251
+ "step": 30
252
+ },
253
+ {
254
+ "epoch": 0.03737226277372263,
255
+ "grad_norm": 1.379568338394165,
256
+ "learning_rate": 6.007751937984497e-06,
257
+ "logits/chosen": 5.411487579345703,
258
+ "logits/rejected": 5.427243232727051,
259
+ "logps/chosen": -358.5367736816406,
260
+ "logps/rejected": -382.4181213378906,
261
+ "loss": 0.6700581312179565,
262
+ "rewards/accuracies": 0.875,
263
+ "rewards/chosen": 0.06658173352479935,
264
+ "rewards/margins": 0.047193337231874466,
265
+ "rewards/rejected": 0.019388392567634583,
266
+ "step": 32
267
+ },
268
+ {
269
+ "epoch": 0.039708029197080295,
270
+ "grad_norm": 1.3260451555252075,
271
+ "learning_rate": 6.395348837209303e-06,
272
+ "logits/chosen": 5.207217216491699,
273
+ "logits/rejected": 5.254848480224609,
274
+ "logps/chosen": -326.9423828125,
275
+ "logps/rejected": -346.52081298828125,
276
+ "loss": 0.6610866785049438,
277
+ "rewards/accuracies": 0.9375,
278
+ "rewards/chosen": 0.07038869708776474,
279
+ "rewards/margins": 0.06587495654821396,
280
+ "rewards/rejected": 0.0045137410052120686,
281
+ "step": 34
282
+ },
283
+ {
284
+ "epoch": 0.04204379562043795,
285
+ "grad_norm": 1.5776340961456299,
286
+ "learning_rate": 6.782945736434108e-06,
287
+ "logits/chosen": 5.550538063049316,
288
+ "logits/rejected": 5.6374335289001465,
289
+ "logps/chosen": -359.9613952636719,
290
+ "logps/rejected": -384.31683349609375,
291
+ "loss": 0.6281551718711853,
292
+ "rewards/accuracies": 1.0,
293
+ "rewards/chosen": 0.11738375574350357,
294
+ "rewards/margins": 0.1363767683506012,
295
+ "rewards/rejected": -0.018992995843291283,
296
+ "step": 36
297
+ },
298
+ {
299
+ "epoch": 0.04437956204379562,
300
+ "grad_norm": 1.8589071035385132,
301
+ "learning_rate": 7.170542635658915e-06,
302
+ "logits/chosen": 5.39143180847168,
303
+ "logits/rejected": 5.412029266357422,
304
+ "logps/chosen": -325.8544616699219,
305
+ "logps/rejected": -351.9772644042969,
306
+ "loss": 0.6270830631256104,
307
+ "rewards/accuracies": 0.9375,
308
+ "rewards/chosen": 0.1617884635925293,
309
+ "rewards/margins": 0.1388537436723709,
310
+ "rewards/rejected": 0.022934721782803535,
311
+ "step": 38
312
+ },
313
+ {
314
+ "epoch": 0.04671532846715328,
315
+ "grad_norm": 1.3231571912765503,
316
+ "learning_rate": 7.558139534883721e-06,
317
+ "logits/chosen": 5.189720153808594,
318
+ "logits/rejected": 5.203127384185791,
319
+ "logps/chosen": -343.3839111328125,
320
+ "logps/rejected": -374.7848205566406,
321
+ "loss": 0.641180157661438,
322
+ "rewards/accuracies": 0.875,
323
+ "rewards/chosen": 0.15248623490333557,
324
+ "rewards/margins": 0.11158552765846252,
325
+ "rewards/rejected": 0.04090070724487305,
326
+ "step": 40
327
+ },
328
+ {
329
+ "epoch": 0.049051094890510946,
330
+ "grad_norm": 2.5331315994262695,
331
+ "learning_rate": 7.945736434108528e-06,
332
+ "logits/chosen": 5.420182228088379,
333
+ "logits/rejected": 5.45302677154541,
334
+ "logps/chosen": -341.813720703125,
335
+ "logps/rejected": -372.44952392578125,
336
+ "loss": 0.6093671321868896,
337
+ "rewards/accuracies": 0.9375,
338
+ "rewards/chosen": 0.2898235321044922,
339
+ "rewards/margins": 0.18158456683158875,
340
+ "rewards/rejected": 0.10823898762464523,
341
+ "step": 42
342
+ },
343
+ {
344
+ "epoch": 0.05138686131386861,
345
+ "grad_norm": 1.5247384309768677,
346
+ "learning_rate": 8.333333333333334e-06,
347
+ "logits/chosen": 5.383636951446533,
348
+ "logits/rejected": 5.397551536560059,
349
+ "logps/chosen": -354.49627685546875,
350
+ "logps/rejected": -376.88818359375,
351
+ "loss": 0.5815833210945129,
352
+ "rewards/accuracies": 0.8125,
353
+ "rewards/chosen": 0.32459571957588196,
354
+ "rewards/margins": 0.2510552406311035,
355
+ "rewards/rejected": 0.07354050129652023,
356
+ "step": 44
357
+ },
358
+ {
359
+ "epoch": 0.053722627737226275,
360
+ "grad_norm": 2.0814144611358643,
361
+ "learning_rate": 8.72093023255814e-06,
362
+ "logits/chosen": 5.269731044769287,
363
+ "logits/rejected": 5.287116050720215,
364
+ "logps/chosen": -331.1025390625,
365
+ "logps/rejected": -362.90118408203125,
366
+ "loss": 0.5269681215286255,
367
+ "rewards/accuracies": 0.9375,
368
+ "rewards/chosen": 0.6465227603912354,
369
+ "rewards/margins": 0.37582656741142273,
370
+ "rewards/rejected": 0.27069616317749023,
371
+ "step": 46
372
+ },
373
+ {
374
+ "epoch": 0.05605839416058394,
375
+ "grad_norm": 1.769063115119934,
376
+ "learning_rate": 9.108527131782946e-06,
377
+ "logits/chosen": 5.472540855407715,
378
+ "logits/rejected": 5.465417861938477,
379
+ "logps/chosen": -369.40283203125,
380
+ "logps/rejected": -400.18438720703125,
381
+ "loss": 0.5066201686859131,
382
+ "rewards/accuracies": 1.0,
383
+ "rewards/chosen": 0.6377636194229126,
384
+ "rewards/margins": 0.42650213837623596,
385
+ "rewards/rejected": 0.21126146614551544,
386
+ "step": 48
387
+ },
388
+ {
389
+ "epoch": 0.058394160583941604,
390
+ "grad_norm": 2.84169602394104,
391
+ "learning_rate": 9.496124031007753e-06,
392
+ "logits/chosen": 5.050387382507324,
393
+ "logits/rejected": 5.112288951873779,
394
+ "logps/chosen": -363.4556579589844,
395
+ "logps/rejected": -397.8169860839844,
396
+ "loss": 0.529259979724884,
397
+ "rewards/accuracies": 1.0,
398
+ "rewards/chosen": 0.7923164367675781,
399
+ "rewards/margins": 0.3787059783935547,
400
+ "rewards/rejected": 0.4136104881763458,
401
+ "step": 50
402
+ },
403
+ {
404
+ "epoch": 0.058394160583941604,
405
+ "eval_logits/chosen": 5.22359037399292,
406
+ "eval_logits/rejected": 5.286833763122559,
407
+ "eval_logps/chosen": -361.462890625,
408
+ "eval_logps/rejected": -392.5708312988281,
409
+ "eval_loss": 0.4610801041126251,
410
+ "eval_rewards/accuracies": 0.9619422554969788,
411
+ "eval_rewards/chosen": 0.8944254517555237,
412
+ "eval_rewards/margins": 0.5739086270332336,
413
+ "eval_rewards/rejected": 0.3205168545246124,
414
+ "eval_runtime": 454.5598,
415
+ "eval_samples_per_second": 1.676,
416
+ "eval_steps_per_second": 1.676,
417
+ "step": 50
418
+ },
419
+ {
420
+ "epoch": 0.06072992700729927,
421
+ "grad_norm": 1.6907895803451538,
422
+ "learning_rate": 9.883720930232558e-06,
423
+ "logits/chosen": 5.486469268798828,
424
+ "logits/rejected": 5.541717529296875,
425
+ "logps/chosen": -343.4534606933594,
426
+ "logps/rejected": -379.39508056640625,
427
+ "loss": 0.44602835178375244,
428
+ "rewards/accuracies": 0.9375,
429
+ "rewards/chosen": 0.9869746565818787,
430
+ "rewards/margins": 0.6056646108627319,
431
+ "rewards/rejected": 0.3813100755214691,
432
+ "step": 52
433
+ },
434
+ {
435
+ "epoch": 0.06306569343065693,
436
+ "grad_norm": 1.9458682537078857,
437
+ "learning_rate": 1.0271317829457365e-05,
438
+ "logits/chosen": 5.169528961181641,
439
+ "logits/rejected": 5.2688751220703125,
440
+ "logps/chosen": -379.5437316894531,
441
+ "logps/rejected": -401.5587463378906,
442
+ "loss": 0.43609702587127686,
443
+ "rewards/accuracies": 1.0,
444
+ "rewards/chosen": 0.7794930934906006,
445
+ "rewards/margins": 0.6265671253204346,
446
+ "rewards/rejected": 0.15292587876319885,
447
+ "step": 54
448
+ },
449
+ {
450
+ "epoch": 0.0654014598540146,
451
+ "grad_norm": 2.1266520023345947,
452
+ "learning_rate": 1.065891472868217e-05,
453
+ "logits/chosen": 5.097426414489746,
454
+ "logits/rejected": 5.15327262878418,
455
+ "logps/chosen": -378.0788269042969,
456
+ "logps/rejected": -413.27392578125,
457
+ "loss": 0.3928414583206177,
458
+ "rewards/accuracies": 0.9375,
459
+ "rewards/chosen": 1.274291753768921,
460
+ "rewards/margins": 0.7864217758178711,
461
+ "rewards/rejected": 0.4878700375556946,
462
+ "step": 56
463
+ },
464
+ {
465
+ "epoch": 0.06773722627737226,
466
+ "grad_norm": 1.5381489992141724,
467
+ "learning_rate": 1.1046511627906977e-05,
468
+ "logits/chosen": 5.138954162597656,
469
+ "logits/rejected": 5.20254373550415,
470
+ "logps/chosen": -372.93438720703125,
471
+ "logps/rejected": -401.8287658691406,
472
+ "loss": 0.35855019092559814,
473
+ "rewards/accuracies": 0.875,
474
+ "rewards/chosen": 1.2897911071777344,
475
+ "rewards/margins": 0.9354276061058044,
476
+ "rewards/rejected": 0.35436347126960754,
477
+ "step": 58
478
+ },
479
+ {
480
+ "epoch": 0.07007299270072993,
481
+ "grad_norm": 2.358330726623535,
482
+ "learning_rate": 1.1434108527131783e-05,
483
+ "logits/chosen": 5.071888446807861,
484
+ "logits/rejected": 5.187964916229248,
485
+ "logps/chosen": -360.984619140625,
486
+ "logps/rejected": -392.3192138671875,
487
+ "loss": 0.42801612615585327,
488
+ "rewards/accuracies": 0.875,
489
+ "rewards/chosen": 1.3823509216308594,
490
+ "rewards/margins": 0.729066014289856,
491
+ "rewards/rejected": 0.6532848477363586,
492
+ "step": 60
493
+ },
494
+ {
495
+ "epoch": 0.07240875912408759,
496
+ "grad_norm": 2.177586317062378,
497
+ "learning_rate": 1.182170542635659e-05,
498
+ "logits/chosen": 5.264093399047852,
499
+ "logits/rejected": 5.310842990875244,
500
+ "logps/chosen": -364.808349609375,
501
+ "logps/rejected": -401.0321044921875,
502
+ "loss": 0.31365492939949036,
503
+ "rewards/accuracies": 1.0,
504
+ "rewards/chosen": 1.6637591123580933,
505
+ "rewards/margins": 1.0887457132339478,
506
+ "rewards/rejected": 0.5750135183334351,
507
+ "step": 62
508
+ },
509
+ {
510
+ "epoch": 0.07474452554744526,
511
+ "grad_norm": 1.697789192199707,
512
+ "learning_rate": 1.2209302325581395e-05,
513
+ "logits/chosen": 5.191982269287109,
514
+ "logits/rejected": 5.261416912078857,
515
+ "logps/chosen": -359.8249816894531,
516
+ "logps/rejected": -397.2122497558594,
517
+ "loss": 0.3037749230861664,
518
+ "rewards/accuracies": 1.0,
519
+ "rewards/chosen": 1.6470392942428589,
520
+ "rewards/margins": 1.114844799041748,
521
+ "rewards/rejected": 0.5321945548057556,
522
+ "step": 64
523
+ },
524
+ {
525
+ "epoch": 0.07708029197080292,
526
+ "grad_norm": 1.3219914436340332,
527
+ "learning_rate": 1.2596899224806202e-05,
528
+ "logits/chosen": 5.293405532836914,
529
+ "logits/rejected": 5.3094048500061035,
530
+ "logps/chosen": -352.3752136230469,
531
+ "logps/rejected": -392.6779479980469,
532
+ "loss": 0.25026455521583557,
533
+ "rewards/accuracies": 1.0,
534
+ "rewards/chosen": 1.5671364068984985,
535
+ "rewards/margins": 1.4098074436187744,
536
+ "rewards/rejected": 0.15732917189598083,
537
+ "step": 66
538
+ },
539
+ {
540
+ "epoch": 0.07941605839416059,
541
+ "grad_norm": 1.8173967599868774,
542
+ "learning_rate": 1.2984496124031009e-05,
543
+ "logits/chosen": 5.025746822357178,
544
+ "logits/rejected": 5.114965438842773,
545
+ "logps/chosen": -319.99700927734375,
546
+ "logps/rejected": -364.115234375,
547
+ "loss": 0.3108353912830353,
548
+ "rewards/accuracies": 0.9375,
549
+ "rewards/chosen": 1.4788665771484375,
550
+ "rewards/margins": 1.2637410163879395,
551
+ "rewards/rejected": 0.2151254564523697,
552
+ "step": 68
553
+ },
554
+ {
555
+ "epoch": 0.08175182481751825,
556
+ "grad_norm": 1.0658400058746338,
557
+ "learning_rate": 1.3372093023255814e-05,
558
+ "logits/chosen": 4.945235729217529,
559
+ "logits/rejected": 4.959147930145264,
560
+ "logps/chosen": -383.84033203125,
561
+ "logps/rejected": -431.7752685546875,
562
+ "loss": 0.22991834580898285,
563
+ "rewards/accuracies": 1.0,
564
+ "rewards/chosen": 1.3950352668762207,
565
+ "rewards/margins": 1.4965243339538574,
566
+ "rewards/rejected": -0.1014888733625412,
567
+ "step": 70
568
+ },
569
+ {
570
+ "epoch": 0.0840875912408759,
571
+ "grad_norm": 1.0350896120071411,
572
+ "learning_rate": 1.375968992248062e-05,
573
+ "logits/chosen": 5.00426721572876,
574
+ "logits/rejected": 5.120238780975342,
575
+ "logps/chosen": -350.9471435546875,
576
+ "logps/rejected": -382.6837158203125,
577
+ "loss": 0.22603684663772583,
578
+ "rewards/accuracies": 1.0,
579
+ "rewards/chosen": 1.2978975772857666,
580
+ "rewards/margins": 1.644275426864624,
581
+ "rewards/rejected": -0.34637776017189026,
582
+ "step": 72
583
+ },
584
+ {
585
+ "epoch": 0.08642335766423358,
586
+ "grad_norm": 1.1595423221588135,
587
+ "learning_rate": 1.4147286821705426e-05,
588
+ "logits/chosen": 4.890130043029785,
589
+ "logits/rejected": 4.9504714012146,
590
+ "logps/chosen": -352.34967041015625,
591
+ "logps/rejected": -399.23028564453125,
592
+ "loss": 0.18921935558319092,
593
+ "rewards/accuracies": 1.0,
594
+ "rewards/chosen": 1.1984589099884033,
595
+ "rewards/margins": 1.7495291233062744,
596
+ "rewards/rejected": -0.5510700941085815,
597
+ "step": 74
598
+ },
599
+ {
600
+ "epoch": 0.08759124087591241,
601
+ "eval_logits/chosen": 4.930174827575684,
602
+ "eval_logits/rejected": 5.032296657562256,
603
+ "eval_logps/chosen": -359.19647216796875,
604
+ "eval_logps/rejected": -405.1120300292969,
605
+ "eval_loss": 0.16020436584949493,
606
+ "eval_rewards/accuracies": 0.9960629940032959,
607
+ "eval_rewards/chosen": 1.1210675239562988,
608
+ "eval_rewards/margins": 2.0546727180480957,
609
+ "eval_rewards/rejected": -0.9336051344871521,
610
+ "eval_runtime": 454.3435,
611
+ "eval_samples_per_second": 1.677,
612
+ "eval_steps_per_second": 1.677,
613
+ "step": 75
614
+ },
615
+ {
616
+ "epoch": 0.08875912408759123,
617
+ "grad_norm": 1.1433167457580566,
618
+ "learning_rate": 1.4534883720930233e-05,
619
+ "logits/chosen": 5.037275314331055,
620
+ "logits/rejected": 5.1315507888793945,
621
+ "logps/chosen": -313.110595703125,
622
+ "logps/rejected": -356.1000061035156,
623
+ "loss": 0.15998858213424683,
624
+ "rewards/accuracies": 1.0,
625
+ "rewards/chosen": 1.2128857374191284,
626
+ "rewards/margins": 2.0945115089416504,
627
+ "rewards/rejected": -0.8816256523132324,
628
+ "step": 76
629
+ },
630
+ {
631
+ "epoch": 0.0910948905109489,
632
+ "grad_norm": 0.9839214086532593,
633
+ "learning_rate": 1.4922480620155039e-05,
634
+ "logits/chosen": 4.817085266113281,
635
+ "logits/rejected": 4.874035835266113,
636
+ "logps/chosen": -366.2629089355469,
637
+ "logps/rejected": -405.7989196777344,
638
+ "loss": 0.1894684135913849,
639
+ "rewards/accuracies": 1.0,
640
+ "rewards/chosen": 1.0605502128601074,
641
+ "rewards/margins": 1.90762460231781,
642
+ "rewards/rejected": -0.8470743894577026,
643
+ "step": 78
644
+ },
645
+ {
646
+ "epoch": 0.09343065693430656,
647
+ "grad_norm": 0.9212782979011536,
648
+ "learning_rate": 1.5310077519379846e-05,
649
+ "logits/chosen": 5.046716690063477,
650
+ "logits/rejected": 5.157979965209961,
651
+ "logps/chosen": -348.0658264160156,
652
+ "logps/rejected": -395.23870849609375,
653
+ "loss": 0.15948188304901123,
654
+ "rewards/accuracies": 1.0,
655
+ "rewards/chosen": 0.676516056060791,
656
+ "rewards/margins": 2.167430877685547,
657
+ "rewards/rejected": -1.4909145832061768,
658
+ "step": 80
659
+ },
660
+ {
661
+ "epoch": 0.09576642335766423,
662
+ "grad_norm": 0.9820688366889954,
663
+ "learning_rate": 1.569767441860465e-05,
664
+ "logits/chosen": 4.690741539001465,
665
+ "logits/rejected": 4.771791458129883,
666
+ "logps/chosen": -378.8666076660156,
667
+ "logps/rejected": -436.9100036621094,
668
+ "loss": 0.12085139006376266,
669
+ "rewards/accuracies": 1.0,
670
+ "rewards/chosen": 0.8719685077667236,
671
+ "rewards/margins": 2.646538257598877,
672
+ "rewards/rejected": -1.7745698690414429,
673
+ "step": 82
674
+ },
675
+ {
676
+ "epoch": 0.09810218978102189,
677
+ "grad_norm": 0.66785728931427,
678
+ "learning_rate": 1.608527131782946e-05,
679
+ "logits/chosen": 4.880465984344482,
680
+ "logits/rejected": 4.961792945861816,
681
+ "logps/chosen": -346.51214599609375,
682
+ "logps/rejected": -400.1110534667969,
683
+ "loss": 0.08720710873603821,
684
+ "rewards/accuracies": 1.0,
685
+ "rewards/chosen": 1.1337480545043945,
686
+ "rewards/margins": 2.903944253921509,
687
+ "rewards/rejected": -1.7701961994171143,
688
+ "step": 84
689
+ },
690
+ {
691
+ "epoch": 0.10043795620437956,
692
+ "grad_norm": 0.5760660767555237,
693
+ "learning_rate": 1.647286821705426e-05,
694
+ "logits/chosen": 4.464397430419922,
695
+ "logits/rejected": 4.680055618286133,
696
+ "logps/chosen": -341.7489318847656,
697
+ "logps/rejected": -398.322021484375,
698
+ "loss": 0.07942983508110046,
699
+ "rewards/accuracies": 1.0,
700
+ "rewards/chosen": 1.2459325790405273,
701
+ "rewards/margins": 3.0152552127838135,
702
+ "rewards/rejected": -1.7693227529525757,
703
+ "step": 86
704
+ },
705
+ {
706
+ "epoch": 0.10277372262773722,
707
+ "grad_norm": 1.6020294427871704,
708
+ "learning_rate": 1.686046511627907e-05,
709
+ "logits/chosen": 4.563863277435303,
710
+ "logits/rejected": 4.680974960327148,
711
+ "logps/chosen": -344.9147644042969,
712
+ "logps/rejected": -395.4453125,
713
+ "loss": 0.1258174479007721,
714
+ "rewards/accuracies": 0.9375,
715
+ "rewards/chosen": 1.0706769227981567,
716
+ "rewards/margins": 3.118717670440674,
717
+ "rewards/rejected": -2.0480403900146484,
718
+ "step": 88
719
+ },
720
+ {
721
+ "epoch": 0.10510948905109489,
722
+ "grad_norm": 0.46413859724998474,
723
+ "learning_rate": 1.7248062015503875e-05,
724
+ "logits/chosen": 4.4989237785339355,
725
+ "logits/rejected": 4.673248291015625,
726
+ "logps/chosen": -326.9678649902344,
727
+ "logps/rejected": -388.4164123535156,
728
+ "loss": 0.06663060188293457,
729
+ "rewards/accuracies": 1.0,
730
+ "rewards/chosen": 1.4128761291503906,
731
+ "rewards/margins": 3.760685920715332,
732
+ "rewards/rejected": -2.3478102684020996,
733
+ "step": 90
734
+ },
735
+ {
736
+ "epoch": 0.10744525547445255,
737
+ "grad_norm": 0.6699568629264832,
738
+ "learning_rate": 1.7635658914728684e-05,
739
+ "logits/chosen": 4.7294535636901855,
740
+ "logits/rejected": 4.813880920410156,
741
+ "logps/chosen": -362.7267150878906,
742
+ "logps/rejected": -439.2985534667969,
743
+ "loss": 0.04481709748506546,
744
+ "rewards/accuracies": 1.0,
745
+ "rewards/chosen": 1.477597713470459,
746
+ "rewards/margins": 4.37883186340332,
747
+ "rewards/rejected": -2.9012341499328613,
748
+ "step": 92
749
+ },
750
+ {
751
+ "epoch": 0.10978102189781022,
752
+ "grad_norm": 0.4152977168560028,
753
+ "learning_rate": 1.802325581395349e-05,
754
+ "logits/chosen": 4.785149574279785,
755
+ "logits/rejected": 4.891542434692383,
756
+ "logps/chosen": -381.59246826171875,
757
+ "logps/rejected": -444.2817687988281,
758
+ "loss": 0.05632612109184265,
759
+ "rewards/accuracies": 1.0,
760
+ "rewards/chosen": 0.71366286277771,
761
+ "rewards/margins": 3.4584720134735107,
762
+ "rewards/rejected": -2.744809150695801,
763
+ "step": 94
764
+ },
765
+ {
766
+ "epoch": 0.11211678832116788,
767
+ "grad_norm": 0.3152717649936676,
768
+ "learning_rate": 1.8410852713178295e-05,
769
+ "logits/chosen": 4.603940486907959,
770
+ "logits/rejected": 4.804995536804199,
771
+ "logps/chosen": -356.7286376953125,
772
+ "logps/rejected": -414.69635009765625,
773
+ "loss": 0.040920041501522064,
774
+ "rewards/accuracies": 1.0,
775
+ "rewards/chosen": 1.7566397190093994,
776
+ "rewards/margins": 4.020595550537109,
777
+ "rewards/rejected": -2.263956069946289,
778
+ "step": 96
779
+ },
780
+ {
781
+ "epoch": 0.11445255474452555,
782
+ "grad_norm": 0.37698569893836975,
783
+ "learning_rate": 1.8798449612403103e-05,
784
+ "logits/chosen": 4.558542728424072,
785
+ "logits/rejected": 4.690641403198242,
786
+ "logps/chosen": -339.794189453125,
787
+ "logps/rejected": -413.8865966796875,
788
+ "loss": 0.025794224813580513,
789
+ "rewards/accuracies": 1.0,
790
+ "rewards/chosen": 1.3867536783218384,
791
+ "rewards/margins": 4.6542744636535645,
792
+ "rewards/rejected": -3.2675204277038574,
793
+ "step": 98
794
+ },
795
+ {
796
+ "epoch": 0.11678832116788321,
797
+ "grad_norm": 0.15023073554039001,
798
+ "learning_rate": 1.918604651162791e-05,
799
+ "logits/chosen": 4.387497425079346,
800
+ "logits/rejected": 4.494588375091553,
801
+ "logps/chosen": -346.2568054199219,
802
+ "logps/rejected": -418.9315185546875,
803
+ "loss": 0.015155203640460968,
804
+ "rewards/accuracies": 1.0,
805
+ "rewards/chosen": 1.7938623428344727,
806
+ "rewards/margins": 4.942529201507568,
807
+ "rewards/rejected": -3.1486666202545166,
808
+ "step": 100
809
+ },
810
+ {
811
+ "epoch": 0.11678832116788321,
812
+ "eval_logits/chosen": 4.285891056060791,
813
+ "eval_logits/rejected": 4.425926208496094,
814
+ "eval_logps/chosen": -353.15850830078125,
815
+ "eval_logps/rejected": -424.4124755859375,
816
+ "eval_loss": 0.04428481683135033,
817
+ "eval_rewards/accuracies": 0.9921259880065918,
818
+ "eval_rewards/chosen": 1.7248634099960327,
819
+ "eval_rewards/margins": 4.588510513305664,
820
+ "eval_rewards/rejected": -2.863647222518921,
821
+ "eval_runtime": 454.7251,
822
+ "eval_samples_per_second": 1.676,
823
+ "eval_steps_per_second": 1.676,
824
+ "step": 100
825
+ }
826
+ ],
827
+ "logging_steps": 2,
828
+ "max_steps": 2571,
829
+ "num_input_tokens_seen": 0,
830
+ "num_train_epochs": 3,
831
+ "save_steps": 100,
832
+ "stateful_callbacks": {
833
+ "EarlyStoppingCallback": {
834
+ "args": {
835
+ "early_stopping_patience": 5,
836
+ "early_stopping_threshold": 0.001
837
+ },
838
+ "attributes": {
839
+ "early_stopping_patience_counter": 0
840
+ }
841
+ },
842
+ "TrainerControl": {
843
+ "args": {
844
+ "should_epoch_stop": false,
845
+ "should_evaluate": false,
846
+ "should_log": false,
847
+ "should_save": true,
848
+ "should_training_stop": false
849
+ },
850
+ "attributes": {}
851
+ }
852
+ },
853
+ "total_flos": 0.0,
854
+ "train_batch_size": 1,
855
+ "trial_name": null,
856
+ "trial_params": null
857
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21875ef630d3e8f528dce67596a0d783fd5cf223e6e245a98026996d1f3d3ade
3
+ size 5752
config_resolved.yaml CHANGED
@@ -1,34 +1,37 @@
1
  run:
2
- run_dir: runs/grpo_14b_run1
3
- model:
4
- repo_id: /workspace/Models/Qwen2.5-Coder-14B-CPT-SFT_v2
5
- tokenizer_name: Qwen/Qwen2.5-Coder-14B
6
- load_in_8bit: false
7
- load_in_4bit: false
8
- torch_dtype: bfloat16
9
- device_map: auto
10
- trust_remote_code: true
11
  wandb:
12
  enabled: true
13
- project: rl-training
14
  entity: null
15
  name: null
16
  tags:
17
- - grpo-lora
18
- - 14B-QWEN
19
  notes: null
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  data:
21
- train_jsonl: grpo_dataset.jsonl
22
  eval_jsonl: null
23
- eval_split_ratio: 0.0
24
- shuffle: true
25
- num_proc: 1
26
  prompt_field: prompt
27
- completions_field: completions
28
- scores_field: scores
29
- format_type: raw
30
- max_length: 2048
31
- min_completions: 2
32
  system_prompt: "You are a Hyperswitch Rust code analyzer. Identify functions/structs\
33
  \ that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain\
34
  \ the data flow and why each component must change:\n- Flow: [Input \u2192 Processing\
@@ -40,73 +43,51 @@ data:
40
  \ nested items: `status::StructName::Type::Name`\n3. Always explain \"must change\
41
  \ because\" and \"without this\"\n3. Types of components: function, struct, enum,\
42
  \ impl, trait\n4. If there is extra information (e.g., enum variants), include\
43
- \ that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n\
44
- \n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook\
45
- \ system routes events via EventClass enum. Flow: webhook \u2192 EventClass \u2192\
46
- \ handler \u2192 processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass)\
47
- \ must add Subscriptions variant because it defines event routing\u2014without\
48
- \ this, subscription events cannot be processed. The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus)\
49
- \ must map to EventType because it converts status to events\u2014without this,\
50
- \ status changes don't trigger webhooks. These are coupled: EventClass routes\
51
- \ to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\n\
52
- crates/common_enums/src/transformers.rs::SubscriptionStatus\n<EOS>\n"
53
- custom_template: '##INSTRUCTION
54
-
55
- {instruction}<|im_end|>
56
-
57
- {input}<|im_end|>
58
-
59
- {output}<|im_end|>'
60
- grpo:
61
- group_size: 4
62
- kl_coef: 0.05
63
- normalize_advantages: true
64
- reward_scaling: 1.0
65
- reward_bias: 0.0
66
- reward_clip: 5.0
67
- advantage_temperature: 1.0
68
- use_reference_model: false
69
- seed: 42
70
  peft:
71
  enabled: true
72
  r: 16
73
  lora_alpha: 32
74
  lora_dropout: 0.05
75
- target_modules:
76
- - q_proj
77
- - k_proj
78
- - v_proj
79
- - o_proj
80
- - gate_proj
81
- - up_proj
82
- - down_proj
83
  bias: none
84
- task_type: CAUSAL_LM
 
 
 
 
 
 
85
  train:
86
- output_dir: runs/grpo_14b_run1
87
- num_train_epochs: 2
88
  per_device_train_batch_size: 1
89
- gradient_accumulation_steps: 8
90
  per_device_eval_batch_size: 1
91
- learning_rate: 5.0e-06
92
- weight_decay: 0.01
93
- warmup_ratio: 0.05
 
94
  lr_scheduler_type: cosine
95
- fp16: false
96
- bf16: true
97
  max_grad_norm: 1.0
98
  gradient_checkpointing: true
99
- logging_steps: 5
 
100
  save_steps: 100
101
- save_total_limit: 2
102
- evaluation_strategy: 'no'
103
- dataloader_num_workers: 4
104
- dataloader_pin_memory: true
105
- remove_unused_columns: false
106
- report_to: []
107
- seed: 42
108
- ddp_find_unused_parameters: false
 
 
 
109
  merge:
110
- enabled: false
111
- upload:
112
- enabled: false
 
 
1
  run:
2
+ run_dir: ./runs/dpo_run_14b_v1
3
+ seed: 42
 
 
 
 
 
 
 
4
  wandb:
5
  enabled: true
6
+ project: dpo-training
7
  entity: null
8
  name: null
9
  tags:
10
+ - dpo-lora
11
+ - preference-optimization
12
  notes: null
13
+ model:
14
+ repo_id: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
15
+ revision: null
16
+ base_local_dir: base_model
17
+ trust_remote_code: true
18
+ tokenizer_use_fast: true
19
+ device_map: auto
20
+ torch_dtype: bfloat16
21
+ use_4bit: false
22
+ bnb_4bit_quant_type: nf4
23
+ bnb_4bit_use_double_quant: false
24
+ bnb_4bit_compute_dtype: bfloat16
25
+ attn_implementation: null
26
  data:
27
+ train_jsonl: dpo_pairs_generated.jsonl
28
  eval_jsonl: null
29
+ eval_split_ratio: 0.1
 
 
30
  prompt_field: prompt
31
+ chosen_field: chosen
32
+ rejected_field: rejected
33
+ score_field: f1_score
34
+ format_type: chatml
 
35
  system_prompt: "You are a Hyperswitch Rust code analyzer. Identify functions/structs\
36
  \ that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain\
37
  \ the data flow and why each component must change:\n- Flow: [Input \u2192 Processing\
 
43
  \ nested items: `status::StructName::Type::Name`\n3. Always explain \"must change\
44
  \ because\" and \"without this\"\n3. Types of components: function, struct, enum,\
45
  \ impl, trait\n4. If there is extra information (e.g., enum variants), include\
46
+ \ that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n"
47
+ max_length: 2048
48
+ shuffle: true
49
+ num_proc: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  peft:
51
  enabled: true
52
  r: 16
53
  lora_alpha: 32
54
  lora_dropout: 0.05
 
 
 
 
 
 
 
 
55
  bias: none
56
+ target_modules: auto
57
+ dpo:
58
+ beta: 0.1
59
+ label_smoothing: 0.0
60
+ loss_type: sigmoid
61
+ use_reference_model: true
62
+ reference_free: false
63
  train:
64
+ num_train_epochs: 3
 
65
  per_device_train_batch_size: 1
 
66
  per_device_eval_batch_size: 1
67
+ gradient_accumulation_steps: 8
68
+ learning_rate: 5e-5
69
+ weight_decay: 0.0
70
+ warmup_ratio: 0.1
71
  lr_scheduler_type: cosine
72
+ optim: adamw_torch
 
73
  max_grad_norm: 1.0
74
  gradient_checkpointing: true
75
+ logging_steps: 2
76
+ save_strategy: steps
77
  save_steps: 100
78
+ save_total_limit: 10
79
+ evaluation_strategy: steps
80
+ eval_steps: 25
81
+ load_best_model_at_end: true
82
+ early_stopping:
83
+ enabled: true
84
+ patience: 5
85
+ min_delta: 0.001
86
+ metric: eval_loss
87
+ mode: min
88
+ resume_from_checkpoint: auto
89
  merge:
90
+ enabled: true
91
+ merged_dtype: float16
92
+ max_shard_size: 2GB
93
+ output_dir: ./merged_14b_dpo_lora
logs/eval.jsonl CHANGED
@@ -1,52 +1,5 @@
1
- {"ts": "2025-12-26T18:41:10", "event": "eval", "step": 100, "epoch": 0.04219409282700422, "eval_loss": 1.2979938983917236, "eval_runtime": 682.1979, "eval_samples_per_second": 3.089, "eval_steps_per_second": 3.089, "perplexity": 3.661943064177116}
2
- {"ts": "2025-12-26T19:05:13", "event": "eval", "step": 200, "epoch": 0.08438818565400844, "eval_loss": 1.142486810684204, "eval_runtime": 668.2356, "eval_samples_per_second": 3.153, "eval_steps_per_second": 3.153, "perplexity": 3.134553722506413}
3
- {"ts": "2025-12-26T19:29:29", "event": "eval", "step": 300, "epoch": 0.12658227848101267, "eval_loss": 1.0952109098434448, "eval_runtime": 677.0652, "eval_samples_per_second": 3.112, "eval_steps_per_second": 3.112, "perplexity": 2.98981319793367}
4
- {"ts": "2025-12-26T19:53:55", "event": "eval", "step": 400, "epoch": 0.16877637130801687, "eval_loss": 1.0625108480453491, "eval_runtime": 691.0068, "eval_samples_per_second": 3.049, "eval_steps_per_second": 3.049, "perplexity": 2.893627334202045}
5
- {"ts": "2025-12-26T20:18:00", "event": "eval", "step": 500, "epoch": 0.2109704641350211, "eval_loss": 1.042096495628357, "eval_runtime": 692.4361, "eval_samples_per_second": 3.043, "eval_steps_per_second": 3.043, "perplexity": 2.8351546774213405}
6
- {"ts": "2025-12-26T20:42:00", "event": "eval", "step": 600, "epoch": 0.25316455696202533, "eval_loss": 1.0193854570388794, "eval_runtime": 677.9523, "eval_samples_per_second": 3.108, "eval_steps_per_second": 3.108, "perplexity": 2.7714910402016297}
7
- {"ts": "2025-12-26T21:06:13", "event": "eval", "step": 700, "epoch": 0.29535864978902954, "eval_loss": 0.996929407119751, "eval_runtime": 668.6398, "eval_samples_per_second": 3.151, "eval_steps_per_second": 3.151, "perplexity": 2.7099478932392134}
8
- {"ts": "2025-12-26T21:30:25", "event": "eval", "step": 800, "epoch": 0.33755274261603374, "eval_loss": 0.9800403714179993, "eval_runtime": 678.8306, "eval_samples_per_second": 3.104, "eval_steps_per_second": 3.104, "perplexity": 2.6645638119774637}
9
- {"ts": "2025-12-26T21:54:42", "event": "eval", "step": 900, "epoch": 0.379746835443038, "eval_loss": 0.9643027186393738, "eval_runtime": 691.7929, "eval_samples_per_second": 3.046, "eval_steps_per_second": 3.046, "perplexity": 2.6229580789054108}
10
- {"ts": "2025-12-26T22:18:39", "event": "eval", "step": 1000, "epoch": 0.4219409282700422, "eval_loss": 0.9487298727035522, "eval_runtime": 689.4288, "eval_samples_per_second": 3.056, "eval_steps_per_second": 3.056, "perplexity": 2.5824275636777196}
11
- {"ts": "2025-12-26T22:42:41", "event": "eval", "step": 1100, "epoch": 0.4641350210970464, "eval_loss": 0.9357889294624329, "eval_runtime": 676.9573, "eval_samples_per_second": 3.112, "eval_steps_per_second": 3.112, "perplexity": 2.549223822396605}
12
- {"ts": "2025-12-26T23:06:55", "event": "eval", "step": 1200, "epoch": 0.5063291139240507, "eval_loss": 0.9224098324775696, "eval_runtime": 669.7542, "eval_samples_per_second": 3.146, "eval_steps_per_second": 3.146, "perplexity": 2.515344651361619}
13
- {"ts": "2025-12-26T23:31:25", "event": "eval", "step": 1300, "epoch": 0.5485232067510548, "eval_loss": 0.9068717360496521, "eval_runtime": 680.7718, "eval_samples_per_second": 3.095, "eval_steps_per_second": 3.095, "perplexity": 2.476563059931004}
14
- {"ts": "2025-12-26T23:55:39", "event": "eval", "step": 1400, "epoch": 0.5907172995780591, "eval_loss": 0.8971880674362183, "eval_runtime": 692.8046, "eval_samples_per_second": 3.041, "eval_steps_per_second": 3.041, "perplexity": 2.452696587964245}
15
- {"ts": "2025-12-27T00:19:35", "event": "eval", "step": 1500, "epoch": 0.6329113924050633, "eval_loss": 0.887488842010498, "eval_runtime": 686.2804, "eval_samples_per_second": 3.07, "eval_steps_per_second": 3.07, "perplexity": 2.4290223274474503}
16
- {"ts": "2025-12-27T00:43:47", "event": "eval", "step": 1600, "epoch": 0.6751054852320675, "eval_loss": 0.8769772052764893, "eval_runtime": 677.9338, "eval_samples_per_second": 3.108, "eval_steps_per_second": 3.108, "perplexity": 2.403623054958293}
17
- {"ts": "2025-12-27T01:08:03", "event": "eval", "step": 1700, "epoch": 0.7172995780590717, "eval_loss": 0.8708170056343079, "eval_runtime": 670.3019, "eval_samples_per_second": 3.143, "eval_steps_per_second": 3.143, "perplexity": 2.388861769986548}
18
- {"ts": "2025-12-27T01:32:23", "event": "eval", "step": 1800, "epoch": 0.759493670886076, "eval_loss": 0.8625519275665283, "eval_runtime": 686.4271, "eval_samples_per_second": 3.07, "eval_steps_per_second": 3.07, "perplexity": 2.369199010020167}
19
- {"ts": "2025-12-27T01:56:20", "event": "eval", "step": 1900, "epoch": 0.8016877637130801, "eval_loss": 0.8546335697174072, "eval_runtime": 688.5301, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "perplexity": 2.3505129236377402}
20
- {"ts": "2025-12-27T02:20:26", "event": "eval", "step": 2000, "epoch": 0.8438818565400844, "eval_loss": 0.8460908532142639, "eval_runtime": 685.2518, "eval_samples_per_second": 3.075, "eval_steps_per_second": 3.075, "perplexity": 2.330518682256874}
21
- {"ts": "2025-12-27T02:44:39", "event": "eval", "step": 2100, "epoch": 0.8860759493670886, "eval_loss": 0.8401098847389221, "eval_runtime": 669.1149, "eval_samples_per_second": 3.149, "eval_steps_per_second": 3.149, "perplexity": 2.3166215241467625}
22
- {"ts": "2025-12-27T03:09:05", "event": "eval", "step": 2200, "epoch": 0.9282700421940928, "eval_loss": 0.8336610198020935, "eval_runtime": 674.5134, "eval_samples_per_second": 3.124, "eval_steps_per_second": 3.124, "perplexity": 2.3017300131082887}
23
- {"ts": "2025-12-27T03:33:21", "event": "eval", "step": 2300, "epoch": 0.9704641350210971, "eval_loss": 0.8281980156898499, "eval_runtime": 688.6136, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "perplexity": 2.289189937012629}
24
- {"ts": "2025-12-27T03:57:18", "event": "eval", "step": 2400, "epoch": 1.0126582278481013, "eval_loss": 0.8250564932823181, "eval_runtime": 691.5833, "eval_samples_per_second": 3.047, "eval_steps_per_second": 3.047, "perplexity": 2.282009679904965}
25
- {"ts": "2025-12-27T04:21:22", "event": "eval", "step": 2500, "epoch": 1.0548523206751055, "eval_loss": 0.8249453902244568, "eval_runtime": 679.4446, "eval_samples_per_second": 3.101, "eval_steps_per_second": 3.101, "perplexity": 2.2817561557353745}
26
- {"ts": "2025-12-27T04:45:36", "event": "eval", "step": 2600, "epoch": 1.0970464135021096, "eval_loss": 0.8211485743522644, "eval_runtime": 670.2276, "eval_samples_per_second": 3.144, "eval_steps_per_second": 3.144, "perplexity": 2.2731091736340194}
27
- {"ts": "2025-12-27T05:10:00", "event": "eval", "step": 2700, "epoch": 1.139240506329114, "eval_loss": 0.8155058026313782, "eval_runtime": 678.284, "eval_samples_per_second": 3.106, "eval_steps_per_second": 3.106, "perplexity": 2.2603186583878263}
28
- {"ts": "2025-12-27T05:34:19", "event": "eval", "step": 2800, "epoch": 1.1814345991561181, "eval_loss": 0.8124309182167053, "eval_runtime": 688.4759, "eval_samples_per_second": 3.06, "eval_steps_per_second": 3.06, "perplexity": 2.2533791143885313}
29
- {"ts": "2025-12-27T05:58:11", "event": "eval", "step": 2900, "epoch": 1.2236286919831223, "eval_loss": 0.8077136278152466, "eval_runtime": 685.6042, "eval_samples_per_second": 3.073, "eval_steps_per_second": 3.073, "perplexity": 2.2427743033735634}
30
- {"ts": "2025-12-27T06:22:11", "event": "eval", "step": 3000, "epoch": 1.2658227848101267, "eval_loss": 0.8033165335655212, "eval_runtime": 678.7554, "eval_samples_per_second": 3.104, "eval_steps_per_second": 3.104, "perplexity": 2.232934263027478}
31
- {"ts": "2025-12-27T06:46:29", "event": "eval", "step": 3100, "epoch": 1.3080168776371308, "eval_loss": 0.8010181784629822, "eval_runtime": 668.1688, "eval_samples_per_second": 3.153, "eval_steps_per_second": 3.153, "perplexity": 2.2278080803210654}
32
- {"ts": "2025-12-27T07:10:54", "event": "eval", "step": 3200, "epoch": 1.350210970464135, "eval_loss": 0.797160804271698, "eval_runtime": 680.976, "eval_samples_per_second": 3.094, "eval_steps_per_second": 3.094, "perplexity": 2.2192311437906307}
33
- {"ts": "2025-12-27T07:35:02", "event": "eval", "step": 3300, "epoch": 1.3924050632911391, "eval_loss": 0.795619547367096, "eval_runtime": 692.7157, "eval_samples_per_second": 3.042, "eval_steps_per_second": 3.042, "perplexity": 2.215813372975358}
34
- {"ts": "2025-12-27T07:59:01", "event": "eval", "step": 3400, "epoch": 1.4345991561181435, "eval_loss": 0.7917885780334473, "eval_runtime": 686.1689, "eval_samples_per_second": 3.071, "eval_steps_per_second": 3.071, "perplexity": 2.2073408991501657}
35
- {"ts": "2025-12-27T08:22:55", "event": "eval", "step": 3500, "epoch": 1.4767932489451476, "eval_loss": 0.7902651429176331, "eval_runtime": 672.312, "eval_samples_per_second": 3.134, "eval_steps_per_second": 3.134, "perplexity": 2.203980718670171}
36
- {"ts": "2025-12-27T08:47:16", "event": "eval", "step": 3600, "epoch": 1.518987341772152, "eval_loss": 0.785450279712677, "eval_runtime": 675.2312, "eval_samples_per_second": 3.12, "eval_steps_per_second": 3.12, "perplexity": 2.1933943593911716}
37
- {"ts": "2025-12-27T09:11:38", "event": "eval", "step": 3700, "epoch": 1.5611814345991561, "eval_loss": 0.7854447960853577, "eval_runtime": 687.7907, "eval_samples_per_second": 3.063, "eval_steps_per_second": 3.063, "perplexity": 2.193382331666918}
38
- {"ts": "2025-12-27T09:35:44", "event": "eval", "step": 3800, "epoch": 1.6033755274261603, "eval_loss": 0.778353214263916, "eval_runtime": 692.5522, "eval_samples_per_second": 3.042, "eval_steps_per_second": 3.042, "perplexity": 2.1778828044355443}
39
- {"ts": "2025-12-27T09:59:49", "event": "eval", "step": 3900, "epoch": 1.6455696202531644, "eval_loss": 0.7763351202011108, "eval_runtime": 682.0824, "eval_samples_per_second": 3.089, "eval_steps_per_second": 3.089, "perplexity": 2.173492064032179}
40
- {"ts": "2025-12-27T10:23:55", "event": "eval", "step": 4000, "epoch": 1.6877637130801688, "eval_loss": 0.7721371650695801, "eval_runtime": 668.395, "eval_samples_per_second": 3.152, "eval_steps_per_second": 3.152, "perplexity": 2.1643869666352633}
41
- {"ts": "2025-12-27T10:48:15", "event": "eval", "step": 4100, "epoch": 1.729957805907173, "eval_loss": 0.7690847516059875, "eval_runtime": 673.6323, "eval_samples_per_second": 3.128, "eval_steps_per_second": 3.128, "perplexity": 2.157790435509873}
42
- {"ts": "2025-12-27T11:12:31", "event": "eval", "step": 4200, "epoch": 1.7721518987341773, "eval_loss": 0.7676366567611694, "eval_runtime": 687.9619, "eval_samples_per_second": 3.063, "eval_steps_per_second": 3.063, "perplexity": 2.1546680116326113}
43
- {"ts": "2025-12-27T11:36:30", "event": "eval", "step": 4300, "epoch": 1.8143459915611815, "eval_loss": 0.7672964930534363, "eval_runtime": 688.4249, "eval_samples_per_second": 3.061, "eval_steps_per_second": 3.061, "perplexity": 2.1539351964184767}
44
- {"ts": "2025-12-27T12:00:29", "event": "eval", "step": 4400, "epoch": 1.8565400843881856, "eval_loss": 0.7635221481323242, "eval_runtime": 678.243, "eval_samples_per_second": 3.107, "eval_steps_per_second": 3.107, "perplexity": 2.1458208249008255}
45
- {"ts": "2025-12-27T12:24:47", "event": "eval", "step": 4500, "epoch": 1.8987341772151898, "eval_loss": 0.7600579857826233, "eval_runtime": 674.2593, "eval_samples_per_second": 3.125, "eval_steps_per_second": 3.125, "perplexity": 2.138400213711816}
46
- {"ts": "2025-12-27T12:49:10", "event": "eval", "step": 4600, "epoch": 1.9409282700421941, "eval_loss": 0.7585541009902954, "eval_runtime": 679.0866, "eval_samples_per_second": 3.103, "eval_steps_per_second": 3.103, "perplexity": 2.1351867231159773}
47
- {"ts": "2025-12-27T13:13:19", "event": "eval", "step": 4700, "epoch": 1.9831223628691983, "eval_loss": 0.7582268714904785, "eval_runtime": 690.081, "eval_samples_per_second": 3.053, "eval_steps_per_second": 3.053, "perplexity": 2.134488141337073}
48
- {"ts": "2025-12-27T13:37:21", "event": "eval", "step": 4800, "epoch": 2.0253164556962027, "eval_loss": 0.7633068561553955, "eval_runtime": 688.8684, "eval_samples_per_second": 3.059, "eval_steps_per_second": 3.059, "perplexity": 2.145358896619808}
49
- {"ts": "2025-12-27T14:01:30", "event": "eval", "step": 4900, "epoch": 2.067510548523207, "eval_loss": 0.7676681280136108, "eval_runtime": 676.0104, "eval_samples_per_second": 3.117, "eval_steps_per_second": 3.117, "perplexity": 2.1547358228005784}
50
- {"ts": "2025-12-27T14:25:57", "event": "eval", "step": 5000, "epoch": 2.109704641350211, "eval_loss": 0.7635271549224854, "eval_runtime": 669.5049, "eval_samples_per_second": 3.147, "eval_steps_per_second": 3.147, "perplexity": 2.145831568602315}
51
- {"ts": "2025-12-27T14:50:12", "event": "eval", "step": 5100, "epoch": 2.151898734177215, "eval_loss": 0.7654595971107483, "eval_runtime": 681.4966, "eval_samples_per_second": 3.092, "eval_steps_per_second": 3.092, "perplexity": 2.149982273261109}
52
- {"ts": "2025-12-27T15:01:27", "event": "eval", "step": 5100, "epoch": 2.151898734177215, "eval_loss": 0.7600579857826233, "eval_runtime": 674.048, "eval_samples_per_second": 3.126, "eval_steps_per_second": 3.126, "perplexity": 2.138400213711816}
 
1
+ {"ts": "2025-12-26T16:09:16", "event": "eval", "step": 25, "epoch": 0.029197080291970802, "eval_loss": 0.6836819648742676, "eval_runtime": 454.4375, "eval_samples_per_second": 1.677, "eval_steps_per_second": 1.677, "eval_rewards/chosen": 0.024636391550302505, "eval_rewards/rejected": 0.005080964416265488, "eval_rewards/accuracies": 0.665354311466217, "eval_rewards/margins": 0.019555427134037018, "eval_logps/chosen": -370.1607666015625, "eval_logps/rejected": -395.7251892089844, "eval_logits/chosen": 5.295141220092773, "eval_logits/rejected": 5.345211029052734}
2
+ {"ts": "2025-12-26T16:20:56", "event": "eval", "step": 50, "epoch": 0.058394160583941604, "eval_loss": 0.4610801041126251, "eval_runtime": 454.5598, "eval_samples_per_second": 1.676, "eval_steps_per_second": 1.676, "eval_rewards/chosen": 0.8944254517555237, "eval_rewards/rejected": 0.3205168545246124, "eval_rewards/accuracies": 0.9619422554969788, "eval_rewards/margins": 0.5739086270332336, "eval_logps/chosen": -361.462890625, "eval_logps/rejected": -392.5708312988281, "eval_logits/chosen": 5.22359037399292, "eval_logits/rejected": 5.286833763122559}
3
+ {"ts": "2025-12-26T16:32:39", "event": "eval", "step": 75, "epoch": 0.08759124087591241, "eval_loss": 0.16020436584949493, "eval_runtime": 454.3435, "eval_samples_per_second": 1.677, "eval_steps_per_second": 1.677, "eval_rewards/chosen": 1.1210675239562988, "eval_rewards/rejected": -0.9336051344871521, "eval_rewards/accuracies": 0.9960629940032959, "eval_rewards/margins": 2.0546727180480957, "eval_logps/chosen": -359.19647216796875, "eval_logps/rejected": -405.1120300292969, "eval_logits/chosen": 4.930174827575684, "eval_logits/rejected": 5.032296657562256}
4
+ {"ts": "2025-12-26T16:44:21", "event": "eval", "step": 100, "epoch": 0.11678832116788321, "eval_loss": 0.04428481683135033, "eval_runtime": 454.7251, "eval_samples_per_second": 1.676, "eval_steps_per_second": 1.676, "eval_rewards/chosen": 1.7248634099960327, "eval_rewards/rejected": -2.863647222518921, "eval_rewards/accuracies": 0.9921259880065918, "eval_rewards/margins": 4.588510513305664, "eval_logps/chosen": -353.15850830078125, "eval_logps/rejected": -424.4124755859375, "eval_logits/chosen": 4.285891056060791, "eval_logits/rejected": 4.425926208496094}
5
+ {"ts": "2025-12-26T16:56:05", "event": "eval", "step": 125, "epoch": 0.145985401459854, "eval_loss": 0.024107323959469795, "eval_runtime": 454.8045, "eval_samples_per_second": 1.675, "eval_steps_per_second": 1.675, "eval_rewards/chosen": 0.5319492816925049, "eval_rewards/rejected": -6.150709629058838, "eval_rewards/accuracies": 0.9934383034706116, "eval_rewards/margins": 6.682660102844238, "eval_logps/chosen": -365.087646484375, "eval_logps/rejected": -457.28314208984375, "eval_logits/chosen": 3.6694726943969727, "eval_logits/rejected": 3.8436598777770996}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logs/train.jsonl CHANGED
@@ -1,102 +1,78 @@
1
- {"ts": "2025-12-27T19:40:44", "event": "train_log", "step": 5, "epoch": 0.02002002002002002, "progress_pct": 1.0, "epoch_pct": 1.0, "eta": "02:23:23", "max_grad_norm": 1.0, "loss": 0.007, "grad_norm": 0.06052486598491669, "learning_rate": 8.000000000000001e-07, "grpo_mean_advantage": -1.3560057254835556e-07, "grpo_std_advantage": 3.0318567496578908e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5922331809997559}
2
- {"ts": "2025-12-27T19:46:55", "event": "train_log", "step": 5, "epoch": 0.02002002002002002, "progress_pct": 1.0, "epoch_pct": 1.0, "eta": "02:16:53", "max_grad_norm": 1.0, "loss": 0.007, "grad_norm": 0.05460292845964432, "learning_rate": 8.000000000000001e-07, "grpo_mean_advantage": -1.3560057254835556e-07, "grpo_std_advantage": 3.0318567496578908e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5922331809997559}
3
- {"ts": "2025-12-27T19:48:18", "event": "train_log", "step": 10, "epoch": 0.04004004004004004, "progress_pct": 2.0, "epoch_pct": 2.0, "eta": "02:15:24", "max_grad_norm": 1.0, "loss": 0.0107, "grad_norm": 0.0679207444190979, "learning_rate": 1.8000000000000001e-06, "grpo_mean_advantage": 3.6619603633880615e-06, "grpo_std_advantage": 1.6246918676188216e-05, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5561589002609253}
4
- {"ts": "2025-12-27T19:49:40", "event": "train_log", "step": 15, "epoch": 0.06006006006006006, "progress_pct": 3.0, "epoch_pct": 3.0, "eta": "02:13:25", "max_grad_norm": 1.0, "loss": 0.007, "grad_norm": 0.05788416787981987, "learning_rate": 2.8000000000000003e-06, "grpo_mean_advantage": -1.0654330395709621e-07, "grpo_std_advantage": 5.399440965447866e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5759152173995972}
5
- {"ts": "2025-12-27T19:51:01", "event": "train_log", "step": 20, "epoch": 0.08008008008008008, "progress_pct": 4.0, "epoch_pct": 4.0, "eta": "02:11:33", "max_grad_norm": 1.0, "loss": 0.0246, "grad_norm": 0.0746568813920021, "learning_rate": 3.8000000000000005e-06, "grpo_mean_advantage": -5.871057737749652e-07, "grpo_std_advantage": 2.6951597646984737e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5127314329147339}
6
- {"ts": "2025-12-27T19:52:23", "event": "train_log", "step": 25, "epoch": 0.1001001001001001, "progress_pct": 5.0, "epoch_pct": 5.01, "eta": "02:10:02", "max_grad_norm": 1.0, "loss": 0.0337, "grad_norm": 0.11442846059799194, "learning_rate": 4.800000000000001e-06, "grpo_mean_advantage": 6.370246410369873e-07, "grpo_std_advantage": 2.8908377771585947e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.539706826210022}
7
- {"ts": "2025-12-27T19:53:48", "event": "train_log", "step": 30, "epoch": 0.12012012012012012, "progress_pct": 6.0, "epoch_pct": 6.01, "eta": "02:09:24", "max_grad_norm": 1.0, "loss": 0.0171, "grad_norm": 0.05778791010379791, "learning_rate": 4.999125183044924e-06, "grpo_mean_advantage": 6.705522359595761e-09, "grpo_std_advantage": 6.189450800775376e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5812538862228394}
8
- {"ts": "2025-12-27T19:55:13", "event": "train_log", "step": 35, "epoch": 0.14014014014014015, "progress_pct": 7.0, "epoch_pct": 7.01, "eta": "02:08:36", "max_grad_norm": 1.0, "loss": 0.0145, "grad_norm": 0.05819695070385933, "learning_rate": 4.995572288443412e-06, "grpo_mean_advantage": 3.859400692363124e-07, "grpo_std_advantage": 1.6833292875162442e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5909844636917114}
9
- {"ts": "2025-12-27T19:56:38", "event": "train_log", "step": 40, "epoch": 0.16016016016016016, "progress_pct": 8.0, "epoch_pct": 8.01, "eta": "02:07:30", "max_grad_norm": 1.0, "loss": 0.0196, "grad_norm": 0.07968433201313019, "learning_rate": 4.98929052218411e-06, "grpo_mean_advantage": 2.600252742013254e-07, "grpo_std_advantage": 1.4095899132371414e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5630953907966614}
10
- {"ts": "2025-12-27T19:58:00", "event": "train_log", "step": 45, "epoch": 0.18018018018018017, "progress_pct": 9.0, "epoch_pct": 9.01, "eta": "02:06:04", "max_grad_norm": 1.0, "loss": 0.0186, "grad_norm": 0.0733402892947197, "learning_rate": 4.980286753286196e-06, "grpo_mean_advantage": -1.2591480924584175e-07, "grpo_std_advantage": 1.0309080380466185e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5604403614997864}
11
- {"ts": "2025-12-27T19:59:27", "event": "train_log", "step": 50, "epoch": 0.2002002002002002, "progress_pct": 10.0, "epoch_pct": 10.01, "eta": "02:05:12", "max_grad_norm": 1.0, "loss": 0.0286, "grad_norm": 0.07136482000350952, "learning_rate": 4.9685708272387645e-06, "grpo_mean_advantage": -2.808868941883702e-07, "grpo_std_advantage": 1.5696078889959608e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5971035957336426}
12
- {"ts": "2025-12-27T20:00:53", "event": "train_log", "step": 55, "epoch": 0.22022022022022023, "progress_pct": 11.0, "epoch_pct": 11.01, "eta": "02:04:07", "max_grad_norm": 1.0, "loss": 0.0054, "grad_norm": 0.08851475268602371, "learning_rate": 4.9541555552349404e-06, "grpo_mean_advantage": 2.6822089438383045e-08, "grpo_std_advantage": 3.7878271541558206e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5892971754074097}
13
- {"ts": "2025-12-27T20:02:17", "event": "train_log", "step": 60, "epoch": 0.24024024024024024, "progress_pct": 12.0, "epoch_pct": 12.01, "eta": "02:02:51", "max_grad_norm": 1.0, "loss": -0.0074, "grad_norm": 0.07778509706258774, "learning_rate": 4.9370567001630155e-06, "grpo_mean_advantage": -5.662441182607836e-08, "grpo_std_advantage": 6.128998393251095e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.564322292804718}
14
- {"ts": "2025-12-27T20:03:45", "event": "train_log", "step": 65, "epoch": 0.2602602602602603, "progress_pct": 13.0, "epoch_pct": 13.01, "eta": "02:01:55", "max_grad_norm": 1.0, "loss": 0.0145, "grad_norm": 0.08740051090717316, "learning_rate": 4.917292959369968e-06, "grpo_mean_advantage": -1.5944242193199898e-07, "grpo_std_advantage": 1.6374274309782777e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.562497615814209}
15
- {"ts": "2025-12-27T20:05:10", "event": "train_log", "step": 70, "epoch": 0.2802802802802803, "progress_pct": 14.0, "epoch_pct": 14.01, "eta": "02:00:32", "max_grad_norm": 1.0, "loss": 0.0257, "grad_norm": 0.19070060551166534, "learning_rate": 4.8948859442161876e-06, "grpo_mean_advantage": 1.6838312433264946e-07, "grpo_std_advantage": 8.536571272088622e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5904761552810669}
16
- {"ts": "2025-12-27T20:06:36", "event": "train_log", "step": 75, "epoch": 0.3003003003003003, "progress_pct": 15.0, "epoch_pct": 15.02, "eta": "01:59:23", "max_grad_norm": 1.0, "loss": 0.0024, "grad_norm": 0.07321271300315857, "learning_rate": 4.869860156443768e-06, "grpo_mean_advantage": 1.1175870895385742e-07, "grpo_std_advantage": 6.451961667153228e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5765624046325684}
17
- {"ts": "2025-12-27T20:08:03", "event": "train_log", "step": 80, "epoch": 0.3203203203203203, "progress_pct": 16.0, "epoch_pct": 16.02, "eta": "01:58:12", "max_grad_norm": 1.0, "loss": 0.0277, "grad_norm": 0.07126748561859131, "learning_rate": 4.842242961384211e-06, "grpo_mean_advantage": -1.4603138254187797e-07, "grpo_std_advantage": 1.1309343790344428e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5858271718025208}
18
- {"ts": "2025-12-27T20:09:31", "event": "train_log", "step": 85, "epoch": 0.34034034034034033, "progress_pct": 17.0, "epoch_pct": 17.02, "eta": "01:57:04", "max_grad_norm": 1.0, "loss": 0.0246, "grad_norm": 0.08629189431667328, "learning_rate": 4.812064558034847e-06, "grpo_mean_advantage": -1.817941665649414e-06, "grpo_std_advantage": 1.1141768482048064e-05, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5871662497520447}
19
- {"ts": "2025-12-27T20:10:55", "event": "train_log", "step": 90, "epoch": 0.36036036036036034, "progress_pct": 18.0, "epoch_pct": 18.02, "eta": "01:55:38", "max_grad_norm": 1.0, "loss": 0.0056, "grad_norm": 0.0998779758810997, "learning_rate": 4.779357946036662e-06, "grpo_mean_advantage": 1.8179416372277046e-07, "grpo_std_advantage": 6.210335072864837e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5330992937088013}
20
- {"ts": "2025-12-27T20:12:19", "event": "train_log", "step": 95, "epoch": 0.38038038038038036, "progress_pct": 19.0, "epoch_pct": 19.02, "eta": "01:54:09", "max_grad_norm": 1.0, "loss": 0.0053, "grad_norm": 0.10614689439535141, "learning_rate": 4.74415888958968e-06, "grpo_mean_advantage": -2.972781771859445e-07, "grpo_std_advantage": 3.1582342217006953e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5265295505523682}
21
- {"ts": "2025-12-27T20:13:46", "event": "train_log", "step": 100, "epoch": 0.4004004004004004, "progress_pct": 20.0, "epoch_pct": 20.02, "eta": "01:52:55", "max_grad_norm": 1.0, "loss": 0.0134, "grad_norm": 0.10345634073019028, "learning_rate": 4.706505878345343e-06, "grpo_mean_advantage": -7.033348197182931e-07, "grpo_std_advantage": 4.245831405569334e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5660771131515503}
22
- {"ts": "2025-12-27T20:15:15", "event": "train_log", "step": 105, "epoch": 0.42042042042042044, "progress_pct": 21.0, "epoch_pct": 21.02, "eta": "01:51:45", "max_grad_norm": 1.0, "loss": 0.0004, "grad_norm": 0.10077933222055435, "learning_rate": 4.666440085318626e-06, "grpo_mean_advantage": 1.1920928955078125e-07, "grpo_std_advantage": 3.2809634831210133e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.57631915807724}
23
- {"ts": "2025-12-27T20:16:39", "event": "train_log", "step": 110, "epoch": 0.44044044044044045, "progress_pct": 22.0, "epoch_pct": 22.02, "eta": "01:50:19", "max_grad_norm": 1.0, "loss": 0.0033, "grad_norm": 0.09548182785511017, "learning_rate": 4.624005321865968e-06, "grpo_mean_advantage": -4.0978193283081055e-07, "grpo_std_advantage": 6.0397578636184335e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.546563982963562}
24
- {"ts": "2025-12-27T20:18:01", "event": "train_log", "step": 115, "epoch": 0.46046046046046046, "progress_pct": 23.0, "epoch_pct": 23.02, "eta": "01:48:43", "max_grad_norm": 1.0, "loss": 0.0095, "grad_norm": 0.09417816251516342, "learning_rate": 4.57924798977818e-06, "grpo_mean_advantage": -1.467764434437413e-07, "grpo_std_advantage": 2.2689375782647403e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5519219636917114}
25
- {"ts": "2025-12-27T20:19:25", "event": "train_log", "step": 120, "epoch": 0.4804804804804805, "progress_pct": 24.0, "epoch_pct": 24.02, "eta": "01:47:18", "max_grad_norm": 1.0, "loss": 0.0006, "grad_norm": 0.10022275149822235, "learning_rate": 4.532217030540781e-06, "grpo_mean_advantage": -5.215406329028838e-09, "grpo_std_advantage": 7.929010621410271e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5490407943725586}
26
- {"ts": "2025-12-27T20:20:48", "event": "train_log", "step": 125, "epoch": 0.5005005005005005, "progress_pct": 25.0, "epoch_pct": 25.03, "eta": "01:45:46", "max_grad_norm": 1.0, "loss": -0.0046, "grad_norm": 0.14057794213294983, "learning_rate": 4.482963871817195e-06, "grpo_mean_advantage": -5.7369469175228005e-08, "grpo_std_advantage": 1.2823379620385822e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5646580457687378}
27
- {"ts": "2025-12-27T20:22:09", "event": "train_log", "step": 130, "epoch": 0.5205205205205206, "progress_pct": 26.0, "epoch_pct": 26.03, "eta": "01:44:12", "max_grad_norm": 1.0, "loss": -0.003, "grad_norm": 0.12420658767223358, "learning_rate": 4.4315423712133595e-06, "grpo_mean_advantage": 2.9876827056796174e-07, "grpo_std_advantage": 1.0496698905626545e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.6111599802970886}
28
- {"ts": "2025-12-27T20:23:32", "event": "train_log", "step": 135, "epoch": 0.5405405405405406, "progress_pct": 27.0, "epoch_pct": 27.03, "eta": "01:42:43", "max_grad_norm": 1.0, "loss": 0.0154, "grad_norm": 0.14342808723449707, "learning_rate": 4.378008757385222e-06, "grpo_mean_advantage": 1.5869736103013565e-07, "grpo_std_advantage": 1.2748531617035042e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5619662404060364}
29
- {"ts": "2025-12-27T20:24:57", "event": "train_log", "step": 140, "epoch": 0.5605605605605606, "progress_pct": 28.0, "epoch_pct": 28.03, "eta": "01:41:21", "max_grad_norm": 1.0, "loss": -0.0262, "grad_norm": 0.14729444682598114, "learning_rate": 4.322421568553529e-06, "grpo_mean_advantage": 3.0100346748440643e-07, "grpo_std_advantage": 2.4499684059264837e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5795454978942871}
30
- {"ts": "2025-12-27T20:26:19", "event": "train_log", "step": 145, "epoch": 0.5805805805805806, "progress_pct": 29.0, "epoch_pct": 29.03, "eta": "01:39:51", "max_grad_norm": 1.0, "loss": 0.0018, "grad_norm": 0.15249410271644592, "learning_rate": 4.2648415884931476e-06, "grpo_mean_advantage": -3.233552092751779e-07, "grpo_std_advantage": 1.248456669600273e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5804953575134277}
31
- {"ts": "2025-12-27T20:27:43", "event": "train_log", "step": 150, "epoch": 0.6006006006006006, "progress_pct": 30.0, "epoch_pct": 30.03, "eta": "01:38:25", "max_grad_norm": 1.0, "loss": -0.017, "grad_norm": 0.1841023564338684, "learning_rate": 4.205331780066892e-06, "grpo_mean_advantage": 3.2261013416245987e-07, "grpo_std_advantage": 1.4773489738217904e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5628539323806763}
32
- {"ts": "2025-12-27T20:29:07", "event": "train_log", "step": 155, "epoch": 0.6206206206206206, "progress_pct": 31.0, "epoch_pct": 31.03, "eta": "01:37:00", "max_grad_norm": 1.0, "loss": 0.0044, "grad_norm": 0.18597163259983063, "learning_rate": 4.1439572163765615e-06, "grpo_mean_advantage": -2.5331974029541016e-07, "grpo_std_advantage": 1.5092309695319273e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5727725625038147}
33
- {"ts": "2025-12-27T20:30:30", "event": "train_log", "step": 160, "epoch": 0.6406406406406406, "progress_pct": 32.0, "epoch_pct": 32.03, "eta": "01:35:33", "max_grad_norm": 1.0, "loss": -0.005, "grad_norm": 0.18310388922691345, "learning_rate": 4.0807850096064605e-06, "grpo_mean_advantage": -6.780028627417778e-08, "grpo_std_advantage": 8.550978805033083e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5833909511566162}
34
- {"ts": "2025-12-27T20:31:51", "event": "train_log", "step": 165, "epoch": 0.6606606606606606, "progress_pct": 33.0, "epoch_pct": 33.03, "eta": "01:34:01", "max_grad_norm": 1.0, "loss": -0.015, "grad_norm": 0.2192923128604889, "learning_rate": 4.015884237637206e-06, "grpo_mean_advantage": -5.587935447692871e-08, "grpo_std_advantage": 3.564579174053506e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5742615461349487}
35
- {"ts": "2025-12-27T20:33:17", "event": "train_log", "step": 170, "epoch": 0.6806806806806807, "progress_pct": 34.0, "epoch_pct": 34.03, "eta": "01:32:41", "max_grad_norm": 1.0, "loss": -0.0314, "grad_norm": 0.16708803176879883, "learning_rate": 3.949325868510083e-06, "grpo_mean_advantage": -5.327165126800537e-07, "grpo_std_advantage": 2.309018327650847e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5758188962936401}
36
- {"ts": "2025-12-27T20:34:40", "event": "train_log", "step": 175, "epoch": 0.7007007007007007, "progress_pct": 35.0, "epoch_pct": 35.04, "eta": "01:31:14", "max_grad_norm": 1.0, "loss": -0.0441, "grad_norm": 0.3401262164115906, "learning_rate": 3.881182682824534e-06, "grpo_mean_advantage": 5.863606702405377e-07, "grpo_std_advantage": 2.4449204829579685e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5767683982849121}
37
- {"ts": "2025-12-27T20:36:09", "event": "train_log", "step": 180, "epoch": 0.7207207207207207, "progress_pct": 36.0, "epoch_pct": 36.04, "eta": "01:29:58", "max_grad_norm": 1.0, "loss": -0.0162, "grad_norm": 0.1931898146867752, "learning_rate": 3.811529194153635e-06, "grpo_mean_advantage": 3.2186508747145126e-07, "grpo_std_advantage": 2.293551688126172e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.586772084236145}
38
- {"ts": "2025-12-27T20:37:33", "event": "train_log", "step": 185, "epoch": 0.7407407407407407, "progress_pct": 37.0, "epoch_pct": 37.04, "eta": "01:28:33", "max_grad_norm": 1.0, "loss": -0.0386, "grad_norm": 0.2537969648838043, "learning_rate": 3.7404415675646054e-06, "grpo_mean_advantage": -4.470348358154297e-08, "grpo_std_advantage": 3.7067667335577426e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.549396276473999}
39
- {"ts": "2025-12-27T20:38:56", "event": "train_log", "step": 190, "epoch": 0.7607607607607607, "progress_pct": 38.0, "epoch_pct": 38.04, "eta": "01:27:06", "max_grad_norm": 1.0, "loss": -0.037, "grad_norm": 0.20326584577560425, "learning_rate": 3.667997536333424e-06, "grpo_mean_advantage": -2.1010637851759384e-07, "grpo_std_advantage": 1.1695076409523608e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5798425078392029}
40
- {"ts": "2025-12-27T20:40:21", "event": "train_log", "step": 195, "epoch": 0.7807807807807807, "progress_pct": 39.0, "epoch_pct": 39.04, "eta": "01:25:43", "max_grad_norm": 1.0, "loss": -0.0292, "grad_norm": 0.25048357248306274, "learning_rate": 3.59427631694463e-06, "grpo_mean_advantage": 1.765787658314366e-07, "grpo_std_advantage": 2.429934738756856e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5584167838096619}
41
- {"ts": "2025-12-27T20:41:46", "event": "train_log", "step": 200, "epoch": 0.8008008008008008, "progress_pct": 40.0, "epoch_pct": 40.04, "eta": "01:24:20", "max_grad_norm": 1.0, "loss": -0.0454, "grad_norm": 0.2687569260597229, "learning_rate": 3.5193585224692595e-06, "grpo_mean_advantage": 1.6540289493605087e-07, "grpo_std_advantage": 2.6342788714828203e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5676193237304688}
42
- {"ts": "2025-12-27T20:43:13", "event": "train_log", "step": 205, "epoch": 0.8208208208208209, "progress_pct": 41.0, "epoch_pct": 41.04, "eta": "01:22:59", "max_grad_norm": 1.0, "loss": -0.0423, "grad_norm": 0.22301620244979858, "learning_rate": 3.44332607441564e-06, "grpo_mean_advantage": -1.0944902442133753e-06, "grpo_std_advantage": 5.346942998585291e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5669739842414856}
43
- {"ts": "2025-12-27T20:44:38", "event": "train_log", "step": 210, "epoch": 0.8408408408408409, "progress_pct": 42.0, "epoch_pct": 42.04, "eta": "01:21:37", "max_grad_norm": 1.0, "loss": -0.0857, "grad_norm": 0.3040211498737335, "learning_rate": 3.3662621131494204e-06, "grpo_mean_advantage": 2.4065374759629776e-07, "grpo_std_advantage": 1.6327536513927043e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5922158360481262}
44
- {"ts": "2025-12-27T20:46:02", "event": "train_log", "step": 215, "epoch": 0.8608608608608609, "progress_pct": 43.0, "epoch_pct": 43.04, "eta": "01:20:11", "max_grad_norm": 1.0, "loss": -0.0278, "grad_norm": 0.27231141924858093, "learning_rate": 3.2882509069808044e-06, "grpo_mean_advantage": -5.21540641784668e-08, "grpo_std_advantage": 5.847922466273303e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5473950505256653}
45
- {"ts": "2025-12-27T20:47:28", "event": "train_log", "step": 220, "epoch": 0.8808808808808809, "progress_pct": 44.0, "epoch_pct": 44.04, "eta": "01:18:49", "max_grad_norm": 1.0, "loss": -0.0727, "grad_norm": 0.3571636378765106, "learning_rate": 3.2093777600183873e-06, "grpo_mean_advantage": 6.541609991472797e-07, "grpo_std_advantage": 4.072162937518442e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5880032777786255}
46
- {"ts": "2025-12-27T20:48:53", "event": "train_log", "step": 225, "epoch": 0.9009009009009009, "progress_pct": 45.0, "epoch_pct": 45.05, "eta": "01:17:25", "max_grad_norm": 1.0, "loss": -0.0464, "grad_norm": 0.306273490190506, "learning_rate": 3.1297289188903705e-06, "grpo_mean_advantage": -1.2218951894737984e-07, "grpo_std_advantage": 4.386006935419573e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5835092663764954}
47
- {"ts": "2025-12-27T20:50:20", "event": "train_log", "step": 230, "epoch": 0.9209209209209209, "progress_pct": 46.0, "epoch_pct": 46.05, "eta": "01:16:03", "max_grad_norm": 1.0, "loss": -0.0295, "grad_norm": 0.2700377106666565, "learning_rate": 3.049391478435133e-06, "grpo_mean_advantage": 1.7605722177904681e-06, "grpo_std_advantage": 8.007580618141219e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5394966006278992}
48
- {"ts": "2025-12-27T20:51:41", "event": "train_log", "step": 235, "epoch": 0.9409409409409409, "progress_pct": 47.0, "epoch_pct": 47.05, "eta": "01:14:35", "max_grad_norm": 1.0, "loss": -0.031, "grad_norm": 0.39531761407852173, "learning_rate": 2.9684532864643123e-06, "grpo_mean_advantage": -3.3080578987210174e-07, "grpo_std_advantage": 1.551636614749441e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5687432289123535}
49
- {"ts": "2025-12-27T20:53:06", "event": "train_log", "step": 240, "epoch": 0.960960960960961, "progress_pct": 48.0, "epoch_pct": 48.05, "eta": "01:13:11", "max_grad_norm": 1.0, "loss": -0.0789, "grad_norm": 0.5987040996551514, "learning_rate": 2.887002847702504e-06, "grpo_mean_advantage": 2.712011450967111e-07, "grpo_std_advantage": 1.4400844747797237e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5550583600997925}
50
- {"ts": "2025-12-27T20:54:27", "event": "train_log", "step": 245, "epoch": 0.980980980980981, "progress_pct": 49.0, "epoch_pct": 49.05, "eta": "01:11:42", "max_grad_norm": 1.0, "loss": -0.1131, "grad_norm": 0.5680716037750244, "learning_rate": 2.8051292270086506e-06, "grpo_mean_advantage": -3.2857059295565705e-07, "grpo_std_advantage": 2.105091425619321e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.558111310005188}
51
- {"ts": "2025-12-27T20:55:45", "event": "train_log", "step": 250, "epoch": 1.0, "progress_pct": 50.0, "epoch_pct": 50.0, "eta": "01:10:13", "max_grad_norm": 1.0, "loss": -0.2232, "grad_norm": 0.6204046010971069, "learning_rate": 2.722921951984927e-06, "grpo_mean_advantage": 4.470348358154297e-08, "grpo_std_advantage": 5.315724820320611e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.6196198463439941}
52
- {"ts": "2025-12-27T20:57:06", "event": "train_log", "step": 255, "epoch": 1.02002002002002, "progress_pct": 51.0, "epoch_pct": 51.0, "eta": "01:08:45", "max_grad_norm": 1.0, "loss": -0.1363, "grad_norm": 0.8389026522636414, "learning_rate": 2.640470915079614e-06, "grpo_mean_advantage": 9.290873776990338e-07, "grpo_std_advantage": 4.219644324621186e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.582168459892273}
53
- {"ts": "2025-12-27T20:58:26", "event": "train_log", "step": 260, "epoch": 1.04004004004004, "progress_pct": 52.0, "epoch_pct": 52.0, "eta": "01:07:17", "max_grad_norm": 1.0, "loss": -0.1868, "grad_norm": 0.9067686796188354, "learning_rate": 2.557866275291035e-06, "grpo_mean_advantage": 2.533197474008375e-08, "grpo_std_advantage": 1.6600588992332632e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5551307797431946}
54
- {"ts": "2025-12-27T20:59:48", "event": "train_log", "step": 265, "epoch": 1.06006006006006, "progress_pct": 53.0, "epoch_pct": 53.0, "eta": "01:05:51", "max_grad_norm": 1.0, "loss": -0.1792, "grad_norm": 0.9277902841567993, "learning_rate": 2.4751983595800093e-06, "grpo_mean_advantage": -5.662441182607836e-08, "grpo_std_advantage": 1.0909400316450046e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.535040020942688}
55
- {"ts": "2025-12-27T21:01:15", "event": "train_log", "step": 270, "epoch": 1.08008008008008, "progress_pct": 54.0, "epoch_pct": 54.0, "eta": "01:04:29", "max_grad_norm": 1.0, "loss": -0.1691, "grad_norm": 1.0715463161468506, "learning_rate": 2.392557564098649e-06, "grpo_mean_advantage": -9.536743306171047e-08, "grpo_std_advantage": 5.838213610331877e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5673571825027466}
56
- {"ts": "2025-12-27T21:02:37", "event": "train_log", "step": 275, "epoch": 1.1001001001001, "progress_pct": 55.0, "epoch_pct": 55.01, "eta": "01:03:03", "max_grad_norm": 1.0, "loss": -0.1655, "grad_norm": 0.7759184837341309, "learning_rate": 2.3100342553434924e-06, "grpo_mean_advantage": 3.278255533700758e-08, "grpo_std_advantage": 9.317170679423725e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5874732732772827}
57
- {"ts": "2025-12-27T21:03:59", "event": "train_log", "step": 280, "epoch": 1.12012012012012, "progress_pct": 56.0, "epoch_pct": 56.01, "eta": "01:01:38", "max_grad_norm": 1.0, "loss": -0.1821, "grad_norm": 0.9387398958206177, "learning_rate": 2.2277186713410688e-06, "grpo_mean_advantage": -1.206994113545079e-07, "grpo_std_advantage": 6.201085511747806e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5569106340408325}
58
- {"ts": "2025-12-27T21:05:19", "event": "train_log", "step": 285, "epoch": 1.14014014014014, "progress_pct": 57.0, "epoch_pct": 57.01, "eta": "01:00:10", "max_grad_norm": 1.0, "loss": -0.2102, "grad_norm": 1.6132302284240723, "learning_rate": 2.1457008229739395e-06, "grpo_mean_advantage": 4.470348358154297e-08, "grpo_std_advantage": 6.115651558502577e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5578873157501221}
59
- {"ts": "2025-12-27T21:06:43", "event": "train_log", "step": 290, "epoch": 1.16016016016016, "progress_pct": 58.0, "epoch_pct": 58.01, "eta": "00:58:47", "max_grad_norm": 1.0, "loss": -0.2937, "grad_norm": 0.8679026961326599, "learning_rate": 2.0640703955551214e-06, "grpo_mean_advantage": -3.3453108017056365e-07, "grpo_std_advantage": 3.5326345368957845e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5735999345779419}
60
- {"ts": "2025-12-27T21:08:10", "event": "train_log", "step": 295, "epoch": 1.1801801801801801, "progress_pct": 59.0, "epoch_pct": 59.01, "eta": "00:57:24", "max_grad_norm": 1.0, "loss": -0.2598, "grad_norm": 1.0550166368484497, "learning_rate": 1.9829166507585084e-06, "grpo_mean_advantage": -1.110136480519941e-07, "grpo_std_advantage": 4.731904823529476e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5626259446144104}
61
- {"ts": "2025-12-27T21:09:34", "event": "train_log", "step": 300, "epoch": 1.2002002002002001, "progress_pct": 60.0, "epoch_pct": 60.01, "eta": "00:56:00", "max_grad_norm": 1.0, "loss": -0.2546, "grad_norm": 1.2819372415542603, "learning_rate": 1.90232832901255e-06, "grpo_mean_advantage": -5.08874677507265e-07, "grpo_std_advantage": 1.840126174101897e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5463050603866577}
62
- {"ts": "2025-12-27T21:10:59", "event": "train_log", "step": 305, "epoch": 1.2202202202202201, "progress_pct": 61.0, "epoch_pct": 61.01, "eta": "00:54:37", "max_grad_norm": 1.0, "loss": -0.1809, "grad_norm": 1.0188143253326416, "learning_rate": 1.82239355246389e-06, "grpo_mean_advantage": 1.01327898960335e-07, "grpo_std_advantage": 7.798533943059738e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5352144241333008}
63
- {"ts": "2025-12-27T21:12:25", "event": "train_log", "step": 310, "epoch": 1.2402402402402402, "progress_pct": 62.0, "epoch_pct": 62.01, "eta": "00:53:14", "max_grad_norm": 1.0, "loss": -0.3559, "grad_norm": 2.0709052085876465, "learning_rate": 1.7431997286170923e-06, "grpo_mean_advantage": 1.341104507446289e-07, "grpo_std_advantage": 7.821902840987605e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5547868013381958}
64
- {"ts": "2025-12-27T21:13:52", "event": "train_log", "step": 315, "epoch": 1.2602602602602602, "progress_pct": 63.0, "epoch_pct": 63.01, "eta": "00:51:52", "max_grad_norm": 1.0, "loss": -0.3874, "grad_norm": 1.8516215085983276, "learning_rate": 1.6648334547558227e-06, "grpo_mean_advantage": 9.015202806494926e-08, "grpo_std_advantage": 1.0693488547985908e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5859472751617432}
65
- {"ts": "2025-12-27T21:15:21", "event": "train_log", "step": 320, "epoch": 1.2802802802802802, "progress_pct": 64.0, "epoch_pct": 64.01, "eta": "00:50:31", "max_grad_norm": 1.0, "loss": -0.3467, "grad_norm": 1.283104419708252, "learning_rate": 1.5873804232499862e-06, "grpo_mean_advantage": -2.443790378947597e-07, "grpo_std_advantage": 1.183122208203713e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5751550793647766}
66
- {"ts": "2025-12-27T21:16:47", "event": "train_log", "step": 325, "epoch": 1.3003003003003002, "progress_pct": 65.0, "epoch_pct": 65.02, "eta": "00:49:07", "max_grad_norm": 1.0, "loss": -0.1703, "grad_norm": 1.4108576774597168, "learning_rate": 1.51092532785238e-06, "grpo_mean_advantage": -6.705522537231445e-08, "grpo_std_advantage": 6.109748937888071e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5497723817825317}
67
- {"ts": "2025-12-27T21:18:13", "event": "train_log", "step": 330, "epoch": 1.3203203203203202, "progress_pct": 66.0, "epoch_pct": 66.02, "eta": "00:47:44", "max_grad_norm": 1.0, "loss": -0.2918, "grad_norm": 1.0421361923217773, "learning_rate": 1.4355517710873184e-06, "grpo_mean_advantage": -1.639127766850379e-08, "grpo_std_advantage": 5.529495297196263e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.55989670753479}
68
- {"ts": "2025-12-27T21:19:37", "event": "train_log", "step": 335, "epoch": 1.3403403403403402, "progress_pct": 67.0, "epoch_pct": 67.02, "eta": "00:46:20", "max_grad_norm": 1.0, "loss": -0.3069, "grad_norm": 1.3465828895568848, "learning_rate": 1.361342172832502e-06, "grpo_mean_advantage": 4.418194237132411e-07, "grpo_std_advantage": 2.9275292945385445e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5809233784675598}
69
- {"ts": "2025-12-27T21:20:59", "event": "train_log", "step": 340, "epoch": 1.3603603603603602, "progress_pct": 68.0, "epoch_pct": 68.02, "eta": "00:44:55", "max_grad_norm": 1.0, "loss": -0.5594, "grad_norm": 1.1959459781646729, "learning_rate": 1.2883776801940884e-06, "grpo_mean_advantage": 9.685754776000977e-08, "grpo_std_advantage": 3.754235251562932e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5568087100982666}
70
- {"ts": "2025-12-27T21:22:27", "event": "train_log", "step": 345, "epoch": 1.3803803803803802, "progress_pct": 69.0, "epoch_pct": 69.02, "eta": "00:43:32", "max_grad_norm": 1.0, "loss": -0.4102, "grad_norm": 1.8967422246932983, "learning_rate": 1.216738078773522e-06, "grpo_mean_advantage": -2.384185791015625e-07, "grpo_std_advantage": 6.821086913078034e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5655568838119507}
71
- {"ts": "2025-12-27T21:23:49", "event": "train_log", "step": 350, "epoch": 1.4004004004004005, "progress_pct": 70.0, "epoch_pct": 70.02, "eta": "00:42:06", "max_grad_norm": 1.0, "loss": -0.338, "grad_norm": 2.221132755279541, "learning_rate": 1.146501705423155e-06, "grpo_mean_advantage": -8.717179156292332e-08, "grpo_std_advantage": 2.500940354366321e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.6089578866958618}
72
- {"ts": "2025-12-27T21:25:13", "event": "train_log", "step": 355, "epoch": 1.4204204204204205, "progress_pct": 71.0, "epoch_pct": 71.02, "eta": "00:40:42", "max_grad_norm": 1.0, "loss": -0.4985, "grad_norm": 2.3640377521514893, "learning_rate": 1.0777453625860474e-06, "grpo_mean_advantage": 2.1606683731079102e-07, "grpo_std_advantage": 1.4568390724889468e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.6129671335220337}
73
- {"ts": "2025-12-27T21:26:36", "event": "train_log", "step": 360, "epoch": 1.4404404404404405, "progress_pct": 72.0, "epoch_pct": 72.02, "eta": "00:39:18", "max_grad_norm": 1.0, "loss": -0.4347, "grad_norm": 1.9084734916687012, "learning_rate": 1.0105442343136184e-06, "grpo_mean_advantage": -3.725290298461914e-09, "grpo_std_advantage": 2.965894054796081e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5562310814857483}
74
- {"ts": "2025-12-27T21:27:58", "event": "train_log", "step": 365, "epoch": 1.4604604604604605, "progress_pct": 73.0, "epoch_pct": 73.02, "eta": "00:37:52", "max_grad_norm": 1.0, "loss": -0.6217, "grad_norm": 1.6063904762268066, "learning_rate": 9.449718040529987e-07, "grpo_mean_advantage": 4.313886279305734e-07, "grpo_std_advantage": 1.9621948013082147e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5884170532226562}
75
- {"ts": "2025-12-27T21:29:21", "event": "train_log", "step": 370, "epoch": 1.4804804804804805, "progress_pct": 74.0, "epoch_pct": 74.02, "eta": "00:36:28", "max_grad_norm": 1.0, "loss": -0.5364, "grad_norm": 2.114664077758789, "learning_rate": 8.810997742939531e-07, "grpo_mean_advantage": 2.0489096641540527e-07, "grpo_std_advantage": 1.0235522722723545e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5795440673828125}
76
- {"ts": "2025-12-27T21:30:45", "event": "train_log", "step": 375, "epoch": 1.5005005005005005, "progress_pct": 75.0, "epoch_pct": 75.03, "eta": "00:35:04", "max_grad_norm": 1.0, "loss": -0.4798, "grad_norm": 1.8450465202331543, "learning_rate": 8.189979881632634e-07, "grpo_mean_advantage": -1.4185905001795618e-06, "grpo_std_advantage": 1.0947338523692451e-05, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5607603788375854}
77
- {"ts": "2025-12-27T21:32:13", "event": "train_log", "step": 380, "epoch": 1.5205205205205206, "progress_pct": 76.0, "epoch_pct": 76.03, "eta": "00:33:41", "max_grad_norm": 1.0, "loss": -0.4805, "grad_norm": 2.673438787460327, "learning_rate": 7.587343530522945e-07, "grpo_mean_advantage": -1.758337049295733e-07, "grpo_std_advantage": 9.663675655247062e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5381432771682739}
78
- {"ts": "2025-12-27T21:33:36", "event": "train_log", "step": 385, "epoch": 1.5405405405405406, "progress_pct": 77.0, "epoch_pct": 77.03, "eta": "00:32:16", "max_grad_norm": 1.0, "loss": -0.433, "grad_norm": 2.2263550758361816, "learning_rate": 7.003747663612581e-07, "grpo_mean_advantage": -6.973743325033865e-07, "grpo_std_advantage": 4.341973180999048e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5528443455696106}
79
- {"ts": "2025-12-27T21:34:59", "event": "train_log", "step": 390, "epoch": 1.5605605605605606, "progress_pct": 78.0, "epoch_pct": 78.03, "eta": "00:30:52", "max_grad_norm": 1.0, "loss": -0.6021, "grad_norm": 2.3657093048095703, "learning_rate": 6.439830434413754e-07, "grpo_mean_advantage": 1.7881394143159923e-08, "grpo_std_advantage": 1.3004198251564958e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.6091476678848267}
80
- {"ts": "2025-12-27T21:36:21", "event": "train_log", "step": 395, "epoch": 1.5805805805805806, "progress_pct": 79.0, "epoch_pct": 79.03, "eta": "00:29:27", "max_grad_norm": 1.0, "loss": -0.5595, "grad_norm": 1.9847129583358765, "learning_rate": 5.896208478137222e-07, "grpo_mean_advantage": 3.4868716625169327e-07, "grpo_std_advantage": 2.059372718576924e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5397372245788574}
81
- {"ts": "2025-12-27T21:37:48", "event": "train_log", "step": 400, "epoch": 1.6006006006006006, "progress_pct": 80.0, "epoch_pct": 80.03, "eta": "00:28:03", "max_grad_norm": 1.0, "loss": -0.5592, "grad_norm": 2.922114133834839, "learning_rate": 5.373476237410808e-07, "grpo_mean_advantage": -2.1636485598719446e-06, "grpo_std_advantage": 9.725940799398813e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5873125195503235}
82
- {"ts": "2025-12-27T21:39:11", "event": "train_log", "step": 405, "epoch": 1.6206206206206206, "progress_pct": 81.0, "epoch_pct": 81.03, "eta": "00:26:39", "max_grad_norm": 1.0, "loss": -0.5623, "grad_norm": 1.8524045944213867, "learning_rate": 4.872205312265074e-07, "grpo_mean_advantage": -5.960464477539063e-08, "grpo_std_advantage": 3.460792754594877e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5601426362991333}
83
- {"ts": "2025-12-27T21:40:38", "event": "train_log", "step": 410, "epoch": 1.6406406406406406, "progress_pct": 82.0, "epoch_pct": 82.03, "eta": "00:25:15", "max_grad_norm": 1.0, "loss": -0.5943, "grad_norm": 1.7269790172576904, "learning_rate": 4.3929438350970687e-07, "grpo_mean_advantage": 2.6226044269606064e-07, "grpo_std_advantage": 7.928817922220333e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.578656792640686}
84
- {"ts": "2025-12-27T21:42:02", "event": "train_log", "step": 415, "epoch": 1.6606606606606606, "progress_pct": 83.0, "epoch_pct": 83.03, "eta": "00:23:51", "max_grad_norm": 1.0, "loss": -0.6193, "grad_norm": 2.26530122756958, "learning_rate": 3.936215871295634e-07, "grpo_mean_advantage": 2.3558736756967846e-06, "grpo_std_advantage": 1.4469559573626611e-05, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5885810852050781}
85
- {"ts": "2025-12-27T21:43:27", "event": "train_log", "step": 420, "epoch": 1.6806806806806807, "progress_pct": 84.0, "epoch_pct": 84.03, "eta": "00:22:27", "max_grad_norm": 1.0, "loss": -0.6934, "grad_norm": 2.6794464588165283, "learning_rate": 3.502520846183577e-07, "grpo_mean_advantage": 1.639127766850379e-08, "grpo_std_advantage": 9.352411325380672e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5805023312568665}
86
- {"ts": "2025-12-27T21:44:52", "event": "train_log", "step": 425, "epoch": 1.7007007007007007, "progress_pct": 85.0, "epoch_pct": 85.04, "eta": "00:21:03", "max_grad_norm": 1.0, "loss": -0.5126, "grad_norm": 2.100447654724121, "learning_rate": 3.092332998903416e-07, "grpo_mean_advantage": 3.2387674764322583e-06, "grpo_std_advantage": 1.999079904635437e-05, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5655918121337891}
87
- {"ts": "2025-12-27T21:46:17", "event": "train_log", "step": 430, "epoch": 1.7207207207207207, "progress_pct": 86.0, "epoch_pct": 86.04, "eta": "00:19:39", "max_grad_norm": 1.0, "loss": -0.5446, "grad_norm": 2.1027915477752686, "learning_rate": 2.706100863843822e-07, "grpo_mean_advantage": 3.5464762504489045e-07, "grpo_std_advantage": 1.7663603557593888e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5474504232406616}
88
- {"ts": "2025-12-27T21:47:43", "event": "train_log", "step": 435, "epoch": 1.7407407407407407, "progress_pct": 87.0, "epoch_pct": 87.04, "eta": "00:18:15", "max_grad_norm": 1.0, "loss": -0.5125, "grad_norm": 2.289045572280884, "learning_rate": 2.3442467801738867e-07, "grpo_mean_advantage": 3.6135315895080566e-07, "grpo_std_advantage": 2.356920958845876e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5874254703521729}
89
- {"ts": "2025-12-27T21:49:08", "event": "train_log", "step": 440, "epoch": 1.7607607607607607, "progress_pct": 88.0, "epoch_pct": 88.04, "eta": "00:16:51", "max_grad_norm": 1.0, "loss": -0.595, "grad_norm": 2.278038501739502, "learning_rate": 2.007166430021415e-07, "grpo_mean_advantage": 2.7567148563889532e-08, "grpo_std_advantage": 9.97340521280421e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5815118551254272}
90
- {"ts": "2025-12-27T21:50:35", "event": "train_log", "step": 445, "epoch": 1.7807807807807807, "progress_pct": 89.0, "epoch_pct": 89.04, "eta": "00:15:27", "max_grad_norm": 1.0, "loss": -0.8055, "grad_norm": 2.340942621231079, "learning_rate": 1.6952284058003366e-07, "grpo_mean_advantage": -8.34465012644614e-08, "grpo_std_advantage": 5.558832185670326e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5611211061477661}
91
- {"ts": "2025-12-27T21:52:06", "event": "train_log", "step": 450, "epoch": 1.800800800800801, "progress_pct": 90.0, "epoch_pct": 90.04, "eta": "00:14:03", "max_grad_norm": 1.0, "loss": -0.8561, "grad_norm": 2.4256298542022705, "learning_rate": 1.4087738071603075e-07, "grpo_mean_advantage": -1.9818544672034477e-07, "grpo_std_advantage": 6.800727305744658e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.590424656867981}
92
- {"ts": "2025-12-27T21:53:32", "event": "train_log", "step": 455, "epoch": 1.820820820820821, "progress_pct": 91.0, "epoch_pct": 91.04, "eta": "00:12:39", "max_grad_norm": 1.0, "loss": -0.429, "grad_norm": 1.6453255414962769, "learning_rate": 1.1481158679992554e-07, "grpo_mean_advantage": -1.9371508841459217e-08, "grpo_std_advantage": 3.142378943721269e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5670351982116699}
93
- {"ts": "2025-12-27T21:55:00", "event": "train_log", "step": 460, "epoch": 1.840840840840841, "progress_pct": 92.0, "epoch_pct": 92.04, "eta": "00:11:15", "max_grad_norm": 1.0, "loss": -0.642, "grad_norm": 2.3458049297332764, "learning_rate": 9.135396139467151e-08, "grpo_mean_advantage": 2.3692845729783585e-07, "grpo_std_advantage": 1.682946731307311e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5640432834625244}
94
- {"ts": "2025-12-27T21:56:27", "event": "train_log", "step": 465, "epoch": 1.860860860860861, "progress_pct": 93.0, "epoch_pct": 93.04, "eta": "00:09:51", "max_grad_norm": 1.0, "loss": -0.583, "grad_norm": 2.730945110321045, "learning_rate": 7.053015506924749e-08, "grpo_mean_advantage": 1.110136480519941e-07, "grpo_std_advantage": 8.930008448260196e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5584251284599304}
95
- {"ts": "2025-12-27T21:57:50", "event": "train_log", "step": 470, "epoch": 1.880880880880881, "progress_pct": 94.0, "epoch_pct": 94.04, "eta": "00:08:26", "max_grad_norm": 1.0, "loss": -0.5197, "grad_norm": 2.1463465690612793, "learning_rate": 5.236293835013839e-08, "grpo_mean_advantage": 2.5406478698641877e-07, "grpo_std_advantage": 9.93092498902115e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5392154455184937}
96
- {"ts": "2025-12-27T21:59:15", "event": "train_log", "step": 475, "epoch": 1.900900900900901, "progress_pct": 95.0, "epoch_pct": 95.05, "eta": "00:07:02", "max_grad_norm": 1.0, "loss": -0.5864, "grad_norm": 2.427900791168213, "learning_rate": 3.687217682209837e-08, "grpo_mean_advantage": -8.940697071579962e-09, "grpo_std_advantage": 5.835169645251881e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5686308741569519}
97
- {"ts": "2025-12-27T22:00:36", "event": "train_log", "step": 480, "epoch": 1.920920920920921, "progress_pct": 96.0, "epoch_pct": 96.05, "eta": "00:05:37", "max_grad_norm": 1.0, "loss": -0.5721, "grad_norm": 2.042795419692993, "learning_rate": 2.4074809405425227e-08, "grpo_mean_advantage": 4.0605664253234863e-07, "grpo_std_advantage": 2.3210795916384086e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5842767357826233}
98
- {"ts": "2025-12-27T22:01:58", "event": "train_log", "step": 485, "epoch": 1.940940940940941, "progress_pct": 97.0, "epoch_pct": 97.05, "eta": "00:04:13", "max_grad_norm": 1.0, "loss": -0.5944, "grad_norm": 2.800136089324951, "learning_rate": 1.3984829833499636e-08, "grpo_mean_advantage": 1.341104507446289e-07, "grpo_std_advantage": 1.507950400991831e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5495311617851257}
99
- {"ts": "2025-12-27T22:03:20", "event": "train_log", "step": 490, "epoch": 1.960960960960961, "progress_pct": 98.0, "epoch_pct": 98.05, "eta": "00:02:48", "max_grad_norm": 1.0, "loss": -0.7015, "grad_norm": 2.8475866317749023, "learning_rate": 6.6132713508446075e-09, "grpo_mean_advantage": 2.689659481802664e-07, "grpo_std_advantage": 8.491958851664094e-07, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.549436628818512}
100
- {"ts": "2025-12-27T22:04:42", "event": "train_log", "step": 495, "epoch": 1.980980980980981, "progress_pct": 99.0, "epoch_pct": 99.05, "eta": "00:01:24", "max_grad_norm": 1.0, "loss": -0.4033, "grad_norm": 2.9422402381896973, "learning_rate": 1.9681946484320645e-09, "grpo_mean_advantage": 8.195638656616211e-08, "grpo_std_advantage": 3.802849732892355e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.544632077217102}
101
- {"ts": "2025-12-27T22:06:01", "event": "train_log", "step": 500, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "loss": -0.6773, "grad_norm": 2.66204833984375, "learning_rate": 5.467904943851077e-11, "grpo_mean_advantage": 7.552536089860951e-07, "grpo_std_advantage": 4.143997102801222e-06, "grpo_mean_kl_div": 0.0, "grpo_mean_group_score": 0.5968535542488098}
102
- {"ts": "2025-12-27T22:06:03", "event": "train_log", "step": 500, "epoch": 2.0, "progress_pct": 100.0, "epoch_pct": 100.0, "eta": "00:00:00", "max_grad_norm": 1.0, "train_runtime": 8430.3461, "train_samples_per_second": 0.474, "train_steps_per_second": 0.059, "total_flos": 0.0, "train_loss": -0.23323759501613678}
 
1
+ {"ts": "2025-12-26T15:24:36", "event": "train_log", "step": 2, "epoch": 0.0023357664233576644, "progress_pct": 0.08, "epoch_pct": 0.08, "eta": "07:30:29", "max_grad_norm": 1.0, "loss": 0.6931473016738892, "grad_norm": 1.2424817085266113, "learning_rate": 1.9379844961240311e-07, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -368.911865234375, "logps/rejected": -398.83880615234375, "logits/chosen": 5.179401397705078, "logits/rejected": 5.192930698394775}
2
+ {"ts": "2025-12-26T15:24:56", "event": "train_log", "step": 4, "epoch": 0.004671532846715329, "progress_pct": 0.16, "epoch_pct": 0.16, "eta": "07:14:49", "max_grad_norm": 1.0, "loss": 0.693317174911499, "grad_norm": 1.3884541988372803, "learning_rate": 5.813953488372093e-07, "rewards/chosen": 0.022540951147675514, "rewards/rejected": 0.022656824439764023, "rewards/accuracies": 0.5, "rewards/margins": -0.00011587224435061216, "logps/chosen": -338.257568359375, "logps/rejected": -366.88128662109375, "logits/chosen": 5.405174255371094, "logits/rejected": 5.456291675567627}
3
+ {"ts": "2025-12-26T15:57:54", "event": "train_log", "step": 2, "epoch": 0.0023357664233576644, "progress_pct": 0.08, "epoch_pct": 0.08, "eta": "07:30:57", "max_grad_norm": 1.0, "loss": 0.6931473016738892, "grad_norm": 1.242694616317749, "learning_rate": 1.9379844961240311e-07, "rewards/chosen": 0.0, "rewards/rejected": 0.0, "rewards/accuracies": 0.0, "rewards/margins": 0.0, "logps/chosen": -368.911865234375, "logps/rejected": -398.83880615234375, "logits/chosen": 5.179401397705078, "logits/rejected": 5.192930698394775}
4
+ {"ts": "2025-12-26T15:58:14", "event": "train_log", "step": 4, "epoch": 0.004671532846715329, "progress_pct": 0.16, "epoch_pct": 0.16, "eta": "07:15:10", "max_grad_norm": 1.0, "loss": 0.6949559450149536, "grad_norm": 1.392399787902832, "learning_rate": 5.813953488372093e-07, "rewards/chosen": 0.004504585638642311, "rewards/rejected": 0.007727146148681641, "rewards/accuracies": 0.625, "rewards/margins": -0.003222561441361904, "logps/chosen": -338.43792724609375, "logps/rejected": -367.03057861328125, "logits/chosen": 5.403897762298584, "logits/rejected": 5.4565606117248535}
5
+ {"ts": "2025-12-26T15:58:34", "event": "train_log", "step": 6, "epoch": 0.0070072992700729924, "progress_pct": 0.23, "epoch_pct": 0.23, "eta": "07:13:22", "max_grad_norm": 1.0, "loss": 0.689236581325531, "grad_norm": 1.066603183746338, "learning_rate": 9.689922480620155e-07, "rewards/chosen": -0.0034066196531057358, "rewards/rejected": -0.01166229322552681, "rewards/accuracies": 0.5625, "rewards/margins": 0.008255671709775925, "logps/chosen": -362.3431701660156, "logps/rejected": -387.5829772949219, "logits/chosen": 5.291868209838867, "logits/rejected": 5.328356742858887}
6
+ {"ts": "2025-12-26T15:58:54", "event": "train_log", "step": 8, "epoch": 0.009343065693430658, "progress_pct": 0.31, "epoch_pct": 0.31, "eta": "07:08:27", "max_grad_norm": 1.0, "loss": 0.6943775415420532, "grad_norm": 1.0005714893341064, "learning_rate": 1.3565891472868218e-06, "rewards/chosen": 0.014657974243164062, "rewards/rejected": 0.015892982482910156, "rewards/accuracies": 0.375, "rewards/margins": -0.0012350091710686684, "logps/chosen": -379.9283447265625, "logps/rejected": -389.0852355957031, "logits/chosen": 5.323437690734863, "logits/rejected": 5.410858631134033}
7
+ {"ts": "2025-12-26T15:59:13", "event": "train_log", "step": 10, "epoch": 0.01167883211678832, "progress_pct": 0.39, "epoch_pct": 0.39, "eta": "07:07:21", "max_grad_norm": 1.0, "loss": 0.693260908126831, "grad_norm": 1.2461222410202026, "learning_rate": 1.744186046511628e-06, "rewards/chosen": -0.028497030958533287, "rewards/rejected": -0.028623390942811966, "rewards/accuracies": 0.625, "rewards/margins": 0.00012636138126254082, "logps/chosen": -363.2003479003906, "logps/rejected": -389.67376708984375, "logits/chosen": 5.435908317565918, "logits/rejected": 5.494542121887207}
8
+ {"ts": "2025-12-26T15:59:34", "event": "train_log", "step": 12, "epoch": 0.014014598540145985, "progress_pct": 0.47, "epoch_pct": 0.47, "eta": "07:08:38", "max_grad_norm": 1.0, "loss": 0.6882913112640381, "grad_norm": 1.4030137062072754, "learning_rate": 2.131782945736434e-06, "rewards/chosen": 0.01622028276324272, "rewards/rejected": 0.006133650429546833, "rewards/accuracies": 0.5, "rewards/margins": 0.010086631402373314, "logps/chosen": -370.96429443359375, "logps/rejected": -402.4786071777344, "logits/chosen": 5.3550801277160645, "logits/rejected": 5.375768661499023}
9
+ {"ts": "2025-12-26T15:59:52", "event": "train_log", "step": 14, "epoch": 0.01635036496350365, "progress_pct": 0.54, "epoch_pct": 0.55, "eta": "07:02:49", "max_grad_norm": 1.0, "loss": 0.6896716356277466, "grad_norm": 1.1157702207565308, "learning_rate": 2.5193798449612402e-06, "rewards/chosen": -0.017319394275546074, "rewards/rejected": -0.024647902697324753, "rewards/accuracies": 0.625, "rewards/margins": 0.007328510750085115, "logps/chosen": -336.7254333496094, "logps/rejected": -357.52203369140625, "logits/chosen": 5.515308380126953, "logits/rejected": 5.561104774475098}
10
+ {"ts": "2025-12-26T16:00:12", "event": "train_log", "step": 16, "epoch": 0.018686131386861315, "progress_pct": 0.62, "epoch_pct": 0.62, "eta": "07:03:25", "max_grad_norm": 1.0, "loss": 0.6904245018959045, "grad_norm": 0.9470655918121338, "learning_rate": 2.9069767441860468e-06, "rewards/chosen": 0.03270244598388672, "rewards/rejected": 0.026875685900449753, "rewards/accuracies": 0.5625, "rewards/margins": 0.005826758686453104, "logps/chosen": -415.6842041015625, "logps/rejected": -441.1054992675781, "logits/chosen": 5.553088665008545, "logits/rejected": 5.582851886749268}
11
+ {"ts": "2025-12-26T16:00:33", "event": "train_log", "step": 18, "epoch": 0.021021897810218976, "progress_pct": 0.7, "epoch_pct": 0.7, "eta": "07:04:54", "max_grad_norm": 1.0, "loss": 0.683630108833313, "grad_norm": 1.4397331476211548, "learning_rate": 3.2945736434108533e-06, "rewards/chosen": 0.011020278558135033, "rewards/rejected": -0.008498954586684704, "rewards/accuracies": 0.5625, "rewards/margins": 0.01951923407614231, "logps/chosen": -392.46221923828125, "logps/rejected": -420.1712341308594, "logits/chosen": 5.440742015838623, "logits/rejected": 5.489529132843018}
12
+ {"ts": "2025-12-26T16:00:52", "event": "train_log", "step": 20, "epoch": 0.02335766423357664, "progress_pct": 0.78, "epoch_pct": 0.78, "eta": "07:03:26", "max_grad_norm": 1.0, "loss": 0.6902388334274292, "grad_norm": 1.5941083431243896, "learning_rate": 3.6821705426356594e-06, "rewards/chosen": 0.006536484230309725, "rewards/rejected": 0.0005230908282101154, "rewards/accuracies": 0.5625, "rewards/margins": 0.006013393402099609, "logps/chosen": -345.2221374511719, "logps/rejected": -365.9537048339844, "logits/chosen": 5.318347930908203, "logits/rejected": 5.397945404052734}
13
+ {"ts": "2025-12-26T16:01:12", "event": "train_log", "step": 22, "epoch": 0.025693430656934305, "progress_pct": 0.86, "epoch_pct": 0.86, "eta": "07:03:12", "max_grad_norm": 1.0, "loss": 0.691262423992157, "grad_norm": 1.1363905668258667, "learning_rate": 4.0697674418604655e-06, "rewards/chosen": 0.011908342130482197, "rewards/rejected": 0.007370188366621733, "rewards/accuracies": 0.5, "rewards/margins": 0.004538153763860464, "logps/chosen": -347.9439697265625, "logps/rejected": -370.65777587890625, "logits/chosen": 5.632981300354004, "logits/rejected": 5.7265520095825195}
14
+ {"ts": "2025-12-26T16:01:32", "event": "train_log", "step": 24, "epoch": 0.02802919708029197, "progress_pct": 0.93, "epoch_pct": 0.93, "eta": "07:02:08", "max_grad_norm": 1.0, "loss": 0.6769475936889648, "grad_norm": 1.0684627294540405, "learning_rate": 4.457364341085272e-06, "rewards/chosen": 0.01244144607335329, "rewards/rejected": -0.020452119410037994, "rewards/accuracies": 0.875, "rewards/margins": 0.03289356082677841, "logps/chosen": -347.1539001464844, "logps/rejected": -377.6044921875, "logits/chosen": 5.35699987411499, "logits/rejected": 5.405580520629883}
15
+ {"ts": "2025-12-26T16:09:16", "event": "train_log", "step": 25, "epoch": 0.029197080291970802, "progress_pct": 0.97, "epoch_pct": 0.97, "eta": "19:53:32", "max_grad_norm": 1.0, "eval_loss": 0.6836819648742676, "eval_runtime": 454.4375, "eval_samples_per_second": 1.677, "eval_steps_per_second": 1.677, "eval_rewards/chosen": 0.024636391550302505, "eval_rewards/rejected": 0.005080964416265488, "eval_rewards/accuracies": 0.665354311466217, "eval_rewards/margins": 0.019555427134037018, "eval_logps/chosen": -370.1607666015625, "eval_logps/rejected": -395.7251892089844, "eval_logits/chosen": 5.295141220092773, "eval_logits/rejected": 5.345211029052734}
16
+ {"ts": "2025-12-26T16:09:27", "event": "train_log", "step": 26, "epoch": 0.030364963503649634, "progress_pct": 1.01, "epoch_pct": 1.01, "eta": "19:24:28", "max_grad_norm": 1.0, "loss": 0.6849788427352905, "grad_norm": 1.592353105545044, "learning_rate": 4.844961240310078e-06, "rewards/chosen": 0.026385309174656868, "rewards/rejected": 0.009418869391083717, "rewards/accuracies": 0.625, "rewards/margins": 0.016966437920928, "logps/chosen": -387.54876708984375, "logps/rejected": -412.0630187988281, "logits/chosen": 5.157042026519775, "logits/rejected": 5.244912147521973}
17
+ {"ts": "2025-12-26T16:09:47", "event": "train_log", "step": 28, "epoch": 0.0327007299270073, "progress_pct": 1.09, "epoch_pct": 1.09, "eta": "18:30:11", "max_grad_norm": 1.0, "loss": 0.675189733505249, "grad_norm": 1.3181558847427368, "learning_rate": 5.232558139534884e-06, "rewards/chosen": 0.045946408063173294, "rewards/rejected": 0.009190557524561882, "rewards/accuracies": 0.8125, "rewards/margins": 0.03675585240125656, "logps/chosen": -360.41650390625, "logps/rejected": -391.2162170410156, "logits/chosen": 5.545513153076172, "logits/rejected": 5.54400110244751}
18
+ {"ts": "2025-12-26T16:10:07", "event": "train_log", "step": 30, "epoch": 0.035036496350364967, "progress_pct": 1.17, "epoch_pct": 1.17, "eta": "17:43:26", "max_grad_norm": 1.0, "loss": 0.6752142310142517, "grad_norm": 1.443650722503662, "learning_rate": 5.620155038759691e-06, "rewards/chosen": 0.04194517061114311, "rewards/rejected": 0.005256845150142908, "rewards/accuracies": 0.8125, "rewards/margins": 0.03668833151459694, "logps/chosen": -378.6293640136719, "logps/rejected": -405.3665466308594, "logits/chosen": 5.136168003082275, "logits/rejected": 5.239327907562256}
19
+ {"ts": "2025-12-26T16:10:26", "event": "train_log", "step": 32, "epoch": 0.03737226277372263, "progress_pct": 1.24, "epoch_pct": 1.25, "eta": "17:01:56", "max_grad_norm": 1.0, "loss": 0.6700581312179565, "grad_norm": 1.379568338394165, "learning_rate": 6.007751937984497e-06, "rewards/chosen": 0.06658173352479935, "rewards/rejected": 0.019388392567634583, "rewards/accuracies": 0.875, "rewards/margins": 0.047193337231874466, "logps/chosen": -358.5367736816406, "logps/rejected": -382.4181213378906, "logits/chosen": 5.411487579345703, "logits/rejected": 5.427243232727051}
20
+ {"ts": "2025-12-26T16:10:44", "event": "train_log", "step": 34, "epoch": 0.039708029197080295, "progress_pct": 1.32, "epoch_pct": 1.32, "eta": "16:23:25", "max_grad_norm": 1.0, "loss": 0.6610866785049438, "grad_norm": 1.3260451555252075, "learning_rate": 6.395348837209303e-06, "rewards/chosen": 0.07038869708776474, "rewards/rejected": 0.0045137410052120686, "rewards/accuracies": 0.9375, "rewards/margins": 0.06587495654821396, "logps/chosen": -326.9423828125, "logps/rejected": -346.52081298828125, "logits/chosen": 5.207217216491699, "logits/rejected": 5.254848480224609}
21
+ {"ts": "2025-12-26T16:11:04", "event": "train_log", "step": 36, "epoch": 0.04204379562043795, "progress_pct": 1.4, "epoch_pct": 1.4, "eta": "15:51:44", "max_grad_norm": 1.0, "loss": 0.6281551718711853, "grad_norm": 1.5776340961456299, "learning_rate": 6.782945736434108e-06, "rewards/chosen": 0.11738375574350357, "rewards/rejected": -0.018992995843291283, "rewards/accuracies": 1.0, "rewards/margins": 0.1363767683506012, "logps/chosen": -359.9613952636719, "logps/rejected": -384.31683349609375, "logits/chosen": 5.550538063049316, "logits/rejected": 5.6374335289001465}
22
+ {"ts": "2025-12-26T16:11:23", "event": "train_log", "step": 38, "epoch": 0.04437956204379562, "progress_pct": 1.48, "epoch_pct": 1.48, "eta": "15:21:57", "max_grad_norm": 1.0, "loss": 0.6270830631256104, "grad_norm": 1.8589071035385132, "learning_rate": 7.170542635658915e-06, "rewards/chosen": 0.1617884635925293, "rewards/rejected": 0.022934721782803535, "rewards/accuracies": 0.9375, "rewards/margins": 0.1388537436723709, "logps/chosen": -325.8544616699219, "logps/rejected": -351.9772644042969, "logits/chosen": 5.39143180847168, "logits/rejected": 5.412029266357422}
23
+ {"ts": "2025-12-26T16:11:43", "event": "train_log", "step": 40, "epoch": 0.04671532846715328, "progress_pct": 1.56, "epoch_pct": 1.56, "eta": "14:56:25", "max_grad_norm": 1.0, "loss": 0.641180157661438, "grad_norm": 1.3231571912765503, "learning_rate": 7.558139534883721e-06, "rewards/chosen": 0.15248623490333557, "rewards/rejected": 0.04090070724487305, "rewards/accuracies": 0.875, "rewards/margins": 0.11158552765846252, "logps/chosen": -343.3839111328125, "logps/rejected": -374.7848205566406, "logits/chosen": 5.189720153808594, "logits/rejected": 5.203127384185791}
24
+ {"ts": "2025-12-26T16:12:02", "event": "train_log", "step": 42, "epoch": 0.049051094890510946, "progress_pct": 1.63, "epoch_pct": 1.64, "eta": "14:31:31", "max_grad_norm": 1.0, "loss": 0.6093671321868896, "grad_norm": 2.5331315994262695, "learning_rate": 7.945736434108528e-06, "rewards/chosen": 0.2898235321044922, "rewards/rejected": 0.10823898762464523, "rewards/accuracies": 0.9375, "rewards/margins": 0.18158456683158875, "logps/chosen": -341.813720703125, "logps/rejected": -372.44952392578125, "logits/chosen": 5.420182228088379, "logits/rejected": 5.45302677154541}
25
+ {"ts": "2025-12-26T16:12:20", "event": "train_log", "step": 44, "epoch": 0.05138686131386861, "progress_pct": 1.71, "epoch_pct": 1.71, "eta": "14:08:47", "max_grad_norm": 1.0, "loss": 0.5815833210945129, "grad_norm": 1.5247384309768677, "learning_rate": 8.333333333333334e-06, "rewards/chosen": 0.32459571957588196, "rewards/rejected": 0.07354050129652023, "rewards/accuracies": 0.8125, "rewards/margins": 0.2510552406311035, "logps/chosen": -354.49627685546875, "logps/rejected": -376.88818359375, "logits/chosen": 5.383636951446533, "logits/rejected": 5.397551536560059}
26
+ {"ts": "2025-12-26T16:12:40", "event": "train_log", "step": 46, "epoch": 0.053722627737226275, "progress_pct": 1.79, "epoch_pct": 1.79, "eta": "13:49:08", "max_grad_norm": 1.0, "loss": 0.5269681215286255, "grad_norm": 2.0814144611358643, "learning_rate": 8.72093023255814e-06, "rewards/chosen": 0.6465227603912354, "rewards/rejected": 0.27069616317749023, "rewards/accuracies": 0.9375, "rewards/margins": 0.37582656741142273, "logps/chosen": -331.1025390625, "logps/rejected": -362.90118408203125, "logits/chosen": 5.269731044769287, "logits/rejected": 5.287116050720215}
27
+ {"ts": "2025-12-26T16:12:59", "event": "train_log", "step": 48, "epoch": 0.05605839416058394, "progress_pct": 1.87, "epoch_pct": 1.87, "eta": "13:31:19", "max_grad_norm": 1.0, "loss": 0.5066201686859131, "grad_norm": 1.769063115119934, "learning_rate": 9.108527131782946e-06, "rewards/chosen": 0.6377636194229126, "rewards/rejected": 0.21126146614551544, "rewards/accuracies": 1.0, "rewards/margins": 0.42650213837623596, "logps/chosen": -369.40283203125, "logps/rejected": -400.18438720703125, "logits/chosen": 5.472540855407715, "logits/rejected": 5.465417861938477}
28
+ {"ts": "2025-12-26T16:13:21", "event": "train_log", "step": 50, "epoch": 0.058394160583941604, "progress_pct": 1.94, "epoch_pct": 1.95, "eta": "13:16:33", "max_grad_norm": 1.0, "loss": 0.529259979724884, "grad_norm": 2.84169602394104, "learning_rate": 9.496124031007753e-06, "rewards/chosen": 0.7923164367675781, "rewards/rejected": 0.4136104881763458, "rewards/accuracies": 1.0, "rewards/margins": 0.3787059783935547, "logps/chosen": -363.4556579589844, "logps/rejected": -397.8169860839844, "logits/chosen": 5.050387382507324, "logits/rejected": 5.112288951873779}
29
+ {"ts": "2025-12-26T16:20:56", "event": "train_log", "step": 50, "epoch": 0.058394160583941604, "progress_pct": 1.94, "epoch_pct": 1.95, "eta": "19:38:32", "max_grad_norm": 1.0, "eval_loss": 0.4610801041126251, "eval_runtime": 454.5598, "eval_samples_per_second": 1.676, "eval_steps_per_second": 1.676, "eval_rewards/chosen": 0.8944254517555237, "eval_rewards/rejected": 0.3205168545246124, "eval_rewards/accuracies": 0.9619422554969788, "eval_rewards/margins": 0.5739086270332336, "eval_logps/chosen": -361.462890625, "eval_logps/rejected": -392.5708312988281, "eval_logits/chosen": 5.22359037399292, "eval_logits/rejected": 5.286833763122559}
30
+ {"ts": "2025-12-26T16:21:15", "event": "train_log", "step": 52, "epoch": 0.06072992700729927, "progress_pct": 2.02, "epoch_pct": 2.02, "eta": "19:08:07", "max_grad_norm": 1.0, "loss": 0.44602835178375244, "grad_norm": 1.6907895803451538, "learning_rate": 9.883720930232558e-06, "rewards/chosen": 0.9869746565818787, "rewards/rejected": 0.3813100755214691, "rewards/accuracies": 0.9375, "rewards/margins": 0.6056646108627319, "logps/chosen": -343.4534606933594, "logps/rejected": -379.39508056640625, "logits/chosen": 5.486469268798828, "logits/rejected": 5.541717529296875}
31
+ {"ts": "2025-12-26T16:21:36", "event": "train_log", "step": 54, "epoch": 0.06306569343065693, "progress_pct": 2.1, "epoch_pct": 2.1, "eta": "18:41:01", "max_grad_norm": 1.0, "loss": 0.43609702587127686, "grad_norm": 1.9458682537078857, "learning_rate": 1.0271317829457365e-05, "rewards/chosen": 0.7794930934906006, "rewards/rejected": 0.15292587876319885, "rewards/accuracies": 1.0, "rewards/margins": 0.6265671253204346, "logps/chosen": -379.5437316894531, "logps/rejected": -401.5587463378906, "logits/chosen": 5.169528961181641, "logits/rejected": 5.2688751220703125}
32
+ {"ts": "2025-12-26T16:21:57", "event": "train_log", "step": 56, "epoch": 0.0654014598540146, "progress_pct": 2.18, "epoch_pct": 2.18, "eta": "18:15:44", "max_grad_norm": 1.0, "loss": 0.3928414583206177, "grad_norm": 2.1266520023345947, "learning_rate": 1.065891472868217e-05, "rewards/chosen": 1.274291753768921, "rewards/rejected": 0.4878700375556946, "rewards/accuracies": 0.9375, "rewards/margins": 0.7864217758178711, "logps/chosen": -378.0788269042969, "logps/rejected": -413.27392578125, "logits/chosen": 5.097426414489746, "logits/rejected": 5.15327262878418}
33
+ {"ts": "2025-12-26T16:22:18", "event": "train_log", "step": 58, "epoch": 0.06773722627737226, "progress_pct": 2.26, "epoch_pct": 2.26, "eta": "17:52:03", "max_grad_norm": 1.0, "loss": 0.35855019092559814, "grad_norm": 1.5381489992141724, "learning_rate": 1.1046511627906977e-05, "rewards/chosen": 1.2897911071777344, "rewards/rejected": 0.35436347126960754, "rewards/accuracies": 0.875, "rewards/margins": 0.9354276061058044, "logps/chosen": -372.93438720703125, "logps/rejected": -401.8287658691406, "logits/chosen": 5.138954162597656, "logits/rejected": 5.20254373550415}
34
+ {"ts": "2025-12-26T16:22:39", "event": "train_log", "step": 60, "epoch": 0.07007299270072993, "progress_pct": 2.33, "epoch_pct": 2.34, "eta": "17:29:56", "max_grad_norm": 1.0, "loss": 0.42801612615585327, "grad_norm": 2.358330726623535, "learning_rate": 1.1434108527131783e-05, "rewards/chosen": 1.3823509216308594, "rewards/rejected": 0.6532848477363586, "rewards/accuracies": 0.875, "rewards/margins": 0.729066014289856, "logps/chosen": -360.984619140625, "logps/rejected": -392.3192138671875, "logits/chosen": 5.071888446807861, "logits/rejected": 5.187964916229248}
35
+ {"ts": "2025-12-26T16:22:58", "event": "train_log", "step": 62, "epoch": 0.07240875912408759, "progress_pct": 2.41, "epoch_pct": 2.41, "eta": "17:08:40", "max_grad_norm": 1.0, "loss": 0.31365492939949036, "grad_norm": 2.177586317062378, "learning_rate": 1.182170542635659e-05, "rewards/chosen": 1.6637591123580933, "rewards/rejected": 0.5750135183334351, "rewards/accuracies": 1.0, "rewards/margins": 1.0887457132339478, "logps/chosen": -364.808349609375, "logps/rejected": -401.0321044921875, "logits/chosen": 5.264093399047852, "logits/rejected": 5.310842990875244}
36
+ {"ts": "2025-12-26T16:23:19", "event": "train_log", "step": 64, "epoch": 0.07474452554744526, "progress_pct": 2.49, "epoch_pct": 2.49, "eta": "16:48:50", "max_grad_norm": 1.0, "loss": 0.3037749230861664, "grad_norm": 1.697789192199707, "learning_rate": 1.2209302325581395e-05, "rewards/chosen": 1.6470392942428589, "rewards/rejected": 0.5321945548057556, "rewards/accuracies": 1.0, "rewards/margins": 1.114844799041748, "logps/chosen": -359.8249816894531, "logps/rejected": -397.2122497558594, "logits/chosen": 5.191982269287109, "logits/rejected": 5.261416912078857}
37
+ {"ts": "2025-12-26T16:23:38", "event": "train_log", "step": 66, "epoch": 0.07708029197080292, "progress_pct": 2.57, "epoch_pct": 2.57, "eta": "16:29:40", "max_grad_norm": 1.0, "loss": 0.25026455521583557, "grad_norm": 1.3219914436340332, "learning_rate": 1.2596899224806202e-05, "rewards/chosen": 1.5671364068984985, "rewards/rejected": 0.15732917189598083, "rewards/accuracies": 1.0, "rewards/margins": 1.4098074436187744, "logps/chosen": -352.3752136230469, "logps/rejected": -392.6779479980469, "logits/chosen": 5.293405532836914, "logits/rejected": 5.3094048500061035}
38
+ {"ts": "2025-12-26T16:23:57", "event": "train_log", "step": 68, "epoch": 0.07941605839416059, "progress_pct": 2.64, "epoch_pct": 2.65, "eta": "16:11:16", "max_grad_norm": 1.0, "loss": 0.3108353912830353, "grad_norm": 1.8173967599868774, "learning_rate": 1.2984496124031009e-05, "rewards/chosen": 1.4788665771484375, "rewards/rejected": 0.2151254564523697, "rewards/accuracies": 0.9375, "rewards/margins": 1.2637410163879395, "logps/chosen": -319.99700927734375, "logps/rejected": -364.115234375, "logits/chosen": 5.025746822357178, "logits/rejected": 5.114965438842773}
39
+ {"ts": "2025-12-26T16:24:16", "event": "train_log", "step": 70, "epoch": 0.08175182481751825, "progress_pct": 2.72, "epoch_pct": 2.73, "eta": "15:54:17", "max_grad_norm": 1.0, "loss": 0.22991834580898285, "grad_norm": 1.0658400058746338, "learning_rate": 1.3372093023255814e-05, "rewards/chosen": 1.3950352668762207, "rewards/rejected": -0.1014888733625412, "rewards/accuracies": 1.0, "rewards/margins": 1.4965243339538574, "logps/chosen": -383.84033203125, "logps/rejected": -431.7752685546875, "logits/chosen": 4.945235729217529, "logits/rejected": 4.959147930145264}
40
+ {"ts": "2025-12-26T16:24:35", "event": "train_log", "step": 72, "epoch": 0.0840875912408759, "progress_pct": 2.8, "epoch_pct": 2.8, "eta": "15:38:16", "max_grad_norm": 1.0, "loss": 0.22603684663772583, "grad_norm": 1.0350896120071411, "learning_rate": 1.375968992248062e-05, "rewards/chosen": 1.2978975772857666, "rewards/rejected": -0.34637776017189026, "rewards/accuracies": 1.0, "rewards/margins": 1.644275426864624, "logps/chosen": -350.9471435546875, "logps/rejected": -382.6837158203125, "logits/chosen": 5.00426721572876, "logits/rejected": 5.120238780975342}
41
+ {"ts": "2025-12-26T16:24:56", "event": "train_log", "step": 74, "epoch": 0.08642335766423358, "progress_pct": 2.88, "epoch_pct": 2.88, "eta": "15:23:40", "max_grad_norm": 1.0, "loss": 0.18921935558319092, "grad_norm": 1.1595423221588135, "learning_rate": 1.4147286821705426e-05, "rewards/chosen": 1.1984589099884033, "rewards/rejected": -0.5510700941085815, "rewards/accuracies": 1.0, "rewards/margins": 1.7495291233062744, "logps/chosen": -352.34967041015625, "logps/rejected": -399.23028564453125, "logits/chosen": 4.890130043029785, "logits/rejected": 4.9504714012146}
42
+ {"ts": "2025-12-26T16:32:39", "event": "train_log", "step": 75, "epoch": 0.08759124087591241, "progress_pct": 2.92, "epoch_pct": 2.92, "eta": "19:27:59", "max_grad_norm": 1.0, "eval_loss": 0.16020436584949493, "eval_runtime": 454.3435, "eval_samples_per_second": 1.677, "eval_steps_per_second": 1.677, "eval_rewards/chosen": 1.1210675239562988, "eval_rewards/rejected": -0.9336051344871521, "eval_rewards/accuracies": 0.9960629940032959, "eval_rewards/margins": 2.0546727180480957, "eval_logps/chosen": -359.19647216796875, "eval_logps/rejected": -405.1120300292969, "eval_logits/chosen": 4.930174827575684, "eval_logits/rejected": 5.032296657562256}
43
+ {"ts": "2025-12-26T16:32:49", "event": "train_log", "step": 76, "epoch": 0.08875912408759123, "progress_pct": 2.96, "epoch_pct": 2.96, "eta": "19:17:37", "max_grad_norm": 1.0, "loss": 0.15998858213424683, "grad_norm": 1.1433167457580566, "learning_rate": 1.4534883720930233e-05, "rewards/chosen": 1.2128857374191284, "rewards/rejected": -0.8816256523132324, "rewards/accuracies": 1.0, "rewards/margins": 2.0945115089416504, "logps/chosen": -313.110595703125, "logps/rejected": -356.1000061035156, "logits/chosen": 5.037275314331055, "logits/rejected": 5.1315507888793945}
44
+ {"ts": "2025-12-26T16:33:09", "event": "train_log", "step": 78, "epoch": 0.0910948905109489, "progress_pct": 3.03, "epoch_pct": 3.04, "eta": "18:57:48", "max_grad_norm": 1.0, "loss": 0.1894684135913849, "grad_norm": 0.9839214086532593, "learning_rate": 1.4922480620155039e-05, "rewards/chosen": 1.0605502128601074, "rewards/rejected": -0.8470743894577026, "rewards/accuracies": 1.0, "rewards/margins": 1.90762460231781, "logps/chosen": -366.2629089355469, "logps/rejected": -405.7989196777344, "logits/chosen": 4.817085266113281, "logits/rejected": 4.874035835266113}
45
+ {"ts": "2025-12-26T16:33:29", "event": "train_log", "step": 80, "epoch": 0.09343065693430656, "progress_pct": 3.11, "epoch_pct": 3.11, "eta": "18:38:40", "max_grad_norm": 1.0, "loss": 0.15948188304901123, "grad_norm": 0.9212782979011536, "learning_rate": 1.5310077519379846e-05, "rewards/chosen": 0.676516056060791, "rewards/rejected": -1.4909145832061768, "rewards/accuracies": 1.0, "rewards/margins": 2.167430877685547, "logps/chosen": -348.0658264160156, "logps/rejected": -395.23870849609375, "logits/chosen": 5.046716690063477, "logits/rejected": 5.157979965209961}
46
+ {"ts": "2025-12-26T16:33:49", "event": "train_log", "step": 82, "epoch": 0.09576642335766423, "progress_pct": 3.19, "epoch_pct": 3.19, "eta": "18:20:47", "max_grad_norm": 1.0, "loss": 0.12085139006376266, "grad_norm": 0.9820688366889954, "learning_rate": 1.569767441860465e-05, "rewards/chosen": 0.8719685077667236, "rewards/rejected": -1.7745698690414429, "rewards/accuracies": 1.0, "rewards/margins": 2.646538257598877, "logps/chosen": -378.8666076660156, "logps/rejected": -436.9100036621094, "logits/chosen": 4.690741539001465, "logits/rejected": 4.771791458129883}
47
+ {"ts": "2025-12-26T16:34:09", "event": "train_log", "step": 84, "epoch": 0.09810218978102189, "progress_pct": 3.27, "epoch_pct": 3.27, "eta": "18:03:33", "max_grad_norm": 1.0, "loss": 0.08720710873603821, "grad_norm": 0.66785728931427, "learning_rate": 1.608527131782946e-05, "rewards/chosen": 1.1337480545043945, "rewards/rejected": -1.7701961994171143, "rewards/accuracies": 1.0, "rewards/margins": 2.903944253921509, "logps/chosen": -346.51214599609375, "logps/rejected": -400.1110534667969, "logits/chosen": 4.880465984344482, "logits/rejected": 4.961792945861816}
48
+ {"ts": "2025-12-26T16:34:28", "event": "train_log", "step": 86, "epoch": 0.10043795620437956, "progress_pct": 3.35, "epoch_pct": 3.35, "eta": "17:46:40", "max_grad_norm": 1.0, "loss": 0.07942983508110046, "grad_norm": 0.5760660767555237, "learning_rate": 1.647286821705426e-05, "rewards/chosen": 1.2459325790405273, "rewards/rejected": -1.7693227529525757, "rewards/accuracies": 1.0, "rewards/margins": 3.0152552127838135, "logps/chosen": -341.7489318847656, "logps/rejected": -398.322021484375, "logits/chosen": 4.464397430419922, "logits/rejected": 4.680055618286133}
49
+ {"ts": "2025-12-26T16:34:48", "event": "train_log", "step": 88, "epoch": 0.10277372262773722, "progress_pct": 3.42, "epoch_pct": 3.43, "eta": "17:30:52", "max_grad_norm": 1.0, "loss": 0.1258174479007721, "grad_norm": 1.6020294427871704, "learning_rate": 1.686046511627907e-05, "rewards/chosen": 1.0706769227981567, "rewards/rejected": -2.0480403900146484, "rewards/accuracies": 0.9375, "rewards/margins": 3.118717670440674, "logps/chosen": -344.9147644042969, "logps/rejected": -395.4453125, "logits/chosen": 4.563863277435303, "logits/rejected": 4.680974960327148}
50
+ {"ts": "2025-12-26T16:35:06", "event": "train_log", "step": 90, "epoch": 0.10510948905109489, "progress_pct": 3.5, "epoch_pct": 3.5, "eta": "17:14:58", "max_grad_norm": 1.0, "loss": 0.06663060188293457, "grad_norm": 0.46413859724998474, "learning_rate": 1.7248062015503875e-05, "rewards/chosen": 1.4128761291503906, "rewards/rejected": -2.3478102684020996, "rewards/accuracies": 1.0, "rewards/margins": 3.760685920715332, "logps/chosen": -326.9678649902344, "logps/rejected": -388.4164123535156, "logits/chosen": 4.4989237785339355, "logits/rejected": 4.673248291015625}
51
+ {"ts": "2025-12-26T16:35:26", "event": "train_log", "step": 92, "epoch": 0.10744525547445255, "progress_pct": 3.58, "epoch_pct": 3.58, "eta": "17:00:34", "max_grad_norm": 1.0, "loss": 0.04481709748506546, "grad_norm": 0.6699568629264832, "learning_rate": 1.7635658914728684e-05, "rewards/chosen": 1.477597713470459, "rewards/rejected": -2.9012341499328613, "rewards/accuracies": 1.0, "rewards/margins": 4.37883186340332, "logps/chosen": -362.7267150878906, "logps/rejected": -439.2985534667969, "logits/chosen": 4.7294535636901855, "logits/rejected": 4.813880920410156}
52
+ {"ts": "2025-12-26T16:35:47", "event": "train_log", "step": 94, "epoch": 0.10978102189781022, "progress_pct": 3.66, "epoch_pct": 3.66, "eta": "16:47:17", "max_grad_norm": 1.0, "loss": 0.05632612109184265, "grad_norm": 0.4152977168560028, "learning_rate": 1.802325581395349e-05, "rewards/chosen": 0.71366286277771, "rewards/rejected": -2.744809150695801, "rewards/accuracies": 1.0, "rewards/margins": 3.4584720134735107, "logps/chosen": -381.59246826171875, "logps/rejected": -444.2817687988281, "logits/chosen": 4.785149574279785, "logits/rejected": 4.891542434692383}
53
+ {"ts": "2025-12-26T16:36:07", "event": "train_log", "step": 96, "epoch": 0.11211678832116788, "progress_pct": 3.73, "epoch_pct": 3.74, "eta": "16:34:09", "max_grad_norm": 1.0, "loss": 0.040920041501522064, "grad_norm": 0.3152717649936676, "learning_rate": 1.8410852713178295e-05, "rewards/chosen": 1.7566397190093994, "rewards/rejected": -2.263956069946289, "rewards/accuracies": 1.0, "rewards/margins": 4.020595550537109, "logps/chosen": -356.7286376953125, "logps/rejected": -414.69635009765625, "logits/chosen": 4.603940486907959, "logits/rejected": 4.804995536804199}
54
+ {"ts": "2025-12-26T16:36:26", "event": "train_log", "step": 98, "epoch": 0.11445255474452555, "progress_pct": 3.81, "epoch_pct": 3.82, "eta": "16:21:07", "max_grad_norm": 1.0, "loss": 0.025794224813580513, "grad_norm": 0.37698569893836975, "learning_rate": 1.8798449612403103e-05, "rewards/chosen": 1.3867536783218384, "rewards/rejected": -3.2675204277038574, "rewards/accuracies": 1.0, "rewards/margins": 4.6542744636535645, "logps/chosen": -339.794189453125, "logps/rejected": -413.8865966796875, "logits/chosen": 4.558542728424072, "logits/rejected": 4.690641403198242}
55
+ {"ts": "2025-12-26T16:36:46", "event": "train_log", "step": 100, "epoch": 0.11678832116788321, "progress_pct": 3.89, "epoch_pct": 3.89, "eta": "16:09:03", "max_grad_norm": 1.0, "loss": 0.015155203640460968, "grad_norm": 0.15023073554039001, "learning_rate": 1.918604651162791e-05, "rewards/chosen": 1.7938623428344727, "rewards/rejected": -3.1486666202545166, "rewards/accuracies": 1.0, "rewards/margins": 4.942529201507568, "logps/chosen": -346.2568054199219, "logps/rejected": -418.9315185546875, "logits/chosen": 4.387497425079346, "logits/rejected": 4.494588375091553}
56
+ {"ts": "2025-12-26T16:44:21", "event": "train_log", "step": 100, "epoch": 0.11678832116788321, "progress_pct": 3.89, "epoch_pct": 3.89, "eta": "19:16:19", "max_grad_norm": 1.0, "eval_loss": 0.04428481683135033, "eval_runtime": 454.7251, "eval_samples_per_second": 1.676, "eval_steps_per_second": 1.676, "eval_rewards/chosen": 1.7248634099960327, "eval_rewards/rejected": -2.863647222518921, "eval_rewards/accuracies": 0.9921259880065918, "eval_rewards/margins": 4.588510513305664, "eval_logps/chosen": -353.15850830078125, "eval_logps/rejected": -424.4124755859375, "eval_logits/chosen": 4.285891056060791, "eval_logits/rejected": 4.425926208496094}
57
+ {"ts": "2025-12-26T16:44:42", "event": "train_log", "step": 102, "epoch": 0.11912408759124088, "progress_pct": 3.97, "epoch_pct": 3.97, "eta": "19:01:01", "max_grad_norm": 1.0, "loss": 0.01589718647301197, "grad_norm": 0.21237261593341827, "learning_rate": 1.9573643410852714e-05, "rewards/chosen": 1.7697646617889404, "rewards/rejected": -3.025937557220459, "rewards/accuracies": 1.0, "rewards/margins": 4.79570198059082, "logps/chosen": -305.01165771484375, "logps/rejected": -384.8538818359375, "logits/chosen": 4.197369575500488, "logits/rejected": 4.352917671203613}
58
+ {"ts": "2025-12-26T16:45:01", "event": "train_log", "step": 104, "epoch": 0.12145985401459854, "progress_pct": 4.05, "epoch_pct": 4.05, "eta": "18:45:50", "max_grad_norm": 1.0, "loss": 0.038177840411663055, "grad_norm": 1.1960583925247192, "learning_rate": 1.996124031007752e-05, "rewards/chosen": 1.556309461593628, "rewards/rejected": -3.2670814990997314, "rewards/accuracies": 1.0, "rewards/margins": 4.823390960693359, "logps/chosen": -341.10675048828125, "logps/rejected": -417.59613037109375, "logits/chosen": 4.184627056121826, "logits/rejected": 4.280352592468262}
59
+ {"ts": "2025-12-26T16:45:21", "event": "train_log", "step": 106, "epoch": 0.12379562043795621, "progress_pct": 4.12, "epoch_pct": 4.13, "eta": "18:31:17", "max_grad_norm": 1.0, "loss": 0.056792374700307846, "grad_norm": 1.3021241426467896, "learning_rate": 2.0348837209302328e-05, "rewards/chosen": 1.6538318395614624, "rewards/rejected": -3.0760293006896973, "rewards/accuracies": 1.0, "rewards/margins": 4.729861736297607, "logps/chosen": -358.1336669921875, "logps/rejected": -426.8945617675781, "logits/chosen": 4.32430362701416, "logits/rejected": 4.451810359954834}
60
+ {"ts": "2025-12-26T16:45:41", "event": "train_log", "step": 108, "epoch": 0.12613138686131387, "progress_pct": 4.2, "epoch_pct": 4.2, "eta": "18:17:24", "max_grad_norm": 1.0, "loss": 0.07614695280790329, "grad_norm": 0.3007296025753021, "learning_rate": 2.0736434108527133e-05, "rewards/chosen": 1.4121378660202026, "rewards/rejected": -3.331850051879883, "rewards/accuracies": 0.9375, "rewards/margins": 4.743987560272217, "logps/chosen": -364.4995422363281, "logps/rejected": -434.4844055175781, "logits/chosen": 4.4918341636657715, "logits/rejected": 4.6333909034729}
61
+ {"ts": "2025-12-26T16:45:59", "event": "train_log", "step": 110, "epoch": 0.12846715328467154, "progress_pct": 4.28, "epoch_pct": 4.28, "eta": "18:03:33", "max_grad_norm": 1.0, "loss": 0.014600476250052452, "grad_norm": 0.42474085092544556, "learning_rate": 2.1124031007751938e-05, "rewards/chosen": 1.958223819732666, "rewards/rejected": -4.051264762878418, "rewards/accuracies": 1.0, "rewards/margins": 6.009488582611084, "logps/chosen": -306.4935607910156, "logps/rejected": -392.5444030761719, "logits/chosen": 3.857876777648926, "logits/rejected": 3.9678285121917725}
62
+ {"ts": "2025-12-26T16:46:19", "event": "train_log", "step": 112, "epoch": 0.1308029197080292, "progress_pct": 4.36, "epoch_pct": 4.36, "eta": "17:50:24", "max_grad_norm": 1.0, "loss": 0.010151136666536331, "grad_norm": 0.14177864789962769, "learning_rate": 2.1511627906976744e-05, "rewards/chosen": 2.196099281311035, "rewards/rejected": -3.75758695602417, "rewards/accuracies": 1.0, "rewards/margins": 5.953686237335205, "logps/chosen": -339.5606689453125, "logps/rejected": -425.51361083984375, "logits/chosen": 4.254065036773682, "logits/rejected": 4.352800369262695}
63
+ {"ts": "2025-12-26T16:46:40", "event": "train_log", "step": 114, "epoch": 0.13313868613138685, "progress_pct": 4.43, "epoch_pct": 4.44, "eta": "17:38:30", "max_grad_norm": 1.0, "loss": 0.011391772888600826, "grad_norm": 0.29438889026641846, "learning_rate": 2.1899224806201552e-05, "rewards/chosen": 1.9949897527694702, "rewards/rejected": -3.3917016983032227, "rewards/accuracies": 1.0, "rewards/margins": 5.386691093444824, "logps/chosen": -349.3886413574219, "logps/rejected": -431.79925537109375, "logits/chosen": 3.7171452045440674, "logits/rejected": 3.9224042892456055}
64
+ {"ts": "2025-12-26T16:47:00", "event": "train_log", "step": 116, "epoch": 0.13547445255474452, "progress_pct": 4.51, "epoch_pct": 4.52, "eta": "17:26:32", "max_grad_norm": 1.0, "loss": 0.024509863927960396, "grad_norm": 0.9541389346122742, "learning_rate": 2.2286821705426357e-05, "rewards/chosen": 1.8549680709838867, "rewards/rejected": -3.4747393131256104, "rewards/accuracies": 1.0, "rewards/margins": 5.329707622528076, "logps/chosen": -343.19482421875, "logps/rejected": -423.23565673828125, "logits/chosen": 3.5138039588928223, "logits/rejected": 3.7400965690612793}
65
+ {"ts": "2025-12-26T16:47:21", "event": "train_log", "step": 118, "epoch": 0.1378102189781022, "progress_pct": 4.59, "epoch_pct": 4.59, "eta": "17:15:03", "max_grad_norm": 1.0, "loss": 0.007583940401673317, "grad_norm": 0.45693957805633545, "learning_rate": 2.2674418604651163e-05, "rewards/chosen": 2.130192518234253, "rewards/rejected": -4.364559650421143, "rewards/accuracies": 1.0, "rewards/margins": 6.494752407073975, "logps/chosen": -382.1067810058594, "logps/rejected": -480.72265625, "logits/chosen": 3.9002795219421387, "logits/rejected": 3.9630849361419678}
66
+ {"ts": "2025-12-26T16:47:41", "event": "train_log", "step": 120, "epoch": 0.14014598540145987, "progress_pct": 4.67, "epoch_pct": 4.67, "eta": "17:03:44", "max_grad_norm": 1.0, "loss": 0.007748167496174574, "grad_norm": 0.20826944708824158, "learning_rate": 2.3062015503875968e-05, "rewards/chosen": 1.398924469947815, "rewards/rejected": -4.580015182495117, "rewards/accuracies": 1.0, "rewards/margins": 5.978940010070801, "logps/chosen": -355.2779541015625, "logps/rejected": -436.54022216796875, "logits/chosen": 3.7722253799438477, "logits/rejected": 3.939023494720459}
67
+ {"ts": "2025-12-26T16:48:01", "event": "train_log", "step": 122, "epoch": 0.1424817518248175, "progress_pct": 4.75, "epoch_pct": 4.75, "eta": "16:52:48", "max_grad_norm": 1.0, "loss": 0.014359460212290287, "grad_norm": 0.21926206350326538, "learning_rate": 2.3449612403100777e-05, "rewards/chosen": 1.1770455837249756, "rewards/rejected": -5.205078125, "rewards/accuracies": 1.0, "rewards/margins": 6.382123947143555, "logps/chosen": -327.1947326660156, "logps/rejected": -414.7738037109375, "logits/chosen": 3.656745672225952, "logits/rejected": 3.875434160232544}
68
+ {"ts": "2025-12-26T16:48:21", "event": "train_log", "step": 124, "epoch": 0.14481751824817518, "progress_pct": 4.82, "epoch_pct": 4.83, "eta": "16:42:21", "max_grad_norm": 1.0, "loss": 0.007621760480105877, "grad_norm": 0.03550998866558075, "learning_rate": 2.3837209302325582e-05, "rewards/chosen": 0.7802913188934326, "rewards/rejected": -6.6151628494262695, "rewards/accuracies": 1.0, "rewards/margins": 7.395453453063965, "logps/chosen": -369.8974304199219, "logps/rejected": -473.97283935546875, "logits/chosen": 3.659773826599121, "logits/rejected": 3.725044012069702}
69
+ {"ts": "2025-12-26T16:56:05", "event": "train_log", "step": 125, "epoch": 0.145985401459854, "progress_pct": 4.86, "epoch_pct": 4.87, "eta": "19:05:16", "max_grad_norm": 1.0, "eval_loss": 0.024107323959469795, "eval_runtime": 454.8045, "eval_samples_per_second": 1.675, "eval_steps_per_second": 1.675, "eval_rewards/chosen": 0.5319492816925049, "eval_rewards/rejected": -6.150709629058838, "eval_rewards/accuracies": 0.9934383034706116, "eval_rewards/margins": 6.682660102844238, "eval_logps/chosen": -365.087646484375, "eval_logps/rejected": -457.28314208984375, "eval_logits/chosen": 3.6694726943969727, "eval_logits/rejected": 3.8436598777770996}
70
+ {"ts": "2025-12-26T16:56:15", "event": "train_log", "step": 126, "epoch": 0.14715328467153285, "progress_pct": 4.9, "epoch_pct": 4.91, "eta": "18:59:00", "max_grad_norm": 1.0, "loss": 0.005531508009880781, "grad_norm": 0.21691419184207916, "learning_rate": 2.4224806201550387e-05, "rewards/chosen": 0.9075853824615479, "rewards/rejected": -7.027284622192383, "rewards/accuracies": 1.0, "rewards/margins": 7.934869289398193, "logps/chosen": -345.18023681640625, "logps/rejected": -454.6177978515625, "logits/chosen": 3.777791738510132, "logits/rejected": 3.7573630809783936}
71
+ {"ts": "2025-12-26T16:56:36", "event": "train_log", "step": 128, "epoch": 0.14948905109489052, "progress_pct": 4.98, "epoch_pct": 4.98, "eta": "18:46:50", "max_grad_norm": 1.0, "loss": 0.0008547124452888966, "grad_norm": 0.0514506921172142, "learning_rate": 2.4612403100775196e-05, "rewards/chosen": 1.0864180326461792, "rewards/rejected": -6.75621223449707, "rewards/accuracies": 1.0, "rewards/margins": 7.842630863189697, "logps/chosen": -376.30023193359375, "logps/rejected": -486.30615234375, "logits/chosen": 3.6862380504608154, "logits/rejected": 3.77681827545166}
72
+ {"ts": "2025-12-26T16:56:54", "event": "train_log", "step": 130, "epoch": 0.15182481751824817, "progress_pct": 5.06, "epoch_pct": 5.06, "eta": "18:34:28", "max_grad_norm": 1.0, "loss": 0.019211476668715477, "grad_norm": 1.0013993978500366, "learning_rate": 2.5e-05, "rewards/chosen": 0.7987843751907349, "rewards/rejected": -6.31362247467041, "rewards/accuracies": 1.0, "rewards/margins": 7.1124067306518555, "logps/chosen": -330.2728271484375, "logps/rejected": -420.1920166015625, "logits/chosen": 3.8558738231658936, "logits/rejected": 4.067385673522949}
73
+ {"ts": "2025-12-26T16:57:14", "event": "train_log", "step": 132, "epoch": 0.15416058394160584, "progress_pct": 5.13, "epoch_pct": 5.14, "eta": "18:22:39", "max_grad_norm": 1.0, "loss": 0.005051509942859411, "grad_norm": 0.2909312844276428, "learning_rate": 2.5387596899224806e-05, "rewards/chosen": 0.7269927859306335, "rewards/rejected": -6.733253479003906, "rewards/accuracies": 1.0, "rewards/margins": 7.4602460861206055, "logps/chosen": -346.632568359375, "logps/rejected": -448.4364318847656, "logits/chosen": 3.8564915657043457, "logits/rejected": 4.106588363647461}
74
+ {"ts": "2025-12-26T16:57:34", "event": "train_log", "step": 134, "epoch": 0.1564963503649635, "progress_pct": 5.21, "epoch_pct": 5.22, "eta": "18:11:24", "max_grad_norm": 1.0, "loss": 0.029044320806860924, "grad_norm": 0.10341063886880875, "learning_rate": 2.5775193798449615e-05, "rewards/chosen": -0.15517807006835938, "rewards/rejected": -7.108559608459473, "rewards/accuracies": 1.0, "rewards/margins": 6.953381538391113, "logps/chosen": -398.5205078125, "logps/rejected": -493.3382568359375, "logits/chosen": 3.6923415660858154, "logits/rejected": 3.8758797645568848}
75
+ {"ts": "2025-12-26T16:57:55", "event": "train_log", "step": 136, "epoch": 0.15883211678832118, "progress_pct": 5.29, "epoch_pct": 5.29, "eta": "18:00:50", "max_grad_norm": 1.0, "loss": 0.008300668559968472, "grad_norm": 0.40827327966690063, "learning_rate": 2.616279069767442e-05, "rewards/chosen": 0.3356212377548218, "rewards/rejected": -6.36344051361084, "rewards/accuracies": 1.0, "rewards/margins": 6.699062347412109, "logps/chosen": -420.5874328613281, "logps/rejected": -511.71661376953125, "logits/chosen": 3.701347827911377, "logits/rejected": 3.8854856491088867}
76
+ {"ts": "2025-12-26T16:58:15", "event": "train_log", "step": 138, "epoch": 0.16116788321167883, "progress_pct": 5.37, "epoch_pct": 5.37, "eta": "17:50:12", "max_grad_norm": 1.0, "loss": 0.010793081484735012, "grad_norm": 0.17690710723400116, "learning_rate": 2.655038759689923e-05, "rewards/chosen": 1.5296471118927002, "rewards/rejected": -5.799461364746094, "rewards/accuracies": 1.0, "rewards/margins": 7.329109191894531, "logps/chosen": -361.59417724609375, "logps/rejected": -455.0230407714844, "logits/chosen": 3.476500988006592, "logits/rejected": 3.6296494007110596}
77
+ {"ts": "2025-12-26T16:58:35", "event": "train_log", "step": 140, "epoch": 0.1635036496350365, "progress_pct": 5.45, "epoch_pct": 5.45, "eta": "17:39:44", "max_grad_norm": 1.0, "loss": 0.012777667492628098, "grad_norm": 0.15591435134410858, "learning_rate": 2.693798449612403e-05, "rewards/chosen": 1.2819935083389282, "rewards/rejected": -6.812654972076416, "rewards/accuracies": 1.0, "rewards/margins": 8.094648361206055, "logps/chosen": -379.35333251953125, "logps/rejected": -490.0003356933594, "logits/chosen": 3.440129518508911, "logits/rejected": 3.5673890113830566}
78
+ {"ts": "2025-12-26T16:58:56", "event": "train_log", "step": 142, "epoch": 0.16583941605839417, "progress_pct": 5.52, "epoch_pct": 5.53, "eta": "17:29:50", "max_grad_norm": 1.0, "loss": 0.007118214387446642, "grad_norm": 0.820688009262085, "learning_rate": 2.7325581395348836e-05, "rewards/chosen": 1.4685018062591553, "rewards/rejected": -6.558856964111328, "rewards/accuracies": 1.0, "rewards/margins": 8.027359008789062, "logps/chosen": -402.8253479003906, "logps/rejected": -506.32000732421875, "logits/chosen": 3.23529052734375, "logits/rejected": 3.393266201019287}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
wandb/debug-internal.log CHANGED
@@ -1,12 +1,11 @@
1
- {"time":"2025-12-27T19:44:23.778699792Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
- {"time":"2025-12-27T19:44:23.931692267Z","level":"INFO","msg":"stream: created new stream","id":"jz7bptqa"}
3
- {"time":"2025-12-27T19:44:23.931816408Z","level":"INFO","msg":"handler: started","stream_id":"jz7bptqa"}
4
- {"time":"2025-12-27T19:44:23.931950499Z","level":"INFO","msg":"stream: started","id":"jz7bptqa"}
5
- {"time":"2025-12-27T19:44:23.931981018Z","level":"INFO","msg":"writer: started","stream_id":"jz7bptqa"}
6
- {"time":"2025-12-27T19:44:23.931980519Z","level":"INFO","msg":"sender: started","stream_id":"jz7bptqa"}
7
- {"time":"2025-12-27T22:06:04.101056437Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
8
- {"time":"2025-12-27T22:06:04.298272513Z","level":"INFO","msg":"handler: operation stats","stats":{}}
9
- {"time":"2025-12-27T22:06:04.303656494Z","level":"INFO","msg":"stream: closing","id":"jz7bptqa"}
10
- {"time":"2025-12-27T22:06:04.303677603Z","level":"INFO","msg":"handler: closed","stream_id":"jz7bptqa"}
11
- {"time":"2025-12-27T22:06:04.303750712Z","level":"INFO","msg":"sender: closed","stream_id":"jz7bptqa"}
12
- {"time":"2025-12-27T22:06:04.303767265Z","level":"INFO","msg":"stream: closed","id":"jz7bptqa"}
 
1
+ {"time":"2025-12-26T15:56:50.297401502Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-26T15:56:50.452320078Z","level":"INFO","msg":"stream: created new stream","id":"wbzoafvt"}
3
+ {"time":"2025-12-26T15:56:50.452494836Z","level":"INFO","msg":"handler: started","stream_id":"wbzoafvt"}
4
+ {"time":"2025-12-26T15:56:50.452572405Z","level":"INFO","msg":"stream: started","id":"wbzoafvt"}
5
+ {"time":"2025-12-26T15:56:50.452599156Z","level":"INFO","msg":"writer: started","stream_id":"wbzoafvt"}
6
+ {"time":"2025-12-26T15:56:50.452607804Z","level":"INFO","msg":"sender: started","stream_id":"wbzoafvt"}
7
+ {"time":"2025-12-26T16:59:00.070531235Z","level":"INFO","msg":"stream: closing","id":"wbzoafvt"}
8
+ {"time":"2025-12-26T16:59:00.346670237Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-26T16:59:00.473496131Z","level":"INFO","msg":"handler: closed","stream_id":"wbzoafvt"}
10
+ {"time":"2025-12-26T16:59:00.473589831Z","level":"INFO","msg":"sender: closed","stream_id":"wbzoafvt"}
11
+ {"time":"2025-12-26T16:59:00.473602236Z","level":"INFO","msg":"stream: closed","id":"wbzoafvt"}
 
wandb/debug.log CHANGED
@@ -1,29 +1,26 @@
1
- 2025-12-27 19:44:23,485 INFO MainThread:592889 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_setup.py:_flush():80] Configure stats pid to 592889
3
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_setup.py:_flush():80] Loading settings from /workspace/trainer-kit/GRPO-14b/wandb/settings
5
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_init.py:setup_run_log_directory():714] Logging user logs to runs/grpo_14b_run1/wandb/run-20251227_194423-jz7bptqa/logs/debug.log
7
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to runs/grpo_14b_run1/wandb/run-20251227_194423-jz7bptqa/logs/debug-internal.log
8
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_init.py:init():841] calling init triggers
9
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
- config: {'model': {'repo_id': '/workspace/Models/Qwen2.5-Coder-14B-CPT-SFT_v2', 'tokenizer_name': 'Qwen/Qwen2.5-Coder-14B', 'load_in_8bit': False, 'load_in_4bit': False, 'torch_dtype': 'bfloat16', 'device_map': 'auto', 'trust_remote_code': True}, 'data': {'train_jsonl': 'grpo_dataset.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.0, 'shuffle': True, 'num_proc': 1, 'prompt_field': 'prompt', 'completions_field': 'completions', 'scores_field': 'scores', 'format_type': 'raw', 'max_length': 2048, 'min_completions': 2, 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n\n## Example\n\n##TASK\nAdd webhook subscription support\n\n##OUTPUT\nThe webhook system routes events via EventClass enum. Flow: webhook → EventClass → handler → processing. The EventClass enum (crates/common_enums/src/enums.rs::EventClass) must add Subscriptions variant because it defines event routing—without this, subscription events cannot be processed. 
The SubscriptionStatus impl (crates/common_enums/src/transformers.rs::SubscriptionStatus) must map to EventType because it converts status to events—without this, status changes don\'t trigger webhooks. These are coupled: EventClass routes to handlers that use SubscriptionStatus mappings.\n\n##SELECT\ncrates/common_enums/src/enums.rs::EventClass\ncrates/common_enums/src/transformers.rs::SubscriptionStatus\n<EOS>\n', 'custom_template': '##INSTRUCTION\n{instruction}<|im_end|>\n{input}<|im_end|>\n{output}<|im_end|>'}, 'peft': {'enabled': True, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], 'bias': 'none', 'task_type': 'CAUSAL_LM'}, 'grpo': {'group_size': 4, 'kl_coef': 0.05, 'normalize_advantages': True, 'reward_scaling': 1.0, 'reward_bias': 0.0, 'reward_clip': 5.0, 'advantage_temperature': 1.0, 'use_reference_model': False, 'seed': 42}, 'train': {'output_dir': 'runs/grpo_14b_run1', 'num_train_epochs': 2, 'per_device_train_batch_size': 1, 'gradient_accumulation_steps': 8, 'per_device_eval_batch_size': 1, 'learning_rate': 5e-06, 'weight_decay': 0.01, 'warmup_ratio': 0.05, 'lr_scheduler_type': 'cosine', 'fp16': False, 'bf16': True, 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 5, 'save_steps': 100, 'save_total_limit': 2, 'evaluation_strategy': 'no', 'dataloader_num_workers': 4, 'dataloader_pin_memory': True, 'remove_unused_columns': False, 'report_to': [], 'seed': 42, 'ddp_find_unused_parameters': False}, 'run_dir': 'runs/grpo_14b_run1', '_wandb': {}}
11
- 2025-12-27 19:44:23,486 INFO MainThread:592889 [wandb_init.py:init():889] starting backend
12
- 2025-12-27 19:44:23,772 INFO MainThread:592889 [wandb_init.py:init():892] sending inform_init request
13
- 2025-12-27 19:44:23,776 INFO MainThread:592889 [wandb_init.py:init():900] backend started and connected
14
- 2025-12-27 19:44:23,778 INFO MainThread:592889 [wandb_init.py:init():970] updated telemetry
15
- 2025-12-27 19:44:23,779 INFO MainThread:592889 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
- 2025-12-27 19:44:24,241 INFO MainThread:592889 [wandb_init.py:init():1041] starting run threads in backend
17
- 2025-12-27 19:44:24,354 INFO MainThread:592889 [wandb_run.py:_console_start():2521] atexit reg
18
- 2025-12-27 19:44:24,354 INFO MainThread:592889 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
- 2025-12-27 19:44:24,354 INFO MainThread:592889 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
- 2025-12-27 19:44:24,354 INFO MainThread:592889 [wandb_run.py:_redirect():2461] Redirects installed.
21
- 2025-12-27 19:44:24,358 INFO MainThread:592889 [wandb_init.py:init():1081] run started, returning control to user process
22
- 2025-12-27 19:45:32,787 INFO MainThread:592889 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': '/workspace/Models/Qwen2.5-Coder-14B-CPT-SFT_v2', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['q_proj', 'o_proj', 'v_proj', 'up_proj', 'gate_proj', 'down_proj', 'k_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'rope_theta': 10000.0, 'rope_scaling': None, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'return_dict': True, 'output_hidden_states': False, 'torchscript': False, 'dtype': 'bfloat16', 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 151643, 'pad_token_id': None, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'num_beam_groups': 1, 'diversity_penalty': 0.0, '_name_or_path': '/workspace/Models/Qwen2.5-Coder-14B-CPT-SFT_v2', 'transformers_version': '4.57.3', 'model_type': 'qwen2', 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'tf_legacy_loss': False, 
'use_bfloat16': False, 'output_attentions': False, 'output_dir': 'runs/grpo_14b_run1/checkpoints', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': False, 'do_predict': False, 'eval_strategy': 'no', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-06, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 2.0, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.05, 'warmup_steps': 0, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': 'runs/grpo_14b_run1/checkpoints/runs/Dec27_19-45-32_a100-2gpu-shell-session-757d587799-mfdvv', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 5, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 2, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 50, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 
'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '<PUSH_TO_HUB_TOKEN>', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True}
23
- 2025-12-27 19:45:32,798 INFO MainThread:592889 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14838846464 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7ada527aff70>>
24
- 2025-12-27 19:45:32,798 INFO MainThread:592889 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14838846464 None
25
- 2025-12-27 22:06:03,800 INFO MainThread:592889 [wandb_run.py:_finish():2287] finishing run sirajuddin-shaik-007/rl-training/jz7bptqa
26
- 2025-12-27 22:06:03,800 INFO MainThread:592889 [wandb_run.py:_atexit_cleanup():2486] got exitcode: 0
27
- 2025-12-27 22:06:03,801 INFO MainThread:592889 [wandb_run.py:_restore():2468] restore
28
- 2025-12-27 22:06:03,801 INFO MainThread:592889 [wandb_run.py:_restore():2474] restore done
29
- 2025-12-27 22:06:04,302 INFO MainThread:592889 [wandb_run.py:_footer_sync_info():3862] logging synced files
 
1
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Configure stats pid to 148906
3
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Loading settings from /workspace/trainer-kit/DPO-14b/wandb/settings
5
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:setup_run_log_directory():714] Logging user logs to runs/dpo_run_14b_v1/wandb/run-20251226_155650-wbzoafvt/logs/debug.log
7
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to runs/dpo_run_14b_v1/wandb/run-20251226_155650-wbzoafvt/logs/debug-internal.log
8
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'model': {'repo_id': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'dpo_pairs_generated.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'prompt_field': 'prompt', 'chosen_field': 'chosen', 'rejected_field': 'rejected', 'score_field': 'f1_score', 'format_type': 'chatml', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. 
Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'dpo': {'beta': 0.1, 'label_smoothing': 0.0, 'loss_type': 'sigmoid', 'use_reference_model': True, 'reference_free': False}, 'train': {'num_train_epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '5e-5', 'weight_decay': 0.0, 'warmup_ratio': 0.1, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 10, 'evaluation_strategy': 'steps', 'eval_steps': 25, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'runs/dpo_run_14b_v1', '_wandb': {}}
11
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:init():889] starting backend
12
+ 2025-12-26 15:56:50,290 INFO MainThread:148906 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-26 15:56:50,295 INFO MainThread:148906 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-26 15:56:50,297 INFO MainThread:148906 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-26 15:56:50,297 INFO MainThread:148906 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-26 15:56:50,648 INFO MainThread:148906 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-26 15:56:50,762 INFO MainThread:148906 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 15:57:33,783 INFO MainThread:148906 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['k_proj', 'o_proj', 'v_proj', 'q_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 151643, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'transformers_version': '5.0.0.dev0', 'model_type': 'qwen2', 'output_attentions': False, 'output_dir': 'runs/dpo_run_14b_v1', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.1, 'warmup_steps': 0.1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 
'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 10, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 
'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'model_init_kwargs': None, 'ref_model_init_kwargs': None, 'model_adapter_name': None, 'ref_adapter_name': None, 'force_use_ref_model': False, 'disable_dropout': True, 'use_logits_to_keep': False, 'dataset_num_proc': None, 'pad_token': '<PAD_TOKEN>', 'label_pad_token_id': -100, 'max_prompt_length': 1024, 'max_completion_length': None, 'max_length': 2048, 'truncation_mode': 'keep_end', 'padding_free': False, 'precompute_ref_log_probs': False, 'precompute_ref_batch_size': None, 'tools': None, 'loss_type': 'sigmoid', 'use_liger_loss': None, 'base_model_attribute_name': 'model', 'beta': 0.1, 'f_divergence_type': 'reverse_kl', 'f_alpha_divergence_coef': 1.0, 'reference_free': False, 'label_smoothing': 0.0, 'use_weighting': False, 'rpo_alpha': None, 'ld_alpha': None, 'discopop_tau': 0.05, 'loss_weights': None, 'sync_ref_model': False, 'ref_model_mixup_alpha': 0.6, 'ref_model_sync_steps': 512, 'generate_during_eval': False}
23
+ 2025-12-26 15:57:33,791 INFO MainThread:148906 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14795199488 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7da0304cf970>>
24
+ 2025-12-26 15:57:33,792 INFO MainThread:148906 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14795199488 None
25
+ 2025-12-26 16:59:00,070 INFO wandb-AsyncioManager-main:148906 [service_client.py:_forward_responses():80] Reached EOF.
26
+ 2025-12-26 16:59:00,070 INFO wandb-AsyncioManager-main:148906 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
 
 
 
wandb/run-20251226_152332-r9hfat2g/files/config.yaml ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.23.1
6
+ e:
7
+ ce8b9zq5sbh73okdbbvozze07ayjamtf:
8
+ args:
9
+ - --config
10
+ - config_dpo.yaml
11
+ codePath: run_dpo.py
12
+ codePathLocal: run_dpo.py
13
+ cpu_count: 12
14
+ cpu_count_logical: 24
15
+ cudaVersion: "13.0"
16
+ disk:
17
+ /:
18
+ total: "791251738624"
19
+ used: "314755911680"
20
+ email: shaiksirajuddin9949@gmail.com
21
+ executable: /workspace/llm_finetuning_env/bin/python
22
+ gpu: NVIDIA A100-SXM4-80GB
23
+ gpu_count: 2
24
+ gpu_nvidia:
25
+ - architecture: Ampere
26
+ cudaCores: 6912
27
+ memoryTotal: "85899345920"
28
+ name: NVIDIA A100-SXM4-80GB
29
+ uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba
30
+ - architecture: Ampere
31
+ cudaCores: 6912
32
+ memoryTotal: "85899345920"
33
+ name: NVIDIA A100-SXM4-80GB
34
+ uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40
35
+ host: a100-2gpu-shell-session-757d587799-mfdvv
36
+ memory:
37
+ total: "359047892992"
38
+ os: Linux-6.12.46+-x86_64-with-glibc2.35
39
+ program: /workspace/trainer-kit/DPO-14b/run_dpo.py
40
+ python: CPython 3.10.12
41
+ root: runs/dpo_run_14b_v1
42
+ startedAt: "2025-12-26T15:23:32.328004Z"
43
+ writerId: ce8b9zq5sbh73okdbbvozze07ayjamtf
44
+ m:
45
+ - "1": train/global_step
46
+ "6":
47
+ - 3
48
+ "7": []
49
+ - "2": '*'
50
+ "5": 1
51
+ "6":
52
+ - 1
53
+ "7": []
54
+ python_version: 3.10.12
55
+ t:
56
+ "1":
57
+ - 1
58
+ - 11
59
+ - 41
60
+ - 49
61
+ - 51
62
+ - 71
63
+ - 84
64
+ - 98
65
+ "2":
66
+ - 1
67
+ - 11
68
+ - 41
69
+ - 49
70
+ - 51
71
+ - 71
72
+ - 84
73
+ - 98
74
+ "3":
75
+ - 7
76
+ - 15
77
+ - 16
78
+ - 19
79
+ - 66
80
+ "4": 3.10.12
81
+ "5": 0.23.1
82
+ "6": 5.0.0.dev0
83
+ "9":
84
+ "1": transformers_trainer
85
+ "12": 0.23.1
86
+ "13": linux-x86_64
87
+ accelerator_config:
88
+ value:
89
+ dispatch_batches: null
90
+ even_batches: true
91
+ gradient_accumulation_kwargs: null
92
+ non_blocking: false
93
+ split_batches: false
94
+ use_seedable_sampler: true
95
+ adam_beta1:
96
+ value: 0.9
97
+ adam_beta2:
98
+ value: 0.999
99
+ adam_epsilon:
100
+ value: 1e-08
101
+ add_cross_attention:
102
+ value: false
103
+ architectures:
104
+ value:
105
+ - Qwen2ForCausalLM
106
+ attention_dropout:
107
+ value: 0
108
+ auto_find_batch_size:
109
+ value: false
110
+ average_tokens_across_devices:
111
+ value: true
112
+ base_model_attribute_name:
113
+ value: model
114
+ batch_eval_metrics:
115
+ value: false
116
+ beta:
117
+ value: 0.1
118
+ bf16:
119
+ value: true
120
+ bf16_full_eval:
121
+ value: false
122
+ bos_token_id:
123
+ value: null
124
+ chunk_size_feed_forward:
125
+ value: 0
126
+ cross_attention_hidden_size:
127
+ value: null
128
+ data:
129
+ value:
130
+ chosen_field: chosen
131
+ eval_jsonl: null
132
+ eval_split_ratio: 0.1
133
+ format_type: chatml
134
+ max_length: 2048
135
+ num_proc: 4
136
+ prompt_field: prompt
137
+ rejected_field: rejected
138
+ score_field: f1_score
139
+ shuffle: true
140
+ system_prompt: |
141
+ You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.
142
+
143
+ ## Output Format
144
+
145
+ ##OUTPUT
146
+ Explain the data flow and why each component must change:
147
+ - Flow: [Input → Processing → Output with arrows]
148
+ - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
149
+ - Explain coupling between components
150
+
151
+ ##SELECT
152
+ modify::crates/path/to/file.rs::impl::ComponentName
153
+ add::crates/another/file.rs::function::AnotherComponent
154
+ <EOS>
155
+
156
+ ## Rules
157
+
158
+ 1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
159
+ 2. Use `::` for nested items: `status::StructName::Type::Name`
160
+ 3. Always explain "must change because" and "without this"
161
+ 3. Types of components: function, struct, enum, impl, trait
162
+ 4. If there is extra information (e.g., enum variants), include that too.
163
+ 5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>
164
+ train_jsonl: dpo_pairs_generated.jsonl
165
+ data_seed:
166
+ value: null
167
+ dataloader_drop_last:
168
+ value: false
169
+ dataloader_num_workers:
170
+ value: 0
171
+ dataloader_persistent_workers:
172
+ value: false
173
+ dataloader_pin_memory:
174
+ value: true
175
+ dataloader_prefetch_factor:
176
+ value: null
177
+ dataset_num_proc:
178
+ value: null
179
+ ddp_backend:
180
+ value: null
181
+ ddp_broadcast_buffers:
182
+ value: null
183
+ ddp_bucket_cap_mb:
184
+ value: null
185
+ ddp_find_unused_parameters:
186
+ value: null
187
+ ddp_timeout:
188
+ value: 1800
189
+ debug:
190
+ value: []
191
+ decoder_start_token_id:
192
+ value: null
193
+ deepspeed:
194
+ value: null
195
+ disable_dropout:
196
+ value: true
197
+ disable_tqdm:
198
+ value: false
199
+ discopop_tau:
200
+ value: 0.05
201
+ do_eval:
202
+ value: true
203
+ do_predict:
204
+ value: false
205
+ do_train:
206
+ value: false
207
+ dpo:
208
+ value:
209
+ beta: 0.1
210
+ label_smoothing: 0
211
+ loss_type: sigmoid
212
+ reference_free: false
213
+ use_reference_model: true
214
+ dtype:
215
+ value: bfloat16
216
+ enable_jit_checkpoint:
217
+ value: false
218
+ eos_token_id:
219
+ value: 151643
220
+ eval_accumulation_steps:
221
+ value: null
222
+ eval_delay:
223
+ value: 0
224
+ eval_do_concat_batches:
225
+ value: true
226
+ eval_on_start:
227
+ value: false
228
+ eval_steps:
229
+ value: 25
230
+ eval_strategy:
231
+ value: steps
232
+ eval_use_gather_object:
233
+ value: false
234
+ f_alpha_divergence_coef:
235
+ value: 1
236
+ f_divergence_type:
237
+ value: reverse_kl
238
+ finetuning_task:
239
+ value: null
240
+ force_use_ref_model:
241
+ value: false
242
+ fp16:
243
+ value: false
244
+ fp16_full_eval:
245
+ value: false
246
+ fsdp:
247
+ value: []
248
+ fsdp_config:
249
+ value:
250
+ min_num_params: 0
251
+ xla: false
252
+ xla_fsdp_grad_ckpt: false
253
+ xla_fsdp_v2: false
254
+ full_determinism:
255
+ value: false
256
+ generate_during_eval:
257
+ value: false
258
+ gradient_accumulation_steps:
259
+ value: 8
260
+ gradient_checkpointing:
261
+ value: true
262
+ gradient_checkpointing_kwargs:
263
+ value: null
264
+ greater_is_better:
265
+ value: false
266
+ group_by_length:
267
+ value: false
268
+ hidden_act:
269
+ value: silu
270
+ hidden_size:
271
+ value: 5120
272
+ hub_always_push:
273
+ value: false
274
+ hub_model_id:
275
+ value: null
276
+ hub_private_repo:
277
+ value: null
278
+ hub_revision:
279
+ value: null
280
+ hub_strategy:
281
+ value: every_save
282
+ hub_token:
283
+ value: <HUB_TOKEN>
284
+ id2label:
285
+ value:
286
+ "0": LABEL_0
287
+ "1": LABEL_1
288
+ ignore_data_skip:
289
+ value: false
290
+ include_for_metrics:
291
+ value: []
292
+ include_num_input_tokens_seen:
293
+ value: "no"
294
+ initializer_range:
295
+ value: 0.02
296
+ intermediate_size:
297
+ value: 13824
298
+ is_decoder:
299
+ value: false
300
+ is_encoder_decoder:
301
+ value: false
302
+ label_names:
303
+ value: null
304
+ label_pad_token_id:
305
+ value: -100
306
+ label_smoothing:
307
+ value: 0
308
+ label_smoothing_factor:
309
+ value: 0
310
+ label2id:
311
+ value:
312
+ LABEL_0: 0
313
+ LABEL_1: 1
314
+ layer_types:
315
+ value:
316
+ - full_attention
317
+ - full_attention
318
+ - full_attention
319
+ - full_attention
320
+ - full_attention
321
+ - full_attention
322
+ - full_attention
323
+ - full_attention
324
+ - full_attention
325
+ - full_attention
326
+ - full_attention
327
+ - full_attention
328
+ - full_attention
329
+ - full_attention
330
+ - full_attention
331
+ - full_attention
332
+ - full_attention
333
+ - full_attention
334
+ - full_attention
335
+ - full_attention
336
+ - full_attention
337
+ - full_attention
338
+ - full_attention
339
+ - full_attention
340
+ - full_attention
341
+ - full_attention
342
+ - full_attention
343
+ - full_attention
344
+ - full_attention
345
+ - full_attention
346
+ - full_attention
347
+ - full_attention
348
+ - full_attention
349
+ - full_attention
350
+ - full_attention
351
+ - full_attention
352
+ - full_attention
353
+ - full_attention
354
+ - full_attention
355
+ - full_attention
356
+ - full_attention
357
+ - full_attention
358
+ - full_attention
359
+ - full_attention
360
+ - full_attention
361
+ - full_attention
362
+ - full_attention
363
+ - full_attention
364
+ ld_alpha:
365
+ value: null
366
+ learning_rate:
367
+ value: 5e-05
368
+ length_column_name:
369
+ value: length
370
+ liger_kernel_config:
371
+ value: null
372
+ load_best_model_at_end:
373
+ value: true
374
+ local_rank:
375
+ value: -1
376
+ log_level:
377
+ value: passive
378
+ log_level_replica:
379
+ value: warning
380
+ log_on_each_node:
381
+ value: true
382
+ logging_dir:
383
+ value: null
384
+ logging_first_step:
385
+ value: false
386
+ logging_nan_inf_filter:
387
+ value: true
388
+ logging_steps:
389
+ value: 2
390
+ logging_strategy:
391
+ value: steps
392
+ loss_type:
393
+ value: sigmoid
394
+ loss_weights:
395
+ value: null
396
+ lr_scheduler_kwargs:
397
+ value: null
398
+ lr_scheduler_type:
399
+ value: cosine
400
+ max_completion_length:
401
+ value: null
402
+ max_grad_norm:
403
+ value: 1
404
+ max_length:
405
+ value: 2048
406
+ max_position_embeddings:
407
+ value: 32768
408
+ max_prompt_length:
409
+ value: 1024
410
+ max_steps:
411
+ value: -1
412
+ max_window_layers:
413
+ value: 48
414
+ metric_for_best_model:
415
+ value: eval_loss
416
+ model:
417
+ value:
418
+ attn_implementation: null
419
+ base_local_dir: base_model
420
+ bnb_4bit_compute_dtype: bfloat16
421
+ bnb_4bit_quant_type: nf4
422
+ bnb_4bit_use_double_quant: false
423
+ device_map: auto
424
+ repo_id: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
425
+ revision: null
426
+ tokenizer_use_fast: true
427
+ torch_dtype: bfloat16
428
+ trust_remote_code: true
429
+ use_4bit: false
430
+ model/num_parameters:
431
+ value: 14795199488
432
+ model_adapter_name:
433
+ value: null
434
+ model_init_kwargs:
435
+ value: null
436
+ model_type:
437
+ value: qwen2
438
+ neftune_noise_alpha:
439
+ value: null
440
+ num_attention_heads:
441
+ value: 40
442
+ num_hidden_layers:
443
+ value: 48
444
+ num_key_value_heads:
445
+ value: 8
446
+ num_train_epochs:
447
+ value: 3
448
+ optim:
449
+ value: adamw_torch
450
+ optim_args:
451
+ value: null
452
+ optim_target_modules:
453
+ value: null
454
+ output_attentions:
455
+ value: false
456
+ output_dir:
457
+ value: runs/dpo_run_14b_v1
458
+ output_hidden_states:
459
+ value: false
460
+ pad_token:
461
+ value: <PAD_TOKEN>
462
+ pad_token_id:
463
+ value: 151643
464
+ padding_free:
465
+ value: false
466
+ parallelism_config:
467
+ value: null
468
+ peft:
469
+ value:
470
+ bias: none
471
+ enabled: true
472
+ lora_alpha: 32
473
+ lora_dropout: 0.05
474
+ r: 16
475
+ target_modules: auto
476
+ peft_config:
477
+ value:
478
+ default:
479
+ alora_invocation_tokens: null
480
+ arrow_config: null
481
+ auto_mapping: null
482
+ base_model_name_or_path: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
483
+ bias: none
484
+ corda_config: null
485
+ ensure_weight_tying: false
486
+ eva_config: null
487
+ exclude_modules: null
488
+ fan_in_fan_out: false
489
+ inference_mode: false
490
+ init_lora_weights: true
491
+ layer_replication: null
492
+ layers_pattern: null
493
+ layers_to_transform: null
494
+ lora_alpha: 32
495
+ lora_bias: false
496
+ lora_dropout: 0.05
497
+ megatron_config: null
498
+ megatron_core: megatron.core
499
+ modules_to_save: null
500
+ peft_type: LORA
501
+ peft_version: 0.18.0
502
+ qalora_group_size: 16
503
+ r: 16
504
+ revision: null
505
+ runtime_config:
506
+ ephemeral_gpu_offload: false
507
+ target_modules:
508
+ - v_proj
509
+ - k_proj
510
+ - o_proj
511
+ - q_proj
512
+ target_parameters: null
513
+ task_type: CAUSAL_LM
514
+ trainable_token_indices: null
515
+ use_dora: false
516
+ use_qalora: false
517
+ use_rslora: false
518
+ per_device_eval_batch_size:
519
+ value: 1
520
+ per_device_train_batch_size:
521
+ value: 1
522
+ precompute_ref_batch_size:
523
+ value: null
524
+ precompute_ref_log_probs:
525
+ value: false
526
+ prediction_loss_only:
527
+ value: false
528
+ prefix:
529
+ value: null
530
+ problem_type:
531
+ value: null
532
+ project:
533
+ value: huggingface
534
+ push_to_hub:
535
+ value: false
536
+ ref_adapter_name:
537
+ value: null
538
+ ref_model_init_kwargs:
539
+ value: null
540
+ ref_model_mixup_alpha:
541
+ value: 0.6
542
+ ref_model_sync_steps:
543
+ value: 512
544
+ reference_free:
545
+ value: false
546
+ remove_unused_columns:
547
+ value: false
548
+ report_to:
549
+ value:
550
+ - wandb
551
+ restore_callback_states_from_checkpoint:
552
+ value: false
553
+ resume_from_checkpoint:
554
+ value: null
555
+ return_dict:
556
+ value: true
557
+ rms_norm_eps:
558
+ value: 1e-06
559
+ rope_parameters:
560
+ value:
561
+ rope_theta: 1e+06
562
+ rope_type: default
563
+ rpo_alpha:
564
+ value: null
565
+ run_dir:
566
+ value: runs/dpo_run_14b_v1
567
+ run_name:
568
+ value: null
569
+ save_on_each_node:
570
+ value: false
571
+ save_only_model:
572
+ value: false
573
+ save_steps:
574
+ value: 100
575
+ save_strategy:
576
+ value: steps
577
+ save_total_limit:
578
+ value: 10
579
+ seed:
580
+ value: 42
581
+ sep_token_id:
582
+ value: null
583
+ skip_memory_metrics:
584
+ value: true
585
+ sliding_window:
586
+ value: null
587
+ sync_ref_model:
588
+ value: false
589
+ task_specific_params:
590
+ value: null
591
+ tf32:
592
+ value: null
593
+ tie_word_embeddings:
594
+ value: false
595
+ tokenizer_class:
596
+ value: null
597
+ tools:
598
+ value: null
599
+ torch_compile:
600
+ value: false
601
+ torch_compile_backend:
602
+ value: null
603
+ torch_compile_mode:
604
+ value: null
605
+ torch_empty_cache_steps:
606
+ value: null
607
+ trackio_space_id:
608
+ value: trackio
609
+ train:
610
+ value:
611
+ early_stopping:
612
+ enabled: true
613
+ metric: eval_loss
614
+ min_delta: 0.001
615
+ mode: min
616
+ patience: 5
617
+ eval_steps: 25
618
+ evaluation_strategy: steps
619
+ gradient_accumulation_steps: 8
620
+ gradient_checkpointing: true
621
+ learning_rate: "5e-5"
622
+ load_best_model_at_end: true
623
+ logging_steps: 2
624
+ lr_scheduler_type: cosine
625
+ max_grad_norm: 1
626
+ num_train_epochs: 3
627
+ optim: adamw_torch
628
+ per_device_eval_batch_size: 1
629
+ per_device_train_batch_size: 1
630
+ resume_from_checkpoint: auto
631
+ save_steps: 100
632
+ save_strategy: steps
633
+ save_total_limit: 10
634
+ warmup_ratio: 0.1
635
+ weight_decay: 0
636
+ transformers_version:
637
+ value: 5.0.0.dev0
638
+ truncation_mode:
639
+ value: keep_end
640
+ use_cache:
641
+ value: false
642
+ use_cpu:
643
+ value: false
644
+ use_liger_kernel:
645
+ value: false
646
+ use_liger_loss:
647
+ value: null
648
+ use_logits_to_keep:
649
+ value: false
650
+ use_sliding_window:
651
+ value: false
652
+ use_weighting:
653
+ value: false
654
+ vocab_size:
655
+ value: 152064
656
+ warmup_ratio:
657
+ value: 0.1
658
+ warmup_steps:
659
+ value: 0.1
660
+ weight_decay:
661
+ value: 0
wandb/run-20251226_152332-r9hfat2g/files/output.log ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Wandb initialized: project='dpo-training', name='auto-generated'
2
+ `torch_dtype` is deprecated! Use `dtype` instead!
3
+ Loading weights: 100%|█████████████████████████████████| 579/579 [00:09<00:00, 61.71it/s, Materializing param=model.norm.weight]
4
+ Loading reference model (frozen copy)...
5
+ Loading weights: 100%|█████████████████████████████████| 579/579 [00:09<00:00, 61.41it/s, Materializing param=model.norm.weight]
6
+ Reference model loaded and frozen
7
+ 2025-12-26 15:24:00,888 - INFO - HTTP Request: HEAD https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets/json/json.py "HTTP/1.1 200 OK"
8
+ 2025-12-26 15:24:00,903 - INFO - Formatting train DPO data...
9
+ 2025-12-26 15:24:03,288 - INFO - Train dataset after filtering: 6850 examples
10
+ 2025-12-26 15:24:03,289 - INFO - train dataset validation passed: 6850 examples
11
+ 2025-12-26 15:24:03,289 - INFO - Formatting eval DPO data...
12
+ 2025-12-26 15:24:05,675 - INFO - Eval dataset after filtering: 762 examples
13
+ 2025-12-26 15:24:05,675 - INFO - eval dataset validation passed: 762 examples
14
+ warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
15
+ Early stopping enabled: patience=5, min_delta=0.001
16
+ 2025-12-26 15:24:05,710 - INFO - DPO Training with beta=0.1, loss_type=sigmoid
17
+ warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
18
+ 2025-12-26 15:24:15,316 - INFO - Starting DPO training...
19
+ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
20
+ 0%|▏ | 5/2571 [00:51<7:16:02, 10.20s/it]Traceback (most recent call last):
21
+ {'loss': '0.6931', 'grad_norm': '1.242', 'learning_rate': '1.938e-07', 'rewards/chosen': '0', 'rewards/rejected': '0', 'rewards/accuracies': '0', 'rewards/margins': '0', 'logps/chosen': '-368.9', 'logps/rejected': '-398.8', 'logits/chosen': '5.179', 'logits/rejected': '5.193', 'epoch': '0.002336'}
22
+ {'loss': '0.6933', 'grad_norm': '1.388', 'learning_rate': '5.814e-07', 'rewards/chosen': '0.02254', 'rewards/rejected': '0.02266', 'rewards/accuracies': '0.5', 'rewards/margins': '-0.0001159', 'logps/chosen': '-338.3', 'logps/rejected': '-366.9', 'logits/chosen': '5.405', 'logits/rejected': '5.456', 'epoch': '0.004672'}
23
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 953, in <module>
24
+ main()
25
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 928, in main
26
+ trainer.train(resume_from_checkpoint=resume_from)
27
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2168, in train
28
+ return inner_training_loop(
29
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2535, in _inner_training_loop
30
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
31
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 3807, in training_step
32
+ loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
33
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1810, in compute_loss
34
+ loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
35
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1733, in get_batch_loss_metrics
36
+ ref_chosen_logps, ref_rejected_logps = self.compute_ref_log_probs(batch)
37
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 926, in compute_ref_log_probs
38
+ ref_model_output = self.concatenated_forward(self.ref_model, batch, is_ref_model=True)
39
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1600, in concatenated_forward
40
+ outputs = model(input_ids, **model_kwargs)
41
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
42
+ return self._call_impl(*args, **kwargs)
43
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
44
+ return forward_call(*args, **kwargs)
45
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward
46
+ return model_forward(*args, **kwargs)
47
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__
48
+ return convert_to_fp32(self.model_forward(*args, **kwargs))
49
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
50
+ return func(*args, **kwargs)
51
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/peft_model.py", line 1923, in forward
52
+ return self.base_model(
53
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
54
+ return self._call_impl(*args, **kwargs)
55
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
56
+ return forward_call(*args, **kwargs)
57
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 308, in forward
58
+ return self.model.forward(*args, **kwargs)
59
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
60
+ output = module._old_forward(*args, **kwargs)
61
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 810, in wrapper
62
+ output = func(self, *args, **kwargs)
63
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 477, in forward
64
+ outputs: BaseModelOutputWithPast = self.model(
65
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
66
+ return self._call_impl(*args, **kwargs)
67
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
68
+ return forward_call(*args, **kwargs)
69
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
70
+ outputs = func(self, *args, **kwargs)
71
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 412, in forward
72
+ hidden_states = decoder_layer(
73
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in __call__
74
+ return super().__call__(*args, **kwargs)
75
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
76
+ return self._call_impl(*args, **kwargs)
77
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
78
+ return forward_call(*args, **kwargs)
79
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 918, in wrapped_forward
80
+ output = orig_forward(*args, **kwargs)
81
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
82
+ output = module._old_forward(*args, **kwargs)
83
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 300, in forward
84
+ hidden_states, _ = self.self_attn(
85
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
86
+ return self._call_impl(*args, **kwargs)
87
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
88
+ return forward_call(*args, **kwargs)
89
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
90
+ output = module._old_forward(*args, **kwargs)
91
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 222, in forward
92
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
93
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
94
+ return self._call_impl(*args, **kwargs)
95
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
96
+ return forward_call(*args, **kwargs)
97
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/lora/layer.py", line 807, in forward
98
+ result = result + lora_B(lora_A(dropout(x))) * scaling
99
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
100
+ return self._call_impl(*args, **kwargs)
101
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
102
+ return forward_call(*args, **kwargs)
103
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 125, in forward
104
+ return F.linear(input, self.weight, self.bias)
105
+ KeyboardInterrupt
106
+ Traceback (most recent call last):
107
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 953, in <module>
108
+ main()
109
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 928, in main
110
+ trainer.train(resume_from_checkpoint=resume_from)
111
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2168, in train
112
+ return inner_training_loop(
113
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2535, in _inner_training_loop
114
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
115
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 3807, in training_step
116
+ loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
117
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1810, in compute_loss
118
+ loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
119
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1733, in get_batch_loss_metrics
120
+ ref_chosen_logps, ref_rejected_logps = self.compute_ref_log_probs(batch)
121
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 926, in compute_ref_log_probs
122
+ ref_model_output = self.concatenated_forward(self.ref_model, batch, is_ref_model=True)
123
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1600, in concatenated_forward
124
+ outputs = model(input_ids, **model_kwargs)
125
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
126
+ return self._call_impl(*args, **kwargs)
127
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
128
+ return forward_call(*args, **kwargs)
129
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward
130
+ return model_forward(*args, **kwargs)
131
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__
132
+ return convert_to_fp32(self.model_forward(*args, **kwargs))
133
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
134
+ return func(*args, **kwargs)
135
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/peft_model.py", line 1923, in forward
136
+ return self.base_model(
137
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
138
+ return self._call_impl(*args, **kwargs)
139
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
140
+ return forward_call(*args, **kwargs)
141
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 308, in forward
142
+ return self.model.forward(*args, **kwargs)
143
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
144
+ output = module._old_forward(*args, **kwargs)
145
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 810, in wrapper
146
+ output = func(self, *args, **kwargs)
147
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 477, in forward
148
+ outputs: BaseModelOutputWithPast = self.model(
149
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
150
+ return self._call_impl(*args, **kwargs)
151
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
152
+ return forward_call(*args, **kwargs)
153
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
154
+ outputs = func(self, *args, **kwargs)
155
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 412, in forward
156
+ hidden_states = decoder_layer(
157
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/modeling_layers.py", line 94, in __call__
158
+ return super().__call__(*args, **kwargs)
159
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
160
+ return self._call_impl(*args, **kwargs)
161
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
162
+ return forward_call(*args, **kwargs)
163
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 918, in wrapped_forward
164
+ output = orig_forward(*args, **kwargs)
165
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
166
+ output = module._old_forward(*args, **kwargs)
167
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 300, in forward
168
+ hidden_states, _ = self.self_attn(
169
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
170
+ return self._call_impl(*args, **kwargs)
171
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
172
+ return forward_call(*args, **kwargs)
173
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
174
+ output = module._old_forward(*args, **kwargs)
175
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 222, in forward
176
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
177
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
178
+ return self._call_impl(*args, **kwargs)
179
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
180
+ return forward_call(*args, **kwargs)
181
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/lora/layer.py", line 807, in forward
182
+ result = result + lora_B(lora_A(dropout(x))) * scaling
183
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
184
+ return self._call_impl(*args, **kwargs)
185
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
186
+ return forward_call(*args, **kwargs)
187
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 125, in forward
188
+ return F.linear(input, self.weight, self.bias)
189
+ KeyboardInterrupt
wandb/run-20251226_152332-r9hfat2g/files/requirements.txt ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exceptiongroup==1.3.1
2
+ wheel==0.45.1
3
+ python-dateutil==2.9.0.post0
4
+ nvidia-ml-py==13.580.82
5
+ huggingface_hub==1.2.3
6
+ idna==3.11
7
+ click==8.3.1
8
+ numpy==2.2.6
9
+ httpx==0.28.1
10
+ tokenizers==0.22.1
11
+ sympy==1.13.1
12
+ yarl==1.22.0
13
+ async-timeout==5.0.1
14
+ datasets==4.4.2
15
+ platformdirs==4.5.1
16
+ nvidia-cuda-cupti-cu12==12.1.105
17
+ nvidia-nvtx-cu12==12.1.105
18
+ smmap==5.0.2
19
+ accelerate==1.12.0
20
+ requests==2.32.5
21
+ aiohttp==3.13.2
22
+ bitsandbytes==0.49.0
23
+ nvidia-cublas-cu12==12.1.3.1
24
+ mpmath==1.3.0
25
+ torchaudio==2.5.1+cu121
26
+ nvidia-cuda-runtime-cu12==12.1.105
27
+ typing-inspection==0.4.2
28
+ GitPython==3.1.45
29
+ xxhash==3.6.0
30
+ nvidia-cusolver-cu12==11.4.5.107
31
+ pydantic_core==2.41.5
32
+ six==1.17.0
33
+ torchvision==0.20.1+cu121
34
+ typing_extensions==4.15.0
35
+ triton==3.1.0
36
+ charset-normalizer==3.4.4
37
+ nvitop==1.6.1
38
+ wandb==0.23.1
39
+ regex==2025.11.3
40
+ pip==25.3
41
+ nvidia-cusparse-cu12==12.1.0.106
42
+ pytz==2025.2
43
+ Jinja2==3.1.6
44
+ psutil==7.2.0
45
+ pillow==12.0.0
46
+ packaging==25.0
47
+ safetensors==0.7.0
48
+ sentry-sdk==2.48.0
49
+ gitdb==4.0.12
50
+ httpcore==1.0.9
51
+ setuptools==80.9.0
52
+ nvidia-cufft-cu12==11.0.2.54
53
+ anyio==4.12.0
54
+ transformers==5.0.0.dev0
55
+ pydantic==2.12.5
56
+ fsspec==2025.10.0
57
+ filelock==3.20.0
58
+ PyYAML==6.0.3
59
+ hf-xet==1.2.0
60
+ nvidia-cudnn-cu12==9.1.0.70
61
+ tqdm==4.67.1
62
+ MarkupSafe==2.1.5
63
+ attrs==25.4.0
64
+ nvidia-cuda-nvrtc-cu12==12.1.105
65
+ peft==0.18.0
66
+ aiohappyeyeballs==2.6.1
67
+ networkx==3.4.2
68
+ nvidia-nvjitlink-cu12==12.9.86
69
+ certifi==2025.11.12
70
+ pyarrow==22.0.0
71
+ dill==0.4.0
72
+ protobuf==6.33.2
73
+ aiosignal==1.4.0
74
+ frozenlist==1.8.0
75
+ urllib3==2.6.2
76
+ propcache==0.4.1
77
+ tzdata==2025.3
78
+ pandas==2.3.3
79
+ annotated-types==0.7.0
80
+ shellingham==1.5.4
81
+ nvidia-nccl-cu12==2.21.5
82
+ multidict==6.7.0
83
+ nvidia-curand-cu12==10.3.2.106
84
+ trl==0.26.2
85
+ torch==2.5.1+cu121
86
+ h11==0.16.0
87
+ multiprocess==0.70.18
88
+ typer-slim==0.21.0
89
+ wheel==0.45.1
90
+ tomli==2.0.1
91
+ autocommand==2.2.2
92
+ jaraco.context==5.3.0
93
+ zipp==3.19.2
94
+ packaging==24.2
95
+ inflect==7.3.1
96
+ typing_extensions==4.12.2
97
+ platformdirs==4.2.2
98
+ jaraco.functools==4.0.1
99
+ jaraco.collections==5.1.0
100
+ jaraco.text==3.12.1
101
+ backports.tarfile==1.2.0
102
+ more-itertools==10.3.0
103
+ importlib_metadata==8.0.0
104
+ typeguard==4.3.0
wandb/run-20251226_152332-r9hfat2g/files/wandb-metadata.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.12.46+-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.12",
4
+ "startedAt": "2025-12-26T15:23:32.328004Z",
5
+ "args": [
6
+ "--config",
7
+ "config_dpo.yaml"
8
+ ],
9
+ "program": "/workspace/trainer-kit/DPO-14b/run_dpo.py",
10
+ "codePath": "run_dpo.py",
11
+ "codePathLocal": "run_dpo.py",
12
+ "email": "shaiksirajuddin9949@gmail.com",
13
+ "root": "runs/dpo_run_14b_v1",
14
+ "host": "a100-2gpu-shell-session-757d587799-mfdvv",
15
+ "executable": "/workspace/llm_finetuning_env/bin/python",
16
+ "cpu_count": 12,
17
+ "cpu_count_logical": 24,
18
+ "gpu": "NVIDIA A100-SXM4-80GB",
19
+ "gpu_count": 2,
20
+ "disk": {
21
+ "/": {
22
+ "total": "791251738624",
23
+ "used": "314755911680"
24
+ }
25
+ },
26
+ "memory": {
27
+ "total": "359047892992"
28
+ },
29
+ "gpu_nvidia": [
30
+ {
31
+ "name": "NVIDIA A100-SXM4-80GB",
32
+ "memoryTotal": "85899345920",
33
+ "cudaCores": 6912,
34
+ "architecture": "Ampere",
35
+ "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba"
36
+ },
37
+ {
38
+ "name": "NVIDIA A100-SXM4-80GB",
39
+ "memoryTotal": "85899345920",
40
+ "cudaCores": 6912,
41
+ "architecture": "Ampere",
42
+ "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40"
43
+ }
44
+ ],
45
+ "cudaVersion": "13.0",
46
+ "writerId": "ce8b9zq5sbh73okdbbvozze07ayjamtf"
47
+ }
wandb/run-20251226_152332-r9hfat2g/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_step":1,"train/logps/rejected":-366.88128662109375,"train/grad_norm":1.3884541988372803,"train/logps/chosen":-338.257568359375,"train/rewards/margins":-0.00011587224435061216,"train/logits/chosen":5.405174255371094,"_wandb":{"runtime":101},"train/loss":0.693317174911499,"train/global_step":4,"train/epoch":0.004671532846715329,"_timestamp":1.7667626963258417e+09,"train/rewards/chosen":0.022540951147675514,"train/logits/rejected":5.456291675567627,"train/rewards/accuracies":0.5,"train/rewards/rejected":0.022656824439764023,"_runtime":101,"train/learning_rate":5.813953488372093e-07}
wandb/run-20251226_152332-r9hfat2g/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T15:23:32.418743785Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpiwm5qcwf/port-134621.txt","pid":134621,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-26T15:23:32.419487782Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":134621}
3
+ {"time":"2025-12-26T15:23:32.419441897Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-134621-134691-43401370/socket","Net":"unix"}}
4
+ {"time":"2025-12-26T15:23:32.60107271Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-26T15:23:32.607567183Z","level":"INFO","msg":"handleInformInit: received","streamId":"r9hfat2g","id":"1(@)"}
6
+ {"time":"2025-12-26T15:23:32.769941198Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"r9hfat2g","id":"1(@)"}
7
+ {"time":"2025-12-26T15:25:14.279920394Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-26T15:25:14.279987785Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-12-26T15:25:14.280023071Z","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2025-12-26T15:25:14.280085895Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2025-12-26T15:25:14.280137634Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-134621-134691-43401370/socket","Net":"unix"}}
12
+ {"time":"2025-12-26T15:25:14.643871761Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-26T15:25:14.643905607Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-26T15:25:14.643922133Z","level":"INFO","msg":"server is closed"}
wandb/run-20251226_152332-r9hfat2g/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T15:23:32.607728655Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-26T15:23:32.769717362Z","level":"INFO","msg":"stream: created new stream","id":"r9hfat2g"}
3
+ {"time":"2025-12-26T15:23:32.769819803Z","level":"INFO","msg":"handler: started","stream_id":"r9hfat2g"}
4
+ {"time":"2025-12-26T15:23:32.76993207Z","level":"INFO","msg":"stream: started","id":"r9hfat2g"}
5
+ {"time":"2025-12-26T15:23:32.769980394Z","level":"INFO","msg":"sender: started","stream_id":"r9hfat2g"}
6
+ {"time":"2025-12-26T15:23:32.769979838Z","level":"INFO","msg":"writer: started","stream_id":"r9hfat2g"}
7
+ {"time":"2025-12-26T15:25:14.280016864Z","level":"INFO","msg":"stream: closing","id":"r9hfat2g"}
8
+ {"time":"2025-12-26T15:25:14.470499024Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-26T15:25:14.642982392Z","level":"INFO","msg":"handler: closed","stream_id":"r9hfat2g"}
10
+ {"time":"2025-12-26T15:25:14.643087783Z","level":"INFO","msg":"sender: closed","stream_id":"r9hfat2g"}
11
+ {"time":"2025-12-26T15:25:14.643101377Z","level":"INFO","msg":"stream: closed","id":"r9hfat2g"}
wandb/run-20251226_152332-r9hfat2g/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-26 15:23:32,329 INFO MainThread:134621 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-26 15:23:32,329 INFO MainThread:134621 [wandb_setup.py:_flush():80] Configure stats pid to 134621
3
+ 2025-12-26 15:23:32,329 INFO MainThread:134621 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-26 15:23:32,329 INFO MainThread:134621 [wandb_setup.py:_flush():80] Loading settings from /workspace/trainer-kit/DPO-14b/wandb/settings
5
+ 2025-12-26 15:23:32,329 INFO MainThread:134621 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-26 15:23:32,329 INFO MainThread:134621 [wandb_init.py:setup_run_log_directory():714] Logging user logs to runs/dpo_run_14b_v1/wandb/run-20251226_152332-r9hfat2g/logs/debug.log
7
+ 2025-12-26 15:23:32,330 INFO MainThread:134621 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to runs/dpo_run_14b_v1/wandb/run-20251226_152332-r9hfat2g/logs/debug-internal.log
8
+ 2025-12-26 15:23:32,330 INFO MainThread:134621 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-26 15:23:32,330 INFO MainThread:134621 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'model': {'repo_id': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'dpo_pairs_generated.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'prompt_field': 'prompt', 'chosen_field': 'chosen', 'rejected_field': 'rejected', 'score_field': 'f1_score', 'format_type': 'chatml', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. 
Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'dpo': {'beta': 0.1, 'label_smoothing': 0.0, 'loss_type': 'sigmoid', 'use_reference_model': True, 'reference_free': False}, 'train': {'num_train_epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '5e-5', 'weight_decay': 0.0, 'warmup_ratio': 0.1, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 10, 'evaluation_strategy': 'steps', 'eval_steps': 25, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'runs/dpo_run_14b_v1', '_wandb': {}}
11
+ 2025-12-26 15:23:32,330 INFO MainThread:134621 [wandb_init.py:init():889] starting backend
12
+ 2025-12-26 15:23:32,601 INFO MainThread:134621 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-26 15:23:32,605 INFO MainThread:134621 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-26 15:23:32,607 INFO MainThread:134621 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-26 15:23:32,608 INFO MainThread:134621 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-26 15:23:32,915 INFO MainThread:134621 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-26 15:23:33,025 INFO MainThread:134621 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-26 15:23:33,025 INFO MainThread:134621 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-26 15:23:33,025 INFO MainThread:134621 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-26 15:23:33,025 INFO MainThread:134621 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-26 15:23:33,031 INFO MainThread:134621 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 15:24:15,664 INFO MainThread:134621 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['v_proj', 'k_proj', 'o_proj', 'q_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 151643, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'transformers_version': '5.0.0.dev0', 'model_type': 'qwen2', 'output_attentions': False, 'output_dir': 'runs/dpo_run_14b_v1', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.1, 'warmup_steps': 0.1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 
'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 10, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 
'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'model_init_kwargs': None, 'ref_model_init_kwargs': None, 'model_adapter_name': None, 'ref_adapter_name': None, 'force_use_ref_model': False, 'disable_dropout': True, 'use_logits_to_keep': False, 'dataset_num_proc': None, 'pad_token': '<PAD_TOKEN>', 'label_pad_token_id': -100, 'max_prompt_length': 1024, 'max_completion_length': None, 'max_length': 2048, 'truncation_mode': 'keep_end', 'padding_free': False, 'precompute_ref_log_probs': False, 'precompute_ref_batch_size': None, 'tools': None, 'loss_type': 'sigmoid', 'use_liger_loss': None, 'base_model_attribute_name': 'model', 'beta': 0.1, 'f_divergence_type': 'reverse_kl', 'f_alpha_divergence_coef': 1.0, 'reference_free': False, 'label_smoothing': 0.0, 'use_weighting': False, 'rpo_alpha': None, 'ld_alpha': None, 'discopop_tau': 0.05, 'loss_weights': None, 'sync_ref_model': False, 'ref_model_mixup_alpha': 0.6, 'ref_model_sync_steps': 512, 'generate_during_eval': False}
23
+ 2025-12-26 15:24:15,672 INFO MainThread:134621 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14795199488 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x79bf403bb7c0>>
24
+ 2025-12-26 15:24:15,672 INFO MainThread:134621 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14795199488 None
25
+ 2025-12-26 15:25:14,280 INFO wandb-AsyncioManager-main:134621 [service_client.py:_forward_responses():80] Reached EOF.
26
+ 2025-12-26 15:25:14,280 INFO wandb-AsyncioManager-main:134621 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
wandb/run-20251226_152332-r9hfat2g/run-r9hfat2g.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43e630f821a728a70660f513c24097fbebe6281e9ed349c81fbbf5c9ee24270a
3
+ size 515777
wandb/run-20251226_152936-r1nptay8/files/config.yaml ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _wandb:
2
+ value:
3
+ cli_version: 0.23.1
4
+ e:
5
+ 62bhwklrbfchpar5wzdaud7re7jdowat:
6
+ args:
7
+ - --config
8
+ - config_dpo.yaml
9
+ codePath: run_dpo.py
10
+ codePathLocal: run_dpo.py
11
+ cpu_count: 12
12
+ cpu_count_logical: 24
13
+ cudaVersion: "13.0"
14
+ disk:
15
+ /:
16
+ total: "791251738624"
17
+ used: "316563935232"
18
+ email: shaiksirajuddin9949@gmail.com
19
+ executable: /workspace/llm_finetuning_env/bin/python
20
+ gpu: NVIDIA A100-SXM4-80GB
21
+ gpu_count: 2
22
+ gpu_nvidia:
23
+ - architecture: Ampere
24
+ cudaCores: 6912
25
+ memoryTotal: "85899345920"
26
+ name: NVIDIA A100-SXM4-80GB
27
+ uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba
28
+ - architecture: Ampere
29
+ cudaCores: 6912
30
+ memoryTotal: "85899345920"
31
+ name: NVIDIA A100-SXM4-80GB
32
+ uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40
33
+ host: a100-2gpu-shell-session-757d587799-mfdvv
34
+ memory:
35
+ total: "359047892992"
36
+ os: Linux-6.12.46+-x86_64-with-glibc2.35
37
+ program: /workspace/trainer-kit/DPO-14b/run_dpo.py
38
+ python: CPython 3.10.12
39
+ root: runs/dpo_run_14b_v1
40
+ startedAt: "2025-12-26T15:29:36.793485Z"
41
+ writerId: 62bhwklrbfchpar5wzdaud7re7jdowat
42
+ m: []
43
+ python_version: 3.10.12
44
+ t:
45
+ "1":
46
+ - 1
47
+ - 11
48
+ - 41
49
+ - 49
50
+ - 51
51
+ - 71
52
+ - 84
53
+ - 98
54
+ "2":
55
+ - 1
56
+ - 11
57
+ - 41
58
+ - 49
59
+ - 51
60
+ - 71
61
+ - 84
62
+ - 98
63
+ "3":
64
+ - 15
65
+ - 16
66
+ "4": 3.10.12
67
+ "5": 0.23.1
68
+ "6": 5.0.0.dev0
69
+ "12": 0.23.1
70
+ "13": linux-x86_64
71
+ data:
72
+ value:
73
+ chosen_field: chosen
74
+ eval_jsonl: null
75
+ eval_split_ratio: 0.1
76
+ format_type: chatml
77
+ max_length: 2048
78
+ num_proc: 4
79
+ prompt_field: prompt
80
+ rejected_field: rejected
81
+ score_field: f1_score
82
+ shuffle: true
83
+ system_prompt: |
84
+ You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.
85
+
86
+ ## Output Format
87
+
88
+ ##OUTPUT
89
+ Explain the data flow and why each component must change:
90
+ - Flow: [Input → Processing → Output with arrows]
91
+ - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
92
+ - Explain coupling between components
93
+
94
+ ##SELECT
95
+ modify::crates/path/to/file.rs::impl::ComponentName
96
+ add::crates/another/file.rs::function::AnotherComponent
97
+ <EOS>
98
+
99
+ ## Rules
100
+
101
+ 1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
102
+ 2. Use `::` for nested items: `status::StructName::Type::Name`
103
+ 3. Always explain "must change because" and "without this"
104
+ 3. Types of components: function, struct, enum, impl, trait
105
+ 4. If there is extra information (e.g., enum variants), include that too.
106
+ 5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>
107
+ train_jsonl: dpo_pairs_generated.jsonl
108
+ dpo:
109
+ value:
110
+ beta: 0.1
111
+ label_smoothing: 0
112
+ loss_type: sigmoid
113
+ reference_free: false
114
+ use_reference_model: true
115
+ model:
116
+ value:
117
+ attn_implementation: null
118
+ base_local_dir: base_model
119
+ bnb_4bit_compute_dtype: bfloat16
120
+ bnb_4bit_quant_type: nf4
121
+ bnb_4bit_use_double_quant: false
122
+ device_map: auto
123
+ repo_id: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
124
+ revision: null
125
+ tokenizer_use_fast: true
126
+ torch_dtype: bfloat16
127
+ trust_remote_code: true
128
+ use_4bit: false
129
+ peft:
130
+ value:
131
+ bias: none
132
+ enabled: true
133
+ lora_alpha: 32
134
+ lora_dropout: 0.05
135
+ r: 16
136
+ target_modules: auto
137
+ run_dir:
138
+ value: runs/dpo_run_14b_v1
139
+ train:
140
+ value:
141
+ early_stopping:
142
+ enabled: true
143
+ metric: eval_loss
144
+ min_delta: 0.001
145
+ mode: min
146
+ patience: 5
147
+ eval_steps: 25
148
+ evaluation_strategy: steps
149
+ gradient_accumulation_steps: 8
150
+ gradient_checkpointing: true
151
+ learning_rate: "5e-5"
152
+ load_best_model_at_end: true
153
+ logging_steps: 2
154
+ lr_scheduler_type: cosine
155
+ max_grad_norm: 1
156
+ num_train_epochs: 3
157
+ optim: adamw_torch
158
+ per_device_eval_batch_size: 1
159
+ per_device_train_batch_size: 1
160
+ resume_from_checkpoint: auto
161
+ save_steps: 100
162
+ save_strategy: steps
163
+ save_total_limit: 10
164
+ warmup_ratio: 0.1
165
+ weight_decay: 0
wandb/run-20251226_152936-r1nptay8/files/output.log ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Wandb initialized: project='dpo-training', name='auto-generated'
2
+ `torch_dtype` is deprecated! Use `dtype` instead!
3
+ Loading weights: 100%|█████████| 579/579 [00:09<00:00, 61.06it/s, Materializing param=model.norm.weight]
4
+ Loading reference model (frozen copy)...
5
+ Loading weights: 100%|█████████| 579/579 [00:09<00:00, 62.49it/s, Materializing param=model.norm.weight]
6
+ Reference model loaded and frozen
7
+ 2025-12-26 15:30:05,632 - INFO - HTTP Request: HEAD https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets/json/json.py "HTTP/1.1 200 OK"
8
+ 2025-12-26 15:30:05,647 - INFO - Formatting train DPO data...
9
+ 2025-12-26 15:30:07,996 - INFO - Train dataset after filtering: 6850 examples
10
+ 2025-12-26 15:30:07,997 - INFO - train dataset validation passed: 6850 examples
11
+ 2025-12-26 15:30:07,997 - INFO - Formatting eval DPO data...
12
+ 2025-12-26 15:30:10,371 - INFO - Eval dataset after filtering: 762 examples
13
+ 2025-12-26 15:30:10,372 - INFO - eval dataset validation passed: 762 examples
14
+ warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
15
+ Early stopping enabled: patience=5, min_delta=0.001
16
+ 2025-12-26 15:30:10,408 - INFO - DPO Training with beta=0.1, loss_type=sigmoid
17
+ warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
18
+ Parameter 'fn_kwargs'={'tokenizer': Qwen2Tokenizer(name_or_path='../../Models/Qwen2.5-Coder-14B-CPT-SFT', vocab_size=151643, model_max_length=32768, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, added_tokens_decoder={
19
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
20
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
21
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
22
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
23
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
24
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
25
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
26
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
27
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
28
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
29
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
30
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
31
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
32
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
33
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
34
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
35
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
36
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
37
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
38
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
39
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
40
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
41
+ }
42
+ ), 'tools': None} of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only shown once. Subsequent hashing failures won't be shown.
43
+ 2025-12-26 15:30:15,283 - WARNING - Parameter 'fn_kwargs'={'tokenizer': Qwen2Tokenizer(name_or_path='../../Models/Qwen2.5-Coder-14B-CPT-SFT', vocab_size=151643, model_max_length=32768, padding_side='right', truncation_side='right', special_tokens={'eos_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, added_tokens_decoder={
44
+ 151643: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
45
+ 151644: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
46
+ 151645: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
47
+ 151646: AddedToken("<|object_ref_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
48
+ 151647: AddedToken("<|object_ref_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
49
+ 151648: AddedToken("<|box_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
50
+ 151649: AddedToken("<|box_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
51
+ 151650: AddedToken("<|quad_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
52
+ 151651: AddedToken("<|quad_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
53
+ 151652: AddedToken("<|vision_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
54
+ 151653: AddedToken("<|vision_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
55
+ 151654: AddedToken("<|vision_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
56
+ 151655: AddedToken("<|image_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
57
+ 151656: AddedToken("<|video_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
58
+ 151657: AddedToken("<tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
59
+ 151658: AddedToken("</tool_call>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
60
+ 151659: AddedToken("<|fim_prefix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
61
+ 151660: AddedToken("<|fim_middle|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
62
+ 151661: AddedToken("<|fim_suffix|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
63
+ 151662: AddedToken("<|fim_pad|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
64
+ 151663: AddedToken("<|repo_name|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
65
+ 151664: AddedToken("<|file_sep|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
66
+ }
67
+ ), 'tools': None} of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead. Make sure your transforms and parameters are serializable with pickle or dill for the dataset fingerprinting and caching to work. If you reuse this transform, the caching mechanism will consider it to be different from the previous calls and recompute everything. This warning is only shown once. Subsequent hashing failures won't be shown.
68
+ Applying chat template to eval dataset: 100%|████████████████| 762/762 [00:00<00:00, 8054.02 examples/s]
69
+ Tokenizing eval dataset: 47%|███████████████▏ | 361/762 [00:01<00:01, 236.68 examples/s]
70
+ Traceback (most recent call last):
71
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 953, in <module>
72
+ main()
73
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 909, in main
74
+ trainer = DPOTrainer(
75
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 488, in __init__
76
+ eval_dataset = self._prepare_dataset(eval_dataset, processing_class, args, "eval")
77
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 654, in _prepare_dataset
78
+ dataset = dataset.map(
79
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 562, in wrapper
80
+ out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
81
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3341, in map
82
+ for rank, done, content in Dataset._map_single(**unprocessed_kwargs):
83
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3673, in _map_single
84
+ for i, example in iter_outputs(shard_iterable):
85
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3647, in iter_outputs
86
+ yield i, apply_function(example, i, offset=offset)
87
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3570, in apply_function
88
+ processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
89
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 714, in tokenize_row
90
+ rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)["input_ids"]
91
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2567, in __call__
92
+ encodings = self._encode_plus(
93
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/tokenization_utils_tokenizers.py", line 833, in _encode_plus
94
+ encodings = self._tokenizer.encode_batch(
95
+ KeyboardInterrupt
96
+ Traceback (most recent call last):
97
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 953, in <module>
98
+ main()
99
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 909, in main
100
+ trainer = DPOTrainer(
101
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 488, in __init__
102
+ eval_dataset = self._prepare_dataset(eval_dataset, processing_class, args, "eval")
103
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 654, in _prepare_dataset
104
+ dataset = dataset.map(
105
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 562, in wrapper
106
+ out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
107
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3341, in map
108
+ for rank, done, content in Dataset._map_single(**unprocessed_kwargs):
109
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3673, in _map_single
110
+ for i, example in iter_outputs(shard_iterable):
111
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3647, in iter_outputs
112
+ yield i, apply_function(example, i, offset=offset)
113
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3570, in apply_function
114
+ processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
115
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 714, in tokenize_row
116
+ rejected_input_ids = tokenizer(features["rejected"], add_special_tokens=False)["input_ids"]
117
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 2567, in __call__
118
+ encodings = self._encode_plus(
119
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/tokenization_utils_tokenizers.py", line 833, in _encode_plus
120
+ encodings = self._tokenizer.encode_batch(
121
+ KeyboardInterrupt
wandb/run-20251226_152936-r1nptay8/files/requirements.txt ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exceptiongroup==1.3.1
2
+ wheel==0.45.1
3
+ python-dateutil==2.9.0.post0
4
+ nvidia-ml-py==13.580.82
5
+ huggingface_hub==1.2.3
6
+ idna==3.11
7
+ click==8.3.1
8
+ numpy==2.2.6
9
+ httpx==0.28.1
10
+ tokenizers==0.22.1
11
+ sympy==1.13.1
12
+ yarl==1.22.0
13
+ async-timeout==5.0.1
14
+ datasets==4.4.2
15
+ platformdirs==4.5.1
16
+ nvidia-cuda-cupti-cu12==12.1.105
17
+ nvidia-nvtx-cu12==12.1.105
18
+ smmap==5.0.2
19
+ accelerate==1.12.0
20
+ requests==2.32.5
21
+ aiohttp==3.13.2
22
+ bitsandbytes==0.49.0
23
+ nvidia-cublas-cu12==12.1.3.1
24
+ mpmath==1.3.0
25
+ torchaudio==2.5.1+cu121
26
+ nvidia-cuda-runtime-cu12==12.1.105
27
+ typing-inspection==0.4.2
28
+ GitPython==3.1.45
29
+ xxhash==3.6.0
30
+ nvidia-cusolver-cu12==11.4.5.107
31
+ pydantic_core==2.41.5
32
+ six==1.17.0
33
+ torchvision==0.20.1+cu121
34
+ typing_extensions==4.15.0
35
+ triton==3.1.0
36
+ charset-normalizer==3.4.4
37
+ nvitop==1.6.1
38
+ wandb==0.23.1
39
+ regex==2025.11.3
40
+ pip==25.3
41
+ nvidia-cusparse-cu12==12.1.0.106
42
+ pytz==2025.2
43
+ Jinja2==3.1.6
44
+ psutil==7.2.0
45
+ pillow==12.0.0
46
+ packaging==25.0
47
+ safetensors==0.7.0
48
+ sentry-sdk==2.48.0
49
+ gitdb==4.0.12
50
+ httpcore==1.0.9
51
+ setuptools==80.9.0
52
+ nvidia-cufft-cu12==11.0.2.54
53
+ anyio==4.12.0
54
+ transformers==5.0.0.dev0
55
+ pydantic==2.12.5
56
+ fsspec==2025.10.0
57
+ filelock==3.20.0
58
+ PyYAML==6.0.3
59
+ hf-xet==1.2.0
60
+ nvidia-cudnn-cu12==9.1.0.70
61
+ tqdm==4.67.1
62
+ MarkupSafe==2.1.5
63
+ attrs==25.4.0
64
+ nvidia-cuda-nvrtc-cu12==12.1.105
65
+ peft==0.18.0
66
+ aiohappyeyeballs==2.6.1
67
+ networkx==3.4.2
68
+ nvidia-nvjitlink-cu12==12.9.86
69
+ certifi==2025.11.12
70
+ pyarrow==22.0.0
71
+ dill==0.4.0
72
+ protobuf==6.33.2
73
+ aiosignal==1.4.0
74
+ frozenlist==1.8.0
75
+ urllib3==2.6.2
76
+ propcache==0.4.1
77
+ tzdata==2025.3
78
+ pandas==2.3.3
79
+ annotated-types==0.7.0
80
+ shellingham==1.5.4
81
+ nvidia-nccl-cu12==2.21.5
82
+ multidict==6.7.0
83
+ nvidia-curand-cu12==10.3.2.106
84
+ trl==0.26.2
85
+ torch==2.5.1+cu121
86
+ h11==0.16.0
87
+ multiprocess==0.70.18
88
+ typer-slim==0.21.0
89
+ wheel==0.45.1
90
+ tomli==2.0.1
91
+ autocommand==2.2.2
92
+ jaraco.context==5.3.0
93
+ zipp==3.19.2
94
+ packaging==24.2
95
+ inflect==7.3.1
96
+ typing_extensions==4.12.2
97
+ platformdirs==4.2.2
98
+ jaraco.functools==4.0.1
99
+ jaraco.collections==5.1.0
100
+ jaraco.text==3.12.1
101
+ backports.tarfile==1.2.0
102
+ more-itertools==10.3.0
103
+ importlib_metadata==8.0.0
104
+ typeguard==4.3.0
wandb/run-20251226_152936-r1nptay8/files/wandb-metadata.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.12.46+-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.12",
4
+ "startedAt": "2025-12-26T15:29:36.793485Z",
5
+ "args": [
6
+ "--config",
7
+ "config_dpo.yaml"
8
+ ],
9
+ "program": "/workspace/trainer-kit/DPO-14b/run_dpo.py",
10
+ "codePath": "run_dpo.py",
11
+ "codePathLocal": "run_dpo.py",
12
+ "email": "shaiksirajuddin9949@gmail.com",
13
+ "root": "runs/dpo_run_14b_v1",
14
+ "host": "a100-2gpu-shell-session-757d587799-mfdvv",
15
+ "executable": "/workspace/llm_finetuning_env/bin/python",
16
+ "cpu_count": 12,
17
+ "cpu_count_logical": 24,
18
+ "gpu": "NVIDIA A100-SXM4-80GB",
19
+ "gpu_count": 2,
20
+ "disk": {
21
+ "/": {
22
+ "total": "791251738624",
23
+ "used": "316563935232"
24
+ }
25
+ },
26
+ "memory": {
27
+ "total": "359047892992"
28
+ },
29
+ "gpu_nvidia": [
30
+ {
31
+ "name": "NVIDIA A100-SXM4-80GB",
32
+ "memoryTotal": "85899345920",
33
+ "cudaCores": 6912,
34
+ "architecture": "Ampere",
35
+ "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba"
36
+ },
37
+ {
38
+ "name": "NVIDIA A100-SXM4-80GB",
39
+ "memoryTotal": "85899345920",
40
+ "cudaCores": 6912,
41
+ "architecture": "Ampere",
42
+ "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40"
43
+ }
44
+ ],
45
+ "cudaVersion": "13.0",
46
+ "writerId": "62bhwklrbfchpar5wzdaud7re7jdowat"
47
+ }
wandb/run-20251226_152936-r1nptay8/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"_wandb":{"runtime":41},"_runtime":41}
wandb/run-20251226_152936-r1nptay8/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T15:29:36.871855887Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmp2764fn9e/port-137205.txt","pid":137205,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-26T15:29:36.872449374Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":137205}
3
+ {"time":"2025-12-26T15:29:36.872451526Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-137205-137352-1482738377/socket","Net":"unix"}}
4
+ {"time":"2025-12-26T15:29:37.058666689Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-26T15:29:37.064819569Z","level":"INFO","msg":"handleInformInit: received","streamId":"r1nptay8","id":"1(@)"}
6
+ {"time":"2025-12-26T15:29:37.216524061Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"r1nptay8","id":"1(@)"}
7
+ {"time":"2025-12-26T15:30:19.248432516Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-26T15:30:19.248506742Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-12-26T15:30:19.24857928Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
10
+ {"time":"2025-12-26T15:30:19.248524342Z","level":"INFO","msg":"server is shutting down"}
11
+ {"time":"2025-12-26T15:30:19.248647813Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-137205-137352-1482738377/socket","Net":"unix"}}
12
+ {"time":"2025-12-26T15:30:19.549751743Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-26T15:30:19.549788501Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-26T15:30:19.549806198Z","level":"INFO","msg":"server is closed"}
wandb/run-20251226_152936-r1nptay8/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T15:29:37.064937062Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-26T15:29:37.216325813Z","level":"INFO","msg":"stream: created new stream","id":"r1nptay8"}
3
+ {"time":"2025-12-26T15:29:37.216413019Z","level":"INFO","msg":"handler: started","stream_id":"r1nptay8"}
4
+ {"time":"2025-12-26T15:29:37.216515668Z","level":"INFO","msg":"stream: started","id":"r1nptay8"}
5
+ {"time":"2025-12-26T15:29:37.216542759Z","level":"INFO","msg":"writer: started","stream_id":"r1nptay8"}
6
+ {"time":"2025-12-26T15:29:37.216565747Z","level":"INFO","msg":"sender: started","stream_id":"r1nptay8"}
7
+ {"time":"2025-12-26T15:30:19.248508176Z","level":"INFO","msg":"stream: closing","id":"r1nptay8"}
8
+ {"time":"2025-12-26T15:30:19.441030263Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-26T15:30:19.548847282Z","level":"INFO","msg":"handler: closed","stream_id":"r1nptay8"}
10
+ {"time":"2025-12-26T15:30:19.548944003Z","level":"INFO","msg":"sender: closed","stream_id":"r1nptay8"}
11
+ {"time":"2025-12-26T15:30:19.54895272Z","level":"INFO","msg":"stream: closed","id":"r1nptay8"}
wandb/run-20251226_152936-r1nptay8/logs/debug.log ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_setup.py:_flush():80] Configure stats pid to 137205
3
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_setup.py:_flush():80] Loading settings from /workspace/trainer-kit/DPO-14b/wandb/settings
5
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_init.py:setup_run_log_directory():714] Logging user logs to runs/dpo_run_14b_v1/wandb/run-20251226_152936-r1nptay8/logs/debug.log
7
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to runs/dpo_run_14b_v1/wandb/run-20251226_152936-r1nptay8/logs/debug-internal.log
8
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'model': {'repo_id': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'dpo_pairs_generated.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'prompt_field': 'prompt', 'chosen_field': 'chosen', 'rejected_field': 'rejected', 'score_field': 'f1_score', 'format_type': 'chatml', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. 
Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'dpo': {'beta': 0.1, 'label_smoothing': 0.0, 'loss_type': 'sigmoid', 'use_reference_model': True, 'reference_free': False}, 'train': {'num_train_epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '5e-5', 'weight_decay': 0.0, 'warmup_ratio': 0.1, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 10, 'evaluation_strategy': 'steps', 'eval_steps': 25, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'runs/dpo_run_14b_v1', '_wandb': {}}
11
+ 2025-12-26 15:29:36,795 INFO MainThread:137205 [wandb_init.py:init():889] starting backend
12
+ 2025-12-26 15:29:37,058 INFO MainThread:137205 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-26 15:29:37,063 INFO MainThread:137205 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-26 15:29:37,065 INFO MainThread:137205 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-26 15:29:37,065 INFO MainThread:137205 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-26 15:29:37,469 INFO MainThread:137205 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-26 15:29:37,577 INFO MainThread:137205 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-26 15:29:37,578 INFO MainThread:137205 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-26 15:29:37,578 INFO MainThread:137205 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-26 15:29:37,578 INFO MainThread:137205 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-26 15:29:37,582 INFO MainThread:137205 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 15:30:19,248 INFO wandb-AsyncioManager-main:137205 [service_client.py:_forward_responses():80] Reached EOF.
23
+ 2025-12-26 15:30:19,248 INFO wandb-AsyncioManager-main:137205 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
wandb/run-20251226_152936-r1nptay8/run-r1nptay8.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbc498b55a73f9c8a0d524a9f073f87ac74b98d8031b36455fd731caf2cff78f
3
+ size 403205
wandb/run-20251226_155650-wbzoafvt/files/config.yaml ADDED
@@ -0,0 +1,661 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _name_or_path:
2
+ value: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
3
+ _wandb:
4
+ value:
5
+ cli_version: 0.23.1
6
+ e:
7
+ afn1h9dtq29ul6sseazq0ojw1mqcn19i:
8
+ args:
9
+ - --config
10
+ - config_dpo.yaml
11
+ codePath: run_dpo.py
12
+ codePathLocal: run_dpo.py
13
+ cpu_count: 12
14
+ cpu_count_logical: 24
15
+ cudaVersion: "13.0"
16
+ disk:
17
+ /:
18
+ total: "791251738624"
19
+ used: "323290275840"
20
+ email: shaiksirajuddin9949@gmail.com
21
+ executable: /workspace/llm_finetuning_env/bin/python
22
+ gpu: NVIDIA A100-SXM4-80GB
23
+ gpu_count: 2
24
+ gpu_nvidia:
25
+ - architecture: Ampere
26
+ cudaCores: 6912
27
+ memoryTotal: "85899345920"
28
+ name: NVIDIA A100-SXM4-80GB
29
+ uuid: GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba
30
+ - architecture: Ampere
31
+ cudaCores: 6912
32
+ memoryTotal: "85899345920"
33
+ name: NVIDIA A100-SXM4-80GB
34
+ uuid: GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40
35
+ host: a100-2gpu-shell-session-757d587799-mfdvv
36
+ memory:
37
+ total: "359047892992"
38
+ os: Linux-6.12.46+-x86_64-with-glibc2.35
39
+ program: /workspace/trainer-kit/DPO-14b/run_dpo.py
40
+ python: CPython 3.10.12
41
+ root: runs/dpo_run_14b_v1
42
+ startedAt: "2025-12-26T15:56:50.015524Z"
43
+ writerId: afn1h9dtq29ul6sseazq0ojw1mqcn19i
44
+ m:
45
+ - "1": train/global_step
46
+ "6":
47
+ - 3
48
+ "7": []
49
+ - "2": '*'
50
+ "5": 1
51
+ "6":
52
+ - 1
53
+ "7": []
54
+ python_version: 3.10.12
55
+ t:
56
+ "1":
57
+ - 1
58
+ - 11
59
+ - 41
60
+ - 49
61
+ - 51
62
+ - 71
63
+ - 84
64
+ - 98
65
+ "2":
66
+ - 1
67
+ - 11
68
+ - 41
69
+ - 49
70
+ - 51
71
+ - 71
72
+ - 84
73
+ - 98
74
+ "3":
75
+ - 7
76
+ - 15
77
+ - 16
78
+ - 19
79
+ - 66
80
+ "4": 3.10.12
81
+ "5": 0.23.1
82
+ "6": 5.0.0.dev0
83
+ "9":
84
+ "1": transformers_trainer
85
+ "12": 0.23.1
86
+ "13": linux-x86_64
87
+ accelerator_config:
88
+ value:
89
+ dispatch_batches: null
90
+ even_batches: true
91
+ gradient_accumulation_kwargs: null
92
+ non_blocking: false
93
+ split_batches: false
94
+ use_seedable_sampler: true
95
+ adam_beta1:
96
+ value: 0.9
97
+ adam_beta2:
98
+ value: 0.999
99
+ adam_epsilon:
100
+ value: 1e-08
101
+ add_cross_attention:
102
+ value: false
103
+ architectures:
104
+ value:
105
+ - Qwen2ForCausalLM
106
+ attention_dropout:
107
+ value: 0
108
+ auto_find_batch_size:
109
+ value: false
110
+ average_tokens_across_devices:
111
+ value: true
112
+ base_model_attribute_name:
113
+ value: model
114
+ batch_eval_metrics:
115
+ value: false
116
+ beta:
117
+ value: 0.1
118
+ bf16:
119
+ value: true
120
+ bf16_full_eval:
121
+ value: false
122
+ bos_token_id:
123
+ value: null
124
+ chunk_size_feed_forward:
125
+ value: 0
126
+ cross_attention_hidden_size:
127
+ value: null
128
+ data:
129
+ value:
130
+ chosen_field: chosen
131
+ eval_jsonl: null
132
+ eval_split_ratio: 0.1
133
+ format_type: chatml
134
+ max_length: 2048
135
+ num_proc: 4
136
+ prompt_field: prompt
137
+ rejected_field: rejected
138
+ score_field: f1_score
139
+ shuffle: true
140
+ system_prompt: |
141
+ You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.
142
+
143
+ ## Output Format
144
+
145
+ ##OUTPUT
146
+ Explain the data flow and why each component must change:
147
+ - Flow: [Input → Processing → Output with arrows]
148
+ - For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"
149
+ - Explain coupling between components
150
+
151
+ ##SELECT
152
+ modify::crates/path/to/file.rs::impl::ComponentName
153
+ add::crates/another/file.rs::function::AnotherComponent
154
+ <EOS>
155
+
156
+ ## Rules
157
+
158
+ 1. Use full paths: `remove::crates/folder/file.rs::Type::Name`
159
+ 2. Use `::` for nested items: `status::StructName::Type::Name`
160
+ 3. Always explain "must change because" and "without this"
161
+ 3. Types of components: function, struct, enum, impl, trait
162
+ 4. If there is extra information (e.g., enum variants), include that too.
163
+ 5. Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>
164
+ train_jsonl: dpo_pairs_generated.jsonl
165
+ data_seed:
166
+ value: null
167
+ dataloader_drop_last:
168
+ value: false
169
+ dataloader_num_workers:
170
+ value: 0
171
+ dataloader_persistent_workers:
172
+ value: false
173
+ dataloader_pin_memory:
174
+ value: true
175
+ dataloader_prefetch_factor:
176
+ value: null
177
+ dataset_num_proc:
178
+ value: null
179
+ ddp_backend:
180
+ value: null
181
+ ddp_broadcast_buffers:
182
+ value: null
183
+ ddp_bucket_cap_mb:
184
+ value: null
185
+ ddp_find_unused_parameters:
186
+ value: null
187
+ ddp_timeout:
188
+ value: 1800
189
+ debug:
190
+ value: []
191
+ decoder_start_token_id:
192
+ value: null
193
+ deepspeed:
194
+ value: null
195
+ disable_dropout:
196
+ value: true
197
+ disable_tqdm:
198
+ value: false
199
+ discopop_tau:
200
+ value: 0.05
201
+ do_eval:
202
+ value: true
203
+ do_predict:
204
+ value: false
205
+ do_train:
206
+ value: false
207
+ dpo:
208
+ value:
209
+ beta: 0.1
210
+ label_smoothing: 0
211
+ loss_type: sigmoid
212
+ reference_free: false
213
+ use_reference_model: true
214
+ dtype:
215
+ value: bfloat16
216
+ enable_jit_checkpoint:
217
+ value: false
218
+ eos_token_id:
219
+ value: 151643
220
+ eval_accumulation_steps:
221
+ value: null
222
+ eval_delay:
223
+ value: 0
224
+ eval_do_concat_batches:
225
+ value: true
226
+ eval_on_start:
227
+ value: false
228
+ eval_steps:
229
+ value: 25
230
+ eval_strategy:
231
+ value: steps
232
+ eval_use_gather_object:
233
+ value: false
234
+ f_alpha_divergence_coef:
235
+ value: 1
236
+ f_divergence_type:
237
+ value: reverse_kl
238
+ finetuning_task:
239
+ value: null
240
+ force_use_ref_model:
241
+ value: false
242
+ fp16:
243
+ value: false
244
+ fp16_full_eval:
245
+ value: false
246
+ fsdp:
247
+ value: []
248
+ fsdp_config:
249
+ value:
250
+ min_num_params: 0
251
+ xla: false
252
+ xla_fsdp_grad_ckpt: false
253
+ xla_fsdp_v2: false
254
+ full_determinism:
255
+ value: false
256
+ generate_during_eval:
257
+ value: false
258
+ gradient_accumulation_steps:
259
+ value: 8
260
+ gradient_checkpointing:
261
+ value: true
262
+ gradient_checkpointing_kwargs:
263
+ value: null
264
+ greater_is_better:
265
+ value: false
266
+ group_by_length:
267
+ value: false
268
+ hidden_act:
269
+ value: silu
270
+ hidden_size:
271
+ value: 5120
272
+ hub_always_push:
273
+ value: false
274
+ hub_model_id:
275
+ value: null
276
+ hub_private_repo:
277
+ value: null
278
+ hub_revision:
279
+ value: null
280
+ hub_strategy:
281
+ value: every_save
282
+ hub_token:
283
+ value: <HUB_TOKEN>
284
+ id2label:
285
+ value:
286
+ "0": LABEL_0
287
+ "1": LABEL_1
288
+ ignore_data_skip:
289
+ value: false
290
+ include_for_metrics:
291
+ value: []
292
+ include_num_input_tokens_seen:
293
+ value: "no"
294
+ initializer_range:
295
+ value: 0.02
296
+ intermediate_size:
297
+ value: 13824
298
+ is_decoder:
299
+ value: false
300
+ is_encoder_decoder:
301
+ value: false
302
+ label_names:
303
+ value: null
304
+ label_pad_token_id:
305
+ value: -100
306
+ label_smoothing:
307
+ value: 0
308
+ label_smoothing_factor:
309
+ value: 0
310
+ label2id:
311
+ value:
312
+ LABEL_0: 0
313
+ LABEL_1: 1
314
+ layer_types:
315
+ value:
316
+ - full_attention
317
+ - full_attention
318
+ - full_attention
319
+ - full_attention
320
+ - full_attention
321
+ - full_attention
322
+ - full_attention
323
+ - full_attention
324
+ - full_attention
325
+ - full_attention
326
+ - full_attention
327
+ - full_attention
328
+ - full_attention
329
+ - full_attention
330
+ - full_attention
331
+ - full_attention
332
+ - full_attention
333
+ - full_attention
334
+ - full_attention
335
+ - full_attention
336
+ - full_attention
337
+ - full_attention
338
+ - full_attention
339
+ - full_attention
340
+ - full_attention
341
+ - full_attention
342
+ - full_attention
343
+ - full_attention
344
+ - full_attention
345
+ - full_attention
346
+ - full_attention
347
+ - full_attention
348
+ - full_attention
349
+ - full_attention
350
+ - full_attention
351
+ - full_attention
352
+ - full_attention
353
+ - full_attention
354
+ - full_attention
355
+ - full_attention
356
+ - full_attention
357
+ - full_attention
358
+ - full_attention
359
+ - full_attention
360
+ - full_attention
361
+ - full_attention
362
+ - full_attention
363
+ - full_attention
364
+ ld_alpha:
365
+ value: null
366
+ learning_rate:
367
+ value: 5e-05
368
+ length_column_name:
369
+ value: length
370
+ liger_kernel_config:
371
+ value: null
372
+ load_best_model_at_end:
373
+ value: true
374
+ local_rank:
375
+ value: -1
376
+ log_level:
377
+ value: passive
378
+ log_level_replica:
379
+ value: warning
380
+ log_on_each_node:
381
+ value: true
382
+ logging_dir:
383
+ value: null
384
+ logging_first_step:
385
+ value: false
386
+ logging_nan_inf_filter:
387
+ value: true
388
+ logging_steps:
389
+ value: 2
390
+ logging_strategy:
391
+ value: steps
392
+ loss_type:
393
+ value: sigmoid
394
+ loss_weights:
395
+ value: null
396
+ lr_scheduler_kwargs:
397
+ value: null
398
+ lr_scheduler_type:
399
+ value: cosine
400
+ max_completion_length:
401
+ value: null
402
+ max_grad_norm:
403
+ value: 1
404
+ max_length:
405
+ value: 2048
406
+ max_position_embeddings:
407
+ value: 32768
408
+ max_prompt_length:
409
+ value: 1024
410
+ max_steps:
411
+ value: -1
412
+ max_window_layers:
413
+ value: 48
414
+ metric_for_best_model:
415
+ value: eval_loss
416
+ model:
417
+ value:
418
+ attn_implementation: null
419
+ base_local_dir: base_model
420
+ bnb_4bit_compute_dtype: bfloat16
421
+ bnb_4bit_quant_type: nf4
422
+ bnb_4bit_use_double_quant: false
423
+ device_map: auto
424
+ repo_id: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
425
+ revision: null
426
+ tokenizer_use_fast: true
427
+ torch_dtype: bfloat16
428
+ trust_remote_code: true
429
+ use_4bit: false
430
+ model/num_parameters:
431
+ value: 14795199488
432
+ model_adapter_name:
433
+ value: null
434
+ model_init_kwargs:
435
+ value: null
436
+ model_type:
437
+ value: qwen2
438
+ neftune_noise_alpha:
439
+ value: null
440
+ num_attention_heads:
441
+ value: 40
442
+ num_hidden_layers:
443
+ value: 48
444
+ num_key_value_heads:
445
+ value: 8
446
+ num_train_epochs:
447
+ value: 3
448
+ optim:
449
+ value: adamw_torch
450
+ optim_args:
451
+ value: null
452
+ optim_target_modules:
453
+ value: null
454
+ output_attentions:
455
+ value: false
456
+ output_dir:
457
+ value: runs/dpo_run_14b_v1
458
+ output_hidden_states:
459
+ value: false
460
+ pad_token:
461
+ value: <PAD_TOKEN>
462
+ pad_token_id:
463
+ value: 151643
464
+ padding_free:
465
+ value: false
466
+ parallelism_config:
467
+ value: null
468
+ peft:
469
+ value:
470
+ bias: none
471
+ enabled: true
472
+ lora_alpha: 32
473
+ lora_dropout: 0.05
474
+ r: 16
475
+ target_modules: auto
476
+ peft_config:
477
+ value:
478
+ default:
479
+ alora_invocation_tokens: null
480
+ arrow_config: null
481
+ auto_mapping: null
482
+ base_model_name_or_path: ../../Models/Qwen2.5-Coder-14B-CPT-SFT
483
+ bias: none
484
+ corda_config: null
485
+ ensure_weight_tying: false
486
+ eva_config: null
487
+ exclude_modules: null
488
+ fan_in_fan_out: false
489
+ inference_mode: false
490
+ init_lora_weights: true
491
+ layer_replication: null
492
+ layers_pattern: null
493
+ layers_to_transform: null
494
+ lora_alpha: 32
495
+ lora_bias: false
496
+ lora_dropout: 0.05
497
+ megatron_config: null
498
+ megatron_core: megatron.core
499
+ modules_to_save: null
500
+ peft_type: LORA
501
+ peft_version: 0.18.0
502
+ qalora_group_size: 16
503
+ r: 16
504
+ revision: null
505
+ runtime_config:
506
+ ephemeral_gpu_offload: false
507
+ target_modules:
508
+ - k_proj
509
+ - o_proj
510
+ - v_proj
511
+ - q_proj
512
+ target_parameters: null
513
+ task_type: CAUSAL_LM
514
+ trainable_token_indices: null
515
+ use_dora: false
516
+ use_qalora: false
517
+ use_rslora: false
518
+ per_device_eval_batch_size:
519
+ value: 1
520
+ per_device_train_batch_size:
521
+ value: 1
522
+ precompute_ref_batch_size:
523
+ value: null
524
+ precompute_ref_log_probs:
525
+ value: false
526
+ prediction_loss_only:
527
+ value: false
528
+ prefix:
529
+ value: null
530
+ problem_type:
531
+ value: null
532
+ project:
533
+ value: huggingface
534
+ push_to_hub:
535
+ value: false
536
+ ref_adapter_name:
537
+ value: null
538
+ ref_model_init_kwargs:
539
+ value: null
540
+ ref_model_mixup_alpha:
541
+ value: 0.6
542
+ ref_model_sync_steps:
543
+ value: 512
544
+ reference_free:
545
+ value: false
546
+ remove_unused_columns:
547
+ value: false
548
+ report_to:
549
+ value:
550
+ - wandb
551
+ restore_callback_states_from_checkpoint:
552
+ value: false
553
+ resume_from_checkpoint:
554
+ value: null
555
+ return_dict:
556
+ value: true
557
+ rms_norm_eps:
558
+ value: 1e-06
559
+ rope_parameters:
560
+ value:
561
+ rope_theta: 1e+06
562
+ rope_type: default
563
+ rpo_alpha:
564
+ value: null
565
+ run_dir:
566
+ value: runs/dpo_run_14b_v1
567
+ run_name:
568
+ value: null
569
+ save_on_each_node:
570
+ value: false
571
+ save_only_model:
572
+ value: false
573
+ save_steps:
574
+ value: 100
575
+ save_strategy:
576
+ value: steps
577
+ save_total_limit:
578
+ value: 10
579
+ seed:
580
+ value: 42
581
+ sep_token_id:
582
+ value: null
583
+ skip_memory_metrics:
584
+ value: true
585
+ sliding_window:
586
+ value: null
587
+ sync_ref_model:
588
+ value: false
589
+ task_specific_params:
590
+ value: null
591
+ tf32:
592
+ value: null
593
+ tie_word_embeddings:
594
+ value: false
595
+ tokenizer_class:
596
+ value: null
597
+ tools:
598
+ value: null
599
+ torch_compile:
600
+ value: false
601
+ torch_compile_backend:
602
+ value: null
603
+ torch_compile_mode:
604
+ value: null
605
+ torch_empty_cache_steps:
606
+ value: null
607
+ trackio_space_id:
608
+ value: trackio
609
+ train:
610
+ value:
611
+ early_stopping:
612
+ enabled: true
613
+ metric: eval_loss
614
+ min_delta: 0.001
615
+ mode: min
616
+ patience: 5
617
+ eval_steps: 25
618
+ evaluation_strategy: steps
619
+ gradient_accumulation_steps: 8
620
+ gradient_checkpointing: true
621
+ learning_rate: "5e-5"
622
+ load_best_model_at_end: true
623
+ logging_steps: 2
624
+ lr_scheduler_type: cosine
625
+ max_grad_norm: 1
626
+ num_train_epochs: 3
627
+ optim: adamw_torch
628
+ per_device_eval_batch_size: 1
629
+ per_device_train_batch_size: 1
630
+ resume_from_checkpoint: auto
631
+ save_steps: 100
632
+ save_strategy: steps
633
+ save_total_limit: 10
634
+ warmup_ratio: 0.1
635
+ weight_decay: 0
636
+ transformers_version:
637
+ value: 5.0.0.dev0
638
+ truncation_mode:
639
+ value: keep_end
640
+ use_cache:
641
+ value: false
642
+ use_cpu:
643
+ value: false
644
+ use_liger_kernel:
645
+ value: false
646
+ use_liger_loss:
647
+ value: null
648
+ use_logits_to_keep:
649
+ value: false
650
+ use_sliding_window:
651
+ value: false
652
+ use_weighting:
653
+ value: false
654
+ vocab_size:
655
+ value: 152064
656
+ warmup_ratio:
657
+ value: 0.1
658
+ warmup_steps:
659
+ value: 0.1
660
+ weight_decay:
661
+ value: 0
wandb/run-20251226_155650-wbzoafvt/files/output.log ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Wandb initialized: project='dpo-training', name='auto-generated'
2
+ `torch_dtype` is deprecated! Use `dtype` instead!
3
+ Loading weights: 100%|█████████| 579/579 [00:09<00:00, 60.71it/s, Materializing param=model.norm.weight]
4
+ Loading reference model (frozen copy)...
5
+ Loading weights: 100%|█████████| 579/579 [00:09<00:00, 60.20it/s, Materializing param=model.norm.weight]
6
+ Reference model loaded and frozen
7
+ 2025-12-26 15:57:19,133 - INFO - HTTP Request: HEAD https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets/json/json.py "HTTP/1.1 200 OK"
8
+ 2025-12-26 15:57:19,148 - INFO - Formatting train DPO data...
9
+ 2025-12-26 15:57:21,512 - INFO - Train dataset after filtering: 6850 examples
10
+ 2025-12-26 15:57:21,513 - INFO - train dataset validation passed: 6850 examples
11
+ 2025-12-26 15:57:21,513 - INFO - Formatting eval DPO data...
12
+ 2025-12-26 15:57:23,870 - INFO - Eval dataset after filtering: 762 examples
13
+ 2025-12-26 15:57:23,871 - INFO - eval dataset validation passed: 762 examples
14
+ warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
15
+ Early stopping enabled: patience=5, min_delta=0.001
16
+ 2025-12-26 15:57:23,907 - INFO - DPO Training with beta=0.1, loss_type=sigmoid
17
+ warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.
18
+ 2025-12-26 15:57:33,435 - INFO - Starting DPO training...
19
+ The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.
20
+
21
+ {'loss': '0.6931', 'grad_norm': '1.243', 'learning_rate': '1.938e-07', 'rewards/chosen': '0', 'rewards/rejected': '0', 'rewards/accuracies': '0', 'rewards/margins': '0', 'logps/chosen': '-368.9', 'logps/rejected': '-398.8', 'logits/chosen': '5.179', 'logits/rejected': '5.193', 'epoch': '0.002336'}
22
+ {'loss': '0.695', 'grad_norm': '1.392', 'learning_rate': '5.814e-07', 'rewards/chosen': '0.004505', 'rewards/rejected': '0.007727', 'rewards/accuracies': '0.625', 'rewards/margins': '-0.003223', 'logps/chosen': '-338.4', 'logps/rejected': '-367', 'logits/chosen': '5.404', 'logits/rejected': '5.457', 'epoch': '0.004672'}
23
+ {'loss': '0.6892', 'grad_norm': '1.067', 'learning_rate': '9.69e-07', 'rewards/chosen': '-0.003407', 'rewards/rejected': '-0.01166', 'rewards/accuracies': '0.5625', 'rewards/margins': '0.008256', 'logps/chosen': '-362.3', 'logps/rejected': '-387.6', 'logits/chosen': '5.292', 'logits/rejected': '5.328', 'epoch': '0.007007'}
24
+ {'loss': '0.6944', 'grad_norm': '1.001', 'learning_rate': '1.357e-06', 'rewards/chosen': '0.01466', 'rewards/rejected': '0.01589', 'rewards/accuracies': '0.375', 'rewards/margins': '-0.001235', 'logps/chosen': '-379.9', 'logps/rejected': '-389.1', 'logits/chosen': '5.323', 'logits/rejected': '5.411', 'epoch': '0.009343'}
25
+ {'loss': '0.6933', 'grad_norm': '1.246', 'learning_rate': '1.744e-06', 'rewards/chosen': '-0.0285', 'rewards/rejected': '-0.02862', 'rewards/accuracies': '0.625', 'rewards/margins': '0.0001264', 'logps/chosen': '-363.2', 'logps/rejected': '-389.7', 'logits/chosen': '5.436', 'logits/rejected': '5.495', 'epoch': '0.01168'}
26
+ {'loss': '0.6883', 'grad_norm': '1.403', 'learning_rate': '2.132e-06', 'rewards/chosen': '0.01622', 'rewards/rejected': '0.006134', 'rewards/accuracies': '0.5', 'rewards/margins': '0.01009', 'logps/chosen': '-371', 'logps/rejected': '-402.5', 'logits/chosen': '5.355', 'logits/rejected': '5.376', 'epoch': '0.01401'}
27
+ {'loss': '0.6897', 'grad_norm': '1.116', 'learning_rate': '2.519e-06', 'rewards/chosen': '-0.01732', 'rewards/rejected': '-0.02465', 'rewards/accuracies': '0.625', 'rewards/margins': '0.007329', 'logps/chosen': '-336.7', 'logps/rejected': '-357.5', 'logits/chosen': '5.515', 'logits/rejected': '5.561', 'epoch': '0.01635'}
28
+ {'loss': '0.6904', 'grad_norm': '0.9471', 'learning_rate': '2.907e-06', 'rewards/chosen': '0.0327', 'rewards/rejected': '0.02688', 'rewards/accuracies': '0.5625', 'rewards/margins': '0.005827', 'logps/chosen': '-415.7', 'logps/rejected': '-441.1', 'logits/chosen': '5.553', 'logits/rejected': '5.583', 'epoch': '0.01869'}
29
+ {'loss': '0.6836', 'grad_norm': '1.44', 'learning_rate': '3.295e-06', 'rewards/chosen': '0.01102', 'rewards/rejected': '-0.008499', 'rewards/accuracies': '0.5625', 'rewards/margins': '0.01952', 'logps/chosen': '-392.5', 'logps/rejected': '-420.2', 'logits/chosen': '5.441', 'logits/rejected': '5.49', 'epoch': '0.02102'}
30
+ {'loss': '0.6902', 'grad_norm': '1.594', 'learning_rate': '3.682e-06', 'rewards/chosen': '0.006536', 'rewards/rejected': '0.0005231', 'rewards/accuracies': '0.5625', 'rewards/margins': '0.006013', 'logps/chosen': '-345.2', 'logps/rejected': '-366', 'logits/chosen': '5.318', 'logits/rejected': '5.398', 'epoch': '0.02336'}
31
+ {'loss': '0.6913', 'grad_norm': '1.136', 'learning_rate': '4.07e-06', 'rewards/chosen': '0.01191', 'rewards/rejected': '0.00737', 'rewards/accuracies': '0.5', 'rewards/margins': '0.004538', 'logps/chosen': '-347.9', 'logps/rejected': '-370.7', 'logits/chosen': '5.633', 'logits/rejected': '5.727', 'epoch': '0.02569'}
32
+ {'loss': '0.6769', 'grad_norm': '1.068', 'learning_rate': '4.457e-06', 'rewards/chosen': '0.01244', 'rewards/rejected': '-0.02045', 'rewards/accuracies': '0.875', 'rewards/margins': '0.03289', 'logps/chosen': '-347.2', 'logps/rejected': '-377.6', 'logits/chosen': '5.357', 'logits/rejected': '5.406', 'epoch': '0.02803'}
33
+
34
+ {'eval_loss': '0.6837', 'eval_runtime': '454.4', 'eval_samples_per_second': '1.677', 'eval_steps_per_second': '1.677', 'eval_rewards/chosen': '0.02464', 'eval_rewards/rejected': '0.005081', 'eval_rewards/accuracies': '0.6654', 'eval_rewards/margins': '0.01956', 'eval_logps/chosen': '-370.2', 'eval_logps/rejected': '-395.7', 'eval_logits/chosen': '5.295', 'eval_logits/rejected': '5.345', 'epoch': '0.0292'}
35
+ {'loss': '0.685', 'grad_norm': '1.592', 'learning_rate': '4.845e-06', 'rewards/chosen': '0.02639', 'rewards/rejected': '0.009419', 'rewards/accuracies': '0.625', 'rewards/margins': '0.01697', 'logps/chosen': '-387.5', 'logps/rejected': '-412.1', 'logits/chosen': '5.157', 'logits/rejected': '5.245', 'epoch': '0.03036'}
36
+ {'loss': '0.6752', 'grad_norm': '1.318', 'learning_rate': '5.233e-06', 'rewards/chosen': '0.04595', 'rewards/rejected': '0.009191', 'rewards/accuracies': '0.8125', 'rewards/margins': '0.03676', 'logps/chosen': '-360.4', 'logps/rejected': '-391.2', 'logits/chosen': '5.546', 'logits/rejected': '5.544', 'epoch': '0.0327'}
37
+ {'loss': '0.6752', 'grad_norm': '1.444', 'learning_rate': '5.62e-06', 'rewards/chosen': '0.04195', 'rewards/rejected': '0.005257', 'rewards/accuracies': '0.8125', 'rewards/margins': '0.03669', 'logps/chosen': '-378.6', 'logps/rejected': '-405.4', 'logits/chosen': '5.136', 'logits/rejected': '5.239', 'epoch': '0.03504'}
38
+ {'loss': '0.6701', 'grad_norm': '1.38', 'learning_rate': '6.008e-06', 'rewards/chosen': '0.06658', 'rewards/rejected': '0.01939', 'rewards/accuracies': '0.875', 'rewards/margins': '0.04719', 'logps/chosen': '-358.5', 'logps/rejected': '-382.4', 'logits/chosen': '5.411', 'logits/rejected': '5.427', 'epoch': '0.03737'}
39
+ {'loss': '0.6611', 'grad_norm': '1.326', 'learning_rate': '6.395e-06', 'rewards/chosen': '0.07039', 'rewards/rejected': '0.004514', 'rewards/accuracies': '0.9375', 'rewards/margins': '0.06587', 'logps/chosen': '-326.9', 'logps/rejected': '-346.5', 'logits/chosen': '5.207', 'logits/rejected': '5.255', 'epoch': '0.03971'}
40
+ {'loss': '0.6282', 'grad_norm': '1.578', 'learning_rate': '6.783e-06', 'rewards/chosen': '0.1174', 'rewards/rejected': '-0.01899', 'rewards/accuracies': '1', 'rewards/margins': '0.1364', 'logps/chosen': '-360', 'logps/rejected': '-384.3', 'logits/chosen': '5.551', 'logits/rejected': '5.637', 'epoch': '0.04204'}
41
+ {'loss': '0.6271', 'grad_norm': '1.859', 'learning_rate': '7.171e-06', 'rewards/chosen': '0.1618', 'rewards/rejected': '0.02293', 'rewards/accuracies': '0.9375', 'rewards/margins': '0.1389', 'logps/chosen': '-325.9', 'logps/rejected': '-352', 'logits/chosen': '5.391', 'logits/rejected': '5.412', 'epoch': '0.04438'}
42
+ {'loss': '0.6412', 'grad_norm': '1.323', 'learning_rate': '7.558e-06', 'rewards/chosen': '0.1525', 'rewards/rejected': '0.0409', 'rewards/accuracies': '0.875', 'rewards/margins': '0.1116', 'logps/chosen': '-343.4', 'logps/rejected': '-374.8', 'logits/chosen': '5.19', 'logits/rejected': '5.203', 'epoch': '0.04672'}
43
+ {'loss': '0.6094', 'grad_norm': '2.533', 'learning_rate': '7.946e-06', 'rewards/chosen': '0.2898', 'rewards/rejected': '0.1082', 'rewards/accuracies': '0.9375', 'rewards/margins': '0.1816', 'logps/chosen': '-341.8', 'logps/rejected': '-372.4', 'logits/chosen': '5.42', 'logits/rejected': '5.453', 'epoch': '0.04905'}
44
+ {'loss': '0.5816', 'grad_norm': '1.525', 'learning_rate': '8.333e-06', 'rewards/chosen': '0.3246', 'rewards/rejected': '0.07354', 'rewards/accuracies': '0.8125', 'rewards/margins': '0.2511', 'logps/chosen': '-354.5', 'logps/rejected': '-376.9', 'logits/chosen': '5.384', 'logits/rejected': '5.398', 'epoch': '0.05139'}
45
+ {'loss': '0.527', 'grad_norm': '2.081', 'learning_rate': '8.721e-06', 'rewards/chosen': '0.6465', 'rewards/rejected': '0.2707', 'rewards/accuracies': '0.9375', 'rewards/margins': '0.3758', 'logps/chosen': '-331.1', 'logps/rejected': '-362.9', 'logits/chosen': '5.27', 'logits/rejected': '5.287', 'epoch': '0.05372'}
46
+ {'loss': '0.5066', 'grad_norm': '1.769', 'learning_rate': '9.109e-06', 'rewards/chosen': '0.6378', 'rewards/rejected': '0.2113', 'rewards/accuracies': '1', 'rewards/margins': '0.4265', 'logps/chosen': '-369.4', 'logps/rejected': '-400.2', 'logits/chosen': '5.473', 'logits/rejected': '5.465', 'epoch': '0.05606'}
47
+ {'loss': '0.5293', 'grad_norm': '2.842', 'learning_rate': '9.496e-06', 'rewards/chosen': '0.7923', 'rewards/rejected': '0.4136', 'rewards/accuracies': '1', 'rewards/margins': '0.3787', 'logps/chosen': '-363.5', 'logps/rejected': '-397.8', 'logits/chosen': '5.05', 'logits/rejected': '5.112', 'epoch': '0.05839'}
48
+ {'eval_loss': '0.4611', 'eval_runtime': '454.6', 'eval_samples_per_second': '1.676', 'eval_steps_per_second': '1.676', 'eval_rewards/chosen': '0.8944', 'eval_rewards/rejected': '0.3205', 'eval_rewards/accuracies': '0.9619', 'eval_rewards/margins': '0.5739', 'eval_logps/chosen': '-361.5', 'eval_logps/rejected': '-392.6', 'eval_logits/chosen': '5.224', 'eval_logits/rejected': '5.287', 'epoch': '0.05839'}
49
+ {'loss': '0.446', 'grad_norm': '1.691', 'learning_rate': '9.884e-06', 'rewards/chosen': '0.987', 'rewards/rejected': '0.3813', 'rewards/accuracies': '0.9375', 'rewards/margins': '0.6057', 'logps/chosen': '-343.5', 'logps/rejected': '-379.4', 'logits/chosen': '5.486', 'logits/rejected': '5.542', 'epoch': '0.06073'}
50
+ {'loss': '0.4361', 'grad_norm': '1.946', 'learning_rate': '1.027e-05', 'rewards/chosen': '0.7795', 'rewards/rejected': '0.1529', 'rewards/accuracies': '1', 'rewards/margins': '0.6266', 'logps/chosen': '-379.5', 'logps/rejected': '-401.6', 'logits/chosen': '5.17', 'logits/rejected': '5.269', 'epoch': '0.06307'}
51
+ {'loss': '0.3928', 'grad_norm': '2.127', 'learning_rate': '1.066e-05', 'rewards/chosen': '1.274', 'rewards/rejected': '0.4879', 'rewards/accuracies': '0.9375', 'rewards/margins': '0.7864', 'logps/chosen': '-378.1', 'logps/rejected': '-413.3', 'logits/chosen': '5.097', 'logits/rejected': '5.153', 'epoch': '0.0654'}
52
+ {'loss': '0.3586', 'grad_norm': '1.538', 'learning_rate': '1.105e-05', 'rewards/chosen': '1.29', 'rewards/rejected': '0.3544', 'rewards/accuracies': '0.875', 'rewards/margins': '0.9354', 'logps/chosen': '-372.9', 'logps/rejected': '-401.8', 'logits/chosen': '5.139', 'logits/rejected': '5.203', 'epoch': '0.06774'}
53
+ {'loss': '0.428', 'grad_norm': '2.358', 'learning_rate': '1.143e-05', 'rewards/chosen': '1.382', 'rewards/rejected': '0.6533', 'rewards/accuracies': '0.875', 'rewards/margins': '0.7291', 'logps/chosen': '-361', 'logps/rejected': '-392.3', 'logits/chosen': '5.072', 'logits/rejected': '5.188', 'epoch': '0.07007'}
54
+ {'loss': '0.3137', 'grad_norm': '2.178', 'learning_rate': '1.182e-05', 'rewards/chosen': '1.664', 'rewards/rejected': '0.575', 'rewards/accuracies': '1', 'rewards/margins': '1.089', 'logps/chosen': '-364.8', 'logps/rejected': '-401', 'logits/chosen': '5.264', 'logits/rejected': '5.311', 'epoch': '0.07241'}
55
+ {'loss': '0.3038', 'grad_norm': '1.698', 'learning_rate': '1.221e-05', 'rewards/chosen': '1.647', 'rewards/rejected': '0.5322', 'rewards/accuracies': '1', 'rewards/margins': '1.115', 'logps/chosen': '-359.8', 'logps/rejected': '-397.2', 'logits/chosen': '5.192', 'logits/rejected': '5.261', 'epoch': '0.07474'}
56
+ {'loss': '0.2503', 'grad_norm': '1.322', 'learning_rate': '1.26e-05', 'rewards/chosen': '1.567', 'rewards/rejected': '0.1573', 'rewards/accuracies': '1', 'rewards/margins': '1.41', 'logps/chosen': '-352.4', 'logps/rejected': '-392.7', 'logits/chosen': '5.293', 'logits/rejected': '5.309', 'epoch': '0.07708'}
57
+ {'loss': '0.3108', 'grad_norm': '1.817', 'learning_rate': '1.298e-05', 'rewards/chosen': '1.479', 'rewards/rejected': '0.2151', 'rewards/accuracies': '0.9375', 'rewards/margins': '1.264', 'logps/chosen': '-320', 'logps/rejected': '-364.1', 'logits/chosen': '5.026', 'logits/rejected': '5.115', 'epoch': '0.07942'}
58
+ {'loss': '0.2299', 'grad_norm': '1.066', 'learning_rate': '1.337e-05', 'rewards/chosen': '1.395', 'rewards/rejected': '-0.1015', 'rewards/accuracies': '1', 'rewards/margins': '1.497', 'logps/chosen': '-383.8', 'logps/rejected': '-431.8', 'logits/chosen': '4.945', 'logits/rejected': '4.959', 'epoch': '0.08175'}
59
+ {'loss': '0.226', 'grad_norm': '1.035', 'learning_rate': '1.376e-05', 'rewards/chosen': '1.298', 'rewards/rejected': '-0.3464', 'rewards/accuracies': '1', 'rewards/margins': '1.644', 'logps/chosen': '-350.9', 'logps/rejected': '-382.7', 'logits/chosen': '5.004', 'logits/rejected': '5.12', 'epoch': '0.08409'}
60
+ {'loss': '0.1892', 'grad_norm': '1.16', 'learning_rate': '1.415e-05', 'rewards/chosen': '1.198', 'rewards/rejected': '-0.5511', 'rewards/accuracies': '1', 'rewards/margins': '1.75', 'logps/chosen': '-352.3', 'logps/rejected': '-399.2', 'logits/chosen': '4.89', 'logits/rejected': '4.95', 'epoch': '0.08642'}
61
+ {'eval_loss': '0.1602', 'eval_runtime': '454.3', 'eval_samples_per_second': '1.677', 'eval_steps_per_second': '1.677', 'eval_rewards/chosen': '1.121', 'eval_rewards/rejected': '-0.9336', 'eval_rewards/accuracies': '0.9961', 'eval_rewards/margins': '2.055', 'eval_logps/chosen': '-359.2', 'eval_logps/rejected': '-405.1', 'eval_logits/chosen': '4.93', 'eval_logits/rejected': '5.032', 'epoch': '0.08759'}
62
+ {'loss': '0.16', 'grad_norm': '1.143', 'learning_rate': '1.453e-05', 'rewards/chosen': '1.213', 'rewards/rejected': '-0.8816', 'rewards/accuracies': '1', 'rewards/margins': '2.095', 'logps/chosen': '-313.1', 'logps/rejected': '-356.1', 'logits/chosen': '5.037', 'logits/rejected': '5.132', 'epoch': '0.08876'}
63
+ {'loss': '0.1895', 'grad_norm': '0.9839', 'learning_rate': '1.492e-05', 'rewards/chosen': '1.061', 'rewards/rejected': '-0.8471', 'rewards/accuracies': '1', 'rewards/margins': '1.908', 'logps/chosen': '-366.3', 'logps/rejected': '-405.8', 'logits/chosen': '4.817', 'logits/rejected': '4.874', 'epoch': '0.09109'}
64
+ {'loss': '0.1595', 'grad_norm': '0.9213', 'learning_rate': '1.531e-05', 'rewards/chosen': '0.6765', 'rewards/rejected': '-1.491', 'rewards/accuracies': '1', 'rewards/margins': '2.167', 'logps/chosen': '-348.1', 'logps/rejected': '-395.2', 'logits/chosen': '5.047', 'logits/rejected': '5.158', 'epoch': '0.09343'}
65
+ {'loss': '0.1209', 'grad_norm': '0.9821', 'learning_rate': '1.57e-05', 'rewards/chosen': '0.872', 'rewards/rejected': '-1.775', 'rewards/accuracies': '1', 'rewards/margins': '2.647', 'logps/chosen': '-378.9', 'logps/rejected': '-436.9', 'logits/chosen': '4.691', 'logits/rejected': '4.772', 'epoch': '0.09577'}
66
+ {'loss': '0.08721', 'grad_norm': '0.6679', 'learning_rate': '1.609e-05', 'rewards/chosen': '1.134', 'rewards/rejected': '-1.77', 'rewards/accuracies': '1', 'rewards/margins': '2.904', 'logps/chosen': '-346.5', 'logps/rejected': '-400.1', 'logits/chosen': '4.88', 'logits/rejected': '4.962', 'epoch': '0.0981'}
67
+ {'loss': '0.07943', 'grad_norm': '0.5761', 'learning_rate': '1.647e-05', 'rewards/chosen': '1.246', 'rewards/rejected': '-1.769', 'rewards/accuracies': '1', 'rewards/margins': '3.015', 'logps/chosen': '-341.7', 'logps/rejected': '-398.3', 'logits/chosen': '4.464', 'logits/rejected': '4.68', 'epoch': '0.1004'}
68
+ {'loss': '0.1258', 'grad_norm': '1.602', 'learning_rate': '1.686e-05', 'rewards/chosen': '1.071', 'rewards/rejected': '-2.048', 'rewards/accuracies': '0.9375', 'rewards/margins': '3.119', 'logps/chosen': '-344.9', 'logps/rejected': '-395.4', 'logits/chosen': '4.564', 'logits/rejected': '4.681', 'epoch': '0.1028'}
69
+ {'loss': '0.06663', 'grad_norm': '0.4641', 'learning_rate': '1.725e-05', 'rewards/chosen': '1.413', 'rewards/rejected': '-2.348', 'rewards/accuracies': '1', 'rewards/margins': '3.761', 'logps/chosen': '-327', 'logps/rejected': '-388.4', 'logits/chosen': '4.499', 'logits/rejected': '4.673', 'epoch': '0.1051'}
70
+ {'loss': '0.04482', 'grad_norm': '0.67', 'learning_rate': '1.764e-05', 'rewards/chosen': '1.478', 'rewards/rejected': '-2.901', 'rewards/accuracies': '1', 'rewards/margins': '4.379', 'logps/chosen': '-362.7', 'logps/rejected': '-439.3', 'logits/chosen': '4.729', 'logits/rejected': '4.814', 'epoch': '0.1074'}
71
+ {'loss': '0.05633', 'grad_norm': '0.4153', 'learning_rate': '1.802e-05', 'rewards/chosen': '0.7137', 'rewards/rejected': '-2.745', 'rewards/accuracies': '1', 'rewards/margins': '3.458', 'logps/chosen': '-381.6', 'logps/rejected': '-444.3', 'logits/chosen': '4.785', 'logits/rejected': '4.892', 'epoch': '0.1098'}
72
+ {'loss': '0.04092', 'grad_norm': '0.3153', 'learning_rate': '1.841e-05', 'rewards/chosen': '1.757', 'rewards/rejected': '-2.264', 'rewards/accuracies': '1', 'rewards/margins': '4.021', 'logps/chosen': '-356.7', 'logps/rejected': '-414.7', 'logits/chosen': '4.604', 'logits/rejected': '4.805', 'epoch': '0.1121'}
73
+ {'loss': '0.02579', 'grad_norm': '0.377', 'learning_rate': '1.88e-05', 'rewards/chosen': '1.387', 'rewards/rejected': '-3.268', 'rewards/accuracies': '1', 'rewards/margins': '4.654', 'logps/chosen': '-339.8', 'logps/rejected': '-413.9', 'logits/chosen': '4.559', 'logits/rejected': '4.691', 'epoch': '0.1145'}
74
+ {'loss': '0.01516', 'grad_norm': '0.1502', 'learning_rate': '1.919e-05', 'rewards/chosen': '1.794', 'rewards/rejected': '-3.149', 'rewards/accuracies': '1', 'rewards/margins': '4.943', 'logps/chosen': '-346.3', 'logps/rejected': '-418.9', 'logits/chosen': '4.387', 'logits/rejected': '4.495', 'epoch': '0.1168'}
75
+ {'eval_loss': '0.04428', 'eval_runtime': '454.7', 'eval_samples_per_second': '1.676', 'eval_steps_per_second': '1.676', 'eval_rewards/chosen': '1.725', 'eval_rewards/rejected': '-2.864', 'eval_rewards/accuracies': '0.9921', 'eval_rewards/margins': '4.589', 'eval_logps/chosen': '-353.2', 'eval_logps/rejected': '-424.4', 'eval_logits/chosen': '4.286', 'eval_logits/rejected': '4.426', 'epoch': '0.1168'}
76
+ {'loss': '0.0159', 'grad_norm': '0.2124', 'learning_rate': '1.957e-05', 'rewards/chosen': '1.77', 'rewards/rejected': '-3.026', 'rewards/accuracies': '1', 'rewards/margins': '4.796', 'logps/chosen': '-305', 'logps/rejected': '-384.9', 'logits/chosen': '4.197', 'logits/rejected': '4.353', 'epoch': '0.1191'}
77
+ {'loss': '0.03818', 'grad_norm': '1.196', 'learning_rate': '1.996e-05', 'rewards/chosen': '1.556', 'rewards/rejected': '-3.267', 'rewards/accuracies': '1', 'rewards/margins': '4.823', 'logps/chosen': '-341.1', 'logps/rejected': '-417.6', 'logits/chosen': '4.185', 'logits/rejected': '4.28', 'epoch': '0.1215'}
78
+ {'loss': '0.05679', 'grad_norm': '1.302', 'learning_rate': '2.035e-05', 'rewards/chosen': '1.654', 'rewards/rejected': '-3.076', 'rewards/accuracies': '1', 'rewards/margins': '4.73', 'logps/chosen': '-358.1', 'logps/rejected': '-426.9', 'logits/chosen': '4.324', 'logits/rejected': '4.452', 'epoch': '0.1238'}
79
+ {'loss': '0.07615', 'grad_norm': '0.3007', 'learning_rate': '2.074e-05', 'rewards/chosen': '1.412', 'rewards/rejected': '-3.332', 'rewards/accuracies': '0.9375', 'rewards/margins': '4.744', 'logps/chosen': '-364.5', 'logps/rejected': '-434.5', 'logits/chosen': '4.492', 'logits/rejected': '4.633', 'epoch': '0.1261'}
80
+ {'loss': '0.0146', 'grad_norm': '0.4247', 'learning_rate': '2.112e-05', 'rewards/chosen': '1.958', 'rewards/rejected': '-4.051', 'rewards/accuracies': '1', 'rewards/margins': '6.009', 'logps/chosen': '-306.5', 'logps/rejected': '-392.5', 'logits/chosen': '3.858', 'logits/rejected': '3.968', 'epoch': '0.1285'}
81
+ {'loss': '0.01015', 'grad_norm': '0.1418', 'learning_rate': '2.151e-05', 'rewards/chosen': '2.196', 'rewards/rejected': '-3.758', 'rewards/accuracies': '1', 'rewards/margins': '5.954', 'logps/chosen': '-339.6', 'logps/rejected': '-425.5', 'logits/chosen': '4.254', 'logits/rejected': '4.353', 'epoch': '0.1308'}
82
+ {'loss': '0.01139', 'grad_norm': '0.2944', 'learning_rate': '2.19e-05', 'rewards/chosen': '1.995', 'rewards/rejected': '-3.392', 'rewards/accuracies': '1', 'rewards/margins': '5.387', 'logps/chosen': '-349.4', 'logps/rejected': '-431.8', 'logits/chosen': '3.717', 'logits/rejected': '3.922', 'epoch': '0.1331'}
83
+ {'loss': '0.02451', 'grad_norm': '0.9541', 'learning_rate': '2.229e-05', 'rewards/chosen': '1.855', 'rewards/rejected': '-3.475', 'rewards/accuracies': '1', 'rewards/margins': '5.33', 'logps/chosen': '-343.2', 'logps/rejected': '-423.2', 'logits/chosen': '3.514', 'logits/rejected': '3.74', 'epoch': '0.1355'}
84
+ {'loss': '0.007584', 'grad_norm': '0.4569', 'learning_rate': '2.267e-05', 'rewards/chosen': '2.13', 'rewards/rejected': '-4.365', 'rewards/accuracies': '1', 'rewards/margins': '6.495', 'logps/chosen': '-382.1', 'logps/rejected': '-480.7', 'logits/chosen': '3.9', 'logits/rejected': '3.963', 'epoch': '0.1378'}
85
+ {'loss': '0.007748', 'grad_norm': '0.2083', 'learning_rate': '2.306e-05', 'rewards/chosen': '1.399', 'rewards/rejected': '-4.58', 'rewards/accuracies': '1', 'rewards/margins': '5.979', 'logps/chosen': '-355.3', 'logps/rejected': '-436.5', 'logits/chosen': '3.772', 'logits/rejected': '3.939', 'epoch': '0.1401'}
86
+ {'loss': '0.01436', 'grad_norm': '0.2193', 'learning_rate': '2.345e-05', 'rewards/chosen': '1.177', 'rewards/rejected': '-5.205', 'rewards/accuracies': '1', 'rewards/margins': '6.382', 'logps/chosen': '-327.2', 'logps/rejected': '-414.8', 'logits/chosen': '3.657', 'logits/rejected': '3.875', 'epoch': '0.1425'}
87
+ {'loss': '0.007622', 'grad_norm': '0.03551', 'learning_rate': '2.384e-05', 'rewards/chosen': '0.7803', 'rewards/rejected': '-6.615', 'rewards/accuracies': '1', 'rewards/margins': '7.395', 'logps/chosen': '-369.9', 'logps/rejected': '-474', 'logits/chosen': '3.66', 'logits/rejected': '3.725', 'epoch': '0.1448'}
88
+ {'eval_loss': '0.02411', 'eval_runtime': '454.8', 'eval_samples_per_second': '1.675', 'eval_steps_per_second': '1.675', 'eval_rewards/chosen': '0.5319', 'eval_rewards/rejected': '-6.151', 'eval_rewards/accuracies': '0.9934', 'eval_rewards/margins': '6.683', 'eval_logps/chosen': '-365.1', 'eval_logps/rejected': '-457.3', 'eval_logits/chosen': '3.669', 'eval_logits/rejected': '3.844', 'epoch': '0.146'}
89
+ {'loss': '0.005532', 'grad_norm': '0.2169', 'learning_rate': '2.422e-05', 'rewards/chosen': '0.9076', 'rewards/rejected': '-7.027', 'rewards/accuracies': '1', 'rewards/margins': '7.935', 'logps/chosen': '-345.2', 'logps/rejected': '-454.6', 'logits/chosen': '3.778', 'logits/rejected': '3.757', 'epoch': '0.1472'}
90
+ {'loss': '0.0008547', 'grad_norm': '0.05145', 'learning_rate': '2.461e-05', 'rewards/chosen': '1.086', 'rewards/rejected': '-6.756', 'rewards/accuracies': '1', 'rewards/margins': '7.843', 'logps/chosen': '-376.3', 'logps/rejected': '-486.3', 'logits/chosen': '3.686', 'logits/rejected': '3.777', 'epoch': '0.1495'}
91
+ {'loss': '0.01921', 'grad_norm': '1.001', 'learning_rate': '2.5e-05', 'rewards/chosen': '0.7988', 'rewards/rejected': '-6.314', 'rewards/accuracies': '1', 'rewards/margins': '7.112', 'logps/chosen': '-330.3', 'logps/rejected': '-420.2', 'logits/chosen': '3.856', 'logits/rejected': '4.067', 'epoch': '0.1518'}
92
+ {'loss': '0.005052', 'grad_norm': '0.2909', 'learning_rate': '2.539e-05', 'rewards/chosen': '0.727', 'rewards/rejected': '-6.733', 'rewards/accuracies': '1', 'rewards/margins': '7.46', 'logps/chosen': '-346.6', 'logps/rejected': '-448.4', 'logits/chosen': '3.856', 'logits/rejected': '4.107', 'epoch': '0.1542'}
93
+ {'loss': '0.02904', 'grad_norm': '0.1034', 'learning_rate': '2.578e-05', 'rewards/chosen': '-0.1552', 'rewards/rejected': '-7.109', 'rewards/accuracies': '1', 'rewards/margins': '6.953', 'logps/chosen': '-398.5', 'logps/rejected': '-493.3', 'logits/chosen': '3.692', 'logits/rejected': '3.876', 'epoch': '0.1565'}
94
+ {'loss': '0.008301', 'grad_norm': '0.4083', 'learning_rate': '2.616e-05', 'rewards/chosen': '0.3356', 'rewards/rejected': '-6.363', 'rewards/accuracies': '1', 'rewards/margins': '6.699', 'logps/chosen': '-420.6', 'logps/rejected': '-511.7', 'logits/chosen': '3.701', 'logits/rejected': '3.885', 'epoch': '0.1588'}
95
+ {'loss': '0.01079', 'grad_norm': '0.1769', 'learning_rate': '2.655e-05', 'rewards/chosen': '1.53', 'rewards/rejected': '-5.799', 'rewards/accuracies': '1', 'rewards/margins': '7.329', 'logps/chosen': '-361.6', 'logps/rejected': '-455', 'logits/chosen': '3.477', 'logits/rejected': '3.63', 'epoch': '0.1612'}
96
+ {'loss': '0.01278', 'grad_norm': '0.1559', 'learning_rate': '2.694e-05', 'rewards/chosen': '1.282', 'rewards/rejected': '-6.813', 'rewards/accuracies': '1', 'rewards/margins': '8.095', 'logps/chosen': '-379.4', 'logps/rejected': '-490', 'logits/chosen': '3.44', 'logits/rejected': '3.567', 'epoch': '0.1635'}
97
+ {'loss': '0.007118', 'grad_norm': '0.8207', 'learning_rate': '2.733e-05', 'rewards/chosen': '1.469', 'rewards/rejected': '-6.559', 'rewards/accuracies': '1', 'rewards/margins': '8.027', 'logps/chosen': '-402.8', 'logps/rejected': '-506.3', 'logits/chosen': '3.235', 'logits/rejected': '3.393', 'epoch': '0.1658'}
98
+ main()
99
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 928, in main
100
+ trainer.train(resume_from_checkpoint=resume_from)
101
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2168, in train
102
+ return inner_training_loop(
103
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2535, in _inner_training_loop
104
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
105
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 3807, in training_step
106
+ loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
107
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1810, in compute_loss
108
+ loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
109
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1726, in get_batch_loss_metrics
110
+ model_output = self.concatenated_forward(model, batch)
111
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1600, in concatenated_forward
112
+ outputs = model(input_ids, **model_kwargs)
113
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
114
+ return self._call_impl(*args, **kwargs)
115
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
116
+ return forward_call(*args, **kwargs)
117
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward
118
+ return model_forward(*args, **kwargs)
119
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__
120
+ return convert_to_fp32(self.model_forward(*args, **kwargs))
121
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
122
+ return func(*args, **kwargs)
123
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/peft_model.py", line 1923, in forward
124
+ return self.base_model(
125
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
126
+ return self._call_impl(*args, **kwargs)
127
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
128
+ return forward_call(*args, **kwargs)
129
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 308, in forward
130
+ return self.model.forward(*args, **kwargs)
131
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
132
+ output = module._old_forward(*args, **kwargs)
133
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 810, in wrapper
134
+ output = func(self, *args, **kwargs)
135
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 477, in forward
136
+ outputs: BaseModelOutputWithPast = self.model(
137
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
138
+ return self._call_impl(*args, **kwargs)
139
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
140
+ return forward_call(*args, **kwargs)
141
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
142
+ outputs = func(self, *args, **kwargs)
143
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 412, in forward
144
+ hidden_states = decoder_layer(
145
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/modeling_layers.py", line 93, in __call__
146
+ return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
147
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/_compile.py", line 32, in inner
148
+ return disable_fn(*args, **kwargs)
149
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
150
+ return fn(*args, **kwargs)
151
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 489, in checkpoint
152
+ return CheckpointFunction.apply(function, preserve, *args)
153
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/autograd/function.py", line 575, in apply
154
+ return super().apply(*args, **kwargs) # type: ignore[misc]
155
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 264, in forward
156
+ outputs = run_function(*args)
157
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
158
+ return self._call_impl(*args, **kwargs)
159
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
160
+ return forward_call(*args, **kwargs)
161
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 918, in wrapped_forward
162
+ output = orig_forward(*args, **kwargs)
163
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
164
+ output = module._old_forward(*args, **kwargs)
165
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 300, in forward
166
+ hidden_states, _ = self.self_attn(
167
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
168
+ return self._call_impl(*args, **kwargs)
169
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
170
+ return forward_call(*args, **kwargs)
171
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
172
+ output = module._old_forward(*args, **kwargs)
173
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 220, in forward
174
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
175
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
176
+ return self._call_impl(*args, **kwargs)
177
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
178
+ return forward_call(*args, **kwargs)
179
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/lora/layer.py", line 793, in forward
180
+ result = self.base_layer(x, *args, **kwargs)
181
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
182
+ return self._call_impl(*args, **kwargs)
183
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
184
+ return forward_call(*args, **kwargs)
185
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
186
+ output = module._old_forward(*args, **kwargs)
187
+ KeyboardInterrupt
188
+ Traceback (most recent call last):
189
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 953, in <module>
190
+ main()
191
+ File "/workspace/trainer-kit/DPO-14b/run_dpo.py", line 928, in main
192
+ trainer.train(resume_from_checkpoint=resume_from)
193
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2168, in train
194
+ return inner_training_loop(
195
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 2535, in _inner_training_loop
196
+ tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
197
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/trainer.py", line 3807, in training_step
198
+ loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
199
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1810, in compute_loss
200
+ loss, metrics = self.get_batch_loss_metrics(model, inputs, train_eval="train")
201
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1726, in get_batch_loss_metrics
202
+ model_output = self.concatenated_forward(model, batch)
203
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/trl/trainer/dpo_trainer.py", line 1600, in concatenated_forward
204
+ outputs = model(input_ids, **model_kwargs)
205
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
206
+ return self._call_impl(*args, **kwargs)
207
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
208
+ return forward_call(*args, **kwargs)
209
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 819, in forward
210
+ return model_forward(*args, **kwargs)
211
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/utils/operations.py", line 807, in __call__
212
+ return convert_to_fp32(self.model_forward(*args, **kwargs))
213
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/amp/autocast_mode.py", line 44, in decorate_autocast
214
+ return func(*args, **kwargs)
215
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/peft_model.py", line 1923, in forward
216
+ return self.base_model(
217
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
218
+ return self._call_impl(*args, **kwargs)
219
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
220
+ return forward_call(*args, **kwargs)
221
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 308, in forward
222
+ return self.model.forward(*args, **kwargs)
223
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
224
+ output = module._old_forward(*args, **kwargs)
225
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 810, in wrapper
226
+ output = func(self, *args, **kwargs)
227
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 477, in forward
228
+ outputs: BaseModelOutputWithPast = self.model(
229
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
230
+ return self._call_impl(*args, **kwargs)
231
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
232
+ return forward_call(*args, **kwargs)
233
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 965, in wrapper
234
+ outputs = func(self, *args, **kwargs)
235
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 412, in forward
236
+ hidden_states = decoder_layer(
237
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/modeling_layers.py", line 93, in __call__
238
+ return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
239
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/_compile.py", line 32, in inner
240
+ return disable_fn(*args, **kwargs)
241
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 632, in _fn
242
+ return fn(*args, **kwargs)
243
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 489, in checkpoint
244
+ return CheckpointFunction.apply(function, preserve, *args)
245
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/autograd/function.py", line 575, in apply
246
+ return super().apply(*args, **kwargs) # type: ignore[misc]
247
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 264, in forward
248
+ outputs = run_function(*args)
249
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
250
+ return self._call_impl(*args, **kwargs)
251
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
252
+ return forward_call(*args, **kwargs)
253
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/utils/generic.py", line 918, in wrapped_forward
254
+ output = orig_forward(*args, **kwargs)
255
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
256
+ output = module._old_forward(*args, **kwargs)
257
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 300, in forward
258
+ hidden_states, _ = self.self_attn(
259
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
260
+ return self._call_impl(*args, **kwargs)
261
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
262
+ return forward_call(*args, **kwargs)
263
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
264
+ output = module._old_forward(*args, **kwargs)
265
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/transformers/models/qwen2/modeling_qwen2.py", line 220, in forward
266
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
267
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
268
+ return self._call_impl(*args, **kwargs)
269
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
270
+ return forward_call(*args, **kwargs)
271
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/peft/tuners/lora/layer.py", line 793, in forward
272
+ result = self.base_layer(x, *args, **kwargs)
273
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
274
+ return self._call_impl(*args, **kwargs)
275
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
276
+ return forward_call(*args, **kwargs)
277
+ File "/workspace/llm_finetuning_env/lib/python3.10/site-packages/accelerate/hooks.py", line 175, in new_forward
278
+ output = module._old_forward(*args, **kwargs)
279
+ KeyboardInterrupt
wandb/run-20251226_155650-wbzoafvt/files/requirements.txt ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exceptiongroup==1.3.1
2
+ wheel==0.45.1
3
+ python-dateutil==2.9.0.post0
4
+ nvidia-ml-py==13.580.82
5
+ huggingface_hub==1.2.3
6
+ idna==3.11
7
+ click==8.3.1
8
+ numpy==2.2.6
9
+ httpx==0.28.1
10
+ tokenizers==0.22.1
11
+ sympy==1.13.1
12
+ yarl==1.22.0
13
+ async-timeout==5.0.1
14
+ datasets==4.4.2
15
+ platformdirs==4.5.1
16
+ nvidia-cuda-cupti-cu12==12.1.105
17
+ nvidia-nvtx-cu12==12.1.105
18
+ smmap==5.0.2
19
+ accelerate==1.12.0
20
+ requests==2.32.5
21
+ aiohttp==3.13.2
22
+ bitsandbytes==0.49.0
23
+ nvidia-cublas-cu12==12.1.3.1
24
+ mpmath==1.3.0
25
+ torchaudio==2.5.1+cu121
26
+ nvidia-cuda-runtime-cu12==12.1.105
27
+ typing-inspection==0.4.2
28
+ GitPython==3.1.45
29
+ xxhash==3.6.0
30
+ nvidia-cusolver-cu12==11.4.5.107
31
+ pydantic_core==2.41.5
32
+ six==1.17.0
33
+ torchvision==0.20.1+cu121
34
+ typing_extensions==4.15.0
35
+ triton==3.1.0
36
+ charset-normalizer==3.4.4
37
+ nvitop==1.6.1
38
+ wandb==0.23.1
39
+ regex==2025.11.3
40
+ pip==25.3
41
+ nvidia-cusparse-cu12==12.1.0.106
42
+ pytz==2025.2
43
+ Jinja2==3.1.6
44
+ psutil==7.2.0
45
+ pillow==12.0.0
46
+ packaging==25.0
47
+ safetensors==0.7.0
48
+ sentry-sdk==2.48.0
49
+ gitdb==4.0.12
50
+ httpcore==1.0.9
51
+ setuptools==80.9.0
52
+ nvidia-cufft-cu12==11.0.2.54
53
+ anyio==4.12.0
54
+ transformers==5.0.0.dev0
55
+ pydantic==2.12.5
56
+ fsspec==2025.10.0
57
+ filelock==3.20.0
58
+ PyYAML==6.0.3
59
+ hf-xet==1.2.0
60
+ nvidia-cudnn-cu12==9.1.0.70
61
+ tqdm==4.67.1
62
+ MarkupSafe==2.1.5
63
+ attrs==25.4.0
64
+ nvidia-cuda-nvrtc-cu12==12.1.105
65
+ peft==0.18.0
66
+ aiohappyeyeballs==2.6.1
67
+ networkx==3.4.2
68
+ nvidia-nvjitlink-cu12==12.9.86
69
+ certifi==2025.11.12
70
+ pyarrow==22.0.0
71
+ dill==0.4.0
72
+ protobuf==6.33.2
73
+ aiosignal==1.4.0
74
+ frozenlist==1.8.0
75
+ urllib3==2.6.2
76
+ propcache==0.4.1
77
+ tzdata==2025.3
78
+ pandas==2.3.3
79
+ annotated-types==0.7.0
80
+ shellingham==1.5.4
81
+ nvidia-nccl-cu12==2.21.5
82
+ multidict==6.7.0
83
+ nvidia-curand-cu12==10.3.2.106
84
+ trl==0.26.2
85
+ torch==2.5.1+cu121
86
+ h11==0.16.0
87
+ multiprocess==0.70.18
88
+ typer-slim==0.21.0
89
+ wheel==0.45.1
90
+ tomli==2.0.1
91
+ autocommand==2.2.2
92
+ jaraco.context==5.3.0
93
+ zipp==3.19.2
94
+ packaging==24.2
95
+ inflect==7.3.1
96
+ typing_extensions==4.12.2
97
+ platformdirs==4.2.2
98
+ jaraco.functools==4.0.1
99
+ jaraco.collections==5.1.0
100
+ jaraco.text==3.12.1
101
+ backports.tarfile==1.2.0
102
+ more-itertools==10.3.0
103
+ importlib_metadata==8.0.0
104
+ typeguard==4.3.0
wandb/run-20251226_155650-wbzoafvt/files/wandb-metadata.json ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "os": "Linux-6.12.46+-x86_64-with-glibc2.35",
3
+ "python": "CPython 3.10.12",
4
+ "startedAt": "2025-12-26T15:56:50.015524Z",
5
+ "args": [
6
+ "--config",
7
+ "config_dpo.yaml"
8
+ ],
9
+ "program": "/workspace/trainer-kit/DPO-14b/run_dpo.py",
10
+ "codePath": "run_dpo.py",
11
+ "codePathLocal": "run_dpo.py",
12
+ "email": "shaiksirajuddin9949@gmail.com",
13
+ "root": "runs/dpo_run_14b_v1",
14
+ "host": "a100-2gpu-shell-session-757d587799-mfdvv",
15
+ "executable": "/workspace/llm_finetuning_env/bin/python",
16
+ "cpu_count": 12,
17
+ "cpu_count_logical": 24,
18
+ "gpu": "NVIDIA A100-SXM4-80GB",
19
+ "gpu_count": 2,
20
+ "disk": {
21
+ "/": {
22
+ "total": "791251738624",
23
+ "used": "323290275840"
24
+ }
25
+ },
26
+ "memory": {
27
+ "total": "359047892992"
28
+ },
29
+ "gpu_nvidia": [
30
+ {
31
+ "name": "NVIDIA A100-SXM4-80GB",
32
+ "memoryTotal": "85899345920",
33
+ "cudaCores": 6912,
34
+ "architecture": "Ampere",
35
+ "uuid": "GPU-989794b0-ec3b-13bf-db9f-3fbe341497ba"
36
+ },
37
+ {
38
+ "name": "NVIDIA A100-SXM4-80GB",
39
+ "memoryTotal": "85899345920",
40
+ "cudaCores": 6912,
41
+ "architecture": "Ampere",
42
+ "uuid": "GPU-3790aa64-60ef-9eac-b0b1-b278ee8c0d40"
43
+ }
44
+ ],
45
+ "cudaVersion": "13.0",
46
+ "writerId": "afn1h9dtq29ul6sseazq0ojw1mqcn19i"
47
+ }
wandb/run-20251226_155650-wbzoafvt/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"train/logits/chosen":3.23529052734375,"eval/logits/chosen":3.6694726943969727,"train/logits/rejected":3.393266201019287,"train/learning_rate":2.7325581395348836e-05,"eval/rewards/rejected":-6.150709629058838,"eval/logits/rejected":3.8436598777770996,"eval/rewards/chosen":0.5319492816925049,"_timestamp":1.7667683362567813e+09,"train/rewards/rejected":-6.558856964111328,"eval/runtime":454.8045,"eval/rewards/accuracies":0.9934383034706116,"train/logps/chosen":-402.8253479003906,"train/rewards/accuracies":1,"_wandb":{"runtime":3729},"train/grad_norm":0.820688009262085,"eval/rewards/margins":6.682660102844238,"_runtime":3729,"eval/logps/rejected":-457.28314208984375,"train/rewards/chosen":1.4685018062591553,"eval/samples_per_second":1.675,"eval/steps_per_second":1.675,"_step":75,"train/rewards/margins":8.027359008789062,"eval/logps/chosen":-365.087646484375,"train/logps/rejected":-506.32000732421875,"train/global_step":142,"train/loss":0.007118214387446642,"eval/loss":0.024107323959469795,"train/epoch":0.16583941605839417}
wandb/run-20251226_155650-wbzoafvt/logs/debug-core.log ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T15:56:50.109153388Z","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpa97nk_g5/port-148906.txt","pid":148906,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false}
2
+ {"time":"2025-12-26T15:56:50.110079679Z","level":"INFO","msg":"server: will exit if parent process dies","ppid":148906}
3
+ {"time":"2025-12-26T15:56:50.110081586Z","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-148906-148983-325621513/socket","Net":"unix"}}
4
+ {"time":"2025-12-26T15:56:50.290687433Z","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"}
5
+ {"time":"2025-12-26T15:56:50.297246166Z","level":"INFO","msg":"handleInformInit: received","streamId":"wbzoafvt","id":"1(@)"}
6
+ {"time":"2025-12-26T15:56:50.452581495Z","level":"INFO","msg":"handleInformInit: stream started","streamId":"wbzoafvt","id":"1(@)"}
7
+ {"time":"2025-12-26T16:59:00.070455239Z","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"1(@)"}
8
+ {"time":"2025-12-26T16:59:00.070533969Z","level":"INFO","msg":"connection: closing","id":"1(@)"}
9
+ {"time":"2025-12-26T16:59:00.0705585Z","level":"INFO","msg":"server is shutting down"}
10
+ {"time":"2025-12-26T16:59:00.070589863Z","level":"INFO","msg":"connection: closed successfully","id":"1(@)"}
11
+ {"time":"2025-12-26T16:59:00.070654266Z","level":"INFO","msg":"server: listener closed","addr":{"Name":"/tmp/wandb-148906-148983-325621513/socket","Net":"unix"}}
12
+ {"time":"2025-12-26T16:59:00.47438251Z","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"1(@)"}
13
+ {"time":"2025-12-26T16:59:00.474428554Z","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"1(@)"}
14
+ {"time":"2025-12-26T16:59:00.474451644Z","level":"INFO","msg":"server is closed"}
wandb/run-20251226_155650-wbzoafvt/logs/debug-internal.log ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"time":"2025-12-26T15:56:50.297401502Z","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2025-12-26T15:56:50.452320078Z","level":"INFO","msg":"stream: created new stream","id":"wbzoafvt"}
3
+ {"time":"2025-12-26T15:56:50.452494836Z","level":"INFO","msg":"handler: started","stream_id":"wbzoafvt"}
4
+ {"time":"2025-12-26T15:56:50.452572405Z","level":"INFO","msg":"stream: started","id":"wbzoafvt"}
5
+ {"time":"2025-12-26T15:56:50.452599156Z","level":"INFO","msg":"writer: started","stream_id":"wbzoafvt"}
6
+ {"time":"2025-12-26T15:56:50.452607804Z","level":"INFO","msg":"sender: started","stream_id":"wbzoafvt"}
7
+ {"time":"2025-12-26T16:59:00.070531235Z","level":"INFO","msg":"stream: closing","id":"wbzoafvt"}
8
+ {"time":"2025-12-26T16:59:00.346670237Z","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"}
9
+ {"time":"2025-12-26T16:59:00.473496131Z","level":"INFO","msg":"handler: closed","stream_id":"wbzoafvt"}
10
+ {"time":"2025-12-26T16:59:00.473589831Z","level":"INFO","msg":"sender: closed","stream_id":"wbzoafvt"}
11
+ {"time":"2025-12-26T16:59:00.473602236Z","level":"INFO","msg":"stream: closed","id":"wbzoafvt"}
wandb/run-20251226_155650-wbzoafvt/logs/debug.log ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Configure stats pid to 148906
3
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Loading settings from /root/.config/wandb/settings
4
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Loading settings from /workspace/trainer-kit/DPO-14b/wandb/settings
5
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:setup_run_log_directory():714] Logging user logs to runs/dpo_run_14b_v1/wandb/run-20251226_155650-wbzoafvt/logs/debug.log
7
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to runs/dpo_run_14b_v1/wandb/run-20251226_155650-wbzoafvt/logs/debug-internal.log
8
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:init():841] calling init triggers
9
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'model': {'repo_id': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'revision': None, 'base_local_dir': 'base_model', 'trust_remote_code': True, 'tokenizer_use_fast': True, 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'use_4bit': False, 'bnb_4bit_quant_type': 'nf4', 'bnb_4bit_use_double_quant': False, 'bnb_4bit_compute_dtype': 'bfloat16', 'attn_implementation': None}, 'data': {'train_jsonl': 'dpo_pairs_generated.jsonl', 'eval_jsonl': None, 'eval_split_ratio': 0.1, 'prompt_field': 'prompt', 'chosen_field': 'chosen', 'rejected_field': 'rejected', 'score_field': 'f1_score', 'format_type': 'chatml', 'system_prompt': 'You are a Hyperswitch Rust code analyzer. Identify functions/structs that need modification for a given task.\n\n## Output Format\n\n##OUTPUT\nExplain the data flow and why each component must change:\n- Flow: [Input → Processing → Output with arrows]\n- For each component: "The [ComponentName] ([path]) must [action] because [reason]—without this, [consequence]"\n- Explain coupling between components\n\n##SELECT\nmodify::crates/path/to/file.rs::impl::ComponentName\nadd::crates/another/file.rs::function::AnotherComponent\n<EOS>\n\n## Rules\n\n1. Use full paths: `remove::crates/folder/file.rs::Type::Name`\n2. Use `::` for nested items: `status::StructName::Type::Name`\n3. Always explain "must change because" and "without this"\n3. Types of components: function, struct, enum, impl, trait\n4. If there is extra information (e.g., enum variants), include that too.\n5. 
Start with ##OUTPUT, end with ##SELECT, terminate with <EOS>\n', 'max_length': 2048, 'shuffle': True, 'num_proc': 4}, 'peft': {'enabled': True, 'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'target_modules': 'auto'}, 'dpo': {'beta': 0.1, 'label_smoothing': 0.0, 'loss_type': 'sigmoid', 'use_reference_model': True, 'reference_free': False}, 'train': {'num_train_epochs': 3, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'learning_rate': '5e-5', 'weight_decay': 0.0, 'warmup_ratio': 0.1, 'lr_scheduler_type': 'cosine', 'optim': 'adamw_torch', 'max_grad_norm': 1.0, 'gradient_checkpointing': True, 'logging_steps': 2, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 10, 'evaluation_strategy': 'steps', 'eval_steps': 25, 'load_best_model_at_end': True, 'early_stopping': {'enabled': True, 'patience': 5, 'min_delta': 0.001, 'metric': 'eval_loss', 'mode': 'min'}, 'resume_from_checkpoint': 'auto'}, 'run_dir': 'runs/dpo_run_14b_v1', '_wandb': {}}
11
+ 2025-12-26 15:56:50,017 INFO MainThread:148906 [wandb_init.py:init():889] starting backend
12
+ 2025-12-26 15:56:50,290 INFO MainThread:148906 [wandb_init.py:init():892] sending inform_init request
13
+ 2025-12-26 15:56:50,295 INFO MainThread:148906 [wandb_init.py:init():900] backend started and connected
14
+ 2025-12-26 15:56:50,297 INFO MainThread:148906 [wandb_init.py:init():970] updated telemetry
15
+ 2025-12-26 15:56:50,297 INFO MainThread:148906 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2025-12-26 15:56:50,648 INFO MainThread:148906 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2025-12-26 15:56:50,757 INFO MainThread:148906 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2025-12-26 15:56:50,762 INFO MainThread:148906 [wandb_init.py:init():1081] run started, returning control to user process
22
+ 2025-12-26 15:57:33,783 INFO MainThread:148906 [wandb_run.py:_config_callback():1396] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': 'LORA', 'auto_mapping': None, 'peft_version': '0.18.0', 'base_model_name_or_path': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': ['k_proj', 'o_proj', 'v_proj', 'q_proj'], 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'trainable_token_indices': None, 'loftq_config': {}, 'eva_config': None, 'corda_config': None, 'use_dora': False, 'alora_invocation_tokens': None, 'use_qalora': False, 'qalora_group_size': 16, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False, 'target_parameters': None, 'arrow_config': None, 'ensure_weight_tying': False}}, 'vocab_size': 152064, 'max_position_embeddings': 32768, 'hidden_size': 5120, 'intermediate_size': 13824, 'num_hidden_layers': 48, 'num_attention_heads': 40, 'use_sliding_window': False, 'sliding_window': None, 'max_window_layers': 48, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-06, 'use_cache': False, 'attention_dropout': 0.0, 'layer_types': ['full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 
'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention', 'full_attention'], 'rope_parameters': {'rope_theta': 1000000.0, 'rope_type': 'default'}, 'return_dict': True, 'output_hidden_states': False, 'dtype': 'bfloat16', 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'architectures': ['Qwen2ForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'task_specific_params': None, 'problem_type': None, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 151643, 'eos_token_id': 151643, 'sep_token_id': None, 'decoder_start_token_id': None, '_name_or_path': '../../Models/Qwen2.5-Coder-14B-CPT-SFT', 'transformers_version': '5.0.0.dev0', 'model_type': 'qwen2', 'output_attentions': False, 'output_dir': 'runs/dpo_run_14b_v1', 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 5e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 3, 'max_steps': -1, 'lr_scheduler_type': 'cosine', 'lr_scheduler_kwargs': None, 'warmup_ratio': 0.1, 'warmup_steps': 0.1, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': None, 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 2, 
'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 100, 'save_total_limit': 10, 'enable_jit_checkpoint': False, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'use_cpu': False, 'seed': 42, 'data_seed': None, 'bf16': True, 'fp16': False, 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': None, 'local_rank': -1, 'ddp_backend': None, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 25, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': False, 'label_names': None, 'load_best_model_at_end': True, 'metric_for_best_model': 'eval_loss', 'greater_is_better': False, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'parallelism_config': None, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'adamw_torch', 'optim_args': None, 'group_by_length': False, 'length_column_name': 'length', 'report_to': ['wandb'], 'project': 'huggingface', 'trackio_space_id': 'trackio', 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '<HUB_TOKEN>', 'hub_private_repo': None, 'hub_always_push': False, 'hub_revision': None, 'gradient_checkpointing': True, 'gradient_checkpointing_kwargs': None, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'auto_find_batch_size': False, 'full_determinism': False, 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 
'torch_compile_mode': None, 'include_num_input_tokens_seen': 'no', 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'liger_kernel_config': None, 'eval_use_gather_object': False, 'average_tokens_across_devices': True, 'model_init_kwargs': None, 'ref_model_init_kwargs': None, 'model_adapter_name': None, 'ref_adapter_name': None, 'force_use_ref_model': False, 'disable_dropout': True, 'use_logits_to_keep': False, 'dataset_num_proc': None, 'pad_token': '<PAD_TOKEN>', 'label_pad_token_id': -100, 'max_prompt_length': 1024, 'max_completion_length': None, 'max_length': 2048, 'truncation_mode': 'keep_end', 'padding_free': False, 'precompute_ref_log_probs': False, 'precompute_ref_batch_size': None, 'tools': None, 'loss_type': 'sigmoid', 'use_liger_loss': None, 'base_model_attribute_name': 'model', 'beta': 0.1, 'f_divergence_type': 'reverse_kl', 'f_alpha_divergence_coef': 1.0, 'reference_free': False, 'label_smoothing': 0.0, 'use_weighting': False, 'rpo_alpha': None, 'ld_alpha': None, 'discopop_tau': 0.05, 'loss_weights': None, 'sync_ref_model': False, 'ref_model_mixup_alpha': 0.6, 'ref_model_sync_steps': 512, 'generate_during_eval': False}
23
+ 2025-12-26 15:57:33,791 INFO MainThread:148906 [wandb_config.py:__setitem__():154] [no run ID] config set model/num_parameters = 14795199488 - <bound method Run._config_callback of <wandb.sdk.wandb_run.Run object at 0x7da0304cf970>>
24
+ 2025-12-26 15:57:33,792 INFO MainThread:148906 [wandb_run.py:_config_callback():1396] config_cb model/num_parameters 14795199488 None
25
+ 2025-12-26 16:59:00,070 INFO wandb-AsyncioManager-main:148906 [service_client.py:_forward_responses():80] Reached EOF.
26
+ 2025-12-26 16:59:00,070 INFO wandb-AsyncioManager-main:148906 [mailbox.py:close():137] Closing mailbox, abandoning 1 handles.
wandb/run-20251226_155650-wbzoafvt/run-wbzoafvt.wandb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00679558234aafe800040b8c88fedf7b94f44f3b9a953dcb8eb28ed2f6af9ccb
3
+ size 2174075