saranshagarwal2020 committed on
Commit
2570d5e
·
verified ·
1 Parent(s): 4d57d68

Upload folder using huggingface_hub

Browse files
Files changed (42) hide show
  1. README.md +68 -0
  2. checkpoint-300/chat_template.jinja +45 -0
  3. checkpoint-300/config.json +61 -0
  4. checkpoint-300/generation_config.json +9 -0
  5. checkpoint-300/model.safetensors +3 -0
  6. checkpoint-300/optimizer.pt +3 -0
  7. checkpoint-300/rng_state.pth +3 -0
  8. checkpoint-300/scheduler.pt +3 -0
  9. checkpoint-300/tokenizer.json +0 -0
  10. checkpoint-300/tokenizer_config.json +20 -0
  11. checkpoint-300/trainer_state.json +532 -0
  12. checkpoint-300/training_args.bin +3 -0
  13. checkpoint-400/chat_template.jinja +45 -0
  14. checkpoint-400/config.json +61 -0
  15. checkpoint-400/generation_config.json +9 -0
  16. checkpoint-400/model.safetensors +3 -0
  17. checkpoint-400/optimizer.pt +3 -0
  18. checkpoint-400/rng_state.pth +3 -0
  19. checkpoint-400/scheduler.pt +3 -0
  20. checkpoint-400/tokenizer.json +0 -0
  21. checkpoint-400/tokenizer_config.json +20 -0
  22. checkpoint-400/trainer_state.json +698 -0
  23. checkpoint-400/training_args.bin +3 -0
  24. checkpoint-432/chat_template.jinja +45 -0
  25. checkpoint-432/config.json +61 -0
  26. checkpoint-432/generation_config.json +9 -0
  27. checkpoint-432/model.safetensors +3 -0
  28. checkpoint-432/optimizer.pt +3 -0
  29. checkpoint-432/rng_state.pth +3 -0
  30. checkpoint-432/scheduler.pt +3 -0
  31. checkpoint-432/tokenizer.json +0 -0
  32. checkpoint-432/tokenizer_config.json +20 -0
  33. checkpoint-432/trainer_state.json +743 -0
  34. checkpoint-432/training_args.bin +3 -0
  35. final_model/chat_template.jinja +45 -0
  36. final_model/config.json +61 -0
  37. final_model/generation_config.json +9 -0
  38. final_model/model.safetensors +3 -0
  39. final_model/tokenizer.json +0 -0
  40. final_model/tokenizer_config.json +20 -0
  41. final_model/training_args.bin +3 -0
  42. run_meta.json +39 -0
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: LiquidAI/LFM2.5-1.2B-Instruct
3
+ library_name: transformers
4
+ model_name: dpo_fft_LFM2.5-1.2B-Instruct_argilla__distilabel-math-preference-dpo_20260222_195126
5
+ tags:
6
+ - generated_from_trainer
7
+ - dpo
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for dpo_fft_LFM2.5-1.2B-Instruct_argilla__distilabel-math-preference-dpo_20260222_195126
13
+
14
+ This model is a fine-tuned version of [LiquidAI/LFM2.5-1.2B-Instruct](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="None", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.28.0
38
+ - Transformers: 5.2.0
39
+ - Pytorch: 2.8.0
40
+ - Datasets: 4.5.0
41
+ - Tokenizers: 0.22.2
42
+
43
+ ## Citations
44
+
45
+ Cite DPO as:
46
+
47
+ ```bibtex
48
+ @inproceedings{rafailov2023direct,
49
+ title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
50
+ author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
51
+ year = 2023,
52
+ booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
53
+ url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
54
+ editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
55
+ }
56
+ ```
57
+
58
+ Cite TRL as:
59
+
60
+ ```bibtex
61
+ @software{vonwerra2020trl,
62
+ title = {{TRL: Transformers Reinforcement Learning}},
63
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
64
+ license = {Apache-2.0},
65
+ url = {https://github.com/huggingface/trl},
66
+ year = {2020}
67
+ }
68
+ ```
checkpoint-300/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
checkpoint-300/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Lfm2ForCausalLM"
4
+ ],
5
+ "block_auto_adjust_ff_dim": true,
6
+ "block_dim": 2048,
7
+ "block_ff_dim": 12288,
8
+ "block_ffn_dim_multiplier": 1.0,
9
+ "block_mlp_init_scale": 1.0,
10
+ "block_multiple_of": 256,
11
+ "block_norm_eps": 1e-05,
12
+ "block_out_init_scale": 1.0,
13
+ "block_use_swiglu": true,
14
+ "block_use_xavier_init": true,
15
+ "bos_token_id": 1,
16
+ "conv_L_cache": 3,
17
+ "conv_bias": false,
18
+ "conv_dim": 2048,
19
+ "conv_use_xavier_init": true,
20
+ "dtype": "bfloat16",
21
+ "eos_token_id": 7,
22
+ "hidden_size": 2048,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 12288,
25
+ "layer_types": [
26
+ "conv",
27
+ "conv",
28
+ "full_attention",
29
+ "conv",
30
+ "conv",
31
+ "full_attention",
32
+ "conv",
33
+ "conv",
34
+ "full_attention",
35
+ "conv",
36
+ "full_attention",
37
+ "conv",
38
+ "full_attention",
39
+ "conv",
40
+ "full_attention",
41
+ "conv"
42
+ ],
43
+ "max_position_embeddings": 128000,
44
+ "model_type": "lfm2",
45
+ "norm_eps": 1e-05,
46
+ "num_attention_heads": 32,
47
+ "num_heads": 32,
48
+ "num_hidden_layers": 16,
49
+ "num_key_value_heads": 8,
50
+ "pad_token_id": 0,
51
+ "rope_parameters": {
52
+ "rope_theta": 1000000.0,
53
+ "rope_type": "default"
54
+ },
55
+ "tie_embedding": true,
56
+ "tie_word_embeddings": true,
57
+ "transformers_version": "5.2.0",
58
+ "use_cache": false,
59
+ "use_pos_enc": true,
60
+ "vocab_size": 65536
61
+ }
checkpoint-300/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 7
6
+ ],
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.2.0"
9
+ }
checkpoint-300/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea8a8b6c80f8066d94803546515de24808278c0cfb7efff8e029129e40dd94ff
3
+ size 2340697936
checkpoint-300/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0115530fa698b4ae187d35c21f6abaf53d807fda66ea7b7457a826086059b84
3
+ size 3178927435
checkpoint-300/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01f9a0f7843a37be87edd23f4e88aa93b38b95cc2c07503eeb1cf2e4632453a2
3
+ size 14645
checkpoint-300/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:426f979e5c76efd0e1e3fe301e36c368d85ae96e9cf26208baaf62ca2cac3280
3
+ size 1465
checkpoint-300/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-300/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "padding_side": "right",
15
+ "sp_model_kwargs": {},
16
+ "spaces_between_special_tokens": false,
17
+ "tokenizer_class": "TokenizersBackend",
18
+ "use_default_system_prompt": false,
19
+ "use_fast": true
20
+ }
checkpoint-300/trainer_state.json ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 300,
3
+ "best_metric": 0.6872237920761108,
4
+ "best_model_checkpoint": "models/dpo_fft_LFM2.5-1.2B-Instruct_argilla__distilabel-math-preference-dpo_20260222_195126/checkpoint-300",
5
+ "epoch": 2.0834782608695654,
6
+ "eval_steps": 100,
7
+ "global_step": 300,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06956521739130435,
14
+ "grad_norm": 44.25,
15
+ "learning_rate": 1.0227272727272728e-07,
16
+ "logits/chosen": -1.0599091053009033,
17
+ "logits/rejected": -1.042307734489441,
18
+ "logps/chosen": -332.13507080078125,
19
+ "logps/rejected": -332.9361267089844,
20
+ "loss": 0.6927361965179444,
21
+ "rewards/accuracies": 0.41874998807907104,
22
+ "rewards/chosen": 0.01432617288082838,
23
+ "rewards/margins": 0.0042648655362427235,
24
+ "rewards/rejected": 0.010061310604214668,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.1391304347826087,
29
+ "grad_norm": 50.5,
30
+ "learning_rate": 2.159090909090909e-07,
31
+ "logits/chosen": -1.0978257656097412,
32
+ "logits/rejected": -1.09658682346344,
33
+ "logps/chosen": -329.22100830078125,
34
+ "logps/rejected": -312.2664794921875,
35
+ "loss": 0.7003747940063476,
36
+ "rewards/accuracies": 0.44999998807907104,
37
+ "rewards/chosen": 0.004466252867132425,
38
+ "rewards/margins": -0.011238865554332733,
39
+ "rewards/rejected": 0.01570511981844902,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.20869565217391303,
44
+ "grad_norm": 54.75,
45
+ "learning_rate": 3.295454545454545e-07,
46
+ "logits/chosen": -1.1366004943847656,
47
+ "logits/rejected": -1.1707069873809814,
48
+ "logps/chosen": -328.1323547363281,
49
+ "logps/rejected": -305.90618896484375,
50
+ "loss": 0.7031519412994385,
51
+ "rewards/accuracies": 0.4312500059604645,
52
+ "rewards/chosen": 0.03125340864062309,
53
+ "rewards/margins": -0.01610126718878746,
54
+ "rewards/rejected": 0.04735467582941055,
55
+ "step": 30
56
+ },
57
+ {
58
+ "epoch": 0.2782608695652174,
59
+ "grad_norm": 50.5,
60
+ "learning_rate": 4.4318181818181817e-07,
61
+ "logits/chosen": -1.0889461040496826,
62
+ "logits/rejected": -1.1017862558364868,
63
+ "logps/chosen": -338.80938720703125,
64
+ "logps/rejected": -328.07464599609375,
65
+ "loss": 0.6893259048461914,
66
+ "rewards/accuracies": 0.4937500059604645,
67
+ "rewards/chosen": 0.09773217141628265,
68
+ "rewards/margins": 0.01154844556003809,
69
+ "rewards/rejected": 0.08618371188640594,
70
+ "step": 40
71
+ },
72
+ {
73
+ "epoch": 0.34782608695652173,
74
+ "grad_norm": 53.5,
75
+ "learning_rate": 4.997951542310825e-07,
76
+ "logits/chosen": -1.1367381811141968,
77
+ "logits/rejected": -1.1468210220336914,
78
+ "logps/chosen": -329.12823486328125,
79
+ "logps/rejected": -323.8036804199219,
80
+ "loss": 0.6982084751129151,
81
+ "rewards/accuracies": 0.44999998807907104,
82
+ "rewards/chosen": 0.1568203717470169,
83
+ "rewards/margins": -0.004218918737024069,
84
+ "rewards/rejected": 0.1610392928123474,
85
+ "step": 50
86
+ },
87
+ {
88
+ "epoch": 0.41739130434782606,
89
+ "grad_norm": 64.0,
90
+ "learning_rate": 4.98158401695492e-07,
91
+ "logits/chosen": -1.0899083614349365,
92
+ "logits/rejected": -1.0698351860046387,
93
+ "logps/chosen": -335.6194152832031,
94
+ "logps/rejected": -329.00653076171875,
95
+ "loss": 0.6905360221862793,
96
+ "rewards/accuracies": 0.53125,
97
+ "rewards/chosen": 0.1543850153684616,
98
+ "rewards/margins": 0.010315253399312496,
99
+ "rewards/rejected": 0.14406974613666534,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.48695652173913045,
104
+ "grad_norm": 72.5,
105
+ "learning_rate": 4.948956212473369e-07,
106
+ "logits/chosen": -1.1223671436309814,
107
+ "logits/rejected": -1.113906741142273,
108
+ "logps/chosen": -319.424560546875,
109
+ "logps/rejected": -317.76934814453125,
110
+ "loss": 0.7009393215179444,
111
+ "rewards/accuracies": 0.4749999940395355,
112
+ "rewards/chosen": 0.18402941524982452,
113
+ "rewards/margins": -0.010006649419665337,
114
+ "rewards/rejected": 0.19403605163097382,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.5565217391304348,
119
+ "grad_norm": 52.5,
120
+ "learning_rate": 4.900281918608732e-07,
121
+ "logits/chosen": -1.1157792806625366,
122
+ "logits/rejected": -1.1078118085861206,
123
+ "logps/chosen": -314.5404052734375,
124
+ "logps/rejected": -311.91656494140625,
125
+ "loss": 0.6877760887145996,
126
+ "rewards/accuracies": 0.4749999940395355,
127
+ "rewards/chosen": 0.19012701511383057,
128
+ "rewards/margins": 0.016789216548204422,
129
+ "rewards/rejected": 0.17333778738975525,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 0.6260869565217392,
134
+ "grad_norm": 47.75,
135
+ "learning_rate": 4.835880067784441e-07,
136
+ "logits/chosen": -1.0738165378570557,
137
+ "logits/rejected": -1.0576341152191162,
138
+ "logps/chosen": -336.3821716308594,
139
+ "logps/rejected": -331.9998779296875,
140
+ "loss": 0.6956596374511719,
141
+ "rewards/accuracies": 0.4375,
142
+ "rewards/chosen": 0.21225734055042267,
143
+ "rewards/margins": 0.0009234324097633362,
144
+ "rewards/rejected": 0.21133390069007874,
145
+ "step": 90
146
+ },
147
+ {
148
+ "epoch": 0.6956521739130435,
149
+ "grad_norm": 49.0,
150
+ "learning_rate": 4.7561726453386744e-07,
151
+ "logits/chosen": -1.0457407236099243,
152
+ "logits/rejected": -1.038447618484497,
153
+ "logps/chosen": -330.8192138671875,
154
+ "logps/rejected": -323.5484924316406,
155
+ "loss": 0.6817105770111084,
156
+ "rewards/accuracies": 0.581250011920929,
157
+ "rewards/chosen": 0.26413798332214355,
158
+ "rewards/margins": 0.029450010508298874,
159
+ "rewards/rejected": 0.2346879541873932,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.6956521739130435,
164
+ "eval_logits/chosen": -1.0708128213882446,
165
+ "eval_logits/rejected": -1.0917303562164307,
166
+ "eval_logps/chosen": -317.1337585449219,
167
+ "eval_logps/rejected": -318.0965270996094,
168
+ "eval_loss": 0.6924862265586853,
169
+ "eval_rewards/accuracies": 0.4838709533214569,
170
+ "eval_rewards/chosen": 0.2332489788532257,
171
+ "eval_rewards/margins": 0.0038641756400465965,
172
+ "eval_rewards/rejected": 0.22938479483127594,
173
+ "eval_runtime": 11.3087,
174
+ "eval_samples_per_second": 10.7,
175
+ "eval_steps_per_second": 2.741,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 0.7652173913043478,
180
+ "grad_norm": 47.0,
181
+ "learning_rate": 4.6616819245164655e-07,
182
+ "logits/chosen": -1.0736799240112305,
183
+ "logits/rejected": -1.0783708095550537,
184
+ "logps/chosen": -314.97943115234375,
185
+ "logps/rejected": -308.35675048828125,
186
+ "loss": 0.6984636306762695,
187
+ "rewards/accuracies": 0.48750001192092896,
188
+ "rewards/chosen": 0.22377920150756836,
189
+ "rewards/margins": -0.0050087496638298035,
190
+ "rewards/rejected": 0.22878792881965637,
191
+ "step": 110
192
+ },
193
+ {
194
+ "epoch": 0.8347826086956521,
195
+ "grad_norm": 51.25,
196
+ "learning_rate": 4.55302704433743e-07,
197
+ "logits/chosen": -1.048730731010437,
198
+ "logits/rejected": -1.0437910556793213,
199
+ "logps/chosen": -332.6585388183594,
200
+ "logps/rejected": -319.9508361816406,
201
+ "loss": 0.6809643268585205,
202
+ "rewards/accuracies": 0.59375,
203
+ "rewards/chosen": 0.3183898627758026,
204
+ "rewards/margins": 0.03039008378982544,
205
+ "rewards/rejected": 0.2879997789859772,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 0.9043478260869565,
210
+ "grad_norm": 57.5,
211
+ "learning_rate": 4.4309199527622254e-07,
212
+ "logits/chosen": -1.0928480625152588,
213
+ "logits/rejected": -1.0909947156906128,
214
+ "logps/chosen": -333.54510498046875,
215
+ "logps/rejected": -321.5794372558594,
216
+ "loss": 0.6884570121765137,
217
+ "rewards/accuracies": 0.518750011920929,
218
+ "rewards/chosen": 0.3165169656276703,
219
+ "rewards/margins": 0.015730690211057663,
220
+ "rewards/rejected": 0.3007862865924835,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.9739130434782609,
225
+ "grad_norm": 50.75,
226
+ "learning_rate": 4.2961607417396517e-07,
227
+ "logits/chosen": -1.0702852010726929,
228
+ "logits/rejected": -1.0860363245010376,
229
+ "logps/chosen": -336.94122314453125,
230
+ "logps/rejected": -312.6028137207031,
231
+ "loss": 0.6929869651794434,
232
+ "rewards/accuracies": 0.5,
233
+ "rewards/chosen": 0.3366253077983856,
234
+ "rewards/margins": 0.008591671474277973,
235
+ "rewards/rejected": 0.3280336260795593,
236
+ "step": 140
237
+ },
238
+ {
239
+ "epoch": 1.0417391304347827,
240
+ "grad_norm": 58.25,
241
+ "learning_rate": 4.1496324047009244e-07,
242
+ "logits/chosen": -1.137653112411499,
243
+ "logits/rejected": -1.1385780572891235,
244
+ "logps/chosen": -297.7463073730469,
245
+ "logps/rejected": -304.8460388183594,
246
+ "loss": 0.6796726703643798,
247
+ "rewards/accuracies": 0.5320512652397156,
248
+ "rewards/chosen": 0.36476510763168335,
249
+ "rewards/margins": 0.029467487707734108,
250
+ "rewards/rejected": 0.3352976143360138,
251
+ "step": 150
252
+ },
253
+ {
254
+ "epoch": 1.111304347826087,
255
+ "grad_norm": 62.0,
256
+ "learning_rate": 3.9922950508520126e-07,
257
+ "logits/chosen": -1.0764613151550293,
258
+ "logits/rejected": -1.073861837387085,
259
+ "logps/chosen": -325.74151611328125,
260
+ "logps/rejected": -314.5641174316406,
261
+ "loss": 0.6664072513580322,
262
+ "rewards/accuracies": 0.6312500238418579,
263
+ "rewards/chosen": 0.37418311834335327,
264
+ "rewards/margins": 0.06225377321243286,
265
+ "rewards/rejected": 0.3119293451309204,
266
+ "step": 160
267
+ },
268
+ {
269
+ "epoch": 1.1808695652173913,
270
+ "grad_norm": 48.25,
271
+ "learning_rate": 3.8251796141741945e-07,
272
+ "logits/chosen": -1.1032686233520508,
273
+ "logits/rejected": -1.1010105609893799,
274
+ "logps/chosen": -324.8043212890625,
275
+ "logps/rejected": -314.7898254394531,
276
+ "loss": 0.6656156063079834,
277
+ "rewards/accuracies": 0.643750011920929,
278
+ "rewards/chosen": 0.39548203349113464,
279
+ "rewards/margins": 0.06372012943029404,
280
+ "rewards/rejected": 0.33176189661026,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 1.2504347826086957,
285
+ "grad_norm": 73.5,
286
+ "learning_rate": 3.649381098353834e-07,
287
+ "logits/chosen": -1.108781099319458,
288
+ "logits/rejected": -1.081012487411499,
289
+ "logps/chosen": -333.0024719238281,
290
+ "logps/rejected": -317.6542663574219,
291
+ "loss": 0.6682669162750244,
292
+ "rewards/accuracies": 0.6187499761581421,
293
+ "rewards/chosen": 0.4114314019680023,
294
+ "rewards/margins": 0.06464624404907227,
295
+ "rewards/rejected": 0.34678512811660767,
296
+ "step": 180
297
+ },
298
+ {
299
+ "epoch": 1.32,
300
+ "grad_norm": 51.75,
301
+ "learning_rate": 3.466051401903162e-07,
302
+ "logits/chosen": -1.0279661417007446,
303
+ "logits/rejected": -1.028050422668457,
304
+ "logps/chosen": -342.05322265625,
305
+ "logps/rejected": -329.083984375,
306
+ "loss": 0.6815151691436767,
307
+ "rewards/accuracies": 0.5249999761581421,
308
+ "rewards/chosen": 0.4475061893463135,
309
+ "rewards/margins": 0.03387105464935303,
310
+ "rewards/rejected": 0.41363516449928284,
311
+ "step": 190
312
+ },
313
+ {
314
+ "epoch": 1.3895652173913042,
315
+ "grad_norm": 43.25,
316
+ "learning_rate": 3.276391770484606e-07,
317
+ "logits/chosen": -1.0554686784744263,
318
+ "logits/rejected": -1.0444362163543701,
319
+ "logps/chosen": -323.93359375,
320
+ "logps/rejected": -300.8354187011719,
321
+ "loss": 0.6729463577270508,
322
+ "rewards/accuracies": 0.5874999761581421,
323
+ "rewards/chosen": 0.4051585793495178,
324
+ "rewards/margins": 0.04842006787657738,
325
+ "rewards/rejected": 0.35673850774765015,
326
+ "step": 200
327
+ },
328
+ {
329
+ "epoch": 1.3895652173913042,
330
+ "eval_logits/chosen": -1.059758186340332,
331
+ "eval_logits/rejected": -1.0798335075378418,
332
+ "eval_logps/chosen": -315.1194763183594,
333
+ "eval_logps/rejected": -316.16046142578125,
334
+ "eval_loss": 0.6918731331825256,
335
+ "eval_rewards/accuracies": 0.5483871102333069,
336
+ "eval_rewards/chosen": 0.4346778690814972,
337
+ "eval_rewards/margins": 0.011690356768667698,
338
+ "eval_rewards/rejected": 0.42298752069473267,
339
+ "eval_runtime": 11.2737,
340
+ "eval_samples_per_second": 10.733,
341
+ "eval_steps_per_second": 2.75,
342
+ "step": 200
343
+ },
344
+ {
345
+ "epoch": 1.4591304347826086,
346
+ "grad_norm": 44.5,
347
+ "learning_rate": 3.0816449258938656e-07,
348
+ "logits/chosen": -1.0483617782592773,
349
+ "logits/rejected": -1.0491857528686523,
350
+ "logps/chosen": -326.4630126953125,
351
+ "logps/rejected": -321.35467529296875,
352
+ "loss": 0.6760656356811523,
353
+ "rewards/accuracies": 0.59375,
354
+ "rewards/chosen": 0.4774439334869385,
355
+ "rewards/margins": 0.04994555562734604,
356
+ "rewards/rejected": 0.42749834060668945,
357
+ "step": 210
358
+ },
359
+ {
360
+ "epoch": 1.528695652173913,
361
+ "grad_norm": 69.0,
362
+ "learning_rate": 2.883086923275658e-07,
363
+ "logits/chosen": -1.0638173818588257,
364
+ "logits/rejected": -1.0861108303070068,
365
+ "logps/chosen": -323.24517822265625,
366
+ "logps/rejected": -316.22454833984375,
367
+ "loss": 0.6859281063079834,
368
+ "rewards/accuracies": 0.5375000238418579,
369
+ "rewards/chosen": 0.464817613363266,
370
+ "rewards/margins": 0.025601129978895187,
371
+ "rewards/rejected": 0.4392164647579193,
372
+ "step": 220
373
+ },
374
+ {
375
+ "epoch": 1.5982608695652174,
376
+ "grad_norm": 51.25,
377
+ "learning_rate": 2.6820187899267197e-07,
378
+ "logits/chosen": -1.0582925081253052,
379
+ "logits/rejected": -1.0700470209121704,
380
+ "logps/chosen": -331.1497802734375,
381
+ "logps/rejected": -318.2004089355469,
382
+ "loss": 0.6922653675079345,
383
+ "rewards/accuracies": 0.5249999761581421,
384
+ "rewards/chosen": 0.4878065586090088,
385
+ "rewards/margins": 0.014133572578430176,
386
+ "rewards/rejected": 0.4736729562282562,
387
+ "step": 230
388
+ },
389
+ {
390
+ "epoch": 1.6678260869565218,
391
+ "grad_norm": 44.0,
392
+ "learning_rate": 2.4797580004718036e-07,
393
+ "logits/chosen": -1.059127926826477,
394
+ "logits/rejected": -1.0511342287063599,
395
+ "logps/chosen": -329.36309814453125,
396
+ "logps/rejected": -324.93060302734375,
397
+ "loss": 0.6760355949401855,
398
+ "rewards/accuracies": 0.5562499761581421,
399
+ "rewards/chosen": 0.4827202260494232,
400
+ "rewards/margins": 0.04902523010969162,
401
+ "rewards/rejected": 0.433694988489151,
402
+ "step": 240
403
+ },
404
+ {
405
+ "epoch": 1.7373913043478262,
406
+ "grad_norm": 47.0,
407
+ "learning_rate": 2.277629844270543e-07,
408
+ "logits/chosen": -1.1127703189849854,
409
+ "logits/rejected": -1.119107961654663,
410
+ "logps/chosen": -313.205810546875,
411
+ "logps/rejected": -302.2017822265625,
412
+ "loss": 0.6791357040405274,
413
+ "rewards/accuracies": 0.59375,
414
+ "rewards/chosen": 0.46044379472732544,
415
+ "rewards/margins": 0.04207334667444229,
416
+ "rewards/rejected": 0.41837042570114136,
417
+ "step": 250
418
+ },
419
+ {
420
+ "epoch": 1.8069565217391306,
421
+ "grad_norm": 74.5,
422
+ "learning_rate": 2.0769587416192208e-07,
423
+ "logits/chosen": -1.0957348346710205,
424
+ "logits/rejected": -1.0845588445663452,
425
+ "logps/chosen": -326.6529235839844,
426
+ "logps/rejected": -328.163330078125,
427
+ "loss": 0.6897631645202636,
428
+ "rewards/accuracies": 0.550000011920929,
429
+ "rewards/chosen": 0.4732390344142914,
430
+ "rewards/margins": 0.017790159210562706,
431
+ "rewards/rejected": 0.4554489552974701,
432
+ "step": 260
433
+ },
434
+ {
435
+ "epoch": 1.8765217391304347,
436
+ "grad_norm": 54.25,
437
+ "learning_rate": 1.8790595656469626e-07,
438
+ "logits/chosen": -1.0816611051559448,
439
+ "logits/rejected": -1.0837030410766602,
440
+ "logps/chosen": -331.3684387207031,
441
+ "logps/rejected": -313.75445556640625,
442
+ "loss": 0.6794800758361816,
443
+ "rewards/accuracies": 0.5375000238418579,
444
+ "rewards/chosen": 0.4701307415962219,
445
+ "rewards/margins": 0.03732570633292198,
446
+ "rewards/rejected": 0.43280500173568726,
447
+ "step": 270
448
+ },
449
+ {
450
+ "epoch": 1.9460869565217391,
451
+ "grad_norm": 50.5,
452
+ "learning_rate": 1.6852290267685928e-07,
453
+ "logits/chosen": -1.0794267654418945,
454
+ "logits/rejected": -1.0937658548355103,
455
+ "logps/chosen": -331.467529296875,
456
+ "logps/rejected": -321.3943786621094,
457
+ "loss": 0.6855987071990967,
458
+ "rewards/accuracies": 0.543749988079071,
459
+ "rewards/chosen": 0.48015865683555603,
460
+ "rewards/margins": 0.03580061346292496,
461
+ "rewards/rejected": 0.44435805082321167,
462
+ "step": 280
463
+ },
464
+ {
465
+ "epoch": 2.013913043478261,
466
+ "grad_norm": 45.25,
467
+ "learning_rate": 1.4967371761464737e-07,
468
+ "logits/chosen": -1.0743939876556396,
469
+ "logits/rejected": -1.0771441459655762,
470
+ "logps/chosen": -322.74835205078125,
471
+ "logps/rejected": -319.2298278808594,
472
+ "loss": 0.690724229812622,
473
+ "rewards/accuracies": 0.4871794879436493,
474
+ "rewards/chosen": 0.4530966281890869,
475
+ "rewards/margins": 0.012794151902198792,
476
+ "rewards/rejected": 0.4403024911880493,
477
+ "step": 290
478
+ },
479
+ {
480
+ "epoch": 2.0834782608695654,
481
+ "grad_norm": 45.5,
482
+ "learning_rate": 1.3148190838338802e-07,
483
+ "logits/chosen": -1.0955299139022827,
484
+ "logits/rejected": -1.1200354099273682,
485
+ "logps/chosen": -331.6620178222656,
486
+ "logps/rejected": -313.5164489746094,
487
+ "loss": 0.6698547840118408,
488
+ "rewards/accuracies": 0.612500011920929,
489
+ "rewards/chosen": 0.44486236572265625,
490
+ "rewards/margins": 0.06079784035682678,
491
+ "rewards/rejected": 0.38406452536582947,
492
+ "step": 300
493
+ },
494
+ {
495
+ "epoch": 2.0834782608695654,
496
+ "eval_logits/chosen": -1.0598626136779785,
497
+ "eval_logits/rejected": -1.080617070198059,
498
+ "eval_logps/chosen": -315.2421875,
499
+ "eval_logps/rejected": -316.3223876953125,
500
+ "eval_loss": 0.6872237920761108,
501
+ "eval_rewards/accuracies": 0.5403226017951965,
502
+ "eval_rewards/chosen": 0.42240414023399353,
503
+ "eval_rewards/margins": 0.015609556809067726,
504
+ "eval_rewards/rejected": 0.40679463744163513,
505
+ "eval_runtime": 11.2881,
506
+ "eval_samples_per_second": 10.719,
507
+ "eval_steps_per_second": 2.746,
508
+ "step": 300
509
+ }
510
+ ],
511
+ "logging_steps": 10,
512
+ "max_steps": 432,
513
+ "num_input_tokens_seen": 0,
514
+ "num_train_epochs": 3,
515
+ "save_steps": 100,
516
+ "stateful_callbacks": {
517
+ "TrainerControl": {
518
+ "args": {
519
+ "should_epoch_stop": false,
520
+ "should_evaluate": false,
521
+ "should_log": false,
522
+ "should_save": true,
523
+ "should_training_stop": false
524
+ },
525
+ "attributes": {}
526
+ }
527
+ },
528
+ "total_flos": 0.0,
529
+ "train_batch_size": 4,
530
+ "trial_name": null,
531
+ "trial_params": null
532
+ }
checkpoint-300/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b05394a09844ad8fbc5bf2ac3f5b6be474605707ae7d97f68562a3f9312197c4
3
+ size 6225
checkpoint-400/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
checkpoint-400/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Lfm2ForCausalLM"
4
+ ],
5
+ "block_auto_adjust_ff_dim": true,
6
+ "block_dim": 2048,
7
+ "block_ff_dim": 12288,
8
+ "block_ffn_dim_multiplier": 1.0,
9
+ "block_mlp_init_scale": 1.0,
10
+ "block_multiple_of": 256,
11
+ "block_norm_eps": 1e-05,
12
+ "block_out_init_scale": 1.0,
13
+ "block_use_swiglu": true,
14
+ "block_use_xavier_init": true,
15
+ "bos_token_id": 1,
16
+ "conv_L_cache": 3,
17
+ "conv_bias": false,
18
+ "conv_dim": 2048,
19
+ "conv_use_xavier_init": true,
20
+ "dtype": "bfloat16",
21
+ "eos_token_id": 7,
22
+ "hidden_size": 2048,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 12288,
25
+ "layer_types": [
26
+ "conv",
27
+ "conv",
28
+ "full_attention",
29
+ "conv",
30
+ "conv",
31
+ "full_attention",
32
+ "conv",
33
+ "conv",
34
+ "full_attention",
35
+ "conv",
36
+ "full_attention",
37
+ "conv",
38
+ "full_attention",
39
+ "conv",
40
+ "full_attention",
41
+ "conv"
42
+ ],
43
+ "max_position_embeddings": 128000,
44
+ "model_type": "lfm2",
45
+ "norm_eps": 1e-05,
46
+ "num_attention_heads": 32,
47
+ "num_heads": 32,
48
+ "num_hidden_layers": 16,
49
+ "num_key_value_heads": 8,
50
+ "pad_token_id": 0,
51
+ "rope_parameters": {
52
+ "rope_theta": 1000000.0,
53
+ "rope_type": "default"
54
+ },
55
+ "tie_embedding": true,
56
+ "tie_word_embeddings": true,
57
+ "transformers_version": "5.2.0",
58
+ "use_cache": false,
59
+ "use_pos_enc": true,
60
+ "vocab_size": 65536
61
+ }
checkpoint-400/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 7
6
+ ],
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.2.0"
9
+ }
checkpoint-400/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a7c37aae343f39e5bff72a9e4c3f50518a498c1331c30c85404a17ed984d89c
3
+ size 2340697936
checkpoint-400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5482a323203561585dedf00d2fbec8a8a40881ac4d483182c9dc68142423a846
3
+ size 3178927435
checkpoint-400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f196323d7423b60f8e4ceb7dbf8715ee326c0d068e5ff164f13c63b279b9f1a0
3
+ size 14645
checkpoint-400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58d2a6b6ae30d2c6082649e4dba1804265132f00e29fa400bec0b33445a55f46
3
+ size 1465
checkpoint-400/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-400/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "padding_side": "right",
15
+ "sp_model_kwargs": {},
16
+ "spaces_between_special_tokens": false,
17
+ "tokenizer_class": "TokenizersBackend",
18
+ "use_default_system_prompt": false,
19
+ "use_fast": true
20
+ }
checkpoint-400/trainer_state.json ADDED
@@ -0,0 +1,698 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 300,
3
+ "best_metric": 0.6872237920761108,
4
+ "best_model_checkpoint": "models/dpo_fft_LFM2.5-1.2B-Instruct_argilla__distilabel-math-preference-dpo_20260222_195126/checkpoint-300",
5
+ "epoch": 2.7791304347826085,
6
+ "eval_steps": 100,
7
+ "global_step": 400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06956521739130435,
14
+ "grad_norm": 44.25,
15
+ "learning_rate": 1.0227272727272728e-07,
16
+ "logits/chosen": -1.0599091053009033,
17
+ "logits/rejected": -1.042307734489441,
18
+ "logps/chosen": -332.13507080078125,
19
+ "logps/rejected": -332.9361267089844,
20
+ "loss": 0.6927361965179444,
21
+ "rewards/accuracies": 0.41874998807907104,
22
+ "rewards/chosen": 0.01432617288082838,
23
+ "rewards/margins": 0.0042648655362427235,
24
+ "rewards/rejected": 0.010061310604214668,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.1391304347826087,
29
+ "grad_norm": 50.5,
30
+ "learning_rate": 2.159090909090909e-07,
31
+ "logits/chosen": -1.0978257656097412,
32
+ "logits/rejected": -1.09658682346344,
33
+ "logps/chosen": -329.22100830078125,
34
+ "logps/rejected": -312.2664794921875,
35
+ "loss": 0.7003747940063476,
36
+ "rewards/accuracies": 0.44999998807907104,
37
+ "rewards/chosen": 0.004466252867132425,
38
+ "rewards/margins": -0.011238865554332733,
39
+ "rewards/rejected": 0.01570511981844902,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.20869565217391303,
44
+ "grad_norm": 54.75,
45
+ "learning_rate": 3.295454545454545e-07,
46
+ "logits/chosen": -1.1366004943847656,
47
+ "logits/rejected": -1.1707069873809814,
48
+ "logps/chosen": -328.1323547363281,
49
+ "logps/rejected": -305.90618896484375,
50
+ "loss": 0.7031519412994385,
51
+ "rewards/accuracies": 0.4312500059604645,
52
+ "rewards/chosen": 0.03125340864062309,
53
+ "rewards/margins": -0.01610126718878746,
54
+ "rewards/rejected": 0.04735467582941055,
55
+ "step": 30
56
+ },
57
+ {
58
+ "epoch": 0.2782608695652174,
59
+ "grad_norm": 50.5,
60
+ "learning_rate": 4.4318181818181817e-07,
61
+ "logits/chosen": -1.0889461040496826,
62
+ "logits/rejected": -1.1017862558364868,
63
+ "logps/chosen": -338.80938720703125,
64
+ "logps/rejected": -328.07464599609375,
65
+ "loss": 0.6893259048461914,
66
+ "rewards/accuracies": 0.4937500059604645,
67
+ "rewards/chosen": 0.09773217141628265,
68
+ "rewards/margins": 0.01154844556003809,
69
+ "rewards/rejected": 0.08618371188640594,
70
+ "step": 40
71
+ },
72
+ {
73
+ "epoch": 0.34782608695652173,
74
+ "grad_norm": 53.5,
75
+ "learning_rate": 4.997951542310825e-07,
76
+ "logits/chosen": -1.1367381811141968,
77
+ "logits/rejected": -1.1468210220336914,
78
+ "logps/chosen": -329.12823486328125,
79
+ "logps/rejected": -323.8036804199219,
80
+ "loss": 0.6982084751129151,
81
+ "rewards/accuracies": 0.44999998807907104,
82
+ "rewards/chosen": 0.1568203717470169,
83
+ "rewards/margins": -0.004218918737024069,
84
+ "rewards/rejected": 0.1610392928123474,
85
+ "step": 50
86
+ },
87
+ {
88
+ "epoch": 0.41739130434782606,
89
+ "grad_norm": 64.0,
90
+ "learning_rate": 4.98158401695492e-07,
91
+ "logits/chosen": -1.0899083614349365,
92
+ "logits/rejected": -1.0698351860046387,
93
+ "logps/chosen": -335.6194152832031,
94
+ "logps/rejected": -329.00653076171875,
95
+ "loss": 0.6905360221862793,
96
+ "rewards/accuracies": 0.53125,
97
+ "rewards/chosen": 0.1543850153684616,
98
+ "rewards/margins": 0.010315253399312496,
99
+ "rewards/rejected": 0.14406974613666534,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.48695652173913045,
104
+ "grad_norm": 72.5,
105
+ "learning_rate": 4.948956212473369e-07,
106
+ "logits/chosen": -1.1223671436309814,
107
+ "logits/rejected": -1.113906741142273,
108
+ "logps/chosen": -319.424560546875,
109
+ "logps/rejected": -317.76934814453125,
110
+ "loss": 0.7009393215179444,
111
+ "rewards/accuracies": 0.4749999940395355,
112
+ "rewards/chosen": 0.18402941524982452,
113
+ "rewards/margins": -0.010006649419665337,
114
+ "rewards/rejected": 0.19403605163097382,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.5565217391304348,
119
+ "grad_norm": 52.5,
120
+ "learning_rate": 4.900281918608732e-07,
121
+ "logits/chosen": -1.1157792806625366,
122
+ "logits/rejected": -1.1078118085861206,
123
+ "logps/chosen": -314.5404052734375,
124
+ "logps/rejected": -311.91656494140625,
125
+ "loss": 0.6877760887145996,
126
+ "rewards/accuracies": 0.4749999940395355,
127
+ "rewards/chosen": 0.19012701511383057,
128
+ "rewards/margins": 0.016789216548204422,
129
+ "rewards/rejected": 0.17333778738975525,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 0.6260869565217392,
134
+ "grad_norm": 47.75,
135
+ "learning_rate": 4.835880067784441e-07,
136
+ "logits/chosen": -1.0738165378570557,
137
+ "logits/rejected": -1.0576341152191162,
138
+ "logps/chosen": -336.3821716308594,
139
+ "logps/rejected": -331.9998779296875,
140
+ "loss": 0.6956596374511719,
141
+ "rewards/accuracies": 0.4375,
142
+ "rewards/chosen": 0.21225734055042267,
143
+ "rewards/margins": 0.0009234324097633362,
144
+ "rewards/rejected": 0.21133390069007874,
145
+ "step": 90
146
+ },
147
+ {
148
+ "epoch": 0.6956521739130435,
149
+ "grad_norm": 49.0,
150
+ "learning_rate": 4.7561726453386744e-07,
151
+ "logits/chosen": -1.0457407236099243,
152
+ "logits/rejected": -1.038447618484497,
153
+ "logps/chosen": -330.8192138671875,
154
+ "logps/rejected": -323.5484924316406,
155
+ "loss": 0.6817105770111084,
156
+ "rewards/accuracies": 0.581250011920929,
157
+ "rewards/chosen": 0.26413798332214355,
158
+ "rewards/margins": 0.029450010508298874,
159
+ "rewards/rejected": 0.2346879541873932,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.6956521739130435,
164
+ "eval_logits/chosen": -1.0708128213882446,
165
+ "eval_logits/rejected": -1.0917303562164307,
166
+ "eval_logps/chosen": -317.1337585449219,
167
+ "eval_logps/rejected": -318.0965270996094,
168
+ "eval_loss": 0.6924862265586853,
169
+ "eval_rewards/accuracies": 0.4838709533214569,
170
+ "eval_rewards/chosen": 0.2332489788532257,
171
+ "eval_rewards/margins": 0.0038641756400465965,
172
+ "eval_rewards/rejected": 0.22938479483127594,
173
+ "eval_runtime": 11.3087,
174
+ "eval_samples_per_second": 10.7,
175
+ "eval_steps_per_second": 2.741,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 0.7652173913043478,
180
+ "grad_norm": 47.0,
181
+ "learning_rate": 4.6616819245164655e-07,
182
+ "logits/chosen": -1.0736799240112305,
183
+ "logits/rejected": -1.0783708095550537,
184
+ "logps/chosen": -314.97943115234375,
185
+ "logps/rejected": -308.35675048828125,
186
+ "loss": 0.6984636306762695,
187
+ "rewards/accuracies": 0.48750001192092896,
188
+ "rewards/chosen": 0.22377920150756836,
189
+ "rewards/margins": -0.0050087496638298035,
190
+ "rewards/rejected": 0.22878792881965637,
191
+ "step": 110
192
+ },
193
+ {
194
+ "epoch": 0.8347826086956521,
195
+ "grad_norm": 51.25,
196
+ "learning_rate": 4.55302704433743e-07,
197
+ "logits/chosen": -1.048730731010437,
198
+ "logits/rejected": -1.0437910556793213,
199
+ "logps/chosen": -332.6585388183594,
200
+ "logps/rejected": -319.9508361816406,
201
+ "loss": 0.6809643268585205,
202
+ "rewards/accuracies": 0.59375,
203
+ "rewards/chosen": 0.3183898627758026,
204
+ "rewards/margins": 0.03039008378982544,
205
+ "rewards/rejected": 0.2879997789859772,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 0.9043478260869565,
210
+ "grad_norm": 57.5,
211
+ "learning_rate": 4.4309199527622254e-07,
212
+ "logits/chosen": -1.0928480625152588,
213
+ "logits/rejected": -1.0909947156906128,
214
+ "logps/chosen": -333.54510498046875,
215
+ "logps/rejected": -321.5794372558594,
216
+ "loss": 0.6884570121765137,
217
+ "rewards/accuracies": 0.518750011920929,
218
+ "rewards/chosen": 0.3165169656276703,
219
+ "rewards/margins": 0.015730690211057663,
220
+ "rewards/rejected": 0.3007862865924835,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.9739130434782609,
225
+ "grad_norm": 50.75,
226
+ "learning_rate": 4.2961607417396517e-07,
227
+ "logits/chosen": -1.0702852010726929,
228
+ "logits/rejected": -1.0860363245010376,
229
+ "logps/chosen": -336.94122314453125,
230
+ "logps/rejected": -312.6028137207031,
231
+ "loss": 0.6929869651794434,
232
+ "rewards/accuracies": 0.5,
233
+ "rewards/chosen": 0.3366253077983856,
234
+ "rewards/margins": 0.008591671474277973,
235
+ "rewards/rejected": 0.3280336260795593,
236
+ "step": 140
237
+ },
238
+ {
239
+ "epoch": 1.0417391304347827,
240
+ "grad_norm": 58.25,
241
+ "learning_rate": 4.1496324047009244e-07,
242
+ "logits/chosen": -1.137653112411499,
243
+ "logits/rejected": -1.1385780572891235,
244
+ "logps/chosen": -297.7463073730469,
245
+ "logps/rejected": -304.8460388183594,
246
+ "loss": 0.6796726703643798,
247
+ "rewards/accuracies": 0.5320512652397156,
248
+ "rewards/chosen": 0.36476510763168335,
249
+ "rewards/margins": 0.029467487707734108,
250
+ "rewards/rejected": 0.3352976143360138,
251
+ "step": 150
252
+ },
253
+ {
254
+ "epoch": 1.111304347826087,
255
+ "grad_norm": 62.0,
256
+ "learning_rate": 3.9922950508520126e-07,
257
+ "logits/chosen": -1.0764613151550293,
258
+ "logits/rejected": -1.073861837387085,
259
+ "logps/chosen": -325.74151611328125,
260
+ "logps/rejected": -314.5641174316406,
261
+ "loss": 0.6664072513580322,
262
+ "rewards/accuracies": 0.6312500238418579,
263
+ "rewards/chosen": 0.37418311834335327,
264
+ "rewards/margins": 0.06225377321243286,
265
+ "rewards/rejected": 0.3119293451309204,
266
+ "step": 160
267
+ },
268
+ {
269
+ "epoch": 1.1808695652173913,
270
+ "grad_norm": 48.25,
271
+ "learning_rate": 3.8251796141741945e-07,
272
+ "logits/chosen": -1.1032686233520508,
273
+ "logits/rejected": -1.1010105609893799,
274
+ "logps/chosen": -324.8043212890625,
275
+ "logps/rejected": -314.7898254394531,
276
+ "loss": 0.6656156063079834,
277
+ "rewards/accuracies": 0.643750011920929,
278
+ "rewards/chosen": 0.39548203349113464,
279
+ "rewards/margins": 0.06372012943029404,
280
+ "rewards/rejected": 0.33176189661026,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 1.2504347826086957,
285
+ "grad_norm": 73.5,
286
+ "learning_rate": 3.649381098353834e-07,
287
+ "logits/chosen": -1.108781099319458,
288
+ "logits/rejected": -1.081012487411499,
289
+ "logps/chosen": -333.0024719238281,
290
+ "logps/rejected": -317.6542663574219,
291
+ "loss": 0.6682669162750244,
292
+ "rewards/accuracies": 0.6187499761581421,
293
+ "rewards/chosen": 0.4114314019680023,
294
+ "rewards/margins": 0.06464624404907227,
295
+ "rewards/rejected": 0.34678512811660767,
296
+ "step": 180
297
+ },
298
+ {
299
+ "epoch": 1.32,
300
+ "grad_norm": 51.75,
301
+ "learning_rate": 3.466051401903162e-07,
302
+ "logits/chosen": -1.0279661417007446,
303
+ "logits/rejected": -1.028050422668457,
304
+ "logps/chosen": -342.05322265625,
305
+ "logps/rejected": -329.083984375,
306
+ "loss": 0.6815151691436767,
307
+ "rewards/accuracies": 0.5249999761581421,
308
+ "rewards/chosen": 0.4475061893463135,
309
+ "rewards/margins": 0.03387105464935303,
310
+ "rewards/rejected": 0.41363516449928284,
311
+ "step": 190
312
+ },
313
+ {
314
+ "epoch": 1.3895652173913042,
315
+ "grad_norm": 43.25,
316
+ "learning_rate": 3.276391770484606e-07,
317
+ "logits/chosen": -1.0554686784744263,
318
+ "logits/rejected": -1.0444362163543701,
319
+ "logps/chosen": -323.93359375,
320
+ "logps/rejected": -300.8354187011719,
321
+ "loss": 0.6729463577270508,
322
+ "rewards/accuracies": 0.5874999761581421,
323
+ "rewards/chosen": 0.4051585793495178,
324
+ "rewards/margins": 0.04842006787657738,
325
+ "rewards/rejected": 0.35673850774765015,
326
+ "step": 200
327
+ },
328
+ {
329
+ "epoch": 1.3895652173913042,
330
+ "eval_logits/chosen": -1.059758186340332,
331
+ "eval_logits/rejected": -1.0798335075378418,
332
+ "eval_logps/chosen": -315.1194763183594,
333
+ "eval_logps/rejected": -316.16046142578125,
334
+ "eval_loss": 0.6918731331825256,
335
+ "eval_rewards/accuracies": 0.5483871102333069,
336
+ "eval_rewards/chosen": 0.4346778690814972,
337
+ "eval_rewards/margins": 0.011690356768667698,
338
+ "eval_rewards/rejected": 0.42298752069473267,
339
+ "eval_runtime": 11.2737,
340
+ "eval_samples_per_second": 10.733,
341
+ "eval_steps_per_second": 2.75,
342
+ "step": 200
343
+ },
344
+ {
345
+ "epoch": 1.4591304347826086,
346
+ "grad_norm": 44.5,
347
+ "learning_rate": 3.0816449258938656e-07,
348
+ "logits/chosen": -1.0483617782592773,
349
+ "logits/rejected": -1.0491857528686523,
350
+ "logps/chosen": -326.4630126953125,
351
+ "logps/rejected": -321.35467529296875,
352
+ "loss": 0.6760656356811523,
353
+ "rewards/accuracies": 0.59375,
354
+ "rewards/chosen": 0.4774439334869385,
355
+ "rewards/margins": 0.04994555562734604,
356
+ "rewards/rejected": 0.42749834060668945,
357
+ "step": 210
358
+ },
359
+ {
360
+ "epoch": 1.528695652173913,
361
+ "grad_norm": 69.0,
362
+ "learning_rate": 2.883086923275658e-07,
363
+ "logits/chosen": -1.0638173818588257,
364
+ "logits/rejected": -1.0861108303070068,
365
+ "logps/chosen": -323.24517822265625,
366
+ "logps/rejected": -316.22454833984375,
367
+ "loss": 0.6859281063079834,
368
+ "rewards/accuracies": 0.5375000238418579,
369
+ "rewards/chosen": 0.464817613363266,
370
+ "rewards/margins": 0.025601129978895187,
371
+ "rewards/rejected": 0.4392164647579193,
372
+ "step": 220
373
+ },
374
+ {
375
+ "epoch": 1.5982608695652174,
376
+ "grad_norm": 51.25,
377
+ "learning_rate": 2.6820187899267197e-07,
378
+ "logits/chosen": -1.0582925081253052,
379
+ "logits/rejected": -1.0700470209121704,
380
+ "logps/chosen": -331.1497802734375,
381
+ "logps/rejected": -318.2004089355469,
382
+ "loss": 0.6922653675079345,
383
+ "rewards/accuracies": 0.5249999761581421,
384
+ "rewards/chosen": 0.4878065586090088,
385
+ "rewards/margins": 0.014133572578430176,
386
+ "rewards/rejected": 0.4736729562282562,
387
+ "step": 230
388
+ },
389
+ {
390
+ "epoch": 1.6678260869565218,
391
+ "grad_norm": 44.0,
392
+ "learning_rate": 2.4797580004718036e-07,
393
+ "logits/chosen": -1.059127926826477,
394
+ "logits/rejected": -1.0511342287063599,
395
+ "logps/chosen": -329.36309814453125,
396
+ "logps/rejected": -324.93060302734375,
397
+ "loss": 0.6760355949401855,
398
+ "rewards/accuracies": 0.5562499761581421,
399
+ "rewards/chosen": 0.4827202260494232,
400
+ "rewards/margins": 0.04902523010969162,
401
+ "rewards/rejected": 0.433694988489151,
402
+ "step": 240
403
+ },
404
+ {
405
+ "epoch": 1.7373913043478262,
406
+ "grad_norm": 47.0,
407
+ "learning_rate": 2.277629844270543e-07,
408
+ "logits/chosen": -1.1127703189849854,
409
+ "logits/rejected": -1.119107961654663,
410
+ "logps/chosen": -313.205810546875,
411
+ "logps/rejected": -302.2017822265625,
412
+ "loss": 0.6791357040405274,
413
+ "rewards/accuracies": 0.59375,
414
+ "rewards/chosen": 0.46044379472732544,
415
+ "rewards/margins": 0.04207334667444229,
416
+ "rewards/rejected": 0.41837042570114136,
417
+ "step": 250
418
+ },
419
+ {
420
+ "epoch": 1.8069565217391306,
421
+ "grad_norm": 74.5,
422
+ "learning_rate": 2.0769587416192208e-07,
423
+ "logits/chosen": -1.0957348346710205,
424
+ "logits/rejected": -1.0845588445663452,
425
+ "logps/chosen": -326.6529235839844,
426
+ "logps/rejected": -328.163330078125,
427
+ "loss": 0.6897631645202636,
428
+ "rewards/accuracies": 0.550000011920929,
429
+ "rewards/chosen": 0.4732390344142914,
430
+ "rewards/margins": 0.017790159210562706,
431
+ "rewards/rejected": 0.4554489552974701,
432
+ "step": 260
433
+ },
434
+ {
435
+ "epoch": 1.8765217391304347,
436
+ "grad_norm": 54.25,
437
+ "learning_rate": 1.8790595656469626e-07,
438
+ "logits/chosen": -1.0816611051559448,
439
+ "logits/rejected": -1.0837030410766602,
440
+ "logps/chosen": -331.3684387207031,
441
+ "logps/rejected": -313.75445556640625,
442
+ "loss": 0.6794800758361816,
443
+ "rewards/accuracies": 0.5375000238418579,
444
+ "rewards/chosen": 0.4701307415962219,
445
+ "rewards/margins": 0.03732570633292198,
446
+ "rewards/rejected": 0.43280500173568726,
447
+ "step": 270
448
+ },
449
+ {
450
+ "epoch": 1.9460869565217391,
451
+ "grad_norm": 50.5,
452
+ "learning_rate": 1.6852290267685928e-07,
453
+ "logits/chosen": -1.0794267654418945,
454
+ "logits/rejected": -1.0937658548355103,
455
+ "logps/chosen": -331.467529296875,
456
+ "logps/rejected": -321.3943786621094,
457
+ "loss": 0.6855987071990967,
458
+ "rewards/accuracies": 0.543749988079071,
459
+ "rewards/chosen": 0.48015865683555603,
460
+ "rewards/margins": 0.03580061346292496,
461
+ "rewards/rejected": 0.44435805082321167,
462
+ "step": 280
463
+ },
464
+ {
465
+ "epoch": 2.013913043478261,
466
+ "grad_norm": 45.25,
467
+ "learning_rate": 1.4967371761464737e-07,
468
+ "logits/chosen": -1.0743939876556396,
469
+ "logits/rejected": -1.0771441459655762,
470
+ "logps/chosen": -322.74835205078125,
471
+ "logps/rejected": -319.2298278808594,
472
+ "loss": 0.690724229812622,
473
+ "rewards/accuracies": 0.4871794879436493,
474
+ "rewards/chosen": 0.4530966281890869,
475
+ "rewards/margins": 0.012794151902198792,
476
+ "rewards/rejected": 0.4403024911880493,
477
+ "step": 290
478
+ },
479
+ {
480
+ "epoch": 2.0834782608695654,
481
+ "grad_norm": 45.5,
482
+ "learning_rate": 1.3148190838338802e-07,
483
+ "logits/chosen": -1.0955299139022827,
484
+ "logits/rejected": -1.1200354099273682,
485
+ "logps/chosen": -331.6620178222656,
486
+ "logps/rejected": -313.5164489746094,
487
+ "loss": 0.6698547840118408,
488
+ "rewards/accuracies": 0.612500011920929,
489
+ "rewards/chosen": 0.44486236572265625,
490
+ "rewards/margins": 0.06079784035682678,
491
+ "rewards/rejected": 0.38406452536582947,
492
+ "step": 300
493
+ },
494
+ {
495
+ "epoch": 2.0834782608695654,
496
+ "eval_logits/chosen": -1.0598626136779785,
497
+ "eval_logits/rejected": -1.080617070198059,
498
+ "eval_logps/chosen": -315.2421875,
499
+ "eval_logps/rejected": -316.3223876953125,
500
+ "eval_loss": 0.6872237920761108,
501
+ "eval_rewards/accuracies": 0.5403226017951965,
502
+ "eval_rewards/chosen": 0.42240414023399353,
503
+ "eval_rewards/margins": 0.015609556809067726,
504
+ "eval_rewards/rejected": 0.40679463744163513,
505
+ "eval_runtime": 11.2881,
506
+ "eval_samples_per_second": 10.719,
507
+ "eval_steps_per_second": 2.746,
508
+ "step": 300
509
+ },
510
+ {
511
+ "epoch": 2.1530434782608694,
512
+ "grad_norm": 47.5,
513
+ "learning_rate": 1.1406667461278538e-07,
514
+ "logits/chosen": -1.0953623056411743,
515
+ "logits/rejected": -1.0638518333435059,
516
+ "logps/chosen": -317.5690002441406,
517
+ "logps/rejected": -313.86895751953125,
518
+ "loss": 0.6786671161651612,
519
+ "rewards/accuracies": 0.5874999761581421,
520
+ "rewards/chosen": 0.43794316053390503,
521
+ "rewards/margins": 0.040073662996292114,
522
+ "rewards/rejected": 0.39786943793296814,
523
+ "step": 310
524
+ },
525
+ {
526
+ "epoch": 2.222608695652174,
527
+ "grad_norm": 75.5,
528
+ "learning_rate": 9.754212751576385e-08,
529
+ "logits/chosen": -1.036792516708374,
530
+ "logits/rejected": -1.0298279523849487,
531
+ "logps/chosen": -322.68511962890625,
532
+ "logps/rejected": -324.95562744140625,
533
+ "loss": 0.6964206695556641,
534
+ "rewards/accuracies": 0.5,
535
+ "rewards/chosen": 0.4462190568447113,
536
+ "rewards/margins": 0.005142434034496546,
537
+ "rewards/rejected": 0.4410766065120697,
538
+ "step": 320
539
+ },
540
+ {
541
+ "epoch": 2.292173913043478,
542
+ "grad_norm": 50.0,
543
+ "learning_rate": 8.201654218854689e-08,
544
+ "logits/chosen": -1.076971173286438,
545
+ "logits/rejected": -1.0646440982818604,
546
+ "logps/chosen": -314.86492919921875,
547
+ "logps/rejected": -309.55389404296875,
548
+ "loss": 0.6790688037872314,
549
+ "rewards/accuracies": 0.581250011920929,
550
+ "rewards/chosen": 0.4310247302055359,
551
+ "rewards/margins": 0.03967974707484245,
552
+ "rewards/rejected": 0.39134499430656433,
553
+ "step": 330
554
+ },
555
+ {
556
+ "epoch": 2.3617391304347826,
557
+ "grad_norm": 48.0,
558
+ "learning_rate": 6.759164815118493e-08,
559
+ "logits/chosen": -1.0706030130386353,
560
+ "logits/rejected": -1.0911849737167358,
561
+ "logps/chosen": -337.18853759765625,
562
+ "logps/rejected": -322.5784912109375,
563
+ "loss": 0.6748550415039063,
564
+ "rewards/accuracies": 0.625,
565
+ "rewards/chosen": 0.5078705549240112,
566
+ "rewards/margins": 0.05118296295404434,
567
+ "rewards/rejected": 0.4566876292228699,
568
+ "step": 340
569
+ },
570
+ {
571
+ "epoch": 2.431304347826087,
572
+ "grad_norm": 60.5,
573
+ "learning_rate": 5.4361962777179273e-08,
574
+ "logits/chosen": -1.0652002096176147,
575
+ "logits/rejected": -1.0517784357070923,
576
+ "logps/chosen": -330.02325439453125,
577
+ "logps/rejected": -326.01873779296875,
578
+ "loss": 0.691571044921875,
579
+ "rewards/accuracies": 0.5687500238418579,
580
+ "rewards/chosen": 0.46054545044898987,
581
+ "rewards/margins": 0.012338491156697273,
582
+ "rewards/rejected": 0.44820699095726013,
583
+ "step": 350
584
+ },
585
+ {
586
+ "epoch": 2.5008695652173913,
587
+ "grad_norm": 51.75,
588
+ "learning_rate": 4.2414171979824e-08,
589
+ "logits/chosen": -1.0523455142974854,
590
+ "logits/rejected": -1.0446560382843018,
591
+ "logps/chosen": -336.7765197753906,
592
+ "logps/rejected": -316.00067138671875,
593
+ "loss": 0.6638113975524902,
594
+ "rewards/accuracies": 0.6625000238418579,
595
+ "rewards/chosen": 0.46467241644859314,
596
+ "rewards/margins": 0.06902624666690826,
597
+ "rewards/rejected": 0.39564621448516846,
598
+ "step": 360
599
+ },
600
+ {
601
+ "epoch": 2.5704347826086957,
602
+ "grad_norm": 49.0,
603
+ "learning_rate": 3.1826562213243844e-08,
604
+ "logits/chosen": -1.0767319202423096,
605
+ "logits/rejected": -1.0776704549789429,
606
+ "logps/chosen": -329.4648742675781,
607
+ "logps/rejected": -319.5013732910156,
608
+ "loss": 0.6690452575683594,
609
+ "rewards/accuracies": 0.5687500238418579,
610
+ "rewards/chosen": 0.46328702569007874,
611
+ "rewards/margins": 0.061441998928785324,
612
+ "rewards/rejected": 0.40184497833251953,
613
+ "step": 370
614
+ },
615
+ {
616
+ "epoch": 2.64,
617
+ "grad_norm": 56.5,
618
+ "learning_rate": 2.2668507509871953e-08,
619
+ "logits/chosen": -1.1089640855789185,
620
+ "logits/rejected": -1.0823930501937866,
621
+ "logps/chosen": -328.79351806640625,
622
+ "logps/rejected": -311.1128845214844,
623
+ "loss": 0.645816707611084,
624
+ "rewards/accuracies": 0.675000011920929,
625
+ "rewards/chosen": 0.4771113991737366,
626
+ "rewards/margins": 0.11158710718154907,
627
+ "rewards/rejected": 0.3655242323875427,
628
+ "step": 380
629
+ },
630
+ {
631
+ "epoch": 2.7095652173913045,
632
+ "grad_norm": 51.0,
633
+ "learning_rate": 1.5000014915493465e-08,
634
+ "logits/chosen": -1.1323436498641968,
635
+ "logits/rejected": -1.1117795705795288,
636
+ "logps/chosen": -316.7195739746094,
637
+ "logps/rejected": -307.0235900878906,
638
+ "loss": 0.6755961894989013,
639
+ "rewards/accuracies": 0.581250011920929,
640
+ "rewards/chosen": 0.45080599188804626,
641
+ "rewards/margins": 0.048326827585697174,
642
+ "rewards/rejected": 0.4024791717529297,
643
+ "step": 390
644
+ },
645
+ {
646
+ "epoch": 2.7791304347826085,
647
+ "grad_norm": 77.0,
648
+ "learning_rate": 8.871331300335321e-09,
649
+ "logits/chosen": -1.0839048624038696,
650
+ "logits/rejected": -1.0926908254623413,
651
+ "logps/chosen": -324.00946044921875,
652
+ "logps/rejected": -318.5250549316406,
653
+ "loss": 0.6874616622924805,
654
+ "rewards/accuracies": 0.5562499761581421,
655
+ "rewards/chosen": 0.45788002014160156,
656
+ "rewards/margins": 0.025683864951133728,
657
+ "rewards/rejected": 0.4321961998939514,
658
+ "step": 400
659
+ },
660
+ {
661
+ "epoch": 2.7791304347826085,
662
+ "eval_logits/chosen": -1.059512972831726,
663
+ "eval_logits/rejected": -1.0796120166778564,
664
+ "eval_logps/chosen": -315.1502990722656,
665
+ "eval_logps/rejected": -316.1904296875,
666
+ "eval_loss": 0.6914414763450623,
667
+ "eval_rewards/accuracies": 0.5322580933570862,
668
+ "eval_rewards/chosen": 0.4315947890281677,
669
+ "eval_rewards/margins": 0.01160100195556879,
670
+ "eval_rewards/rejected": 0.4199938178062439,
671
+ "eval_runtime": 11.25,
672
+ "eval_samples_per_second": 10.756,
673
+ "eval_steps_per_second": 2.756,
674
+ "step": 400
675
+ }
676
+ ],
677
+ "logging_steps": 10,
678
+ "max_steps": 432,
679
+ "num_input_tokens_seen": 0,
680
+ "num_train_epochs": 3,
681
+ "save_steps": 100,
682
+ "stateful_callbacks": {
683
+ "TrainerControl": {
684
+ "args": {
685
+ "should_epoch_stop": false,
686
+ "should_evaluate": false,
687
+ "should_log": false,
688
+ "should_save": true,
689
+ "should_training_stop": false
690
+ },
691
+ "attributes": {}
692
+ }
693
+ },
694
+ "total_flos": 0.0,
695
+ "train_batch_size": 4,
696
+ "trial_name": null,
697
+ "trial_params": null
698
+ }
checkpoint-400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b05394a09844ad8fbc5bf2ac3f5b6be474605707ae7d97f68562a3f9312197c4
3
+ size 6225
checkpoint-432/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
checkpoint-432/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Lfm2ForCausalLM"
4
+ ],
5
+ "block_auto_adjust_ff_dim": true,
6
+ "block_dim": 2048,
7
+ "block_ff_dim": 12288,
8
+ "block_ffn_dim_multiplier": 1.0,
9
+ "block_mlp_init_scale": 1.0,
10
+ "block_multiple_of": 256,
11
+ "block_norm_eps": 1e-05,
12
+ "block_out_init_scale": 1.0,
13
+ "block_use_swiglu": true,
14
+ "block_use_xavier_init": true,
15
+ "bos_token_id": 1,
16
+ "conv_L_cache": 3,
17
+ "conv_bias": false,
18
+ "conv_dim": 2048,
19
+ "conv_use_xavier_init": true,
20
+ "dtype": "bfloat16",
21
+ "eos_token_id": 7,
22
+ "hidden_size": 2048,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 12288,
25
+ "layer_types": [
26
+ "conv",
27
+ "conv",
28
+ "full_attention",
29
+ "conv",
30
+ "conv",
31
+ "full_attention",
32
+ "conv",
33
+ "conv",
34
+ "full_attention",
35
+ "conv",
36
+ "full_attention",
37
+ "conv",
38
+ "full_attention",
39
+ "conv",
40
+ "full_attention",
41
+ "conv"
42
+ ],
43
+ "max_position_embeddings": 128000,
44
+ "model_type": "lfm2",
45
+ "norm_eps": 1e-05,
46
+ "num_attention_heads": 32,
47
+ "num_heads": 32,
48
+ "num_hidden_layers": 16,
49
+ "num_key_value_heads": 8,
50
+ "pad_token_id": 0,
51
+ "rope_parameters": {
52
+ "rope_theta": 1000000.0,
53
+ "rope_type": "default"
54
+ },
55
+ "tie_embedding": true,
56
+ "tie_word_embeddings": true,
57
+ "transformers_version": "5.2.0",
58
+ "use_cache": false,
59
+ "use_pos_enc": true,
60
+ "vocab_size": 65536
61
+ }
checkpoint-432/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 7
6
+ ],
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.2.0"
9
+ }
checkpoint-432/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d360e6c4593561d9ab3c7c38f9c56f3b0e9007f2104a3d408741552c249a101
3
+ size 2340697936
checkpoint-432/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293e1dd43e9549c2668c55fc867d914e17d6d112d4671b18f856bb9bbd8177d7
3
+ size 3178927435
checkpoint-432/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f196323d7423b60f8e4ceb7dbf8715ee326c0d068e5ff164f13c63b279b9f1a0
3
+ size 14645
checkpoint-432/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:500bc375cdd19b81329befa9b7152f30c7d8b2e4876303d1a5d1646dc97c1f9f
3
+ size 1465
checkpoint-432/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-432/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "padding_side": "right",
15
+ "sp_model_kwargs": {},
16
+ "spaces_between_special_tokens": false,
17
+ "tokenizer_class": "TokenizersBackend",
18
+ "use_default_system_prompt": false,
19
+ "use_fast": true
20
+ }
checkpoint-432/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 300,
3
+ "best_metric": 0.6872237920761108,
4
+ "best_model_checkpoint": "models/dpo_fft_LFM2.5-1.2B-Instruct_argilla__distilabel-math-preference-dpo_20260222_195126/checkpoint-300",
5
+ "epoch": 3.0,
6
+ "eval_steps": 100,
7
+ "global_step": 432,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06956521739130435,
14
+ "grad_norm": 44.25,
15
+ "learning_rate": 1.0227272727272728e-07,
16
+ "logits/chosen": -1.0599091053009033,
17
+ "logits/rejected": -1.042307734489441,
18
+ "logps/chosen": -332.13507080078125,
19
+ "logps/rejected": -332.9361267089844,
20
+ "loss": 0.6927361965179444,
21
+ "rewards/accuracies": 0.41874998807907104,
22
+ "rewards/chosen": 0.01432617288082838,
23
+ "rewards/margins": 0.0042648655362427235,
24
+ "rewards/rejected": 0.010061310604214668,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.1391304347826087,
29
+ "grad_norm": 50.5,
30
+ "learning_rate": 2.159090909090909e-07,
31
+ "logits/chosen": -1.0978257656097412,
32
+ "logits/rejected": -1.09658682346344,
33
+ "logps/chosen": -329.22100830078125,
34
+ "logps/rejected": -312.2664794921875,
35
+ "loss": 0.7003747940063476,
36
+ "rewards/accuracies": 0.44999998807907104,
37
+ "rewards/chosen": 0.004466252867132425,
38
+ "rewards/margins": -0.011238865554332733,
39
+ "rewards/rejected": 0.01570511981844902,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.20869565217391303,
44
+ "grad_norm": 54.75,
45
+ "learning_rate": 3.295454545454545e-07,
46
+ "logits/chosen": -1.1366004943847656,
47
+ "logits/rejected": -1.1707069873809814,
48
+ "logps/chosen": -328.1323547363281,
49
+ "logps/rejected": -305.90618896484375,
50
+ "loss": 0.7031519412994385,
51
+ "rewards/accuracies": 0.4312500059604645,
52
+ "rewards/chosen": 0.03125340864062309,
53
+ "rewards/margins": -0.01610126718878746,
54
+ "rewards/rejected": 0.04735467582941055,
55
+ "step": 30
56
+ },
57
+ {
58
+ "epoch": 0.2782608695652174,
59
+ "grad_norm": 50.5,
60
+ "learning_rate": 4.4318181818181817e-07,
61
+ "logits/chosen": -1.0889461040496826,
62
+ "logits/rejected": -1.1017862558364868,
63
+ "logps/chosen": -338.80938720703125,
64
+ "logps/rejected": -328.07464599609375,
65
+ "loss": 0.6893259048461914,
66
+ "rewards/accuracies": 0.4937500059604645,
67
+ "rewards/chosen": 0.09773217141628265,
68
+ "rewards/margins": 0.01154844556003809,
69
+ "rewards/rejected": 0.08618371188640594,
70
+ "step": 40
71
+ },
72
+ {
73
+ "epoch": 0.34782608695652173,
74
+ "grad_norm": 53.5,
75
+ "learning_rate": 4.997951542310825e-07,
76
+ "logits/chosen": -1.1367381811141968,
77
+ "logits/rejected": -1.1468210220336914,
78
+ "logps/chosen": -329.12823486328125,
79
+ "logps/rejected": -323.8036804199219,
80
+ "loss": 0.6982084751129151,
81
+ "rewards/accuracies": 0.44999998807907104,
82
+ "rewards/chosen": 0.1568203717470169,
83
+ "rewards/margins": -0.004218918737024069,
84
+ "rewards/rejected": 0.1610392928123474,
85
+ "step": 50
86
+ },
87
+ {
88
+ "epoch": 0.41739130434782606,
89
+ "grad_norm": 64.0,
90
+ "learning_rate": 4.98158401695492e-07,
91
+ "logits/chosen": -1.0899083614349365,
92
+ "logits/rejected": -1.0698351860046387,
93
+ "logps/chosen": -335.6194152832031,
94
+ "logps/rejected": -329.00653076171875,
95
+ "loss": 0.6905360221862793,
96
+ "rewards/accuracies": 0.53125,
97
+ "rewards/chosen": 0.1543850153684616,
98
+ "rewards/margins": 0.010315253399312496,
99
+ "rewards/rejected": 0.14406974613666534,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.48695652173913045,
104
+ "grad_norm": 72.5,
105
+ "learning_rate": 4.948956212473369e-07,
106
+ "logits/chosen": -1.1223671436309814,
107
+ "logits/rejected": -1.113906741142273,
108
+ "logps/chosen": -319.424560546875,
109
+ "logps/rejected": -317.76934814453125,
110
+ "loss": 0.7009393215179444,
111
+ "rewards/accuracies": 0.4749999940395355,
112
+ "rewards/chosen": 0.18402941524982452,
113
+ "rewards/margins": -0.010006649419665337,
114
+ "rewards/rejected": 0.19403605163097382,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.5565217391304348,
119
+ "grad_norm": 52.5,
120
+ "learning_rate": 4.900281918608732e-07,
121
+ "logits/chosen": -1.1157792806625366,
122
+ "logits/rejected": -1.1078118085861206,
123
+ "logps/chosen": -314.5404052734375,
124
+ "logps/rejected": -311.91656494140625,
125
+ "loss": 0.6877760887145996,
126
+ "rewards/accuracies": 0.4749999940395355,
127
+ "rewards/chosen": 0.19012701511383057,
128
+ "rewards/margins": 0.016789216548204422,
129
+ "rewards/rejected": 0.17333778738975525,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 0.6260869565217392,
134
+ "grad_norm": 47.75,
135
+ "learning_rate": 4.835880067784441e-07,
136
+ "logits/chosen": -1.0738165378570557,
137
+ "logits/rejected": -1.0576341152191162,
138
+ "logps/chosen": -336.3821716308594,
139
+ "logps/rejected": -331.9998779296875,
140
+ "loss": 0.6956596374511719,
141
+ "rewards/accuracies": 0.4375,
142
+ "rewards/chosen": 0.21225734055042267,
143
+ "rewards/margins": 0.0009234324097633362,
144
+ "rewards/rejected": 0.21133390069007874,
145
+ "step": 90
146
+ },
147
+ {
148
+ "epoch": 0.6956521739130435,
149
+ "grad_norm": 49.0,
150
+ "learning_rate": 4.7561726453386744e-07,
151
+ "logits/chosen": -1.0457407236099243,
152
+ "logits/rejected": -1.038447618484497,
153
+ "logps/chosen": -330.8192138671875,
154
+ "logps/rejected": -323.5484924316406,
155
+ "loss": 0.6817105770111084,
156
+ "rewards/accuracies": 0.581250011920929,
157
+ "rewards/chosen": 0.26413798332214355,
158
+ "rewards/margins": 0.029450010508298874,
159
+ "rewards/rejected": 0.2346879541873932,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.6956521739130435,
164
+ "eval_logits/chosen": -1.0708128213882446,
165
+ "eval_logits/rejected": -1.0917303562164307,
166
+ "eval_logps/chosen": -317.1337585449219,
167
+ "eval_logps/rejected": -318.0965270996094,
168
+ "eval_loss": 0.6924862265586853,
169
+ "eval_rewards/accuracies": 0.4838709533214569,
170
+ "eval_rewards/chosen": 0.2332489788532257,
171
+ "eval_rewards/margins": 0.0038641756400465965,
172
+ "eval_rewards/rejected": 0.22938479483127594,
173
+ "eval_runtime": 11.3087,
174
+ "eval_samples_per_second": 10.7,
175
+ "eval_steps_per_second": 2.741,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 0.7652173913043478,
180
+ "grad_norm": 47.0,
181
+ "learning_rate": 4.6616819245164655e-07,
182
+ "logits/chosen": -1.0736799240112305,
183
+ "logits/rejected": -1.0783708095550537,
184
+ "logps/chosen": -314.97943115234375,
185
+ "logps/rejected": -308.35675048828125,
186
+ "loss": 0.6984636306762695,
187
+ "rewards/accuracies": 0.48750001192092896,
188
+ "rewards/chosen": 0.22377920150756836,
189
+ "rewards/margins": -0.0050087496638298035,
190
+ "rewards/rejected": 0.22878792881965637,
191
+ "step": 110
192
+ },
193
+ {
194
+ "epoch": 0.8347826086956521,
195
+ "grad_norm": 51.25,
196
+ "learning_rate": 4.55302704433743e-07,
197
+ "logits/chosen": -1.048730731010437,
198
+ "logits/rejected": -1.0437910556793213,
199
+ "logps/chosen": -332.6585388183594,
200
+ "logps/rejected": -319.9508361816406,
201
+ "loss": 0.6809643268585205,
202
+ "rewards/accuracies": 0.59375,
203
+ "rewards/chosen": 0.3183898627758026,
204
+ "rewards/margins": 0.03039008378982544,
205
+ "rewards/rejected": 0.2879997789859772,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 0.9043478260869565,
210
+ "grad_norm": 57.5,
211
+ "learning_rate": 4.4309199527622254e-07,
212
+ "logits/chosen": -1.0928480625152588,
213
+ "logits/rejected": -1.0909947156906128,
214
+ "logps/chosen": -333.54510498046875,
215
+ "logps/rejected": -321.5794372558594,
216
+ "loss": 0.6884570121765137,
217
+ "rewards/accuracies": 0.518750011920929,
218
+ "rewards/chosen": 0.3165169656276703,
219
+ "rewards/margins": 0.015730690211057663,
220
+ "rewards/rejected": 0.3007862865924835,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.9739130434782609,
225
+ "grad_norm": 50.75,
226
+ "learning_rate": 4.2961607417396517e-07,
227
+ "logits/chosen": -1.0702852010726929,
228
+ "logits/rejected": -1.0860363245010376,
229
+ "logps/chosen": -336.94122314453125,
230
+ "logps/rejected": -312.6028137207031,
231
+ "loss": 0.6929869651794434,
232
+ "rewards/accuracies": 0.5,
233
+ "rewards/chosen": 0.3366253077983856,
234
+ "rewards/margins": 0.008591671474277973,
235
+ "rewards/rejected": 0.3280336260795593,
236
+ "step": 140
237
+ },
238
+ {
239
+ "epoch": 1.0417391304347827,
240
+ "grad_norm": 58.25,
241
+ "learning_rate": 4.1496324047009244e-07,
242
+ "logits/chosen": -1.137653112411499,
243
+ "logits/rejected": -1.1385780572891235,
244
+ "logps/chosen": -297.7463073730469,
245
+ "logps/rejected": -304.8460388183594,
246
+ "loss": 0.6796726703643798,
247
+ "rewards/accuracies": 0.5320512652397156,
248
+ "rewards/chosen": 0.36476510763168335,
249
+ "rewards/margins": 0.029467487707734108,
250
+ "rewards/rejected": 0.3352976143360138,
251
+ "step": 150
252
+ },
253
+ {
254
+ "epoch": 1.111304347826087,
255
+ "grad_norm": 62.0,
256
+ "learning_rate": 3.9922950508520126e-07,
257
+ "logits/chosen": -1.0764613151550293,
258
+ "logits/rejected": -1.073861837387085,
259
+ "logps/chosen": -325.74151611328125,
260
+ "logps/rejected": -314.5641174316406,
261
+ "loss": 0.6664072513580322,
262
+ "rewards/accuracies": 0.6312500238418579,
263
+ "rewards/chosen": 0.37418311834335327,
264
+ "rewards/margins": 0.06225377321243286,
265
+ "rewards/rejected": 0.3119293451309204,
266
+ "step": 160
267
+ },
268
+ {
269
+ "epoch": 1.1808695652173913,
270
+ "grad_norm": 48.25,
271
+ "learning_rate": 3.8251796141741945e-07,
272
+ "logits/chosen": -1.1032686233520508,
273
+ "logits/rejected": -1.1010105609893799,
274
+ "logps/chosen": -324.8043212890625,
275
+ "logps/rejected": -314.7898254394531,
276
+ "loss": 0.6656156063079834,
277
+ "rewards/accuracies": 0.643750011920929,
278
+ "rewards/chosen": 0.39548203349113464,
279
+ "rewards/margins": 0.06372012943029404,
280
+ "rewards/rejected": 0.33176189661026,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 1.2504347826086957,
285
+ "grad_norm": 73.5,
286
+ "learning_rate": 3.649381098353834e-07,
287
+ "logits/chosen": -1.108781099319458,
288
+ "logits/rejected": -1.081012487411499,
289
+ "logps/chosen": -333.0024719238281,
290
+ "logps/rejected": -317.6542663574219,
291
+ "loss": 0.6682669162750244,
292
+ "rewards/accuracies": 0.6187499761581421,
293
+ "rewards/chosen": 0.4114314019680023,
294
+ "rewards/margins": 0.06464624404907227,
295
+ "rewards/rejected": 0.34678512811660767,
296
+ "step": 180
297
+ },
298
+ {
299
+ "epoch": 1.32,
300
+ "grad_norm": 51.75,
301
+ "learning_rate": 3.466051401903162e-07,
302
+ "logits/chosen": -1.0279661417007446,
303
+ "logits/rejected": -1.028050422668457,
304
+ "logps/chosen": -342.05322265625,
305
+ "logps/rejected": -329.083984375,
306
+ "loss": 0.6815151691436767,
307
+ "rewards/accuracies": 0.5249999761581421,
308
+ "rewards/chosen": 0.4475061893463135,
309
+ "rewards/margins": 0.03387105464935303,
310
+ "rewards/rejected": 0.41363516449928284,
311
+ "step": 190
312
+ },
313
+ {
314
+ "epoch": 1.3895652173913042,
315
+ "grad_norm": 43.25,
316
+ "learning_rate": 3.276391770484606e-07,
317
+ "logits/chosen": -1.0554686784744263,
318
+ "logits/rejected": -1.0444362163543701,
319
+ "logps/chosen": -323.93359375,
320
+ "logps/rejected": -300.8354187011719,
321
+ "loss": 0.6729463577270508,
322
+ "rewards/accuracies": 0.5874999761581421,
323
+ "rewards/chosen": 0.4051585793495178,
324
+ "rewards/margins": 0.04842006787657738,
325
+ "rewards/rejected": 0.35673850774765015,
326
+ "step": 200
327
+ },
328
+ {
329
+ "epoch": 1.3895652173913042,
330
+ "eval_logits/chosen": -1.059758186340332,
331
+ "eval_logits/rejected": -1.0798335075378418,
332
+ "eval_logps/chosen": -315.1194763183594,
333
+ "eval_logps/rejected": -316.16046142578125,
334
+ "eval_loss": 0.6918731331825256,
335
+ "eval_rewards/accuracies": 0.5483871102333069,
336
+ "eval_rewards/chosen": 0.4346778690814972,
337
+ "eval_rewards/margins": 0.011690356768667698,
338
+ "eval_rewards/rejected": 0.42298752069473267,
339
+ "eval_runtime": 11.2737,
340
+ "eval_samples_per_second": 10.733,
341
+ "eval_steps_per_second": 2.75,
342
+ "step": 200
343
+ },
344
+ {
345
+ "epoch": 1.4591304347826086,
346
+ "grad_norm": 44.5,
347
+ "learning_rate": 3.0816449258938656e-07,
348
+ "logits/chosen": -1.0483617782592773,
349
+ "logits/rejected": -1.0491857528686523,
350
+ "logps/chosen": -326.4630126953125,
351
+ "logps/rejected": -321.35467529296875,
352
+ "loss": 0.6760656356811523,
353
+ "rewards/accuracies": 0.59375,
354
+ "rewards/chosen": 0.4774439334869385,
355
+ "rewards/margins": 0.04994555562734604,
356
+ "rewards/rejected": 0.42749834060668945,
357
+ "step": 210
358
+ },
359
+ {
360
+ "epoch": 1.528695652173913,
361
+ "grad_norm": 69.0,
362
+ "learning_rate": 2.883086923275658e-07,
363
+ "logits/chosen": -1.0638173818588257,
364
+ "logits/rejected": -1.0861108303070068,
365
+ "logps/chosen": -323.24517822265625,
366
+ "logps/rejected": -316.22454833984375,
367
+ "loss": 0.6859281063079834,
368
+ "rewards/accuracies": 0.5375000238418579,
369
+ "rewards/chosen": 0.464817613363266,
370
+ "rewards/margins": 0.025601129978895187,
371
+ "rewards/rejected": 0.4392164647579193,
372
+ "step": 220
373
+ },
374
+ {
375
+ "epoch": 1.5982608695652174,
376
+ "grad_norm": 51.25,
377
+ "learning_rate": 2.6820187899267197e-07,
378
+ "logits/chosen": -1.0582925081253052,
379
+ "logits/rejected": -1.0700470209121704,
380
+ "logps/chosen": -331.1497802734375,
381
+ "logps/rejected": -318.2004089355469,
382
+ "loss": 0.6922653675079345,
383
+ "rewards/accuracies": 0.5249999761581421,
384
+ "rewards/chosen": 0.4878065586090088,
385
+ "rewards/margins": 0.014133572578430176,
386
+ "rewards/rejected": 0.4736729562282562,
387
+ "step": 230
388
+ },
389
+ {
390
+ "epoch": 1.6678260869565218,
391
+ "grad_norm": 44.0,
392
+ "learning_rate": 2.4797580004718036e-07,
393
+ "logits/chosen": -1.059127926826477,
394
+ "logits/rejected": -1.0511342287063599,
395
+ "logps/chosen": -329.36309814453125,
396
+ "logps/rejected": -324.93060302734375,
397
+ "loss": 0.6760355949401855,
398
+ "rewards/accuracies": 0.5562499761581421,
399
+ "rewards/chosen": 0.4827202260494232,
400
+ "rewards/margins": 0.04902523010969162,
401
+ "rewards/rejected": 0.433694988489151,
402
+ "step": 240
403
+ },
404
+ {
405
+ "epoch": 1.7373913043478262,
406
+ "grad_norm": 47.0,
407
+ "learning_rate": 2.277629844270543e-07,
408
+ "logits/chosen": -1.1127703189849854,
409
+ "logits/rejected": -1.119107961654663,
410
+ "logps/chosen": -313.205810546875,
411
+ "logps/rejected": -302.2017822265625,
412
+ "loss": 0.6791357040405274,
413
+ "rewards/accuracies": 0.59375,
414
+ "rewards/chosen": 0.46044379472732544,
415
+ "rewards/margins": 0.04207334667444229,
416
+ "rewards/rejected": 0.41837042570114136,
417
+ "step": 250
418
+ },
419
+ {
420
+ "epoch": 1.8069565217391306,
421
+ "grad_norm": 74.5,
422
+ "learning_rate": 2.0769587416192208e-07,
423
+ "logits/chosen": -1.0957348346710205,
424
+ "logits/rejected": -1.0845588445663452,
425
+ "logps/chosen": -326.6529235839844,
426
+ "logps/rejected": -328.163330078125,
427
+ "loss": 0.6897631645202636,
428
+ "rewards/accuracies": 0.550000011920929,
429
+ "rewards/chosen": 0.4732390344142914,
430
+ "rewards/margins": 0.017790159210562706,
431
+ "rewards/rejected": 0.4554489552974701,
432
+ "step": 260
433
+ },
434
+ {
435
+ "epoch": 1.8765217391304347,
436
+ "grad_norm": 54.25,
437
+ "learning_rate": 1.8790595656469626e-07,
438
+ "logits/chosen": -1.0816611051559448,
439
+ "logits/rejected": -1.0837030410766602,
440
+ "logps/chosen": -331.3684387207031,
441
+ "logps/rejected": -313.75445556640625,
442
+ "loss": 0.6794800758361816,
443
+ "rewards/accuracies": 0.5375000238418579,
444
+ "rewards/chosen": 0.4701307415962219,
445
+ "rewards/margins": 0.03732570633292198,
446
+ "rewards/rejected": 0.43280500173568726,
447
+ "step": 270
448
+ },
449
+ {
450
+ "epoch": 1.9460869565217391,
451
+ "grad_norm": 50.5,
452
+ "learning_rate": 1.6852290267685928e-07,
453
+ "logits/chosen": -1.0794267654418945,
454
+ "logits/rejected": -1.0937658548355103,
455
+ "logps/chosen": -331.467529296875,
456
+ "logps/rejected": -321.3943786621094,
457
+ "loss": 0.6855987071990967,
458
+ "rewards/accuracies": 0.543749988079071,
459
+ "rewards/chosen": 0.48015865683555603,
460
+ "rewards/margins": 0.03580061346292496,
461
+ "rewards/rejected": 0.44435805082321167,
462
+ "step": 280
463
+ },
464
+ {
465
+ "epoch": 2.013913043478261,
466
+ "grad_norm": 45.25,
467
+ "learning_rate": 1.4967371761464737e-07,
468
+ "logits/chosen": -1.0743939876556396,
469
+ "logits/rejected": -1.0771441459655762,
470
+ "logps/chosen": -322.74835205078125,
471
+ "logps/rejected": -319.2298278808594,
472
+ "loss": 0.690724229812622,
473
+ "rewards/accuracies": 0.4871794879436493,
474
+ "rewards/chosen": 0.4530966281890869,
475
+ "rewards/margins": 0.012794151902198792,
476
+ "rewards/rejected": 0.4403024911880493,
477
+ "step": 290
478
+ },
479
+ {
480
+ "epoch": 2.0834782608695654,
481
+ "grad_norm": 45.5,
482
+ "learning_rate": 1.3148190838338802e-07,
483
+ "logits/chosen": -1.0955299139022827,
484
+ "logits/rejected": -1.1200354099273682,
485
+ "logps/chosen": -331.6620178222656,
486
+ "logps/rejected": -313.5164489746094,
487
+ "loss": 0.6698547840118408,
488
+ "rewards/accuracies": 0.612500011920929,
489
+ "rewards/chosen": 0.44486236572265625,
490
+ "rewards/margins": 0.06079784035682678,
491
+ "rewards/rejected": 0.38406452536582947,
492
+ "step": 300
493
+ },
494
+ {
495
+ "epoch": 2.0834782608695654,
496
+ "eval_logits/chosen": -1.0598626136779785,
497
+ "eval_logits/rejected": -1.080617070198059,
498
+ "eval_logps/chosen": -315.2421875,
499
+ "eval_logps/rejected": -316.3223876953125,
500
+ "eval_loss": 0.6872237920761108,
501
+ "eval_rewards/accuracies": 0.5403226017951965,
502
+ "eval_rewards/chosen": 0.42240414023399353,
503
+ "eval_rewards/margins": 0.015609556809067726,
504
+ "eval_rewards/rejected": 0.40679463744163513,
505
+ "eval_runtime": 11.2881,
506
+ "eval_samples_per_second": 10.719,
507
+ "eval_steps_per_second": 2.746,
508
+ "step": 300
509
+ },
510
+ {
511
+ "epoch": 2.1530434782608694,
512
+ "grad_norm": 47.5,
513
+ "learning_rate": 1.1406667461278538e-07,
514
+ "logits/chosen": -1.0953623056411743,
515
+ "logits/rejected": -1.0638518333435059,
516
+ "logps/chosen": -317.5690002441406,
517
+ "logps/rejected": -313.86895751953125,
518
+ "loss": 0.6786671161651612,
519
+ "rewards/accuracies": 0.5874999761581421,
520
+ "rewards/chosen": 0.43794316053390503,
521
+ "rewards/margins": 0.040073662996292114,
522
+ "rewards/rejected": 0.39786943793296814,
523
+ "step": 310
524
+ },
525
+ {
526
+ "epoch": 2.222608695652174,
527
+ "grad_norm": 75.5,
528
+ "learning_rate": 9.754212751576385e-08,
529
+ "logits/chosen": -1.036792516708374,
530
+ "logits/rejected": -1.0298279523849487,
531
+ "logps/chosen": -322.68511962890625,
532
+ "logps/rejected": -324.95562744140625,
533
+ "loss": 0.6964206695556641,
534
+ "rewards/accuracies": 0.5,
535
+ "rewards/chosen": 0.4462190568447113,
536
+ "rewards/margins": 0.005142434034496546,
537
+ "rewards/rejected": 0.4410766065120697,
538
+ "step": 320
539
+ },
540
+ {
541
+ "epoch": 2.292173913043478,
542
+ "grad_norm": 50.0,
543
+ "learning_rate": 8.201654218854689e-08,
544
+ "logits/chosen": -1.076971173286438,
545
+ "logits/rejected": -1.0646440982818604,
546
+ "logps/chosen": -314.86492919921875,
547
+ "logps/rejected": -309.55389404296875,
548
+ "loss": 0.6790688037872314,
549
+ "rewards/accuracies": 0.581250011920929,
550
+ "rewards/chosen": 0.4310247302055359,
551
+ "rewards/margins": 0.03967974707484245,
552
+ "rewards/rejected": 0.39134499430656433,
553
+ "step": 330
554
+ },
555
+ {
556
+ "epoch": 2.3617391304347826,
557
+ "grad_norm": 48.0,
558
+ "learning_rate": 6.759164815118493e-08,
559
+ "logits/chosen": -1.0706030130386353,
560
+ "logits/rejected": -1.0911849737167358,
561
+ "logps/chosen": -337.18853759765625,
562
+ "logps/rejected": -322.5784912109375,
563
+ "loss": 0.6748550415039063,
564
+ "rewards/accuracies": 0.625,
565
+ "rewards/chosen": 0.5078705549240112,
566
+ "rewards/margins": 0.05118296295404434,
567
+ "rewards/rejected": 0.4566876292228699,
568
+ "step": 340
569
+ },
570
+ {
571
+ "epoch": 2.431304347826087,
572
+ "grad_norm": 60.5,
573
+ "learning_rate": 5.4361962777179273e-08,
574
+ "logits/chosen": -1.0652002096176147,
575
+ "logits/rejected": -1.0517784357070923,
576
+ "logps/chosen": -330.02325439453125,
577
+ "logps/rejected": -326.01873779296875,
578
+ "loss": 0.691571044921875,
579
+ "rewards/accuracies": 0.5687500238418579,
580
+ "rewards/chosen": 0.46054545044898987,
581
+ "rewards/margins": 0.012338491156697273,
582
+ "rewards/rejected": 0.44820699095726013,
583
+ "step": 350
584
+ },
585
+ {
586
+ "epoch": 2.5008695652173913,
587
+ "grad_norm": 51.75,
588
+ "learning_rate": 4.2414171979824e-08,
589
+ "logits/chosen": -1.0523455142974854,
590
+ "logits/rejected": -1.0446560382843018,
591
+ "logps/chosen": -336.7765197753906,
592
+ "logps/rejected": -316.00067138671875,
593
+ "loss": 0.6638113975524902,
594
+ "rewards/accuracies": 0.6625000238418579,
595
+ "rewards/chosen": 0.46467241644859314,
596
+ "rewards/margins": 0.06902624666690826,
597
+ "rewards/rejected": 0.39564621448516846,
598
+ "step": 360
599
+ },
600
+ {
601
+ "epoch": 2.5704347826086957,
602
+ "grad_norm": 49.0,
603
+ "learning_rate": 3.1826562213243844e-08,
604
+ "logits/chosen": -1.0767319202423096,
605
+ "logits/rejected": -1.0776704549789429,
606
+ "logps/chosen": -329.4648742675781,
607
+ "logps/rejected": -319.5013732910156,
608
+ "loss": 0.6690452575683594,
609
+ "rewards/accuracies": 0.5687500238418579,
610
+ "rewards/chosen": 0.46328702569007874,
611
+ "rewards/margins": 0.061441998928785324,
612
+ "rewards/rejected": 0.40184497833251953,
613
+ "step": 370
614
+ },
615
+ {
616
+ "epoch": 2.64,
617
+ "grad_norm": 56.5,
618
+ "learning_rate": 2.2668507509871953e-08,
619
+ "logits/chosen": -1.1089640855789185,
620
+ "logits/rejected": -1.0823930501937866,
621
+ "logps/chosen": -328.79351806640625,
622
+ "logps/rejected": -311.1128845214844,
623
+ "loss": 0.645816707611084,
624
+ "rewards/accuracies": 0.675000011920929,
625
+ "rewards/chosen": 0.4771113991737366,
626
+ "rewards/margins": 0.11158710718154907,
627
+ "rewards/rejected": 0.3655242323875427,
628
+ "step": 380
629
+ },
630
+ {
631
+ "epoch": 2.7095652173913045,
632
+ "grad_norm": 51.0,
633
+ "learning_rate": 1.5000014915493465e-08,
634
+ "logits/chosen": -1.1323436498641968,
635
+ "logits/rejected": -1.1117795705795288,
636
+ "logps/chosen": -316.7195739746094,
637
+ "logps/rejected": -307.0235900878906,
638
+ "loss": 0.6755961894989013,
639
+ "rewards/accuracies": 0.581250011920929,
640
+ "rewards/chosen": 0.45080599188804626,
641
+ "rewards/margins": 0.048326827585697174,
642
+ "rewards/rejected": 0.4024791717529297,
643
+ "step": 390
644
+ },
645
+ {
646
+ "epoch": 2.7791304347826085,
647
+ "grad_norm": 77.0,
648
+ "learning_rate": 8.871331300335321e-09,
649
+ "logits/chosen": -1.0839048624038696,
650
+ "logits/rejected": -1.0926908254623413,
651
+ "logps/chosen": -324.00946044921875,
652
+ "logps/rejected": -318.5250549316406,
653
+ "loss": 0.6874616622924805,
654
+ "rewards/accuracies": 0.5562499761581421,
655
+ "rewards/chosen": 0.45788002014160156,
656
+ "rewards/margins": 0.025683864951133728,
657
+ "rewards/rejected": 0.4321961998939514,
658
+ "step": 400
659
+ },
660
+ {
661
+ "epoch": 2.7791304347826085,
662
+ "eval_logits/chosen": -1.059512972831726,
663
+ "eval_logits/rejected": -1.0796120166778564,
664
+ "eval_logps/chosen": -315.1502990722656,
665
+ "eval_logps/rejected": -316.1904296875,
666
+ "eval_loss": 0.6914414763450623,
667
+ "eval_rewards/accuracies": 0.5322580933570862,
668
+ "eval_rewards/chosen": 0.4315947890281677,
669
+ "eval_rewards/margins": 0.01160100195556879,
670
+ "eval_rewards/rejected": 0.4199938178062439,
671
+ "eval_runtime": 11.25,
672
+ "eval_samples_per_second": 10.756,
673
+ "eval_steps_per_second": 2.756,
674
+ "step": 400
675
+ },
676
+ {
677
+ "epoch": 2.8486956521739133,
678
+ "grad_norm": 41.75,
679
+ "learning_rate": 4.3226141225268796e-09,
680
+ "logits/chosen": -1.0775182247161865,
681
+ "logits/rejected": -1.0758936405181885,
682
+ "logps/chosen": -314.0296325683594,
683
+ "logps/rejected": -312.0280456542969,
684
+ "loss": 0.6733474254608154,
685
+ "rewards/accuracies": 0.574999988079071,
686
+ "rewards/chosen": 0.450728178024292,
687
+ "rewards/margins": 0.04884380102157593,
688
+ "rewards/rejected": 0.40188440680503845,
689
+ "step": 410
690
+ },
691
+ {
692
+ "epoch": 2.9182608695652172,
693
+ "grad_norm": 47.25,
694
+ "learning_rate": 1.383668301212393e-09,
695
+ "logits/chosen": -1.0620819330215454,
696
+ "logits/rejected": -1.0514990091323853,
697
+ "logps/chosen": -332.62652587890625,
698
+ "logps/rejected": -328.03424072265625,
699
+ "loss": 0.6754622459411621,
700
+ "rewards/accuracies": 0.574999988079071,
701
+ "rewards/chosen": 0.459503710269928,
702
+ "rewards/margins": 0.04998493567109108,
703
+ "rewards/rejected": 0.4095187783241272,
704
+ "step": 420
705
+ },
706
+ {
707
+ "epoch": 2.9878260869565216,
708
+ "grad_norm": 52.25,
709
+ "learning_rate": 7.375092342298828e-11,
710
+ "logits/chosen": -1.0443447828292847,
711
+ "logits/rejected": -1.0698919296264648,
712
+ "logps/chosen": -325.37957763671875,
713
+ "logps/rejected": -314.8664245605469,
714
+ "loss": 0.6853389263153076,
715
+ "rewards/accuracies": 0.5625,
716
+ "rewards/chosen": 0.4191187918186188,
717
+ "rewards/margins": 0.025067320093512535,
718
+ "rewards/rejected": 0.3940514922142029,
719
+ "step": 430
720
+ }
721
+ ],
722
+ "logging_steps": 10,
723
+ "max_steps": 432,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 3,
726
+ "save_steps": 100,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": true,
734
+ "should_training_stop": true
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 0.0,
740
+ "train_batch_size": 4,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
checkpoint-432/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b05394a09844ad8fbc5bf2ac3f5b6be474605707ae7d97f68562a3f9312197c4
3
+ size 6225
final_model/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
final_model/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Lfm2ForCausalLM"
4
+ ],
5
+ "block_auto_adjust_ff_dim": true,
6
+ "block_dim": 2048,
7
+ "block_ff_dim": 12288,
8
+ "block_ffn_dim_multiplier": 1.0,
9
+ "block_mlp_init_scale": 1.0,
10
+ "block_multiple_of": 256,
11
+ "block_norm_eps": 1e-05,
12
+ "block_out_init_scale": 1.0,
13
+ "block_use_swiglu": true,
14
+ "block_use_xavier_init": true,
15
+ "bos_token_id": 1,
16
+ "conv_L_cache": 3,
17
+ "conv_bias": false,
18
+ "conv_dim": 2048,
19
+ "conv_use_xavier_init": true,
20
+ "dtype": "bfloat16",
21
+ "eos_token_id": 7,
22
+ "hidden_size": 2048,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 12288,
25
+ "layer_types": [
26
+ "conv",
27
+ "conv",
28
+ "full_attention",
29
+ "conv",
30
+ "conv",
31
+ "full_attention",
32
+ "conv",
33
+ "conv",
34
+ "full_attention",
35
+ "conv",
36
+ "full_attention",
37
+ "conv",
38
+ "full_attention",
39
+ "conv",
40
+ "full_attention",
41
+ "conv"
42
+ ],
43
+ "max_position_embeddings": 128000,
44
+ "model_type": "lfm2",
45
+ "norm_eps": 1e-05,
46
+ "num_attention_heads": 32,
47
+ "num_heads": 32,
48
+ "num_hidden_layers": 16,
49
+ "num_key_value_heads": 8,
50
+ "pad_token_id": 0,
51
+ "rope_parameters": {
52
+ "rope_theta": 1000000.0,
53
+ "rope_type": "default"
54
+ },
55
+ "tie_embedding": true,
56
+ "tie_word_embeddings": true,
57
+ "transformers_version": "5.2.0",
58
+ "use_cache": false,
59
+ "use_pos_enc": true,
60
+ "vocab_size": 65536
61
+ }
final_model/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 7
6
+ ],
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.2.0"
9
+ }
final_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea8a8b6c80f8066d94803546515de24808278c0cfb7efff8e029129e40dd94ff
3
+ size 2340697936
final_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
final_model/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "padding_side": "right",
15
+ "sp_model_kwargs": {},
16
+ "spaces_between_special_tokens": false,
17
+ "tokenizer_class": "TokenizersBackend",
18
+ "use_default_system_prompt": false,
19
+ "use_fast": true
20
+ }
final_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b05394a09844ad8fbc5bf2ac3f5b6be474605707ae7d97f68562a3f9312197c4
3
+ size 6225
run_meta.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_name": "dpo_fft_LFM2.5-1.2B-Instruct_argilla__distilabel-math-preference-dpo_20260222_195126",
3
+ "model": "LiquidAI/LFM2.5-1.2B-Instruct",
4
+ "dataset": "argilla/distilabel-math-preference-dpo",
5
+ "timestamp": "20260222_195126",
6
+ "args": {
7
+ "dataset": "argilla/distilabel-math-preference-dpo",
8
+ "dataset_split": "train",
9
+ "instruction_col": null,
10
+ "chosen_col": null,
11
+ "rejected_col": null,
12
+ "max_samples": null,
13
+ "seed": 42,
14
+ "model_name": "LiquidAI/LFM2.5-1.2B-Instruct",
15
+ "ref_4bit": false,
16
+ "num_epochs": 3,
17
+ "batch_size": 4,
18
+ "grad_accum": 4,
19
+ "learning_rate": 5e-07,
20
+ "beta": 0.1,
21
+ "max_length": 1024,
22
+ "max_prompt_length": 512,
23
+ "warmup_ratio": 0.1,
24
+ "optim": "paged_adamw_8bit",
25
+ "logging_steps": 10,
26
+ "save_steps": 100,
27
+ "eval_ratio": 0.05,
28
+ "output_dir": "models",
29
+ "run_name": null
30
+ },
31
+ "train_metrics": {
32
+ "train_runtime": 1608.535,
33
+ "train_samples_per_second": 4.284,
34
+ "train_steps_per_second": 0.269,
35
+ "total_flos": 0.0,
36
+ "train_loss": 0.6827363446354866,
37
+ "epoch": 3.0
38
+ }
39
+ }