saranshagarwal2020 committed on
Commit
aedb0b7
·
verified ·
1 Parent(s): 2ba9426

Upload folder using huggingface_hub

Browse files
DPO_HH_final_model/chat_template.jinja ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token -}}
2
+ {%- set keep_past_thinking = keep_past_thinking | default(false) -%}
3
+ {%- set ns = namespace(system_prompt="") -%}
4
+ {%- if messages[0]["role"] == "system" -%}
5
+ {%- set ns.system_prompt = messages[0]["content"] -%}
6
+ {%- set messages = messages[1:] -%}
7
+ {%- endif -%}
8
+ {%- if tools -%}
9
+ {%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
10
+ {%- for tool in tools -%}
11
+ {%- if tool is not string -%}
12
+ {%- set tool = tool | tojson -%}
13
+ {%- endif -%}
14
+ {%- set ns.system_prompt = ns.system_prompt + tool -%}
15
+ {%- if not loop.last -%}
16
+ {%- set ns.system_prompt = ns.system_prompt + ", " -%}
17
+ {%- endif -%}
18
+ {%- endfor -%}
19
+ {%- set ns.system_prompt = ns.system_prompt + "]" -%}
20
+ {%- endif -%}
21
+ {%- if ns.system_prompt -%}
22
+ {{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
23
+ {%- endif -%}
24
+ {%- set ns.last_assistant_index = -1 -%}
25
+ {%- for message in messages -%}
26
+ {%- if message["role"] == "assistant" -%}
27
+ {%- set ns.last_assistant_index = loop.index0 -%}
28
+ {%- endif -%}
29
+ {%- endfor -%}
30
+ {%- for message in messages -%}
31
+ {{- "<|im_start|>" + message["role"] + "\n" -}}
32
+ {%- set content = message["content"] -%}
33
+ {%- if content is not string -%}
34
+ {%- set content = content | tojson -%}
35
+ {%- endif -%}
36
+ {%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
37
+ {%- if "</think>" in content -%}
38
+ {%- set content = content.split("</think>")[-1] | trim -%}
39
+ {%- endif -%}
40
+ {%- endif -%}
41
+ {{- content + "<|im_end|>\n" -}}
42
+ {%- endfor -%}
43
+ {%- if add_generation_prompt -%}
44
+ {{- "<|im_start|>assistant\n" -}}
45
+ {%- endif -%}
DPO_HH_final_model/config.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Lfm2ForCausalLM"
4
+ ],
5
+ "block_auto_adjust_ff_dim": true,
6
+ "block_dim": 2048,
7
+ "block_ff_dim": 12288,
8
+ "block_ffn_dim_multiplier": 1.0,
9
+ "block_mlp_init_scale": 1.0,
10
+ "block_multiple_of": 256,
11
+ "block_norm_eps": 1e-05,
12
+ "block_out_init_scale": 1.0,
13
+ "block_use_swiglu": true,
14
+ "block_use_xavier_init": true,
15
+ "bos_token_id": 1,
16
+ "conv_L_cache": 3,
17
+ "conv_bias": false,
18
+ "conv_dim": 2048,
19
+ "conv_use_xavier_init": true,
20
+ "dtype": "bfloat16",
21
+ "eos_token_id": 7,
22
+ "hidden_size": 2048,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 12288,
25
+ "layer_types": [
26
+ "conv",
27
+ "conv",
28
+ "full_attention",
29
+ "conv",
30
+ "conv",
31
+ "full_attention",
32
+ "conv",
33
+ "conv",
34
+ "full_attention",
35
+ "conv",
36
+ "full_attention",
37
+ "conv",
38
+ "full_attention",
39
+ "conv",
40
+ "full_attention",
41
+ "conv"
42
+ ],
43
+ "max_position_embeddings": 128000,
44
+ "model_type": "lfm2",
45
+ "norm_eps": 1e-05,
46
+ "num_attention_heads": 32,
47
+ "num_heads": 32,
48
+ "num_hidden_layers": 16,
49
+ "num_key_value_heads": 8,
50
+ "pad_token_id": 0,
51
+ "rope_parameters": {
52
+ "rope_theta": 1000000.0,
53
+ "rope_type": "default"
54
+ },
55
+ "tie_embedding": true,
56
+ "tie_word_embeddings": true,
57
+ "transformers_version": "5.2.0",
58
+ "use_cache": false,
59
+ "use_pos_enc": true,
60
+ "vocab_size": 65536
61
+ }
DPO_HH_final_model/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": [
5
+ 7
6
+ ],
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.2.0"
9
+ }
DPO_HH_final_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2bcb3974ce453f6f0c9006aaa302a92fde9c90dbf466b238f163984c1471cd1
3
+ size 2340697936
DPO_HH_final_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
DPO_HH_final_model/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|startoftext|>",
4
+ "clean_up_tokenization_spaces": false,
5
+ "eos_token": "<|im_end|>",
6
+ "is_local": false,
7
+ "legacy": false,
8
+ "model_input_names": [
9
+ "input_ids",
10
+ "attention_mask"
11
+ ],
12
+ "model_max_length": 1000000000000000019884624838656,
13
+ "pad_token": "<|pad|>",
14
+ "padding_side": "right",
15
+ "sp_model_kwargs": {},
16
+ "spaces_between_special_tokens": false,
17
+ "tokenizer_class": "TokenizersBackend",
18
+ "use_default_system_prompt": false,
19
+ "use_fast": true
20
+ }
DPO_HH_final_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:983ea1c33ce5a577d9c542e6b3b45bd3b96cdc7a8f294566b89031fb84da6786
3
+ size 6225
DPO_HH_final_model/training_hh.txt ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ========================================
2
+ DPO Full Fine-Tuning
3
+ ========================================
4
+ Model : LiquidAI/LFM2.5-1.2B-Instruct
5
+ Dataset : Anthropic/hh-rlhf (data_dir=helpful-base)
6
+ Epochs : 1
7
+ Batch size : 5 (grad_accum=4, eff=20)
8
+ Learning rate : 2e-6
9
+ DPO beta : 0.2
10
+ Reference : NF4 4-bit (pass --no_ref_4bit for bfloat16)
11
+ Output dir : models
12
+ ========================================
13
+
14
+ [dpo_train] Run : dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653
15
+ [dpo_train] Output : models/dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653
16
+ [dpo_train] Loading dataset: Anthropic/hh-rlhf split=train data_dir=helpful-base
17
+ [dpo_train] Full size : 43,835 rows | columns: ['chosen', 'rejected']
18
+ [dpo_train] Format : hh-rlhf (full conversation strings)
19
+ [dpo_train] After cleaning: 43,785 rows
20
+ [dpo_train] Train: 41,595 Eval: 2,190
21
+ [dpo_train] Loading policy model (bfloat16, trainable) …
22
+ [dpo_train] Loading reference model (bfloat16, frozen) …
23
+ [dpo_train] Policy params : 1170M (all trainable)
24
+
25
+ [dpo_train] Starting DPO full fine-tuning (epochs=1 eff_batch=20) …
26
+
27
+ {'loss': '0.6883', 'grad_norm': '31', 'learning_rate': '8.654e-08', 'rewards/chosen': '-0.0005825', 'rewards/rejected': '-0.01204', 'rewards/accuracies': '0.41', 'rewards/margins': '0.01146', 'logps/chosen': '-122.7', 'logps/rejected': '-116.7', 'logits/chosen': '-0.9475', 'logits/rejected': '-0.9243', 'epoch': '0.004808'}
28
+ {'loss': '0.6935', 'grad_norm': '27.38', 'learning_rate': '1.827e-07', 'rewards/chosen': '0.002688', 'rewards/rejected': '0.001497', 'rewards/accuracies': '0.54', 'rewards/margins': '0.001191', 'logps/chosen': '-129.2', 'logps/rejected': '-122.4', 'logits/chosen': '-0.9333', 'logits/rejected': '-0.8879', 'epoch': '0.009617'}
29
+ {'loss': '0.6904', 'grad_norm': '25.5', 'learning_rate': '2.788e-07', 'rewards/chosen': '0.006291', 'rewards/rejected': '-0.0009644', 'rewards/accuracies': '0.49', 'rewards/margins': '0.007255', 'logps/chosen': '-138', 'logps/rejected': '-119.7', 'logits/chosen': '-0.9096', 'logits/rejected': '-0.9259', 'epoch': '0.01442'}
30
+ {'loss': '0.6963', 'grad_norm': '28.5', 'learning_rate': '3.75e-07', 'rewards/chosen': '0.002371', 'rewards/rejected': '0.006851', 'rewards/accuracies': '0.465', 'rewards/margins': '-0.004481', 'logps/chosen': '-125', 'logps/rejected': '-115.7', 'logits/chosen': '-0.9426', 'logits/rejected': '-0.8451', 'epoch': '0.01923'}
31
+ {'loss': '0.6955', 'grad_norm': '30.88', 'learning_rate': '4.712e-07', 'rewards/chosen': '0.006614', 'rewards/rejected': '0.009299', 'rewards/accuracies': '0.51', 'rewards/margins': '-0.002685', 'logps/chosen': '-137.1', 'logps/rejected': '-110.3', 'logits/chosen': '-0.8802', 'logits/rejected': '-0.9408', 'epoch': '0.02404'}
32
+ {'loss': '0.6893', 'grad_norm': '31.75', 'learning_rate': '5.673e-07', 'rewards/chosen': '0.01269', 'rewards/rejected': '0.003417', 'rewards/accuracies': '0.545', 'rewards/margins': '0.009273', 'logps/chosen': '-134.3', 'logps/rejected': '-107.8', 'logits/chosen': '-0.8803', 'logits/rejected': '-0.8356', 'epoch': '0.02885'}
33
+ {'loss': '0.6908', 'grad_norm': '31.88', 'learning_rate': '6.635e-07', 'rewards/chosen': '0.01333', 'rewards/rejected': '0.006563', 'rewards/accuracies': '0.54', 'rewards/margins': '0.006767', 'logps/chosen': '-139.3', 'logps/rejected': '-108.5', 'logits/chosen': '-0.9105', 'logits/rejected': '-0.8774', 'epoch': '0.03366'}
34
+ {'loss': '0.6926', 'grad_norm': '29.62', 'learning_rate': '7.596e-07', 'rewards/chosen': '0.02761', 'rewards/rejected': '0.02442', 'rewards/accuracies': '0.515', 'rewards/margins': '0.003186', 'logps/chosen': '-125.4', 'logps/rejected': '-118', 'logits/chosen': '-0.9279', 'logits/rejected': '-0.9228', 'epoch': '0.03847'}
35
+ {'loss': '0.6868', 'grad_norm': '33.25', 'learning_rate': '8.558e-07', 'rewards/chosen': '0.04787', 'rewards/rejected': '0.03277', 'rewards/accuracies': '0.49', 'rewards/margins': '0.01509', 'logps/chosen': '-136.4', 'logps/rejected': '-119.5', 'logits/chosen': '-0.9487', 'logits/rejected': '-0.961', 'epoch': '0.04327'}
36
+ {'loss': '0.6894', 'grad_norm': '23', 'learning_rate': '9.519e-07', 'rewards/chosen': '0.04472', 'rewards/rejected': '0.03485', 'rewards/accuracies': '0.55', 'rewards/margins': '0.009868', 'logps/chosen': '-138.3', 'logps/rejected': '-118.5', 'logits/chosen': '-0.9422', 'logits/rejected': '-0.8888', 'epoch': '0.04808'}
37
+ {'eval_loss': '0.6858', 'eval_runtime': '161.1', 'eval_samples_per_second': '13.59', 'eval_steps_per_second': '2.719', 'eval_rewards/chosen': '0.05564', 'eval_rewards/rejected': '0.03785', 'eval_rewards/accuracies': '0.5694', 'eval_rewards/margins': '0.01779', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-112.5', 'eval_logits/chosen': '-0.9387', 'eval_logits/rejected': '-0.93', 'epoch': '0.04808'}
38
+ {'loss': '0.6917', 'grad_norm': '31.38', 'learning_rate': '1.048e-06', 'rewards/chosen': '0.0569', 'rewards/rejected': '0.05051', 'rewards/accuracies': '0.47', 'rewards/margins': '0.006385', 'logps/chosen': '-129.2', 'logps/rejected': '-117.6', 'logits/chosen': '-0.9519', 'logits/rejected': '-0.8707', 'epoch': '0.05289'}
39
+ {'loss': '0.6867', 'grad_norm': '27.5', 'learning_rate': '1.144e-06', 'rewards/chosen': '0.07523', 'rewards/rejected': '0.05881', 'rewards/accuracies': '0.525', 'rewards/margins': '0.01642', 'logps/chosen': '-121', 'logps/rejected': '-109.4', 'logits/chosen': '-0.9517', 'logits/rejected': '-0.8928', 'epoch': '0.0577'}
40
+ {'loss': '0.6854', 'grad_norm': '30.88', 'learning_rate': '1.24e-06', 'rewards/chosen': '0.09279', 'rewards/rejected': '0.07028', 'rewards/accuracies': '0.58', 'rewards/margins': '0.02251', 'logps/chosen': '-134.2', 'logps/rejected': '-113.6', 'logits/chosen': '-0.88', 'logits/rejected': '-0.8708', 'epoch': '0.06251'}
41
+ {'loss': '0.6779', 'grad_norm': '31.38', 'learning_rate': '1.337e-06', 'rewards/chosen': '0.114', 'rewards/rejected': '0.07756', 'rewards/accuracies': '0.57', 'rewards/margins': '0.03639', 'logps/chosen': '-140.8', 'logps/rejected': '-134', 'logits/chosen': '-0.92', 'logits/rejected': '-0.8675', 'epoch': '0.06732'}
42
+ {'loss': '0.6759', 'grad_norm': '36.5', 'learning_rate': '1.433e-06', 'rewards/chosen': '0.1178', 'rewards/rejected': '0.07662', 'rewards/accuracies': '0.57', 'rewards/margins': '0.04123', 'logps/chosen': '-134.7', 'logps/rejected': '-100.4', 'logits/chosen': '-0.9678', 'logits/rejected': '-0.9473', 'epoch': '0.07212'}
43
+ {'loss': '0.6826', 'grad_norm': '28.38', 'learning_rate': '1.529e-06', 'rewards/chosen': '0.1401', 'rewards/rejected': '0.1108', 'rewards/accuracies': '0.56', 'rewards/margins': '0.02931', 'logps/chosen': '-128.9', 'logps/rejected': '-113.4', 'logits/chosen': '-0.9077', 'logits/rejected': '-0.934', 'epoch': '0.07693'}
44
+ {'loss': '0.6703', 'grad_norm': '25', 'learning_rate': '1.625e-06', 'rewards/chosen': '0.1574', 'rewards/rejected': '0.1022', 'rewards/accuracies': '0.575', 'rewards/margins': '0.05519', 'logps/chosen': '-136.8', 'logps/rejected': '-104.9', 'logits/chosen': '-0.9116', 'logits/rejected': '-0.9714', 'epoch': '0.08174'}
45
+ {'loss': '0.6654', 'grad_norm': '23.5', 'learning_rate': '1.721e-06', 'rewards/chosen': '0.1637', 'rewards/rejected': '0.09603', 'rewards/accuracies': '0.59', 'rewards/margins': '0.06763', 'logps/chosen': '-143.8', 'logps/rejected': '-112.8', 'logits/chosen': '-0.9118', 'logits/rejected': '-0.8706', 'epoch': '0.08655'}
46
+ {'loss': '0.6712', 'grad_norm': '23.75', 'learning_rate': '1.817e-06', 'rewards/chosen': '0.1769', 'rewards/rejected': '0.1167', 'rewards/accuracies': '0.61', 'rewards/margins': '0.06021', 'logps/chosen': '-132.5', 'logps/rejected': '-116.7', 'logits/chosen': '-0.865', 'logits/rejected': '-0.8988', 'epoch': '0.09136'}
47
+ {'loss': '0.6669', 'grad_norm': '31.62', 'learning_rate': '1.913e-06', 'rewards/chosen': '0.207', 'rewards/rejected': '0.135', 'rewards/accuracies': '0.64', 'rewards/margins': '0.07194', 'logps/chosen': '-144.9', 'logps/rejected': '-110.8', 'logits/chosen': '-0.889', 'logits/rejected': '-0.9204', 'epoch': '0.09617'}
48
+ {'eval_loss': '0.6687', 'eval_runtime': '159.7', 'eval_samples_per_second': '13.71', 'eval_steps_per_second': '2.743', 'eval_rewards/chosen': '0.2023', 'eval_rewards/rejected': '0.1295', 'eval_rewards/accuracies': '0.5991', 'eval_rewards/margins': '0.07285', 'eval_logps/chosen': '-132.7', 'eval_logps/rejected': '-112', 'eval_logits/chosen': '-0.9474', 'eval_logits/rejected': '-0.941', 'epoch': '0.09617'}
49
+ {'loss': '0.6711', 'grad_norm': '29.62', 'learning_rate': '2e-06', 'rewards/chosen': '0.1896', 'rewards/rejected': '0.1194', 'rewards/accuracies': '0.57', 'rewards/margins': '0.07017', 'logps/chosen': '-133', 'logps/rejected': '-120.2', 'logits/chosen': '-0.8696', 'logits/rejected': '-0.874', 'epoch': '0.101'}
50
+ {'loss': '0.6657', 'grad_norm': '26.75', 'learning_rate': '2e-06', 'rewards/chosen': '0.1632', 'rewards/rejected': '0.07528', 'rewards/accuracies': '0.625', 'rewards/margins': '0.08788', 'logps/chosen': '-123.4', 'logps/rejected': '-113.4', 'logits/chosen': '-0.9532', 'logits/rejected': '-0.9136', 'epoch': '0.1058'}
51
+ {'loss': '0.6611', 'grad_norm': '27.62', 'learning_rate': '1.999e-06', 'rewards/chosen': '0.1101', 'rewards/rejected': '0.02324', 'rewards/accuracies': '0.655', 'rewards/margins': '0.08688', 'logps/chosen': '-124.4', 'logps/rejected': '-107.2', 'logits/chosen': '-0.9392', 'logits/rejected': '-0.9295', 'epoch': '0.1106'}
52
+ {'loss': '0.6415', 'grad_norm': '28.62', 'learning_rate': '1.999e-06', 'rewards/chosen': '0.0739', 'rewards/rejected': '-0.05723', 'rewards/accuracies': '0.64', 'rewards/margins': '0.1311', 'logps/chosen': '-127', 'logps/rejected': '-106.9', 'logits/chosen': '-0.9826', 'logits/rejected': '-0.9332', 'epoch': '0.1154'}
53
+ {'loss': '0.6554', 'grad_norm': '30.25', 'learning_rate': '1.998e-06', 'rewards/chosen': '0.05428', 'rewards/rejected': '-0.05029', 'rewards/accuracies': '0.63', 'rewards/margins': '0.1046', 'logps/chosen': '-144.8', 'logps/rejected': '-117.7', 'logits/chosen': '-0.9814', 'logits/rejected': '-0.9672', 'epoch': '0.1202'}
54
+ {'loss': '0.6437', 'grad_norm': '25.12', 'learning_rate': '1.996e-06', 'rewards/chosen': '0.08971', 'rewards/rejected': '-0.04912', 'rewards/accuracies': '0.625', 'rewards/margins': '0.1388', 'logps/chosen': '-131.5', 'logps/rejected': '-112.1', 'logits/chosen': '-1.002', 'logits/rejected': '-0.9217', 'epoch': '0.125'}
55
+ {'loss': '0.6711', 'grad_norm': '27.75', 'learning_rate': '1.995e-06', 'rewards/chosen': '0.1214', 'rewards/rejected': '0.0422', 'rewards/accuracies': '0.58', 'rewards/margins': '0.0792', 'logps/chosen': '-132.4', 'logps/rejected': '-118.9', 'logits/chosen': '-0.9889', 'logits/rejected': '-0.952', 'epoch': '0.1298'}
56
+ {'loss': '0.6558', 'grad_norm': '22.75', 'learning_rate': '1.993e-06', 'rewards/chosen': '0.1462', 'rewards/rejected': '0.03874', 'rewards/accuracies': '0.675', 'rewards/margins': '0.1075', 'logps/chosen': '-121.1', 'logps/rejected': '-111.4', 'logits/chosen': '-0.9636', 'logits/rejected': '-0.9747', 'epoch': '0.1346'}
57
+ {'loss': '0.6358', 'grad_norm': '24.5', 'learning_rate': '1.991e-06', 'rewards/chosen': '0.1654', 'rewards/rejected': '0.005945', 'rewards/accuracies': '0.68', 'rewards/margins': '0.1595', 'logps/chosen': '-130.7', 'logps/rejected': '-105.6', 'logits/chosen': '-0.9287', 'logits/rejected': '-0.8992', 'epoch': '0.1394'}
58
+ {'loss': '0.6442', 'grad_norm': '28.25', 'learning_rate': '1.988e-06', 'rewards/chosen': '0.2266', 'rewards/rejected': '0.06751', 'rewards/accuracies': '0.635', 'rewards/margins': '0.1591', 'logps/chosen': '-141.6', 'logps/rejected': '-103.6', 'logits/chosen': '-1.063', 'logits/rejected': '-0.9664', 'epoch': '0.1442'}
59
+ {'eval_loss': '0.6489', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.2131', 'eval_rewards/rejected': '0.06933', 'eval_rewards/accuracies': '0.6224', 'eval_rewards/margins': '0.1438', 'eval_logps/chosen': '-132.6', 'eval_logps/rejected': '-112.3', 'eval_logits/chosen': '-0.9661', 'eval_logits/rejected': '-0.9621', 'epoch': '0.1442'}
60
+ {'loss': '0.6499', 'grad_norm': '26.88', 'learning_rate': '1.986e-06', 'rewards/chosen': '0.2295', 'rewards/rejected': '0.1049', 'rewards/accuracies': '0.68', 'rewards/margins': '0.1246', 'logps/chosen': '-128.6', 'logps/rejected': '-116', 'logits/chosen': '-0.9282', 'logits/rejected': '-0.9179', 'epoch': '0.1491'}
61
+ {'loss': '0.6379', 'grad_norm': '26', 'learning_rate': '1.983e-06', 'rewards/chosen': '0.2347', 'rewards/rejected': '0.06157', 'rewards/accuracies': '0.675', 'rewards/margins': '0.1732', 'logps/chosen': '-137.8', 'logps/rejected': '-102', 'logits/chosen': '-0.977', 'logits/rejected': '-1.001', 'epoch': '0.1539'}
62
+ {'loss': '0.6507', 'grad_norm': '30.12', 'learning_rate': '1.979e-06', 'rewards/chosen': '0.2242', 'rewards/rejected': '0.06374', 'rewards/accuracies': '0.63', 'rewards/margins': '0.1605', 'logps/chosen': '-128.8', 'logps/rejected': '-113.3', 'logits/chosen': '-0.8801', 'logits/rejected': '-0.8927', 'epoch': '0.1587'}
63
+ {'loss': '0.6497', 'grad_norm': '26.38', 'learning_rate': '1.976e-06', 'rewards/chosen': '0.2519', 'rewards/rejected': '0.08665', 'rewards/accuracies': '0.615', 'rewards/margins': '0.1653', 'logps/chosen': '-144.7', 'logps/rejected': '-115.5', 'logits/chosen': '-0.9742', 'logits/rejected': '-0.9811', 'epoch': '0.1635'}
64
+ {'loss': '0.6489', 'grad_norm': '28', 'learning_rate': '1.972e-06', 'rewards/chosen': '0.2025', 'rewards/rejected': '0.04876', 'rewards/accuracies': '0.63', 'rewards/margins': '0.1537', 'logps/chosen': '-144', 'logps/rejected': '-108.9', 'logits/chosen': '-1.025', 'logits/rejected': '-1.036', 'epoch': '0.1683'}
65
+ {'loss': '0.6199', 'grad_norm': '26.5', 'learning_rate': '1.968e-06', 'rewards/chosen': '0.1235', 'rewards/rejected': '-0.08143', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2049', 'logps/chosen': '-138.2', 'logps/rejected': '-115.1', 'logits/chosen': '-0.9685', 'logits/rejected': '-0.9499', 'epoch': '0.1731'}
66
+ {'loss': '0.661', 'grad_norm': '29.38', 'learning_rate': '1.964e-06', 'rewards/chosen': '0.06153', 'rewards/rejected': '-0.06504', 'rewards/accuracies': '0.59', 'rewards/margins': '0.1266', 'logps/chosen': '-136.6', 'logps/rejected': '-119.5', 'logits/chosen': '-0.9926', 'logits/rejected': '-1.001', 'epoch': '0.1779'}
67
+ {'loss': '0.656', 'grad_norm': '29.38', 'learning_rate': '1.959e-06', 'rewards/chosen': '0.07785', 'rewards/rejected': '-0.04319', 'rewards/accuracies': '0.605', 'rewards/margins': '0.121', 'logps/chosen': '-135.2', 'logps/rejected': '-105.1', 'logits/chosen': '-0.9725', 'logits/rejected': '-0.9489', 'epoch': '0.1827'}
68
+ {'loss': '0.615', 'grad_norm': '24.12', 'learning_rate': '1.954e-06', 'rewards/chosen': '0.184', 'rewards/rejected': '-0.04084', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2248', 'logps/chosen': '-138.4', 'logps/rejected': '-106.1', 'logits/chosen': '-0.9518', 'logits/rejected': '-0.9407', 'epoch': '0.1875'}
69
+ {'loss': '0.6265', 'grad_norm': '44.25', 'learning_rate': '1.949e-06', 'rewards/chosen': '0.248', 'rewards/rejected': '0.03399', 'rewards/accuracies': '0.67', 'rewards/margins': '0.214', 'logps/chosen': '-133.5', 'logps/rejected': '-114', 'logits/chosen': '-0.9333', 'logits/rejected': '-0.9424', 'epoch': '0.1923'}
70
+ {'eval_loss': '0.6418', 'eval_runtime': '159.8', 'eval_samples_per_second': '13.71', 'eval_steps_per_second': '2.741', 'eval_rewards/chosen': '0.2418', 'eval_rewards/rejected': '0.0508', 'eval_rewards/accuracies': '0.6356', 'eval_rewards/margins': '0.191', 'eval_logps/chosen': '-132.5', 'eval_logps/rejected': '-112.4', 'eval_logits/chosen': '-0.9695', 'eval_logits/rejected': '-0.9666', 'epoch': '0.1923'}
71
+ {'loss': '0.618', 'grad_norm': '26.12', 'learning_rate': '1.944e-06', 'rewards/chosen': '0.315', 'rewards/rejected': '0.05388', 'rewards/accuracies': '0.7', 'rewards/margins': '0.2612', 'logps/chosen': '-143.2', 'logps/rejected': '-112', 'logits/chosen': '-0.9392', 'logits/rejected': '-0.9602', 'epoch': '0.1971'}
72
+ {'loss': '0.6097', 'grad_norm': '23.75', 'learning_rate': '1.938e-06', 'rewards/chosen': '0.2828', 'rewards/rejected': '0.01982', 'rewards/accuracies': '0.7', 'rewards/margins': '0.263', 'logps/chosen': '-147.5', 'logps/rejected': '-108.1', 'logits/chosen': '-0.8923', 'logits/rejected': '-0.9127', 'epoch': '0.2019'}
73
+ {'loss': '0.6102', 'grad_norm': '30.62', 'learning_rate': '1.932e-06', 'rewards/chosen': '0.3189', 'rewards/rejected': '0.02987', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2891', 'logps/chosen': '-147', 'logps/rejected': '-106.5', 'logits/chosen': '-0.9787', 'logits/rejected': '-0.9898', 'epoch': '0.2068'}
74
+ {'loss': '0.6417', 'grad_norm': '31.12', 'learning_rate': '1.926e-06', 'rewards/chosen': '0.2384', 'rewards/rejected': '0.04542', 'rewards/accuracies': '0.625', 'rewards/margins': '0.193', 'logps/chosen': '-118.7', 'logps/rejected': '-108.7', 'logits/chosen': '-0.9971', 'logits/rejected': '-1.018', 'epoch': '0.2116'}
75
+ {'loss': '0.6385', 'grad_norm': '27.5', 'learning_rate': '1.919e-06', 'rewards/chosen': '0.2713', 'rewards/rejected': '0.0755', 'rewards/accuracies': '0.65', 'rewards/margins': '0.1958', 'logps/chosen': '-140.3', 'logps/rejected': '-109.6', 'logits/chosen': '-1.029', 'logits/rejected': '-0.9416', 'epoch': '0.2164'}
76
+ {'loss': '0.6357', 'grad_norm': '30.12', 'learning_rate': '1.913e-06', 'rewards/chosen': '0.2819', 'rewards/rejected': '0.08405', 'rewards/accuracies': '0.64', 'rewards/margins': '0.1978', 'logps/chosen': '-138.9', 'logps/rejected': '-107.7', 'logits/chosen': '-0.9281', 'logits/rejected': '-0.95', 'epoch': '0.2212'}
77
+ {'loss': '0.6558', 'grad_norm': '28.38', 'learning_rate': '1.906e-06', 'rewards/chosen': '0.2306', 'rewards/rejected': '0.05818', 'rewards/accuracies': '0.6', 'rewards/margins': '0.1724', 'logps/chosen': '-137.8', 'logps/rejected': '-122', 'logits/chosen': '-1.046', 'logits/rejected': '-1.005', 'epoch': '0.226'}
78
+ {'loss': '0.6319', 'grad_norm': '26.75', 'learning_rate': '1.898e-06', 'rewards/chosen': '0.1872', 'rewards/rejected': '-0.02641', 'rewards/accuracies': '0.595', 'rewards/margins': '0.2137', 'logps/chosen': '-121.4', 'logps/rejected': '-113.5', 'logits/chosen': '-0.9336', 'logits/rejected': '-0.9091', 'epoch': '0.2308'}
79
+ {'loss': '0.611', 'grad_norm': '19', 'learning_rate': '1.891e-06', 'rewards/chosen': '0.1955', 'rewards/rejected': '-0.05569', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2512', 'logps/chosen': '-119.2', 'logps/rejected': '-100.1', 'logits/chosen': '-0.9642', 'logits/rejected': '-0.8836', 'epoch': '0.2356'}
80
+ {'loss': '0.6402', 'grad_norm': '24.12', 'learning_rate': '1.883e-06', 'rewards/chosen': '0.1428', 'rewards/rejected': '-0.06168', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2045', 'logps/chosen': '-124.9', 'logps/rejected': '-104.6', 'logits/chosen': '-0.9637', 'logits/rejected': '-0.9667', 'epoch': '0.2404'}
81
+ {'eval_loss': '0.6341', 'eval_runtime': '159.5', 'eval_samples_per_second': '13.73', 'eval_steps_per_second': '2.746', 'eval_rewards/chosen': '0.1508', 'eval_rewards/rejected': '-0.06118', 'eval_rewards/accuracies': '0.6379', 'eval_rewards/margins': '0.212', 'eval_logps/chosen': '-132.9', 'eval_logps/rejected': '-113', 'eval_logits/chosen': '-0.9813', 'eval_logits/rejected': '-0.9792', 'epoch': '0.2404'}
82
+ {'loss': '0.597', 'grad_norm': '35.75', 'learning_rate': '1.875e-06', 'rewards/chosen': '0.1986', 'rewards/rejected': '-0.07326', 'rewards/accuracies': '0.73', 'rewards/margins': '0.2719', 'logps/chosen': '-129.8', 'logps/rejected': '-105.9', 'logits/chosen': '-0.967', 'logits/rejected': '-0.891', 'epoch': '0.2452'}
83
+ {'loss': '0.6392', 'grad_norm': '29.25', 'learning_rate': '1.867e-06', 'rewards/chosen': '0.1554', 'rewards/rejected': '-0.02627', 'rewards/accuracies': '0.635', 'rewards/margins': '0.1817', 'logps/chosen': '-125.5', 'logps/rejected': '-119.1', 'logits/chosen': '-0.9933', 'logits/rejected': '-1.009', 'epoch': '0.25'}
84
+ {'loss': '0.6291', 'grad_norm': '26.62', 'learning_rate': '1.858e-06', 'rewards/chosen': '0.2175', 'rewards/rejected': '-0.01706', 'rewards/accuracies': '0.62', 'rewards/margins': '0.2346', 'logps/chosen': '-133.1', 'logps/rejected': '-109.7', 'logits/chosen': '-0.9084', 'logits/rejected': '-0.9529', 'epoch': '0.2548'}
85
+ {'loss': '0.6334', 'grad_norm': '23.25', 'learning_rate': '1.85e-06', 'rewards/chosen': '0.1353', 'rewards/rejected': '-0.1065', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2418', 'logps/chosen': '-134.8', 'logps/rejected': '-113.9', 'logits/chosen': '-1.018', 'logits/rejected': '-0.9214', 'epoch': '0.2596'}
86
+ {'loss': '0.6322', 'grad_norm': '31.25', 'learning_rate': '1.841e-06', 'rewards/chosen': '0.2059', 'rewards/rejected': '-0.007326', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2132', 'logps/chosen': '-146.8', 'logps/rejected': '-119.6', 'logits/chosen': '-0.9414', 'logits/rejected': '-0.892', 'epoch': '0.2645'}
87
+ {'loss': '0.6363', 'grad_norm': '40.25', 'learning_rate': '1.831e-06', 'rewards/chosen': '0.1914', 'rewards/rejected': '-0.05578', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2472', 'logps/chosen': '-132.1', 'logps/rejected': '-109.6', 'logits/chosen': '-0.9965', 'logits/rejected': '-0.95', 'epoch': '0.2693'}
88
+ {'loss': '0.6188', 'grad_norm': '25.5', 'learning_rate': '1.822e-06', 'rewards/chosen': '0.1997', 'rewards/rejected': '-0.04471', 'rewards/accuracies': '0.675', 'rewards/margins': '0.2445', 'logps/chosen': '-133.6', 'logps/rejected': '-113.7', 'logits/chosen': '-0.9632', 'logits/rejected': '-1.003', 'epoch': '0.2741'}
89
+ {'loss': '0.6543', 'grad_norm': '27.25', 'learning_rate': '1.812e-06', 'rewards/chosen': '0.09105', 'rewards/rejected': '-0.09299', 'rewards/accuracies': '0.63', 'rewards/margins': '0.184', 'logps/chosen': '-119.6', 'logps/rejected': '-107.9', 'logits/chosen': '-0.9797', 'logits/rejected': '-1.013', 'epoch': '0.2789'}
90
+ {'loss': '0.6133', 'grad_norm': '25.38', 'learning_rate': '1.802e-06', 'rewards/chosen': '0.06649', 'rewards/rejected': '-0.1867', 'rewards/accuracies': '0.69', 'rewards/margins': '0.2532', 'logps/chosen': '-124', 'logps/rejected': '-101', 'logits/chosen': '-0.9574', 'logits/rejected': '-0.9465', 'epoch': '0.2837'}
91
+ {'loss': '0.636', 'grad_norm': '26.62', 'learning_rate': '1.792e-06', 'rewards/chosen': '0.07529', 'rewards/rejected': '-0.1398', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2151', 'logps/chosen': '-137.9', 'logps/rejected': '-111', 'logits/chosen': '-1.004', 'logits/rejected': '-0.993', 'epoch': '0.2885'}
92
+ {'eval_loss': '0.6281', 'eval_runtime': '159.3', 'eval_samples_per_second': '13.75', 'eval_steps_per_second': '2.749', 'eval_rewards/chosen': '0.06902', 'eval_rewards/rejected': '-0.1686', 'eval_rewards/accuracies': '0.6411', 'eval_rewards/margins': '0.2376', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.5', 'eval_logits/chosen': '-0.9913', 'eval_logits/rejected': '-0.9898', 'epoch': '0.2885'}
93
+ {'loss': '0.6471', 'grad_norm': '27.62', 'learning_rate': '1.782e-06', 'rewards/chosen': '0.04069', 'rewards/rejected': '-0.1679', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2086', 'logps/chosen': '-127.5', 'logps/rejected': '-113.4', 'logits/chosen': '-0.9642', 'logits/rejected': '-0.8913', 'epoch': '0.2933'}
94
+ {'loss': '0.6378', 'grad_norm': '30.25', 'learning_rate': '1.771e-06', 'rewards/chosen': '0.107', 'rewards/rejected': '-0.1462', 'rewards/accuracies': '0.62', 'rewards/margins': '0.2532', 'logps/chosen': '-141.8', 'logps/rejected': '-110.1', 'logits/chosen': '-1.034', 'logits/rejected': '-1.025', 'epoch': '0.2981'}
95
+ {'loss': '0.6681', 'grad_norm': '23.38', 'learning_rate': '1.761e-06', 'rewards/chosen': '0.1194', 'rewards/rejected': '-0.04729', 'rewards/accuracies': '0.605', 'rewards/margins': '0.1667', 'logps/chosen': '-134.4', 'logps/rejected': '-122', 'logits/chosen': '-0.9662', 'logits/rejected': '-0.9422', 'epoch': '0.3029'}
96
+ {'loss': '0.6354', 'grad_norm': '33.25', 'learning_rate': '1.75e-06', 'rewards/chosen': '0.1558', 'rewards/rejected': '-0.0791', 'rewards/accuracies': '0.59', 'rewards/margins': '0.2349', 'logps/chosen': '-127.1', 'logps/rejected': '-119.7', 'logits/chosen': '-0.9494', 'logits/rejected': '-0.9116', 'epoch': '0.3077'}
97
+ {'loss': '0.6125', 'grad_norm': '25.38', 'learning_rate': '1.738e-06', 'rewards/chosen': '0.1551', 'rewards/rejected': '-0.1041', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2592', 'logps/chosen': '-153.3', 'logps/rejected': '-104.8', 'logits/chosen': '-0.956', 'logits/rejected': '-0.9402', 'epoch': '0.3125'}
98
+ {'loss': '0.6565', 'grad_norm': '24.88', 'learning_rate': '1.727e-06', 'rewards/chosen': '0.08494', 'rewards/rejected': '-0.07927', 'rewards/accuracies': '0.64', 'rewards/margins': '0.1642', 'logps/chosen': '-126.7', 'logps/rejected': '-110.5', 'logits/chosen': '-0.9842', 'logits/rejected': '-0.9274', 'epoch': '0.3173'}
99
+ {'loss': '0.6244', 'grad_norm': '32', 'learning_rate': '1.715e-06', 'rewards/chosen': '0.1263', 'rewards/rejected': '-0.1182', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2445', 'logps/chosen': '-119.4', 'logps/rejected': '-122.2', 'logits/chosen': '-0.9953', 'logits/rejected': '-1.006', 'epoch': '0.3222'}
100
+ {'loss': '0.6332', 'grad_norm': '27.38', 'learning_rate': '1.704e-06', 'rewards/chosen': '0.0588', 'rewards/rejected': '-0.1336', 'rewards/accuracies': '0.65', 'rewards/margins': '0.1924', 'logps/chosen': '-123', 'logps/rejected': '-108.3', 'logits/chosen': '-0.9625', 'logits/rejected': '-0.8965', 'epoch': '0.327'}
101
+ {'loss': '0.6331', 'grad_norm': '24.75', 'learning_rate': '1.692e-06', 'rewards/chosen': '0.1213', 'rewards/rejected': '-0.1225', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2438', 'logps/chosen': '-123.7', 'logps/rejected': '-111', 'logits/chosen': '-1.017', 'logits/rejected': '-0.9669', 'epoch': '0.3318'}
102
+ {'loss': '0.6159', 'grad_norm': '22.25', 'learning_rate': '1.679e-06', 'rewards/chosen': '0.04103', 'rewards/rejected': '-0.2323', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2734', 'logps/chosen': '-119.8', 'logps/rejected': '-105.6', 'logits/chosen': '-0.981', 'logits/rejected': '-0.982', 'epoch': '0.3366'}
103
+ {'eval_loss': '0.6262', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.05219', 'eval_rewards/rejected': '-0.1832', 'eval_rewards/accuracies': '0.6589', 'eval_rewards/margins': '0.2353', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-0.9941', 'eval_logits/rejected': '-0.9926', 'epoch': '0.3366'}
104
+ {'loss': '0.5909', 'grad_norm': '25.75', 'learning_rate': '1.667e-06', 'rewards/chosen': '0.1096', 'rewards/rejected': '-0.1837', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2932', 'logps/chosen': '-137.7', 'logps/rejected': '-119.6', 'logits/chosen': '-0.9429', 'logits/rejected': '-0.9566', 'epoch': '0.3414'}
105
+ {'loss': '0.6095', 'grad_norm': '19.75', 'learning_rate': '1.654e-06', 'rewards/chosen': '0.05275', 'rewards/rejected': '-0.2056', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2583', 'logps/chosen': '-118.9', 'logps/rejected': '-104.4', 'logits/chosen': '-0.9903', 'logits/rejected': '-0.9753', 'epoch': '0.3462'}
106
+ {'loss': '0.6072', 'grad_norm': '39.25', 'learning_rate': '1.642e-06', 'rewards/chosen': '0.05501', 'rewards/rejected': '-0.2174', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2725', 'logps/chosen': '-133.6', 'logps/rejected': '-103.4', 'logits/chosen': '-0.9403', 'logits/rejected': '-0.8617', 'epoch': '0.351'}
107
+ {'loss': '0.6099', 'grad_norm': '28.12', 'learning_rate': '1.629e-06', 'rewards/chosen': '0.106', 'rewards/rejected': '-0.1533', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2593', 'logps/chosen': '-139.5', 'logps/rejected': '-125.9', 'logits/chosen': '-0.9693', 'logits/rejected': '-0.9525', 'epoch': '0.3558'}
108
+ {'loss': '0.626', 'grad_norm': '29.38', 'learning_rate': '1.615e-06', 'rewards/chosen': '0.07547', 'rewards/rejected': '-0.1947', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2702', 'logps/chosen': '-131.4', 'logps/rejected': '-111.5', 'logits/chosen': '-0.948', 'logits/rejected': '-0.9349', 'epoch': '0.3606'}
109
+ {'loss': '0.5866', 'grad_norm': '25.25', 'learning_rate': '1.602e-06', 'rewards/chosen': '0.1291', 'rewards/rejected': '-0.1772', 'rewards/accuracies': '0.7', 'rewards/margins': '0.3063', 'logps/chosen': '-123.3', 'logps/rejected': '-113.3', 'logits/chosen': '-1.019', 'logits/rejected': '-0.9748', 'epoch': '0.3654'}
110
+ {'loss': '0.6346', 'grad_norm': '24.88', 'learning_rate': '1.589e-06', 'rewards/chosen': '0.1312', 'rewards/rejected': '-0.1053', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2365', 'logps/chosen': '-117.5', 'logps/rejected': '-105.8', 'logits/chosen': '-1.056', 'logits/rejected': '-1.041', 'epoch': '0.3702'}
111
+ {'loss': '0.5949', 'grad_norm': '28.12', 'learning_rate': '1.575e-06', 'rewards/chosen': '0.09233', 'rewards/rejected': '-0.2184', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3107', 'logps/chosen': '-136.3', 'logps/rejected': '-124.7', 'logits/chosen': '-0.9485', 'logits/rejected': '-0.9489', 'epoch': '0.375'}
112
+ {'loss': '0.668', 'grad_norm': '35.75', 'learning_rate': '1.561e-06', 'rewards/chosen': '0.01486', 'rewards/rejected': '-0.1466', 'rewards/accuracies': '0.59', 'rewards/margins': '0.1615', 'logps/chosen': '-124.6', 'logps/rejected': '-116.9', 'logits/chosen': '-1.092', 'logits/rejected': '-0.9749', 'epoch': '0.3799'}
113
+ {'loss': '0.6477', 'grad_norm': '28.25', 'learning_rate': '1.547e-06', 'rewards/chosen': '0.03967', 'rewards/rejected': '-0.1497', 'rewards/accuracies': '0.61', 'rewards/margins': '0.1894', 'logps/chosen': '-128.2', 'logps/rejected': '-109.7', 'logits/chosen': '-0.9921', 'logits/rejected': '-1.002', 'epoch': '0.3847'}
114
+ {'eval_loss': '0.6239', 'eval_runtime': '158.8', 'eval_samples_per_second': '13.79', 'eval_steps_per_second': '2.758', 'eval_rewards/chosen': '0.07336', 'eval_rewards/rejected': '-0.1804', 'eval_rewards/accuracies': '0.6548', 'eval_rewards/margins': '0.2538', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.002', 'eval_logits/rejected': '-1.001', 'epoch': '0.3847'}
115
+ {'loss': '0.6338', 'grad_norm': '22.12', 'learning_rate': '1.533e-06', 'rewards/chosen': '0.1184', 'rewards/rejected': '-0.121', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2394', 'logps/chosen': '-129.7', 'logps/rejected': '-123.2', 'logits/chosen': '-1.003', 'logits/rejected': '-0.9961', 'epoch': '0.3895'}
116
+ {'loss': '0.6632', 'grad_norm': '41.25', 'learning_rate': '1.519e-06', 'rewards/chosen': '0.0514', 'rewards/rejected': '-0.1366', 'rewards/accuracies': '0.62', 'rewards/margins': '0.188', 'logps/chosen': '-136.2', 'logps/rejected': '-119.2', 'logits/chosen': '-1.032', 'logits/rejected': '-1.03', 'epoch': '0.3943'}
117
+ {'loss': '0.6371', 'grad_norm': '28.25', 'learning_rate': '1.504e-06', 'rewards/chosen': '0.06569', 'rewards/rejected': '-0.156', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2217', 'logps/chosen': '-131.1', 'logps/rejected': '-117.2', 'logits/chosen': '-1.015', 'logits/rejected': '-1.012', 'epoch': '0.3991'}
118
+ {'loss': '0.604', 'grad_norm': '24.75', 'learning_rate': '1.49e-06', 'rewards/chosen': '0.07572', 'rewards/rejected': '-0.2187', 'rewards/accuracies': '0.74', 'rewards/margins': '0.2945', 'logps/chosen': '-137.4', 'logps/rejected': '-118.2', 'logits/chosen': '-0.9746', 'logits/rejected': '-0.9323', 'epoch': '0.4039'}
119
+ {'loss': '0.6266', 'grad_norm': '30', 'learning_rate': '1.475e-06', 'rewards/chosen': '0.1121', 'rewards/rejected': '-0.1312', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2433', 'logps/chosen': '-135.9', 'logps/rejected': '-116', 'logits/chosen': '-0.9413', 'logits/rejected': '-1.037', 'epoch': '0.4087'}
120
+ {'loss': '0.6456', 'grad_norm': '30.62', 'learning_rate': '1.46e-06', 'rewards/chosen': '-0.003546', 'rewards/rejected': '-0.2196', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2161', 'logps/chosen': '-123.7', 'logps/rejected': '-107.6', 'logits/chosen': '-1.004', 'logits/rejected': '-1.041', 'epoch': '0.4135'}
121
+ {'loss': '0.6239', 'grad_norm': '15.88', 'learning_rate': '1.445e-06', 'rewards/chosen': '0.07761', 'rewards/rejected': '-0.178', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2556', 'logps/chosen': '-124.6', 'logps/rejected': '-98.34', 'logits/chosen': '-0.9579', 'logits/rejected': '-0.9283', 'epoch': '0.4183'}
122
+ {'loss': '0.6506', 'grad_norm': '34.5', 'learning_rate': '1.43e-06', 'rewards/chosen': '0.1127', 'rewards/rejected': '-0.07568', 'rewards/accuracies': '0.635', 'rewards/margins': '0.1883', 'logps/chosen': '-125.6', 'logps/rejected': '-113.9', 'logits/chosen': '-0.9883', 'logits/rejected': '-1.021', 'epoch': '0.4231'}
123
+ {'loss': '0.5877', 'grad_norm': '23.5', 'learning_rate': '1.415e-06', 'rewards/chosen': '0.1939', 'rewards/rejected': '-0.1659', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3598', 'logps/chosen': '-144.3', 'logps/rejected': '-104.2', 'logits/chosen': '-0.9698', 'logits/rejected': '-0.9483', 'epoch': '0.4279'}
124
+ {'loss': '0.5945', 'grad_norm': '23.62', 'learning_rate': '1.4e-06', 'rewards/chosen': '0.2027', 'rewards/rejected': '-0.1275', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3301', 'logps/chosen': '-136.4', 'logps/rejected': '-106.5', 'logits/chosen': '-1.04', 'logits/rejected': '-1.026', 'epoch': '0.4327'}
125
+ {'eval_loss': '0.621', 'eval_runtime': '159.5', 'eval_samples_per_second': '13.73', 'eval_steps_per_second': '2.747', 'eval_rewards/chosen': '0.1077', 'eval_rewards/rejected': '-0.1569', 'eval_rewards/accuracies': '0.6598', 'eval_rewards/margins': '0.2646', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.5', 'eval_logits/chosen': '-1.001', 'eval_logits/rejected': '-1', 'epoch': '0.4327'}
126
+ {'loss': '0.592', 'grad_norm': '28.75', 'learning_rate': '1.384e-06', 'rewards/chosen': '0.1301', 'rewards/rejected': '-0.1729', 'rewards/accuracies': '0.685', 'rewards/margins': '0.303', 'logps/chosen': '-131.6', 'logps/rejected': '-104.5', 'logits/chosen': '-1.047', 'logits/rejected': '-1.001', 'epoch': '0.4376'}
127
+ {'loss': '0.6258', 'grad_norm': '31.88', 'learning_rate': '1.369e-06', 'rewards/chosen': '0.1164', 'rewards/rejected': '-0.123', 'rewards/accuracies': '0.66', 'rewards/margins': '0.2394', 'logps/chosen': '-127.5', 'logps/rejected': '-118.2', 'logits/chosen': '-1.001', 'logits/rejected': '-0.9918', 'epoch': '0.4424'}
128
+ {'loss': '0.6184', 'grad_norm': '25.12', 'learning_rate': '1.353e-06', 'rewards/chosen': '0.1316', 'rewards/rejected': '-0.1737', 'rewards/accuracies': '0.585', 'rewards/margins': '0.3054', 'logps/chosen': '-139.3', 'logps/rejected': '-118.4', 'logits/chosen': '-0.9795', 'logits/rejected': '-0.9423', 'epoch': '0.4472'}
129
+ {'loss': '0.6156', 'grad_norm': '30', 'learning_rate': '1.337e-06', 'rewards/chosen': '0.06766', 'rewards/rejected': '-0.225', 'rewards/accuracies': '0.66', 'rewards/margins': '0.2927', 'logps/chosen': '-133.4', 'logps/rejected': '-105.4', 'logits/chosen': '-0.9695', 'logits/rejected': '-0.9534', 'epoch': '0.452'}
130
+ {'loss': '0.623', 'grad_norm': '27.25', 'learning_rate': '1.321e-06', 'rewards/chosen': '0.1109', 'rewards/rejected': '-0.1649', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2758', 'logps/chosen': '-135', 'logps/rejected': '-123.8', 'logits/chosen': '-0.9916', 'logits/rejected': '-0.934', 'epoch': '0.4568'}
131
+ {'loss': '0.6401', 'grad_norm': '27.5', 'learning_rate': '1.306e-06', 'rewards/chosen': '0.05028', 'rewards/rejected': '-0.1952', 'rewards/accuracies': '0.615', 'rewards/margins': '0.2455', 'logps/chosen': '-121', 'logps/rejected': '-111.4', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9843', 'epoch': '0.4616'}
132
+ {'loss': '0.5608', 'grad_norm': '24.38', 'learning_rate': '1.289e-06', 'rewards/chosen': '0.1163', 'rewards/rejected': '-0.294', 'rewards/accuracies': '0.73', 'rewards/margins': '0.4103', 'logps/chosen': '-134.2', 'logps/rejected': '-103', 'logits/chosen': '-1.023', 'logits/rejected': '-1.027', 'epoch': '0.4664'}
133
+ {'loss': '0.5944', 'grad_norm': '24.38', 'learning_rate': '1.273e-06', 'rewards/chosen': '0.1042', 'rewards/rejected': '-0.2077', 'rewards/accuracies': '0.71', 'rewards/margins': '0.312', 'logps/chosen': '-144.2', 'logps/rejected': '-101.9', 'logits/chosen': '-0.9673', 'logits/rejected': '-0.9317', 'epoch': '0.4712'}
134
+ {'loss': '0.6302', 'grad_norm': '21.88', 'learning_rate': '1.257e-06', 'rewards/chosen': '0.1611', 'rewards/rejected': '-0.1021', 'rewards/accuracies': '0.615', 'rewards/margins': '0.2632', 'logps/chosen': '-131.3', 'logps/rejected': '-117.5', 'logits/chosen': '-0.9503', 'logits/rejected': '-0.9778', 'epoch': '0.476'}
135
+ {'loss': '0.6027', 'grad_norm': '31.88', 'learning_rate': '1.241e-06', 'rewards/chosen': '0.1524', 'rewards/rejected': '-0.1854', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3378', 'logps/chosen': '-135.9', 'logps/rejected': '-115.3', 'logits/chosen': '-0.9442', 'logits/rejected': '-0.9484', 'epoch': '0.4808'}
136
+ {'eval_loss': '0.6208', 'eval_runtime': '159.3', 'eval_samples_per_second': '13.75', 'eval_steps_per_second': '2.75', 'eval_rewards/chosen': '0.09863', 'eval_rewards/rejected': '-0.177', 'eval_rewards/accuracies': '0.658', 'eval_rewards/margins': '0.2757', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.001', 'eval_logits/rejected': '-1.001', 'epoch': '0.4808'}
137
+ {'loss': '0.5893', 'grad_norm': '28.5', 'learning_rate': '1.225e-06', 'rewards/chosen': '0.1764', 'rewards/rejected': '-0.1885', 'rewards/accuracies': '0.705', 'rewards/margins': '0.3649', 'logps/chosen': '-152.2', 'logps/rejected': '-117.6', 'logits/chosen': '-0.9854', 'logits/rejected': '-0.911', 'epoch': '0.4856'}
138
+ {'loss': '0.6349', 'grad_norm': '31.75', 'learning_rate': '1.208e-06', 'rewards/chosen': '0.09442', 'rewards/rejected': '-0.1627', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2571', 'logps/chosen': '-145.2', 'logps/rejected': '-111', 'logits/chosen': '-0.9148', 'logits/rejected': '-0.9454', 'epoch': '0.4904'}
139
+ {'loss': '0.6097', 'grad_norm': '22.75', 'learning_rate': '1.192e-06', 'rewards/chosen': '0.13', 'rewards/rejected': '-0.1788', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3088', 'logps/chosen': '-123', 'logps/rejected': '-106.8', 'logits/chosen': '-0.9644', 'logits/rejected': '-0.9964', 'epoch': '0.4953'}
140
+ {'loss': '0.5916', 'grad_norm': '48.75', 'learning_rate': '1.175e-06', 'rewards/chosen': '0.181', 'rewards/rejected': '-0.1517', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3327', 'logps/chosen': '-121.3', 'logps/rejected': '-131.1', 'logits/chosen': '-1.03', 'logits/rejected': '-0.934', 'epoch': '0.5001'}
141
+ {'loss': '0.6417', 'grad_norm': '28.75', 'learning_rate': '1.159e-06', 'rewards/chosen': '0.1072', 'rewards/rejected': '-0.122', 'rewards/accuracies': '0.61', 'rewards/margins': '0.2292', 'logps/chosen': '-128.8', 'logps/rejected': '-122.2', 'logits/chosen': '-0.9683', 'logits/rejected': '-0.9303', 'epoch': '0.5049'}
142
+ {'loss': '0.6223', 'grad_norm': '33', 'learning_rate': '1.142e-06', 'rewards/chosen': '0.1564', 'rewards/rejected': '-0.1069', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2633', 'logps/chosen': '-132.7', 'logps/rejected': '-116.8', 'logits/chosen': '-1', 'logits/rejected': '-1.002', 'epoch': '0.5097'}
143
+ {'loss': '0.5956', 'grad_norm': '19.88', 'learning_rate': '1.126e-06', 'rewards/chosen': '0.1761', 'rewards/rejected': '-0.168', 'rewards/accuracies': '0.665', 'rewards/margins': '0.344', 'logps/chosen': '-134.1', 'logps/rejected': '-113.3', 'logits/chosen': '-1.061', 'logits/rejected': '-1.011', 'epoch': '0.5145'}
144
+ {'loss': '0.6268', 'grad_norm': '25.25', 'learning_rate': '1.109e-06', 'rewards/chosen': '0.08844', 'rewards/rejected': '-0.1834', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2718', 'logps/chosen': '-120.9', 'logps/rejected': '-107.2', 'logits/chosen': '-0.9652', 'logits/rejected': '-0.9194', 'epoch': '0.5193'}
145
+ {'loss': '0.6106', 'grad_norm': '24.75', 'learning_rate': '1.092e-06', 'rewards/chosen': '0.07643', 'rewards/rejected': '-0.184', 'rewards/accuracies': '0.7', 'rewards/margins': '0.2604', 'logps/chosen': '-126', 'logps/rejected': '-105.6', 'logits/chosen': '-0.9865', 'logits/rejected': '-0.9771', 'epoch': '0.5241'}
146
+ {'loss': '0.611', 'grad_norm': '28', 'learning_rate': '1.075e-06', 'rewards/chosen': '0.09692', 'rewards/rejected': '-0.2066', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3035', 'logps/chosen': '-136.7', 'logps/rejected': '-109.7', 'logits/chosen': '-0.9674', 'logits/rejected': '-0.9861', 'epoch': '0.5289'}
147
+ {'eval_loss': '0.6197', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.0907', 'eval_rewards/rejected': '-0.1901', 'eval_rewards/accuracies': '0.6562', 'eval_rewards/margins': '0.2808', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.004', 'eval_logits/rejected': '-1.004', 'epoch': '0.5289'}
148
+ {'loss': '0.5994', 'grad_norm': '28.25', 'learning_rate': '1.059e-06', 'rewards/chosen': '0.1401', 'rewards/rejected': '-0.2363', 'rewards/accuracies': '0.69', 'rewards/margins': '0.3764', 'logps/chosen': '-138.5', 'logps/rejected': '-115.4', 'logits/chosen': '-0.9296', 'logits/rejected': '-0.9662', 'epoch': '0.5337'}
149
+ {'loss': '0.6124', 'grad_norm': '31', 'learning_rate': '1.042e-06', 'rewards/chosen': '0.1156', 'rewards/rejected': '-0.1785', 'rewards/accuracies': '0.675', 'rewards/margins': '0.2941', 'logps/chosen': '-140.3', 'logps/rejected': '-104.5', 'logits/chosen': '-1.001', 'logits/rejected': '-0.9529', 'epoch': '0.5385'}
150
+ {'loss': '0.5968', 'grad_norm': '24.62', 'learning_rate': '1.025e-06', 'rewards/chosen': '0.09342', 'rewards/rejected': '-0.2448', 'rewards/accuracies': '0.7', 'rewards/margins': '0.3382', 'logps/chosen': '-138.1', 'logps/rejected': '-124.2', 'logits/chosen': '-0.926', 'logits/rejected': '-0.9841', 'epoch': '0.5433'}
151
+ {'loss': '0.6155', 'grad_norm': '30.88', 'learning_rate': '1.008e-06', 'rewards/chosen': '0.09974', 'rewards/rejected': '-0.198', 'rewards/accuracies': '0.655', 'rewards/margins': '0.2977', 'logps/chosen': '-125.8', 'logps/rejected': '-104', 'logits/chosen': '-0.9486', 'logits/rejected': '-0.9328', 'epoch': '0.5481'}
152
+ {'loss': '0.6336', 'grad_norm': '26.88', 'learning_rate': '9.916e-07', 'rewards/chosen': '0.1246', 'rewards/rejected': '-0.1427', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2673', 'logps/chosen': '-147', 'logps/rejected': '-121.5', 'logits/chosen': '-0.9657', 'logits/rejected': '-0.9106', 'epoch': '0.553'}
153
+ {'loss': '0.5825', 'grad_norm': '29.38', 'learning_rate': '9.748e-07', 'rewards/chosen': '0.162', 'rewards/rejected': '-0.2003', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3623', 'logps/chosen': '-134.6', 'logps/rejected': '-105.5', 'logits/chosen': '-1.025', 'logits/rejected': '-0.9952', 'epoch': '0.5578'}
154
+ {'loss': '0.5974', 'grad_norm': '21.62', 'learning_rate': '9.581e-07', 'rewards/chosen': '0.08895', 'rewards/rejected': '-0.2469', 'rewards/accuracies': '0.655', 'rewards/margins': '0.3359', 'logps/chosen': '-134.8', 'logps/rejected': '-106.8', 'logits/chosen': '-1.01', 'logits/rejected': '-0.9689', 'epoch': '0.5626'}
155
+ {'loss': '0.5966', 'grad_norm': '22', 'learning_rate': '9.413e-07', 'rewards/chosen': '0.09052', 'rewards/rejected': '-0.2249', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3154', 'logps/chosen': '-128.5', 'logps/rejected': '-105.4', 'logits/chosen': '-1.053', 'logits/rejected': '-0.9607', 'epoch': '0.5674'}
156
+ {'loss': '0.6572', 'grad_norm': '26.5', 'learning_rate': '9.246e-07', 'rewards/chosen': '0.02879', 'rewards/rejected': '-0.157', 'rewards/accuracies': '0.62', 'rewards/margins': '0.1858', 'logps/chosen': '-132.2', 'logps/rejected': '-105.7', 'logits/chosen': '-0.9782', 'logits/rejected': '-0.9498', 'epoch': '0.5722'}
157
+ {'loss': '0.6372', 'grad_norm': '35', 'learning_rate': '9.078e-07', 'rewards/chosen': '0.07806', 'rewards/rejected': '-0.1434', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2215', 'logps/chosen': '-125.9', 'logps/rejected': '-126', 'logits/chosen': '-1.035', 'logits/rejected': '-0.9866', 'epoch': '0.577'}
158
+ {'eval_loss': '0.6194', 'eval_runtime': '159.3', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.749', 'eval_rewards/chosen': '0.102', 'eval_rewards/rejected': '-0.184', 'eval_rewards/accuracies': '0.658', 'eval_rewards/margins': '0.286', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.006', 'eval_logits/rejected': '-1.006', 'epoch': '0.577'}
159
+ {'loss': '0.5651', 'grad_norm': '25.38', 'learning_rate': '8.911e-07', 'rewards/chosen': '0.141', 'rewards/rejected': '-0.2724', 'rewards/accuracies': '0.725', 'rewards/margins': '0.4134', 'logps/chosen': '-123.4', 'logps/rejected': '-113.6', 'logits/chosen': '-0.9879', 'logits/rejected': '-0.9599', 'epoch': '0.5818'}
160
+ {'loss': '0.5842', 'grad_norm': '23.12', 'learning_rate': '8.745e-07', 'rewards/chosen': '0.1504', 'rewards/rejected': '-0.1981', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3485', 'logps/chosen': '-149.7', 'logps/rejected': '-116.2', 'logits/chosen': '-1.047', 'logits/rejected': '-1.062', 'epoch': '0.5866'}
161
+ {'loss': '0.6093', 'grad_norm': '27.5', 'learning_rate': '8.578e-07', 'rewards/chosen': '0.1642', 'rewards/rejected': '-0.1456', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3098', 'logps/chosen': '-130.4', 'logps/rejected': '-121.4', 'logits/chosen': '-1.036', 'logits/rejected': '-0.9943', 'epoch': '0.5914'}
162
+ {'loss': '0.6234', 'grad_norm': '34.25', 'learning_rate': '8.412e-07', 'rewards/chosen': '0.1339', 'rewards/rejected': '-0.1329', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2668', 'logps/chosen': '-133.1', 'logps/rejected': '-122.2', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9569', 'epoch': '0.5962'}
163
+ {'loss': '0.5824', 'grad_norm': '25.12', 'learning_rate': '8.247e-07', 'rewards/chosen': '0.1535', 'rewards/rejected': '-0.2017', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3552', 'logps/chosen': '-134.8', 'logps/rejected': '-111.3', 'logits/chosen': '-1.021', 'logits/rejected': '-0.983', 'epoch': '0.601'}
164
+ {'loss': '0.6085', 'grad_norm': '36.5', 'learning_rate': '8.082e-07', 'rewards/chosen': '0.121', 'rewards/rejected': '-0.2039', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3248', 'logps/chosen': '-132.3', 'logps/rejected': '-110.7', 'logits/chosen': '-0.9972', 'logits/rejected': '-0.9701', 'epoch': '0.6058'}
165
+ {'loss': '0.5618', 'grad_norm': '26.38', 'learning_rate': '7.918e-07', 'rewards/chosen': '0.1927', 'rewards/rejected': '-0.2224', 'rewards/accuracies': '0.71', 'rewards/margins': '0.4152', 'logps/chosen': '-145', 'logps/rejected': '-119.6', 'logits/chosen': '-1.046', 'logits/rejected': '-0.9814', 'epoch': '0.6107'}
166
+ {'loss': '0.6005', 'grad_norm': '13.94', 'learning_rate': '7.754e-07', 'rewards/chosen': '0.1197', 'rewards/rejected': '-0.1972', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3169', 'logps/chosen': '-127.7', 'logps/rejected': '-97.99', 'logits/chosen': '-1.017', 'logits/rejected': '-1.026', 'epoch': '0.6155'}
167
+ {'loss': '0.6101', 'grad_norm': '31.12', 'learning_rate': '7.591e-07', 'rewards/chosen': '0.1124', 'rewards/rejected': '-0.199', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3115', 'logps/chosen': '-128.6', 'logps/rejected': '-111.4', 'logits/chosen': '-1.014', 'logits/rejected': '-1.001', 'epoch': '0.6203'}
168
+ {'loss': '0.5744', 'grad_norm': '30.38', 'learning_rate': '7.428e-07', 'rewards/chosen': '0.1661', 'rewards/rejected': '-0.2175', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3836', 'logps/chosen': '-140.8', 'logps/rejected': '-122.9', 'logits/chosen': '-1.01', 'logits/rejected': '-0.9857', 'epoch': '0.6251'}
169
+ {'eval_loss': '0.6182', 'eval_runtime': '159.6', 'eval_samples_per_second': '13.72', 'eval_steps_per_second': '2.744', 'eval_rewards/chosen': '0.08965', 'eval_rewards/rejected': '-0.2029', 'eval_rewards/accuracies': '0.6543', 'eval_rewards/margins': '0.2925', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.7', 'eval_logits/chosen': '-1.007', 'eval_logits/rejected': '-1.008', 'epoch': '0.6251'}
170
+ {'loss': '0.6543', 'grad_norm': '22.5', 'learning_rate': '7.266e-07', 'rewards/chosen': '0.09405', 'rewards/rejected': '-0.1148', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2088', 'logps/chosen': '-123.1', 'logps/rejected': '-102.8', 'logits/chosen': '-1.008', 'logits/rejected': '-0.9873', 'epoch': '0.6299'}
171
+ {'loss': '0.6178', 'grad_norm': '26.38', 'learning_rate': '7.105e-07', 'rewards/chosen': '0.0634', 'rewards/rejected': '-0.2023', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2657', 'logps/chosen': '-142.2', 'logps/rejected': '-124.9', 'logits/chosen': '-1.034', 'logits/rejected': '-1.03', 'epoch': '0.6347'}
172
+ {'loss': '0.632', 'grad_norm': '26.75', 'learning_rate': '6.945e-07', 'rewards/chosen': '0.06304', 'rewards/rejected': '-0.1975', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2606', 'logps/chosen': '-129.1', 'logps/rejected': '-112.8', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9704', 'epoch': '0.6395'}
173
+ {'loss': '0.6087', 'grad_norm': '23.38', 'learning_rate': '6.786e-07', 'rewards/chosen': '0.07113', 'rewards/rejected': '-0.2305', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3016', 'logps/chosen': '-119.9', 'logps/rejected': '-108.1', 'logits/chosen': '-1.016', 'logits/rejected': '-0.9578', 'epoch': '0.6443'}
174
+ {'loss': '0.6433', 'grad_norm': '35.75', 'learning_rate': '6.627e-07', 'rewards/chosen': '0.06799', 'rewards/rejected': '-0.1603', 'rewards/accuracies': '0.655', 'rewards/margins': '0.2282', 'logps/chosen': '-128.5', 'logps/rejected': '-120.6', 'logits/chosen': '-1.028', 'logits/rejected': '-1.012', 'epoch': '0.6491'}
175
+ {'loss': '0.6032', 'grad_norm': '21.62', 'learning_rate': '6.47e-07', 'rewards/chosen': '0.1394', 'rewards/rejected': '-0.1772', 'rewards/accuracies': '0.69', 'rewards/margins': '0.3166', 'logps/chosen': '-129.2', 'logps/rejected': '-108.2', 'logits/chosen': '-1.021', 'logits/rejected': '-1.034', 'epoch': '0.6539'}
176
+ {'loss': '0.6335', 'grad_norm': '42.75', 'learning_rate': '6.313e-07', 'rewards/chosen': '-0.004546', 'rewards/rejected': '-0.2462', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2416', 'logps/chosen': '-125', 'logps/rejected': '-105.8', 'logits/chosen': '-1.04', 'logits/rejected': '-0.9723', 'epoch': '0.6587'}
177
+ {'loss': '0.6123', 'grad_norm': '25.38', 'learning_rate': '6.158e-07', 'rewards/chosen': '0.1051', 'rewards/rejected': '-0.1864', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2915', 'logps/chosen': '-137.5', 'logps/rejected': '-121.2', 'logits/chosen': '-0.9934', 'logits/rejected': '-0.9458', 'epoch': '0.6635'}
178
+ {'loss': '0.5943', 'grad_norm': '27', 'learning_rate': '6.003e-07', 'rewards/chosen': '0.1287', 'rewards/rejected': '-0.2019', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3306', 'logps/chosen': '-134.4', 'logps/rejected': '-110.2', 'logits/chosen': '-0.9903', 'logits/rejected': '-0.9943', 'epoch': '0.6683'}
179
+ {'loss': '0.6111', 'grad_norm': '29.5', 'learning_rate': '5.85e-07', 'rewards/chosen': '0.07695', 'rewards/rejected': '-0.2288', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3057', 'logps/chosen': '-132.5', 'logps/rejected': '-106.4', 'logits/chosen': '-0.9616', 'logits/rejected': '-0.9204', 'epoch': '0.6732'}
180
+ {'eval_loss': '0.6172', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.0812', 'eval_rewards/rejected': '-0.2119', 'eval_rewards/accuracies': '0.6644', 'eval_rewards/margins': '0.2931', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.7', 'eval_logits/chosen': '-1.008', 'eval_logits/rejected': '-1.008', 'epoch': '0.6732'}
181
+ {'loss': '0.6044', 'grad_norm': '39.75', 'learning_rate': '5.698e-07', 'rewards/chosen': '0.1359', 'rewards/rejected': '-0.2258', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3617', 'logps/chosen': '-133.3', 'logps/rejected': '-106.4', 'logits/chosen': '-0.9936', 'logits/rejected': '-0.9936', 'epoch': '0.678'}
182
+ {'loss': '0.5767', 'grad_norm': '29.5', 'learning_rate': '5.547e-07', 'rewards/chosen': '0.1588', 'rewards/rejected': '-0.2603', 'rewards/accuracies': '0.69', 'rewards/margins': '0.4192', 'logps/chosen': '-134', 'logps/rejected': '-122.9', 'logits/chosen': '-0.981', 'logits/rejected': '-0.9433', 'epoch': '0.6828'}
183
+ {'loss': '0.6466', 'grad_norm': '34.75', 'learning_rate': '5.397e-07', 'rewards/chosen': '0.06965', 'rewards/rejected': '-0.1698', 'rewards/accuracies': '0.59', 'rewards/margins': '0.2395', 'logps/chosen': '-134.3', 'logps/rejected': '-122.1', 'logits/chosen': '-0.9917', 'logits/rejected': '-0.974', 'epoch': '0.6876'}
184
+ {'loss': '0.5897', 'grad_norm': '34.5', 'learning_rate': '5.249e-07', 'rewards/chosen': '0.07432', 'rewards/rejected': '-0.2534', 'rewards/accuracies': '0.73', 'rewards/margins': '0.3278', 'logps/chosen': '-130.2', 'logps/rejected': '-106.7', 'logits/chosen': '-0.9764', 'logits/rejected': '-0.9641', 'epoch': '0.6924'}
185
+ {'loss': '0.6133', 'grad_norm': '26.38', 'learning_rate': '5.102e-07', 'rewards/chosen': '0.06769', 'rewards/rejected': '-0.2367', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3044', 'logps/chosen': '-132.5', 'logps/rejected': '-112.7', 'logits/chosen': '-0.9815', 'logits/rejected': '-0.9777', 'epoch': '0.6972'}
186
+ {'loss': '0.6261', 'grad_norm': '21.25', 'learning_rate': '4.956e-07', 'rewards/chosen': '0.04256', 'rewards/rejected': '-0.2149', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2575', 'logps/chosen': '-113.9', 'logps/rejected': '-104.1', 'logits/chosen': '-1.023', 'logits/rejected': '-1.017', 'epoch': '0.702'}
187
+ {'loss': '0.6319', 'grad_norm': '31.12', 'learning_rate': '4.812e-07', 'rewards/chosen': '0.07442', 'rewards/rejected': '-0.2227', 'rewards/accuracies': '0.725', 'rewards/margins': '0.2971', 'logps/chosen': '-144.3', 'logps/rejected': '-120.9', 'logits/chosen': '-0.9851', 'logits/rejected': '-0.9347', 'epoch': '0.7068'}
188
+ {'loss': '0.6194', 'grad_norm': '29.88', 'learning_rate': '4.67e-07', 'rewards/chosen': '0.09555', 'rewards/rejected': '-0.2322', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3277', 'logps/chosen': '-126.8', 'logps/rejected': '-113.1', 'logits/chosen': '-1.055', 'logits/rejected': '-1.009', 'epoch': '0.7116'}
189
+ {'loss': '0.6323', 'grad_norm': '28', 'learning_rate': '4.528e-07', 'rewards/chosen': '0.05447', 'rewards/rejected': '-0.2012', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2557', 'logps/chosen': '-129.2', 'logps/rejected': '-115.4', 'logits/chosen': '-1.021', 'logits/rejected': '-1.045', 'epoch': '0.7164'}
190
+ {'loss': '0.5895', 'grad_norm': '26', 'learning_rate': '4.389e-07', 'rewards/chosen': '0.02393', 'rewards/rejected': '-0.2991', 'rewards/accuracies': '0.72', 'rewards/margins': '0.323', 'logps/chosen': '-120.2', 'logps/rejected': '-101.5', 'logits/chosen': '-1.104', 'logits/rejected': '-0.9578', 'epoch': '0.7212'}
191
+ {'eval_loss': '0.6169', 'eval_runtime': '159.5', 'eval_samples_per_second': '13.73', 'eval_steps_per_second': '2.746', 'eval_rewards/chosen': '0.06714', 'eval_rewards/rejected': '-0.2274', 'eval_rewards/accuracies': '0.6676', 'eval_rewards/margins': '0.2946', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.7212'}
192
+ {'loss': '0.5721', 'grad_norm': '37.25', 'learning_rate': '4.25e-07', 'rewards/chosen': '0.1668', 'rewards/rejected': '-0.2653', 'rewards/accuracies': '0.705', 'rewards/margins': '0.4321', 'logps/chosen': '-152.7', 'logps/rejected': '-115.3', 'logits/chosen': '-1.047', 'logits/rejected': '-0.9918', 'epoch': '0.726'}
193
+ {'loss': '0.6355', 'grad_norm': '24.12', 'learning_rate': '4.114e-07', 'rewards/chosen': '0.06084', 'rewards/rejected': '-0.174', 'rewards/accuracies': '0.615', 'rewards/margins': '0.2348', 'logps/chosen': '-118', 'logps/rejected': '-113.7', 'logits/chosen': '-0.9507', 'logits/rejected': '-0.9365', 'epoch': '0.7309'}
194
+ {'loss': '0.6423', 'grad_norm': '27.5', 'learning_rate': '3.979e-07', 'rewards/chosen': '0.08837', 'rewards/rejected': '-0.1562', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2445', 'logps/chosen': '-129.5', 'logps/rejected': '-105.8', 'logits/chosen': '-0.9463', 'logits/rejected': '-1.061', 'epoch': '0.7357'}
195
+ {'loss': '0.5898', 'grad_norm': '30.38', 'learning_rate': '3.846e-07', 'rewards/chosen': '0.1229', 'rewards/rejected': '-0.2618', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3847', 'logps/chosen': '-142.4', 'logps/rejected': '-115.1', 'logits/chosen': '-1.005', 'logits/rejected': '-0.9725', 'epoch': '0.7405'}
196
+ {'loss': '0.5932', 'grad_norm': '24', 'learning_rate': '3.715e-07', 'rewards/chosen': '0.0869', 'rewards/rejected': '-0.2666', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3535', 'logps/chosen': '-130.5', 'logps/rejected': '-117.2', 'logits/chosen': '-1.014', 'logits/rejected': '-0.9823', 'epoch': '0.7453'}
197
+ {'loss': '0.6321', 'grad_norm': '28.25', 'learning_rate': '3.585e-07', 'rewards/chosen': '0.1186', 'rewards/rejected': '-0.1726', 'rewards/accuracies': '0.675', 'rewards/margins': '0.2912', 'logps/chosen': '-122.4', 'logps/rejected': '-110.7', 'logits/chosen': '-1.069', 'logits/rejected': '-1.031', 'epoch': '0.7501'}
198
+ {'loss': '0.6521', 'grad_norm': '26.5', 'learning_rate': '3.457e-07', 'rewards/chosen': '0.08378', 'rewards/rejected': '-0.1438', 'rewards/accuracies': '0.62', 'rewards/margins': '0.2276', 'logps/chosen': '-131.7', 'logps/rejected': '-109.7', 'logits/chosen': '-1.01', 'logits/rejected': '-1.022', 'epoch': '0.7549'}
199
+ {'loss': '0.586', 'grad_norm': '26.88', 'learning_rate': '3.331e-07', 'rewards/chosen': '0.09078', 'rewards/rejected': '-0.2869', 'rewards/accuracies': '0.655', 'rewards/margins': '0.3777', 'logps/chosen': '-138', 'logps/rejected': '-111.6', 'logits/chosen': '-1.026', 'logits/rejected': '-0.9772', 'epoch': '0.7597'}
200
+ {'loss': '0.5819', 'grad_norm': '31', 'learning_rate': '3.207e-07', 'rewards/chosen': '0.111', 'rewards/rejected': '-0.2876', 'rewards/accuracies': '0.72', 'rewards/margins': '0.3987', 'logps/chosen': '-132.5', 'logps/rejected': '-116.4', 'logits/chosen': '-1.013', 'logits/rejected': '-0.9545', 'epoch': '0.7645'}
201
+ {'loss': '0.5849', 'grad_norm': '31.88', 'learning_rate': '3.085e-07', 'rewards/chosen': '0.03506', 'rewards/rejected': '-0.3219', 'rewards/accuracies': '0.695', 'rewards/margins': '0.357', 'logps/chosen': '-131.8', 'logps/rejected': '-111.7', 'logits/chosen': '-1.046', 'logits/rejected': '-1.014', 'epoch': '0.7693'}
202
+ {'eval_loss': '0.6172', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.747', 'eval_rewards/chosen': '0.0699', 'eval_rewards/rejected': '-0.2252', 'eval_rewards/accuracies': '0.6639', 'eval_rewards/margins': '0.2951', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.009', 'eval_logits/rejected': '-1.01', 'epoch': '0.7693'}
203
+ {'loss': '0.5601', 'grad_norm': '22.88', 'learning_rate': '2.965e-07', 'rewards/chosen': '0.17', 'rewards/rejected': '-0.2553', 'rewards/accuracies': '0.74', 'rewards/margins': '0.4254', 'logps/chosen': '-134.6', 'logps/rejected': '-120', 'logits/chosen': '-1.073', 'logits/rejected': '-1.052', 'epoch': '0.7741'}
204
+ {'loss': '0.5903', 'grad_norm': '32.25', 'learning_rate': '2.846e-07', 'rewards/chosen': '0.117', 'rewards/rejected': '-0.2267', 'rewards/accuracies': '0.7', 'rewards/margins': '0.3436', 'logps/chosen': '-148.8', 'logps/rejected': '-115.7', 'logits/chosen': '-0.9712', 'logits/rejected': '-0.9916', 'epoch': '0.7789'}
205
+ {'loss': '0.632', 'grad_norm': '25.12', 'learning_rate': '2.73e-07', 'rewards/chosen': '0.04253', 'rewards/rejected': '-0.2194', 'rewards/accuracies': '0.62', 'rewards/margins': '0.262', 'logps/chosen': '-122.7', 'logps/rejected': '-124.5', 'logits/chosen': '-1.014', 'logits/rejected': '-1.028', 'epoch': '0.7837'}
206
+ {'loss': '0.6115', 'grad_norm': '30.62', 'learning_rate': '2.616e-07', 'rewards/chosen': '0.1155', 'rewards/rejected': '-0.1882', 'rewards/accuracies': '0.62', 'rewards/margins': '0.3036', 'logps/chosen': '-132.6', 'logps/rejected': '-112.1', 'logits/chosen': '-1.035', 'logits/rejected': '-0.99', 'epoch': '0.7886'}
207
+ {'loss': '0.5935', 'grad_norm': '27.5', 'learning_rate': '2.504e-07', 'rewards/chosen': '0.08252', 'rewards/rejected': '-0.2534', 'rewards/accuracies': '0.67', 'rewards/margins': '0.3359', 'logps/chosen': '-116', 'logps/rejected': '-114.6', 'logits/chosen': '-1.09', 'logits/rejected': '-1.055', 'epoch': '0.7934'}
208
+ {'loss': '0.6075', 'grad_norm': '25.12', 'learning_rate': '2.394e-07', 'rewards/chosen': '0.1463', 'rewards/rejected': '-0.1917', 'rewards/accuracies': '0.68', 'rewards/margins': '0.338', 'logps/chosen': '-131.6', 'logps/rejected': '-110.9', 'logits/chosen': '-0.9839', 'logits/rejected': '-0.9589', 'epoch': '0.7982'}
209
+ {'loss': '0.6178', 'grad_norm': '25.5', 'learning_rate': '2.286e-07', 'rewards/chosen': '0.08', 'rewards/rejected': '-0.2258', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3058', 'logps/chosen': '-136.2', 'logps/rejected': '-110.5', 'logits/chosen': '-0.9616', 'logits/rejected': '-1.004', 'epoch': '0.803'}
210
+ {'loss': '0.6047', 'grad_norm': '24.38', 'learning_rate': '2.18e-07', 'rewards/chosen': '0.07041', 'rewards/rejected': '-0.2607', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3311', 'logps/chosen': '-132.8', 'logps/rejected': '-109.6', 'logits/chosen': '-0.9952', 'logits/rejected': '-0.9531', 'epoch': '0.8078'}
211
+ {'loss': '0.5989', 'grad_norm': '23', 'learning_rate': '2.077e-07', 'rewards/chosen': '0.1002', 'rewards/rejected': '-0.2467', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3469', 'logps/chosen': '-145.4', 'logps/rejected': '-114.1', 'logits/chosen': '-0.9995', 'logits/rejected': '-0.9752', 'epoch': '0.8126'}
212
+ {'loss': '0.5759', 'grad_norm': '27.62', 'learning_rate': '1.975e-07', 'rewards/chosen': '0.07599', 'rewards/rejected': '-0.3065', 'rewards/accuracies': '0.72', 'rewards/margins': '0.3825', 'logps/chosen': '-137.4', 'logps/rejected': '-123.1', 'logits/chosen': '-1.024', 'logits/rejected': '-0.9887', 'epoch': '0.8174'}
213
+ {'eval_loss': '0.6168', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.07051', 'eval_rewards/rejected': '-0.2252', 'eval_rewards/accuracies': '0.6653', 'eval_rewards/margins': '0.2957', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.8174'}
214
+ {'loss': '0.5734', 'grad_norm': '19.12', 'learning_rate': '1.876e-07', 'rewards/chosen': '0.1173', 'rewards/rejected': '-0.2696', 'rewards/accuracies': '0.72', 'rewards/margins': '0.3869', 'logps/chosen': '-127.1', 'logps/rejected': '-116.2', 'logits/chosen': '-0.968', 'logits/rejected': '-0.9611', 'epoch': '0.8222'}
215
+ {'loss': '0.6326', 'grad_norm': '24.5', 'learning_rate': '1.78e-07', 'rewards/chosen': '0.1114', 'rewards/rejected': '-0.1299', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2413', 'logps/chosen': '-148', 'logps/rejected': '-114.8', 'logits/chosen': '-0.9754', 'logits/rejected': '-0.9537', 'epoch': '0.827'}
216
+ {'loss': '0.6235', 'grad_norm': '24.5', 'learning_rate': '1.685e-07', 'rewards/chosen': '0.08442', 'rewards/rejected': '-0.181', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2654', 'logps/chosen': '-125.4', 'logps/rejected': '-112.2', 'logits/chosen': '-0.965', 'logits/rejected': '-1', 'epoch': '0.8318'}
217
+ {'loss': '0.6202', 'grad_norm': '27', 'learning_rate': '1.593e-07', 'rewards/chosen': '0.07857', 'rewards/rejected': '-0.2079', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2864', 'logps/chosen': '-124.8', 'logps/rejected': '-104.4', 'logits/chosen': '-1.067', 'logits/rejected': '-0.9671', 'epoch': '0.8366'}
218
+ {'loss': '0.6125', 'grad_norm': '31', 'learning_rate': '1.504e-07', 'rewards/chosen': '0.1301', 'rewards/rejected': '-0.1922', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3224', 'logps/chosen': '-123.3', 'logps/rejected': '-123.1', 'logits/chosen': '-1.041', 'logits/rejected': '-0.9867', 'epoch': '0.8414'}
219
+ {'loss': '0.631', 'grad_norm': '25.5', 'learning_rate': '1.416e-07', 'rewards/chosen': '0.06422', 'rewards/rejected': '-0.2029', 'rewards/accuracies': '0.68', 'rewards/margins': '0.2671', 'logps/chosen': '-152.8', 'logps/rejected': '-103.1', 'logits/chosen': '-0.9218', 'logits/rejected': '-1.025', 'epoch': '0.8463'}
220
+ {'loss': '0.598', 'grad_norm': '36.25', 'learning_rate': '1.331e-07', 'rewards/chosen': '0.04709', 'rewards/rejected': '-0.2808', 'rewards/accuracies': '0.645', 'rewards/margins': '0.3279', 'logps/chosen': '-136.1', 'logps/rejected': '-99.2', 'logits/chosen': '-1.01', 'logits/rejected': '-0.9484', 'epoch': '0.8511'}
221
+ {'loss': '0.6003', 'grad_norm': '27.75', 'learning_rate': '1.249e-07', 'rewards/chosen': '0.06051', 'rewards/rejected': '-0.2485', 'rewards/accuracies': '0.715', 'rewards/margins': '0.309', 'logps/chosen': '-128.1', 'logps/rejected': '-120.9', 'logits/chosen': '-1.018', 'logits/rejected': '-0.9545', 'epoch': '0.8559'}
222
+ {'loss': '0.591', 'grad_norm': '31.25', 'learning_rate': '1.169e-07', 'rewards/chosen': '0.1146', 'rewards/rejected': '-0.2206', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3352', 'logps/chosen': '-137.7', 'logps/rejected': '-116.4', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9895', 'epoch': '0.8607'}
223
+ {'loss': '0.5831', 'grad_norm': '27.75', 'learning_rate': '1.091e-07', 'rewards/chosen': '0.1675', 'rewards/rejected': '-0.2213', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3888', 'logps/chosen': '-143', 'logps/rejected': '-115', 'logits/chosen': '-0.9924', 'logits/rejected': '-0.9801', 'epoch': '0.8655'}
224
+ {'eval_loss': '0.6168', 'eval_runtime': '159.9', 'eval_samples_per_second': '13.69', 'eval_steps_per_second': '2.738', 'eval_rewards/chosen': '0.06656', 'eval_rewards/rejected': '-0.2297', 'eval_rewards/accuracies': '0.6662', 'eval_rewards/margins': '0.2962', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.8655'}
225
+ {'loss': '0.5867', 'grad_norm': '24.88', 'learning_rate': '1.016e-07', 'rewards/chosen': '0.1309', 'rewards/rejected': '-0.221', 'rewards/accuracies': '0.7', 'rewards/margins': '0.352', 'logps/chosen': '-140.8', 'logps/rejected': '-116.5', 'logits/chosen': '-0.984', 'logits/rejected': '-1.022', 'epoch': '0.8703'}
226
+ {'loss': '0.606', 'grad_norm': '29.25', 'learning_rate': '9.44e-08', 'rewards/chosen': '0.09663', 'rewards/rejected': '-0.2094', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3061', 'logps/chosen': '-133.8', 'logps/rejected': '-116.2', 'logits/chosen': '-1.058', 'logits/rejected': '-0.9796', 'epoch': '0.8751'}
227
+ {'loss': '0.6057', 'grad_norm': '21.75', 'learning_rate': '8.741e-08', 'rewards/chosen': '0.09259', 'rewards/rejected': '-0.2016', 'rewards/accuracies': '0.66', 'rewards/margins': '0.2942', 'logps/chosen': '-135', 'logps/rejected': '-109.1', 'logits/chosen': '-0.9949', 'logits/rejected': '-0.9764', 'epoch': '0.8799'}
228
+ {'loss': '0.5924', 'grad_norm': '25.38', 'learning_rate': '8.068e-08', 'rewards/chosen': '0.14', 'rewards/rejected': '-0.2309', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3709', 'logps/chosen': '-141.4', 'logps/rejected': '-111.2', 'logits/chosen': '-1.006', 'logits/rejected': '-0.9707', 'epoch': '0.8847'}
229
+ {'loss': '0.6111', 'grad_norm': '24.12', 'learning_rate': '7.421e-08', 'rewards/chosen': '0.05592', 'rewards/rejected': '-0.235', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2909', 'logps/chosen': '-135.2', 'logps/rejected': '-115', 'logits/chosen': '-1.069', 'logits/rejected': '-1.037', 'epoch': '0.8895'}
230
+ {'loss': '0.6033', 'grad_norm': '24.62', 'learning_rate': '6.799e-08', 'rewards/chosen': '0.0985', 'rewards/rejected': '-0.2023', 'rewards/accuracies': '0.68', 'rewards/margins': '0.3008', 'logps/chosen': '-137.5', 'logps/rejected': '-123.3', 'logits/chosen': '-1.06', 'logits/rejected': '-0.9708', 'epoch': '0.8943'}
231
+ {'loss': '0.5975', 'grad_norm': '24.62', 'learning_rate': '6.204e-08', 'rewards/chosen': '0.1767', 'rewards/rejected': '-0.1992', 'rewards/accuracies': '0.705', 'rewards/margins': '0.376', 'logps/chosen': '-142.1', 'logps/rejected': '-131.8', 'logits/chosen': '-1.021', 'logits/rejected': '-0.9542', 'epoch': '0.8991'}
232
+ {'loss': '0.5923', 'grad_norm': '31.12', 'learning_rate': '5.635e-08', 'rewards/chosen': '0.1364', 'rewards/rejected': '-0.2067', 'rewards/accuracies': '0.71', 'rewards/margins': '0.343', 'logps/chosen': '-136.7', 'logps/rejected': '-111.7', 'logits/chosen': '-1.127', 'logits/rejected': '-1.063', 'epoch': '0.904'}
233
+ {'loss': '0.6425', 'grad_norm': '33.25', 'learning_rate': '5.093e-08', 'rewards/chosen': '0.06713', 'rewards/rejected': '-0.1448', 'rewards/accuracies': '0.6', 'rewards/margins': '0.212', 'logps/chosen': '-125.2', 'logps/rejected': '-119.6', 'logits/chosen': '-1.044', 'logits/rejected': '-0.971', 'epoch': '0.9088'}
234
+ {'loss': '0.6038', 'grad_norm': '31.25', 'learning_rate': '4.578e-08', 'rewards/chosen': '0.144', 'rewards/rejected': '-0.2002', 'rewards/accuracies': '0.64', 'rewards/margins': '0.3442', 'logps/chosen': '-150.9', 'logps/rejected': '-127.8', 'logits/chosen': '-0.9139', 'logits/rejected': '-0.9313', 'epoch': '0.9136'}
235
+ {'eval_loss': '0.6162', 'eval_runtime': '159.6', 'eval_samples_per_second': '13.72', 'eval_steps_per_second': '2.745', 'eval_rewards/chosen': '0.06609', 'eval_rewards/rejected': '-0.231', 'eval_rewards/accuracies': '0.6644', 'eval_rewards/margins': '0.2971', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.011', 'epoch': '0.9136'}
236
+ {'loss': '0.565', 'grad_norm': '26.62', 'learning_rate': '4.089e-08', 'rewards/chosen': '0.1648', 'rewards/rejected': '-0.2517', 'rewards/accuracies': '0.695', 'rewards/margins': '0.4165', 'logps/chosen': '-140.4', 'logps/rejected': '-116.5', 'logits/chosen': '-0.9448', 'logits/rejected': '-0.9807', 'epoch': '0.9184'}
237
+ {'loss': '0.6396', 'grad_norm': '22.88', 'learning_rate': '3.628e-08', 'rewards/chosen': '0.0286', 'rewards/rejected': '-0.2316', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2602', 'logps/chosen': '-130.4', 'logps/rejected': '-118.4', 'logits/chosen': '-0.9926', 'logits/rejected': '-0.9664', 'epoch': '0.9232'}
238
+ {'loss': '0.6648', 'grad_norm': '25.25', 'learning_rate': '3.194e-08', 'rewards/chosen': '0.007605', 'rewards/rejected': '-0.1721', 'rewards/accuracies': '0.66', 'rewards/margins': '0.1797', 'logps/chosen': '-129.3', 'logps/rejected': '-115.7', 'logits/chosen': '-0.9287', 'logits/rejected': '-1.047', 'epoch': '0.928'}
239
+ {'loss': '0.6219', 'grad_norm': '22.38', 'learning_rate': '2.787e-08', 'rewards/chosen': '0.01246', 'rewards/rejected': '-0.264', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2764', 'logps/chosen': '-131.1', 'logps/rejected': '-126.7', 'logits/chosen': '-0.9881', 'logits/rejected': '-0.9735', 'epoch': '0.9328'}
240
+ {'loss': '0.5815', 'grad_norm': '26.25', 'learning_rate': '2.407e-08', 'rewards/chosen': '0.09192', 'rewards/rejected': '-0.2578', 'rewards/accuracies': '0.735', 'rewards/margins': '0.3497', 'logps/chosen': '-135.7', 'logps/rejected': '-104.4', 'logits/chosen': '-0.9655', 'logits/rejected': '-1.012', 'epoch': '0.9376'}
241
+ {'loss': '0.6224', 'grad_norm': '40', 'learning_rate': '2.055e-08', 'rewards/chosen': '0.06027', 'rewards/rejected': '-0.205', 'rewards/accuracies': '0.655', 'rewards/margins': '0.2653', 'logps/chosen': '-142', 'logps/rejected': '-110.6', 'logits/chosen': '-1.003', 'logits/rejected': '-1.027', 'epoch': '0.9424'}
242
+ {'loss': '0.6462', 'grad_norm': '31.38', 'learning_rate': '1.73e-08', 'rewards/chosen': '0.02588', 'rewards/rejected': '-0.191', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2169', 'logps/chosen': '-130.3', 'logps/rejected': '-104.3', 'logits/chosen': '-0.9781', 'logits/rejected': '-0.9542', 'epoch': '0.9472'}
243
+ {'loss': '0.5845', 'grad_norm': '26.75', 'learning_rate': '1.433e-08', 'rewards/chosen': '0.117', 'rewards/rejected': '-0.2956', 'rewards/accuracies': '0.705', 'rewards/margins': '0.4126', 'logps/chosen': '-135.4', 'logps/rejected': '-108.1', 'logits/chosen': '-0.9659', 'logits/rejected': '-0.9748', 'epoch': '0.952'}
244
+ {'loss': '0.6054', 'grad_norm': '21.12', 'learning_rate': '1.164e-08', 'rewards/chosen': '0.09082', 'rewards/rejected': '-0.2336', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3244', 'logps/chosen': '-131.2', 'logps/rejected': '-126.3', 'logits/chosen': '-0.9997', 'logits/rejected': '-0.9632', 'epoch': '0.9568'}
245
+ {'loss': '0.6403', 'grad_norm': '27.12', 'learning_rate': '9.225e-09', 'rewards/chosen': '0.08749', 'rewards/rejected': '-0.1388', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2263', 'logps/chosen': '-123.6', 'logps/rejected': '-111.1', 'logits/chosen': '-1.051', 'logits/rejected': '-1.017', 'epoch': '0.9617'}
246
+ {'eval_loss': '0.617', 'eval_runtime': '159.7', 'eval_samples_per_second': '13.72', 'eval_steps_per_second': '2.743', 'eval_rewards/chosen': '0.06905', 'eval_rewards/rejected': '-0.2269', 'eval_rewards/accuracies': '0.6648', 'eval_rewards/margins': '0.296', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.9617'}
247
+ {'loss': '0.5908', 'grad_norm': '20.88', 'learning_rate': '7.09e-09', 'rewards/chosen': '0.1685', 'rewards/rejected': '-0.2152', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3837', 'logps/chosen': '-124.9', 'logps/rejected': '-116.3', 'logits/chosen': '-1.08', 'logits/rejected': '-1.014', 'epoch': '0.9665'}
248
+ {'loss': '0.6394', 'grad_norm': '25', 'learning_rate': '5.235e-09', 'rewards/chosen': '0.05396', 'rewards/rejected': '-0.1692', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2231', 'logps/chosen': '-127.7', 'logps/rejected': '-110.3', 'logits/chosen': '-1.066', 'logits/rejected': '-1.035', 'epoch': '0.9713'}
249
+ {'loss': '0.5833', 'grad_norm': '25.5', 'learning_rate': '3.66e-09', 'rewards/chosen': '0.1592', 'rewards/rejected': '-0.2082', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3674', 'logps/chosen': '-135.9', 'logps/rejected': '-114', 'logits/chosen': '-0.9349', 'logits/rejected': '-1.003', 'epoch': '0.9761'}
250
+ {'loss': '0.5958', 'grad_norm': '24.75', 'learning_rate': '2.366e-09', 'rewards/chosen': '0.1077', 'rewards/rejected': '-0.2435', 'rewards/accuracies': '0.68', 'rewards/margins': '0.3513', 'logps/chosen': '-135.1', 'logps/rejected': '-103.6', 'logits/chosen': '-0.9955', 'logits/rejected': '-1.007', 'epoch': '0.9809'}
251
+ {'loss': '0.6165', 'grad_norm': '37', 'learning_rate': '1.353e-09', 'rewards/chosen': '0.09438', 'rewards/rejected': '-0.2076', 'rewards/accuracies': '0.65', 'rewards/margins': '0.302', 'logps/chosen': '-146.4', 'logps/rejected': '-109.5', 'logits/chosen': '-1.048', 'logits/rejected': '-1.034', 'epoch': '0.9857'}
252
+ {'loss': '0.6304', 'grad_norm': '24.12', 'learning_rate': '6.209e-10', 'rewards/chosen': '0.007376', 'rewards/rejected': '-0.2453', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2526', 'logps/chosen': '-129.7', 'logps/rejected': '-115.5', 'logits/chosen': '-1.008', 'logits/rejected': '-1.016', 'epoch': '0.9905'}
253
+ {'loss': '0.5655', 'grad_norm': '31.25', 'learning_rate': '1.704e-10', 'rewards/chosen': '0.1247', 'rewards/rejected': '-0.3073', 'rewards/accuracies': '0.755', 'rewards/margins': '0.432', 'logps/chosen': '-137.2', 'logps/rejected': '-115.6', 'logits/chosen': '-1.054', 'logits/rejected': '-1.011', 'epoch': '0.9953'}
254
+ {'loss': '0.61', 'grad_norm': '24.75', 'learning_rate': '1.408e-12', 'rewards/chosen': '0.0525', 'rewards/rejected': '-0.2418', 'rewards/accuracies': '0.6667', 'rewards/margins': '0.2943', 'logps/chosen': '-122.4', 'logps/rejected': '-103.4', 'logits/chosen': '-1.078', 'logits/rejected': '-1.001', 'epoch': '1'}
255
+ {'train_runtime': '1.07e+04', 'train_samples_per_second': '3.889', 'train_steps_per_second': '0.194', 'train_loss': '0.6242', 'epoch': '1'}
256
+ [dpo_train] Final model saved → models/dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653/final_model
257
+ [dpo_train] Run metadata → models/dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653/run_meta.json
258
+
259
+ [dpo_train] Done.