Upload folder using huggingface_hub
Browse files
- DPO_HH_final_model/chat_template.jinja +45 -0
- DPO_HH_final_model/config.json +61 -0
- DPO_HH_final_model/generation_config.json +9 -0
- DPO_HH_final_model/model.safetensors +3 -0
- DPO_HH_final_model/tokenizer.json +0 -0
- DPO_HH_final_model/tokenizer_config.json +20 -0
- DPO_HH_final_model/training_args.bin +3 -0
- DPO_HH_final_model/training_hh.txt +259 -0
DPO_HH_final_model/chat_template.jinja
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{{- bos_token -}}
|
| 2 |
+
{%- set keep_past_thinking = keep_past_thinking | default(false) -%}
|
| 3 |
+
{%- set ns = namespace(system_prompt="") -%}
|
| 4 |
+
{%- if messages[0]["role"] == "system" -%}
|
| 5 |
+
{%- set ns.system_prompt = messages[0]["content"] -%}
|
| 6 |
+
{%- set messages = messages[1:] -%}
|
| 7 |
+
{%- endif -%}
|
| 8 |
+
{%- if tools -%}
|
| 9 |
+
{%- set ns.system_prompt = ns.system_prompt + ("\n" if ns.system_prompt else "") + "List of tools: [" -%}
|
| 10 |
+
{%- for tool in tools -%}
|
| 11 |
+
{%- if tool is not string -%}
|
| 12 |
+
{%- set tool = tool | tojson -%}
|
| 13 |
+
{%- endif -%}
|
| 14 |
+
{%- set ns.system_prompt = ns.system_prompt + tool -%}
|
| 15 |
+
{%- if not loop.last -%}
|
| 16 |
+
{%- set ns.system_prompt = ns.system_prompt + ", " -%}
|
| 17 |
+
{%- endif -%}
|
| 18 |
+
{%- endfor -%}
|
| 19 |
+
{%- set ns.system_prompt = ns.system_prompt + "]" -%}
|
| 20 |
+
{%- endif -%}
|
| 21 |
+
{%- if ns.system_prompt -%}
|
| 22 |
+
{{- "<|im_start|>system\n" + ns.system_prompt + "<|im_end|>\n" -}}
|
| 23 |
+
{%- endif -%}
|
| 24 |
+
{%- set ns.last_assistant_index = -1 -%}
|
| 25 |
+
{%- for message in messages -%}
|
| 26 |
+
{%- if message["role"] == "assistant" -%}
|
| 27 |
+
{%- set ns.last_assistant_index = loop.index0 -%}
|
| 28 |
+
{%- endif -%}
|
| 29 |
+
{%- endfor -%}
|
| 30 |
+
{%- for message in messages -%}
|
| 31 |
+
{{- "<|im_start|>" + message["role"] + "\n" -}}
|
| 32 |
+
{%- set content = message["content"] -%}
|
| 33 |
+
{%- if content is not string -%}
|
| 34 |
+
{%- set content = content | tojson -%}
|
| 35 |
+
{%- endif -%}
|
| 36 |
+
{%- if message["role"] == "assistant" and not keep_past_thinking and loop.index0 != ns.last_assistant_index -%}
|
| 37 |
+
{%- if "</think>" in content -%}
|
| 38 |
+
{%- set content = content.split("</think>")[-1] | trim -%}
|
| 39 |
+
{%- endif -%}
|
| 40 |
+
{%- endif -%}
|
| 41 |
+
{{- content + "<|im_end|>\n" -}}
|
| 42 |
+
{%- endfor -%}
|
| 43 |
+
{%- if add_generation_prompt -%}
|
| 44 |
+
{{- "<|im_start|>assistant\n" -}}
|
| 45 |
+
{%- endif -%}
|
DPO_HH_final_model/config.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Lfm2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"block_auto_adjust_ff_dim": true,
|
| 6 |
+
"block_dim": 2048,
|
| 7 |
+
"block_ff_dim": 12288,
|
| 8 |
+
"block_ffn_dim_multiplier": 1.0,
|
| 9 |
+
"block_mlp_init_scale": 1.0,
|
| 10 |
+
"block_multiple_of": 256,
|
| 11 |
+
"block_norm_eps": 1e-05,
|
| 12 |
+
"block_out_init_scale": 1.0,
|
| 13 |
+
"block_use_swiglu": true,
|
| 14 |
+
"block_use_xavier_init": true,
|
| 15 |
+
"bos_token_id": 1,
|
| 16 |
+
"conv_L_cache": 3,
|
| 17 |
+
"conv_bias": false,
|
| 18 |
+
"conv_dim": 2048,
|
| 19 |
+
"conv_use_xavier_init": true,
|
| 20 |
+
"dtype": "bfloat16",
|
| 21 |
+
"eos_token_id": 7,
|
| 22 |
+
"hidden_size": 2048,
|
| 23 |
+
"initializer_range": 0.02,
|
| 24 |
+
"intermediate_size": 12288,
|
| 25 |
+
"layer_types": [
|
| 26 |
+
"conv",
|
| 27 |
+
"conv",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"conv",
|
| 30 |
+
"conv",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"conv",
|
| 33 |
+
"conv",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"conv",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"conv",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"conv",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"conv"
|
| 42 |
+
],
|
| 43 |
+
"max_position_embeddings": 128000,
|
| 44 |
+
"model_type": "lfm2",
|
| 45 |
+
"norm_eps": 1e-05,
|
| 46 |
+
"num_attention_heads": 32,
|
| 47 |
+
"num_heads": 32,
|
| 48 |
+
"num_hidden_layers": 16,
|
| 49 |
+
"num_key_value_heads": 8,
|
| 50 |
+
"pad_token_id": 0,
|
| 51 |
+
"rope_parameters": {
|
| 52 |
+
"rope_theta": 1000000.0,
|
| 53 |
+
"rope_type": "default"
|
| 54 |
+
},
|
| 55 |
+
"tie_embedding": true,
|
| 56 |
+
"tie_word_embeddings": true,
|
| 57 |
+
"transformers_version": "5.2.0",
|
| 58 |
+
"use_cache": false,
|
| 59 |
+
"use_pos_enc": true,
|
| 60 |
+
"vocab_size": 65536
|
| 61 |
+
}
|
DPO_HH_final_model/generation_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 1,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
7
|
| 6 |
+
],
|
| 7 |
+
"pad_token_id": 0,
|
| 8 |
+
"transformers_version": "5.2.0"
|
| 9 |
+
}
|
DPO_HH_final_model/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c2bcb3974ce453f6f0c9006aaa302a92fde9c90dbf466b238f163984c1471cd1
|
| 3 |
+
size 2340697936
|
DPO_HH_final_model/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
DPO_HH_final_model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": "<|startoftext|>",
|
| 4 |
+
"clean_up_tokenization_spaces": false,
|
| 5 |
+
"eos_token": "<|im_end|>",
|
| 6 |
+
"is_local": false,
|
| 7 |
+
"legacy": false,
|
| 8 |
+
"model_input_names": [
|
| 9 |
+
"input_ids",
|
| 10 |
+
"attention_mask"
|
| 11 |
+
],
|
| 12 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 13 |
+
"pad_token": "<|pad|>",
|
| 14 |
+
"padding_side": "right",
|
| 15 |
+
"sp_model_kwargs": {},
|
| 16 |
+
"spaces_between_special_tokens": false,
|
| 17 |
+
"tokenizer_class": "TokenizersBackend",
|
| 18 |
+
"use_default_system_prompt": false,
|
| 19 |
+
"use_fast": true
|
| 20 |
+
}
|
DPO_HH_final_model/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:983ea1c33ce5a577d9c542e6b3b45bd3b96cdc7a8f294566b89031fb84da6786
|
| 3 |
+
size 6225
|
DPO_HH_final_model/training_hh.txt
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
========================================
|
| 2 |
+
DPO Full Fine-Tuning
|
| 3 |
+
========================================
|
| 4 |
+
Model : LiquidAI/LFM2.5-1.2B-Instruct
|
| 5 |
+
Dataset : Anthropic/hh-rlhf (data_dir=helpful-base)
|
| 6 |
+
Epochs : 1
|
| 7 |
+
Batch size : 5 (grad_accum=4, eff=20)
|
| 8 |
+
Learning rate : 2e-6
|
| 9 |
+
DPO beta : 0.2
|
| 10 |
+
Reference : NF4 4-bit (pass --no_ref_4bit for bfloat16)
|
| 11 |
+
Output dir : models
|
| 12 |
+
========================================
|
| 13 |
+
|
| 14 |
+
[dpo_train] Run : dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653
|
| 15 |
+
[dpo_train] Output : models/dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653
|
| 16 |
+
[dpo_train] Loading dataset: Anthropic/hh-rlhf split=train data_dir=helpful-base
|
| 17 |
+
[dpo_train] Full size : 43,835 rows | columns: ['chosen', 'rejected']
|
| 18 |
+
[dpo_train] Format : hh-rlhf (full conversation strings)
|
| 19 |
+
[dpo_train] After cleaning: 43,785 rows
|
| 20 |
+
[dpo_train] Train: 41,595 Eval: 2,190
|
| 21 |
+
[dpo_train] Loading policy model (bfloat16, trainable) …
|
| 22 |
+
[dpo_train] Loading reference model (bfloat16, frozen) …
|
| 23 |
+
[dpo_train] Policy params : 1170M (all trainable)
|
| 24 |
+
|
| 25 |
+
[dpo_train] Starting DPO full fine-tuning (epochs=1 eff_batch=20) …
|
| 26 |
+
|
| 27 |
+
{'loss': '0.6883', 'grad_norm': '31', 'learning_rate': '8.654e-08', 'rewards/chosen': '-0.0005825', 'rewards/rejected': '-0.01204', 'rewards/accuracies': '0.41', 'rewards/margins': '0.01146', 'logps/chosen': '-122.7', 'logps/rejected': '-116.7', 'logits/chosen': '-0.9475', 'logits/rejected': '-0.9243', 'epoch': '0.004808'}
|
| 28 |
+
{'loss': '0.6935', 'grad_norm': '27.38', 'learning_rate': '1.827e-07', 'rewards/chosen': '0.002688', 'rewards/rejected': '0.001497', 'rewards/accuracies': '0.54', 'rewards/margins': '0.001191', 'logps/chosen': '-129.2', 'logps/rejected': '-122.4', 'logits/chosen': '-0.9333', 'logits/rejected': '-0.8879', 'epoch': '0.009617'}
|
| 29 |
+
{'loss': '0.6904', 'grad_norm': '25.5', 'learning_rate': '2.788e-07', 'rewards/chosen': '0.006291', 'rewards/rejected': '-0.0009644', 'rewards/accuracies': '0.49', 'rewards/margins': '0.007255', 'logps/chosen': '-138', 'logps/rejected': '-119.7', 'logits/chosen': '-0.9096', 'logits/rejected': '-0.9259', 'epoch': '0.01442'}
|
| 30 |
+
{'loss': '0.6963', 'grad_norm': '28.5', 'learning_rate': '3.75e-07', 'rewards/chosen': '0.002371', 'rewards/rejected': '0.006851', 'rewards/accuracies': '0.465', 'rewards/margins': '-0.004481', 'logps/chosen': '-125', 'logps/rejected': '-115.7', 'logits/chosen': '-0.9426', 'logits/rejected': '-0.8451', 'epoch': '0.01923'}
|
| 31 |
+
{'loss': '0.6955', 'grad_norm': '30.88', 'learning_rate': '4.712e-07', 'rewards/chosen': '0.006614', 'rewards/rejected': '0.009299', 'rewards/accuracies': '0.51', 'rewards/margins': '-0.002685', 'logps/chosen': '-137.1', 'logps/rejected': '-110.3', 'logits/chosen': '-0.8802', 'logits/rejected': '-0.9408', 'epoch': '0.02404'}
|
| 32 |
+
{'loss': '0.6893', 'grad_norm': '31.75', 'learning_rate': '5.673e-07', 'rewards/chosen': '0.01269', 'rewards/rejected': '0.003417', 'rewards/accuracies': '0.545', 'rewards/margins': '0.009273', 'logps/chosen': '-134.3', 'logps/rejected': '-107.8', 'logits/chosen': '-0.8803', 'logits/rejected': '-0.8356', 'epoch': '0.02885'}
|
| 33 |
+
{'loss': '0.6908', 'grad_norm': '31.88', 'learning_rate': '6.635e-07', 'rewards/chosen': '0.01333', 'rewards/rejected': '0.006563', 'rewards/accuracies': '0.54', 'rewards/margins': '0.006767', 'logps/chosen': '-139.3', 'logps/rejected': '-108.5', 'logits/chosen': '-0.9105', 'logits/rejected': '-0.8774', 'epoch': '0.03366'}
|
| 34 |
+
{'loss': '0.6926', 'grad_norm': '29.62', 'learning_rate': '7.596e-07', 'rewards/chosen': '0.02761', 'rewards/rejected': '0.02442', 'rewards/accuracies': '0.515', 'rewards/margins': '0.003186', 'logps/chosen': '-125.4', 'logps/rejected': '-118', 'logits/chosen': '-0.9279', 'logits/rejected': '-0.9228', 'epoch': '0.03847'}
|
| 35 |
+
{'loss': '0.6868', 'grad_norm': '33.25', 'learning_rate': '8.558e-07', 'rewards/chosen': '0.04787', 'rewards/rejected': '0.03277', 'rewards/accuracies': '0.49', 'rewards/margins': '0.01509', 'logps/chosen': '-136.4', 'logps/rejected': '-119.5', 'logits/chosen': '-0.9487', 'logits/rejected': '-0.961', 'epoch': '0.04327'}
|
| 36 |
+
{'loss': '0.6894', 'grad_norm': '23', 'learning_rate': '9.519e-07', 'rewards/chosen': '0.04472', 'rewards/rejected': '0.03485', 'rewards/accuracies': '0.55', 'rewards/margins': '0.009868', 'logps/chosen': '-138.3', 'logps/rejected': '-118.5', 'logits/chosen': '-0.9422', 'logits/rejected': '-0.8888', 'epoch': '0.04808'}
|
| 37 |
+
{'eval_loss': '0.6858', 'eval_runtime': '161.1', 'eval_samples_per_second': '13.59', 'eval_steps_per_second': '2.719', 'eval_rewards/chosen': '0.05564', 'eval_rewards/rejected': '0.03785', 'eval_rewards/accuracies': '0.5694', 'eval_rewards/margins': '0.01779', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-112.5', 'eval_logits/chosen': '-0.9387', 'eval_logits/rejected': '-0.93', 'epoch': '0.04808'}
|
| 38 |
+
{'loss': '0.6917', 'grad_norm': '31.38', 'learning_rate': '1.048e-06', 'rewards/chosen': '0.0569', 'rewards/rejected': '0.05051', 'rewards/accuracies': '0.47', 'rewards/margins': '0.006385', 'logps/chosen': '-129.2', 'logps/rejected': '-117.6', 'logits/chosen': '-0.9519', 'logits/rejected': '-0.8707', 'epoch': '0.05289'}
|
| 39 |
+
{'loss': '0.6867', 'grad_norm': '27.5', 'learning_rate': '1.144e-06', 'rewards/chosen': '0.07523', 'rewards/rejected': '0.05881', 'rewards/accuracies': '0.525', 'rewards/margins': '0.01642', 'logps/chosen': '-121', 'logps/rejected': '-109.4', 'logits/chosen': '-0.9517', 'logits/rejected': '-0.8928', 'epoch': '0.0577'}
|
| 40 |
+
{'loss': '0.6854', 'grad_norm': '30.88', 'learning_rate': '1.24e-06', 'rewards/chosen': '0.09279', 'rewards/rejected': '0.07028', 'rewards/accuracies': '0.58', 'rewards/margins': '0.02251', 'logps/chosen': '-134.2', 'logps/rejected': '-113.6', 'logits/chosen': '-0.88', 'logits/rejected': '-0.8708', 'epoch': '0.06251'}
|
| 41 |
+
{'loss': '0.6779', 'grad_norm': '31.38', 'learning_rate': '1.337e-06', 'rewards/chosen': '0.114', 'rewards/rejected': '0.07756', 'rewards/accuracies': '0.57', 'rewards/margins': '0.03639', 'logps/chosen': '-140.8', 'logps/rejected': '-134', 'logits/chosen': '-0.92', 'logits/rejected': '-0.8675', 'epoch': '0.06732'}
|
| 42 |
+
{'loss': '0.6759', 'grad_norm': '36.5', 'learning_rate': '1.433e-06', 'rewards/chosen': '0.1178', 'rewards/rejected': '0.07662', 'rewards/accuracies': '0.57', 'rewards/margins': '0.04123', 'logps/chosen': '-134.7', 'logps/rejected': '-100.4', 'logits/chosen': '-0.9678', 'logits/rejected': '-0.9473', 'epoch': '0.07212'}
|
| 43 |
+
{'loss': '0.6826', 'grad_norm': '28.38', 'learning_rate': '1.529e-06', 'rewards/chosen': '0.1401', 'rewards/rejected': '0.1108', 'rewards/accuracies': '0.56', 'rewards/margins': '0.02931', 'logps/chosen': '-128.9', 'logps/rejected': '-113.4', 'logits/chosen': '-0.9077', 'logits/rejected': '-0.934', 'epoch': '0.07693'}
|
| 44 |
+
{'loss': '0.6703', 'grad_norm': '25', 'learning_rate': '1.625e-06', 'rewards/chosen': '0.1574', 'rewards/rejected': '0.1022', 'rewards/accuracies': '0.575', 'rewards/margins': '0.05519', 'logps/chosen': '-136.8', 'logps/rejected': '-104.9', 'logits/chosen': '-0.9116', 'logits/rejected': '-0.9714', 'epoch': '0.08174'}
|
| 45 |
+
{'loss': '0.6654', 'grad_norm': '23.5', 'learning_rate': '1.721e-06', 'rewards/chosen': '0.1637', 'rewards/rejected': '0.09603', 'rewards/accuracies': '0.59', 'rewards/margins': '0.06763', 'logps/chosen': '-143.8', 'logps/rejected': '-112.8', 'logits/chosen': '-0.9118', 'logits/rejected': '-0.8706', 'epoch': '0.08655'}
|
| 46 |
+
{'loss': '0.6712', 'grad_norm': '23.75', 'learning_rate': '1.817e-06', 'rewards/chosen': '0.1769', 'rewards/rejected': '0.1167', 'rewards/accuracies': '0.61', 'rewards/margins': '0.06021', 'logps/chosen': '-132.5', 'logps/rejected': '-116.7', 'logits/chosen': '-0.865', 'logits/rejected': '-0.8988', 'epoch': '0.09136'}
|
| 47 |
+
{'loss': '0.6669', 'grad_norm': '31.62', 'learning_rate': '1.913e-06', 'rewards/chosen': '0.207', 'rewards/rejected': '0.135', 'rewards/accuracies': '0.64', 'rewards/margins': '0.07194', 'logps/chosen': '-144.9', 'logps/rejected': '-110.8', 'logits/chosen': '-0.889', 'logits/rejected': '-0.9204', 'epoch': '0.09617'}
|
| 48 |
+
{'eval_loss': '0.6687', 'eval_runtime': '159.7', 'eval_samples_per_second': '13.71', 'eval_steps_per_second': '2.743', 'eval_rewards/chosen': '0.2023', 'eval_rewards/rejected': '0.1295', 'eval_rewards/accuracies': '0.5991', 'eval_rewards/margins': '0.07285', 'eval_logps/chosen': '-132.7', 'eval_logps/rejected': '-112', 'eval_logits/chosen': '-0.9474', 'eval_logits/rejected': '-0.941', 'epoch': '0.09617'}
|
| 49 |
+
{'loss': '0.6711', 'grad_norm': '29.62', 'learning_rate': '2e-06', 'rewards/chosen': '0.1896', 'rewards/rejected': '0.1194', 'rewards/accuracies': '0.57', 'rewards/margins': '0.07017', 'logps/chosen': '-133', 'logps/rejected': '-120.2', 'logits/chosen': '-0.8696', 'logits/rejected': '-0.874', 'epoch': '0.101'}
|
| 50 |
+
{'loss': '0.6657', 'grad_norm': '26.75', 'learning_rate': '2e-06', 'rewards/chosen': '0.1632', 'rewards/rejected': '0.07528', 'rewards/accuracies': '0.625', 'rewards/margins': '0.08788', 'logps/chosen': '-123.4', 'logps/rejected': '-113.4', 'logits/chosen': '-0.9532', 'logits/rejected': '-0.9136', 'epoch': '0.1058'}
|
| 51 |
+
{'loss': '0.6611', 'grad_norm': '27.62', 'learning_rate': '1.999e-06', 'rewards/chosen': '0.1101', 'rewards/rejected': '0.02324', 'rewards/accuracies': '0.655', 'rewards/margins': '0.08688', 'logps/chosen': '-124.4', 'logps/rejected': '-107.2', 'logits/chosen': '-0.9392', 'logits/rejected': '-0.9295', 'epoch': '0.1106'}
|
| 52 |
+
{'loss': '0.6415', 'grad_norm': '28.62', 'learning_rate': '1.999e-06', 'rewards/chosen': '0.0739', 'rewards/rejected': '-0.05723', 'rewards/accuracies': '0.64', 'rewards/margins': '0.1311', 'logps/chosen': '-127', 'logps/rejected': '-106.9', 'logits/chosen': '-0.9826', 'logits/rejected': '-0.9332', 'epoch': '0.1154'}
|
| 53 |
+
{'loss': '0.6554', 'grad_norm': '30.25', 'learning_rate': '1.998e-06', 'rewards/chosen': '0.05428', 'rewards/rejected': '-0.05029', 'rewards/accuracies': '0.63', 'rewards/margins': '0.1046', 'logps/chosen': '-144.8', 'logps/rejected': '-117.7', 'logits/chosen': '-0.9814', 'logits/rejected': '-0.9672', 'epoch': '0.1202'}
|
| 54 |
+
{'loss': '0.6437', 'grad_norm': '25.12', 'learning_rate': '1.996e-06', 'rewards/chosen': '0.08971', 'rewards/rejected': '-0.04912', 'rewards/accuracies': '0.625', 'rewards/margins': '0.1388', 'logps/chosen': '-131.5', 'logps/rejected': '-112.1', 'logits/chosen': '-1.002', 'logits/rejected': '-0.9217', 'epoch': '0.125'}
|
| 55 |
+
{'loss': '0.6711', 'grad_norm': '27.75', 'learning_rate': '1.995e-06', 'rewards/chosen': '0.1214', 'rewards/rejected': '0.0422', 'rewards/accuracies': '0.58', 'rewards/margins': '0.0792', 'logps/chosen': '-132.4', 'logps/rejected': '-118.9', 'logits/chosen': '-0.9889', 'logits/rejected': '-0.952', 'epoch': '0.1298'}
|
| 56 |
+
{'loss': '0.6558', 'grad_norm': '22.75', 'learning_rate': '1.993e-06', 'rewards/chosen': '0.1462', 'rewards/rejected': '0.03874', 'rewards/accuracies': '0.675', 'rewards/margins': '0.1075', 'logps/chosen': '-121.1', 'logps/rejected': '-111.4', 'logits/chosen': '-0.9636', 'logits/rejected': '-0.9747', 'epoch': '0.1346'}
|
| 57 |
+
{'loss': '0.6358', 'grad_norm': '24.5', 'learning_rate': '1.991e-06', 'rewards/chosen': '0.1654', 'rewards/rejected': '0.005945', 'rewards/accuracies': '0.68', 'rewards/margins': '0.1595', 'logps/chosen': '-130.7', 'logps/rejected': '-105.6', 'logits/chosen': '-0.9287', 'logits/rejected': '-0.8992', 'epoch': '0.1394'}
|
| 58 |
+
{'loss': '0.6442', 'grad_norm': '28.25', 'learning_rate': '1.988e-06', 'rewards/chosen': '0.2266', 'rewards/rejected': '0.06751', 'rewards/accuracies': '0.635', 'rewards/margins': '0.1591', 'logps/chosen': '-141.6', 'logps/rejected': '-103.6', 'logits/chosen': '-1.063', 'logits/rejected': '-0.9664', 'epoch': '0.1442'}
|
| 59 |
+
{'eval_loss': '0.6489', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.2131', 'eval_rewards/rejected': '0.06933', 'eval_rewards/accuracies': '0.6224', 'eval_rewards/margins': '0.1438', 'eval_logps/chosen': '-132.6', 'eval_logps/rejected': '-112.3', 'eval_logits/chosen': '-0.9661', 'eval_logits/rejected': '-0.9621', 'epoch': '0.1442'}
|
| 60 |
+
{'loss': '0.6499', 'grad_norm': '26.88', 'learning_rate': '1.986e-06', 'rewards/chosen': '0.2295', 'rewards/rejected': '0.1049', 'rewards/accuracies': '0.68', 'rewards/margins': '0.1246', 'logps/chosen': '-128.6', 'logps/rejected': '-116', 'logits/chosen': '-0.9282', 'logits/rejected': '-0.9179', 'epoch': '0.1491'}
|
| 61 |
+
{'loss': '0.6379', 'grad_norm': '26', 'learning_rate': '1.983e-06', 'rewards/chosen': '0.2347', 'rewards/rejected': '0.06157', 'rewards/accuracies': '0.675', 'rewards/margins': '0.1732', 'logps/chosen': '-137.8', 'logps/rejected': '-102', 'logits/chosen': '-0.977', 'logits/rejected': '-1.001', 'epoch': '0.1539'}
|
| 62 |
+
{'loss': '0.6507', 'grad_norm': '30.12', 'learning_rate': '1.979e-06', 'rewards/chosen': '0.2242', 'rewards/rejected': '0.06374', 'rewards/accuracies': '0.63', 'rewards/margins': '0.1605', 'logps/chosen': '-128.8', 'logps/rejected': '-113.3', 'logits/chosen': '-0.8801', 'logits/rejected': '-0.8927', 'epoch': '0.1587'}
|
| 63 |
+
{'loss': '0.6497', 'grad_norm': '26.38', 'learning_rate': '1.976e-06', 'rewards/chosen': '0.2519', 'rewards/rejected': '0.08665', 'rewards/accuracies': '0.615', 'rewards/margins': '0.1653', 'logps/chosen': '-144.7', 'logps/rejected': '-115.5', 'logits/chosen': '-0.9742', 'logits/rejected': '-0.9811', 'epoch': '0.1635'}
|
| 64 |
+
{'loss': '0.6489', 'grad_norm': '28', 'learning_rate': '1.972e-06', 'rewards/chosen': '0.2025', 'rewards/rejected': '0.04876', 'rewards/accuracies': '0.63', 'rewards/margins': '0.1537', 'logps/chosen': '-144', 'logps/rejected': '-108.9', 'logits/chosen': '-1.025', 'logits/rejected': '-1.036', 'epoch': '0.1683'}
|
| 65 |
+
{'loss': '0.6199', 'grad_norm': '26.5', 'learning_rate': '1.968e-06', 'rewards/chosen': '0.1235', 'rewards/rejected': '-0.08143', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2049', 'logps/chosen': '-138.2', 'logps/rejected': '-115.1', 'logits/chosen': '-0.9685', 'logits/rejected': '-0.9499', 'epoch': '0.1731'}
|
| 66 |
+
{'loss': '0.661', 'grad_norm': '29.38', 'learning_rate': '1.964e-06', 'rewards/chosen': '0.06153', 'rewards/rejected': '-0.06504', 'rewards/accuracies': '0.59', 'rewards/margins': '0.1266', 'logps/chosen': '-136.6', 'logps/rejected': '-119.5', 'logits/chosen': '-0.9926', 'logits/rejected': '-1.001', 'epoch': '0.1779'}
|
| 67 |
+
{'loss': '0.656', 'grad_norm': '29.38', 'learning_rate': '1.959e-06', 'rewards/chosen': '0.07785', 'rewards/rejected': '-0.04319', 'rewards/accuracies': '0.605', 'rewards/margins': '0.121', 'logps/chosen': '-135.2', 'logps/rejected': '-105.1', 'logits/chosen': '-0.9725', 'logits/rejected': '-0.9489', 'epoch': '0.1827'}
|
| 68 |
+
{'loss': '0.615', 'grad_norm': '24.12', 'learning_rate': '1.954e-06', 'rewards/chosen': '0.184', 'rewards/rejected': '-0.04084', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2248', 'logps/chosen': '-138.4', 'logps/rejected': '-106.1', 'logits/chosen': '-0.9518', 'logits/rejected': '-0.9407', 'epoch': '0.1875'}
|
| 69 |
+
{'loss': '0.6265', 'grad_norm': '44.25', 'learning_rate': '1.949e-06', 'rewards/chosen': '0.248', 'rewards/rejected': '0.03399', 'rewards/accuracies': '0.67', 'rewards/margins': '0.214', 'logps/chosen': '-133.5', 'logps/rejected': '-114', 'logits/chosen': '-0.9333', 'logits/rejected': '-0.9424', 'epoch': '0.1923'}
|
| 70 |
+
{'eval_loss': '0.6418', 'eval_runtime': '159.8', 'eval_samples_per_second': '13.71', 'eval_steps_per_second': '2.741', 'eval_rewards/chosen': '0.2418', 'eval_rewards/rejected': '0.0508', 'eval_rewards/accuracies': '0.6356', 'eval_rewards/margins': '0.191', 'eval_logps/chosen': '-132.5', 'eval_logps/rejected': '-112.4', 'eval_logits/chosen': '-0.9695', 'eval_logits/rejected': '-0.9666', 'epoch': '0.1923'}
|
| 71 |
+
{'loss': '0.618', 'grad_norm': '26.12', 'learning_rate': '1.944e-06', 'rewards/chosen': '0.315', 'rewards/rejected': '0.05388', 'rewards/accuracies': '0.7', 'rewards/margins': '0.2612', 'logps/chosen': '-143.2', 'logps/rejected': '-112', 'logits/chosen': '-0.9392', 'logits/rejected': '-0.9602', 'epoch': '0.1971'}
|
| 72 |
+
{'loss': '0.6097', 'grad_norm': '23.75', 'learning_rate': '1.938e-06', 'rewards/chosen': '0.2828', 'rewards/rejected': '0.01982', 'rewards/accuracies': '0.7', 'rewards/margins': '0.263', 'logps/chosen': '-147.5', 'logps/rejected': '-108.1', 'logits/chosen': '-0.8923', 'logits/rejected': '-0.9127', 'epoch': '0.2019'}
|
| 73 |
+
{'loss': '0.6102', 'grad_norm': '30.62', 'learning_rate': '1.932e-06', 'rewards/chosen': '0.3189', 'rewards/rejected': '0.02987', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2891', 'logps/chosen': '-147', 'logps/rejected': '-106.5', 'logits/chosen': '-0.9787', 'logits/rejected': '-0.9898', 'epoch': '0.2068'}
|
| 74 |
+
{'loss': '0.6417', 'grad_norm': '31.12', 'learning_rate': '1.926e-06', 'rewards/chosen': '0.2384', 'rewards/rejected': '0.04542', 'rewards/accuracies': '0.625', 'rewards/margins': '0.193', 'logps/chosen': '-118.7', 'logps/rejected': '-108.7', 'logits/chosen': '-0.9971', 'logits/rejected': '-1.018', 'epoch': '0.2116'}
|
| 75 |
+
{'loss': '0.6385', 'grad_norm': '27.5', 'learning_rate': '1.919e-06', 'rewards/chosen': '0.2713', 'rewards/rejected': '0.0755', 'rewards/accuracies': '0.65', 'rewards/margins': '0.1958', 'logps/chosen': '-140.3', 'logps/rejected': '-109.6', 'logits/chosen': '-1.029', 'logits/rejected': '-0.9416', 'epoch': '0.2164'}
|
| 76 |
+
{'loss': '0.6357', 'grad_norm': '30.12', 'learning_rate': '1.913e-06', 'rewards/chosen': '0.2819', 'rewards/rejected': '0.08405', 'rewards/accuracies': '0.64', 'rewards/margins': '0.1978', 'logps/chosen': '-138.9', 'logps/rejected': '-107.7', 'logits/chosen': '-0.9281', 'logits/rejected': '-0.95', 'epoch': '0.2212'}
|
| 77 |
+
{'loss': '0.6558', 'grad_norm': '28.38', 'learning_rate': '1.906e-06', 'rewards/chosen': '0.2306', 'rewards/rejected': '0.05818', 'rewards/accuracies': '0.6', 'rewards/margins': '0.1724', 'logps/chosen': '-137.8', 'logps/rejected': '-122', 'logits/chosen': '-1.046', 'logits/rejected': '-1.005', 'epoch': '0.226'}
|
| 78 |
+
{'loss': '0.6319', 'grad_norm': '26.75', 'learning_rate': '1.898e-06', 'rewards/chosen': '0.1872', 'rewards/rejected': '-0.02641', 'rewards/accuracies': '0.595', 'rewards/margins': '0.2137', 'logps/chosen': '-121.4', 'logps/rejected': '-113.5', 'logits/chosen': '-0.9336', 'logits/rejected': '-0.9091', 'epoch': '0.2308'}
|
| 79 |
+
{'loss': '0.611', 'grad_norm': '19', 'learning_rate': '1.891e-06', 'rewards/chosen': '0.1955', 'rewards/rejected': '-0.05569', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2512', 'logps/chosen': '-119.2', 'logps/rejected': '-100.1', 'logits/chosen': '-0.9642', 'logits/rejected': '-0.8836', 'epoch': '0.2356'}
|
| 80 |
+
{'loss': '0.6402', 'grad_norm': '24.12', 'learning_rate': '1.883e-06', 'rewards/chosen': '0.1428', 'rewards/rejected': '-0.06168', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2045', 'logps/chosen': '-124.9', 'logps/rejected': '-104.6', 'logits/chosen': '-0.9637', 'logits/rejected': '-0.9667', 'epoch': '0.2404'}
|
| 81 |
+
{'eval_loss': '0.6341', 'eval_runtime': '159.5', 'eval_samples_per_second': '13.73', 'eval_steps_per_second': '2.746', 'eval_rewards/chosen': '0.1508', 'eval_rewards/rejected': '-0.06118', 'eval_rewards/accuracies': '0.6379', 'eval_rewards/margins': '0.212', 'eval_logps/chosen': '-132.9', 'eval_logps/rejected': '-113', 'eval_logits/chosen': '-0.9813', 'eval_logits/rejected': '-0.9792', 'epoch': '0.2404'}
|
| 82 |
+
{'loss': '0.597', 'grad_norm': '35.75', 'learning_rate': '1.875e-06', 'rewards/chosen': '0.1986', 'rewards/rejected': '-0.07326', 'rewards/accuracies': '0.73', 'rewards/margins': '0.2719', 'logps/chosen': '-129.8', 'logps/rejected': '-105.9', 'logits/chosen': '-0.967', 'logits/rejected': '-0.891', 'epoch': '0.2452'}
|
| 83 |
+
{'loss': '0.6392', 'grad_norm': '29.25', 'learning_rate': '1.867e-06', 'rewards/chosen': '0.1554', 'rewards/rejected': '-0.02627', 'rewards/accuracies': '0.635', 'rewards/margins': '0.1817', 'logps/chosen': '-125.5', 'logps/rejected': '-119.1', 'logits/chosen': '-0.9933', 'logits/rejected': '-1.009', 'epoch': '0.25'}
|
| 84 |
+
{'loss': '0.6291', 'grad_norm': '26.62', 'learning_rate': '1.858e-06', 'rewards/chosen': '0.2175', 'rewards/rejected': '-0.01706', 'rewards/accuracies': '0.62', 'rewards/margins': '0.2346', 'logps/chosen': '-133.1', 'logps/rejected': '-109.7', 'logits/chosen': '-0.9084', 'logits/rejected': '-0.9529', 'epoch': '0.2548'}
|
| 85 |
+
{'loss': '0.6334', 'grad_norm': '23.25', 'learning_rate': '1.85e-06', 'rewards/chosen': '0.1353', 'rewards/rejected': '-0.1065', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2418', 'logps/chosen': '-134.8', 'logps/rejected': '-113.9', 'logits/chosen': '-1.018', 'logits/rejected': '-0.9214', 'epoch': '0.2596'}
|
| 86 |
+
{'loss': '0.6322', 'grad_norm': '31.25', 'learning_rate': '1.841e-06', 'rewards/chosen': '0.2059', 'rewards/rejected': '-0.007326', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2132', 'logps/chosen': '-146.8', 'logps/rejected': '-119.6', 'logits/chosen': '-0.9414', 'logits/rejected': '-0.892', 'epoch': '0.2645'}
|
| 87 |
+
{'loss': '0.6363', 'grad_norm': '40.25', 'learning_rate': '1.831e-06', 'rewards/chosen': '0.1914', 'rewards/rejected': '-0.05578', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2472', 'logps/chosen': '-132.1', 'logps/rejected': '-109.6', 'logits/chosen': '-0.9965', 'logits/rejected': '-0.95', 'epoch': '0.2693'}
|
| 88 |
+
{'loss': '0.6188', 'grad_norm': '25.5', 'learning_rate': '1.822e-06', 'rewards/chosen': '0.1997', 'rewards/rejected': '-0.04471', 'rewards/accuracies': '0.675', 'rewards/margins': '0.2445', 'logps/chosen': '-133.6', 'logps/rejected': '-113.7', 'logits/chosen': '-0.9632', 'logits/rejected': '-1.003', 'epoch': '0.2741'}
|
| 89 |
+
{'loss': '0.6543', 'grad_norm': '27.25', 'learning_rate': '1.812e-06', 'rewards/chosen': '0.09105', 'rewards/rejected': '-0.09299', 'rewards/accuracies': '0.63', 'rewards/margins': '0.184', 'logps/chosen': '-119.6', 'logps/rejected': '-107.9', 'logits/chosen': '-0.9797', 'logits/rejected': '-1.013', 'epoch': '0.2789'}
|
| 90 |
+
{'loss': '0.6133', 'grad_norm': '25.38', 'learning_rate': '1.802e-06', 'rewards/chosen': '0.06649', 'rewards/rejected': '-0.1867', 'rewards/accuracies': '0.69', 'rewards/margins': '0.2532', 'logps/chosen': '-124', 'logps/rejected': '-101', 'logits/chosen': '-0.9574', 'logits/rejected': '-0.9465', 'epoch': '0.2837'}
|
| 91 |
+
{'loss': '0.636', 'grad_norm': '26.62', 'learning_rate': '1.792e-06', 'rewards/chosen': '0.07529', 'rewards/rejected': '-0.1398', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2151', 'logps/chosen': '-137.9', 'logps/rejected': '-111', 'logits/chosen': '-1.004', 'logits/rejected': '-0.993', 'epoch': '0.2885'}
|
| 92 |
+
{'eval_loss': '0.6281', 'eval_runtime': '159.3', 'eval_samples_per_second': '13.75', 'eval_steps_per_second': '2.749', 'eval_rewards/chosen': '0.06902', 'eval_rewards/rejected': '-0.1686', 'eval_rewards/accuracies': '0.6411', 'eval_rewards/margins': '0.2376', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.5', 'eval_logits/chosen': '-0.9913', 'eval_logits/rejected': '-0.9898', 'epoch': '0.2885'}
|
| 93 |
+
{'loss': '0.6471', 'grad_norm': '27.62', 'learning_rate': '1.782e-06', 'rewards/chosen': '0.04069', 'rewards/rejected': '-0.1679', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2086', 'logps/chosen': '-127.5', 'logps/rejected': '-113.4', 'logits/chosen': '-0.9642', 'logits/rejected': '-0.8913', 'epoch': '0.2933'}
|
| 94 |
+
{'loss': '0.6378', 'grad_norm': '30.25', 'learning_rate': '1.771e-06', 'rewards/chosen': '0.107', 'rewards/rejected': '-0.1462', 'rewards/accuracies': '0.62', 'rewards/margins': '0.2532', 'logps/chosen': '-141.8', 'logps/rejected': '-110.1', 'logits/chosen': '-1.034', 'logits/rejected': '-1.025', 'epoch': '0.2981'}
|
| 95 |
+
{'loss': '0.6681', 'grad_norm': '23.38', 'learning_rate': '1.761e-06', 'rewards/chosen': '0.1194', 'rewards/rejected': '-0.04729', 'rewards/accuracies': '0.605', 'rewards/margins': '0.1667', 'logps/chosen': '-134.4', 'logps/rejected': '-122', 'logits/chosen': '-0.9662', 'logits/rejected': '-0.9422', 'epoch': '0.3029'}
|
| 96 |
+
{'loss': '0.6354', 'grad_norm': '33.25', 'learning_rate': '1.75e-06', 'rewards/chosen': '0.1558', 'rewards/rejected': '-0.0791', 'rewards/accuracies': '0.59', 'rewards/margins': '0.2349', 'logps/chosen': '-127.1', 'logps/rejected': '-119.7', 'logits/chosen': '-0.9494', 'logits/rejected': '-0.9116', 'epoch': '0.3077'}
|
| 97 |
+
{'loss': '0.6125', 'grad_norm': '25.38', 'learning_rate': '1.738e-06', 'rewards/chosen': '0.1551', 'rewards/rejected': '-0.1041', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2592', 'logps/chosen': '-153.3', 'logps/rejected': '-104.8', 'logits/chosen': '-0.956', 'logits/rejected': '-0.9402', 'epoch': '0.3125'}
|
| 98 |
+
{'loss': '0.6565', 'grad_norm': '24.88', 'learning_rate': '1.727e-06', 'rewards/chosen': '0.08494', 'rewards/rejected': '-0.07927', 'rewards/accuracies': '0.64', 'rewards/margins': '0.1642', 'logps/chosen': '-126.7', 'logps/rejected': '-110.5', 'logits/chosen': '-0.9842', 'logits/rejected': '-0.9274', 'epoch': '0.3173'}
|
| 99 |
+
{'loss': '0.6244', 'grad_norm': '32', 'learning_rate': '1.715e-06', 'rewards/chosen': '0.1263', 'rewards/rejected': '-0.1182', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2445', 'logps/chosen': '-119.4', 'logps/rejected': '-122.2', 'logits/chosen': '-0.9953', 'logits/rejected': '-1.006', 'epoch': '0.3222'}
|
| 100 |
+
{'loss': '0.6332', 'grad_norm': '27.38', 'learning_rate': '1.704e-06', 'rewards/chosen': '0.0588', 'rewards/rejected': '-0.1336', 'rewards/accuracies': '0.65', 'rewards/margins': '0.1924', 'logps/chosen': '-123', 'logps/rejected': '-108.3', 'logits/chosen': '-0.9625', 'logits/rejected': '-0.8965', 'epoch': '0.327'}
|
| 101 |
+
{'loss': '0.6331', 'grad_norm': '24.75', 'learning_rate': '1.692e-06', 'rewards/chosen': '0.1213', 'rewards/rejected': '-0.1225', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2438', 'logps/chosen': '-123.7', 'logps/rejected': '-111', 'logits/chosen': '-1.017', 'logits/rejected': '-0.9669', 'epoch': '0.3318'}
|
| 102 |
+
{'loss': '0.6159', 'grad_norm': '22.25', 'learning_rate': '1.679e-06', 'rewards/chosen': '0.04103', 'rewards/rejected': '-0.2323', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2734', 'logps/chosen': '-119.8', 'logps/rejected': '-105.6', 'logits/chosen': '-0.981', 'logits/rejected': '-0.982', 'epoch': '0.3366'}
|
| 103 |
+
{'eval_loss': '0.6262', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.05219', 'eval_rewards/rejected': '-0.1832', 'eval_rewards/accuracies': '0.6589', 'eval_rewards/margins': '0.2353', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-0.9941', 'eval_logits/rejected': '-0.9926', 'epoch': '0.3366'}
|
| 104 |
+
{'loss': '0.5909', 'grad_norm': '25.75', 'learning_rate': '1.667e-06', 'rewards/chosen': '0.1096', 'rewards/rejected': '-0.1837', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2932', 'logps/chosen': '-137.7', 'logps/rejected': '-119.6', 'logits/chosen': '-0.9429', 'logits/rejected': '-0.9566', 'epoch': '0.3414'}
|
| 105 |
+
{'loss': '0.6095', 'grad_norm': '19.75', 'learning_rate': '1.654e-06', 'rewards/chosen': '0.05275', 'rewards/rejected': '-0.2056', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2583', 'logps/chosen': '-118.9', 'logps/rejected': '-104.4', 'logits/chosen': '-0.9903', 'logits/rejected': '-0.9753', 'epoch': '0.3462'}
|
| 106 |
+
{'loss': '0.6072', 'grad_norm': '39.25', 'learning_rate': '1.642e-06', 'rewards/chosen': '0.05501', 'rewards/rejected': '-0.2174', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2725', 'logps/chosen': '-133.6', 'logps/rejected': '-103.4', 'logits/chosen': '-0.9403', 'logits/rejected': '-0.8617', 'epoch': '0.351'}
|
| 107 |
+
{'loss': '0.6099', 'grad_norm': '28.12', 'learning_rate': '1.629e-06', 'rewards/chosen': '0.106', 'rewards/rejected': '-0.1533', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2593', 'logps/chosen': '-139.5', 'logps/rejected': '-125.9', 'logits/chosen': '-0.9693', 'logits/rejected': '-0.9525', 'epoch': '0.3558'}
|
| 108 |
+
{'loss': '0.626', 'grad_norm': '29.38', 'learning_rate': '1.615e-06', 'rewards/chosen': '0.07547', 'rewards/rejected': '-0.1947', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2702', 'logps/chosen': '-131.4', 'logps/rejected': '-111.5', 'logits/chosen': '-0.948', 'logits/rejected': '-0.9349', 'epoch': '0.3606'}
|
| 109 |
+
{'loss': '0.5866', 'grad_norm': '25.25', 'learning_rate': '1.602e-06', 'rewards/chosen': '0.1291', 'rewards/rejected': '-0.1772', 'rewards/accuracies': '0.7', 'rewards/margins': '0.3063', 'logps/chosen': '-123.3', 'logps/rejected': '-113.3', 'logits/chosen': '-1.019', 'logits/rejected': '-0.9748', 'epoch': '0.3654'}
|
| 110 |
+
{'loss': '0.6346', 'grad_norm': '24.88', 'learning_rate': '1.589e-06', 'rewards/chosen': '0.1312', 'rewards/rejected': '-0.1053', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2365', 'logps/chosen': '-117.5', 'logps/rejected': '-105.8', 'logits/chosen': '-1.056', 'logits/rejected': '-1.041', 'epoch': '0.3702'}
|
| 111 |
+
{'loss': '0.5949', 'grad_norm': '28.12', 'learning_rate': '1.575e-06', 'rewards/chosen': '0.09233', 'rewards/rejected': '-0.2184', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3107', 'logps/chosen': '-136.3', 'logps/rejected': '-124.7', 'logits/chosen': '-0.9485', 'logits/rejected': '-0.9489', 'epoch': '0.375'}
|
| 112 |
+
{'loss': '0.668', 'grad_norm': '35.75', 'learning_rate': '1.561e-06', 'rewards/chosen': '0.01486', 'rewards/rejected': '-0.1466', 'rewards/accuracies': '0.59', 'rewards/margins': '0.1615', 'logps/chosen': '-124.6', 'logps/rejected': '-116.9', 'logits/chosen': '-1.092', 'logits/rejected': '-0.9749', 'epoch': '0.3799'}
|
| 113 |
+
{'loss': '0.6477', 'grad_norm': '28.25', 'learning_rate': '1.547e-06', 'rewards/chosen': '0.03967', 'rewards/rejected': '-0.1497', 'rewards/accuracies': '0.61', 'rewards/margins': '0.1894', 'logps/chosen': '-128.2', 'logps/rejected': '-109.7', 'logits/chosen': '-0.9921', 'logits/rejected': '-1.002', 'epoch': '0.3847'}
|
| 114 |
+
{'eval_loss': '0.6239', 'eval_runtime': '158.8', 'eval_samples_per_second': '13.79', 'eval_steps_per_second': '2.758', 'eval_rewards/chosen': '0.07336', 'eval_rewards/rejected': '-0.1804', 'eval_rewards/accuracies': '0.6548', 'eval_rewards/margins': '0.2538', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.002', 'eval_logits/rejected': '-1.001', 'epoch': '0.3847'}
|
| 115 |
+
{'loss': '0.6338', 'grad_norm': '22.12', 'learning_rate': '1.533e-06', 'rewards/chosen': '0.1184', 'rewards/rejected': '-0.121', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2394', 'logps/chosen': '-129.7', 'logps/rejected': '-123.2', 'logits/chosen': '-1.003', 'logits/rejected': '-0.9961', 'epoch': '0.3895'}
|
| 116 |
+
{'loss': '0.6632', 'grad_norm': '41.25', 'learning_rate': '1.519e-06', 'rewards/chosen': '0.0514', 'rewards/rejected': '-0.1366', 'rewards/accuracies': '0.62', 'rewards/margins': '0.188', 'logps/chosen': '-136.2', 'logps/rejected': '-119.2', 'logits/chosen': '-1.032', 'logits/rejected': '-1.03', 'epoch': '0.3943'}
|
| 117 |
+
{'loss': '0.6371', 'grad_norm': '28.25', 'learning_rate': '1.504e-06', 'rewards/chosen': '0.06569', 'rewards/rejected': '-0.156', 'rewards/accuracies': '0.67', 'rewards/margins': '0.2217', 'logps/chosen': '-131.1', 'logps/rejected': '-117.2', 'logits/chosen': '-1.015', 'logits/rejected': '-1.012', 'epoch': '0.3991'}
|
| 118 |
+
{'loss': '0.604', 'grad_norm': '24.75', 'learning_rate': '1.49e-06', 'rewards/chosen': '0.07572', 'rewards/rejected': '-0.2187', 'rewards/accuracies': '0.74', 'rewards/margins': '0.2945', 'logps/chosen': '-137.4', 'logps/rejected': '-118.2', 'logits/chosen': '-0.9746', 'logits/rejected': '-0.9323', 'epoch': '0.4039'}
|
| 119 |
+
{'loss': '0.6266', 'grad_norm': '30', 'learning_rate': '1.475e-06', 'rewards/chosen': '0.1121', 'rewards/rejected': '-0.1312', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2433', 'logps/chosen': '-135.9', 'logps/rejected': '-116', 'logits/chosen': '-0.9413', 'logits/rejected': '-1.037', 'epoch': '0.4087'}
|
| 120 |
+
{'loss': '0.6456', 'grad_norm': '30.62', 'learning_rate': '1.46e-06', 'rewards/chosen': '-0.003546', 'rewards/rejected': '-0.2196', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2161', 'logps/chosen': '-123.7', 'logps/rejected': '-107.6', 'logits/chosen': '-1.004', 'logits/rejected': '-1.041', 'epoch': '0.4135'}
|
| 121 |
+
{'loss': '0.6239', 'grad_norm': '15.88', 'learning_rate': '1.445e-06', 'rewards/chosen': '0.07761', 'rewards/rejected': '-0.178', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2556', 'logps/chosen': '-124.6', 'logps/rejected': '-98.34', 'logits/chosen': '-0.9579', 'logits/rejected': '-0.9283', 'epoch': '0.4183'}
|
| 122 |
+
{'loss': '0.6506', 'grad_norm': '34.5', 'learning_rate': '1.43e-06', 'rewards/chosen': '0.1127', 'rewards/rejected': '-0.07568', 'rewards/accuracies': '0.635', 'rewards/margins': '0.1883', 'logps/chosen': '-125.6', 'logps/rejected': '-113.9', 'logits/chosen': '-0.9883', 'logits/rejected': '-1.021', 'epoch': '0.4231'}
|
| 123 |
+
{'loss': '0.5877', 'grad_norm': '23.5', 'learning_rate': '1.415e-06', 'rewards/chosen': '0.1939', 'rewards/rejected': '-0.1659', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3598', 'logps/chosen': '-144.3', 'logps/rejected': '-104.2', 'logits/chosen': '-0.9698', 'logits/rejected': '-0.9483', 'epoch': '0.4279'}
|
| 124 |
+
{'loss': '0.5945', 'grad_norm': '23.62', 'learning_rate': '1.4e-06', 'rewards/chosen': '0.2027', 'rewards/rejected': '-0.1275', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3301', 'logps/chosen': '-136.4', 'logps/rejected': '-106.5', 'logits/chosen': '-1.04', 'logits/rejected': '-1.026', 'epoch': '0.4327'}
|
| 125 |
+
{'eval_loss': '0.621', 'eval_runtime': '159.5', 'eval_samples_per_second': '13.73', 'eval_steps_per_second': '2.747', 'eval_rewards/chosen': '0.1077', 'eval_rewards/rejected': '-0.1569', 'eval_rewards/accuracies': '0.6598', 'eval_rewards/margins': '0.2646', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.5', 'eval_logits/chosen': '-1.001', 'eval_logits/rejected': '-1', 'epoch': '0.4327'}
|
| 126 |
+
{'loss': '0.592', 'grad_norm': '28.75', 'learning_rate': '1.384e-06', 'rewards/chosen': '0.1301', 'rewards/rejected': '-0.1729', 'rewards/accuracies': '0.685', 'rewards/margins': '0.303', 'logps/chosen': '-131.6', 'logps/rejected': '-104.5', 'logits/chosen': '-1.047', 'logits/rejected': '-1.001', 'epoch': '0.4376'}
|
| 127 |
+
{'loss': '0.6258', 'grad_norm': '31.88', 'learning_rate': '1.369e-06', 'rewards/chosen': '0.1164', 'rewards/rejected': '-0.123', 'rewards/accuracies': '0.66', 'rewards/margins': '0.2394', 'logps/chosen': '-127.5', 'logps/rejected': '-118.2', 'logits/chosen': '-1.001', 'logits/rejected': '-0.9918', 'epoch': '0.4424'}
|
| 128 |
+
{'loss': '0.6184', 'grad_norm': '25.12', 'learning_rate': '1.353e-06', 'rewards/chosen': '0.1316', 'rewards/rejected': '-0.1737', 'rewards/accuracies': '0.585', 'rewards/margins': '0.3054', 'logps/chosen': '-139.3', 'logps/rejected': '-118.4', 'logits/chosen': '-0.9795', 'logits/rejected': '-0.9423', 'epoch': '0.4472'}
|
| 129 |
+
{'loss': '0.6156', 'grad_norm': '30', 'learning_rate': '1.337e-06', 'rewards/chosen': '0.06766', 'rewards/rejected': '-0.225', 'rewards/accuracies': '0.66', 'rewards/margins': '0.2927', 'logps/chosen': '-133.4', 'logps/rejected': '-105.4', 'logits/chosen': '-0.9695', 'logits/rejected': '-0.9534', 'epoch': '0.452'}
|
| 130 |
+
{'loss': '0.623', 'grad_norm': '27.25', 'learning_rate': '1.321e-06', 'rewards/chosen': '0.1109', 'rewards/rejected': '-0.1649', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2758', 'logps/chosen': '-135', 'logps/rejected': '-123.8', 'logits/chosen': '-0.9916', 'logits/rejected': '-0.934', 'epoch': '0.4568'}
|
| 131 |
+
{'loss': '0.6401', 'grad_norm': '27.5', 'learning_rate': '1.306e-06', 'rewards/chosen': '0.05028', 'rewards/rejected': '-0.1952', 'rewards/accuracies': '0.615', 'rewards/margins': '0.2455', 'logps/chosen': '-121', 'logps/rejected': '-111.4', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9843', 'epoch': '0.4616'}
|
| 132 |
+
{'loss': '0.5608', 'grad_norm': '24.38', 'learning_rate': '1.289e-06', 'rewards/chosen': '0.1163', 'rewards/rejected': '-0.294', 'rewards/accuracies': '0.73', 'rewards/margins': '0.4103', 'logps/chosen': '-134.2', 'logps/rejected': '-103', 'logits/chosen': '-1.023', 'logits/rejected': '-1.027', 'epoch': '0.4664'}
|
| 133 |
+
{'loss': '0.5944', 'grad_norm': '24.38', 'learning_rate': '1.273e-06', 'rewards/chosen': '0.1042', 'rewards/rejected': '-0.2077', 'rewards/accuracies': '0.71', 'rewards/margins': '0.312', 'logps/chosen': '-144.2', 'logps/rejected': '-101.9', 'logits/chosen': '-0.9673', 'logits/rejected': '-0.9317', 'epoch': '0.4712'}
|
| 134 |
+
{'loss': '0.6302', 'grad_norm': '21.88', 'learning_rate': '1.257e-06', 'rewards/chosen': '0.1611', 'rewards/rejected': '-0.1021', 'rewards/accuracies': '0.615', 'rewards/margins': '0.2632', 'logps/chosen': '-131.3', 'logps/rejected': '-117.5', 'logits/chosen': '-0.9503', 'logits/rejected': '-0.9778', 'epoch': '0.476'}
|
| 135 |
+
{'loss': '0.6027', 'grad_norm': '31.88', 'learning_rate': '1.241e-06', 'rewards/chosen': '0.1524', 'rewards/rejected': '-0.1854', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3378', 'logps/chosen': '-135.9', 'logps/rejected': '-115.3', 'logits/chosen': '-0.9442', 'logits/rejected': '-0.9484', 'epoch': '0.4808'}
|
| 136 |
+
{'eval_loss': '0.6208', 'eval_runtime': '159.3', 'eval_samples_per_second': '13.75', 'eval_steps_per_second': '2.75', 'eval_rewards/chosen': '0.09863', 'eval_rewards/rejected': '-0.177', 'eval_rewards/accuracies': '0.658', 'eval_rewards/margins': '0.2757', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.001', 'eval_logits/rejected': '-1.001', 'epoch': '0.4808'}
|
| 137 |
+
{'loss': '0.5893', 'grad_norm': '28.5', 'learning_rate': '1.225e-06', 'rewards/chosen': '0.1764', 'rewards/rejected': '-0.1885', 'rewards/accuracies': '0.705', 'rewards/margins': '0.3649', 'logps/chosen': '-152.2', 'logps/rejected': '-117.6', 'logits/chosen': '-0.9854', 'logits/rejected': '-0.911', 'epoch': '0.4856'}
|
| 138 |
+
{'loss': '0.6349', 'grad_norm': '31.75', 'learning_rate': '1.208e-06', 'rewards/chosen': '0.09442', 'rewards/rejected': '-0.1627', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2571', 'logps/chosen': '-145.2', 'logps/rejected': '-111', 'logits/chosen': '-0.9148', 'logits/rejected': '-0.9454', 'epoch': '0.4904'}
|
| 139 |
+
{'loss': '0.6097', 'grad_norm': '22.75', 'learning_rate': '1.192e-06', 'rewards/chosen': '0.13', 'rewards/rejected': '-0.1788', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3088', 'logps/chosen': '-123', 'logps/rejected': '-106.8', 'logits/chosen': '-0.9644', 'logits/rejected': '-0.9964', 'epoch': '0.4953'}
|
| 140 |
+
{'loss': '0.5916', 'grad_norm': '48.75', 'learning_rate': '1.175e-06', 'rewards/chosen': '0.181', 'rewards/rejected': '-0.1517', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3327', 'logps/chosen': '-121.3', 'logps/rejected': '-131.1', 'logits/chosen': '-1.03', 'logits/rejected': '-0.934', 'epoch': '0.5001'}
|
| 141 |
+
{'loss': '0.6417', 'grad_norm': '28.75', 'learning_rate': '1.159e-06', 'rewards/chosen': '0.1072', 'rewards/rejected': '-0.122', 'rewards/accuracies': '0.61', 'rewards/margins': '0.2292', 'logps/chosen': '-128.8', 'logps/rejected': '-122.2', 'logits/chosen': '-0.9683', 'logits/rejected': '-0.9303', 'epoch': '0.5049'}
|
| 142 |
+
{'loss': '0.6223', 'grad_norm': '33', 'learning_rate': '1.142e-06', 'rewards/chosen': '0.1564', 'rewards/rejected': '-0.1069', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2633', 'logps/chosen': '-132.7', 'logps/rejected': '-116.8', 'logits/chosen': '-1', 'logits/rejected': '-1.002', 'epoch': '0.5097'}
|
| 143 |
+
{'loss': '0.5956', 'grad_norm': '19.88', 'learning_rate': '1.126e-06', 'rewards/chosen': '0.1761', 'rewards/rejected': '-0.168', 'rewards/accuracies': '0.665', 'rewards/margins': '0.344', 'logps/chosen': '-134.1', 'logps/rejected': '-113.3', 'logits/chosen': '-1.061', 'logits/rejected': '-1.011', 'epoch': '0.5145'}
|
| 144 |
+
{'loss': '0.6268', 'grad_norm': '25.25', 'learning_rate': '1.109e-06', 'rewards/chosen': '0.08844', 'rewards/rejected': '-0.1834', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2718', 'logps/chosen': '-120.9', 'logps/rejected': '-107.2', 'logits/chosen': '-0.9652', 'logits/rejected': '-0.9194', 'epoch': '0.5193'}
|
| 145 |
+
{'loss': '0.6106', 'grad_norm': '24.75', 'learning_rate': '1.092e-06', 'rewards/chosen': '0.07643', 'rewards/rejected': '-0.184', 'rewards/accuracies': '0.7', 'rewards/margins': '0.2604', 'logps/chosen': '-126', 'logps/rejected': '-105.6', 'logits/chosen': '-0.9865', 'logits/rejected': '-0.9771', 'epoch': '0.5241'}
|
| 146 |
+
{'loss': '0.611', 'grad_norm': '28', 'learning_rate': '1.075e-06', 'rewards/chosen': '0.09692', 'rewards/rejected': '-0.2066', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3035', 'logps/chosen': '-136.7', 'logps/rejected': '-109.7', 'logits/chosen': '-0.9674', 'logits/rejected': '-0.9861', 'epoch': '0.5289'}
|
| 147 |
+
{'eval_loss': '0.6197', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.0907', 'eval_rewards/rejected': '-0.1901', 'eval_rewards/accuracies': '0.6562', 'eval_rewards/margins': '0.2808', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.004', 'eval_logits/rejected': '-1.004', 'epoch': '0.5289'}
|
| 148 |
+
{'loss': '0.5994', 'grad_norm': '28.25', 'learning_rate': '1.059e-06', 'rewards/chosen': '0.1401', 'rewards/rejected': '-0.2363', 'rewards/accuracies': '0.69', 'rewards/margins': '0.3764', 'logps/chosen': '-138.5', 'logps/rejected': '-115.4', 'logits/chosen': '-0.9296', 'logits/rejected': '-0.9662', 'epoch': '0.5337'}
|
| 149 |
+
{'loss': '0.6124', 'grad_norm': '31', 'learning_rate': '1.042e-06', 'rewards/chosen': '0.1156', 'rewards/rejected': '-0.1785', 'rewards/accuracies': '0.675', 'rewards/margins': '0.2941', 'logps/chosen': '-140.3', 'logps/rejected': '-104.5', 'logits/chosen': '-1.001', 'logits/rejected': '-0.9529', 'epoch': '0.5385'}
|
| 150 |
+
{'loss': '0.5968', 'grad_norm': '24.62', 'learning_rate': '1.025e-06', 'rewards/chosen': '0.09342', 'rewards/rejected': '-0.2448', 'rewards/accuracies': '0.7', 'rewards/margins': '0.3382', 'logps/chosen': '-138.1', 'logps/rejected': '-124.2', 'logits/chosen': '-0.926', 'logits/rejected': '-0.9841', 'epoch': '0.5433'}
|
| 151 |
+
{'loss': '0.6155', 'grad_norm': '30.88', 'learning_rate': '1.008e-06', 'rewards/chosen': '0.09974', 'rewards/rejected': '-0.198', 'rewards/accuracies': '0.655', 'rewards/margins': '0.2977', 'logps/chosen': '-125.8', 'logps/rejected': '-104', 'logits/chosen': '-0.9486', 'logits/rejected': '-0.9328', 'epoch': '0.5481'}
|
| 152 |
+
{'loss': '0.6336', 'grad_norm': '26.88', 'learning_rate': '9.916e-07', 'rewards/chosen': '0.1246', 'rewards/rejected': '-0.1427', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2673', 'logps/chosen': '-147', 'logps/rejected': '-121.5', 'logits/chosen': '-0.9657', 'logits/rejected': '-0.9106', 'epoch': '0.553'}
|
| 153 |
+
{'loss': '0.5825', 'grad_norm': '29.38', 'learning_rate': '9.748e-07', 'rewards/chosen': '0.162', 'rewards/rejected': '-0.2003', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3623', 'logps/chosen': '-134.6', 'logps/rejected': '-105.5', 'logits/chosen': '-1.025', 'logits/rejected': '-0.9952', 'epoch': '0.5578'}
|
| 154 |
+
{'loss': '0.5974', 'grad_norm': '21.62', 'learning_rate': '9.581e-07', 'rewards/chosen': '0.08895', 'rewards/rejected': '-0.2469', 'rewards/accuracies': '0.655', 'rewards/margins': '0.3359', 'logps/chosen': '-134.8', 'logps/rejected': '-106.8', 'logits/chosen': '-1.01', 'logits/rejected': '-0.9689', 'epoch': '0.5626'}
|
| 155 |
+
{'loss': '0.5966', 'grad_norm': '22', 'learning_rate': '9.413e-07', 'rewards/chosen': '0.09052', 'rewards/rejected': '-0.2249', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3154', 'logps/chosen': '-128.5', 'logps/rejected': '-105.4', 'logits/chosen': '-1.053', 'logits/rejected': '-0.9607', 'epoch': '0.5674'}
|
| 156 |
+
{'loss': '0.6572', 'grad_norm': '26.5', 'learning_rate': '9.246e-07', 'rewards/chosen': '0.02879', 'rewards/rejected': '-0.157', 'rewards/accuracies': '0.62', 'rewards/margins': '0.1858', 'logps/chosen': '-132.2', 'logps/rejected': '-105.7', 'logits/chosen': '-0.9782', 'logits/rejected': '-0.9498', 'epoch': '0.5722'}
|
| 157 |
+
{'loss': '0.6372', 'grad_norm': '35', 'learning_rate': '9.078e-07', 'rewards/chosen': '0.07806', 'rewards/rejected': '-0.1434', 'rewards/accuracies': '0.665', 'rewards/margins': '0.2215', 'logps/chosen': '-125.9', 'logps/rejected': '-126', 'logits/chosen': '-1.035', 'logits/rejected': '-0.9866', 'epoch': '0.577'}
|
| 158 |
+
{'eval_loss': '0.6194', 'eval_runtime': '159.3', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.749', 'eval_rewards/chosen': '0.102', 'eval_rewards/rejected': '-0.184', 'eval_rewards/accuracies': '0.658', 'eval_rewards/margins': '0.286', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.6', 'eval_logits/chosen': '-1.006', 'eval_logits/rejected': '-1.006', 'epoch': '0.577'}
|
| 159 |
+
{'loss': '0.5651', 'grad_norm': '25.38', 'learning_rate': '8.911e-07', 'rewards/chosen': '0.141', 'rewards/rejected': '-0.2724', 'rewards/accuracies': '0.725', 'rewards/margins': '0.4134', 'logps/chosen': '-123.4', 'logps/rejected': '-113.6', 'logits/chosen': '-0.9879', 'logits/rejected': '-0.9599', 'epoch': '0.5818'}
|
| 160 |
+
{'loss': '0.5842', 'grad_norm': '23.12', 'learning_rate': '8.745e-07', 'rewards/chosen': '0.1504', 'rewards/rejected': '-0.1981', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3485', 'logps/chosen': '-149.7', 'logps/rejected': '-116.2', 'logits/chosen': '-1.047', 'logits/rejected': '-1.062', 'epoch': '0.5866'}
|
| 161 |
+
{'loss': '0.6093', 'grad_norm': '27.5', 'learning_rate': '8.578e-07', 'rewards/chosen': '0.1642', 'rewards/rejected': '-0.1456', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3098', 'logps/chosen': '-130.4', 'logps/rejected': '-121.4', 'logits/chosen': '-1.036', 'logits/rejected': '-0.9943', 'epoch': '0.5914'}
|
| 162 |
+
{'loss': '0.6234', 'grad_norm': '34.25', 'learning_rate': '8.412e-07', 'rewards/chosen': '0.1339', 'rewards/rejected': '-0.1329', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2668', 'logps/chosen': '-133.1', 'logps/rejected': '-122.2', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9569', 'epoch': '0.5962'}
|
| 163 |
+
{'loss': '0.5824', 'grad_norm': '25.12', 'learning_rate': '8.247e-07', 'rewards/chosen': '0.1535', 'rewards/rejected': '-0.2017', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3552', 'logps/chosen': '-134.8', 'logps/rejected': '-111.3', 'logits/chosen': '-1.021', 'logits/rejected': '-0.983', 'epoch': '0.601'}
|
| 164 |
+
{'loss': '0.6085', 'grad_norm': '36.5', 'learning_rate': '8.082e-07', 'rewards/chosen': '0.121', 'rewards/rejected': '-0.2039', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3248', 'logps/chosen': '-132.3', 'logps/rejected': '-110.7', 'logits/chosen': '-0.9972', 'logits/rejected': '-0.9701', 'epoch': '0.6058'}
|
| 165 |
+
{'loss': '0.5618', 'grad_norm': '26.38', 'learning_rate': '7.918e-07', 'rewards/chosen': '0.1927', 'rewards/rejected': '-0.2224', 'rewards/accuracies': '0.71', 'rewards/margins': '0.4152', 'logps/chosen': '-145', 'logps/rejected': '-119.6', 'logits/chosen': '-1.046', 'logits/rejected': '-0.9814', 'epoch': '0.6107'}
|
| 166 |
+
{'loss': '0.6005', 'grad_norm': '13.94', 'learning_rate': '7.754e-07', 'rewards/chosen': '0.1197', 'rewards/rejected': '-0.1972', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3169', 'logps/chosen': '-127.7', 'logps/rejected': '-97.99', 'logits/chosen': '-1.017', 'logits/rejected': '-1.026', 'epoch': '0.6155'}
|
| 167 |
+
{'loss': '0.6101', 'grad_norm': '31.12', 'learning_rate': '7.591e-07', 'rewards/chosen': '0.1124', 'rewards/rejected': '-0.199', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3115', 'logps/chosen': '-128.6', 'logps/rejected': '-111.4', 'logits/chosen': '-1.014', 'logits/rejected': '-1.001', 'epoch': '0.6203'}
|
| 168 |
+
{'loss': '0.5744', 'grad_norm': '30.38', 'learning_rate': '7.428e-07', 'rewards/chosen': '0.1661', 'rewards/rejected': '-0.2175', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3836', 'logps/chosen': '-140.8', 'logps/rejected': '-122.9', 'logits/chosen': '-1.01', 'logits/rejected': '-0.9857', 'epoch': '0.6251'}
|
| 169 |
+
{'eval_loss': '0.6182', 'eval_runtime': '159.6', 'eval_samples_per_second': '13.72', 'eval_steps_per_second': '2.744', 'eval_rewards/chosen': '0.08965', 'eval_rewards/rejected': '-0.2029', 'eval_rewards/accuracies': '0.6543', 'eval_rewards/margins': '0.2925', 'eval_logps/chosen': '-133.2', 'eval_logps/rejected': '-113.7', 'eval_logits/chosen': '-1.007', 'eval_logits/rejected': '-1.008', 'epoch': '0.6251'}
|
| 170 |
+
{'loss': '0.6543', 'grad_norm': '22.5', 'learning_rate': '7.266e-07', 'rewards/chosen': '0.09405', 'rewards/rejected': '-0.1148', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2088', 'logps/chosen': '-123.1', 'logps/rejected': '-102.8', 'logits/chosen': '-1.008', 'logits/rejected': '-0.9873', 'epoch': '0.6299'}
|
| 171 |
+
{'loss': '0.6178', 'grad_norm': '26.38', 'learning_rate': '7.105e-07', 'rewards/chosen': '0.0634', 'rewards/rejected': '-0.2023', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2657', 'logps/chosen': '-142.2', 'logps/rejected': '-124.9', 'logits/chosen': '-1.034', 'logits/rejected': '-1.03', 'epoch': '0.6347'}
|
| 172 |
+
{'loss': '0.632', 'grad_norm': '26.75', 'learning_rate': '6.945e-07', 'rewards/chosen': '0.06304', 'rewards/rejected': '-0.1975', 'rewards/accuracies': '0.63', 'rewards/margins': '0.2606', 'logps/chosen': '-129.1', 'logps/rejected': '-112.8', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9704', 'epoch': '0.6395'}
|
| 173 |
+
{'loss': '0.6087', 'grad_norm': '23.38', 'learning_rate': '6.786e-07', 'rewards/chosen': '0.07113', 'rewards/rejected': '-0.2305', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3016', 'logps/chosen': '-119.9', 'logps/rejected': '-108.1', 'logits/chosen': '-1.016', 'logits/rejected': '-0.9578', 'epoch': '0.6443'}
|
| 174 |
+
{'loss': '0.6433', 'grad_norm': '35.75', 'learning_rate': '6.627e-07', 'rewards/chosen': '0.06799', 'rewards/rejected': '-0.1603', 'rewards/accuracies': '0.655', 'rewards/margins': '0.2282', 'logps/chosen': '-128.5', 'logps/rejected': '-120.6', 'logits/chosen': '-1.028', 'logits/rejected': '-1.012', 'epoch': '0.6491'}
|
| 175 |
+
{'loss': '0.6032', 'grad_norm': '21.62', 'learning_rate': '6.47e-07', 'rewards/chosen': '0.1394', 'rewards/rejected': '-0.1772', 'rewards/accuracies': '0.69', 'rewards/margins': '0.3166', 'logps/chosen': '-129.2', 'logps/rejected': '-108.2', 'logits/chosen': '-1.021', 'logits/rejected': '-1.034', 'epoch': '0.6539'}
|
| 176 |
+
{'loss': '0.6335', 'grad_norm': '42.75', 'learning_rate': '6.313e-07', 'rewards/chosen': '-0.004546', 'rewards/rejected': '-0.2462', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2416', 'logps/chosen': '-125', 'logps/rejected': '-105.8', 'logits/chosen': '-1.04', 'logits/rejected': '-0.9723', 'epoch': '0.6587'}
|
| 177 |
+
{'loss': '0.6123', 'grad_norm': '25.38', 'learning_rate': '6.158e-07', 'rewards/chosen': '0.1051', 'rewards/rejected': '-0.1864', 'rewards/accuracies': '0.64', 'rewards/margins': '0.2915', 'logps/chosen': '-137.5', 'logps/rejected': '-121.2', 'logits/chosen': '-0.9934', 'logits/rejected': '-0.9458', 'epoch': '0.6635'}
|
| 178 |
+
{'loss': '0.5943', 'grad_norm': '27', 'learning_rate': '6.003e-07', 'rewards/chosen': '0.1287', 'rewards/rejected': '-0.2019', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3306', 'logps/chosen': '-134.4', 'logps/rejected': '-110.2', 'logits/chosen': '-0.9903', 'logits/rejected': '-0.9943', 'epoch': '0.6683'}
|
| 179 |
+
{'loss': '0.6111', 'grad_norm': '29.5', 'learning_rate': '5.85e-07', 'rewards/chosen': '0.07695', 'rewards/rejected': '-0.2288', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3057', 'logps/chosen': '-132.5', 'logps/rejected': '-106.4', 'logits/chosen': '-0.9616', 'logits/rejected': '-0.9204', 'epoch': '0.6732'}
|
| 180 |
+
{'eval_loss': '0.6172', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.0812', 'eval_rewards/rejected': '-0.2119', 'eval_rewards/accuracies': '0.6644', 'eval_rewards/margins': '0.2931', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.7', 'eval_logits/chosen': '-1.008', 'eval_logits/rejected': '-1.008', 'epoch': '0.6732'}
|
| 181 |
+
{'loss': '0.6044', 'grad_norm': '39.75', 'learning_rate': '5.698e-07', 'rewards/chosen': '0.1359', 'rewards/rejected': '-0.2258', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3617', 'logps/chosen': '-133.3', 'logps/rejected': '-106.4', 'logits/chosen': '-0.9936', 'logits/rejected': '-0.9936', 'epoch': '0.678'}
|
| 182 |
+
{'loss': '0.5767', 'grad_norm': '29.5', 'learning_rate': '5.547e-07', 'rewards/chosen': '0.1588', 'rewards/rejected': '-0.2603', 'rewards/accuracies': '0.69', 'rewards/margins': '0.4192', 'logps/chosen': '-134', 'logps/rejected': '-122.9', 'logits/chosen': '-0.981', 'logits/rejected': '-0.9433', 'epoch': '0.6828'}
|
| 183 |
+
{'loss': '0.6466', 'grad_norm': '34.75', 'learning_rate': '5.397e-07', 'rewards/chosen': '0.06965', 'rewards/rejected': '-0.1698', 'rewards/accuracies': '0.59', 'rewards/margins': '0.2395', 'logps/chosen': '-134.3', 'logps/rejected': '-122.1', 'logits/chosen': '-0.9917', 'logits/rejected': '-0.974', 'epoch': '0.6876'}
|
| 184 |
+
{'loss': '0.5897', 'grad_norm': '34.5', 'learning_rate': '5.249e-07', 'rewards/chosen': '0.07432', 'rewards/rejected': '-0.2534', 'rewards/accuracies': '0.73', 'rewards/margins': '0.3278', 'logps/chosen': '-130.2', 'logps/rejected': '-106.7', 'logits/chosen': '-0.9764', 'logits/rejected': '-0.9641', 'epoch': '0.6924'}
|
| 185 |
+
{'loss': '0.6133', 'grad_norm': '26.38', 'learning_rate': '5.102e-07', 'rewards/chosen': '0.06769', 'rewards/rejected': '-0.2367', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3044', 'logps/chosen': '-132.5', 'logps/rejected': '-112.7', 'logits/chosen': '-0.9815', 'logits/rejected': '-0.9777', 'epoch': '0.6972'}
|
| 186 |
+
{'loss': '0.6261', 'grad_norm': '21.25', 'learning_rate': '4.956e-07', 'rewards/chosen': '0.04256', 'rewards/rejected': '-0.2149', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2575', 'logps/chosen': '-113.9', 'logps/rejected': '-104.1', 'logits/chosen': '-1.023', 'logits/rejected': '-1.017', 'epoch': '0.702'}
|
| 187 |
+
{'loss': '0.6319', 'grad_norm': '31.12', 'learning_rate': '4.812e-07', 'rewards/chosen': '0.07442', 'rewards/rejected': '-0.2227', 'rewards/accuracies': '0.725', 'rewards/margins': '0.2971', 'logps/chosen': '-144.3', 'logps/rejected': '-120.9', 'logits/chosen': '-0.9851', 'logits/rejected': '-0.9347', 'epoch': '0.7068'}
|
| 188 |
+
{'loss': '0.6194', 'grad_norm': '29.88', 'learning_rate': '4.67e-07', 'rewards/chosen': '0.09555', 'rewards/rejected': '-0.2322', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3277', 'logps/chosen': '-126.8', 'logps/rejected': '-113.1', 'logits/chosen': '-1.055', 'logits/rejected': '-1.009', 'epoch': '0.7116'}
|
| 189 |
+
{'loss': '0.6323', 'grad_norm': '28', 'learning_rate': '4.528e-07', 'rewards/chosen': '0.05447', 'rewards/rejected': '-0.2012', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2557', 'logps/chosen': '-129.2', 'logps/rejected': '-115.4', 'logits/chosen': '-1.021', 'logits/rejected': '-1.045', 'epoch': '0.7164'}
|
| 190 |
+
{'loss': '0.5895', 'grad_norm': '26', 'learning_rate': '4.389e-07', 'rewards/chosen': '0.02393', 'rewards/rejected': '-0.2991', 'rewards/accuracies': '0.72', 'rewards/margins': '0.323', 'logps/chosen': '-120.2', 'logps/rejected': '-101.5', 'logits/chosen': '-1.104', 'logits/rejected': '-0.9578', 'epoch': '0.7212'}
|
| 191 |
+
{'eval_loss': '0.6169', 'eval_runtime': '159.5', 'eval_samples_per_second': '13.73', 'eval_steps_per_second': '2.746', 'eval_rewards/chosen': '0.06714', 'eval_rewards/rejected': '-0.2274', 'eval_rewards/accuracies': '0.6676', 'eval_rewards/margins': '0.2946', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.7212'}
|
| 192 |
+
{'loss': '0.5721', 'grad_norm': '37.25', 'learning_rate': '4.25e-07', 'rewards/chosen': '0.1668', 'rewards/rejected': '-0.2653', 'rewards/accuracies': '0.705', 'rewards/margins': '0.4321', 'logps/chosen': '-152.7', 'logps/rejected': '-115.3', 'logits/chosen': '-1.047', 'logits/rejected': '-0.9918', 'epoch': '0.726'}
|
| 193 |
+
{'loss': '0.6355', 'grad_norm': '24.12', 'learning_rate': '4.114e-07', 'rewards/chosen': '0.06084', 'rewards/rejected': '-0.174', 'rewards/accuracies': '0.615', 'rewards/margins': '0.2348', 'logps/chosen': '-118', 'logps/rejected': '-113.7', 'logits/chosen': '-0.9507', 'logits/rejected': '-0.9365', 'epoch': '0.7309'}
|
| 194 |
+
{'loss': '0.6423', 'grad_norm': '27.5', 'learning_rate': '3.979e-07', 'rewards/chosen': '0.08837', 'rewards/rejected': '-0.1562', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2445', 'logps/chosen': '-129.5', 'logps/rejected': '-105.8', 'logits/chosen': '-0.9463', 'logits/rejected': '-1.061', 'epoch': '0.7357'}
|
| 195 |
+
{'loss': '0.5898', 'grad_norm': '30.38', 'learning_rate': '3.846e-07', 'rewards/chosen': '0.1229', 'rewards/rejected': '-0.2618', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3847', 'logps/chosen': '-142.4', 'logps/rejected': '-115.1', 'logits/chosen': '-1.005', 'logits/rejected': '-0.9725', 'epoch': '0.7405'}
|
| 196 |
+
{'loss': '0.5932', 'grad_norm': '24', 'learning_rate': '3.715e-07', 'rewards/chosen': '0.0869', 'rewards/rejected': '-0.2666', 'rewards/accuracies': '0.685', 'rewards/margins': '0.3535', 'logps/chosen': '-130.5', 'logps/rejected': '-117.2', 'logits/chosen': '-1.014', 'logits/rejected': '-0.9823', 'epoch': '0.7453'}
|
| 197 |
+
{'loss': '0.6321', 'grad_norm': '28.25', 'learning_rate': '3.585e-07', 'rewards/chosen': '0.1186', 'rewards/rejected': '-0.1726', 'rewards/accuracies': '0.675', 'rewards/margins': '0.2912', 'logps/chosen': '-122.4', 'logps/rejected': '-110.7', 'logits/chosen': '-1.069', 'logits/rejected': '-1.031', 'epoch': '0.7501'}
|
| 198 |
+
{'loss': '0.6521', 'grad_norm': '26.5', 'learning_rate': '3.457e-07', 'rewards/chosen': '0.08378', 'rewards/rejected': '-0.1438', 'rewards/accuracies': '0.62', 'rewards/margins': '0.2276', 'logps/chosen': '-131.7', 'logps/rejected': '-109.7', 'logits/chosen': '-1.01', 'logits/rejected': '-1.022', 'epoch': '0.7549'}
|
| 199 |
+
{'loss': '0.586', 'grad_norm': '26.88', 'learning_rate': '3.331e-07', 'rewards/chosen': '0.09078', 'rewards/rejected': '-0.2869', 'rewards/accuracies': '0.655', 'rewards/margins': '0.3777', 'logps/chosen': '-138', 'logps/rejected': '-111.6', 'logits/chosen': '-1.026', 'logits/rejected': '-0.9772', 'epoch': '0.7597'}
|
| 200 |
+
{'loss': '0.5819', 'grad_norm': '31', 'learning_rate': '3.207e-07', 'rewards/chosen': '0.111', 'rewards/rejected': '-0.2876', 'rewards/accuracies': '0.72', 'rewards/margins': '0.3987', 'logps/chosen': '-132.5', 'logps/rejected': '-116.4', 'logits/chosen': '-1.013', 'logits/rejected': '-0.9545', 'epoch': '0.7645'}
|
| 201 |
+
{'loss': '0.5849', 'grad_norm': '31.88', 'learning_rate': '3.085e-07', 'rewards/chosen': '0.03506', 'rewards/rejected': '-0.3219', 'rewards/accuracies': '0.695', 'rewards/margins': '0.357', 'logps/chosen': '-131.8', 'logps/rejected': '-111.7', 'logits/chosen': '-1.046', 'logits/rejected': '-1.014', 'epoch': '0.7693'}
|
| 202 |
+
{'eval_loss': '0.6172', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.747', 'eval_rewards/chosen': '0.0699', 'eval_rewards/rejected': '-0.2252', 'eval_rewards/accuracies': '0.6639', 'eval_rewards/margins': '0.2951', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.009', 'eval_logits/rejected': '-1.01', 'epoch': '0.7693'}
|
| 203 |
+
{'loss': '0.5601', 'grad_norm': '22.88', 'learning_rate': '2.965e-07', 'rewards/chosen': '0.17', 'rewards/rejected': '-0.2553', 'rewards/accuracies': '0.74', 'rewards/margins': '0.4254', 'logps/chosen': '-134.6', 'logps/rejected': '-120', 'logits/chosen': '-1.073', 'logits/rejected': '-1.052', 'epoch': '0.7741'}
|
| 204 |
+
{'loss': '0.5903', 'grad_norm': '32.25', 'learning_rate': '2.846e-07', 'rewards/chosen': '0.117', 'rewards/rejected': '-0.2267', 'rewards/accuracies': '0.7', 'rewards/margins': '0.3436', 'logps/chosen': '-148.8', 'logps/rejected': '-115.7', 'logits/chosen': '-0.9712', 'logits/rejected': '-0.9916', 'epoch': '0.7789'}
|
| 205 |
+
{'loss': '0.632', 'grad_norm': '25.12', 'learning_rate': '2.73e-07', 'rewards/chosen': '0.04253', 'rewards/rejected': '-0.2194', 'rewards/accuracies': '0.62', 'rewards/margins': '0.262', 'logps/chosen': '-122.7', 'logps/rejected': '-124.5', 'logits/chosen': '-1.014', 'logits/rejected': '-1.028', 'epoch': '0.7837'}
|
| 206 |
+
{'loss': '0.6115', 'grad_norm': '30.62', 'learning_rate': '2.616e-07', 'rewards/chosen': '0.1155', 'rewards/rejected': '-0.1882', 'rewards/accuracies': '0.62', 'rewards/margins': '0.3036', 'logps/chosen': '-132.6', 'logps/rejected': '-112.1', 'logits/chosen': '-1.035', 'logits/rejected': '-0.99', 'epoch': '0.7886'}
|
| 207 |
+
{'loss': '0.5935', 'grad_norm': '27.5', 'learning_rate': '2.504e-07', 'rewards/chosen': '0.08252', 'rewards/rejected': '-0.2534', 'rewards/accuracies': '0.67', 'rewards/margins': '0.3359', 'logps/chosen': '-116', 'logps/rejected': '-114.6', 'logits/chosen': '-1.09', 'logits/rejected': '-1.055', 'epoch': '0.7934'}
|
| 208 |
+
{'loss': '0.6075', 'grad_norm': '25.12', 'learning_rate': '2.394e-07', 'rewards/chosen': '0.1463', 'rewards/rejected': '-0.1917', 'rewards/accuracies': '0.68', 'rewards/margins': '0.338', 'logps/chosen': '-131.6', 'logps/rejected': '-110.9', 'logits/chosen': '-0.9839', 'logits/rejected': '-0.9589', 'epoch': '0.7982'}
|
| 209 |
+
{'loss': '0.6178', 'grad_norm': '25.5', 'learning_rate': '2.286e-07', 'rewards/chosen': '0.08', 'rewards/rejected': '-0.2258', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3058', 'logps/chosen': '-136.2', 'logps/rejected': '-110.5', 'logits/chosen': '-0.9616', 'logits/rejected': '-1.004', 'epoch': '0.803'}
|
| 210 |
+
{'loss': '0.6047', 'grad_norm': '24.38', 'learning_rate': '2.18e-07', 'rewards/chosen': '0.07041', 'rewards/rejected': '-0.2607', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3311', 'logps/chosen': '-132.8', 'logps/rejected': '-109.6', 'logits/chosen': '-0.9952', 'logits/rejected': '-0.9531', 'epoch': '0.8078'}
|
| 211 |
+
{'loss': '0.5989', 'grad_norm': '23', 'learning_rate': '2.077e-07', 'rewards/chosen': '0.1002', 'rewards/rejected': '-0.2467', 'rewards/accuracies': '0.66', 'rewards/margins': '0.3469', 'logps/chosen': '-145.4', 'logps/rejected': '-114.1', 'logits/chosen': '-0.9995', 'logits/rejected': '-0.9752', 'epoch': '0.8126'}
|
| 212 |
+
{'loss': '0.5759', 'grad_norm': '27.62', 'learning_rate': '1.975e-07', 'rewards/chosen': '0.07599', 'rewards/rejected': '-0.3065', 'rewards/accuracies': '0.72', 'rewards/margins': '0.3825', 'logps/chosen': '-137.4', 'logps/rejected': '-123.1', 'logits/chosen': '-1.024', 'logits/rejected': '-0.9887', 'epoch': '0.8174'}
|
| 213 |
+
{'eval_loss': '0.6168', 'eval_runtime': '159.4', 'eval_samples_per_second': '13.74', 'eval_steps_per_second': '2.748', 'eval_rewards/chosen': '0.07051', 'eval_rewards/rejected': '-0.2252', 'eval_rewards/accuracies': '0.6653', 'eval_rewards/margins': '0.2957', 'eval_logps/chosen': '-133.3', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.8174'}
|
| 214 |
+
{'loss': '0.5734', 'grad_norm': '19.12', 'learning_rate': '1.876e-07', 'rewards/chosen': '0.1173', 'rewards/rejected': '-0.2696', 'rewards/accuracies': '0.72', 'rewards/margins': '0.3869', 'logps/chosen': '-127.1', 'logps/rejected': '-116.2', 'logits/chosen': '-0.968', 'logits/rejected': '-0.9611', 'epoch': '0.8222'}
|
| 215 |
+
{'loss': '0.6326', 'grad_norm': '24.5', 'learning_rate': '1.78e-07', 'rewards/chosen': '0.1114', 'rewards/rejected': '-0.1299', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2413', 'logps/chosen': '-148', 'logps/rejected': '-114.8', 'logits/chosen': '-0.9754', 'logits/rejected': '-0.9537', 'epoch': '0.827'}
|
| 216 |
+
{'loss': '0.6235', 'grad_norm': '24.5', 'learning_rate': '1.685e-07', 'rewards/chosen': '0.08442', 'rewards/rejected': '-0.181', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2654', 'logps/chosen': '-125.4', 'logps/rejected': '-112.2', 'logits/chosen': '-0.965', 'logits/rejected': '-1', 'epoch': '0.8318'}
|
| 217 |
+
{'loss': '0.6202', 'grad_norm': '27', 'learning_rate': '1.593e-07', 'rewards/chosen': '0.07857', 'rewards/rejected': '-0.2079', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2864', 'logps/chosen': '-124.8', 'logps/rejected': '-104.4', 'logits/chosen': '-1.067', 'logits/rejected': '-0.9671', 'epoch': '0.8366'}
|
| 218 |
+
{'loss': '0.6125', 'grad_norm': '31', 'learning_rate': '1.504e-07', 'rewards/chosen': '0.1301', 'rewards/rejected': '-0.1922', 'rewards/accuracies': '0.665', 'rewards/margins': '0.3224', 'logps/chosen': '-123.3', 'logps/rejected': '-123.1', 'logits/chosen': '-1.041', 'logits/rejected': '-0.9867', 'epoch': '0.8414'}
|
| 219 |
+
{'loss': '0.631', 'grad_norm': '25.5', 'learning_rate': '1.416e-07', 'rewards/chosen': '0.06422', 'rewards/rejected': '-0.2029', 'rewards/accuracies': '0.68', 'rewards/margins': '0.2671', 'logps/chosen': '-152.8', 'logps/rejected': '-103.1', 'logits/chosen': '-0.9218', 'logits/rejected': '-1.025', 'epoch': '0.8463'}
|
| 220 |
+
{'loss': '0.598', 'grad_norm': '36.25', 'learning_rate': '1.331e-07', 'rewards/chosen': '0.04709', 'rewards/rejected': '-0.2808', 'rewards/accuracies': '0.645', 'rewards/margins': '0.3279', 'logps/chosen': '-136.1', 'logps/rejected': '-99.2', 'logits/chosen': '-1.01', 'logits/rejected': '-0.9484', 'epoch': '0.8511'}
|
| 221 |
+
{'loss': '0.6003', 'grad_norm': '27.75', 'learning_rate': '1.249e-07', 'rewards/chosen': '0.06051', 'rewards/rejected': '-0.2485', 'rewards/accuracies': '0.715', 'rewards/margins': '0.309', 'logps/chosen': '-128.1', 'logps/rejected': '-120.9', 'logits/chosen': '-1.018', 'logits/rejected': '-0.9545', 'epoch': '0.8559'}
|
| 222 |
+
{'loss': '0.591', 'grad_norm': '31.25', 'learning_rate': '1.169e-07', 'rewards/chosen': '0.1146', 'rewards/rejected': '-0.2206', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3352', 'logps/chosen': '-137.7', 'logps/rejected': '-116.4', 'logits/chosen': '-1.009', 'logits/rejected': '-0.9895', 'epoch': '0.8607'}
|
| 223 |
+
{'loss': '0.5831', 'grad_norm': '27.75', 'learning_rate': '1.091e-07', 'rewards/chosen': '0.1675', 'rewards/rejected': '-0.2213', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3888', 'logps/chosen': '-143', 'logps/rejected': '-115', 'logits/chosen': '-0.9924', 'logits/rejected': '-0.9801', 'epoch': '0.8655'}
|
| 224 |
+
{'eval_loss': '0.6168', 'eval_runtime': '159.9', 'eval_samples_per_second': '13.69', 'eval_steps_per_second': '2.738', 'eval_rewards/chosen': '0.06656', 'eval_rewards/rejected': '-0.2297', 'eval_rewards/accuracies': '0.6662', 'eval_rewards/margins': '0.2962', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.8655'}
|
| 225 |
+
{'loss': '0.5867', 'grad_norm': '24.88', 'learning_rate': '1.016e-07', 'rewards/chosen': '0.1309', 'rewards/rejected': '-0.221', 'rewards/accuracies': '0.7', 'rewards/margins': '0.352', 'logps/chosen': '-140.8', 'logps/rejected': '-116.5', 'logits/chosen': '-0.984', 'logits/rejected': '-1.022', 'epoch': '0.8703'}
|
| 226 |
+
{'loss': '0.606', 'grad_norm': '29.25', 'learning_rate': '9.44e-08', 'rewards/chosen': '0.09663', 'rewards/rejected': '-0.2094', 'rewards/accuracies': '0.695', 'rewards/margins': '0.3061', 'logps/chosen': '-133.8', 'logps/rejected': '-116.2', 'logits/chosen': '-1.058', 'logits/rejected': '-0.9796', 'epoch': '0.8751'}
|
| 227 |
+
{'loss': '0.6057', 'grad_norm': '21.75', 'learning_rate': '8.741e-08', 'rewards/chosen': '0.09259', 'rewards/rejected': '-0.2016', 'rewards/accuracies': '0.66', 'rewards/margins': '0.2942', 'logps/chosen': '-135', 'logps/rejected': '-109.1', 'logits/chosen': '-0.9949', 'logits/rejected': '-0.9764', 'epoch': '0.8799'}
|
| 228 |
+
{'loss': '0.5924', 'grad_norm': '25.38', 'learning_rate': '8.068e-08', 'rewards/chosen': '0.14', 'rewards/rejected': '-0.2309', 'rewards/accuracies': '0.71', 'rewards/margins': '0.3709', 'logps/chosen': '-141.4', 'logps/rejected': '-111.2', 'logits/chosen': '-1.006', 'logits/rejected': '-0.9707', 'epoch': '0.8847'}
|
| 229 |
+
{'loss': '0.6111', 'grad_norm': '24.12', 'learning_rate': '7.421e-08', 'rewards/chosen': '0.05592', 'rewards/rejected': '-0.235', 'rewards/accuracies': '0.695', 'rewards/margins': '0.2909', 'logps/chosen': '-135.2', 'logps/rejected': '-115', 'logits/chosen': '-1.069', 'logits/rejected': '-1.037', 'epoch': '0.8895'}
|
| 230 |
+
{'loss': '0.6033', 'grad_norm': '24.62', 'learning_rate': '6.799e-08', 'rewards/chosen': '0.0985', 'rewards/rejected': '-0.2023', 'rewards/accuracies': '0.68', 'rewards/margins': '0.3008', 'logps/chosen': '-137.5', 'logps/rejected': '-123.3', 'logits/chosen': '-1.06', 'logits/rejected': '-0.9708', 'epoch': '0.8943'}
|
| 231 |
+
{'loss': '0.5975', 'grad_norm': '24.62', 'learning_rate': '6.204e-08', 'rewards/chosen': '0.1767', 'rewards/rejected': '-0.1992', 'rewards/accuracies': '0.705', 'rewards/margins': '0.376', 'logps/chosen': '-142.1', 'logps/rejected': '-131.8', 'logits/chosen': '-1.021', 'logits/rejected': '-0.9542', 'epoch': '0.8991'}
|
| 232 |
+
{'loss': '0.5923', 'grad_norm': '31.12', 'learning_rate': '5.635e-08', 'rewards/chosen': '0.1364', 'rewards/rejected': '-0.2067', 'rewards/accuracies': '0.71', 'rewards/margins': '0.343', 'logps/chosen': '-136.7', 'logps/rejected': '-111.7', 'logits/chosen': '-1.127', 'logits/rejected': '-1.063', 'epoch': '0.904'}
|
| 233 |
+
{'loss': '0.6425', 'grad_norm': '33.25', 'learning_rate': '5.093e-08', 'rewards/chosen': '0.06713', 'rewards/rejected': '-0.1448', 'rewards/accuracies': '0.6', 'rewards/margins': '0.212', 'logps/chosen': '-125.2', 'logps/rejected': '-119.6', 'logits/chosen': '-1.044', 'logits/rejected': '-0.971', 'epoch': '0.9088'}
|
| 234 |
+
{'loss': '0.6038', 'grad_norm': '31.25', 'learning_rate': '4.578e-08', 'rewards/chosen': '0.144', 'rewards/rejected': '-0.2002', 'rewards/accuracies': '0.64', 'rewards/margins': '0.3442', 'logps/chosen': '-150.9', 'logps/rejected': '-127.8', 'logits/chosen': '-0.9139', 'logits/rejected': '-0.9313', 'epoch': '0.9136'}
|
| 235 |
+
{'eval_loss': '0.6162', 'eval_runtime': '159.6', 'eval_samples_per_second': '13.72', 'eval_steps_per_second': '2.745', 'eval_rewards/chosen': '0.06609', 'eval_rewards/rejected': '-0.231', 'eval_rewards/accuracies': '0.6644', 'eval_rewards/margins': '0.2971', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.011', 'epoch': '0.9136'}
|
| 236 |
+
{'loss': '0.565', 'grad_norm': '26.62', 'learning_rate': '4.089e-08', 'rewards/chosen': '0.1648', 'rewards/rejected': '-0.2517', 'rewards/accuracies': '0.695', 'rewards/margins': '0.4165', 'logps/chosen': '-140.4', 'logps/rejected': '-116.5', 'logits/chosen': '-0.9448', 'logits/rejected': '-0.9807', 'epoch': '0.9184'}
|
| 237 |
+
{'loss': '0.6396', 'grad_norm': '22.88', 'learning_rate': '3.628e-08', 'rewards/chosen': '0.0286', 'rewards/rejected': '-0.2316', 'rewards/accuracies': '0.625', 'rewards/margins': '0.2602', 'logps/chosen': '-130.4', 'logps/rejected': '-118.4', 'logits/chosen': '-0.9926', 'logits/rejected': '-0.9664', 'epoch': '0.9232'}
|
| 238 |
+
{'loss': '0.6648', 'grad_norm': '25.25', 'learning_rate': '3.194e-08', 'rewards/chosen': '0.007605', 'rewards/rejected': '-0.1721', 'rewards/accuracies': '0.66', 'rewards/margins': '0.1797', 'logps/chosen': '-129.3', 'logps/rejected': '-115.7', 'logits/chosen': '-0.9287', 'logits/rejected': '-1.047', 'epoch': '0.928'}
|
| 239 |
+
{'loss': '0.6219', 'grad_norm': '22.38', 'learning_rate': '2.787e-08', 'rewards/chosen': '0.01246', 'rewards/rejected': '-0.264', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2764', 'logps/chosen': '-131.1', 'logps/rejected': '-126.7', 'logits/chosen': '-0.9881', 'logits/rejected': '-0.9735', 'epoch': '0.9328'}
|
| 240 |
+
{'loss': '0.5815', 'grad_norm': '26.25', 'learning_rate': '2.407e-08', 'rewards/chosen': '0.09192', 'rewards/rejected': '-0.2578', 'rewards/accuracies': '0.735', 'rewards/margins': '0.3497', 'logps/chosen': '-135.7', 'logps/rejected': '-104.4', 'logits/chosen': '-0.9655', 'logits/rejected': '-1.012', 'epoch': '0.9376'}
|
| 241 |
+
{'loss': '0.6224', 'grad_norm': '40', 'learning_rate': '2.055e-08', 'rewards/chosen': '0.06027', 'rewards/rejected': '-0.205', 'rewards/accuracies': '0.655', 'rewards/margins': '0.2653', 'logps/chosen': '-142', 'logps/rejected': '-110.6', 'logits/chosen': '-1.003', 'logits/rejected': '-1.027', 'epoch': '0.9424'}
|
| 242 |
+
{'loss': '0.6462', 'grad_norm': '31.38', 'learning_rate': '1.73e-08', 'rewards/chosen': '0.02588', 'rewards/rejected': '-0.191', 'rewards/accuracies': '0.635', 'rewards/margins': '0.2169', 'logps/chosen': '-130.3', 'logps/rejected': '-104.3', 'logits/chosen': '-0.9781', 'logits/rejected': '-0.9542', 'epoch': '0.9472'}
|
| 243 |
+
{'loss': '0.5845', 'grad_norm': '26.75', 'learning_rate': '1.433e-08', 'rewards/chosen': '0.117', 'rewards/rejected': '-0.2956', 'rewards/accuracies': '0.705', 'rewards/margins': '0.4126', 'logps/chosen': '-135.4', 'logps/rejected': '-108.1', 'logits/chosen': '-0.9659', 'logits/rejected': '-0.9748', 'epoch': '0.952'}
|
| 244 |
+
{'loss': '0.6054', 'grad_norm': '21.12', 'learning_rate': '1.164e-08', 'rewards/chosen': '0.09082', 'rewards/rejected': '-0.2336', 'rewards/accuracies': '0.675', 'rewards/margins': '0.3244', 'logps/chosen': '-131.2', 'logps/rejected': '-126.3', 'logits/chosen': '-0.9997', 'logits/rejected': '-0.9632', 'epoch': '0.9568'}
|
| 245 |
+
{'loss': '0.6403', 'grad_norm': '27.12', 'learning_rate': '9.225e-09', 'rewards/chosen': '0.08749', 'rewards/rejected': '-0.1388', 'rewards/accuracies': '0.65', 'rewards/margins': '0.2263', 'logps/chosen': '-123.6', 'logps/rejected': '-111.1', 'logits/chosen': '-1.051', 'logits/rejected': '-1.017', 'epoch': '0.9617'}
|
| 246 |
+
{'eval_loss': '0.617', 'eval_runtime': '159.7', 'eval_samples_per_second': '13.72', 'eval_steps_per_second': '2.743', 'eval_rewards/chosen': '0.06905', 'eval_rewards/rejected': '-0.2269', 'eval_rewards/accuracies': '0.6648', 'eval_rewards/margins': '0.296', 'eval_logps/chosen': '-133.4', 'eval_logps/rejected': '-113.8', 'eval_logits/chosen': '-1.01', 'eval_logits/rejected': '-1.01', 'epoch': '0.9617'}
|
| 247 |
+
{'loss': '0.5908', 'grad_norm': '20.88', 'learning_rate': '7.09e-09', 'rewards/chosen': '0.1685', 'rewards/rejected': '-0.2152', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3837', 'logps/chosen': '-124.9', 'logps/rejected': '-116.3', 'logits/chosen': '-1.08', 'logits/rejected': '-1.014', 'epoch': '0.9665'}
|
| 248 |
+
{'loss': '0.6394', 'grad_norm': '25', 'learning_rate': '5.235e-09', 'rewards/chosen': '0.05396', 'rewards/rejected': '-0.1692', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2231', 'logps/chosen': '-127.7', 'logps/rejected': '-110.3', 'logits/chosen': '-1.066', 'logits/rejected': '-1.035', 'epoch': '0.9713'}
|
| 249 |
+
{'loss': '0.5833', 'grad_norm': '25.5', 'learning_rate': '3.66e-09', 'rewards/chosen': '0.1592', 'rewards/rejected': '-0.2082', 'rewards/accuracies': '0.715', 'rewards/margins': '0.3674', 'logps/chosen': '-135.9', 'logps/rejected': '-114', 'logits/chosen': '-0.9349', 'logits/rejected': '-1.003', 'epoch': '0.9761'}
|
| 250 |
+
{'loss': '0.5958', 'grad_norm': '24.75', 'learning_rate': '2.366e-09', 'rewards/chosen': '0.1077', 'rewards/rejected': '-0.2435', 'rewards/accuracies': '0.68', 'rewards/margins': '0.3513', 'logps/chosen': '-135.1', 'logps/rejected': '-103.6', 'logits/chosen': '-0.9955', 'logits/rejected': '-1.007', 'epoch': '0.9809'}
|
| 251 |
+
{'loss': '0.6165', 'grad_norm': '37', 'learning_rate': '1.353e-09', 'rewards/chosen': '0.09438', 'rewards/rejected': '-0.2076', 'rewards/accuracies': '0.65', 'rewards/margins': '0.302', 'logps/chosen': '-146.4', 'logps/rejected': '-109.5', 'logits/chosen': '-1.048', 'logits/rejected': '-1.034', 'epoch': '0.9857'}
|
| 252 |
+
{'loss': '0.6304', 'grad_norm': '24.12', 'learning_rate': '6.209e-10', 'rewards/chosen': '0.007376', 'rewards/rejected': '-0.2453', 'rewards/accuracies': '0.645', 'rewards/margins': '0.2526', 'logps/chosen': '-129.7', 'logps/rejected': '-115.5', 'logits/chosen': '-1.008', 'logits/rejected': '-1.016', 'epoch': '0.9905'}
|
| 253 |
+
{'loss': '0.5655', 'grad_norm': '31.25', 'learning_rate': '1.704e-10', 'rewards/chosen': '0.1247', 'rewards/rejected': '-0.3073', 'rewards/accuracies': '0.755', 'rewards/margins': '0.432', 'logps/chosen': '-137.2', 'logps/rejected': '-115.6', 'logits/chosen': '-1.054', 'logits/rejected': '-1.011', 'epoch': '0.9953'}
|
| 254 |
+
{'loss': '0.61', 'grad_norm': '24.75', 'learning_rate': '1.408e-12', 'rewards/chosen': '0.0525', 'rewards/rejected': '-0.2418', 'rewards/accuracies': '0.6667', 'rewards/margins': '0.2943', 'logps/chosen': '-122.4', 'logps/rejected': '-103.4', 'logits/chosen': '-1.078', 'logits/rejected': '-1.001', 'epoch': '1'}
|
| 255 |
+
{'train_runtime': '1.07e+04', 'train_samples_per_second': '3.889', 'train_steps_per_second': '0.194', 'train_loss': '0.6242', 'epoch': '1'}
|
| 256 |
+
[dpo_train] Final model saved → models/dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653/final_model
|
| 257 |
+
[dpo_train] Run metadata → models/dpo_fft_LFM2.5-1.2B-Instruct_Anthropic__hh-rlhf_20260223_210653/run_meta.json
|
| 258 |
+
|
| 259 |
+
[dpo_train] Done.
|