diff --git a/.gitattributes b/.gitattributes index 0fdbcaeade47bcd566445166fb4713f7bbd4d2b3..1c2b0d973caa4147ece5be07d3759ddffe6cd46e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -78,3 +78,47 @@ v4/DPO/DPO_5k/MDPO_5k/tokenizer.json filter=lfs diff=lfs merge=lfs -text v4/DPO/DPO_5k/lora/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text v4/DPO/DPO_5k/lora/checkpoint-1250/tokenizer.json filter=lfs diff=lfs merge=lfs -text v4/DPO/DPO_5k/lora/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_10k/MDPO_10k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_10k/lora/checkpoint-1300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_10k/lora/checkpoint-2400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_10k/lora/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_1k/MDPO_1k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_1k/lora/checkpoint-240/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_1k/lora/checkpoint-250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_1k/lora/checkpoint-60/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_5k/MDPO_5k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_5k/lora/checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_5k/lora/checkpoint-1250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/DPO/DPO_5k/lora/checkpoint-300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_10k/MKTO_10k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_10k/lora/checkpoint-2300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_10k/lora/checkpoint-2400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_10k/lora/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_1k/MKTO_1k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_1k/lora/checkpoint-140/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_1k/lora/checkpoint-240/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_1k/lora/checkpoint-250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_20k/MKTO_20k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_20k/lora/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_20k/lora/checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_20k/lora/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_2k/MKTO_2k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_2k/lora/checkpoint-180/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_2k/lora/checkpoint-480/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_2k/lora/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_5k/MKTO_5k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_5k/lora/checkpoint-1150/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_5k/lora/checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/KTO/KTO_5k/lora/checkpoint-1250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_10k/MORPO_10k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_10k/lora/checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_10k/lora/checkpoint-2400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_10k/lora/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_1k/MORPO_1k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_1k/lora/checkpoint-240/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_1k/lora/checkpoint-250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_1k/lora/checkpoint-30/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_5k/MORPO_5k/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_5k/lora/checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_5k/lora/checkpoint-1250/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v5/ORPO/ORPO_5k/lora/checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/v5/DPO/DPO_10k/DPO_10k/README.md b/v5/DPO/DPO_10k/DPO_10k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_10k/DPO_10k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_10k/DPO_10k/adapter_config.json b/v5/DPO/DPO_10k/DPO_10k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8436359b1aa944f94290f60b93e89d8644f8843e --- /dev/null +++ b/v5/DPO/DPO_10k/DPO_10k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "down_proj", + "k_proj", + "gate_proj", + "up_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_10k/DPO_10k/adapter_model.safetensors b/v5/DPO/DPO_10k/DPO_10k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4dd8744027b8b8460ccc5781d116916d221d3faf --- /dev/null +++ b/v5/DPO/DPO_10k/DPO_10k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ecfca13898e10dceef60271a2e863d7cc7700ce1555e77092c7927008052ac +size 180385008 diff --git a/v5/DPO/DPO_10k/MDPO_10k/chat_template.jinja b/v5/DPO/DPO_10k/MDPO_10k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_10k/MDPO_10k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_10k/MDPO_10k/config.json b/v5/DPO/DPO_10k/MDPO_10k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..269c2ffa2c365f594cb5e44218192c94b419a0cb --- /dev/null +++ b/v5/DPO/DPO_10k/MDPO_10k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/DPO/DPO_10k/MDPO_10k/generation_config.json b/v5/DPO/DPO_10k/MDPO_10k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c9c2224cd391437f7236b3f36305dd39a63ab0a --- /dev/null +++ b/v5/DPO/DPO_10k/MDPO_10k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.0.0" +} diff --git a/v5/DPO/DPO_10k/MDPO_10k/model.safetensors b/v5/DPO/DPO_10k/MDPO_10k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52c883bcb61063b4ae63b2e0b8b47471610bfbf0 --- /dev/null +++ b/v5/DPO/DPO_10k/MDPO_10k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c840c13f70893c0b8d55e3acd6553093375b17eaa47b583753de8b121643123 +size 2471645464 diff --git a/v5/DPO/DPO_10k/MDPO_10k/tokenizer.json b/v5/DPO/DPO_10k/MDPO_10k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_10k/MDPO_10k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_10k/MDPO_10k/tokenizer_config.json b/v5/DPO/DPO_10k/MDPO_10k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_10k/MDPO_10k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_10k/lora/README.md b/v5/DPO/DPO_10k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a688c843ff4c4eef54768b9136aba56ab0849246 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/README.md @@ -0,0 +1,69 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- dpo +- trl +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/5573bwv9) + + +This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290). + +### Framework versions + +- TRL: 0.27.2 +- Transformers: 5.0.0 +- Pytorch: 2.8.0+cu128 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite DPO as: + +```bibtex +@inproceedings{rafailov2023direct, + title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}}, + author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn}, + year = 2023, + booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023}, + url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html}, + editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine}, +} +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/README.md b/v5/DPO/DPO_10k/lora/checkpoint-1300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/adapter_config.json b/v5/DPO/DPO_10k/lora/checkpoint-1300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8436359b1aa944f94290f60b93e89d8644f8843e --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "down_proj", + "k_proj", + "gate_proj", + "up_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/adapter_model.safetensors b/v5/DPO/DPO_10k/lora/checkpoint-1300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4dd8744027b8b8460ccc5781d116916d221d3faf --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ecfca13898e10dceef60271a2e863d7cc7700ce1555e77092c7927008052ac +size 180385008 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/chat_template.jinja b/v5/DPO/DPO_10k/lora/checkpoint-1300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/optimizer.pt b/v5/DPO/DPO_10k/lora/checkpoint-1300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..543e2d22ab564cd543f9c61891f72165ae1c61a9 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71b5e638f59b9595d8dca87dcaa601984e35f6d6732e3de54770d1173653d04e +size 360902475 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/rng_state.pth b/v5/DPO/DPO_10k/lora/checkpoint-1300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..db7cae7ca4b94d2104169aa731ae8748bfa04a8f --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d744506ed8242dbe82c0f3357716f73248e5153ff68604326958faa28d9296 +size 14645 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/scaler.pt b/v5/DPO/DPO_10k/lora/checkpoint-1300/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..93250a8b253c0e13b4760735d31ba9ac2df264fa --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e6dc9658957f5cc4ceed4243b2de1cacb7762930be64bb5a8e59e057d65e5e2 +size 1383 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/scheduler.pt b/v5/DPO/DPO_10k/lora/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..607c8750f2da162dbeb432f2b787bf1d0f3b6c7e --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:619902af4750c8c03b81f910de02f255dd9ba762e9ef95909c545df05bfb4a75 +size 1465 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/tokenizer.json b/v5/DPO/DPO_10k/lora/checkpoint-1300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/tokenizer_config.json b/v5/DPO/DPO_10k/lora/checkpoint-1300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/trainer_state.json b/v5/DPO/DPO_10k/lora/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..37504b73d3b708eb485598fb4a13e782b5cfc5a8 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/trainer_state.json @@ -0,0 +1,2192 @@ +{ + "best_global_step": 1300, + "best_metric": 0.5460000038146973, + "best_model_checkpoint": "output/lora/checkpoint-1300", + "epoch": 1.04, + "eval_steps": 100, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 4.452983379364014, + "learning_rate": 3.6e-08, + "logits/chosen": 1.5510008335113525, + "logits/rejected": 1.5244438648223877, + "logps/chosen": -131.24708557128906, + "logps/rejected": -146.8297576904297, + "loss": 0.6932957172393799, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.00019072293071076274, + "rewards/margins": -0.00029331922996789217, + "rewards/rejected": 0.0001025962847052142, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 5.203515529632568, + "learning_rate": 7.599999999999999e-08, + "logits/chosen": 1.6611576080322266, + "logits/rejected": 1.6220839023590088, + "logps/chosen": -156.2080078125, + "logps/rejected": -142.72964477539062, + "loss": 0.6937986850738526, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.0006145072402432561, + "rewards/margins": -0.0012890815269201994, + "rewards/rejected": 0.000674574519507587, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 4.087289810180664, + "learning_rate": 1.16e-07, + "logits/chosen": 1.9773778915405273, + "logits/rejected": 1.8304665088653564, + "logps/chosen": -163.54708862304688, + "logps/rejected": -157.88926696777344, + "loss": 0.6931437492370606, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0003584886435419321, + "rewards/margins": 2.1700874640373513e-05, + "rewards/rejected": 0.00033678775071166456, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 4.74172830581665, + "learning_rate": 1.56e-07, + "logits/chosen": 1.5896285772323608, + "logits/rejected": 1.7109922170639038, + "logps/chosen": -144.44276428222656, + "logps/rejected": -133.09629821777344, + "loss": 0.6932558059692383, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00048673158744350076, + "rewards/margins": -0.00020531899644993246, + "rewards/rejected": 0.0006920504383742809, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 4.32133150100708, + "learning_rate": 1.96e-07, + "logits/chosen": 1.5152148008346558, + "logits/rejected": 1.585367202758789, + "logps/chosen": -131.73226928710938, + "logps/rejected": -136.8301239013672, + "loss": 0.6930522918701172, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0008355070021934807, + "rewards/margins": 0.00019948731642216444, + "rewards/rejected": 0.0006360196857713163, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 3.915316343307495, + "learning_rate": 2.3599999999999997e-07, + "logits/chosen": 1.5138778686523438, + "logits/rejected": 1.4824903011322021, + "logps/chosen": -136.43399047851562, + "logps/rejected": -126.70623779296875, + "loss": 0.6929163455963134, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.000302538916002959, + "rewards/margins": 0.00047383070341311395, + "rewards/rejected": -0.00017129186016973108, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 4.329769134521484, + "learning_rate": 2.7600000000000004e-07, + "logits/chosen": 1.6920913457870483, + "logits/rejected": 1.8169019222259521, + "logps/chosen": -152.056640625, + "logps/rejected": -155.9404296875, + "loss": 0.6935575008392334, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0025545789394527674, + "rewards/margins": -0.0008073424687609076, + "rewards/rejected": 0.003361921291798353, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 3.964193344116211, + "learning_rate": 3.1599999999999997e-07, + "logits/chosen": 1.5645431280136108, + "logits/rejected": 1.5879082679748535, + "logps/chosen": -147.78839111328125, + "logps/rejected": -135.19906616210938, + "loss": 0.6925086498260498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006093275733292103, + "rewards/margins": 0.0012893510283902287, + "rewards/rejected": 0.004803924821317196, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 3.935694694519043, + "learning_rate": 3.5599999999999996e-07, + "logits/chosen": 1.5960246324539185, + "logits/rejected": 1.6901094913482666, + "logps/chosen": -157.85256958007812, + "logps/rejected": -150.51974487304688, + "loss": 0.6931850433349609, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.010219026356935501, + "rewards/margins": -5.4271204135147855e-05, + "rewards/rejected": 0.010273297317326069, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 5.13019323348999, + "learning_rate": 3.96e-07, + "logits/chosen": 1.5446935892105103, + "logits/rejected": 1.6452451944351196, + "logps/chosen": -149.88038635253906, + "logps/rejected": -169.9078826904297, + "loss": 0.6935123443603516, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011777262203395367, + "rewards/margins": -0.0007071519503369927, + "rewards/rejected": 0.0124844154343009, + "step": 100 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.5421667098999023, + "eval_logits/rejected": 1.5734084844589233, + "eval_logps/chosen": -153.21649169921875, + "eval_logps/rejected": -147.7952117919922, + "eval_loss": 0.6929068565368652, + "eval_rewards/accuracies": 0.5180000066757202, + "eval_rewards/chosen": 0.011265883222222328, + "eval_rewards/margins": 0.0005034058121964335, + "eval_rewards/rejected": 0.010762478224933147, + "eval_runtime": 90.2131, + "eval_samples_per_second": 5.542, + "eval_steps_per_second": 2.771, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 5.695896625518799, + "learning_rate": 4.36e-07, + "logits/chosen": 1.781393051147461, + "logits/rejected": 1.7461833953857422, + "logps/chosen": -172.24188232421875, + "logps/rejected": -154.40878295898438, + "loss": 0.6922736167907715, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.014022141695022583, + "rewards/margins": 0.001777560799382627, + "rewards/rejected": 0.012244580313563347, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 4.398581027984619, + "learning_rate": 4.76e-07, + "logits/chosen": 1.611268401145935, + "logits/rejected": 1.6106624603271484, + "logps/chosen": -135.13426208496094, + "logps/rejected": -139.7284393310547, + "loss": 0.6927696228027344, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.014869053848087788, + "rewards/margins": 0.0007819391903467476, + "rewards/rejected": 0.014087115414440632, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 5.19202995300293, + "learning_rate": 5.16e-07, + "logits/chosen": 1.5615273714065552, + "logits/rejected": 1.7724416255950928, + "logps/chosen": -157.66746520996094, + "logps/rejected": -161.90391540527344, + "loss": 0.6928309917449951, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.013302234932780266, + "rewards/margins": 0.0006531739491038024, + "rewards/rejected": 0.012649061158299446, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 3.4575726985931396, + "learning_rate": 5.560000000000001e-07, + "logits/chosen": 1.492018699645996, + "logits/rejected": 1.5187314748764038, + "logps/chosen": -131.4152374267578, + "logps/rejected": -125.62705993652344, + "loss": 0.6929276943206787, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.014355423860251904, + "rewards/margins": 0.000462935131508857, + "rewards/rejected": 0.013892488554120064, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 4.500187397003174, + "learning_rate": 5.96e-07, + "logits/chosen": 1.5862048864364624, + "logits/rejected": 1.6784181594848633, + "logps/chosen": -163.6667938232422, + "logps/rejected": -157.76402282714844, + "loss": 0.6910766124725342, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.018648523837327957, + "rewards/margins": 0.004183619283139706, + "rewards/rejected": 0.014464902691543102, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 4.113079071044922, + "learning_rate": 6.36e-07, + "logits/chosen": 1.7717370986938477, + "logits/rejected": 1.8070589303970337, + "logps/chosen": -158.02734375, + "logps/rejected": -145.92495727539062, + "loss": 0.6927172183990479, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.02493301033973694, + "rewards/margins": 0.0009325124556198716, + "rewards/rejected": 0.024000495672225952, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 5.4983696937561035, + "learning_rate": 6.76e-07, + "logits/chosen": 1.5733931064605713, + "logits/rejected": 1.6008774042129517, + "logps/chosen": -147.1856689453125, + "logps/rejected": -159.93077087402344, + "loss": 0.6926907062530517, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.02877199277281761, + "rewards/margins": 0.0010106085101142526, + "rewards/rejected": 0.027761384844779968, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 4.50191068649292, + "learning_rate": 7.159999999999999e-07, + "logits/chosen": 1.5886398553848267, + "logits/rejected": 1.7399513721466064, + "logps/chosen": -157.6659393310547, + "logps/rejected": -160.65431213378906, + "loss": 0.6925735473632812, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.024526067078113556, + "rewards/margins": 0.0012190367560833693, + "rewards/rejected": 0.02330702915787697, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 4.708652019500732, + "learning_rate": 7.559999999999999e-07, + "logits/chosen": 1.6504443883895874, + "logits/rejected": 1.7761001586914062, + "logps/chosen": -141.34536743164062, + "logps/rejected": -143.41159057617188, + "loss": 0.6928653240203857, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.026504456996917725, + "rewards/margins": 0.0006459927535615861, + "rewards/rejected": 0.02585846558213234, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 4.143187046051025, + "learning_rate": 7.96e-07, + "logits/chosen": 1.7108211517333984, + "logits/rejected": 1.6271438598632812, + "logps/chosen": -158.04931640625, + "logps/rejected": -132.23463439941406, + "loss": 0.6927096843719482, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.02748889848589897, + "rewards/margins": 0.0009879134595394135, + "rewards/rejected": 0.026500985026359558, + "step": 200 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.54364013671875, + "eval_logits/rejected": 1.5745173692703247, + "eval_logps/chosen": -153.04653930664062, + "eval_logps/rejected": -147.63844299316406, + "eval_loss": 0.6923084855079651, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.028259562328457832, + "eval_rewards/margins": 0.0018185621593147516, + "eval_rewards/rejected": 0.026441000401973724, + "eval_runtime": 90.4481, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 2.764, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 4.559652328491211, + "learning_rate": 8.359999999999999e-07, + "logits/chosen": 1.7295278310775757, + "logits/rejected": 1.6801897287368774, + "logps/chosen": -158.0893096923828, + "logps/rejected": -168.72427368164062, + "loss": 0.6922987461090088, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02837887406349182, + "rewards/margins": 0.0018027331680059433, + "rewards/rejected": 0.026576142758131027, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 3.907545328140259, + "learning_rate": 8.76e-07, + "logits/chosen": 1.6849712133407593, + "logits/rejected": 1.7441444396972656, + "logps/chosen": -158.67384338378906, + "logps/rejected": -143.02920532226562, + "loss": 0.6933117389678956, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03286944702267647, + "rewards/margins": -0.0002176974667236209, + "rewards/rejected": 0.03308714181184769, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 3.5083253383636475, + "learning_rate": 9.16e-07, + "logits/chosen": 1.5701725482940674, + "logits/rejected": 1.7182495594024658, + "logps/chosen": -160.56790161132812, + "logps/rejected": -138.05374145507812, + "loss": 0.6915814399719238, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.03840702772140503, + "rewards/margins": 0.0033035180531442165, + "rewards/rejected": 0.035103507339954376, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 4.424270153045654, + "learning_rate": 9.559999999999998e-07, + "logits/chosen": 1.671190857887268, + "logits/rejected": 1.6964585781097412, + "logps/chosen": -170.28260803222656, + "logps/rejected": -144.33534240722656, + "loss": 0.6900368690490722, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0511443130671978, + "rewards/margins": 0.006511001847684383, + "rewards/rejected": 0.04463331401348114, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 4.5393967628479, + "learning_rate": 9.959999999999999e-07, + "logits/chosen": 1.6391950845718384, + "logits/rejected": 1.5815935134887695, + "logps/chosen": -160.45225524902344, + "logps/rejected": -147.56185913085938, + "loss": 0.6940414905548096, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.04959743469953537, + "rewards/margins": -0.001381749869324267, + "rewards/rejected": 0.05097918584942818, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 4.256033897399902, + "learning_rate": 9.959999999999999e-07, + "logits/chosen": 1.5204452276229858, + "logits/rejected": 1.6171140670776367, + "logps/chosen": -131.5397186279297, + "logps/rejected": -145.2186279296875, + "loss": 0.6930822372436524, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.045291412621736526, + "rewards/margins": 0.000405142258387059, + "rewards/rejected": 0.0448862686753273, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 4.027031421661377, + "learning_rate": 9.915555555555556e-07, + "logits/chosen": 1.6407123804092407, + "logits/rejected": 1.7262824773788452, + "logps/chosen": -145.78701782226562, + "logps/rejected": -146.34481811523438, + "loss": 0.6946187496185303, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.04104622080922127, + "rewards/margins": -0.0027089794166386127, + "rewards/rejected": 0.043755196034908295, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 5.568243026733398, + "learning_rate": 9.87111111111111e-07, + "logits/chosen": 1.6697533130645752, + "logits/rejected": 1.5154677629470825, + "logps/chosen": -171.0277099609375, + "logps/rejected": -154.05654907226562, + "loss": 0.6873753070831299, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04779375344514847, + "rewards/margins": 0.011815806850790977, + "rewards/rejected": 0.03597795218229294, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 4.041477680206299, + "learning_rate": 9.826666666666667e-07, + "logits/chosen": 1.6633354425430298, + "logits/rejected": 1.6905081272125244, + "logps/chosen": -142.88864135742188, + "logps/rejected": -152.04757690429688, + "loss": 0.6929869174957275, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.04824609309434891, + "rewards/margins": 0.0005596639821305871, + "rewards/rejected": 0.04768642783164978, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 4.9481635093688965, + "learning_rate": 9.782222222222222e-07, + "logits/chosen": 1.617485761642456, + "logits/rejected": 1.6837307214736938, + "logps/chosen": -152.088134765625, + "logps/rejected": -164.15158081054688, + "loss": 0.6897455215454101, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.057097338140010834, + "rewards/margins": 0.007080497685819864, + "rewards/rejected": 0.050016842782497406, + "step": 300 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.5520602464675903, + "eval_logits/rejected": 1.5826917886734009, + "eval_logps/chosen": -152.8227996826172, + "eval_logps/rejected": -147.43496704101562, + "eval_loss": 0.6914217472076416, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": 0.05063560605049133, + "eval_rewards/margins": 0.0038486982230097055, + "eval_rewards/rejected": 0.04678690433502197, + "eval_runtime": 90.237, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 2.77, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 4.486109256744385, + "learning_rate": 9.737777777777777e-07, + "logits/chosen": 1.7188745737075806, + "logits/rejected": 1.7590553760528564, + "logps/chosen": -140.6877899169922, + "logps/rejected": -155.31893920898438, + "loss": 0.6952545166015625, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": 0.046565137803554535, + "rewards/margins": -0.003889651270583272, + "rewards/rejected": 0.050454795360565186, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 4.374355316162109, + "learning_rate": 9.693333333333334e-07, + "logits/chosen": 1.7103513479232788, + "logits/rejected": 1.7379261255264282, + "logps/chosen": -137.8660888671875, + "logps/rejected": -140.40956115722656, + "loss": 0.692354393005371, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04029277712106705, + "rewards/margins": 0.0018124934285879135, + "rewards/rejected": 0.038480278104543686, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 4.618821144104004, + "learning_rate": 9.648888888888889e-07, + "logits/chosen": 1.5603777170181274, + "logits/rejected": 1.5868213176727295, + "logps/chosen": -157.3379669189453, + "logps/rejected": -182.77377319335938, + "loss": 0.6921377182006836, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03832743316888809, + "rewards/margins": 0.0022720033302903175, + "rewards/rejected": 0.03605542704463005, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 4.1974406242370605, + "learning_rate": 9.604444444444443e-07, + "logits/chosen": 1.8520517349243164, + "logits/rejected": 1.716774582862854, + "logps/chosen": -158.16363525390625, + "logps/rejected": -149.66162109375, + "loss": 0.692081069946289, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0396885983645916, + "rewards/margins": 0.0023556998930871487, + "rewards/rejected": 0.03733289986848831, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 4.700806140899658, + "learning_rate": 9.559999999999998e-07, + "logits/chosen": 1.6886537075042725, + "logits/rejected": 1.8079423904418945, + "logps/chosen": -165.5438232421875, + "logps/rejected": -194.98428344726562, + "loss": 0.6890993118286133, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05031610652804375, + "rewards/margins": 0.008422226645052433, + "rewards/rejected": 0.041893888264894485, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 3.7786731719970703, + "learning_rate": 9.515555555555555e-07, + "logits/chosen": 1.5483795404434204, + "logits/rejected": 1.4731425046920776, + "logps/chosen": -161.77774047851562, + "logps/rejected": -168.05458068847656, + "loss": 0.6911417007446289, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.050054989755153656, + "rewards/margins": 0.0043535237200558186, + "rewards/rejected": 0.045701466500759125, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 4.362200736999512, + "learning_rate": 9.471111111111111e-07, + "logits/chosen": 1.735099196434021, + "logits/rejected": 1.7567729949951172, + "logps/chosen": -161.30374145507812, + "logps/rejected": -153.4731903076172, + "loss": 0.6882652282714844, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06078929826617241, + "rewards/margins": 0.010189466178417206, + "rewards/rejected": 0.0505998320877552, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 4.35581111907959, + "learning_rate": 9.426666666666666e-07, + "logits/chosen": 1.604020118713379, + "logits/rejected": 1.524717926979065, + "logps/chosen": -141.42324829101562, + "logps/rejected": -151.82521057128906, + "loss": 0.6888412952423095, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.06947806477546692, + "rewards/margins": 0.009350637905299664, + "rewards/rejected": 0.06012742593884468, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 4.360926628112793, + "learning_rate": 9.382222222222222e-07, + "logits/chosen": 1.7075706720352173, + "logits/rejected": 1.6819493770599365, + "logps/chosen": -150.63375854492188, + "logps/rejected": -137.05673217773438, + "loss": 0.6836989879608154, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.08739721775054932, + "rewards/margins": 0.019574418663978577, + "rewards/rejected": 0.06782279908657074, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 4.909813404083252, + "learning_rate": 9.337777777777778e-07, + "logits/chosen": 1.6920162439346313, + "logits/rejected": 1.675100326538086, + "logps/chosen": -154.39663696289062, + "logps/rejected": -147.51455688476562, + "loss": 0.6892420768737793, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08908190578222275, + "rewards/margins": 0.008944114670157433, + "rewards/rejected": 0.08013778924942017, + "step": 400 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5610027313232422, + "eval_logits/rejected": 1.5909091234207153, + "eval_logps/chosen": -152.39871215820312, + "eval_logps/rejected": -147.04049682617188, + "eval_loss": 0.6903401017189026, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": 0.09304190427064896, + "eval_rewards/margins": 0.006809028796851635, + "eval_rewards/rejected": 0.08623287081718445, + "eval_runtime": 90.296, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 2.769, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 4.541158199310303, + "learning_rate": 9.293333333333333e-07, + "logits/chosen": 1.6515496969223022, + "logits/rejected": 1.548688530921936, + "logps/chosen": -147.21546936035156, + "logps/rejected": -187.50816345214844, + "loss": 0.6924624919891358, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.10388608276844025, + "rewards/margins": 0.002456969814375043, + "rewards/rejected": 0.10142910480499268, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 5.246954917907715, + "learning_rate": 9.248888888888888e-07, + "logits/chosen": 1.6460405588150024, + "logits/rejected": 1.6713184118270874, + "logps/chosen": -151.11341857910156, + "logps/rejected": -166.2979736328125, + "loss": 0.6986268043518067, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.09254096448421478, + "rewards/margins": -0.009933066554367542, + "rewards/rejected": 0.10247401893138885, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 4.068811416625977, + "learning_rate": 9.204444444444443e-07, + "logits/chosen": 1.6973702907562256, + "logits/rejected": 1.7605253458023071, + "logps/chosen": -162.7523651123047, + "logps/rejected": -150.79718017578125, + "loss": 0.6896752834320068, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.08682320266962051, + "rewards/margins": 0.007874277420341969, + "rewards/rejected": 0.07894892990589142, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 4.387909412384033, + "learning_rate": 9.16e-07, + "logits/chosen": 1.5456931591033936, + "logits/rejected": 1.4381892681121826, + "logps/chosen": -155.777099609375, + "logps/rejected": -144.95742797851562, + "loss": 0.6881390571594238, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08465877920389175, + "rewards/margins": 0.01117948442697525, + "rewards/rejected": 0.0734792947769165, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 4.3955864906311035, + "learning_rate": 9.115555555555555e-07, + "logits/chosen": 1.7298389673233032, + "logits/rejected": 1.681171178817749, + "logps/chosen": -156.2227783203125, + "logps/rejected": -158.81114196777344, + "loss": 0.685992956161499, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08095243573188782, + "rewards/margins": 0.015426402911543846, + "rewards/rejected": 0.06552603840827942, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 4.6138176918029785, + "learning_rate": 9.071111111111111e-07, + "logits/chosen": 1.6315510272979736, + "logits/rejected": 1.6908462047576904, + "logps/chosen": -150.84512329101562, + "logps/rejected": -163.89492797851562, + "loss": 0.6891673088073731, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07254813611507416, + "rewards/margins": 0.008895034901797771, + "rewards/rejected": 0.06365309655666351, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 4.3172101974487305, + "learning_rate": 9.026666666666665e-07, + "logits/chosen": 1.4395225048065186, + "logits/rejected": 1.4489599466323853, + "logps/chosen": -130.1565399169922, + "logps/rejected": -122.24504089355469, + "loss": 0.6887143135070801, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.08355607837438583, + "rewards/margins": 0.009718736633658409, + "rewards/rejected": 0.07383735477924347, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 4.2122087478637695, + "learning_rate": 8.982222222222222e-07, + "logits/chosen": 1.5334614515304565, + "logits/rejected": 1.5769469738006592, + "logps/chosen": -147.21896362304688, + "logps/rejected": -162.89804077148438, + "loss": 0.6849615573883057, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.10068760812282562, + "rewards/margins": 0.017685385420918465, + "rewards/rejected": 0.0830022394657135, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 4.740354061126709, + "learning_rate": 8.937777777777777e-07, + "logits/chosen": 1.6524379253387451, + "logits/rejected": 1.7100518941879272, + "logps/chosen": -142.10653686523438, + "logps/rejected": -158.3316192626953, + "loss": 0.696216630935669, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.08680267632007599, + "rewards/margins": -0.004664557985961437, + "rewards/rejected": 0.0914672389626503, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 3.6374881267547607, + "learning_rate": 8.893333333333333e-07, + "logits/chosen": 1.518328309059143, + "logits/rejected": 1.6029644012451172, + "logps/chosen": -143.19154357910156, + "logps/rejected": -134.3892059326172, + "loss": 0.6908615589141845, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.06735874712467194, + "rewards/margins": 0.005541653838008642, + "rewards/rejected": 0.06181709095835686, + "step": 500 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.5418404340744019, + "eval_logits/rejected": 1.571341633796692, + "eval_logps/chosen": -152.5870819091797, + "eval_logps/rejected": -147.23146057128906, + "eval_loss": 0.6903930902481079, + "eval_rewards/accuracies": 0.5220000147819519, + "eval_rewards/chosen": 0.07420650124549866, + "eval_rewards/margins": 0.007066408637911081, + "eval_rewards/rejected": 0.06714009493589401, + "eval_runtime": 90.217, + "eval_samples_per_second": 5.542, + "eval_steps_per_second": 2.771, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 4.53076171875, + "learning_rate": 8.848888888888888e-07, + "logits/chosen": 1.6947540044784546, + "logits/rejected": 1.6306483745574951, + "logps/chosen": -130.33372497558594, + "logps/rejected": -139.05648803710938, + "loss": 0.6863756656646729, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.08373989164829254, + "rewards/margins": 0.014761297032237053, + "rewards/rejected": 0.06897859275341034, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 5.064472675323486, + "learning_rate": 8.804444444444445e-07, + "logits/chosen": 1.795907974243164, + "logits/rejected": 1.6805435419082642, + "logps/chosen": -165.10183715820312, + "logps/rejected": -170.87112426757812, + "loss": 0.6915029525756836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07381857931613922, + "rewards/margins": 0.00408085435628891, + "rewards/rejected": 0.06973771750926971, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 4.472287178039551, + "learning_rate": 8.76e-07, + "logits/chosen": 1.7226626873016357, + "logits/rejected": 1.6465301513671875, + "logps/chosen": -165.50076293945312, + "logps/rejected": -167.12991333007812, + "loss": 0.6784487724304199, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.09387621283531189, + "rewards/margins": 0.031213903799653053, + "rewards/rejected": 0.06266231089830399, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 4.193634033203125, + "learning_rate": 8.715555555555554e-07, + "logits/chosen": 1.7823143005371094, + "logits/rejected": 1.7374283075332642, + "logps/chosen": -180.05233764648438, + "logps/rejected": -157.24835205078125, + "loss": 0.6891638278961182, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10219593346118927, + "rewards/margins": 0.009201721288263798, + "rewards/rejected": 0.09299422055482864, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 5.877465724945068, + "learning_rate": 8.671111111111111e-07, + "logits/chosen": 1.6558294296264648, + "logits/rejected": 1.7549035549163818, + "logps/chosen": -149.97171020507812, + "logps/rejected": -166.52127075195312, + "loss": 0.6909477233886718, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09389887005090714, + "rewards/margins": 0.00655100354924798, + "rewards/rejected": 0.0873478576540947, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 3.9154303073883057, + "learning_rate": 8.626666666666666e-07, + "logits/chosen": 1.7343839406967163, + "logits/rejected": 1.6256252527236938, + "logps/chosen": -153.2657470703125, + "logps/rejected": -137.84548950195312, + "loss": 0.6832056045532227, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08965932577848434, + "rewards/margins": 0.02167549543082714, + "rewards/rejected": 0.06798382848501205, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 3.569357395172119, + "learning_rate": 8.582222222222222e-07, + "logits/chosen": 1.6020238399505615, + "logits/rejected": 1.5468555688858032, + "logps/chosen": -156.9928741455078, + "logps/rejected": -150.9776153564453, + "loss": 0.6857921123504639, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07846538722515106, + "rewards/margins": 0.016974106431007385, + "rewards/rejected": 0.06149129942059517, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 5.710695266723633, + "learning_rate": 8.537777777777777e-07, + "logits/chosen": 1.4293699264526367, + "logits/rejected": 1.583032250404358, + "logps/chosen": -134.98165893554688, + "logps/rejected": -153.61439514160156, + "loss": 0.6899324417114258, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05123991519212723, + "rewards/margins": 0.008615568280220032, + "rewards/rejected": 0.042624346911907196, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 3.757844924926758, + "learning_rate": 8.493333333333334e-07, + "logits/chosen": 1.5719819068908691, + "logits/rejected": 1.5706799030303955, + "logps/chosen": -143.9678955078125, + "logps/rejected": -130.64585876464844, + "loss": 0.6851204395294189, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07710663974285126, + "rewards/margins": 0.01796458289027214, + "rewards/rejected": 0.059142060577869415, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 5.662181854248047, + "learning_rate": 8.448888888888888e-07, + "logits/chosen": 1.6224644184112549, + "logits/rejected": 1.6623615026474, + "logps/chosen": -130.7429962158203, + "logps/rejected": -157.59295654296875, + "loss": 0.6958520889282227, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0648646205663681, + "rewards/margins": -0.003241670085117221, + "rewards/rejected": 0.0681062787771225, + "step": 600 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.5268914699554443, + "eval_logits/rejected": 1.5556302070617676, + "eval_logps/chosen": -152.5731201171875, + "eval_logps/rejected": -147.22303771972656, + "eval_loss": 0.6907246708869934, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.07560181617736816, + "eval_rewards/margins": 0.007620053365826607, + "eval_rewards/rejected": 0.0679817646741867, + "eval_runtime": 90.3327, + "eval_samples_per_second": 5.535, + "eval_steps_per_second": 2.768, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 3.7953426837921143, + "learning_rate": 8.404444444444444e-07, + "logits/chosen": 1.6380192041397095, + "logits/rejected": 1.6921494007110596, + "logps/chosen": -130.59445190429688, + "logps/rejected": -148.48709106445312, + "loss": 0.6903901100158691, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.08007006347179413, + "rewards/margins": 0.00720745325088501, + "rewards/rejected": 0.07286261022090912, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 7.121775150299072, + "learning_rate": 8.359999999999999e-07, + "logits/chosen": 1.6000845432281494, + "logits/rejected": 1.731951355934143, + "logps/chosen": -154.8905792236328, + "logps/rejected": -166.4490966796875, + "loss": 0.6969138145446777, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.05843223258852959, + "rewards/margins": -0.0054575116373598576, + "rewards/rejected": 0.06388974189758301, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 5.155455589294434, + "learning_rate": 8.315555555555556e-07, + "logits/chosen": 1.6201622486114502, + "logits/rejected": 1.6479911804199219, + "logps/chosen": -165.98980712890625, + "logps/rejected": -145.71644592285156, + "loss": 0.6804090023040772, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0773121565580368, + "rewards/margins": 0.027506589889526367, + "rewards/rejected": 0.049805570393800735, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 4.009693145751953, + "learning_rate": 8.271111111111111e-07, + "logits/chosen": 1.5530269145965576, + "logits/rejected": 1.5585509538650513, + "logps/chosen": -166.77560424804688, + "logps/rejected": -151.09249877929688, + "loss": 0.6879617691040039, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.07863648235797882, + "rewards/margins": 0.014577758498489857, + "rewards/rejected": 0.06405872106552124, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 4.472072601318359, + "learning_rate": 8.226666666666666e-07, + "logits/chosen": 1.662239670753479, + "logits/rejected": 1.6585584878921509, + "logps/chosen": -153.26776123046875, + "logps/rejected": -125.24166107177734, + "loss": 0.6882720470428467, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09423185139894485, + "rewards/margins": 0.011949598789215088, + "rewards/rejected": 0.08228223770856857, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 5.073488712310791, + "learning_rate": 8.182222222222222e-07, + "logits/chosen": 1.6752973794937134, + "logits/rejected": 1.6020495891571045, + "logps/chosen": -150.0669708251953, + "logps/rejected": -131.1305694580078, + "loss": 0.6880992889404297, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.07629784196615219, + "rewards/margins": 0.01174530852586031, + "rewards/rejected": 0.06455253064632416, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 5.938063621520996, + "learning_rate": 8.137777777777777e-07, + "logits/chosen": 1.7563416957855225, + "logits/rejected": 1.5739262104034424, + "logps/chosen": -165.046875, + "logps/rejected": -150.13104248046875, + "loss": 0.6939912796020508, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.09020708501338959, + "rewards/margins": 0.0006308574229478836, + "rewards/rejected": 0.08957622945308685, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 4.362247467041016, + "learning_rate": 8.093333333333333e-07, + "logits/chosen": 1.6460363864898682, + "logits/rejected": 1.6379966735839844, + "logps/chosen": -143.24754333496094, + "logps/rejected": -131.8529815673828, + "loss": 0.6835652351379394, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.10346712917089462, + "rewards/margins": 0.02079077437520027, + "rewards/rejected": 0.08267635107040405, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 4.323369026184082, + "learning_rate": 8.048888888888888e-07, + "logits/chosen": 1.5466216802597046, + "logits/rejected": 1.541775107383728, + "logps/chosen": -171.82138061523438, + "logps/rejected": -158.87603759765625, + "loss": 0.6892048358917237, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0915951356291771, + "rewards/margins": 0.009338131174445152, + "rewards/rejected": 0.08225701004266739, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 5.28114128112793, + "learning_rate": 8.004444444444444e-07, + "logits/chosen": 1.6419496536254883, + "logits/rejected": 1.6641361713409424, + "logps/chosen": -158.20787048339844, + "logps/rejected": -136.3108367919922, + "loss": 0.6985964775085449, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0700095146894455, + "rewards/margins": -0.008601363748311996, + "rewards/rejected": 0.07861088216304779, + "step": 700 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.5386524200439453, + "eval_logits/rejected": 1.5675796270370483, + "eval_logps/chosen": -152.39877319335938, + "eval_logps/rejected": -147.0768585205078, + "eval_loss": 0.6893304586410522, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": 0.09303704649209976, + "eval_rewards/margins": 0.010439171455800533, + "eval_rewards/rejected": 0.08259786665439606, + "eval_runtime": 90.3103, + "eval_samples_per_second": 5.536, + "eval_steps_per_second": 2.768, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 4.867155075073242, + "learning_rate": 7.96e-07, + "logits/chosen": 1.7137393951416016, + "logits/rejected": 1.6643224954605103, + "logps/chosen": -147.054931640625, + "logps/rejected": -162.10067749023438, + "loss": 0.6891860008239746, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11649386584758759, + "rewards/margins": 0.00985223613679409, + "rewards/rejected": 0.10664163529872894, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 4.687198638916016, + "learning_rate": 7.915555555555556e-07, + "logits/chosen": 1.686532974243164, + "logits/rejected": 1.7992823123931885, + "logps/chosen": -138.60238647460938, + "logps/rejected": -134.22702026367188, + "loss": 0.7006660461425781, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.08661777526140213, + "rewards/margins": -0.013613695278763771, + "rewards/rejected": 0.10023146867752075, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 4.63344669342041, + "learning_rate": 7.87111111111111e-07, + "logits/chosen": 1.781561255455017, + "logits/rejected": 1.7561432123184204, + "logps/chosen": -151.60018920898438, + "logps/rejected": -147.93264770507812, + "loss": 0.6958267688751221, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10642895847558975, + "rewards/margins": -0.002830044599249959, + "rewards/rejected": 0.1092589944601059, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 4.5400800704956055, + "learning_rate": 7.826666666666666e-07, + "logits/chosen": 1.670771837234497, + "logits/rejected": 1.5866410732269287, + "logps/chosen": -155.36764526367188, + "logps/rejected": -132.60902404785156, + "loss": 0.6921723842620849, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0910472720861435, + "rewards/margins": 0.003630922408774495, + "rewards/rejected": 0.08741635084152222, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 4.779706954956055, + "learning_rate": 7.782222222222222e-07, + "logits/chosen": 1.531534194946289, + "logits/rejected": 1.5548356771469116, + "logps/chosen": -135.88177490234375, + "logps/rejected": -157.09231567382812, + "loss": 0.6919455528259277, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.09000497311353683, + "rewards/margins": 0.004116452299058437, + "rewards/rejected": 0.08588851988315582, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 5.283969879150391, + "learning_rate": 7.737777777777777e-07, + "logits/chosen": 1.6809686422348022, + "logits/rejected": 1.501511812210083, + "logps/chosen": -137.67315673828125, + "logps/rejected": -128.26022338867188, + "loss": 0.6907838344573974, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.08058954030275345, + "rewards/margins": 0.0064643076620996, + "rewards/rejected": 0.07412523031234741, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 4.341912269592285, + "learning_rate": 7.693333333333333e-07, + "logits/chosen": 1.684203863143921, + "logits/rejected": 1.6489808559417725, + "logps/chosen": -139.82455444335938, + "logps/rejected": -135.16998291015625, + "loss": 0.6793179988861084, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10346569865942001, + "rewards/margins": 0.02992106042802334, + "rewards/rejected": 0.07354463636875153, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 5.209469318389893, + "learning_rate": 7.648888888888888e-07, + "logits/chosen": 1.5599935054779053, + "logits/rejected": 1.6487398147583008, + "logps/chosen": -152.46170043945312, + "logps/rejected": -157.7329559326172, + "loss": 0.6873491287231446, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10296590626239777, + "rewards/margins": 0.01368915755301714, + "rewards/rejected": 0.0892767459154129, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 5.364309310913086, + "learning_rate": 7.604444444444445e-07, + "logits/chosen": 1.5357733964920044, + "logits/rejected": 1.5833505392074585, + "logps/chosen": -146.4203338623047, + "logps/rejected": -149.77499389648438, + "loss": 0.68800368309021, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0752335637807846, + "rewards/margins": 0.01265893317759037, + "rewards/rejected": 0.06257463991641998, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 4.701781272888184, + "learning_rate": 7.559999999999999e-07, + "logits/chosen": 1.7693378925323486, + "logits/rejected": 1.784106969833374, + "logps/chosen": -178.75717163085938, + "logps/rejected": -192.69229125976562, + "loss": 0.7001357078552246, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.07940518856048584, + "rewards/margins": -0.01138945110142231, + "rewards/rejected": 0.0907946228981018, + "step": 800 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.532821536064148, + "eval_logits/rejected": 1.5615730285644531, + "eval_logps/chosen": -152.57040405273438, + "eval_logps/rejected": -147.24534606933594, + "eval_loss": 0.6894002556800842, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.07587439566850662, + "eval_rewards/margins": 0.010124183259904385, + "eval_rewards/rejected": 0.06575021147727966, + "eval_runtime": 90.2864, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 4.731827259063721, + "learning_rate": 7.515555555555555e-07, + "logits/chosen": 1.5014355182647705, + "logits/rejected": 1.706011176109314, + "logps/chosen": -113.23974609375, + "logps/rejected": -150.55316162109375, + "loss": 0.6896101951599121, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.066777303814888, + "rewards/margins": 0.008613836951553822, + "rewards/rejected": 0.05816347524523735, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 4.050163745880127, + "learning_rate": 7.47111111111111e-07, + "logits/chosen": 1.701898217201233, + "logits/rejected": 1.7274971008300781, + "logps/chosen": -147.45330810546875, + "logps/rejected": -140.33255004882812, + "loss": 0.6757836818695069, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08680602163076401, + "rewards/margins": 0.03675536438822746, + "rewards/rejected": 0.050050657242536545, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 4.168673992156982, + "learning_rate": 7.426666666666667e-07, + "logits/chosen": 1.6135514974594116, + "logits/rejected": 1.6518815755844116, + "logps/chosen": -137.38467407226562, + "logps/rejected": -132.65890502929688, + "loss": 0.6800778865814209, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.07235284894704819, + "rewards/margins": 0.028275374323129654, + "rewards/rejected": 0.04407747834920883, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 4.72458028793335, + "learning_rate": 7.382222222222222e-07, + "logits/chosen": 1.5987484455108643, + "logits/rejected": 1.6328668594360352, + "logps/chosen": -146.712158203125, + "logps/rejected": -156.0950469970703, + "loss": 0.6804145336151123, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07264034450054169, + "rewards/margins": 0.0286283977329731, + "rewards/rejected": 0.04401194304227829, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 4.735199928283691, + "learning_rate": 7.337777777777778e-07, + "logits/chosen": 1.6810247898101807, + "logits/rejected": 1.6662237644195557, + "logps/chosen": -159.40650939941406, + "logps/rejected": -140.65591430664062, + "loss": 0.6805107116699218, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.061445970088243484, + "rewards/margins": 0.02743927761912346, + "rewards/rejected": 0.034006692469120026, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 3.7038252353668213, + "learning_rate": 7.293333333333332e-07, + "logits/chosen": 1.6597106456756592, + "logits/rejected": 1.6951271295547485, + "logps/chosen": -138.1852569580078, + "logps/rejected": -128.6427764892578, + "loss": 0.6821750164031982, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.07160943746566772, + "rewards/margins": 0.023774990811944008, + "rewards/rejected": 0.047834448516368866, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 4.820807456970215, + "learning_rate": 7.248888888888888e-07, + "logits/chosen": 1.5708585977554321, + "logits/rejected": 1.5483477115631104, + "logps/chosen": -152.8867950439453, + "logps/rejected": -152.02584838867188, + "loss": 0.6911486625671387, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04338831081986427, + "rewards/margins": 0.006898392923176289, + "rewards/rejected": 0.03648992255330086, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 4.6849493980407715, + "learning_rate": 7.204444444444444e-07, + "logits/chosen": 1.5262442827224731, + "logits/rejected": 1.7751166820526123, + "logps/chosen": -143.77993774414062, + "logps/rejected": -155.7498016357422, + "loss": 0.6910871028900146, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.061417657881975174, + "rewards/margins": 0.006087464280426502, + "rewards/rejected": 0.055330194532871246, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 4.951540946960449, + "learning_rate": 7.159999999999999e-07, + "logits/chosen": 1.405790090560913, + "logits/rejected": 1.5980100631713867, + "logps/chosen": -147.6872100830078, + "logps/rejected": -160.23947143554688, + "loss": 0.6822467803955078, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.049768321216106415, + "rewards/margins": 0.024227874353528023, + "rewards/rejected": 0.02554045058786869, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 4.255526542663574, + "learning_rate": 7.115555555555556e-07, + "logits/chosen": 1.6527436971664429, + "logits/rejected": 1.787755012512207, + "logps/chosen": -164.73355102539062, + "logps/rejected": -178.98507690429688, + "loss": 0.6861439704895019, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07546674460172653, + "rewards/margins": 0.017860155552625656, + "rewards/rejected": 0.05760659649968147, + "step": 900 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.5091361999511719, + "eval_logits/rejected": 1.5371856689453125, + "eval_logps/chosen": -152.8962860107422, + "eval_logps/rejected": -147.56655883789062, + "eval_loss": 0.690089225769043, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.04328843951225281, + "eval_rewards/margins": 0.009660834446549416, + "eval_rewards/rejected": 0.03362761065363884, + "eval_runtime": 90.3227, + "eval_samples_per_second": 5.536, + "eval_steps_per_second": 2.768, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 4.453512668609619, + "learning_rate": 7.071111111111111e-07, + "logits/chosen": 1.700484037399292, + "logits/rejected": 1.4941186904907227, + "logps/chosen": -138.50682067871094, + "logps/rejected": -137.02490234375, + "loss": 0.6877517700195312, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05909284949302673, + "rewards/margins": 0.013610092923045158, + "rewards/rejected": 0.04548276215791702, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 5.548420429229736, + "learning_rate": 7.026666666666667e-07, + "logits/chosen": 1.409182071685791, + "logits/rejected": 1.375797152519226, + "logps/chosen": -158.7325439453125, + "logps/rejected": -161.9824981689453, + "loss": 0.6867617607116699, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.07141564786434174, + "rewards/margins": 0.015245514921844006, + "rewards/rejected": 0.05617012828588486, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 5.186211109161377, + "learning_rate": 6.982222222222221e-07, + "logits/chosen": 1.6255321502685547, + "logits/rejected": 1.7182047367095947, + "logps/chosen": -156.53213500976562, + "logps/rejected": -160.67556762695312, + "loss": 0.6812005519866944, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.062074560672044754, + "rewards/margins": 0.026883777230978012, + "rewards/rejected": 0.03519078344106674, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 4.044335842132568, + "learning_rate": 6.937777777777778e-07, + "logits/chosen": 1.6656415462493896, + "logits/rejected": 1.7865594625473022, + "logps/chosen": -147.00344848632812, + "logps/rejected": -173.11428833007812, + "loss": 0.690484619140625, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0269255880266428, + "rewards/margins": 0.007569611072540283, + "rewards/rejected": 0.019355975091457367, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 4.8925299644470215, + "learning_rate": 6.893333333333333e-07, + "logits/chosen": 1.4856427907943726, + "logits/rejected": 1.5664136409759521, + "logps/chosen": -139.77938842773438, + "logps/rejected": -152.74557495117188, + "loss": 0.6817938804626464, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05857279896736145, + "rewards/margins": 0.02614629827439785, + "rewards/rejected": 0.03242649883031845, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 4.514585018157959, + "learning_rate": 6.848888888888889e-07, + "logits/chosen": 1.6077378988265991, + "logits/rejected": 1.4770267009735107, + "logps/chosen": -134.602294921875, + "logps/rejected": -117.98567199707031, + "loss": 0.694350004196167, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.028881916776299477, + "rewards/margins": 0.00047348294174298644, + "rewards/rejected": 0.028408434242010117, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 3.9295125007629395, + "learning_rate": 6.804444444444444e-07, + "logits/chosen": 1.7124595642089844, + "logits/rejected": 1.8135782480239868, + "logps/chosen": -152.15426635742188, + "logps/rejected": -147.62945556640625, + "loss": 0.6929523944854736, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.057992029935121536, + "rewards/margins": 0.002844708738848567, + "rewards/rejected": 0.05514732003211975, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 5.510717391967773, + "learning_rate": 6.76e-07, + "logits/chosen": 1.483984351158142, + "logits/rejected": 1.4224226474761963, + "logps/chosen": -167.27133178710938, + "logps/rejected": -140.16891479492188, + "loss": 0.6842909336090088, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0873967856168747, + "rewards/margins": 0.020737329497933388, + "rewards/rejected": 0.06665945053100586, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 4.097748279571533, + "learning_rate": 6.715555555555556e-07, + "logits/chosen": 1.6006208658218384, + "logits/rejected": 1.6803340911865234, + "logps/chosen": -163.8089141845703, + "logps/rejected": -167.1127166748047, + "loss": 0.6833163261413574, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.07443337142467499, + "rewards/margins": 0.023210588842630386, + "rewards/rejected": 0.0512227788567543, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 3.959730625152588, + "learning_rate": 6.67111111111111e-07, + "logits/chosen": 1.5806870460510254, + "logits/rejected": 1.5673385858535767, + "logps/chosen": -148.22584533691406, + "logps/rejected": -123.79376220703125, + "loss": 0.6892601490020752, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.07928630709648132, + "rewards/margins": 0.009630966000258923, + "rewards/rejected": 0.06965534389019012, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.522994875907898, + "eval_logits/rejected": 1.5511444807052612, + "eval_logps/chosen": -152.6211395263672, + "eval_logps/rejected": -147.31727600097656, + "eval_loss": 0.6889244914054871, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.07080094516277313, + "eval_rewards/margins": 0.012245929799973965, + "eval_rewards/rejected": 0.058555010706186295, + "eval_runtime": 90.2821, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 4.409601211547852, + "learning_rate": 6.626666666666666e-07, + "logits/chosen": 1.6469438076019287, + "logits/rejected": 1.6551824808120728, + "logps/chosen": -153.6597442626953, + "logps/rejected": -140.854248046875, + "loss": 0.6809319496154785, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09710784256458282, + "rewards/margins": 0.026540305465459824, + "rewards/rejected": 0.0705675408244133, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 6.559939384460449, + "learning_rate": 6.582222222222222e-07, + "logits/chosen": 1.5092687606811523, + "logits/rejected": 1.613526701927185, + "logps/chosen": -141.97103881835938, + "logps/rejected": -145.5518035888672, + "loss": 0.6865349292755127, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0591324046254158, + "rewards/margins": 0.017797131091356277, + "rewards/rejected": 0.041335273534059525, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 5.1169047355651855, + "learning_rate": 6.537777777777778e-07, + "logits/chosen": 1.6573750972747803, + "logits/rejected": 1.4447792768478394, + "logps/chosen": -161.51220703125, + "logps/rejected": -135.18309020996094, + "loss": 0.6804659366607666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10412336885929108, + "rewards/margins": 0.02856394089758396, + "rewards/rejected": 0.07555942982435226, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 4.2392072677612305, + "learning_rate": 6.493333333333333e-07, + "logits/chosen": 1.6003319025039673, + "logits/rejected": 1.6346886157989502, + "logps/chosen": -139.08448791503906, + "logps/rejected": -139.89825439453125, + "loss": 0.6737568378448486, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11095432937145233, + "rewards/margins": 0.043066851794719696, + "rewards/rejected": 0.06788748502731323, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 5.133569240570068, + "learning_rate": 6.448888888888889e-07, + "logits/chosen": 1.4209873676300049, + "logits/rejected": 1.5513734817504883, + "logps/chosen": -141.1917266845703, + "logps/rejected": -130.70431518554688, + "loss": 0.6861515998840332, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10248366743326187, + "rewards/margins": 0.016522446647286415, + "rewards/rejected": 0.08596121519804001, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 4.0574445724487305, + "learning_rate": 6.404444444444444e-07, + "logits/chosen": 1.439145803451538, + "logits/rejected": 1.5661519765853882, + "logps/chosen": -111.7681884765625, + "logps/rejected": -126.37353515625, + "loss": 0.6691905975341796, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.15403084456920624, + "rewards/margins": 0.05248977616429329, + "rewards/rejected": 0.10154107958078384, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 4.832082748413086, + "learning_rate": 6.36e-07, + "logits/chosen": 1.4902942180633545, + "logits/rejected": 1.3948299884796143, + "logps/chosen": -153.6760711669922, + "logps/rejected": -147.40023803710938, + "loss": 0.6729560375213623, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1259043663740158, + "rewards/margins": 0.0469796285033226, + "rewards/rejected": 0.0789247453212738, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 3.609558343887329, + "learning_rate": 6.315555555555555e-07, + "logits/chosen": 1.5476783514022827, + "logits/rejected": 1.6365705728530884, + "logps/chosen": -148.82101440429688, + "logps/rejected": -122.3703842163086, + "loss": 0.6743530750274658, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1420261561870575, + "rewards/margins": 0.041455820202827454, + "rewards/rejected": 0.10057034343481064, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 4.052758693695068, + "learning_rate": 6.27111111111111e-07, + "logits/chosen": 1.7934048175811768, + "logits/rejected": 1.7036349773406982, + "logps/chosen": -174.28765869140625, + "logps/rejected": -183.97897338867188, + "loss": 0.6748029708862304, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1439160704612732, + "rewards/margins": 0.0408162847161293, + "rewards/rejected": 0.1030997782945633, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 5.463155269622803, + "learning_rate": 6.226666666666667e-07, + "logits/chosen": 1.5579731464385986, + "logits/rejected": 1.5956088304519653, + "logps/chosen": -138.96115112304688, + "logps/rejected": -152.51766967773438, + "loss": 0.6884016513824462, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11226280778646469, + "rewards/margins": 0.013228577561676502, + "rewards/rejected": 0.09903421252965927, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.5295906066894531, + "eval_logits/rejected": 1.5565518140792847, + "eval_logps/chosen": -152.15892028808594, + "eval_logps/rejected": -146.88125610351562, + "eval_loss": 0.6887561678886414, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": 0.11702151596546173, + "eval_rewards/margins": 0.014860817231237888, + "eval_rewards/rejected": 0.10216069966554642, + "eval_runtime": 90.4847, + "eval_samples_per_second": 5.526, + "eval_steps_per_second": 2.763, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 5.542585372924805, + "learning_rate": 6.182222222222222e-07, + "logits/chosen": 1.6245231628417969, + "logits/rejected": 1.600940465927124, + "logps/chosen": -155.20175170898438, + "logps/rejected": -144.58438110351562, + "loss": 0.6818556308746337, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.1249178797006607, + "rewards/margins": 0.027298670262098312, + "rewards/rejected": 0.0976191908121109, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 3.8583486080169678, + "learning_rate": 6.137777777777778e-07, + "logits/chosen": 1.6029832363128662, + "logits/rejected": 1.652834177017212, + "logps/chosen": -155.6839141845703, + "logps/rejected": -149.71646118164062, + "loss": 0.6816732883453369, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13532081246376038, + "rewards/margins": 0.025635983794927597, + "rewards/rejected": 0.10968482494354248, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 4.535235404968262, + "learning_rate": 6.093333333333332e-07, + "logits/chosen": 1.7116715908050537, + "logits/rejected": 1.5788238048553467, + "logps/chosen": -146.831787109375, + "logps/rejected": -134.20765686035156, + "loss": 0.6897575855255127, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.12892299890518188, + "rewards/margins": 0.011926446110010147, + "rewards/rejected": 0.11699654906988144, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 4.356500148773193, + "learning_rate": 6.048888888888889e-07, + "logits/chosen": 1.6915569305419922, + "logits/rejected": 1.6864850521087646, + "logps/chosen": -154.83705139160156, + "logps/rejected": -141.92440795898438, + "loss": 0.677583646774292, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.13437099754810333, + "rewards/margins": 0.03629336506128311, + "rewards/rejected": 0.09807763993740082, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 4.3424553871154785, + "learning_rate": 6.004444444444444e-07, + "logits/chosen": 1.3488795757293701, + "logits/rejected": 1.3907456398010254, + "logps/chosen": -138.03089904785156, + "logps/rejected": -129.59719848632812, + "loss": 0.6962613582611084, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0907142236828804, + "rewards/margins": 0.0030510523356497288, + "rewards/rejected": 0.08766315877437592, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 4.354366779327393, + "learning_rate": 5.96e-07, + "logits/chosen": 1.6015634536743164, + "logits/rejected": 1.5302627086639404, + "logps/chosen": -125.86180114746094, + "logps/rejected": -113.89128112792969, + "loss": 0.6872058868408203, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11558832228183746, + "rewards/margins": 0.016637511551380157, + "rewards/rejected": 0.09895080327987671, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 6.197093486785889, + "learning_rate": 5.915555555555555e-07, + "logits/chosen": 1.8193966150283813, + "logits/rejected": 1.7454732656478882, + "logps/chosen": -155.8551788330078, + "logps/rejected": -165.95828247070312, + "loss": 0.6903214454650879, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12587358057498932, + "rewards/margins": 0.01157000008970499, + "rewards/rejected": 0.1143035739660263, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 6.9796624183654785, + "learning_rate": 5.871111111111112e-07, + "logits/chosen": 1.7167119979858398, + "logits/rejected": 1.693549394607544, + "logps/chosen": -185.20008850097656, + "logps/rejected": -150.20620727539062, + "loss": 0.6912973880767822, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.1253143846988678, + "rewards/margins": 0.009792610071599483, + "rewards/rejected": 0.11552176624536514, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 4.833356857299805, + "learning_rate": 5.826666666666666e-07, + "logits/chosen": 1.7434288263320923, + "logits/rejected": 1.7330595254898071, + "logps/chosen": -165.2196044921875, + "logps/rejected": -195.1993408203125, + "loss": 0.6812029361724854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11465537548065186, + "rewards/margins": 0.028904888778924942, + "rewards/rejected": 0.08575049787759781, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 4.727373123168945, + "learning_rate": 5.782222222222221e-07, + "logits/chosen": 1.6402454376220703, + "logits/rejected": 1.6071112155914307, + "logps/chosen": -138.0218963623047, + "logps/rejected": -144.56817626953125, + "loss": 0.694274616241455, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.11662209033966064, + "rewards/margins": 0.003029861254617572, + "rewards/rejected": 0.1135922223329544, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.5258080959320068, + "eval_logits/rejected": 1.5528327226638794, + "eval_logps/chosen": -152.21652221679688, + "eval_logps/rejected": -146.95628356933594, + "eval_loss": 0.6881142854690552, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": 0.11126487702131271, + "eval_rewards/margins": 0.01660888083279133, + "eval_rewards/rejected": 0.09465599805116653, + "eval_runtime": 90.2009, + "eval_samples_per_second": 5.543, + "eval_steps_per_second": 2.772, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 4.669800758361816, + "learning_rate": 5.737777777777778e-07, + "logits/chosen": 1.469012975692749, + "logits/rejected": 1.4835999011993408, + "logps/chosen": -153.94541931152344, + "logps/rejected": -140.69659423828125, + "loss": 0.6921857357025146, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.12547791004180908, + "rewards/margins": 0.005948380567133427, + "rewards/rejected": 0.11952953040599823, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 5.349202632904053, + "learning_rate": 5.693333333333333e-07, + "logits/chosen": 1.5056023597717285, + "logits/rejected": 1.4966309070587158, + "logps/chosen": -144.36300659179688, + "logps/rejected": -122.79240417480469, + "loss": 0.695373821258545, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.09502485394477844, + "rewards/margins": -0.0005314469453878701, + "rewards/rejected": 0.09555630385875702, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 5.629171848297119, + "learning_rate": 5.648888888888889e-07, + "logits/chosen": 1.5664876699447632, + "logits/rejected": 1.7090803384780884, + "logps/chosen": -136.5504150390625, + "logps/rejected": -148.81802368164062, + "loss": 0.6888413906097413, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13565854728221893, + "rewards/margins": 0.013341712765395641, + "rewards/rejected": 0.12231683731079102, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 5.568618297576904, + "learning_rate": 5.604444444444444e-07, + "logits/chosen": 1.512286901473999, + "logits/rejected": 1.7082159519195557, + "logps/chosen": -143.9463653564453, + "logps/rejected": -146.82052612304688, + "loss": 0.6722752571105957, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.12829402089118958, + "rewards/margins": 0.046781741082668304, + "rewards/rejected": 0.08151227235794067, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 4.786505699157715, + "learning_rate": 5.560000000000001e-07, + "logits/chosen": 1.6615822315216064, + "logits/rejected": 1.7368179559707642, + "logps/chosen": -144.40292358398438, + "logps/rejected": -157.58763122558594, + "loss": 0.6829993724822998, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11200229823589325, + "rewards/margins": 0.026959875598549843, + "rewards/rejected": 0.08504240959882736, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 4.871355056762695, + "learning_rate": 5.515555555555555e-07, + "logits/chosen": 1.581756830215454, + "logits/rejected": 1.643133521080017, + "logps/chosen": -123.44065856933594, + "logps/rejected": -131.16763305664062, + "loss": 0.6869094848632813, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.11280514299869537, + "rewards/margins": 0.016064399853348732, + "rewards/rejected": 0.09674074500799179, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 5.43352746963501, + "learning_rate": 5.471111111111111e-07, + "logits/chosen": 1.8056955337524414, + "logits/rejected": 1.7294971942901611, + "logps/chosen": -159.89236450195312, + "logps/rejected": -152.61293029785156, + "loss": 0.6701028347015381, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1585489809513092, + "rewards/margins": 0.0533718541264534, + "rewards/rejected": 0.1051771491765976, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 5.0044474601745605, + "learning_rate": 5.426666666666666e-07, + "logits/chosen": 1.68048894405365, + "logits/rejected": 1.6845731735229492, + "logps/chosen": -153.21994018554688, + "logps/rejected": -140.06024169921875, + "loss": 0.675870132446289, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11108388751745224, + "rewards/margins": 0.039517562836408615, + "rewards/rejected": 0.07156632840633392, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 4.051770210266113, + "learning_rate": 5.382222222222223e-07, + "logits/chosen": 1.6341949701309204, + "logits/rejected": 1.7052574157714844, + "logps/chosen": -143.89512634277344, + "logps/rejected": -132.5374298095703, + "loss": 0.6808773040771484, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11111323535442352, + "rewards/margins": 0.02908928692340851, + "rewards/rejected": 0.08202396333217621, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 4.0852885246276855, + "learning_rate": 5.337777777777778e-07, + "logits/chosen": 1.701703429222107, + "logits/rejected": 1.5303716659545898, + "logps/chosen": -149.3526611328125, + "logps/rejected": -129.2085418701172, + "loss": 0.6718564987182617, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.13976502418518066, + "rewards/margins": 0.04783231392502785, + "rewards/rejected": 0.09193271398544312, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_logits/chosen": 1.5326013565063477, + "eval_logits/rejected": 1.5598564147949219, + "eval_logps/chosen": -152.15328979492188, + "eval_logps/rejected": -146.90542602539062, + "eval_loss": 0.6874103546142578, + "eval_rewards/accuracies": 0.5460000038146973, + "eval_rewards/chosen": 0.1175844818353653, + "eval_rewards/margins": 0.01784202829003334, + "eval_rewards/rejected": 0.09974244982004166, + "eval_runtime": 90.5137, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 2.762, + "step": 1300 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-1300/training_args.bin b/v5/DPO/DPO_10k/lora/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..104933ebf9c17ba9c2c1c1d39a0d26ccafdfe373 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677b288b67816c6ab7a9dcdd40d26bcb142fa3ad3ad050eaeeb4b73a1ba4b498 +size 6161 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/README.md b/v5/DPO/DPO_10k/lora/checkpoint-2400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/adapter_config.json b/v5/DPO/DPO_10k/lora/checkpoint-2400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8436359b1aa944f94290f60b93e89d8644f8843e --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "down_proj", + "k_proj", + "gate_proj", + "up_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/adapter_model.safetensors b/v5/DPO/DPO_10k/lora/checkpoint-2400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1a876a19d885f95052152e95c8928ccfdd1bd7c0 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:925fb02b11fdf9941bdfae0889066e9dab940f09395d0149a16abe025b37ad15 +size 180385008 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/chat_template.jinja b/v5/DPO/DPO_10k/lora/checkpoint-2400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/optimizer.pt b/v5/DPO/DPO_10k/lora/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b5127d70358d32ff0f8fd07832f27f3ea0774d3c --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98ac224d5ed27ace528190a6990080c745116a4b920d737d850dcbf7cec83638 +size 360902475 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/rng_state.pth b/v5/DPO/DPO_10k/lora/checkpoint-2400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..13e11a54e352d8a7149df1f88c1b023ee9973959 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7affab63b271ed0f59a5b53056fc0a581226a41dcdf2fc2b80b669e7c3cf714 +size 14645 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/scaler.pt b/v5/DPO/DPO_10k/lora/checkpoint-2400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..18e01c523cd2afae482cde80c5c92c0b1c481848 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ee3be4931d9e7c32cc76f610ed77ea9a83d47b5639a53431c51dcc937f80ff5 +size 1383 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/scheduler.pt b/v5/DPO/DPO_10k/lora/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e024b190639747730099dcc6f51be6fbfe2be2b8 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:050d6987b00cc915a861681f6abd4fe1d6f88b067e04461ce21e4f85653a111f +size 1465 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/tokenizer.json b/v5/DPO/DPO_10k/lora/checkpoint-2400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/tokenizer_config.json b/v5/DPO/DPO_10k/lora/checkpoint-2400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/trainer_state.json b/v5/DPO/DPO_10k/lora/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3f981985490441f321a9660752ddb9380abe9d83 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/trainer_state.json @@ -0,0 +1,4018 @@ +{ + "best_global_step": 1300, + "best_metric": 0.5460000038146973, + "best_model_checkpoint": "output/lora/checkpoint-1300", + "epoch": 1.92, + "eval_steps": 100, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 4.452983379364014, + "learning_rate": 3.6e-08, + "logits/chosen": 1.5510008335113525, + "logits/rejected": 1.5244438648223877, + "logps/chosen": -131.24708557128906, + "logps/rejected": -146.8297576904297, + "loss": 0.6932957172393799, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.00019072293071076274, + "rewards/margins": -0.00029331922996789217, + "rewards/rejected": 0.0001025962847052142, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 5.203515529632568, + "learning_rate": 7.599999999999999e-08, + "logits/chosen": 1.6611576080322266, + "logits/rejected": 1.6220839023590088, + "logps/chosen": -156.2080078125, + "logps/rejected": -142.72964477539062, + "loss": 0.6937986850738526, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.0006145072402432561, + "rewards/margins": -0.0012890815269201994, + "rewards/rejected": 0.000674574519507587, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 4.087289810180664, + "learning_rate": 1.16e-07, + "logits/chosen": 1.9773778915405273, + "logits/rejected": 1.8304665088653564, + "logps/chosen": -163.54708862304688, + "logps/rejected": -157.88926696777344, + "loss": 0.6931437492370606, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0003584886435419321, + "rewards/margins": 2.1700874640373513e-05, + "rewards/rejected": 0.00033678775071166456, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 4.74172830581665, + "learning_rate": 1.56e-07, + "logits/chosen": 1.5896285772323608, + "logits/rejected": 1.7109922170639038, + "logps/chosen": -144.44276428222656, + "logps/rejected": -133.09629821777344, + "loss": 0.6932558059692383, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00048673158744350076, + "rewards/margins": -0.00020531899644993246, + "rewards/rejected": 0.0006920504383742809, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 4.32133150100708, + "learning_rate": 1.96e-07, + "logits/chosen": 1.5152148008346558, + "logits/rejected": 1.585367202758789, + "logps/chosen": -131.73226928710938, + "logps/rejected": -136.8301239013672, + "loss": 0.6930522918701172, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0008355070021934807, + "rewards/margins": 0.00019948731642216444, + "rewards/rejected": 0.0006360196857713163, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 3.915316343307495, + "learning_rate": 2.3599999999999997e-07, + "logits/chosen": 1.5138778686523438, + "logits/rejected": 1.4824903011322021, + "logps/chosen": -136.43399047851562, + "logps/rejected": -126.70623779296875, + "loss": 0.6929163455963134, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.000302538916002959, + "rewards/margins": 0.00047383070341311395, + "rewards/rejected": -0.00017129186016973108, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 4.329769134521484, + "learning_rate": 2.7600000000000004e-07, + "logits/chosen": 1.6920913457870483, + "logits/rejected": 1.8169019222259521, + "logps/chosen": -152.056640625, + "logps/rejected": -155.9404296875, + "loss": 0.6935575008392334, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0025545789394527674, + "rewards/margins": -0.0008073424687609076, + "rewards/rejected": 0.003361921291798353, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 3.964193344116211, + "learning_rate": 3.1599999999999997e-07, + "logits/chosen": 1.5645431280136108, + "logits/rejected": 1.5879082679748535, + "logps/chosen": -147.78839111328125, + "logps/rejected": -135.19906616210938, + "loss": 0.6925086498260498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006093275733292103, + "rewards/margins": 0.0012893510283902287, + "rewards/rejected": 0.004803924821317196, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 3.935694694519043, + "learning_rate": 3.5599999999999996e-07, + "logits/chosen": 1.5960246324539185, + "logits/rejected": 1.6901094913482666, + "logps/chosen": -157.85256958007812, + "logps/rejected": -150.51974487304688, + "loss": 0.6931850433349609, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.010219026356935501, + "rewards/margins": -5.4271204135147855e-05, + "rewards/rejected": 0.010273297317326069, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 5.13019323348999, + "learning_rate": 3.96e-07, + "logits/chosen": 1.5446935892105103, + "logits/rejected": 1.6452451944351196, + "logps/chosen": -149.88038635253906, + "logps/rejected": -169.9078826904297, + "loss": 0.6935123443603516, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011777262203395367, + "rewards/margins": -0.0007071519503369927, + "rewards/rejected": 0.0124844154343009, + "step": 100 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.5421667098999023, + "eval_logits/rejected": 1.5734084844589233, + "eval_logps/chosen": -153.21649169921875, + "eval_logps/rejected": -147.7952117919922, + "eval_loss": 0.6929068565368652, + "eval_rewards/accuracies": 0.5180000066757202, + "eval_rewards/chosen": 0.011265883222222328, + "eval_rewards/margins": 0.0005034058121964335, + "eval_rewards/rejected": 0.010762478224933147, + "eval_runtime": 90.2131, + "eval_samples_per_second": 5.542, + "eval_steps_per_second": 2.771, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 5.695896625518799, + "learning_rate": 4.36e-07, + "logits/chosen": 1.781393051147461, + "logits/rejected": 1.7461833953857422, + "logps/chosen": -172.24188232421875, + "logps/rejected": -154.40878295898438, + "loss": 0.6922736167907715, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.014022141695022583, + "rewards/margins": 0.001777560799382627, + "rewards/rejected": 0.012244580313563347, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 4.398581027984619, + "learning_rate": 4.76e-07, + "logits/chosen": 1.611268401145935, + "logits/rejected": 1.6106624603271484, + "logps/chosen": -135.13426208496094, + "logps/rejected": -139.7284393310547, + "loss": 0.6927696228027344, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.014869053848087788, + "rewards/margins": 0.0007819391903467476, + "rewards/rejected": 0.014087115414440632, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 5.19202995300293, + "learning_rate": 5.16e-07, + "logits/chosen": 1.5615273714065552, + "logits/rejected": 1.7724416255950928, + "logps/chosen": -157.66746520996094, + "logps/rejected": -161.90391540527344, + "loss": 0.6928309917449951, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.013302234932780266, + "rewards/margins": 0.0006531739491038024, + "rewards/rejected": 0.012649061158299446, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 3.4575726985931396, + "learning_rate": 5.560000000000001e-07, + "logits/chosen": 1.492018699645996, + "logits/rejected": 1.5187314748764038, + "logps/chosen": -131.4152374267578, + "logps/rejected": -125.62705993652344, + "loss": 0.6929276943206787, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.014355423860251904, + "rewards/margins": 0.000462935131508857, + "rewards/rejected": 0.013892488554120064, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 4.500187397003174, + "learning_rate": 5.96e-07, + "logits/chosen": 1.5862048864364624, + "logits/rejected": 1.6784181594848633, + "logps/chosen": -163.6667938232422, + "logps/rejected": -157.76402282714844, + "loss": 0.6910766124725342, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.018648523837327957, + "rewards/margins": 0.004183619283139706, + "rewards/rejected": 0.014464902691543102, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 4.113079071044922, + "learning_rate": 6.36e-07, + "logits/chosen": 1.7717370986938477, + "logits/rejected": 1.8070589303970337, + "logps/chosen": -158.02734375, + "logps/rejected": -145.92495727539062, + "loss": 0.6927172183990479, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.02493301033973694, + "rewards/margins": 0.0009325124556198716, + "rewards/rejected": 0.024000495672225952, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 5.4983696937561035, + "learning_rate": 6.76e-07, + "logits/chosen": 1.5733931064605713, + "logits/rejected": 1.6008774042129517, + "logps/chosen": -147.1856689453125, + "logps/rejected": -159.93077087402344, + "loss": 0.6926907062530517, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.02877199277281761, + "rewards/margins": 0.0010106085101142526, + "rewards/rejected": 0.027761384844779968, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 4.50191068649292, + "learning_rate": 7.159999999999999e-07, + "logits/chosen": 1.5886398553848267, + "logits/rejected": 1.7399513721466064, + "logps/chosen": -157.6659393310547, + "logps/rejected": -160.65431213378906, + "loss": 0.6925735473632812, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.024526067078113556, + "rewards/margins": 0.0012190367560833693, + "rewards/rejected": 0.02330702915787697, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 4.708652019500732, + "learning_rate": 7.559999999999999e-07, + "logits/chosen": 1.6504443883895874, + "logits/rejected": 1.7761001586914062, + "logps/chosen": -141.34536743164062, + "logps/rejected": -143.41159057617188, + "loss": 0.6928653240203857, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.026504456996917725, + "rewards/margins": 0.0006459927535615861, + "rewards/rejected": 0.02585846558213234, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 4.143187046051025, + "learning_rate": 7.96e-07, + "logits/chosen": 1.7108211517333984, + "logits/rejected": 1.6271438598632812, + "logps/chosen": -158.04931640625, + "logps/rejected": -132.23463439941406, + "loss": 0.6927096843719482, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.02748889848589897, + "rewards/margins": 0.0009879134595394135, + "rewards/rejected": 0.026500985026359558, + "step": 200 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.54364013671875, + "eval_logits/rejected": 1.5745173692703247, + "eval_logps/chosen": -153.04653930664062, + "eval_logps/rejected": -147.63844299316406, + "eval_loss": 0.6923084855079651, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.028259562328457832, + "eval_rewards/margins": 0.0018185621593147516, + "eval_rewards/rejected": 0.026441000401973724, + "eval_runtime": 90.4481, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 2.764, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 4.559652328491211, + "learning_rate": 8.359999999999999e-07, + "logits/chosen": 1.7295278310775757, + "logits/rejected": 1.6801897287368774, + "logps/chosen": -158.0893096923828, + "logps/rejected": -168.72427368164062, + "loss": 0.6922987461090088, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02837887406349182, + "rewards/margins": 0.0018027331680059433, + "rewards/rejected": 0.026576142758131027, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 3.907545328140259, + "learning_rate": 8.76e-07, + "logits/chosen": 1.6849712133407593, + "logits/rejected": 1.7441444396972656, + "logps/chosen": -158.67384338378906, + "logps/rejected": -143.02920532226562, + "loss": 0.6933117389678956, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03286944702267647, + "rewards/margins": -0.0002176974667236209, + "rewards/rejected": 0.03308714181184769, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 3.5083253383636475, + "learning_rate": 9.16e-07, + "logits/chosen": 1.5701725482940674, + "logits/rejected": 1.7182495594024658, + "logps/chosen": -160.56790161132812, + "logps/rejected": -138.05374145507812, + "loss": 0.6915814399719238, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.03840702772140503, + "rewards/margins": 0.0033035180531442165, + "rewards/rejected": 0.035103507339954376, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 4.424270153045654, + "learning_rate": 9.559999999999998e-07, + "logits/chosen": 1.671190857887268, + "logits/rejected": 1.6964585781097412, + "logps/chosen": -170.28260803222656, + "logps/rejected": -144.33534240722656, + "loss": 0.6900368690490722, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0511443130671978, + "rewards/margins": 0.006511001847684383, + "rewards/rejected": 0.04463331401348114, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 4.5393967628479, + "learning_rate": 9.959999999999999e-07, + "logits/chosen": 1.6391950845718384, + "logits/rejected": 1.5815935134887695, + "logps/chosen": -160.45225524902344, + "logps/rejected": -147.56185913085938, + "loss": 0.6940414905548096, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.04959743469953537, + "rewards/margins": -0.001381749869324267, + "rewards/rejected": 0.05097918584942818, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 4.256033897399902, + "learning_rate": 9.959999999999999e-07, + "logits/chosen": 1.5204452276229858, + "logits/rejected": 1.6171140670776367, + "logps/chosen": -131.5397186279297, + "logps/rejected": -145.2186279296875, + "loss": 0.6930822372436524, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.045291412621736526, + "rewards/margins": 0.000405142258387059, + "rewards/rejected": 0.0448862686753273, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 4.027031421661377, + "learning_rate": 9.915555555555556e-07, + "logits/chosen": 1.6407123804092407, + "logits/rejected": 1.7262824773788452, + "logps/chosen": -145.78701782226562, + "logps/rejected": -146.34481811523438, + "loss": 0.6946187496185303, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.04104622080922127, + "rewards/margins": -0.0027089794166386127, + "rewards/rejected": 0.043755196034908295, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 5.568243026733398, + "learning_rate": 9.87111111111111e-07, + "logits/chosen": 1.6697533130645752, + "logits/rejected": 1.5154677629470825, + "logps/chosen": -171.0277099609375, + "logps/rejected": -154.05654907226562, + "loss": 0.6873753070831299, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04779375344514847, + "rewards/margins": 0.011815806850790977, + "rewards/rejected": 0.03597795218229294, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 4.041477680206299, + "learning_rate": 9.826666666666667e-07, + "logits/chosen": 1.6633354425430298, + "logits/rejected": 1.6905081272125244, + "logps/chosen": -142.88864135742188, + "logps/rejected": -152.04757690429688, + "loss": 0.6929869174957275, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.04824609309434891, + "rewards/margins": 0.0005596639821305871, + "rewards/rejected": 0.04768642783164978, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 4.9481635093688965, + "learning_rate": 9.782222222222222e-07, + "logits/chosen": 1.617485761642456, + "logits/rejected": 1.6837307214736938, + "logps/chosen": -152.088134765625, + "logps/rejected": -164.15158081054688, + "loss": 0.6897455215454101, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.057097338140010834, + "rewards/margins": 0.007080497685819864, + "rewards/rejected": 0.050016842782497406, + "step": 300 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.5520602464675903, + "eval_logits/rejected": 1.5826917886734009, + "eval_logps/chosen": -152.8227996826172, + "eval_logps/rejected": -147.43496704101562, + "eval_loss": 0.6914217472076416, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": 0.05063560605049133, + "eval_rewards/margins": 0.0038486982230097055, + "eval_rewards/rejected": 0.04678690433502197, + "eval_runtime": 90.237, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 2.77, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 4.486109256744385, + "learning_rate": 9.737777777777777e-07, + "logits/chosen": 1.7188745737075806, + "logits/rejected": 1.7590553760528564, + "logps/chosen": -140.6877899169922, + "logps/rejected": -155.31893920898438, + "loss": 0.6952545166015625, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": 0.046565137803554535, + "rewards/margins": -0.003889651270583272, + "rewards/rejected": 0.050454795360565186, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 4.374355316162109, + "learning_rate": 9.693333333333334e-07, + "logits/chosen": 1.7103513479232788, + "logits/rejected": 1.7379261255264282, + "logps/chosen": -137.8660888671875, + "logps/rejected": -140.40956115722656, + "loss": 0.692354393005371, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04029277712106705, + "rewards/margins": 0.0018124934285879135, + "rewards/rejected": 0.038480278104543686, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 4.618821144104004, + "learning_rate": 9.648888888888889e-07, + "logits/chosen": 1.5603777170181274, + "logits/rejected": 1.5868213176727295, + "logps/chosen": -157.3379669189453, + "logps/rejected": -182.77377319335938, + "loss": 0.6921377182006836, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03832743316888809, + "rewards/margins": 0.0022720033302903175, + "rewards/rejected": 0.03605542704463005, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 4.1974406242370605, + "learning_rate": 9.604444444444443e-07, + "logits/chosen": 1.8520517349243164, + "logits/rejected": 1.716774582862854, + "logps/chosen": -158.16363525390625, + "logps/rejected": -149.66162109375, + "loss": 0.692081069946289, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0396885983645916, + "rewards/margins": 0.0023556998930871487, + "rewards/rejected": 0.03733289986848831, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 4.700806140899658, + "learning_rate": 9.559999999999998e-07, + "logits/chosen": 1.6886537075042725, + "logits/rejected": 1.8079423904418945, + "logps/chosen": -165.5438232421875, + "logps/rejected": -194.98428344726562, + "loss": 0.6890993118286133, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05031610652804375, + "rewards/margins": 0.008422226645052433, + "rewards/rejected": 0.041893888264894485, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 3.7786731719970703, + "learning_rate": 9.515555555555555e-07, + "logits/chosen": 1.5483795404434204, + "logits/rejected": 1.4731425046920776, + "logps/chosen": -161.77774047851562, + "logps/rejected": -168.05458068847656, + "loss": 0.6911417007446289, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.050054989755153656, + "rewards/margins": 0.0043535237200558186, + "rewards/rejected": 0.045701466500759125, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 4.362200736999512, + "learning_rate": 9.471111111111111e-07, + "logits/chosen": 1.735099196434021, + "logits/rejected": 1.7567729949951172, + "logps/chosen": -161.30374145507812, + "logps/rejected": -153.4731903076172, + "loss": 0.6882652282714844, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06078929826617241, + "rewards/margins": 0.010189466178417206, + "rewards/rejected": 0.0505998320877552, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 4.35581111907959, + "learning_rate": 9.426666666666666e-07, + "logits/chosen": 1.604020118713379, + "logits/rejected": 1.524717926979065, + "logps/chosen": -141.42324829101562, + "logps/rejected": -151.82521057128906, + "loss": 0.6888412952423095, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.06947806477546692, + "rewards/margins": 0.009350637905299664, + "rewards/rejected": 0.06012742593884468, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 4.360926628112793, + "learning_rate": 9.382222222222222e-07, + "logits/chosen": 1.7075706720352173, + "logits/rejected": 1.6819493770599365, + "logps/chosen": -150.63375854492188, + "logps/rejected": -137.05673217773438, + "loss": 0.6836989879608154, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.08739721775054932, + "rewards/margins": 0.019574418663978577, + "rewards/rejected": 0.06782279908657074, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 4.909813404083252, + "learning_rate": 9.337777777777778e-07, + "logits/chosen": 1.6920162439346313, + "logits/rejected": 1.675100326538086, + "logps/chosen": -154.39663696289062, + "logps/rejected": -147.51455688476562, + "loss": 0.6892420768737793, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08908190578222275, + "rewards/margins": 0.008944114670157433, + "rewards/rejected": 0.08013778924942017, + "step": 400 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5610027313232422, + "eval_logits/rejected": 1.5909091234207153, + "eval_logps/chosen": -152.39871215820312, + "eval_logps/rejected": -147.04049682617188, + "eval_loss": 0.6903401017189026, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": 0.09304190427064896, + "eval_rewards/margins": 0.006809028796851635, + "eval_rewards/rejected": 0.08623287081718445, + "eval_runtime": 90.296, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 2.769, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 4.541158199310303, + "learning_rate": 9.293333333333333e-07, + "logits/chosen": 1.6515496969223022, + "logits/rejected": 1.548688530921936, + "logps/chosen": -147.21546936035156, + "logps/rejected": -187.50816345214844, + "loss": 0.6924624919891358, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.10388608276844025, + "rewards/margins": 0.002456969814375043, + "rewards/rejected": 0.10142910480499268, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 5.246954917907715, + "learning_rate": 9.248888888888888e-07, + "logits/chosen": 1.6460405588150024, + "logits/rejected": 1.6713184118270874, + "logps/chosen": -151.11341857910156, + "logps/rejected": -166.2979736328125, + "loss": 0.6986268043518067, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.09254096448421478, + "rewards/margins": -0.009933066554367542, + "rewards/rejected": 0.10247401893138885, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 4.068811416625977, + "learning_rate": 9.204444444444443e-07, + "logits/chosen": 1.6973702907562256, + "logits/rejected": 1.7605253458023071, + "logps/chosen": -162.7523651123047, + "logps/rejected": -150.79718017578125, + "loss": 0.6896752834320068, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.08682320266962051, + "rewards/margins": 0.007874277420341969, + "rewards/rejected": 0.07894892990589142, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 4.387909412384033, + "learning_rate": 9.16e-07, + "logits/chosen": 1.5456931591033936, + "logits/rejected": 1.4381892681121826, + "logps/chosen": -155.777099609375, + "logps/rejected": -144.95742797851562, + "loss": 0.6881390571594238, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08465877920389175, + "rewards/margins": 0.01117948442697525, + "rewards/rejected": 0.0734792947769165, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 4.3955864906311035, + "learning_rate": 9.115555555555555e-07, + "logits/chosen": 1.7298389673233032, + "logits/rejected": 1.681171178817749, + "logps/chosen": -156.2227783203125, + "logps/rejected": -158.81114196777344, + "loss": 0.685992956161499, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08095243573188782, + "rewards/margins": 0.015426402911543846, + "rewards/rejected": 0.06552603840827942, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 4.6138176918029785, + "learning_rate": 9.071111111111111e-07, + "logits/chosen": 1.6315510272979736, + "logits/rejected": 1.6908462047576904, + "logps/chosen": -150.84512329101562, + "logps/rejected": -163.89492797851562, + "loss": 0.6891673088073731, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07254813611507416, + "rewards/margins": 0.008895034901797771, + "rewards/rejected": 0.06365309655666351, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 4.3172101974487305, + "learning_rate": 9.026666666666665e-07, + "logits/chosen": 1.4395225048065186, + "logits/rejected": 1.4489599466323853, + "logps/chosen": -130.1565399169922, + "logps/rejected": -122.24504089355469, + "loss": 0.6887143135070801, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.08355607837438583, + "rewards/margins": 0.009718736633658409, + "rewards/rejected": 0.07383735477924347, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 4.2122087478637695, + "learning_rate": 8.982222222222222e-07, + "logits/chosen": 1.5334614515304565, + "logits/rejected": 1.5769469738006592, + "logps/chosen": -147.21896362304688, + "logps/rejected": -162.89804077148438, + "loss": 0.6849615573883057, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.10068760812282562, + "rewards/margins": 0.017685385420918465, + "rewards/rejected": 0.0830022394657135, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 4.740354061126709, + "learning_rate": 8.937777777777777e-07, + "logits/chosen": 1.6524379253387451, + "logits/rejected": 1.7100518941879272, + "logps/chosen": -142.10653686523438, + "logps/rejected": -158.3316192626953, + "loss": 0.696216630935669, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.08680267632007599, + "rewards/margins": -0.004664557985961437, + "rewards/rejected": 0.0914672389626503, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 3.6374881267547607, + "learning_rate": 8.893333333333333e-07, + "logits/chosen": 1.518328309059143, + "logits/rejected": 1.6029644012451172, + "logps/chosen": -143.19154357910156, + "logps/rejected": -134.3892059326172, + "loss": 0.6908615589141845, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.06735874712467194, + "rewards/margins": 0.005541653838008642, + "rewards/rejected": 0.06181709095835686, + "step": 500 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.5418404340744019, + "eval_logits/rejected": 1.571341633796692, + "eval_logps/chosen": -152.5870819091797, + "eval_logps/rejected": -147.23146057128906, + "eval_loss": 0.6903930902481079, + "eval_rewards/accuracies": 0.5220000147819519, + "eval_rewards/chosen": 0.07420650124549866, + "eval_rewards/margins": 0.007066408637911081, + "eval_rewards/rejected": 0.06714009493589401, + "eval_runtime": 90.217, + "eval_samples_per_second": 5.542, + "eval_steps_per_second": 2.771, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 4.53076171875, + "learning_rate": 8.848888888888888e-07, + "logits/chosen": 1.6947540044784546, + "logits/rejected": 1.6306483745574951, + "logps/chosen": -130.33372497558594, + "logps/rejected": -139.05648803710938, + "loss": 0.6863756656646729, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.08373989164829254, + "rewards/margins": 0.014761297032237053, + "rewards/rejected": 0.06897859275341034, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 5.064472675323486, + "learning_rate": 8.804444444444445e-07, + "logits/chosen": 1.795907974243164, + "logits/rejected": 1.6805435419082642, + "logps/chosen": -165.10183715820312, + "logps/rejected": -170.87112426757812, + "loss": 0.6915029525756836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07381857931613922, + "rewards/margins": 0.00408085435628891, + "rewards/rejected": 0.06973771750926971, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 4.472287178039551, + "learning_rate": 8.76e-07, + "logits/chosen": 1.7226626873016357, + "logits/rejected": 1.6465301513671875, + "logps/chosen": -165.50076293945312, + "logps/rejected": -167.12991333007812, + "loss": 0.6784487724304199, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.09387621283531189, + "rewards/margins": 0.031213903799653053, + "rewards/rejected": 0.06266231089830399, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 4.193634033203125, + "learning_rate": 8.715555555555554e-07, + "logits/chosen": 1.7823143005371094, + "logits/rejected": 1.7374283075332642, + "logps/chosen": -180.05233764648438, + "logps/rejected": -157.24835205078125, + "loss": 0.6891638278961182, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10219593346118927, + "rewards/margins": 0.009201721288263798, + "rewards/rejected": 0.09299422055482864, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 5.877465724945068, + "learning_rate": 8.671111111111111e-07, + "logits/chosen": 1.6558294296264648, + "logits/rejected": 1.7549035549163818, + "logps/chosen": -149.97171020507812, + "logps/rejected": -166.52127075195312, + "loss": 0.6909477233886718, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09389887005090714, + "rewards/margins": 0.00655100354924798, + "rewards/rejected": 0.0873478576540947, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 3.9154303073883057, + "learning_rate": 8.626666666666666e-07, + "logits/chosen": 1.7343839406967163, + "logits/rejected": 1.6256252527236938, + "logps/chosen": -153.2657470703125, + "logps/rejected": -137.84548950195312, + "loss": 0.6832056045532227, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08965932577848434, + "rewards/margins": 0.02167549543082714, + "rewards/rejected": 0.06798382848501205, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 3.569357395172119, + "learning_rate": 8.582222222222222e-07, + "logits/chosen": 1.6020238399505615, + "logits/rejected": 1.5468555688858032, + "logps/chosen": -156.9928741455078, + "logps/rejected": -150.9776153564453, + "loss": 0.6857921123504639, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07846538722515106, + "rewards/margins": 0.016974106431007385, + "rewards/rejected": 0.06149129942059517, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 5.710695266723633, + "learning_rate": 8.537777777777777e-07, + "logits/chosen": 1.4293699264526367, + "logits/rejected": 1.583032250404358, + "logps/chosen": -134.98165893554688, + "logps/rejected": -153.61439514160156, + "loss": 0.6899324417114258, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05123991519212723, + "rewards/margins": 0.008615568280220032, + "rewards/rejected": 0.042624346911907196, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 3.757844924926758, + "learning_rate": 8.493333333333334e-07, + "logits/chosen": 1.5719819068908691, + "logits/rejected": 1.5706799030303955, + "logps/chosen": -143.9678955078125, + "logps/rejected": -130.64585876464844, + "loss": 0.6851204395294189, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07710663974285126, + "rewards/margins": 0.01796458289027214, + "rewards/rejected": 0.059142060577869415, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 5.662181854248047, + "learning_rate": 8.448888888888888e-07, + "logits/chosen": 1.6224644184112549, + "logits/rejected": 1.6623615026474, + "logps/chosen": -130.7429962158203, + "logps/rejected": -157.59295654296875, + "loss": 0.6958520889282227, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0648646205663681, + "rewards/margins": -0.003241670085117221, + "rewards/rejected": 0.0681062787771225, + "step": 600 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.5268914699554443, + "eval_logits/rejected": 1.5556302070617676, + "eval_logps/chosen": -152.5731201171875, + "eval_logps/rejected": -147.22303771972656, + "eval_loss": 0.6907246708869934, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.07560181617736816, + "eval_rewards/margins": 0.007620053365826607, + "eval_rewards/rejected": 0.0679817646741867, + "eval_runtime": 90.3327, + "eval_samples_per_second": 5.535, + "eval_steps_per_second": 2.768, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 3.7953426837921143, + "learning_rate": 8.404444444444444e-07, + "logits/chosen": 1.6380192041397095, + "logits/rejected": 1.6921494007110596, + "logps/chosen": -130.59445190429688, + "logps/rejected": -148.48709106445312, + "loss": 0.6903901100158691, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.08007006347179413, + "rewards/margins": 0.00720745325088501, + "rewards/rejected": 0.07286261022090912, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 7.121775150299072, + "learning_rate": 8.359999999999999e-07, + "logits/chosen": 1.6000845432281494, + "logits/rejected": 1.731951355934143, + "logps/chosen": -154.8905792236328, + "logps/rejected": -166.4490966796875, + "loss": 0.6969138145446777, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.05843223258852959, + "rewards/margins": -0.0054575116373598576, + "rewards/rejected": 0.06388974189758301, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 5.155455589294434, + "learning_rate": 8.315555555555556e-07, + "logits/chosen": 1.6201622486114502, + "logits/rejected": 1.6479911804199219, + "logps/chosen": -165.98980712890625, + "logps/rejected": -145.71644592285156, + "loss": 0.6804090023040772, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0773121565580368, + "rewards/margins": 0.027506589889526367, + "rewards/rejected": 0.049805570393800735, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 4.009693145751953, + "learning_rate": 8.271111111111111e-07, + "logits/chosen": 1.5530269145965576, + "logits/rejected": 1.5585509538650513, + "logps/chosen": -166.77560424804688, + "logps/rejected": -151.09249877929688, + "loss": 0.6879617691040039, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.07863648235797882, + "rewards/margins": 0.014577758498489857, + "rewards/rejected": 0.06405872106552124, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 4.472072601318359, + "learning_rate": 8.226666666666666e-07, + "logits/chosen": 1.662239670753479, + "logits/rejected": 1.6585584878921509, + "logps/chosen": -153.26776123046875, + "logps/rejected": -125.24166107177734, + "loss": 0.6882720470428467, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09423185139894485, + "rewards/margins": 0.011949598789215088, + "rewards/rejected": 0.08228223770856857, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 5.073488712310791, + "learning_rate": 8.182222222222222e-07, + "logits/chosen": 1.6752973794937134, + "logits/rejected": 1.6020495891571045, + "logps/chosen": -150.0669708251953, + "logps/rejected": -131.1305694580078, + "loss": 0.6880992889404297, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.07629784196615219, + "rewards/margins": 0.01174530852586031, + "rewards/rejected": 0.06455253064632416, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 5.938063621520996, + "learning_rate": 8.137777777777777e-07, + "logits/chosen": 1.7563416957855225, + "logits/rejected": 1.5739262104034424, + "logps/chosen": -165.046875, + "logps/rejected": -150.13104248046875, + "loss": 0.6939912796020508, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.09020708501338959, + "rewards/margins": 0.0006308574229478836, + "rewards/rejected": 0.08957622945308685, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 4.362247467041016, + "learning_rate": 8.093333333333333e-07, + "logits/chosen": 1.6460363864898682, + "logits/rejected": 1.6379966735839844, + "logps/chosen": -143.24754333496094, + "logps/rejected": -131.8529815673828, + "loss": 0.6835652351379394, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.10346712917089462, + "rewards/margins": 0.02079077437520027, + "rewards/rejected": 0.08267635107040405, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 4.323369026184082, + "learning_rate": 8.048888888888888e-07, + "logits/chosen": 1.5466216802597046, + "logits/rejected": 1.541775107383728, + "logps/chosen": -171.82138061523438, + "logps/rejected": -158.87603759765625, + "loss": 0.6892048358917237, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0915951356291771, + "rewards/margins": 0.009338131174445152, + "rewards/rejected": 0.08225701004266739, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 5.28114128112793, + "learning_rate": 8.004444444444444e-07, + "logits/chosen": 1.6419496536254883, + "logits/rejected": 1.6641361713409424, + "logps/chosen": -158.20787048339844, + "logps/rejected": -136.3108367919922, + "loss": 0.6985964775085449, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0700095146894455, + "rewards/margins": -0.008601363748311996, + "rewards/rejected": 0.07861088216304779, + "step": 700 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.5386524200439453, + "eval_logits/rejected": 1.5675796270370483, + "eval_logps/chosen": -152.39877319335938, + "eval_logps/rejected": -147.0768585205078, + "eval_loss": 0.6893304586410522, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": 0.09303704649209976, + "eval_rewards/margins": 0.010439171455800533, + "eval_rewards/rejected": 0.08259786665439606, + "eval_runtime": 90.3103, + "eval_samples_per_second": 5.536, + "eval_steps_per_second": 2.768, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 4.867155075073242, + "learning_rate": 7.96e-07, + "logits/chosen": 1.7137393951416016, + "logits/rejected": 1.6643224954605103, + "logps/chosen": -147.054931640625, + "logps/rejected": -162.10067749023438, + "loss": 0.6891860008239746, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11649386584758759, + "rewards/margins": 0.00985223613679409, + "rewards/rejected": 0.10664163529872894, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 4.687198638916016, + "learning_rate": 7.915555555555556e-07, + "logits/chosen": 1.686532974243164, + "logits/rejected": 1.7992823123931885, + "logps/chosen": -138.60238647460938, + "logps/rejected": -134.22702026367188, + "loss": 0.7006660461425781, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.08661777526140213, + "rewards/margins": -0.013613695278763771, + "rewards/rejected": 0.10023146867752075, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 4.63344669342041, + "learning_rate": 7.87111111111111e-07, + "logits/chosen": 1.781561255455017, + "logits/rejected": 1.7561432123184204, + "logps/chosen": -151.60018920898438, + "logps/rejected": -147.93264770507812, + "loss": 0.6958267688751221, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10642895847558975, + "rewards/margins": -0.002830044599249959, + "rewards/rejected": 0.1092589944601059, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 4.5400800704956055, + "learning_rate": 7.826666666666666e-07, + "logits/chosen": 1.670771837234497, + "logits/rejected": 1.5866410732269287, + "logps/chosen": -155.36764526367188, + "logps/rejected": -132.60902404785156, + "loss": 0.6921723842620849, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0910472720861435, + "rewards/margins": 0.003630922408774495, + "rewards/rejected": 0.08741635084152222, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 4.779706954956055, + "learning_rate": 7.782222222222222e-07, + "logits/chosen": 1.531534194946289, + "logits/rejected": 1.5548356771469116, + "logps/chosen": -135.88177490234375, + "logps/rejected": -157.09231567382812, + "loss": 0.6919455528259277, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.09000497311353683, + "rewards/margins": 0.004116452299058437, + "rewards/rejected": 0.08588851988315582, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 5.283969879150391, + "learning_rate": 7.737777777777777e-07, + "logits/chosen": 1.6809686422348022, + "logits/rejected": 1.501511812210083, + "logps/chosen": -137.67315673828125, + "logps/rejected": -128.26022338867188, + "loss": 0.6907838344573974, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.08058954030275345, + "rewards/margins": 0.0064643076620996, + "rewards/rejected": 0.07412523031234741, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 4.341912269592285, + "learning_rate": 7.693333333333333e-07, + "logits/chosen": 1.684203863143921, + "logits/rejected": 1.6489808559417725, + "logps/chosen": -139.82455444335938, + "logps/rejected": -135.16998291015625, + "loss": 0.6793179988861084, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10346569865942001, + "rewards/margins": 0.02992106042802334, + "rewards/rejected": 0.07354463636875153, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 5.209469318389893, + "learning_rate": 7.648888888888888e-07, + "logits/chosen": 1.5599935054779053, + "logits/rejected": 1.6487398147583008, + "logps/chosen": -152.46170043945312, + "logps/rejected": -157.7329559326172, + "loss": 0.6873491287231446, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10296590626239777, + "rewards/margins": 0.01368915755301714, + "rewards/rejected": 0.0892767459154129, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 5.364309310913086, + "learning_rate": 7.604444444444445e-07, + "logits/chosen": 1.5357733964920044, + "logits/rejected": 1.5833505392074585, + "logps/chosen": -146.4203338623047, + "logps/rejected": -149.77499389648438, + "loss": 0.68800368309021, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0752335637807846, + "rewards/margins": 0.01265893317759037, + "rewards/rejected": 0.06257463991641998, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 4.701781272888184, + "learning_rate": 7.559999999999999e-07, + "logits/chosen": 1.7693378925323486, + "logits/rejected": 1.784106969833374, + "logps/chosen": -178.75717163085938, + "logps/rejected": -192.69229125976562, + "loss": 0.7001357078552246, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.07940518856048584, + "rewards/margins": -0.01138945110142231, + "rewards/rejected": 0.0907946228981018, + "step": 800 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.532821536064148, + "eval_logits/rejected": 1.5615730285644531, + "eval_logps/chosen": -152.57040405273438, + "eval_logps/rejected": -147.24534606933594, + "eval_loss": 0.6894002556800842, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.07587439566850662, + "eval_rewards/margins": 0.010124183259904385, + "eval_rewards/rejected": 0.06575021147727966, + "eval_runtime": 90.2864, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 4.731827259063721, + "learning_rate": 7.515555555555555e-07, + "logits/chosen": 1.5014355182647705, + "logits/rejected": 1.706011176109314, + "logps/chosen": -113.23974609375, + "logps/rejected": -150.55316162109375, + "loss": 0.6896101951599121, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.066777303814888, + "rewards/margins": 0.008613836951553822, + "rewards/rejected": 0.05816347524523735, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 4.050163745880127, + "learning_rate": 7.47111111111111e-07, + "logits/chosen": 1.701898217201233, + "logits/rejected": 1.7274971008300781, + "logps/chosen": -147.45330810546875, + "logps/rejected": -140.33255004882812, + "loss": 0.6757836818695069, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08680602163076401, + "rewards/margins": 0.03675536438822746, + "rewards/rejected": 0.050050657242536545, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 4.168673992156982, + "learning_rate": 7.426666666666667e-07, + "logits/chosen": 1.6135514974594116, + "logits/rejected": 1.6518815755844116, + "logps/chosen": -137.38467407226562, + "logps/rejected": -132.65890502929688, + "loss": 0.6800778865814209, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.07235284894704819, + "rewards/margins": 0.028275374323129654, + "rewards/rejected": 0.04407747834920883, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 4.72458028793335, + "learning_rate": 7.382222222222222e-07, + "logits/chosen": 1.5987484455108643, + "logits/rejected": 1.6328668594360352, + "logps/chosen": -146.712158203125, + "logps/rejected": -156.0950469970703, + "loss": 0.6804145336151123, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07264034450054169, + "rewards/margins": 0.0286283977329731, + "rewards/rejected": 0.04401194304227829, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 4.735199928283691, + "learning_rate": 7.337777777777778e-07, + "logits/chosen": 1.6810247898101807, + "logits/rejected": 1.6662237644195557, + "logps/chosen": -159.40650939941406, + "logps/rejected": -140.65591430664062, + "loss": 0.6805107116699218, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.061445970088243484, + "rewards/margins": 0.02743927761912346, + "rewards/rejected": 0.034006692469120026, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 3.7038252353668213, + "learning_rate": 7.293333333333332e-07, + "logits/chosen": 1.6597106456756592, + "logits/rejected": 1.6951271295547485, + "logps/chosen": -138.1852569580078, + "logps/rejected": -128.6427764892578, + "loss": 0.6821750164031982, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.07160943746566772, + "rewards/margins": 0.023774990811944008, + "rewards/rejected": 0.047834448516368866, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 4.820807456970215, + "learning_rate": 7.248888888888888e-07, + "logits/chosen": 1.5708585977554321, + "logits/rejected": 1.5483477115631104, + "logps/chosen": -152.8867950439453, + "logps/rejected": -152.02584838867188, + "loss": 0.6911486625671387, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04338831081986427, + "rewards/margins": 0.006898392923176289, + "rewards/rejected": 0.03648992255330086, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 4.6849493980407715, + "learning_rate": 7.204444444444444e-07, + "logits/chosen": 1.5262442827224731, + "logits/rejected": 1.7751166820526123, + "logps/chosen": -143.77993774414062, + "logps/rejected": -155.7498016357422, + "loss": 0.6910871028900146, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.061417657881975174, + "rewards/margins": 0.006087464280426502, + "rewards/rejected": 0.055330194532871246, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 4.951540946960449, + "learning_rate": 7.159999999999999e-07, + "logits/chosen": 1.405790090560913, + "logits/rejected": 1.5980100631713867, + "logps/chosen": -147.6872100830078, + "logps/rejected": -160.23947143554688, + "loss": 0.6822467803955078, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.049768321216106415, + "rewards/margins": 0.024227874353528023, + "rewards/rejected": 0.02554045058786869, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 4.255526542663574, + "learning_rate": 7.115555555555556e-07, + "logits/chosen": 1.6527436971664429, + "logits/rejected": 1.787755012512207, + "logps/chosen": -164.73355102539062, + "logps/rejected": -178.98507690429688, + "loss": 0.6861439704895019, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07546674460172653, + "rewards/margins": 0.017860155552625656, + "rewards/rejected": 0.05760659649968147, + "step": 900 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.5091361999511719, + "eval_logits/rejected": 1.5371856689453125, + "eval_logps/chosen": -152.8962860107422, + "eval_logps/rejected": -147.56655883789062, + "eval_loss": 0.690089225769043, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.04328843951225281, + "eval_rewards/margins": 0.009660834446549416, + "eval_rewards/rejected": 0.03362761065363884, + "eval_runtime": 90.3227, + "eval_samples_per_second": 5.536, + "eval_steps_per_second": 2.768, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 4.453512668609619, + "learning_rate": 7.071111111111111e-07, + "logits/chosen": 1.700484037399292, + "logits/rejected": 1.4941186904907227, + "logps/chosen": -138.50682067871094, + "logps/rejected": -137.02490234375, + "loss": 0.6877517700195312, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05909284949302673, + "rewards/margins": 0.013610092923045158, + "rewards/rejected": 0.04548276215791702, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 5.548420429229736, + "learning_rate": 7.026666666666667e-07, + "logits/chosen": 1.409182071685791, + "logits/rejected": 1.375797152519226, + "logps/chosen": -158.7325439453125, + "logps/rejected": -161.9824981689453, + "loss": 0.6867617607116699, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.07141564786434174, + "rewards/margins": 0.015245514921844006, + "rewards/rejected": 0.05617012828588486, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 5.186211109161377, + "learning_rate": 6.982222222222221e-07, + "logits/chosen": 1.6255321502685547, + "logits/rejected": 1.7182047367095947, + "logps/chosen": -156.53213500976562, + "logps/rejected": -160.67556762695312, + "loss": 0.6812005519866944, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.062074560672044754, + "rewards/margins": 0.026883777230978012, + "rewards/rejected": 0.03519078344106674, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 4.044335842132568, + "learning_rate": 6.937777777777778e-07, + "logits/chosen": 1.6656415462493896, + "logits/rejected": 1.7865594625473022, + "logps/chosen": -147.00344848632812, + "logps/rejected": -173.11428833007812, + "loss": 0.690484619140625, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0269255880266428, + "rewards/margins": 0.007569611072540283, + "rewards/rejected": 0.019355975091457367, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 4.8925299644470215, + "learning_rate": 6.893333333333333e-07, + "logits/chosen": 1.4856427907943726, + "logits/rejected": 1.5664136409759521, + "logps/chosen": -139.77938842773438, + "logps/rejected": -152.74557495117188, + "loss": 0.6817938804626464, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05857279896736145, + "rewards/margins": 0.02614629827439785, + "rewards/rejected": 0.03242649883031845, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 4.514585018157959, + "learning_rate": 6.848888888888889e-07, + "logits/chosen": 1.6077378988265991, + "logits/rejected": 1.4770267009735107, + "logps/chosen": -134.602294921875, + "logps/rejected": -117.98567199707031, + "loss": 0.694350004196167, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.028881916776299477, + "rewards/margins": 0.00047348294174298644, + "rewards/rejected": 0.028408434242010117, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 3.9295125007629395, + "learning_rate": 6.804444444444444e-07, + "logits/chosen": 1.7124595642089844, + "logits/rejected": 1.8135782480239868, + "logps/chosen": -152.15426635742188, + "logps/rejected": -147.62945556640625, + "loss": 0.6929523944854736, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.057992029935121536, + "rewards/margins": 0.002844708738848567, + "rewards/rejected": 0.05514732003211975, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 5.510717391967773, + "learning_rate": 6.76e-07, + "logits/chosen": 1.483984351158142, + "logits/rejected": 1.4224226474761963, + "logps/chosen": -167.27133178710938, + "logps/rejected": -140.16891479492188, + "loss": 0.6842909336090088, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0873967856168747, + "rewards/margins": 0.020737329497933388, + "rewards/rejected": 0.06665945053100586, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 4.097748279571533, + "learning_rate": 6.715555555555556e-07, + "logits/chosen": 1.6006208658218384, + "logits/rejected": 1.6803340911865234, + "logps/chosen": -163.8089141845703, + "logps/rejected": -167.1127166748047, + "loss": 0.6833163261413574, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.07443337142467499, + "rewards/margins": 0.023210588842630386, + "rewards/rejected": 0.0512227788567543, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 3.959730625152588, + "learning_rate": 6.67111111111111e-07, + "logits/chosen": 1.5806870460510254, + "logits/rejected": 1.5673385858535767, + "logps/chosen": -148.22584533691406, + "logps/rejected": -123.79376220703125, + "loss": 0.6892601490020752, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.07928630709648132, + "rewards/margins": 0.009630966000258923, + "rewards/rejected": 0.06965534389019012, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.522994875907898, + "eval_logits/rejected": 1.5511444807052612, + "eval_logps/chosen": -152.6211395263672, + "eval_logps/rejected": -147.31727600097656, + "eval_loss": 0.6889244914054871, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.07080094516277313, + "eval_rewards/margins": 0.012245929799973965, + "eval_rewards/rejected": 0.058555010706186295, + "eval_runtime": 90.2821, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 4.409601211547852, + "learning_rate": 6.626666666666666e-07, + "logits/chosen": 1.6469438076019287, + "logits/rejected": 1.6551824808120728, + "logps/chosen": -153.6597442626953, + "logps/rejected": -140.854248046875, + "loss": 0.6809319496154785, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09710784256458282, + "rewards/margins": 0.026540305465459824, + "rewards/rejected": 0.0705675408244133, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 6.559939384460449, + "learning_rate": 6.582222222222222e-07, + "logits/chosen": 1.5092687606811523, + "logits/rejected": 1.613526701927185, + "logps/chosen": -141.97103881835938, + "logps/rejected": -145.5518035888672, + "loss": 0.6865349292755127, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0591324046254158, + "rewards/margins": 0.017797131091356277, + "rewards/rejected": 0.041335273534059525, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 5.1169047355651855, + "learning_rate": 6.537777777777778e-07, + "logits/chosen": 1.6573750972747803, + "logits/rejected": 1.4447792768478394, + "logps/chosen": -161.51220703125, + "logps/rejected": -135.18309020996094, + "loss": 0.6804659366607666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10412336885929108, + "rewards/margins": 0.02856394089758396, + "rewards/rejected": 0.07555942982435226, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 4.2392072677612305, + "learning_rate": 6.493333333333333e-07, + "logits/chosen": 1.6003319025039673, + "logits/rejected": 1.6346886157989502, + "logps/chosen": -139.08448791503906, + "logps/rejected": -139.89825439453125, + "loss": 0.6737568378448486, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11095432937145233, + "rewards/margins": 0.043066851794719696, + "rewards/rejected": 0.06788748502731323, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 5.133569240570068, + "learning_rate": 6.448888888888889e-07, + "logits/chosen": 1.4209873676300049, + "logits/rejected": 1.5513734817504883, + "logps/chosen": -141.1917266845703, + "logps/rejected": -130.70431518554688, + "loss": 0.6861515998840332, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10248366743326187, + "rewards/margins": 0.016522446647286415, + "rewards/rejected": 0.08596121519804001, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 4.0574445724487305, + "learning_rate": 6.404444444444444e-07, + "logits/chosen": 1.439145803451538, + "logits/rejected": 1.5661519765853882, + "logps/chosen": -111.7681884765625, + "logps/rejected": -126.37353515625, + "loss": 0.6691905975341796, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.15403084456920624, + "rewards/margins": 0.05248977616429329, + "rewards/rejected": 0.10154107958078384, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 4.832082748413086, + "learning_rate": 6.36e-07, + "logits/chosen": 1.4902942180633545, + "logits/rejected": 1.3948299884796143, + "logps/chosen": -153.6760711669922, + "logps/rejected": -147.40023803710938, + "loss": 0.6729560375213623, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1259043663740158, + "rewards/margins": 0.0469796285033226, + "rewards/rejected": 0.0789247453212738, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 3.609558343887329, + "learning_rate": 6.315555555555555e-07, + "logits/chosen": 1.5476783514022827, + "logits/rejected": 1.6365705728530884, + "logps/chosen": -148.82101440429688, + "logps/rejected": -122.3703842163086, + "loss": 0.6743530750274658, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1420261561870575, + "rewards/margins": 0.041455820202827454, + "rewards/rejected": 0.10057034343481064, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 4.052758693695068, + "learning_rate": 6.27111111111111e-07, + "logits/chosen": 1.7934048175811768, + "logits/rejected": 1.7036349773406982, + "logps/chosen": -174.28765869140625, + "logps/rejected": -183.97897338867188, + "loss": 0.6748029708862304, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1439160704612732, + "rewards/margins": 0.0408162847161293, + "rewards/rejected": 0.1030997782945633, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 5.463155269622803, + "learning_rate": 6.226666666666667e-07, + "logits/chosen": 1.5579731464385986, + "logits/rejected": 1.5956088304519653, + "logps/chosen": -138.96115112304688, + "logps/rejected": -152.51766967773438, + "loss": 0.6884016513824462, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11226280778646469, + "rewards/margins": 0.013228577561676502, + "rewards/rejected": 0.09903421252965927, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.5295906066894531, + "eval_logits/rejected": 1.5565518140792847, + "eval_logps/chosen": -152.15892028808594, + "eval_logps/rejected": -146.88125610351562, + "eval_loss": 0.6887561678886414, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": 0.11702151596546173, + "eval_rewards/margins": 0.014860817231237888, + "eval_rewards/rejected": 0.10216069966554642, + "eval_runtime": 90.4847, + "eval_samples_per_second": 5.526, + "eval_steps_per_second": 2.763, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 5.542585372924805, + "learning_rate": 6.182222222222222e-07, + "logits/chosen": 1.6245231628417969, + "logits/rejected": 1.600940465927124, + "logps/chosen": -155.20175170898438, + "logps/rejected": -144.58438110351562, + "loss": 0.6818556308746337, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.1249178797006607, + "rewards/margins": 0.027298670262098312, + "rewards/rejected": 0.0976191908121109, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 3.8583486080169678, + "learning_rate": 6.137777777777778e-07, + "logits/chosen": 1.6029832363128662, + "logits/rejected": 1.652834177017212, + "logps/chosen": -155.6839141845703, + "logps/rejected": -149.71646118164062, + "loss": 0.6816732883453369, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13532081246376038, + "rewards/margins": 0.025635983794927597, + "rewards/rejected": 0.10968482494354248, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 4.535235404968262, + "learning_rate": 6.093333333333332e-07, + "logits/chosen": 1.7116715908050537, + "logits/rejected": 1.5788238048553467, + "logps/chosen": -146.831787109375, + "logps/rejected": -134.20765686035156, + "loss": 0.6897575855255127, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.12892299890518188, + "rewards/margins": 0.011926446110010147, + "rewards/rejected": 0.11699654906988144, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 4.356500148773193, + "learning_rate": 6.048888888888889e-07, + "logits/chosen": 1.6915569305419922, + "logits/rejected": 1.6864850521087646, + "logps/chosen": -154.83705139160156, + "logps/rejected": -141.92440795898438, + "loss": 0.677583646774292, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.13437099754810333, + "rewards/margins": 0.03629336506128311, + "rewards/rejected": 0.09807763993740082, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 4.3424553871154785, + "learning_rate": 6.004444444444444e-07, + "logits/chosen": 1.3488795757293701, + "logits/rejected": 1.3907456398010254, + "logps/chosen": -138.03089904785156, + "logps/rejected": -129.59719848632812, + "loss": 0.6962613582611084, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0907142236828804, + "rewards/margins": 0.0030510523356497288, + "rewards/rejected": 0.08766315877437592, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 4.354366779327393, + "learning_rate": 5.96e-07, + "logits/chosen": 1.6015634536743164, + "logits/rejected": 1.5302627086639404, + "logps/chosen": -125.86180114746094, + "logps/rejected": -113.89128112792969, + "loss": 0.6872058868408203, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11558832228183746, + "rewards/margins": 0.016637511551380157, + "rewards/rejected": 0.09895080327987671, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 6.197093486785889, + "learning_rate": 5.915555555555555e-07, + "logits/chosen": 1.8193966150283813, + "logits/rejected": 1.7454732656478882, + "logps/chosen": -155.8551788330078, + "logps/rejected": -165.95828247070312, + "loss": 0.6903214454650879, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12587358057498932, + "rewards/margins": 0.01157000008970499, + "rewards/rejected": 0.1143035739660263, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 6.9796624183654785, + "learning_rate": 5.871111111111112e-07, + "logits/chosen": 1.7167119979858398, + "logits/rejected": 1.693549394607544, + "logps/chosen": -185.20008850097656, + "logps/rejected": -150.20620727539062, + "loss": 0.6912973880767822, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.1253143846988678, + "rewards/margins": 0.009792610071599483, + "rewards/rejected": 0.11552176624536514, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 4.833356857299805, + "learning_rate": 5.826666666666666e-07, + "logits/chosen": 1.7434288263320923, + "logits/rejected": 1.7330595254898071, + "logps/chosen": -165.2196044921875, + "logps/rejected": -195.1993408203125, + "loss": 0.6812029361724854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11465537548065186, + "rewards/margins": 0.028904888778924942, + "rewards/rejected": 0.08575049787759781, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 4.727373123168945, + "learning_rate": 5.782222222222221e-07, + "logits/chosen": 1.6402454376220703, + "logits/rejected": 1.6071112155914307, + "logps/chosen": -138.0218963623047, + "logps/rejected": -144.56817626953125, + "loss": 0.694274616241455, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.11662209033966064, + "rewards/margins": 0.003029861254617572, + "rewards/rejected": 0.1135922223329544, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.5258080959320068, + "eval_logits/rejected": 1.5528327226638794, + "eval_logps/chosen": -152.21652221679688, + "eval_logps/rejected": -146.95628356933594, + "eval_loss": 0.6881142854690552, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": 0.11126487702131271, + "eval_rewards/margins": 0.01660888083279133, + "eval_rewards/rejected": 0.09465599805116653, + "eval_runtime": 90.2009, + "eval_samples_per_second": 5.543, + "eval_steps_per_second": 2.772, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 4.669800758361816, + "learning_rate": 5.737777777777778e-07, + "logits/chosen": 1.469012975692749, + "logits/rejected": 1.4835999011993408, + "logps/chosen": -153.94541931152344, + "logps/rejected": -140.69659423828125, + "loss": 0.6921857357025146, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.12547791004180908, + "rewards/margins": 0.005948380567133427, + "rewards/rejected": 0.11952953040599823, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 5.349202632904053, + "learning_rate": 5.693333333333333e-07, + "logits/chosen": 1.5056023597717285, + "logits/rejected": 1.4966309070587158, + "logps/chosen": -144.36300659179688, + "logps/rejected": -122.79240417480469, + "loss": 0.695373821258545, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.09502485394477844, + "rewards/margins": -0.0005314469453878701, + "rewards/rejected": 0.09555630385875702, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 5.629171848297119, + "learning_rate": 5.648888888888889e-07, + "logits/chosen": 1.5664876699447632, + "logits/rejected": 1.7090803384780884, + "logps/chosen": -136.5504150390625, + "logps/rejected": -148.81802368164062, + "loss": 0.6888413906097413, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13565854728221893, + "rewards/margins": 0.013341712765395641, + "rewards/rejected": 0.12231683731079102, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 5.568618297576904, + "learning_rate": 5.604444444444444e-07, + "logits/chosen": 1.512286901473999, + "logits/rejected": 1.7082159519195557, + "logps/chosen": -143.9463653564453, + "logps/rejected": -146.82052612304688, + "loss": 0.6722752571105957, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.12829402089118958, + "rewards/margins": 0.046781741082668304, + "rewards/rejected": 0.08151227235794067, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 4.786505699157715, + "learning_rate": 5.560000000000001e-07, + "logits/chosen": 1.6615822315216064, + "logits/rejected": 1.7368179559707642, + "logps/chosen": -144.40292358398438, + "logps/rejected": -157.58763122558594, + "loss": 0.6829993724822998, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11200229823589325, + "rewards/margins": 0.026959875598549843, + "rewards/rejected": 0.08504240959882736, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 4.871355056762695, + "learning_rate": 5.515555555555555e-07, + "logits/chosen": 1.581756830215454, + "logits/rejected": 1.643133521080017, + "logps/chosen": -123.44065856933594, + "logps/rejected": -131.16763305664062, + "loss": 0.6869094848632813, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.11280514299869537, + "rewards/margins": 0.016064399853348732, + "rewards/rejected": 0.09674074500799179, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 5.43352746963501, + "learning_rate": 5.471111111111111e-07, + "logits/chosen": 1.8056955337524414, + "logits/rejected": 1.7294971942901611, + "logps/chosen": -159.89236450195312, + "logps/rejected": -152.61293029785156, + "loss": 0.6701028347015381, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1585489809513092, + "rewards/margins": 0.0533718541264534, + "rewards/rejected": 0.1051771491765976, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 5.0044474601745605, + "learning_rate": 5.426666666666666e-07, + "logits/chosen": 1.68048894405365, + "logits/rejected": 1.6845731735229492, + "logps/chosen": -153.21994018554688, + "logps/rejected": -140.06024169921875, + "loss": 0.675870132446289, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11108388751745224, + "rewards/margins": 0.039517562836408615, + "rewards/rejected": 0.07156632840633392, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 4.051770210266113, + "learning_rate": 5.382222222222223e-07, + "logits/chosen": 1.6341949701309204, + "logits/rejected": 1.7052574157714844, + "logps/chosen": -143.89512634277344, + "logps/rejected": -132.5374298095703, + "loss": 0.6808773040771484, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11111323535442352, + "rewards/margins": 0.02908928692340851, + "rewards/rejected": 0.08202396333217621, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 4.0852885246276855, + "learning_rate": 5.337777777777778e-07, + "logits/chosen": 1.701703429222107, + "logits/rejected": 1.5303716659545898, + "logps/chosen": -149.3526611328125, + "logps/rejected": -129.2085418701172, + "loss": 0.6718564987182617, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.13976502418518066, + "rewards/margins": 0.04783231392502785, + "rewards/rejected": 0.09193271398544312, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_logits/chosen": 1.5326013565063477, + "eval_logits/rejected": 1.5598564147949219, + "eval_logps/chosen": -152.15328979492188, + "eval_logps/rejected": -146.90542602539062, + "eval_loss": 0.6874103546142578, + "eval_rewards/accuracies": 0.5460000038146973, + "eval_rewards/chosen": 0.1175844818353653, + "eval_rewards/margins": 0.01784202829003334, + "eval_rewards/rejected": 0.09974244982004166, + "eval_runtime": 90.5137, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 2.762, + "step": 1300 + }, + { + "epoch": 1.048, + "grad_norm": 5.752922534942627, + "learning_rate": 5.293333333333333e-07, + "logits/chosen": 1.7796787023544312, + "logits/rejected": 1.7884852886199951, + "logps/chosen": -164.8948211669922, + "logps/rejected": -159.16610717773438, + "loss": 0.6778300285339356, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.13073134422302246, + "rewards/margins": 0.03521668165922165, + "rewards/rejected": 0.09551465511322021, + "step": 1310 + }, + { + "epoch": 1.056, + "grad_norm": 5.461390018463135, + "learning_rate": 5.248888888888888e-07, + "logits/chosen": 1.4079742431640625, + "logits/rejected": 1.6630131006240845, + "logps/chosen": -128.2792510986328, + "logps/rejected": -150.0707244873047, + "loss": 0.6839999198913574, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.11985810846090317, + "rewards/margins": 0.021619705483317375, + "rewards/rejected": 0.09823839366436005, + "step": 1320 + }, + { + "epoch": 1.064, + "grad_norm": 4.624034881591797, + "learning_rate": 5.204444444444444e-07, + "logits/chosen": 1.727718710899353, + "logits/rejected": 1.7013204097747803, + "logps/chosen": -155.4447479248047, + "logps/rejected": -145.0660858154297, + "loss": 0.6688554286956787, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.14348448812961578, + "rewards/margins": 0.05262491852045059, + "rewards/rejected": 0.09085958451032639, + "step": 1330 + }, + { + "epoch": 1.072, + "grad_norm": 4.8239617347717285, + "learning_rate": 5.16e-07, + "logits/chosen": 1.5751596689224243, + "logits/rejected": 1.5026706457138062, + "logps/chosen": -151.81521606445312, + "logps/rejected": -144.22158813476562, + "loss": 0.6607092380523681, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.15143291652202606, + "rewards/margins": 0.06974462419748306, + "rewards/rejected": 0.0816882774233818, + "step": 1340 + }, + { + "epoch": 1.08, + "grad_norm": 5.160701274871826, + "learning_rate": 5.115555555555555e-07, + "logits/chosen": 1.5843019485473633, + "logits/rejected": 1.4874061346054077, + "logps/chosen": -164.33465576171875, + "logps/rejected": -138.5998077392578, + "loss": 0.658644723892212, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.18924950063228607, + "rewards/margins": 0.07819559425115585, + "rewards/rejected": 0.11105390638113022, + "step": 1350 + }, + { + "epoch": 1.088, + "grad_norm": 3.7558722496032715, + "learning_rate": 5.071111111111111e-07, + "logits/chosen": 1.7361290454864502, + "logits/rejected": 1.7833389043807983, + "logps/chosen": -165.33505249023438, + "logps/rejected": -163.27987670898438, + "loss": 0.675537109375, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13921695947647095, + "rewards/margins": 0.03955959528684616, + "rewards/rejected": 0.09965735673904419, + "step": 1360 + }, + { + "epoch": 1.096, + "grad_norm": 3.7639994621276855, + "learning_rate": 5.026666666666667e-07, + "logits/chosen": 1.5270617008209229, + "logits/rejected": 1.6595666408538818, + "logps/chosen": -130.36978149414062, + "logps/rejected": -127.0807113647461, + "loss": 0.6692055702209473, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.14394144713878632, + "rewards/margins": 0.053458504378795624, + "rewards/rejected": 0.09048295766115189, + "step": 1370 + }, + { + "epoch": 1.104, + "grad_norm": 4.3264946937561035, + "learning_rate": 4.982222222222223e-07, + "logits/chosen": 1.6070820093154907, + "logits/rejected": 1.7583873271942139, + "logps/chosen": -160.203369140625, + "logps/rejected": -165.0500946044922, + "loss": 0.6763527393341064, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1415795534849167, + "rewards/margins": 0.03869130462408066, + "rewards/rejected": 0.10288827121257782, + "step": 1380 + }, + { + "epoch": 1.112, + "grad_norm": 5.288540840148926, + "learning_rate": 4.937777777777777e-07, + "logits/chosen": 1.5944023132324219, + "logits/rejected": 1.5990722179412842, + "logps/chosen": -165.3871307373047, + "logps/rejected": -161.1041717529297, + "loss": 0.6612223148345947, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16650637984275818, + "rewards/margins": 0.07088200747966766, + "rewards/rejected": 0.09562437236309052, + "step": 1390 + }, + { + "epoch": 1.12, + "grad_norm": 4.826308727264404, + "learning_rate": 4.893333333333333e-07, + "logits/chosen": 1.5509014129638672, + "logits/rejected": 1.682037115097046, + "logps/chosen": -151.07647705078125, + "logps/rejected": -160.36459350585938, + "loss": 0.6734431266784668, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1914874017238617, + "rewards/margins": 0.043385379016399384, + "rewards/rejected": 0.1481020450592041, + "step": 1400 + }, + { + "epoch": 1.12, + "eval_logits/chosen": 1.5414153337478638, + "eval_logits/rejected": 1.5681167840957642, + "eval_logps/chosen": -151.83712768554688, + "eval_logps/rejected": -146.61451721191406, + "eval_loss": 0.6868449449539185, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.14920170605182648, + "eval_rewards/margins": 0.020368749275803566, + "eval_rewards/rejected": 0.12883296608924866, + "eval_runtime": 90.3054, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 2.768, + "step": 1400 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 5.3463215827941895, + "learning_rate": 4.848888888888888e-07, + "logits/chosen": 1.7068660259246826, + "logits/rejected": 1.7066301107406616, + "logps/chosen": -144.30587768554688, + "logps/rejected": -148.75234985351562, + "loss": 0.6804617881774903, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.14624662697315216, + "rewards/margins": 0.029874861240386963, + "rewards/rejected": 0.1163717657327652, + "step": 1410 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 6.410695552825928, + "learning_rate": 4.804444444444444e-07, + "logits/chosen": 1.69613778591156, + "logits/rejected": 1.7461612224578857, + "logps/chosen": -134.14889526367188, + "logps/rejected": -149.71217346191406, + "loss": 0.6900940895080566, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.14041271805763245, + "rewards/margins": 0.010535283014178276, + "rewards/rejected": 0.12987744808197021, + "step": 1420 + }, + { + "epoch": 1.144, + "grad_norm": 6.171964168548584, + "learning_rate": 4.76e-07, + "logits/chosen": 1.5641189813613892, + "logits/rejected": 1.3624608516693115, + "logps/chosen": -152.79434204101562, + "logps/rejected": -146.59823608398438, + "loss": 0.6705893516540528, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.16485603153705597, + "rewards/margins": 0.05301935225725174, + "rewards/rejected": 0.11183668673038483, + "step": 1430 + }, + { + "epoch": 1.152, + "grad_norm": 6.058873176574707, + "learning_rate": 4.7155555555555556e-07, + "logits/chosen": 1.554245114326477, + "logits/rejected": 1.5402500629425049, + "logps/chosen": -169.1145782470703, + "logps/rejected": -150.73916625976562, + "loss": 0.6832107543945313, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.13169637322425842, + "rewards/margins": 0.023917924612760544, + "rewards/rejected": 0.10777842998504639, + "step": 1440 + }, + { + "epoch": 1.16, + "grad_norm": 6.125433921813965, + "learning_rate": 4.6711111111111104e-07, + "logits/chosen": 1.5380629301071167, + "logits/rejected": 1.5384807586669922, + "logps/chosen": -152.90679931640625, + "logps/rejected": -147.15878295898438, + "loss": 0.678408432006836, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.14591780304908752, + "rewards/margins": 0.034742556512355804, + "rewards/rejected": 0.11117523908615112, + "step": 1450 + }, + { + "epoch": 1.168, + "grad_norm": 4.146246433258057, + "learning_rate": 4.6266666666666663e-07, + "logits/chosen": 1.6788570880889893, + "logits/rejected": 1.672521948814392, + "logps/chosen": -148.545654296875, + "logps/rejected": -173.09043884277344, + "loss": 0.6805721282958984, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.14106473326683044, + "rewards/margins": 0.03079717420041561, + "rewards/rejected": 0.11026755720376968, + "step": 1460 + }, + { + "epoch": 1.176, + "grad_norm": 4.701725006103516, + "learning_rate": 4.5822222222222216e-07, + "logits/chosen": 1.549036979675293, + "logits/rejected": 1.7567065954208374, + "logps/chosen": -136.66348266601562, + "logps/rejected": -153.68270874023438, + "loss": 0.6799561977386475, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.10922833532094955, + "rewards/margins": 0.03145608678460121, + "rewards/rejected": 0.07777224481105804, + "step": 1470 + }, + { + "epoch": 1.184, + "grad_norm": 4.027674198150635, + "learning_rate": 4.5377777777777775e-07, + "logits/chosen": 1.5800559520721436, + "logits/rejected": 1.6139322519302368, + "logps/chosen": -142.79774475097656, + "logps/rejected": -138.2237548828125, + "loss": 0.6735220909118652, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12952227890491486, + "rewards/margins": 0.044702861458063126, + "rewards/rejected": 0.08481942117214203, + "step": 1480 + }, + { + "epoch": 1.192, + "grad_norm": 4.0694756507873535, + "learning_rate": 4.493333333333333e-07, + "logits/chosen": 1.8074331283569336, + "logits/rejected": 1.7226593494415283, + "logps/chosen": -139.64735412597656, + "logps/rejected": -145.65493774414062, + "loss": 0.6719463348388672, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12084708362817764, + "rewards/margins": 0.04711727052927017, + "rewards/rejected": 0.07372982054948807, + "step": 1490 + }, + { + "epoch": 1.2, + "grad_norm": 3.381568670272827, + "learning_rate": 4.4488888888888887e-07, + "logits/chosen": 1.6288955211639404, + "logits/rejected": 1.710903525352478, + "logps/chosen": -146.3409423828125, + "logps/rejected": -140.67259216308594, + "loss": 0.6622853755950928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14423500001430511, + "rewards/margins": 0.06720604002475739, + "rewards/rejected": 0.07702895998954773, + "step": 1500 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 1.5185505151748657, + "eval_logits/rejected": 1.5454010963439941, + "eval_logps/chosen": -152.3339080810547, + "eval_logps/rejected": -147.08615112304688, + "eval_loss": 0.6876598596572876, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.0995246097445488, + "eval_rewards/margins": 0.017856568098068237, + "eval_rewards/rejected": 0.08166804164648056, + "eval_runtime": 90.2948, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 2.769, + "step": 1500 + }, + { + "epoch": 1.208, + "grad_norm": 4.928984642028809, + "learning_rate": 4.4044444444444445e-07, + "logits/chosen": 1.60953688621521, + "logits/rejected": 1.5581461191177368, + "logps/chosen": -138.63772583007812, + "logps/rejected": -153.8605194091797, + "loss": 0.6731616020202636, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.09628470242023468, + "rewards/margins": 0.046329062432050705, + "rewards/rejected": 0.04995563626289368, + "step": 1510 + }, + { + "epoch": 1.216, + "grad_norm": 5.149722099304199, + "learning_rate": 4.36e-07, + "logits/chosen": 1.4915835857391357, + "logits/rejected": 1.5048246383666992, + "logps/chosen": -141.77896118164062, + "logps/rejected": -151.37413024902344, + "loss": 0.6731313228607178, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.09898792207241058, + "rewards/margins": 0.04597122594714165, + "rewards/rejected": 0.05301668494939804, + "step": 1520 + }, + { + "epoch": 1.224, + "grad_norm": 4.9786505699157715, + "learning_rate": 4.3155555555555557e-07, + "logits/chosen": 1.5681893825531006, + "logits/rejected": 1.602756142616272, + "logps/chosen": -180.0757598876953, + "logps/rejected": -172.3198699951172, + "loss": 0.6719411849975586, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.12683382630348206, + "rewards/margins": 0.048313409090042114, + "rewards/rejected": 0.07852041721343994, + "step": 1530 + }, + { + "epoch": 1.232, + "grad_norm": 5.631436824798584, + "learning_rate": 4.271111111111111e-07, + "logits/chosen": 1.596337080001831, + "logits/rejected": 1.5848333835601807, + "logps/chosen": -165.8964080810547, + "logps/rejected": -152.3248748779297, + "loss": 0.6702041149139404, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10199526697397232, + "rewards/margins": 0.052691929042339325, + "rewards/rejected": 0.049303337931632996, + "step": 1540 + }, + { + "epoch": 1.24, + "grad_norm": 4.669321537017822, + "learning_rate": 4.226666666666667e-07, + "logits/chosen": 1.3906035423278809, + "logits/rejected": 1.64999520778656, + "logps/chosen": -128.77316284179688, + "logps/rejected": -130.9954376220703, + "loss": 0.6716189384460449, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11641445010900497, + "rewards/margins": 0.048517655581235886, + "rewards/rejected": 0.06789680570363998, + "step": 1550 + }, + { + "epoch": 1.248, + "grad_norm": 4.713964462280273, + "learning_rate": 4.1822222222222217e-07, + "logits/chosen": 1.3268989324569702, + "logits/rejected": 1.3860762119293213, + "logps/chosen": -131.51779174804688, + "logps/rejected": -149.37356567382812, + "loss": 0.6767086029052735, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.08011795580387115, + "rewards/margins": 0.03772367164492607, + "rewards/rejected": 0.04239428788423538, + "step": 1560 + }, + { + "epoch": 1.256, + "grad_norm": 4.6052775382995605, + "learning_rate": 4.1377777777777776e-07, + "logits/chosen": 1.6550222635269165, + "logits/rejected": 1.5474750995635986, + "logps/chosen": -119.24078369140625, + "logps/rejected": -144.82949829101562, + "loss": 0.6784813404083252, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0944647416472435, + "rewards/margins": 0.03385248780250549, + "rewards/rejected": 0.06061224266886711, + "step": 1570 + }, + { + "epoch": 1.264, + "grad_norm": 5.244394302368164, + "learning_rate": 4.093333333333333e-07, + "logits/chosen": 1.4200581312179565, + "logits/rejected": 1.524652123451233, + "logps/chosen": -149.26693725585938, + "logps/rejected": -171.10025024414062, + "loss": 0.6608034133911133, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11365441232919693, + "rewards/margins": 0.07590361684560776, + "rewards/rejected": 0.037750788033008575, + "step": 1580 + }, + { + "epoch": 1.272, + "grad_norm": 4.911222457885742, + "learning_rate": 4.048888888888889e-07, + "logits/chosen": 1.685605764389038, + "logits/rejected": 1.758301019668579, + "logps/chosen": -155.50318908691406, + "logps/rejected": -157.9464874267578, + "loss": 0.6672821521759034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0965135246515274, + "rewards/margins": 0.05731017515063286, + "rewards/rejected": 0.03920333832502365, + "step": 1590 + }, + { + "epoch": 1.28, + "grad_norm": 5.968125820159912, + "learning_rate": 4.004444444444444e-07, + "logits/chosen": 1.4200689792633057, + "logits/rejected": 1.4647656679153442, + "logps/chosen": -133.22052001953125, + "logps/rejected": -133.8321533203125, + "loss": 0.6618384838104248, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09781745076179504, + "rewards/margins": 0.07378478348255157, + "rewards/rejected": 0.02403266355395317, + "step": 1600 + }, + { + "epoch": 1.28, + "eval_logits/chosen": 1.501773476600647, + "eval_logits/rejected": 1.5282105207443237, + "eval_logps/chosen": -152.49317932128906, + "eval_logps/rejected": -147.2423553466797, + "eval_loss": 0.6881668567657471, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.08359722793102264, + "eval_rewards/margins": 0.0175489354878664, + "eval_rewards/rejected": 0.0660482868552208, + "eval_runtime": 90.2344, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 2.771, + "step": 1600 + }, + { + "epoch": 1.288, + "grad_norm": 4.5057783126831055, + "learning_rate": 3.96e-07, + "logits/chosen": 1.5745240449905396, + "logits/rejected": 1.6804988384246826, + "logps/chosen": -140.4354248046875, + "logps/rejected": -145.98095703125, + "loss": 0.6589378833770752, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.10474289953708649, + "rewards/margins": 0.07502223551273346, + "rewards/rejected": 0.029720673337578773, + "step": 1610 + }, + { + "epoch": 1.296, + "grad_norm": 4.290356159210205, + "learning_rate": 3.9155555555555553e-07, + "logits/chosen": 1.472800850868225, + "logits/rejected": 1.5744824409484863, + "logps/chosen": -155.6610107421875, + "logps/rejected": -159.48797607421875, + "loss": 0.6584812641143799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.12055446207523346, + "rewards/margins": 0.07605487108230591, + "rewards/rejected": 0.04449959844350815, + "step": 1620 + }, + { + "epoch": 1.304, + "grad_norm": 6.041454792022705, + "learning_rate": 3.871111111111111e-07, + "logits/chosen": 1.6146243810653687, + "logits/rejected": 1.5308036804199219, + "logps/chosen": -138.68113708496094, + "logps/rejected": -167.34625244140625, + "loss": 0.675632095336914, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.09587542712688446, + "rewards/margins": 0.0477941557765007, + "rewards/rejected": 0.04808126017451286, + "step": 1630 + }, + { + "epoch": 1.312, + "grad_norm": 4.511700630187988, + "learning_rate": 3.8266666666666665e-07, + "logits/chosen": 1.5492980480194092, + "logits/rejected": 1.4730138778686523, + "logps/chosen": -155.36436462402344, + "logps/rejected": -137.7958221435547, + "loss": 0.670874547958374, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07881785929203033, + "rewards/margins": 0.05401306599378586, + "rewards/rejected": 0.024804789572954178, + "step": 1640 + }, + { + "epoch": 1.32, + "grad_norm": 4.88837194442749, + "learning_rate": 3.7822222222222224e-07, + "logits/chosen": 1.4858735799789429, + "logits/rejected": 1.5098966360092163, + "logps/chosen": -135.9697723388672, + "logps/rejected": -133.1214141845703, + "loss": 0.6685511589050293, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09598390758037567, + "rewards/margins": 0.053858526051044464, + "rewards/rejected": 0.04212538152933121, + "step": 1650 + }, + { + "epoch": 1.328, + "grad_norm": 6.195035934448242, + "learning_rate": 3.7377777777777777e-07, + "logits/chosen": 1.6547601222991943, + "logits/rejected": 1.4603536128997803, + "logps/chosen": -145.89651489257812, + "logps/rejected": -133.7212677001953, + "loss": 0.6622809410095215, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12225265800952911, + "rewards/margins": 0.06909220665693283, + "rewards/rejected": 0.05316043645143509, + "step": 1660 + }, + { + "epoch": 1.336, + "grad_norm": 3.8046131134033203, + "learning_rate": 3.693333333333333e-07, + "logits/chosen": 1.5132381916046143, + "logits/rejected": 1.4982823133468628, + "logps/chosen": -150.82089233398438, + "logps/rejected": -148.30099487304688, + "loss": 0.6719642162322998, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11414922773838043, + "rewards/margins": 0.05024232715368271, + "rewards/rejected": 0.06390689313411713, + "step": 1670 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 4.39717960357666, + "learning_rate": 3.6488888888888884e-07, + "logits/chosen": 1.5617916584014893, + "logits/rejected": 1.682511329650879, + "logps/chosen": -158.3753204345703, + "logps/rejected": -146.37451171875, + "loss": 0.6737432956695557, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.11844941228628159, + "rewards/margins": 0.04444308206439018, + "rewards/rejected": 0.0740063264966011, + "step": 1680 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 5.2524261474609375, + "learning_rate": 3.604444444444444e-07, + "logits/chosen": 1.5695984363555908, + "logits/rejected": 1.610656499862671, + "logps/chosen": -151.31857299804688, + "logps/rejected": -151.91409301757812, + "loss": 0.6611621856689454, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14522945880889893, + "rewards/margins": 0.07014746963977814, + "rewards/rejected": 0.07508201897144318, + "step": 1690 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 4.1307501792907715, + "learning_rate": 3.5599999999999996e-07, + "logits/chosen": 1.483229398727417, + "logits/rejected": 1.510617733001709, + "logps/chosen": -148.93728637695312, + "logps/rejected": -150.54031372070312, + "loss": 0.6745404243469239, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10079844295978546, + "rewards/margins": 0.044623635709285736, + "rewards/rejected": 0.05617480352520943, + "step": 1700 + }, + { + "epoch": 1.3599999999999999, + "eval_logits/chosen": 1.4972316026687622, + "eval_logits/rejected": 1.523227572441101, + "eval_logps/chosen": -152.4461669921875, + "eval_logps/rejected": -147.20535278320312, + "eval_loss": 0.6881544589996338, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.08829746395349503, + "eval_rewards/margins": 0.01854766719043255, + "eval_rewards/rejected": 0.06974979490041733, + "eval_runtime": 2076.8971, + "eval_samples_per_second": 0.241, + "eval_steps_per_second": 0.12, + "step": 1700 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 5.462865352630615, + "learning_rate": 3.5155555555555554e-07, + "logits/chosen": 1.5469191074371338, + "logits/rejected": 1.5952446460723877, + "logps/chosen": -141.61489868164062, + "logps/rejected": -177.96774291992188, + "loss": 0.6638422012329102, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11491324007511139, + "rewards/margins": 0.06743086874485016, + "rewards/rejected": 0.04748237505555153, + "step": 1710 + }, + { + "epoch": 1.376, + "grad_norm": 5.499386310577393, + "learning_rate": 3.471111111111111e-07, + "logits/chosen": 1.5340317487716675, + "logits/rejected": 1.5532381534576416, + "logps/chosen": -147.91448974609375, + "logps/rejected": -149.66990661621094, + "loss": 0.6894158363342285, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07543136924505234, + "rewards/margins": 0.013522538356482983, + "rewards/rejected": 0.061908822506666183, + "step": 1720 + }, + { + "epoch": 1.384, + "grad_norm": 4.518467426300049, + "learning_rate": 3.4266666666666666e-07, + "logits/chosen": 1.6533176898956299, + "logits/rejected": 1.7667814493179321, + "logps/chosen": -159.10305786132812, + "logps/rejected": -158.16673278808594, + "loss": 0.6499703407287598, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14074134826660156, + "rewards/margins": 0.09790968149900436, + "rewards/rejected": 0.042831674218177795, + "step": 1730 + }, + { + "epoch": 1.392, + "grad_norm": 5.075318813323975, + "learning_rate": 3.382222222222222e-07, + "logits/chosen": 1.610741376876831, + "logits/rejected": 1.5383796691894531, + "logps/chosen": -161.1984405517578, + "logps/rejected": -176.49668884277344, + "loss": 0.6678359508514404, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.11577492952346802, + "rewards/margins": 0.05837785452604294, + "rewards/rejected": 0.05739706754684448, + "step": 1740 + }, + { + "epoch": 1.4, + "grad_norm": 5.038666725158691, + "learning_rate": 3.337777777777778e-07, + "logits/chosen": 1.5075687170028687, + "logits/rejected": 1.5709255933761597, + "logps/chosen": -137.76171875, + "logps/rejected": -143.53628540039062, + "loss": 0.6760771751403809, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.10021932423114777, + "rewards/margins": 0.04277960956096649, + "rewards/rejected": 0.05743972212076187, + "step": 1750 + }, + { + "epoch": 1.408, + "grad_norm": 5.631906032562256, + "learning_rate": 3.293333333333333e-07, + "logits/chosen": 1.7244329452514648, + "logits/rejected": 1.8332182168960571, + "logps/chosen": -168.69371032714844, + "logps/rejected": -175.52243041992188, + "loss": 0.6608580589294434, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12150160223245621, + "rewards/margins": 0.07380715757608414, + "rewards/rejected": 0.04769443720579147, + "step": 1760 + }, + { + "epoch": 1.416, + "grad_norm": 5.159383296966553, + "learning_rate": 3.248888888888889e-07, + "logits/chosen": 1.6496635675430298, + "logits/rejected": 1.5933442115783691, + "logps/chosen": -159.64492797851562, + "logps/rejected": -138.42958068847656, + "loss": 0.6662045001983643, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11035974323749542, + "rewards/margins": 0.06124384328722954, + "rewards/rejected": 0.04911590367555618, + "step": 1770 + }, + { + "epoch": 1.424, + "grad_norm": 4.214277744293213, + "learning_rate": 3.204444444444444e-07, + "logits/chosen": 1.5003747940063477, + "logits/rejected": 1.2975776195526123, + "logps/chosen": -149.29920959472656, + "logps/rejected": -149.67440795898438, + "loss": 0.6661914348602295, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.10157595574855804, + "rewards/margins": 0.06117083504796028, + "rewards/rejected": 0.04040512815117836, + "step": 1780 + }, + { + "epoch": 1.432, + "grad_norm": 4.933855056762695, + "learning_rate": 3.1599999999999997e-07, + "logits/chosen": 1.6081959009170532, + "logits/rejected": 1.6354488134384155, + "logps/chosen": -155.25970458984375, + "logps/rejected": -149.46218872070312, + "loss": 0.6604723453521728, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.14066801965236664, + "rewards/margins": 0.07809358835220337, + "rewards/rejected": 0.06257440894842148, + "step": 1790 + }, + { + "epoch": 1.44, + "grad_norm": 5.134099960327148, + "learning_rate": 3.115555555555555e-07, + "logits/chosen": 1.6613690853118896, + "logits/rejected": 1.7091057300567627, + "logps/chosen": -164.70701599121094, + "logps/rejected": -157.0238494873047, + "loss": 0.6582140922546387, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12183503806591034, + "rewards/margins": 0.07757680863142014, + "rewards/rejected": 0.044258248060941696, + "step": 1800 + }, + { + "epoch": 1.44, + "eval_logits/chosen": 1.4925261735916138, + "eval_logits/rejected": 1.5181974172592163, + "eval_logps/chosen": -152.43206787109375, + "eval_logps/rejected": -147.19375610351562, + "eval_loss": 0.6884378790855408, + "eval_rewards/accuracies": 0.5180000066757202, + "eval_rewards/chosen": 0.08970824629068375, + "eval_rewards/margins": 0.018797704949975014, + "eval_rewards/rejected": 0.07091052830219269, + "eval_runtime": 90.3798, + "eval_samples_per_second": 5.532, + "eval_steps_per_second": 2.766, + "step": 1800 + }, + { + "epoch": 1.448, + "grad_norm": 4.911464214324951, + "learning_rate": 3.071111111111111e-07, + "logits/chosen": 1.5579628944396973, + "logits/rejected": 1.4952296018600464, + "logps/chosen": -150.1460418701172, + "logps/rejected": -133.44647216796875, + "loss": 0.6783469200134278, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.1408417820930481, + "rewards/margins": 0.03822038695216179, + "rewards/rejected": 0.1026213988661766, + "step": 1810 + }, + { + "epoch": 1.456, + "grad_norm": 4.400730609893799, + "learning_rate": 3.026666666666666e-07, + "logits/chosen": 1.7586734294891357, + "logits/rejected": 1.6157715320587158, + "logps/chosen": -148.05960083007812, + "logps/rejected": -145.48837280273438, + "loss": 0.6609668731689453, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1438443809747696, + "rewards/margins": 0.07730484008789062, + "rewards/rejected": 0.06653954088687897, + "step": 1820 + }, + { + "epoch": 1.464, + "grad_norm": 4.201021671295166, + "learning_rate": 2.982222222222222e-07, + "logits/chosen": 1.5160504579544067, + "logits/rejected": 1.521612286567688, + "logps/chosen": -162.4706573486328, + "logps/rejected": -141.2475128173828, + "loss": 0.6658499717712403, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.14357298612594604, + "rewards/margins": 0.06365186721086502, + "rewards/rejected": 0.07992113381624222, + "step": 1830 + }, + { + "epoch": 1.472, + "grad_norm": 4.233860015869141, + "learning_rate": 2.937777777777778e-07, + "logits/chosen": 1.5524044036865234, + "logits/rejected": 1.4785308837890625, + "logps/chosen": -147.02334594726562, + "logps/rejected": -139.936279296875, + "loss": 0.6650140762329102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10469619184732437, + "rewards/margins": 0.06748644262552261, + "rewards/rejected": 0.037209756672382355, + "step": 1840 + }, + { + "epoch": 1.48, + "grad_norm": 3.9541995525360107, + "learning_rate": 2.8933333333333333e-07, + "logits/chosen": 1.6252977848052979, + "logits/rejected": 1.674830675125122, + "logps/chosen": -141.0922393798828, + "logps/rejected": -147.82305908203125, + "loss": 0.6732684135437011, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11090433597564697, + "rewards/margins": 0.04710138589143753, + "rewards/rejected": 0.06380295008420944, + "step": 1850 + }, + { + "epoch": 1.488, + "grad_norm": 5.4466471672058105, + "learning_rate": 2.848888888888889e-07, + "logits/chosen": 1.6136564016342163, + "logits/rejected": 1.6527297496795654, + "logps/chosen": -152.11404418945312, + "logps/rejected": -117.46043395996094, + "loss": 0.6697136402130127, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1206357479095459, + "rewards/margins": 0.05328177288174629, + "rewards/rejected": 0.0673539787530899, + "step": 1860 + }, + { + "epoch": 1.496, + "grad_norm": 4.790223121643066, + "learning_rate": 2.8044444444444445e-07, + "logits/chosen": 1.6939268112182617, + "logits/rejected": 1.6044652462005615, + "logps/chosen": -160.64529418945312, + "logps/rejected": -154.56561279296875, + "loss": 0.6666451454162597, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1110156923532486, + "rewards/margins": 0.06152622774243355, + "rewards/rejected": 0.04948946088552475, + "step": 1870 + }, + { + "epoch": 1.504, + "grad_norm": 4.825891494750977, + "learning_rate": 2.7600000000000004e-07, + "logits/chosen": 1.6585792303085327, + "logits/rejected": 1.7031543254852295, + "logps/chosen": -147.00076293945312, + "logps/rejected": -160.07797241210938, + "loss": 0.6680663585662842, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.12477920949459076, + "rewards/margins": 0.06180337816476822, + "rewards/rejected": 0.06297583878040314, + "step": 1880 + }, + { + "epoch": 1.512, + "grad_norm": 4.7607879638671875, + "learning_rate": 2.715555555555555e-07, + "logits/chosen": 1.546024203300476, + "logits/rejected": 1.534863829612732, + "logps/chosen": -159.22036743164062, + "logps/rejected": -157.2753143310547, + "loss": 0.6581204414367676, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1385352909564972, + "rewards/margins": 0.0793139860033989, + "rewards/rejected": 0.0592212975025177, + "step": 1890 + }, + { + "epoch": 1.52, + "grad_norm": 5.009489059448242, + "learning_rate": 2.671111111111111e-07, + "logits/chosen": 1.6235746145248413, + "logits/rejected": 1.749132513999939, + "logps/chosen": -153.6427764892578, + "logps/rejected": -129.64744567871094, + "loss": 0.6795202255249023, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.12408678233623505, + "rewards/margins": 0.0351543165743351, + "rewards/rejected": 0.08893246948719025, + "step": 1900 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 1.494035243988037, + "eval_logits/rejected": 1.5192769765853882, + "eval_logps/chosen": -152.30662536621094, + "eval_logps/rejected": -147.08494567871094, + "eval_loss": 0.688173234462738, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.10225009173154831, + "eval_rewards/margins": 0.020460575819015503, + "eval_rewards/rejected": 0.08178950846195221, + "eval_runtime": 90.2907, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 1900 + }, + { + "epoch": 1.528, + "grad_norm": 4.748835563659668, + "learning_rate": 2.6266666666666664e-07, + "logits/chosen": 1.6174519062042236, + "logits/rejected": 1.5858701467514038, + "logps/chosen": -128.00579833984375, + "logps/rejected": -147.90884399414062, + "loss": 0.6726502418518067, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11665438115596771, + "rewards/margins": 0.04572301730513573, + "rewards/rejected": 0.07093136012554169, + "step": 1910 + }, + { + "epoch": 1.536, + "grad_norm": 4.940587520599365, + "learning_rate": 2.582222222222222e-07, + "logits/chosen": 1.6492948532104492, + "logits/rejected": 1.6153085231781006, + "logps/chosen": -157.8480224609375, + "logps/rejected": -154.41123962402344, + "loss": 0.6731919765472412, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09920360893011093, + "rewards/margins": 0.05006008595228195, + "rewards/rejected": 0.04914351552724838, + "step": 1920 + }, + { + "epoch": 1.544, + "grad_norm": 5.234673500061035, + "learning_rate": 2.5377777777777776e-07, + "logits/chosen": 1.7610938549041748, + "logits/rejected": 1.7243057489395142, + "logps/chosen": -170.20077514648438, + "logps/rejected": -171.45242309570312, + "loss": 0.6730650424957275, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11712609231472015, + "rewards/margins": 0.04995986074209213, + "rewards/rejected": 0.06716623157262802, + "step": 1930 + }, + { + "epoch": 1.552, + "grad_norm": 4.774282455444336, + "learning_rate": 2.493333333333333e-07, + "logits/chosen": 1.5651425123214722, + "logits/rejected": 1.6542021036148071, + "logps/chosen": -155.97225952148438, + "logps/rejected": -151.70547485351562, + "loss": 0.6627981185913085, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1584210842847824, + "rewards/margins": 0.07277830690145493, + "rewards/rejected": 0.08564277738332748, + "step": 1940 + }, + { + "epoch": 1.56, + "grad_norm": 4.362725257873535, + "learning_rate": 2.448888888888889e-07, + "logits/chosen": 1.646812081336975, + "logits/rejected": 1.5191072225570679, + "logps/chosen": -159.91146850585938, + "logps/rejected": -153.39175415039062, + "loss": 0.676693344116211, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1165633574128151, + "rewards/margins": 0.043364234268665314, + "rewards/rejected": 0.07319913059473038, + "step": 1950 + }, + { + "epoch": 1.568, + "grad_norm": 6.111082077026367, + "learning_rate": 2.404444444444444e-07, + "logits/chosen": 1.3722110986709595, + "logits/rejected": 1.4659714698791504, + "logps/chosen": -136.6461639404297, + "logps/rejected": -157.80828857421875, + "loss": 0.6549744606018066, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14050449430942535, + "rewards/margins": 0.09741021692752838, + "rewards/rejected": 0.04309428110718727, + "step": 1960 + }, + { + "epoch": 1.576, + "grad_norm": 4.771009922027588, + "learning_rate": 2.3599999999999997e-07, + "logits/chosen": 1.4619197845458984, + "logits/rejected": 1.6102012395858765, + "logps/chosen": -147.86558532714844, + "logps/rejected": -138.744140625, + "loss": 0.6633531093597412, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11970734596252441, + "rewards/margins": 0.06974931806325912, + "rewards/rejected": 0.04995802417397499, + "step": 1970 + }, + { + "epoch": 1.584, + "grad_norm": 4.225287437438965, + "learning_rate": 2.3155555555555553e-07, + "logits/chosen": 1.570460557937622, + "logits/rejected": 1.6326253414154053, + "logps/chosen": -143.7037353515625, + "logps/rejected": -146.02279663085938, + "loss": 0.6649062156677246, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.13935089111328125, + "rewards/margins": 0.0673290267586708, + "rewards/rejected": 0.07202187180519104, + "step": 1980 + }, + { + "epoch": 1.592, + "grad_norm": 3.609804153442383, + "learning_rate": 2.2711111111111112e-07, + "logits/chosen": 1.3968725204467773, + "logits/rejected": 1.5177648067474365, + "logps/chosen": -115.10206604003906, + "logps/rejected": -121.81781005859375, + "loss": 0.6878479957580567, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.11384274810552597, + "rewards/margins": 0.01629973202943802, + "rewards/rejected": 0.09754300117492676, + "step": 1990 + }, + { + "epoch": 1.6, + "grad_norm": 4.79855489730835, + "learning_rate": 2.2266666666666668e-07, + "logits/chosen": 1.7946197986602783, + "logits/rejected": 1.779532790184021, + "logps/chosen": -142.3692626953125, + "logps/rejected": -148.74429321289062, + "loss": 0.6723564624786377, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11757278442382812, + "rewards/margins": 0.04625899717211723, + "rewards/rejected": 0.0713137835264206, + "step": 2000 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 1.4949489831924438, + "eval_logits/rejected": 1.5201067924499512, + "eval_logps/chosen": -152.3157958984375, + "eval_logps/rejected": -147.0997314453125, + "eval_loss": 0.6879469156265259, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": 0.10133373737335205, + "eval_rewards/margins": 0.021023308858275414, + "eval_rewards/rejected": 0.08031044155359268, + "eval_runtime": 90.2805, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 2000 + }, + { + "epoch": 1.608, + "grad_norm": 6.524471759796143, + "learning_rate": 2.1822222222222224e-07, + "logits/chosen": 1.5327484607696533, + "logits/rejected": 1.6004886627197266, + "logps/chosen": -163.94483947753906, + "logps/rejected": -163.67176818847656, + "loss": 0.6695352554321289, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.16861538589000702, + "rewards/margins": 0.05443992465734482, + "rewards/rejected": 0.1141754612326622, + "step": 2010 + }, + { + "epoch": 1.616, + "grad_norm": 5.132118225097656, + "learning_rate": 2.1377777777777777e-07, + "logits/chosen": 1.4321272373199463, + "logits/rejected": 1.5530569553375244, + "logps/chosen": -142.76473999023438, + "logps/rejected": -157.69161987304688, + "loss": 0.6764227390289307, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.15017978847026825, + "rewards/margins": 0.04101533442735672, + "rewards/rejected": 0.10916446149349213, + "step": 2020 + }, + { + "epoch": 1.624, + "grad_norm": 5.326941967010498, + "learning_rate": 2.0933333333333333e-07, + "logits/chosen": 1.586363673210144, + "logits/rejected": 1.6176955699920654, + "logps/chosen": -149.9454803466797, + "logps/rejected": -126.28958892822266, + "loss": 0.6754706859588623, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.13434644043445587, + "rewards/margins": 0.04313293471932411, + "rewards/rejected": 0.09121349453926086, + "step": 2030 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.685925483703613, + "learning_rate": 2.048888888888889e-07, + "logits/chosen": 1.581106424331665, + "logits/rejected": 1.4789012670516968, + "logps/chosen": -156.9187469482422, + "logps/rejected": -159.40908813476562, + "loss": 0.6753536224365234, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10917127132415771, + "rewards/margins": 0.04896850138902664, + "rewards/rejected": 0.06020277738571167, + "step": 2040 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 4.94968318939209, + "learning_rate": 2.0044444444444445e-07, + "logits/chosen": 1.642589807510376, + "logits/rejected": 1.6963341236114502, + "logps/chosen": -155.6983642578125, + "logps/rejected": -156.7867431640625, + "loss": 0.6885885238647461, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.12209658324718475, + "rewards/margins": 0.016716431826353073, + "rewards/rejected": 0.10538016259670258, + "step": 2050 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 4.533969879150391, + "learning_rate": 1.96e-07, + "logits/chosen": 1.5925997495651245, + "logits/rejected": 1.4471170902252197, + "logps/chosen": -118.26934814453125, + "logps/rejected": -143.85903930664062, + "loss": 0.6437589168548584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.14988769590854645, + "rewards/margins": 0.11566541343927383, + "rewards/rejected": 0.03422228619456291, + "step": 2060 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 4.945888519287109, + "learning_rate": 1.9155555555555554e-07, + "logits/chosen": 1.6543266773223877, + "logits/rejected": 1.5584715604782104, + "logps/chosen": -156.47389221191406, + "logps/rejected": -139.50567626953125, + "loss": 0.6912104606628418, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.11081911623477936, + "rewards/margins": 0.010200846008956432, + "rewards/rejected": 0.10061826556921005, + "step": 2070 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 4.676812648773193, + "learning_rate": 1.871111111111111e-07, + "logits/chosen": 1.6702083349227905, + "logits/rejected": 1.585174322128296, + "logps/chosen": -158.23495483398438, + "logps/rejected": -148.28843688964844, + "loss": 0.6550227165222168, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.16143694519996643, + "rewards/margins": 0.08484812080860138, + "rewards/rejected": 0.07658880203962326, + "step": 2080 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 4.585787773132324, + "learning_rate": 1.8266666666666666e-07, + "logits/chosen": 1.5290788412094116, + "logits/rejected": 1.421924114227295, + "logps/chosen": -152.25558471679688, + "logps/rejected": -153.72178649902344, + "loss": 0.6806746482849121, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.11421312391757965, + "rewards/margins": 0.03305097296833992, + "rewards/rejected": 0.08116213977336884, + "step": 2090 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 4.75665283203125, + "learning_rate": 1.7822222222222222e-07, + "logits/chosen": 1.7162408828735352, + "logits/rejected": 1.6277885437011719, + "logps/chosen": -163.5832977294922, + "logps/rejected": -148.44378662109375, + "loss": 0.6706692218780518, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.17963996529579163, + "rewards/margins": 0.05455024167895317, + "rewards/rejected": 0.12508971989154816, + "step": 2100 + }, + { + "epoch": 1.6800000000000002, + "eval_logits/chosen": 1.5044771432876587, + "eval_logits/rejected": 1.5299365520477295, + "eval_logps/chosen": -152.16799926757812, + "eval_logps/rejected": -146.96514892578125, + "eval_loss": 0.6872401237487793, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": 0.11611522734165192, + "eval_rewards/margins": 0.022344600409269333, + "eval_rewards/rejected": 0.09377063810825348, + "eval_runtime": 90.1889, + "eval_samples_per_second": 5.544, + "eval_steps_per_second": 2.772, + "step": 2100 + }, + { + "epoch": 1.688, + "grad_norm": 5.027055263519287, + "learning_rate": 1.7377777777777778e-07, + "logits/chosen": 1.550410509109497, + "logits/rejected": 1.491328239440918, + "logps/chosen": -165.79141235351562, + "logps/rejected": -155.51358032226562, + "loss": 0.6698966026306152, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1318608820438385, + "rewards/margins": 0.05974091216921806, + "rewards/rejected": 0.07211998105049133, + "step": 2110 + }, + { + "epoch": 1.696, + "grad_norm": 5.497100353240967, + "learning_rate": 1.6933333333333334e-07, + "logits/chosen": 1.6706756353378296, + "logits/rejected": 1.6537840366363525, + "logps/chosen": -131.91165161132812, + "logps/rejected": -137.20498657226562, + "loss": 0.6716497898101806, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.13379232585430145, + "rewards/margins": 0.05359635502099991, + "rewards/rejected": 0.08019598573446274, + "step": 2120 + }, + { + "epoch": 1.704, + "grad_norm": 8.849369049072266, + "learning_rate": 1.6488888888888887e-07, + "logits/chosen": 1.6430184841156006, + "logits/rejected": 1.6333353519439697, + "logps/chosen": -164.01890563964844, + "logps/rejected": -162.87362670898438, + "loss": 0.6649856090545654, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1533871591091156, + "rewards/margins": 0.069211944937706, + "rewards/rejected": 0.0841752141714096, + "step": 2130 + }, + { + "epoch": 1.712, + "grad_norm": 4.965246677398682, + "learning_rate": 1.6044444444444443e-07, + "logits/chosen": 1.3473514318466187, + "logits/rejected": 1.4646275043487549, + "logps/chosen": -138.31454467773438, + "logps/rejected": -145.73532104492188, + "loss": 0.6746968746185302, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.13360588252544403, + "rewards/margins": 0.043482352048158646, + "rewards/rejected": 0.09012351930141449, + "step": 2140 + }, + { + "epoch": 1.72, + "grad_norm": 5.954956531524658, + "learning_rate": 1.56e-07, + "logits/chosen": 1.67049241065979, + "logits/rejected": 1.7067524194717407, + "logps/chosen": -156.033935546875, + "logps/rejected": -153.3822021484375, + "loss": 0.6812377452850342, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15508751571178436, + "rewards/margins": 0.03148316591978073, + "rewards/rejected": 0.12360434234142303, + "step": 2150 + }, + { + "epoch": 1.728, + "grad_norm": 3.328672170639038, + "learning_rate": 1.5155555555555555e-07, + "logits/chosen": 1.4824349880218506, + "logits/rejected": 1.475740909576416, + "logps/chosen": -136.86688232421875, + "logps/rejected": -138.93450927734375, + "loss": 0.6671774864196778, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12285050004720688, + "rewards/margins": 0.05716937035322189, + "rewards/rejected": 0.06568112224340439, + "step": 2160 + }, + { + "epoch": 1.736, + "grad_norm": 4.780362606048584, + "learning_rate": 1.4711111111111111e-07, + "logits/chosen": 1.5829228162765503, + "logits/rejected": 1.553118348121643, + "logps/chosen": -151.36331176757812, + "logps/rejected": -148.54202270507812, + "loss": 0.661128568649292, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.17800253629684448, + "rewards/margins": 0.07710476219654083, + "rewards/rejected": 0.10089776664972305, + "step": 2170 + }, + { + "epoch": 1.744, + "grad_norm": 4.95118522644043, + "learning_rate": 1.4266666666666665e-07, + "logits/chosen": 1.370544672012329, + "logits/rejected": 1.4272708892822266, + "logps/chosen": -142.42710876464844, + "logps/rejected": -168.6506805419922, + "loss": 0.6768136024475098, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.12570957839488983, + "rewards/margins": 0.04121888428926468, + "rewards/rejected": 0.08449070900678635, + "step": 2180 + }, + { + "epoch": 1.752, + "grad_norm": 4.103761196136475, + "learning_rate": 1.382222222222222e-07, + "logits/chosen": 1.6573619842529297, + "logits/rejected": 1.6316673755645752, + "logps/chosen": -169.3119354248047, + "logps/rejected": -164.17633056640625, + "loss": 0.6686764717102051, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.11812909692525864, + "rewards/margins": 0.06238695979118347, + "rewards/rejected": 0.05574214458465576, + "step": 2190 + }, + { + "epoch": 1.76, + "grad_norm": 4.561870574951172, + "learning_rate": 1.3377777777777777e-07, + "logits/chosen": 1.5772711038589478, + "logits/rejected": 1.3478977680206299, + "logps/chosen": -132.41693115234375, + "logps/rejected": -134.36935424804688, + "loss": 0.6745004653930664, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.15194594860076904, + "rewards/margins": 0.04931178689002991, + "rewards/rejected": 0.10263414680957794, + "step": 2200 + }, + { + "epoch": 1.76, + "eval_logits/chosen": 1.50482976436615, + "eval_logits/rejected": 1.5303761959075928, + "eval_logps/chosen": -152.19476318359375, + "eval_logps/rejected": -146.9914093017578, + "eval_loss": 0.6872152090072632, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": 0.11343776434659958, + "eval_rewards/margins": 0.02229386195540428, + "eval_rewards/rejected": 0.0911439061164856, + "eval_runtime": 90.3738, + "eval_samples_per_second": 5.533, + "eval_steps_per_second": 2.766, + "step": 2200 + }, + { + "epoch": 1.768, + "grad_norm": 5.253188610076904, + "learning_rate": 1.2933333333333333e-07, + "logits/chosen": 1.602463960647583, + "logits/rejected": 1.6365470886230469, + "logps/chosen": -151.4491729736328, + "logps/rejected": -139.6726531982422, + "loss": 0.6595804214477539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1349448263645172, + "rewards/margins": 0.07438500225543976, + "rewards/rejected": 0.06055985763669014, + "step": 2210 + }, + { + "epoch": 1.776, + "grad_norm": 5.627497673034668, + "learning_rate": 1.2488888888888889e-07, + "logits/chosen": 1.5839112997055054, + "logits/rejected": 1.4787819385528564, + "logps/chosen": -134.94741821289062, + "logps/rejected": -163.97610473632812, + "loss": 0.6709693908691406, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.15531916916370392, + "rewards/margins": 0.05516272038221359, + "rewards/rejected": 0.10015644878149033, + "step": 2220 + }, + { + "epoch": 1.784, + "grad_norm": 4.092975616455078, + "learning_rate": 1.2044444444444445e-07, + "logits/chosen": 1.367531657218933, + "logits/rejected": 1.5928542613983154, + "logps/chosen": -128.33090209960938, + "logps/rejected": -130.70223999023438, + "loss": 0.6782794952392578, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.12403953075408936, + "rewards/margins": 0.03512664884328842, + "rewards/rejected": 0.08891288191080093, + "step": 2230 + }, + { + "epoch": 1.792, + "grad_norm": 3.851191997528076, + "learning_rate": 1.16e-07, + "logits/chosen": 1.5335030555725098, + "logits/rejected": 1.6405471563339233, + "logps/chosen": -142.96331787109375, + "logps/rejected": -159.10667419433594, + "loss": 0.675669002532959, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12597718834877014, + "rewards/margins": 0.04118332266807556, + "rewards/rejected": 0.08479384332895279, + "step": 2240 + }, + { + "epoch": 1.8, + "grad_norm": 5.128774166107178, + "learning_rate": 1.1155555555555555e-07, + "logits/chosen": 1.6666786670684814, + "logits/rejected": 1.6352249383926392, + "logps/chosen": -160.48880004882812, + "logps/rejected": -149.51486206054688, + "loss": 0.676695442199707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13730794191360474, + "rewards/margins": 0.03863085061311722, + "rewards/rejected": 0.09867707639932632, + "step": 2250 + }, + { + "epoch": 1.808, + "grad_norm": 5.3966169357299805, + "learning_rate": 1.0711111111111111e-07, + "logits/chosen": 1.480333924293518, + "logits/rejected": 1.597484827041626, + "logps/chosen": -124.16390228271484, + "logps/rejected": -147.24319458007812, + "loss": 0.6672614097595215, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13824795186519623, + "rewards/margins": 0.05965462327003479, + "rewards/rejected": 0.07859332859516144, + "step": 2260 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 5.315938949584961, + "learning_rate": 1.0266666666666666e-07, + "logits/chosen": 1.4455833435058594, + "logits/rejected": 1.517375111579895, + "logps/chosen": -130.0419158935547, + "logps/rejected": -122.60333251953125, + "loss": 0.6733921527862549, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1244196742773056, + "rewards/margins": 0.05231797695159912, + "rewards/rejected": 0.07210170477628708, + "step": 2270 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 5.050210952758789, + "learning_rate": 9.822222222222222e-08, + "logits/chosen": 1.711755394935608, + "logits/rejected": 1.5550199747085571, + "logps/chosen": -165.57754516601562, + "logps/rejected": -143.4929656982422, + "loss": 0.6610856533050538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.12436368316411972, + "rewards/margins": 0.07134710252285004, + "rewards/rejected": 0.05301658436655998, + "step": 2280 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 4.6738457679748535, + "learning_rate": 9.377777777777778e-08, + "logits/chosen": 1.4832783937454224, + "logits/rejected": 1.3279205560684204, + "logps/chosen": -145.33587646484375, + "logps/rejected": -130.48712158203125, + "loss": 0.6677139282226563, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.08904106914997101, + "rewards/margins": 0.06454581767320633, + "rewards/rejected": 0.024495262652635574, + "step": 2290 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 4.347452163696289, + "learning_rate": 8.933333333333333e-08, + "logits/chosen": 1.5897009372711182, + "logits/rejected": 1.5665786266326904, + "logps/chosen": -156.43614196777344, + "logps/rejected": -140.20364379882812, + "loss": 0.684497880935669, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11093449592590332, + "rewards/margins": 0.023888718336820602, + "rewards/rejected": 0.08704578131437302, + "step": 2300 + }, + { + "epoch": 1.8399999999999999, + "eval_logits/chosen": 1.5041522979736328, + "eval_logits/rejected": 1.5298234224319458, + "eval_logps/chosen": -152.20681762695312, + "eval_logps/rejected": -147.0045166015625, + "eval_loss": 0.687169075012207, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": 0.11223345249891281, + "eval_rewards/margins": 0.022400878369808197, + "eval_rewards/rejected": 0.08983256667852402, + "eval_runtime": 90.2573, + "eval_samples_per_second": 5.54, + "eval_steps_per_second": 2.77, + "step": 2300 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 4.292654991149902, + "learning_rate": 8.488888888888889e-08, + "logits/chosen": 1.7366397380828857, + "logits/rejected": 1.7629550695419312, + "logps/chosen": -165.0113067626953, + "logps/rejected": -153.10745239257812, + "loss": 0.6697476387023926, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.18020913004875183, + "rewards/margins": 0.053330183029174805, + "rewards/rejected": 0.12687894701957703, + "step": 2310 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 4.576894760131836, + "learning_rate": 8.044444444444445e-08, + "logits/chosen": 1.6899020671844482, + "logits/rejected": 1.5166491270065308, + "logps/chosen": -138.1096649169922, + "logps/rejected": -125.38822174072266, + "loss": 0.6785052299499512, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.13225102424621582, + "rewards/margins": 0.03827885538339615, + "rewards/rejected": 0.09397216141223907, + "step": 2320 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 5.70767068862915, + "learning_rate": 7.599999999999999e-08, + "logits/chosen": 1.5233170986175537, + "logits/rejected": 1.5336878299713135, + "logps/chosen": -144.0012969970703, + "logps/rejected": -156.4627685546875, + "loss": 0.670370626449585, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.12371464818716049, + "rewards/margins": 0.05864514783024788, + "rewards/rejected": 0.0650695189833641, + "step": 2330 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 4.895171642303467, + "learning_rate": 7.155555555555555e-08, + "logits/chosen": 1.4882056713104248, + "logits/rejected": 1.4597115516662598, + "logps/chosen": -166.4456329345703, + "logps/rejected": -156.53567504882812, + "loss": 0.6758975505828857, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1247052326798439, + "rewards/margins": 0.041960421949625015, + "rewards/rejected": 0.08274482190608978, + "step": 2340 + }, + { + "epoch": 1.88, + "grad_norm": 5.97705602645874, + "learning_rate": 6.71111111111111e-08, + "logits/chosen": 1.6275503635406494, + "logits/rejected": 1.791870355606079, + "logps/chosen": -146.13681030273438, + "logps/rejected": -155.450439453125, + "loss": 0.6827160358428955, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10786732286214828, + "rewards/margins": 0.030300844460725784, + "rewards/rejected": 0.0775664821267128, + "step": 2350 + }, + { + "epoch": 1.888, + "grad_norm": 5.01773738861084, + "learning_rate": 6.266666666666666e-08, + "logits/chosen": 1.3919562101364136, + "logits/rejected": 1.3836013078689575, + "logps/chosen": -160.868408203125, + "logps/rejected": -152.0878448486328, + "loss": 0.6812721729278565, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.11709713935852051, + "rewards/margins": 0.03227861970663071, + "rewards/rejected": 0.0848185196518898, + "step": 2360 + }, + { + "epoch": 1.896, + "grad_norm": 4.818210601806641, + "learning_rate": 5.822222222222222e-08, + "logits/chosen": 1.6691443920135498, + "logits/rejected": 1.642247200012207, + "logps/chosen": -179.80433654785156, + "logps/rejected": -163.50428771972656, + "loss": 0.6700118541717529, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.14921119809150696, + "rewards/margins": 0.060837097465991974, + "rewards/rejected": 0.08837412297725677, + "step": 2370 + }, + { + "epoch": 1.904, + "grad_norm": 4.891122817993164, + "learning_rate": 5.377777777777778e-08, + "logits/chosen": 1.6384000778198242, + "logits/rejected": 1.6921117305755615, + "logps/chosen": -150.27743530273438, + "logps/rejected": -148.6507568359375, + "loss": 0.6937178611755371, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.11294616758823395, + "rewards/margins": 0.008084317669272423, + "rewards/rejected": 0.10486185550689697, + "step": 2380 + }, + { + "epoch": 1.912, + "grad_norm": 5.576618194580078, + "learning_rate": 4.933333333333333e-08, + "logits/chosen": 1.5864824056625366, + "logits/rejected": 1.502429723739624, + "logps/chosen": -158.6403045654297, + "logps/rejected": -142.84228515625, + "loss": 0.6831792831420899, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12857118248939514, + "rewards/margins": 0.030134152621030807, + "rewards/rejected": 0.09843702614307404, + "step": 2390 + }, + { + "epoch": 1.92, + "grad_norm": 4.503939151763916, + "learning_rate": 4.4888888888888885e-08, + "logits/chosen": 1.499786138534546, + "logits/rejected": 1.6668217182159424, + "logps/chosen": -148.1971893310547, + "logps/rejected": -140.85711669921875, + "loss": 0.6556127071380615, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1358988732099533, + "rewards/margins": 0.08561079949140549, + "rewards/rejected": 0.05028806999325752, + "step": 2400 + }, + { + "epoch": 1.92, + "eval_logits/chosen": 1.5030263662338257, + "eval_logits/rejected": 1.5287361145019531, + "eval_logps/chosen": -152.25106811523438, + "eval_logps/rejected": -147.04556274414062, + "eval_loss": 0.6872794032096863, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.10780756920576096, + "eval_rewards/margins": 0.022079555317759514, + "eval_rewards/rejected": 0.08572802692651749, + "eval_runtime": 90.3547, + "eval_samples_per_second": 5.534, + "eval_steps_per_second": 2.767, + "step": 2400 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2400/training_args.bin b/v5/DPO/DPO_10k/lora/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..104933ebf9c17ba9c2c1c1d39a0d26ccafdfe373 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677b288b67816c6ab7a9dcdd40d26bcb142fa3ad3ad050eaeeb4b73a1ba4b498 +size 6161 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/README.md b/v5/DPO/DPO_10k/lora/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/adapter_config.json b/v5/DPO/DPO_10k/lora/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8436359b1aa944f94290f60b93e89d8644f8843e --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "down_proj", + "k_proj", + "gate_proj", + "up_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/adapter_model.safetensors b/v5/DPO/DPO_10k/lora/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..550ef86a69b75a5e043efc654f693361f22633d1 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0bbd3c3e4c00bee837cf13980be86026bc37eed7182fa43a7b6a34481dc706f +size 180385008 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/chat_template.jinja b/v5/DPO/DPO_10k/lora/checkpoint-2500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/optimizer.pt b/v5/DPO/DPO_10k/lora/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..545b82fb0821fb40f7cc071e1a171b247fe7fab5 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85a16bd9a9e3b84f02ff23b0aae6c239155a06a501d1d776ea15eba57fc952a +size 360902475 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/rng_state.pth b/v5/DPO/DPO_10k/lora/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..68c0411dd375a388cbc8c58bea912cb904778ab8 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1786ad2057a678cc204dadc7fc5d1a4f939be477df219f770c7d40e9270281 +size 14645 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/scaler.pt b/v5/DPO/DPO_10k/lora/checkpoint-2500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9eed5b41b07271f49ef79494a16894ef3dc21e5 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a4f7df58dbec84710c280fe8ef7660bf055a3d051df5a58eccd60f22b169583 +size 1383 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/scheduler.pt b/v5/DPO/DPO_10k/lora/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9e078b4de5e3a5a2e3d63160cd9ba4e6afc6fd1 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7154cc492a37bc9bb0e6d09325e786c916142f1f119ecd183a60eecde4f9f283 +size 1465 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/tokenizer.json b/v5/DPO/DPO_10k/lora/checkpoint-2500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/tokenizer_config.json b/v5/DPO/DPO_10k/lora/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/trainer_state.json b/v5/DPO/DPO_10k/lora/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6338ae74ce7a9df2fb94228534225befb96998fe --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/trainer_state.json @@ -0,0 +1,4184 @@ +{ + "best_global_step": 1300, + "best_metric": 0.5460000038146973, + "best_model_checkpoint": "output/lora/checkpoint-1300", + "epoch": 2.0, + "eval_steps": 100, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 4.452983379364014, + "learning_rate": 3.6e-08, + "logits/chosen": 1.5510008335113525, + "logits/rejected": 1.5244438648223877, + "logps/chosen": -131.24708557128906, + "logps/rejected": -146.8297576904297, + "loss": 0.6932957172393799, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.00019072293071076274, + "rewards/margins": -0.00029331922996789217, + "rewards/rejected": 0.0001025962847052142, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 5.203515529632568, + "learning_rate": 7.599999999999999e-08, + "logits/chosen": 1.6611576080322266, + "logits/rejected": 1.6220839023590088, + "logps/chosen": -156.2080078125, + "logps/rejected": -142.72964477539062, + "loss": 0.6937986850738526, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.0006145072402432561, + "rewards/margins": -0.0012890815269201994, + "rewards/rejected": 0.000674574519507587, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 4.087289810180664, + "learning_rate": 1.16e-07, + "logits/chosen": 1.9773778915405273, + "logits/rejected": 1.8304665088653564, + "logps/chosen": -163.54708862304688, + "logps/rejected": -157.88926696777344, + "loss": 0.6931437492370606, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0003584886435419321, + "rewards/margins": 2.1700874640373513e-05, + "rewards/rejected": 0.00033678775071166456, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 4.74172830581665, + "learning_rate": 1.56e-07, + "logits/chosen": 1.5896285772323608, + "logits/rejected": 1.7109922170639038, + "logps/chosen": -144.44276428222656, + "logps/rejected": -133.09629821777344, + "loss": 0.6932558059692383, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.00048673158744350076, + "rewards/margins": -0.00020531899644993246, + "rewards/rejected": 0.0006920504383742809, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 4.32133150100708, + "learning_rate": 1.96e-07, + "logits/chosen": 1.5152148008346558, + "logits/rejected": 1.585367202758789, + "logps/chosen": -131.73226928710938, + "logps/rejected": -136.8301239013672, + "loss": 0.6930522918701172, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0008355070021934807, + "rewards/margins": 0.00019948731642216444, + "rewards/rejected": 0.0006360196857713163, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 3.915316343307495, + "learning_rate": 2.3599999999999997e-07, + "logits/chosen": 1.5138778686523438, + "logits/rejected": 1.4824903011322021, + "logps/chosen": -136.43399047851562, + "logps/rejected": -126.70623779296875, + "loss": 0.6929163455963134, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.000302538916002959, + "rewards/margins": 0.00047383070341311395, + "rewards/rejected": -0.00017129186016973108, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 4.329769134521484, + "learning_rate": 2.7600000000000004e-07, + "logits/chosen": 1.6920913457870483, + "logits/rejected": 1.8169019222259521, + "logps/chosen": -152.056640625, + "logps/rejected": -155.9404296875, + "loss": 0.6935575008392334, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0025545789394527674, + "rewards/margins": -0.0008073424687609076, + "rewards/rejected": 0.003361921291798353, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 3.964193344116211, + "learning_rate": 3.1599999999999997e-07, + "logits/chosen": 1.5645431280136108, + "logits/rejected": 1.5879082679748535, + "logps/chosen": -147.78839111328125, + "logps/rejected": -135.19906616210938, + "loss": 0.6925086498260498, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.006093275733292103, + "rewards/margins": 0.0012893510283902287, + "rewards/rejected": 0.004803924821317196, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 3.935694694519043, + "learning_rate": 3.5599999999999996e-07, + "logits/chosen": 1.5960246324539185, + "logits/rejected": 1.6901094913482666, + "logps/chosen": -157.85256958007812, + "logps/rejected": -150.51974487304688, + "loss": 0.6931850433349609, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.010219026356935501, + "rewards/margins": -5.4271204135147855e-05, + "rewards/rejected": 0.010273297317326069, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 5.13019323348999, + "learning_rate": 3.96e-07, + "logits/chosen": 1.5446935892105103, + "logits/rejected": 1.6452451944351196, + "logps/chosen": -149.88038635253906, + "logps/rejected": -169.9078826904297, + "loss": 0.6935123443603516, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011777262203395367, + "rewards/margins": -0.0007071519503369927, + "rewards/rejected": 0.0124844154343009, + "step": 100 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.5421667098999023, + "eval_logits/rejected": 1.5734084844589233, + "eval_logps/chosen": -153.21649169921875, + "eval_logps/rejected": -147.7952117919922, + "eval_loss": 0.6929068565368652, + "eval_rewards/accuracies": 0.5180000066757202, + "eval_rewards/chosen": 0.011265883222222328, + "eval_rewards/margins": 0.0005034058121964335, + "eval_rewards/rejected": 0.010762478224933147, + "eval_runtime": 90.2131, + "eval_samples_per_second": 5.542, + "eval_steps_per_second": 2.771, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 5.695896625518799, + "learning_rate": 4.36e-07, + "logits/chosen": 1.781393051147461, + "logits/rejected": 1.7461833953857422, + "logps/chosen": -172.24188232421875, + "logps/rejected": -154.40878295898438, + "loss": 0.6922736167907715, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.014022141695022583, + "rewards/margins": 0.001777560799382627, + "rewards/rejected": 0.012244580313563347, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 4.398581027984619, + "learning_rate": 4.76e-07, + "logits/chosen": 1.611268401145935, + "logits/rejected": 1.6106624603271484, + "logps/chosen": -135.13426208496094, + "logps/rejected": -139.7284393310547, + "loss": 0.6927696228027344, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.014869053848087788, + "rewards/margins": 0.0007819391903467476, + "rewards/rejected": 0.014087115414440632, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 5.19202995300293, + "learning_rate": 5.16e-07, + "logits/chosen": 1.5615273714065552, + "logits/rejected": 1.7724416255950928, + "logps/chosen": -157.66746520996094, + "logps/rejected": -161.90391540527344, + "loss": 0.6928309917449951, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.013302234932780266, + "rewards/margins": 0.0006531739491038024, + "rewards/rejected": 0.012649061158299446, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 3.4575726985931396, + "learning_rate": 5.560000000000001e-07, + "logits/chosen": 1.492018699645996, + "logits/rejected": 1.5187314748764038, + "logps/chosen": -131.4152374267578, + "logps/rejected": -125.62705993652344, + "loss": 0.6929276943206787, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.014355423860251904, + "rewards/margins": 0.000462935131508857, + "rewards/rejected": 0.013892488554120064, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 4.500187397003174, + "learning_rate": 5.96e-07, + "logits/chosen": 1.5862048864364624, + "logits/rejected": 1.6784181594848633, + "logps/chosen": -163.6667938232422, + "logps/rejected": -157.76402282714844, + "loss": 0.6910766124725342, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.018648523837327957, + "rewards/margins": 0.004183619283139706, + "rewards/rejected": 0.014464902691543102, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 4.113079071044922, + "learning_rate": 6.36e-07, + "logits/chosen": 1.7717370986938477, + "logits/rejected": 1.8070589303970337, + "logps/chosen": -158.02734375, + "logps/rejected": -145.92495727539062, + "loss": 0.6927172183990479, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.02493301033973694, + "rewards/margins": 0.0009325124556198716, + "rewards/rejected": 0.024000495672225952, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 5.4983696937561035, + "learning_rate": 6.76e-07, + "logits/chosen": 1.5733931064605713, + "logits/rejected": 1.6008774042129517, + "logps/chosen": -147.1856689453125, + "logps/rejected": -159.93077087402344, + "loss": 0.6926907062530517, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.02877199277281761, + "rewards/margins": 0.0010106085101142526, + "rewards/rejected": 0.027761384844779968, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 4.50191068649292, + "learning_rate": 7.159999999999999e-07, + "logits/chosen": 1.5886398553848267, + "logits/rejected": 1.7399513721466064, + "logps/chosen": -157.6659393310547, + "logps/rejected": -160.65431213378906, + "loss": 0.6925735473632812, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.024526067078113556, + "rewards/margins": 0.0012190367560833693, + "rewards/rejected": 0.02330702915787697, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 4.708652019500732, + "learning_rate": 7.559999999999999e-07, + "logits/chosen": 1.6504443883895874, + "logits/rejected": 1.7761001586914062, + "logps/chosen": -141.34536743164062, + "logps/rejected": -143.41159057617188, + "loss": 0.6928653240203857, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.026504456996917725, + "rewards/margins": 0.0006459927535615861, + "rewards/rejected": 0.02585846558213234, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 4.143187046051025, + "learning_rate": 7.96e-07, + "logits/chosen": 1.7108211517333984, + "logits/rejected": 1.6271438598632812, + "logps/chosen": -158.04931640625, + "logps/rejected": -132.23463439941406, + "loss": 0.6927096843719482, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.02748889848589897, + "rewards/margins": 0.0009879134595394135, + "rewards/rejected": 0.026500985026359558, + "step": 200 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.54364013671875, + "eval_logits/rejected": 1.5745173692703247, + "eval_logps/chosen": -153.04653930664062, + "eval_logps/rejected": -147.63844299316406, + "eval_loss": 0.6923084855079651, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.028259562328457832, + "eval_rewards/margins": 0.0018185621593147516, + "eval_rewards/rejected": 0.026441000401973724, + "eval_runtime": 90.4481, + "eval_samples_per_second": 5.528, + "eval_steps_per_second": 2.764, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 4.559652328491211, + "learning_rate": 8.359999999999999e-07, + "logits/chosen": 1.7295278310775757, + "logits/rejected": 1.6801897287368774, + "logps/chosen": -158.0893096923828, + "logps/rejected": -168.72427368164062, + "loss": 0.6922987461090088, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.02837887406349182, + "rewards/margins": 0.0018027331680059433, + "rewards/rejected": 0.026576142758131027, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 3.907545328140259, + "learning_rate": 8.76e-07, + "logits/chosen": 1.6849712133407593, + "logits/rejected": 1.7441444396972656, + "logps/chosen": -158.67384338378906, + "logps/rejected": -143.02920532226562, + "loss": 0.6933117389678956, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.03286944702267647, + "rewards/margins": -0.0002176974667236209, + "rewards/rejected": 0.03308714181184769, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 3.5083253383636475, + "learning_rate": 9.16e-07, + "logits/chosen": 1.5701725482940674, + "logits/rejected": 1.7182495594024658, + "logps/chosen": -160.56790161132812, + "logps/rejected": -138.05374145507812, + "loss": 0.6915814399719238, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.03840702772140503, + "rewards/margins": 0.0033035180531442165, + "rewards/rejected": 0.035103507339954376, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 4.424270153045654, + "learning_rate": 9.559999999999998e-07, + "logits/chosen": 1.671190857887268, + "logits/rejected": 1.6964585781097412, + "logps/chosen": -170.28260803222656, + "logps/rejected": -144.33534240722656, + "loss": 0.6900368690490722, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0511443130671978, + "rewards/margins": 0.006511001847684383, + "rewards/rejected": 0.04463331401348114, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 4.5393967628479, + "learning_rate": 9.959999999999999e-07, + "logits/chosen": 1.6391950845718384, + "logits/rejected": 1.5815935134887695, + "logps/chosen": -160.45225524902344, + "logps/rejected": -147.56185913085938, + "loss": 0.6940414905548096, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.04959743469953537, + "rewards/margins": -0.001381749869324267, + "rewards/rejected": 0.05097918584942818, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 4.256033897399902, + "learning_rate": 9.959999999999999e-07, + "logits/chosen": 1.5204452276229858, + "logits/rejected": 1.6171140670776367, + "logps/chosen": -131.5397186279297, + "logps/rejected": -145.2186279296875, + "loss": 0.6930822372436524, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.045291412621736526, + "rewards/margins": 0.000405142258387059, + "rewards/rejected": 0.0448862686753273, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 4.027031421661377, + "learning_rate": 9.915555555555556e-07, + "logits/chosen": 1.6407123804092407, + "logits/rejected": 1.7262824773788452, + "logps/chosen": -145.78701782226562, + "logps/rejected": -146.34481811523438, + "loss": 0.6946187496185303, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.04104622080922127, + "rewards/margins": -0.0027089794166386127, + "rewards/rejected": 0.043755196034908295, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 5.568243026733398, + "learning_rate": 9.87111111111111e-07, + "logits/chosen": 1.6697533130645752, + "logits/rejected": 1.5154677629470825, + "logps/chosen": -171.0277099609375, + "logps/rejected": -154.05654907226562, + "loss": 0.6873753070831299, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04779375344514847, + "rewards/margins": 0.011815806850790977, + "rewards/rejected": 0.03597795218229294, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 4.041477680206299, + "learning_rate": 9.826666666666667e-07, + "logits/chosen": 1.6633354425430298, + "logits/rejected": 1.6905081272125244, + "logps/chosen": -142.88864135742188, + "logps/rejected": -152.04757690429688, + "loss": 0.6929869174957275, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.04824609309434891, + "rewards/margins": 0.0005596639821305871, + "rewards/rejected": 0.04768642783164978, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 4.9481635093688965, + "learning_rate": 9.782222222222222e-07, + "logits/chosen": 1.617485761642456, + "logits/rejected": 1.6837307214736938, + "logps/chosen": -152.088134765625, + "logps/rejected": -164.15158081054688, + "loss": 0.6897455215454101, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.057097338140010834, + "rewards/margins": 0.007080497685819864, + "rewards/rejected": 0.050016842782497406, + "step": 300 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.5520602464675903, + "eval_logits/rejected": 1.5826917886734009, + "eval_logps/chosen": -152.8227996826172, + "eval_logps/rejected": -147.43496704101562, + "eval_loss": 0.6914217472076416, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": 0.05063560605049133, + "eval_rewards/margins": 0.0038486982230097055, + "eval_rewards/rejected": 0.04678690433502197, + "eval_runtime": 90.237, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 2.77, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 4.486109256744385, + "learning_rate": 9.737777777777777e-07, + "logits/chosen": 1.7188745737075806, + "logits/rejected": 1.7590553760528564, + "logps/chosen": -140.6877899169922, + "logps/rejected": -155.31893920898438, + "loss": 0.6952545166015625, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": 0.046565137803554535, + "rewards/margins": -0.003889651270583272, + "rewards/rejected": 0.050454795360565186, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 4.374355316162109, + "learning_rate": 9.693333333333334e-07, + "logits/chosen": 1.7103513479232788, + "logits/rejected": 1.7379261255264282, + "logps/chosen": -137.8660888671875, + "logps/rejected": -140.40956115722656, + "loss": 0.692354393005371, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04029277712106705, + "rewards/margins": 0.0018124934285879135, + "rewards/rejected": 0.038480278104543686, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 4.618821144104004, + "learning_rate": 9.648888888888889e-07, + "logits/chosen": 1.5603777170181274, + "logits/rejected": 1.5868213176727295, + "logps/chosen": -157.3379669189453, + "logps/rejected": -182.77377319335938, + "loss": 0.6921377182006836, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.03832743316888809, + "rewards/margins": 0.0022720033302903175, + "rewards/rejected": 0.03605542704463005, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 4.1974406242370605, + "learning_rate": 9.604444444444443e-07, + "logits/chosen": 1.8520517349243164, + "logits/rejected": 1.716774582862854, + "logps/chosen": -158.16363525390625, + "logps/rejected": -149.66162109375, + "loss": 0.692081069946289, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0396885983645916, + "rewards/margins": 0.0023556998930871487, + "rewards/rejected": 0.03733289986848831, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 4.700806140899658, + "learning_rate": 9.559999999999998e-07, + "logits/chosen": 1.6886537075042725, + "logits/rejected": 1.8079423904418945, + "logps/chosen": -165.5438232421875, + "logps/rejected": -194.98428344726562, + "loss": 0.6890993118286133, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05031610652804375, + "rewards/margins": 0.008422226645052433, + "rewards/rejected": 0.041893888264894485, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 3.7786731719970703, + "learning_rate": 9.515555555555555e-07, + "logits/chosen": 1.5483795404434204, + "logits/rejected": 1.4731425046920776, + "logps/chosen": -161.77774047851562, + "logps/rejected": -168.05458068847656, + "loss": 0.6911417007446289, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.050054989755153656, + "rewards/margins": 0.0043535237200558186, + "rewards/rejected": 0.045701466500759125, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 4.362200736999512, + "learning_rate": 9.471111111111111e-07, + "logits/chosen": 1.735099196434021, + "logits/rejected": 1.7567729949951172, + "logps/chosen": -161.30374145507812, + "logps/rejected": -153.4731903076172, + "loss": 0.6882652282714844, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06078929826617241, + "rewards/margins": 0.010189466178417206, + "rewards/rejected": 0.0505998320877552, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 4.35581111907959, + "learning_rate": 9.426666666666666e-07, + "logits/chosen": 1.604020118713379, + "logits/rejected": 1.524717926979065, + "logps/chosen": -141.42324829101562, + "logps/rejected": -151.82521057128906, + "loss": 0.6888412952423095, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.06947806477546692, + "rewards/margins": 0.009350637905299664, + "rewards/rejected": 0.06012742593884468, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 4.360926628112793, + "learning_rate": 9.382222222222222e-07, + "logits/chosen": 1.7075706720352173, + "logits/rejected": 1.6819493770599365, + "logps/chosen": -150.63375854492188, + "logps/rejected": -137.05673217773438, + "loss": 0.6836989879608154, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.08739721775054932, + "rewards/margins": 0.019574418663978577, + "rewards/rejected": 0.06782279908657074, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 4.909813404083252, + "learning_rate": 9.337777777777778e-07, + "logits/chosen": 1.6920162439346313, + "logits/rejected": 1.675100326538086, + "logps/chosen": -154.39663696289062, + "logps/rejected": -147.51455688476562, + "loss": 0.6892420768737793, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08908190578222275, + "rewards/margins": 0.008944114670157433, + "rewards/rejected": 0.08013778924942017, + "step": 400 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5610027313232422, + "eval_logits/rejected": 1.5909091234207153, + "eval_logps/chosen": -152.39871215820312, + "eval_logps/rejected": -147.04049682617188, + "eval_loss": 0.6903401017189026, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": 0.09304190427064896, + "eval_rewards/margins": 0.006809028796851635, + "eval_rewards/rejected": 0.08623287081718445, + "eval_runtime": 90.296, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 2.769, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 4.541158199310303, + "learning_rate": 9.293333333333333e-07, + "logits/chosen": 1.6515496969223022, + "logits/rejected": 1.548688530921936, + "logps/chosen": -147.21546936035156, + "logps/rejected": -187.50816345214844, + "loss": 0.6924624919891358, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.10388608276844025, + "rewards/margins": 0.002456969814375043, + "rewards/rejected": 0.10142910480499268, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 5.246954917907715, + "learning_rate": 9.248888888888888e-07, + "logits/chosen": 1.6460405588150024, + "logits/rejected": 1.6713184118270874, + "logps/chosen": -151.11341857910156, + "logps/rejected": -166.2979736328125, + "loss": 0.6986268043518067, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.09254096448421478, + "rewards/margins": -0.009933066554367542, + "rewards/rejected": 0.10247401893138885, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 4.068811416625977, + "learning_rate": 9.204444444444443e-07, + "logits/chosen": 1.6973702907562256, + "logits/rejected": 1.7605253458023071, + "logps/chosen": -162.7523651123047, + "logps/rejected": -150.79718017578125, + "loss": 0.6896752834320068, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.08682320266962051, + "rewards/margins": 0.007874277420341969, + "rewards/rejected": 0.07894892990589142, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 4.387909412384033, + "learning_rate": 9.16e-07, + "logits/chosen": 1.5456931591033936, + "logits/rejected": 1.4381892681121826, + "logps/chosen": -155.777099609375, + "logps/rejected": -144.95742797851562, + "loss": 0.6881390571594238, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08465877920389175, + "rewards/margins": 0.01117948442697525, + "rewards/rejected": 0.0734792947769165, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 4.3955864906311035, + "learning_rate": 9.115555555555555e-07, + "logits/chosen": 1.7298389673233032, + "logits/rejected": 1.681171178817749, + "logps/chosen": -156.2227783203125, + "logps/rejected": -158.81114196777344, + "loss": 0.685992956161499, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08095243573188782, + "rewards/margins": 0.015426402911543846, + "rewards/rejected": 0.06552603840827942, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 4.6138176918029785, + "learning_rate": 9.071111111111111e-07, + "logits/chosen": 1.6315510272979736, + "logits/rejected": 1.6908462047576904, + "logps/chosen": -150.84512329101562, + "logps/rejected": -163.89492797851562, + "loss": 0.6891673088073731, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07254813611507416, + "rewards/margins": 0.008895034901797771, + "rewards/rejected": 0.06365309655666351, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 4.3172101974487305, + "learning_rate": 9.026666666666665e-07, + "logits/chosen": 1.4395225048065186, + "logits/rejected": 1.4489599466323853, + "logps/chosen": -130.1565399169922, + "logps/rejected": -122.24504089355469, + "loss": 0.6887143135070801, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.08355607837438583, + "rewards/margins": 0.009718736633658409, + "rewards/rejected": 0.07383735477924347, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 4.2122087478637695, + "learning_rate": 8.982222222222222e-07, + "logits/chosen": 1.5334614515304565, + "logits/rejected": 1.5769469738006592, + "logps/chosen": -147.21896362304688, + "logps/rejected": -162.89804077148438, + "loss": 0.6849615573883057, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.10068760812282562, + "rewards/margins": 0.017685385420918465, + "rewards/rejected": 0.0830022394657135, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 4.740354061126709, + "learning_rate": 8.937777777777777e-07, + "logits/chosen": 1.6524379253387451, + "logits/rejected": 1.7100518941879272, + "logps/chosen": -142.10653686523438, + "logps/rejected": -158.3316192626953, + "loss": 0.696216630935669, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.08680267632007599, + "rewards/margins": -0.004664557985961437, + "rewards/rejected": 0.0914672389626503, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 3.6374881267547607, + "learning_rate": 8.893333333333333e-07, + "logits/chosen": 1.518328309059143, + "logits/rejected": 1.6029644012451172, + "logps/chosen": -143.19154357910156, + "logps/rejected": -134.3892059326172, + "loss": 0.6908615589141845, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.06735874712467194, + "rewards/margins": 0.005541653838008642, + "rewards/rejected": 0.06181709095835686, + "step": 500 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.5418404340744019, + "eval_logits/rejected": 1.571341633796692, + "eval_logps/chosen": -152.5870819091797, + "eval_logps/rejected": -147.23146057128906, + "eval_loss": 0.6903930902481079, + "eval_rewards/accuracies": 0.5220000147819519, + "eval_rewards/chosen": 0.07420650124549866, + "eval_rewards/margins": 0.007066408637911081, + "eval_rewards/rejected": 0.06714009493589401, + "eval_runtime": 90.217, + "eval_samples_per_second": 5.542, + "eval_steps_per_second": 2.771, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 4.53076171875, + "learning_rate": 8.848888888888888e-07, + "logits/chosen": 1.6947540044784546, + "logits/rejected": 1.6306483745574951, + "logps/chosen": -130.33372497558594, + "logps/rejected": -139.05648803710938, + "loss": 0.6863756656646729, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.08373989164829254, + "rewards/margins": 0.014761297032237053, + "rewards/rejected": 0.06897859275341034, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 5.064472675323486, + "learning_rate": 8.804444444444445e-07, + "logits/chosen": 1.795907974243164, + "logits/rejected": 1.6805435419082642, + "logps/chosen": -165.10183715820312, + "logps/rejected": -170.87112426757812, + "loss": 0.6915029525756836, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07381857931613922, + "rewards/margins": 0.00408085435628891, + "rewards/rejected": 0.06973771750926971, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 4.472287178039551, + "learning_rate": 8.76e-07, + "logits/chosen": 1.7226626873016357, + "logits/rejected": 1.6465301513671875, + "logps/chosen": -165.50076293945312, + "logps/rejected": -167.12991333007812, + "loss": 0.6784487724304199, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.09387621283531189, + "rewards/margins": 0.031213903799653053, + "rewards/rejected": 0.06266231089830399, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 4.193634033203125, + "learning_rate": 8.715555555555554e-07, + "logits/chosen": 1.7823143005371094, + "logits/rejected": 1.7374283075332642, + "logps/chosen": -180.05233764648438, + "logps/rejected": -157.24835205078125, + "loss": 0.6891638278961182, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10219593346118927, + "rewards/margins": 0.009201721288263798, + "rewards/rejected": 0.09299422055482864, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 5.877465724945068, + "learning_rate": 8.671111111111111e-07, + "logits/chosen": 1.6558294296264648, + "logits/rejected": 1.7549035549163818, + "logps/chosen": -149.97171020507812, + "logps/rejected": -166.52127075195312, + "loss": 0.6909477233886718, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09389887005090714, + "rewards/margins": 0.00655100354924798, + "rewards/rejected": 0.0873478576540947, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 3.9154303073883057, + "learning_rate": 8.626666666666666e-07, + "logits/chosen": 1.7343839406967163, + "logits/rejected": 1.6256252527236938, + "logps/chosen": -153.2657470703125, + "logps/rejected": -137.84548950195312, + "loss": 0.6832056045532227, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.08965932577848434, + "rewards/margins": 0.02167549543082714, + "rewards/rejected": 0.06798382848501205, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 3.569357395172119, + "learning_rate": 8.582222222222222e-07, + "logits/chosen": 1.6020238399505615, + "logits/rejected": 1.5468555688858032, + "logps/chosen": -156.9928741455078, + "logps/rejected": -150.9776153564453, + "loss": 0.6857921123504639, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07846538722515106, + "rewards/margins": 0.016974106431007385, + "rewards/rejected": 0.06149129942059517, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 5.710695266723633, + "learning_rate": 8.537777777777777e-07, + "logits/chosen": 1.4293699264526367, + "logits/rejected": 1.583032250404358, + "logps/chosen": -134.98165893554688, + "logps/rejected": -153.61439514160156, + "loss": 0.6899324417114258, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05123991519212723, + "rewards/margins": 0.008615568280220032, + "rewards/rejected": 0.042624346911907196, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 3.757844924926758, + "learning_rate": 8.493333333333334e-07, + "logits/chosen": 1.5719819068908691, + "logits/rejected": 1.5706799030303955, + "logps/chosen": -143.9678955078125, + "logps/rejected": -130.64585876464844, + "loss": 0.6851204395294189, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.07710663974285126, + "rewards/margins": 0.01796458289027214, + "rewards/rejected": 0.059142060577869415, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 5.662181854248047, + "learning_rate": 8.448888888888888e-07, + "logits/chosen": 1.6224644184112549, + "logits/rejected": 1.6623615026474, + "logps/chosen": -130.7429962158203, + "logps/rejected": -157.59295654296875, + "loss": 0.6958520889282227, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0648646205663681, + "rewards/margins": -0.003241670085117221, + "rewards/rejected": 0.0681062787771225, + "step": 600 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.5268914699554443, + "eval_logits/rejected": 1.5556302070617676, + "eval_logps/chosen": -152.5731201171875, + "eval_logps/rejected": -147.22303771972656, + "eval_loss": 0.6907246708869934, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.07560181617736816, + "eval_rewards/margins": 0.007620053365826607, + "eval_rewards/rejected": 0.0679817646741867, + "eval_runtime": 90.3327, + "eval_samples_per_second": 5.535, + "eval_steps_per_second": 2.768, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 3.7953426837921143, + "learning_rate": 8.404444444444444e-07, + "logits/chosen": 1.6380192041397095, + "logits/rejected": 1.6921494007110596, + "logps/chosen": -130.59445190429688, + "logps/rejected": -148.48709106445312, + "loss": 0.6903901100158691, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.08007006347179413, + "rewards/margins": 0.00720745325088501, + "rewards/rejected": 0.07286261022090912, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 7.121775150299072, + "learning_rate": 8.359999999999999e-07, + "logits/chosen": 1.6000845432281494, + "logits/rejected": 1.731951355934143, + "logps/chosen": -154.8905792236328, + "logps/rejected": -166.4490966796875, + "loss": 0.6969138145446777, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.05843223258852959, + "rewards/margins": -0.0054575116373598576, + "rewards/rejected": 0.06388974189758301, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 5.155455589294434, + "learning_rate": 8.315555555555556e-07, + "logits/chosen": 1.6201622486114502, + "logits/rejected": 1.6479911804199219, + "logps/chosen": -165.98980712890625, + "logps/rejected": -145.71644592285156, + "loss": 0.6804090023040772, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0773121565580368, + "rewards/margins": 0.027506589889526367, + "rewards/rejected": 0.049805570393800735, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 4.009693145751953, + "learning_rate": 8.271111111111111e-07, + "logits/chosen": 1.5530269145965576, + "logits/rejected": 1.5585509538650513, + "logps/chosen": -166.77560424804688, + "logps/rejected": -151.09249877929688, + "loss": 0.6879617691040039, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.07863648235797882, + "rewards/margins": 0.014577758498489857, + "rewards/rejected": 0.06405872106552124, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 4.472072601318359, + "learning_rate": 8.226666666666666e-07, + "logits/chosen": 1.662239670753479, + "logits/rejected": 1.6585584878921509, + "logps/chosen": -153.26776123046875, + "logps/rejected": -125.24166107177734, + "loss": 0.6882720470428467, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09423185139894485, + "rewards/margins": 0.011949598789215088, + "rewards/rejected": 0.08228223770856857, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 5.073488712310791, + "learning_rate": 8.182222222222222e-07, + "logits/chosen": 1.6752973794937134, + "logits/rejected": 1.6020495891571045, + "logps/chosen": -150.0669708251953, + "logps/rejected": -131.1305694580078, + "loss": 0.6880992889404297, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.07629784196615219, + "rewards/margins": 0.01174530852586031, + "rewards/rejected": 0.06455253064632416, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 5.938063621520996, + "learning_rate": 8.137777777777777e-07, + "logits/chosen": 1.7563416957855225, + "logits/rejected": 1.5739262104034424, + "logps/chosen": -165.046875, + "logps/rejected": -150.13104248046875, + "loss": 0.6939912796020508, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.09020708501338959, + "rewards/margins": 0.0006308574229478836, + "rewards/rejected": 0.08957622945308685, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 4.362247467041016, + "learning_rate": 8.093333333333333e-07, + "logits/chosen": 1.6460363864898682, + "logits/rejected": 1.6379966735839844, + "logps/chosen": -143.24754333496094, + "logps/rejected": -131.8529815673828, + "loss": 0.6835652351379394, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.10346712917089462, + "rewards/margins": 0.02079077437520027, + "rewards/rejected": 0.08267635107040405, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 4.323369026184082, + "learning_rate": 8.048888888888888e-07, + "logits/chosen": 1.5466216802597046, + "logits/rejected": 1.541775107383728, + "logps/chosen": -171.82138061523438, + "logps/rejected": -158.87603759765625, + "loss": 0.6892048358917237, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0915951356291771, + "rewards/margins": 0.009338131174445152, + "rewards/rejected": 0.08225701004266739, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 5.28114128112793, + "learning_rate": 8.004444444444444e-07, + "logits/chosen": 1.6419496536254883, + "logits/rejected": 1.6641361713409424, + "logps/chosen": -158.20787048339844, + "logps/rejected": -136.3108367919922, + "loss": 0.6985964775085449, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0700095146894455, + "rewards/margins": -0.008601363748311996, + "rewards/rejected": 0.07861088216304779, + "step": 700 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.5386524200439453, + "eval_logits/rejected": 1.5675796270370483, + "eval_logps/chosen": -152.39877319335938, + "eval_logps/rejected": -147.0768585205078, + "eval_loss": 0.6893304586410522, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": 0.09303704649209976, + "eval_rewards/margins": 0.010439171455800533, + "eval_rewards/rejected": 0.08259786665439606, + "eval_runtime": 90.3103, + "eval_samples_per_second": 5.536, + "eval_steps_per_second": 2.768, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 4.867155075073242, + "learning_rate": 7.96e-07, + "logits/chosen": 1.7137393951416016, + "logits/rejected": 1.6643224954605103, + "logps/chosen": -147.054931640625, + "logps/rejected": -162.10067749023438, + "loss": 0.6891860008239746, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11649386584758759, + "rewards/margins": 0.00985223613679409, + "rewards/rejected": 0.10664163529872894, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 4.687198638916016, + "learning_rate": 7.915555555555556e-07, + "logits/chosen": 1.686532974243164, + "logits/rejected": 1.7992823123931885, + "logps/chosen": -138.60238647460938, + "logps/rejected": -134.22702026367188, + "loss": 0.7006660461425781, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.08661777526140213, + "rewards/margins": -0.013613695278763771, + "rewards/rejected": 0.10023146867752075, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 4.63344669342041, + "learning_rate": 7.87111111111111e-07, + "logits/chosen": 1.781561255455017, + "logits/rejected": 1.7561432123184204, + "logps/chosen": -151.60018920898438, + "logps/rejected": -147.93264770507812, + "loss": 0.6958267688751221, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10642895847558975, + "rewards/margins": -0.002830044599249959, + "rewards/rejected": 0.1092589944601059, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 4.5400800704956055, + "learning_rate": 7.826666666666666e-07, + "logits/chosen": 1.670771837234497, + "logits/rejected": 1.5866410732269287, + "logps/chosen": -155.36764526367188, + "logps/rejected": -132.60902404785156, + "loss": 0.6921723842620849, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0910472720861435, + "rewards/margins": 0.003630922408774495, + "rewards/rejected": 0.08741635084152222, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 4.779706954956055, + "learning_rate": 7.782222222222222e-07, + "logits/chosen": 1.531534194946289, + "logits/rejected": 1.5548356771469116, + "logps/chosen": -135.88177490234375, + "logps/rejected": -157.09231567382812, + "loss": 0.6919455528259277, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.09000497311353683, + "rewards/margins": 0.004116452299058437, + "rewards/rejected": 0.08588851988315582, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 5.283969879150391, + "learning_rate": 7.737777777777777e-07, + "logits/chosen": 1.6809686422348022, + "logits/rejected": 1.501511812210083, + "logps/chosen": -137.67315673828125, + "logps/rejected": -128.26022338867188, + "loss": 0.6907838344573974, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.08058954030275345, + "rewards/margins": 0.0064643076620996, + "rewards/rejected": 0.07412523031234741, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 4.341912269592285, + "learning_rate": 7.693333333333333e-07, + "logits/chosen": 1.684203863143921, + "logits/rejected": 1.6489808559417725, + "logps/chosen": -139.82455444335938, + "logps/rejected": -135.16998291015625, + "loss": 0.6793179988861084, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10346569865942001, + "rewards/margins": 0.02992106042802334, + "rewards/rejected": 0.07354463636875153, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 5.209469318389893, + "learning_rate": 7.648888888888888e-07, + "logits/chosen": 1.5599935054779053, + "logits/rejected": 1.6487398147583008, + "logps/chosen": -152.46170043945312, + "logps/rejected": -157.7329559326172, + "loss": 0.6873491287231446, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10296590626239777, + "rewards/margins": 0.01368915755301714, + "rewards/rejected": 0.0892767459154129, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 5.364309310913086, + "learning_rate": 7.604444444444445e-07, + "logits/chosen": 1.5357733964920044, + "logits/rejected": 1.5833505392074585, + "logps/chosen": -146.4203338623047, + "logps/rejected": -149.77499389648438, + "loss": 0.68800368309021, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0752335637807846, + "rewards/margins": 0.01265893317759037, + "rewards/rejected": 0.06257463991641998, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 4.701781272888184, + "learning_rate": 7.559999999999999e-07, + "logits/chosen": 1.7693378925323486, + "logits/rejected": 1.784106969833374, + "logps/chosen": -178.75717163085938, + "logps/rejected": -192.69229125976562, + "loss": 0.7001357078552246, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.07940518856048584, + "rewards/margins": -0.01138945110142231, + "rewards/rejected": 0.0907946228981018, + "step": 800 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.532821536064148, + "eval_logits/rejected": 1.5615730285644531, + "eval_logps/chosen": -152.57040405273438, + "eval_logps/rejected": -147.24534606933594, + "eval_loss": 0.6894002556800842, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.07587439566850662, + "eval_rewards/margins": 0.010124183259904385, + "eval_rewards/rejected": 0.06575021147727966, + "eval_runtime": 90.2864, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 4.731827259063721, + "learning_rate": 7.515555555555555e-07, + "logits/chosen": 1.5014355182647705, + "logits/rejected": 1.706011176109314, + "logps/chosen": -113.23974609375, + "logps/rejected": -150.55316162109375, + "loss": 0.6896101951599121, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.066777303814888, + "rewards/margins": 0.008613836951553822, + "rewards/rejected": 0.05816347524523735, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 4.050163745880127, + "learning_rate": 7.47111111111111e-07, + "logits/chosen": 1.701898217201233, + "logits/rejected": 1.7274971008300781, + "logps/chosen": -147.45330810546875, + "logps/rejected": -140.33255004882812, + "loss": 0.6757836818695069, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.08680602163076401, + "rewards/margins": 0.03675536438822746, + "rewards/rejected": 0.050050657242536545, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 4.168673992156982, + "learning_rate": 7.426666666666667e-07, + "logits/chosen": 1.6135514974594116, + "logits/rejected": 1.6518815755844116, + "logps/chosen": -137.38467407226562, + "logps/rejected": -132.65890502929688, + "loss": 0.6800778865814209, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.07235284894704819, + "rewards/margins": 0.028275374323129654, + "rewards/rejected": 0.04407747834920883, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 4.72458028793335, + "learning_rate": 7.382222222222222e-07, + "logits/chosen": 1.5987484455108643, + "logits/rejected": 1.6328668594360352, + "logps/chosen": -146.712158203125, + "logps/rejected": -156.0950469970703, + "loss": 0.6804145336151123, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07264034450054169, + "rewards/margins": 0.0286283977329731, + "rewards/rejected": 0.04401194304227829, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 4.735199928283691, + "learning_rate": 7.337777777777778e-07, + "logits/chosen": 1.6810247898101807, + "logits/rejected": 1.6662237644195557, + "logps/chosen": -159.40650939941406, + "logps/rejected": -140.65591430664062, + "loss": 0.6805107116699218, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.061445970088243484, + "rewards/margins": 0.02743927761912346, + "rewards/rejected": 0.034006692469120026, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 3.7038252353668213, + "learning_rate": 7.293333333333332e-07, + "logits/chosen": 1.6597106456756592, + "logits/rejected": 1.6951271295547485, + "logps/chosen": -138.1852569580078, + "logps/rejected": -128.6427764892578, + "loss": 0.6821750164031982, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.07160943746566772, + "rewards/margins": 0.023774990811944008, + "rewards/rejected": 0.047834448516368866, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 4.820807456970215, + "learning_rate": 7.248888888888888e-07, + "logits/chosen": 1.5708585977554321, + "logits/rejected": 1.5483477115631104, + "logps/chosen": -152.8867950439453, + "logps/rejected": -152.02584838867188, + "loss": 0.6911486625671387, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04338831081986427, + "rewards/margins": 0.006898392923176289, + "rewards/rejected": 0.03648992255330086, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 4.6849493980407715, + "learning_rate": 7.204444444444444e-07, + "logits/chosen": 1.5262442827224731, + "logits/rejected": 1.7751166820526123, + "logps/chosen": -143.77993774414062, + "logps/rejected": -155.7498016357422, + "loss": 0.6910871028900146, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.061417657881975174, + "rewards/margins": 0.006087464280426502, + "rewards/rejected": 0.055330194532871246, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 4.951540946960449, + "learning_rate": 7.159999999999999e-07, + "logits/chosen": 1.405790090560913, + "logits/rejected": 1.5980100631713867, + "logps/chosen": -147.6872100830078, + "logps/rejected": -160.23947143554688, + "loss": 0.6822467803955078, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.049768321216106415, + "rewards/margins": 0.024227874353528023, + "rewards/rejected": 0.02554045058786869, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 4.255526542663574, + "learning_rate": 7.115555555555556e-07, + "logits/chosen": 1.6527436971664429, + "logits/rejected": 1.787755012512207, + "logps/chosen": -164.73355102539062, + "logps/rejected": -178.98507690429688, + "loss": 0.6861439704895019, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07546674460172653, + "rewards/margins": 0.017860155552625656, + "rewards/rejected": 0.05760659649968147, + "step": 900 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.5091361999511719, + "eval_logits/rejected": 1.5371856689453125, + "eval_logps/chosen": -152.8962860107422, + "eval_logps/rejected": -147.56655883789062, + "eval_loss": 0.690089225769043, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.04328843951225281, + "eval_rewards/margins": 0.009660834446549416, + "eval_rewards/rejected": 0.03362761065363884, + "eval_runtime": 90.3227, + "eval_samples_per_second": 5.536, + "eval_steps_per_second": 2.768, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 4.453512668609619, + "learning_rate": 7.071111111111111e-07, + "logits/chosen": 1.700484037399292, + "logits/rejected": 1.4941186904907227, + "logps/chosen": -138.50682067871094, + "logps/rejected": -137.02490234375, + "loss": 0.6877517700195312, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05909284949302673, + "rewards/margins": 0.013610092923045158, + "rewards/rejected": 0.04548276215791702, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 5.548420429229736, + "learning_rate": 7.026666666666667e-07, + "logits/chosen": 1.409182071685791, + "logits/rejected": 1.375797152519226, + "logps/chosen": -158.7325439453125, + "logps/rejected": -161.9824981689453, + "loss": 0.6867617607116699, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.07141564786434174, + "rewards/margins": 0.015245514921844006, + "rewards/rejected": 0.05617012828588486, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 5.186211109161377, + "learning_rate": 6.982222222222221e-07, + "logits/chosen": 1.6255321502685547, + "logits/rejected": 1.7182047367095947, + "logps/chosen": -156.53213500976562, + "logps/rejected": -160.67556762695312, + "loss": 0.6812005519866944, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.062074560672044754, + "rewards/margins": 0.026883777230978012, + "rewards/rejected": 0.03519078344106674, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 4.044335842132568, + "learning_rate": 6.937777777777778e-07, + "logits/chosen": 1.6656415462493896, + "logits/rejected": 1.7865594625473022, + "logps/chosen": -147.00344848632812, + "logps/rejected": -173.11428833007812, + "loss": 0.690484619140625, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0269255880266428, + "rewards/margins": 0.007569611072540283, + "rewards/rejected": 0.019355975091457367, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 4.8925299644470215, + "learning_rate": 6.893333333333333e-07, + "logits/chosen": 1.4856427907943726, + "logits/rejected": 1.5664136409759521, + "logps/chosen": -139.77938842773438, + "logps/rejected": -152.74557495117188, + "loss": 0.6817938804626464, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05857279896736145, + "rewards/margins": 0.02614629827439785, + "rewards/rejected": 0.03242649883031845, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 4.514585018157959, + "learning_rate": 6.848888888888889e-07, + "logits/chosen": 1.6077378988265991, + "logits/rejected": 1.4770267009735107, + "logps/chosen": -134.602294921875, + "logps/rejected": -117.98567199707031, + "loss": 0.694350004196167, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.028881916776299477, + "rewards/margins": 0.00047348294174298644, + "rewards/rejected": 0.028408434242010117, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 3.9295125007629395, + "learning_rate": 6.804444444444444e-07, + "logits/chosen": 1.7124595642089844, + "logits/rejected": 1.8135782480239868, + "logps/chosen": -152.15426635742188, + "logps/rejected": -147.62945556640625, + "loss": 0.6929523944854736, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.057992029935121536, + "rewards/margins": 0.002844708738848567, + "rewards/rejected": 0.05514732003211975, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 5.510717391967773, + "learning_rate": 6.76e-07, + "logits/chosen": 1.483984351158142, + "logits/rejected": 1.4224226474761963, + "logps/chosen": -167.27133178710938, + "logps/rejected": -140.16891479492188, + "loss": 0.6842909336090088, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0873967856168747, + "rewards/margins": 0.020737329497933388, + "rewards/rejected": 0.06665945053100586, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 4.097748279571533, + "learning_rate": 6.715555555555556e-07, + "logits/chosen": 1.6006208658218384, + "logits/rejected": 1.6803340911865234, + "logps/chosen": -163.8089141845703, + "logps/rejected": -167.1127166748047, + "loss": 0.6833163261413574, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.07443337142467499, + "rewards/margins": 0.023210588842630386, + "rewards/rejected": 0.0512227788567543, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 3.959730625152588, + "learning_rate": 6.67111111111111e-07, + "logits/chosen": 1.5806870460510254, + "logits/rejected": 1.5673385858535767, + "logps/chosen": -148.22584533691406, + "logps/rejected": -123.79376220703125, + "loss": 0.6892601490020752, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.07928630709648132, + "rewards/margins": 0.009630966000258923, + "rewards/rejected": 0.06965534389019012, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.522994875907898, + "eval_logits/rejected": 1.5511444807052612, + "eval_logps/chosen": -152.6211395263672, + "eval_logps/rejected": -147.31727600097656, + "eval_loss": 0.6889244914054871, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.07080094516277313, + "eval_rewards/margins": 0.012245929799973965, + "eval_rewards/rejected": 0.058555010706186295, + "eval_runtime": 90.2821, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 4.409601211547852, + "learning_rate": 6.626666666666666e-07, + "logits/chosen": 1.6469438076019287, + "logits/rejected": 1.6551824808120728, + "logps/chosen": -153.6597442626953, + "logps/rejected": -140.854248046875, + "loss": 0.6809319496154785, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09710784256458282, + "rewards/margins": 0.026540305465459824, + "rewards/rejected": 0.0705675408244133, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 6.559939384460449, + "learning_rate": 6.582222222222222e-07, + "logits/chosen": 1.5092687606811523, + "logits/rejected": 1.613526701927185, + "logps/chosen": -141.97103881835938, + "logps/rejected": -145.5518035888672, + "loss": 0.6865349292755127, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0591324046254158, + "rewards/margins": 0.017797131091356277, + "rewards/rejected": 0.041335273534059525, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 5.1169047355651855, + "learning_rate": 6.537777777777778e-07, + "logits/chosen": 1.6573750972747803, + "logits/rejected": 1.4447792768478394, + "logps/chosen": -161.51220703125, + "logps/rejected": -135.18309020996094, + "loss": 0.6804659366607666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10412336885929108, + "rewards/margins": 0.02856394089758396, + "rewards/rejected": 0.07555942982435226, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 4.2392072677612305, + "learning_rate": 6.493333333333333e-07, + "logits/chosen": 1.6003319025039673, + "logits/rejected": 1.6346886157989502, + "logps/chosen": -139.08448791503906, + "logps/rejected": -139.89825439453125, + "loss": 0.6737568378448486, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11095432937145233, + "rewards/margins": 0.043066851794719696, + "rewards/rejected": 0.06788748502731323, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 5.133569240570068, + "learning_rate": 6.448888888888889e-07, + "logits/chosen": 1.4209873676300049, + "logits/rejected": 1.5513734817504883, + "logps/chosen": -141.1917266845703, + "logps/rejected": -130.70431518554688, + "loss": 0.6861515998840332, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10248366743326187, + "rewards/margins": 0.016522446647286415, + "rewards/rejected": 0.08596121519804001, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 4.0574445724487305, + "learning_rate": 6.404444444444444e-07, + "logits/chosen": 1.439145803451538, + "logits/rejected": 1.5661519765853882, + "logps/chosen": -111.7681884765625, + "logps/rejected": -126.37353515625, + "loss": 0.6691905975341796, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.15403084456920624, + "rewards/margins": 0.05248977616429329, + "rewards/rejected": 0.10154107958078384, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 4.832082748413086, + "learning_rate": 6.36e-07, + "logits/chosen": 1.4902942180633545, + "logits/rejected": 1.3948299884796143, + "logps/chosen": -153.6760711669922, + "logps/rejected": -147.40023803710938, + "loss": 0.6729560375213623, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1259043663740158, + "rewards/margins": 0.0469796285033226, + "rewards/rejected": 0.0789247453212738, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 3.609558343887329, + "learning_rate": 6.315555555555555e-07, + "logits/chosen": 1.5476783514022827, + "logits/rejected": 1.6365705728530884, + "logps/chosen": -148.82101440429688, + "logps/rejected": -122.3703842163086, + "loss": 0.6743530750274658, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1420261561870575, + "rewards/margins": 0.041455820202827454, + "rewards/rejected": 0.10057034343481064, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 4.052758693695068, + "learning_rate": 6.27111111111111e-07, + "logits/chosen": 1.7934048175811768, + "logits/rejected": 1.7036349773406982, + "logps/chosen": -174.28765869140625, + "logps/rejected": -183.97897338867188, + "loss": 0.6748029708862304, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1439160704612732, + "rewards/margins": 0.0408162847161293, + "rewards/rejected": 0.1030997782945633, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 5.463155269622803, + "learning_rate": 6.226666666666667e-07, + "logits/chosen": 1.5579731464385986, + "logits/rejected": 1.5956088304519653, + "logps/chosen": -138.96115112304688, + "logps/rejected": -152.51766967773438, + "loss": 0.6884016513824462, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11226280778646469, + "rewards/margins": 0.013228577561676502, + "rewards/rejected": 0.09903421252965927, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.5295906066894531, + "eval_logits/rejected": 1.5565518140792847, + "eval_logps/chosen": -152.15892028808594, + "eval_logps/rejected": -146.88125610351562, + "eval_loss": 0.6887561678886414, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": 0.11702151596546173, + "eval_rewards/margins": 0.014860817231237888, + "eval_rewards/rejected": 0.10216069966554642, + "eval_runtime": 90.4847, + "eval_samples_per_second": 5.526, + "eval_steps_per_second": 2.763, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 5.542585372924805, + "learning_rate": 6.182222222222222e-07, + "logits/chosen": 1.6245231628417969, + "logits/rejected": 1.600940465927124, + "logps/chosen": -155.20175170898438, + "logps/rejected": -144.58438110351562, + "loss": 0.6818556308746337, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.1249178797006607, + "rewards/margins": 0.027298670262098312, + "rewards/rejected": 0.0976191908121109, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 3.8583486080169678, + "learning_rate": 6.137777777777778e-07, + "logits/chosen": 1.6029832363128662, + "logits/rejected": 1.652834177017212, + "logps/chosen": -155.6839141845703, + "logps/rejected": -149.71646118164062, + "loss": 0.6816732883453369, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13532081246376038, + "rewards/margins": 0.025635983794927597, + "rewards/rejected": 0.10968482494354248, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 4.535235404968262, + "learning_rate": 6.093333333333332e-07, + "logits/chosen": 1.7116715908050537, + "logits/rejected": 1.5788238048553467, + "logps/chosen": -146.831787109375, + "logps/rejected": -134.20765686035156, + "loss": 0.6897575855255127, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.12892299890518188, + "rewards/margins": 0.011926446110010147, + "rewards/rejected": 0.11699654906988144, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 4.356500148773193, + "learning_rate": 6.048888888888889e-07, + "logits/chosen": 1.6915569305419922, + "logits/rejected": 1.6864850521087646, + "logps/chosen": -154.83705139160156, + "logps/rejected": -141.92440795898438, + "loss": 0.677583646774292, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.13437099754810333, + "rewards/margins": 0.03629336506128311, + "rewards/rejected": 0.09807763993740082, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 4.3424553871154785, + "learning_rate": 6.004444444444444e-07, + "logits/chosen": 1.3488795757293701, + "logits/rejected": 1.3907456398010254, + "logps/chosen": -138.03089904785156, + "logps/rejected": -129.59719848632812, + "loss": 0.6962613582611084, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.0907142236828804, + "rewards/margins": 0.0030510523356497288, + "rewards/rejected": 0.08766315877437592, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 4.354366779327393, + "learning_rate": 5.96e-07, + "logits/chosen": 1.6015634536743164, + "logits/rejected": 1.5302627086639404, + "logps/chosen": -125.86180114746094, + "logps/rejected": -113.89128112792969, + "loss": 0.6872058868408203, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11558832228183746, + "rewards/margins": 0.016637511551380157, + "rewards/rejected": 0.09895080327987671, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 6.197093486785889, + "learning_rate": 5.915555555555555e-07, + "logits/chosen": 1.8193966150283813, + "logits/rejected": 1.7454732656478882, + "logps/chosen": -155.8551788330078, + "logps/rejected": -165.95828247070312, + "loss": 0.6903214454650879, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12587358057498932, + "rewards/margins": 0.01157000008970499, + "rewards/rejected": 0.1143035739660263, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 6.9796624183654785, + "learning_rate": 5.871111111111112e-07, + "logits/chosen": 1.7167119979858398, + "logits/rejected": 1.693549394607544, + "logps/chosen": -185.20008850097656, + "logps/rejected": -150.20620727539062, + "loss": 0.6912973880767822, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.1253143846988678, + "rewards/margins": 0.009792610071599483, + "rewards/rejected": 0.11552176624536514, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 4.833356857299805, + "learning_rate": 5.826666666666666e-07, + "logits/chosen": 1.7434288263320923, + "logits/rejected": 1.7330595254898071, + "logps/chosen": -165.2196044921875, + "logps/rejected": -195.1993408203125, + "loss": 0.6812029361724854, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11465537548065186, + "rewards/margins": 0.028904888778924942, + "rewards/rejected": 0.08575049787759781, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 4.727373123168945, + "learning_rate": 5.782222222222221e-07, + "logits/chosen": 1.6402454376220703, + "logits/rejected": 1.6071112155914307, + "logps/chosen": -138.0218963623047, + "logps/rejected": -144.56817626953125, + "loss": 0.694274616241455, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.11662209033966064, + "rewards/margins": 0.003029861254617572, + "rewards/rejected": 0.1135922223329544, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.5258080959320068, + "eval_logits/rejected": 1.5528327226638794, + "eval_logps/chosen": -152.21652221679688, + "eval_logps/rejected": -146.95628356933594, + "eval_loss": 0.6881142854690552, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": 0.11126487702131271, + "eval_rewards/margins": 0.01660888083279133, + "eval_rewards/rejected": 0.09465599805116653, + "eval_runtime": 90.2009, + "eval_samples_per_second": 5.543, + "eval_steps_per_second": 2.772, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 4.669800758361816, + "learning_rate": 5.737777777777778e-07, + "logits/chosen": 1.469012975692749, + "logits/rejected": 1.4835999011993408, + "logps/chosen": -153.94541931152344, + "logps/rejected": -140.69659423828125, + "loss": 0.6921857357025146, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.12547791004180908, + "rewards/margins": 0.005948380567133427, + "rewards/rejected": 0.11952953040599823, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 5.349202632904053, + "learning_rate": 5.693333333333333e-07, + "logits/chosen": 1.5056023597717285, + "logits/rejected": 1.4966309070587158, + "logps/chosen": -144.36300659179688, + "logps/rejected": -122.79240417480469, + "loss": 0.695373821258545, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.09502485394477844, + "rewards/margins": -0.0005314469453878701, + "rewards/rejected": 0.09555630385875702, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 5.629171848297119, + "learning_rate": 5.648888888888889e-07, + "logits/chosen": 1.5664876699447632, + "logits/rejected": 1.7090803384780884, + "logps/chosen": -136.5504150390625, + "logps/rejected": -148.81802368164062, + "loss": 0.6888413906097413, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13565854728221893, + "rewards/margins": 0.013341712765395641, + "rewards/rejected": 0.12231683731079102, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 5.568618297576904, + "learning_rate": 5.604444444444444e-07, + "logits/chosen": 1.512286901473999, + "logits/rejected": 1.7082159519195557, + "logps/chosen": -143.9463653564453, + "logps/rejected": -146.82052612304688, + "loss": 0.6722752571105957, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.12829402089118958, + "rewards/margins": 0.046781741082668304, + "rewards/rejected": 0.08151227235794067, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 4.786505699157715, + "learning_rate": 5.560000000000001e-07, + "logits/chosen": 1.6615822315216064, + "logits/rejected": 1.7368179559707642, + "logps/chosen": -144.40292358398438, + "logps/rejected": -157.58763122558594, + "loss": 0.6829993724822998, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11200229823589325, + "rewards/margins": 0.026959875598549843, + "rewards/rejected": 0.08504240959882736, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 4.871355056762695, + "learning_rate": 5.515555555555555e-07, + "logits/chosen": 1.581756830215454, + "logits/rejected": 1.643133521080017, + "logps/chosen": -123.44065856933594, + "logps/rejected": -131.16763305664062, + "loss": 0.6869094848632813, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.11280514299869537, + "rewards/margins": 0.016064399853348732, + "rewards/rejected": 0.09674074500799179, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 5.43352746963501, + "learning_rate": 5.471111111111111e-07, + "logits/chosen": 1.8056955337524414, + "logits/rejected": 1.7294971942901611, + "logps/chosen": -159.89236450195312, + "logps/rejected": -152.61293029785156, + "loss": 0.6701028347015381, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1585489809513092, + "rewards/margins": 0.0533718541264534, + "rewards/rejected": 0.1051771491765976, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 5.0044474601745605, + "learning_rate": 5.426666666666666e-07, + "logits/chosen": 1.68048894405365, + "logits/rejected": 1.6845731735229492, + "logps/chosen": -153.21994018554688, + "logps/rejected": -140.06024169921875, + "loss": 0.675870132446289, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11108388751745224, + "rewards/margins": 0.039517562836408615, + "rewards/rejected": 0.07156632840633392, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 4.051770210266113, + "learning_rate": 5.382222222222223e-07, + "logits/chosen": 1.6341949701309204, + "logits/rejected": 1.7052574157714844, + "logps/chosen": -143.89512634277344, + "logps/rejected": -132.5374298095703, + "loss": 0.6808773040771484, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.11111323535442352, + "rewards/margins": 0.02908928692340851, + "rewards/rejected": 0.08202396333217621, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 4.0852885246276855, + "learning_rate": 5.337777777777778e-07, + "logits/chosen": 1.701703429222107, + "logits/rejected": 1.5303716659545898, + "logps/chosen": -149.3526611328125, + "logps/rejected": -129.2085418701172, + "loss": 0.6718564987182617, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.13976502418518066, + "rewards/margins": 0.04783231392502785, + "rewards/rejected": 0.09193271398544312, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_logits/chosen": 1.5326013565063477, + "eval_logits/rejected": 1.5598564147949219, + "eval_logps/chosen": -152.15328979492188, + "eval_logps/rejected": -146.90542602539062, + "eval_loss": 0.6874103546142578, + "eval_rewards/accuracies": 0.5460000038146973, + "eval_rewards/chosen": 0.1175844818353653, + "eval_rewards/margins": 0.01784202829003334, + "eval_rewards/rejected": 0.09974244982004166, + "eval_runtime": 90.5137, + "eval_samples_per_second": 5.524, + "eval_steps_per_second": 2.762, + "step": 1300 + }, + { + "epoch": 1.048, + "grad_norm": 5.752922534942627, + "learning_rate": 5.293333333333333e-07, + "logits/chosen": 1.7796787023544312, + "logits/rejected": 1.7884852886199951, + "logps/chosen": -164.8948211669922, + "logps/rejected": -159.16610717773438, + "loss": 0.6778300285339356, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.13073134422302246, + "rewards/margins": 0.03521668165922165, + "rewards/rejected": 0.09551465511322021, + "step": 1310 + }, + { + "epoch": 1.056, + "grad_norm": 5.461390018463135, + "learning_rate": 5.248888888888888e-07, + "logits/chosen": 1.4079742431640625, + "logits/rejected": 1.6630131006240845, + "logps/chosen": -128.2792510986328, + "logps/rejected": -150.0707244873047, + "loss": 0.6839999198913574, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.11985810846090317, + "rewards/margins": 0.021619705483317375, + "rewards/rejected": 0.09823839366436005, + "step": 1320 + }, + { + "epoch": 1.064, + "grad_norm": 4.624034881591797, + "learning_rate": 5.204444444444444e-07, + "logits/chosen": 1.727718710899353, + "logits/rejected": 1.7013204097747803, + "logps/chosen": -155.4447479248047, + "logps/rejected": -145.0660858154297, + "loss": 0.6688554286956787, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.14348448812961578, + "rewards/margins": 0.05262491852045059, + "rewards/rejected": 0.09085958451032639, + "step": 1330 + }, + { + "epoch": 1.072, + "grad_norm": 4.8239617347717285, + "learning_rate": 5.16e-07, + "logits/chosen": 1.5751596689224243, + "logits/rejected": 1.5026706457138062, + "logps/chosen": -151.81521606445312, + "logps/rejected": -144.22158813476562, + "loss": 0.6607092380523681, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.15143291652202606, + "rewards/margins": 0.06974462419748306, + "rewards/rejected": 0.0816882774233818, + "step": 1340 + }, + { + "epoch": 1.08, + "grad_norm": 5.160701274871826, + "learning_rate": 5.115555555555555e-07, + "logits/chosen": 1.5843019485473633, + "logits/rejected": 1.4874061346054077, + "logps/chosen": -164.33465576171875, + "logps/rejected": -138.5998077392578, + "loss": 0.658644723892212, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.18924950063228607, + "rewards/margins": 0.07819559425115585, + "rewards/rejected": 0.11105390638113022, + "step": 1350 + }, + { + "epoch": 1.088, + "grad_norm": 3.7558722496032715, + "learning_rate": 5.071111111111111e-07, + "logits/chosen": 1.7361290454864502, + "logits/rejected": 1.7833389043807983, + "logps/chosen": -165.33505249023438, + "logps/rejected": -163.27987670898438, + "loss": 0.675537109375, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13921695947647095, + "rewards/margins": 0.03955959528684616, + "rewards/rejected": 0.09965735673904419, + "step": 1360 + }, + { + "epoch": 1.096, + "grad_norm": 3.7639994621276855, + "learning_rate": 5.026666666666667e-07, + "logits/chosen": 1.5270617008209229, + "logits/rejected": 1.6595666408538818, + "logps/chosen": -130.36978149414062, + "logps/rejected": -127.0807113647461, + "loss": 0.6692055702209473, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.14394144713878632, + "rewards/margins": 0.053458504378795624, + "rewards/rejected": 0.09048295766115189, + "step": 1370 + }, + { + "epoch": 1.104, + "grad_norm": 4.3264946937561035, + "learning_rate": 4.982222222222223e-07, + "logits/chosen": 1.6070820093154907, + "logits/rejected": 1.7583873271942139, + "logps/chosen": -160.203369140625, + "logps/rejected": -165.0500946044922, + "loss": 0.6763527393341064, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1415795534849167, + "rewards/margins": 0.03869130462408066, + "rewards/rejected": 0.10288827121257782, + "step": 1380 + }, + { + "epoch": 1.112, + "grad_norm": 5.288540840148926, + "learning_rate": 4.937777777777777e-07, + "logits/chosen": 1.5944023132324219, + "logits/rejected": 1.5990722179412842, + "logps/chosen": -165.3871307373047, + "logps/rejected": -161.1041717529297, + "loss": 0.6612223148345947, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16650637984275818, + "rewards/margins": 0.07088200747966766, + "rewards/rejected": 0.09562437236309052, + "step": 1390 + }, + { + "epoch": 1.12, + "grad_norm": 4.826308727264404, + "learning_rate": 4.893333333333333e-07, + "logits/chosen": 1.5509014129638672, + "logits/rejected": 1.682037115097046, + "logps/chosen": -151.07647705078125, + "logps/rejected": -160.36459350585938, + "loss": 0.6734431266784668, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1914874017238617, + "rewards/margins": 0.043385379016399384, + "rewards/rejected": 0.1481020450592041, + "step": 1400 + }, + { + "epoch": 1.12, + "eval_logits/chosen": 1.5414153337478638, + "eval_logits/rejected": 1.5681167840957642, + "eval_logps/chosen": -151.83712768554688, + "eval_logps/rejected": -146.61451721191406, + "eval_loss": 0.6868449449539185, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.14920170605182648, + "eval_rewards/margins": 0.020368749275803566, + "eval_rewards/rejected": 0.12883296608924866, + "eval_runtime": 90.3054, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 2.768, + "step": 1400 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 5.3463215827941895, + "learning_rate": 4.848888888888888e-07, + "logits/chosen": 1.7068660259246826, + "logits/rejected": 1.7066301107406616, + "logps/chosen": -144.30587768554688, + "logps/rejected": -148.75234985351562, + "loss": 0.6804617881774903, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.14624662697315216, + "rewards/margins": 0.029874861240386963, + "rewards/rejected": 0.1163717657327652, + "step": 1410 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 6.410695552825928, + "learning_rate": 4.804444444444444e-07, + "logits/chosen": 1.69613778591156, + "logits/rejected": 1.7461612224578857, + "logps/chosen": -134.14889526367188, + "logps/rejected": -149.71217346191406, + "loss": 0.6900940895080566, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.14041271805763245, + "rewards/margins": 0.010535283014178276, + "rewards/rejected": 0.12987744808197021, + "step": 1420 + }, + { + "epoch": 1.144, + "grad_norm": 6.171964168548584, + "learning_rate": 4.76e-07, + "logits/chosen": 1.5641189813613892, + "logits/rejected": 1.3624608516693115, + "logps/chosen": -152.79434204101562, + "logps/rejected": -146.59823608398438, + "loss": 0.6705893516540528, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.16485603153705597, + "rewards/margins": 0.05301935225725174, + "rewards/rejected": 0.11183668673038483, + "step": 1430 + }, + { + "epoch": 1.152, + "grad_norm": 6.058873176574707, + "learning_rate": 4.7155555555555556e-07, + "logits/chosen": 1.554245114326477, + "logits/rejected": 1.5402500629425049, + "logps/chosen": -169.1145782470703, + "logps/rejected": -150.73916625976562, + "loss": 0.6832107543945313, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.13169637322425842, + "rewards/margins": 0.023917924612760544, + "rewards/rejected": 0.10777842998504639, + "step": 1440 + }, + { + "epoch": 1.16, + "grad_norm": 6.125433921813965, + "learning_rate": 4.6711111111111104e-07, + "logits/chosen": 1.5380629301071167, + "logits/rejected": 1.5384807586669922, + "logps/chosen": -152.90679931640625, + "logps/rejected": -147.15878295898438, + "loss": 0.678408432006836, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.14591780304908752, + "rewards/margins": 0.034742556512355804, + "rewards/rejected": 0.11117523908615112, + "step": 1450 + }, + { + "epoch": 1.168, + "grad_norm": 4.146246433258057, + "learning_rate": 4.6266666666666663e-07, + "logits/chosen": 1.6788570880889893, + "logits/rejected": 1.672521948814392, + "logps/chosen": -148.545654296875, + "logps/rejected": -173.09043884277344, + "loss": 0.6805721282958984, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.14106473326683044, + "rewards/margins": 0.03079717420041561, + "rewards/rejected": 0.11026755720376968, + "step": 1460 + }, + { + "epoch": 1.176, + "grad_norm": 4.701725006103516, + "learning_rate": 4.5822222222222216e-07, + "logits/chosen": 1.549036979675293, + "logits/rejected": 1.7567065954208374, + "logps/chosen": -136.66348266601562, + "logps/rejected": -153.68270874023438, + "loss": 0.6799561977386475, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.10922833532094955, + "rewards/margins": 0.03145608678460121, + "rewards/rejected": 0.07777224481105804, + "step": 1470 + }, + { + "epoch": 1.184, + "grad_norm": 4.027674198150635, + "learning_rate": 4.5377777777777775e-07, + "logits/chosen": 1.5800559520721436, + "logits/rejected": 1.6139322519302368, + "logps/chosen": -142.79774475097656, + "logps/rejected": -138.2237548828125, + "loss": 0.6735220909118652, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12952227890491486, + "rewards/margins": 0.044702861458063126, + "rewards/rejected": 0.08481942117214203, + "step": 1480 + }, + { + "epoch": 1.192, + "grad_norm": 4.0694756507873535, + "learning_rate": 4.493333333333333e-07, + "logits/chosen": 1.8074331283569336, + "logits/rejected": 1.7226593494415283, + "logps/chosen": -139.64735412597656, + "logps/rejected": -145.65493774414062, + "loss": 0.6719463348388672, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12084708362817764, + "rewards/margins": 0.04711727052927017, + "rewards/rejected": 0.07372982054948807, + "step": 1490 + }, + { + "epoch": 1.2, + "grad_norm": 3.381568670272827, + "learning_rate": 4.4488888888888887e-07, + "logits/chosen": 1.6288955211639404, + "logits/rejected": 1.710903525352478, + "logps/chosen": -146.3409423828125, + "logps/rejected": -140.67259216308594, + "loss": 0.6622853755950928, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14423500001430511, + "rewards/margins": 0.06720604002475739, + "rewards/rejected": 0.07702895998954773, + "step": 1500 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 1.5185505151748657, + "eval_logits/rejected": 1.5454010963439941, + "eval_logps/chosen": -152.3339080810547, + "eval_logps/rejected": -147.08615112304688, + "eval_loss": 0.6876598596572876, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.0995246097445488, + "eval_rewards/margins": 0.017856568098068237, + "eval_rewards/rejected": 0.08166804164648056, + "eval_runtime": 90.2948, + "eval_samples_per_second": 5.537, + "eval_steps_per_second": 2.769, + "step": 1500 + }, + { + "epoch": 1.208, + "grad_norm": 4.928984642028809, + "learning_rate": 4.4044444444444445e-07, + "logits/chosen": 1.60953688621521, + "logits/rejected": 1.5581461191177368, + "logps/chosen": -138.63772583007812, + "logps/rejected": -153.8605194091797, + "loss": 0.6731616020202636, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.09628470242023468, + "rewards/margins": 0.046329062432050705, + "rewards/rejected": 0.04995563626289368, + "step": 1510 + }, + { + "epoch": 1.216, + "grad_norm": 5.149722099304199, + "learning_rate": 4.36e-07, + "logits/chosen": 1.4915835857391357, + "logits/rejected": 1.5048246383666992, + "logps/chosen": -141.77896118164062, + "logps/rejected": -151.37413024902344, + "loss": 0.6731313228607178, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.09898792207241058, + "rewards/margins": 0.04597122594714165, + "rewards/rejected": 0.05301668494939804, + "step": 1520 + }, + { + "epoch": 1.224, + "grad_norm": 4.9786505699157715, + "learning_rate": 4.3155555555555557e-07, + "logits/chosen": 1.5681893825531006, + "logits/rejected": 1.602756142616272, + "logps/chosen": -180.0757598876953, + "logps/rejected": -172.3198699951172, + "loss": 0.6719411849975586, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.12683382630348206, + "rewards/margins": 0.048313409090042114, + "rewards/rejected": 0.07852041721343994, + "step": 1530 + }, + { + "epoch": 1.232, + "grad_norm": 5.631436824798584, + "learning_rate": 4.271111111111111e-07, + "logits/chosen": 1.596337080001831, + "logits/rejected": 1.5848333835601807, + "logps/chosen": -165.8964080810547, + "logps/rejected": -152.3248748779297, + "loss": 0.6702041149139404, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10199526697397232, + "rewards/margins": 0.052691929042339325, + "rewards/rejected": 0.049303337931632996, + "step": 1540 + }, + { + "epoch": 1.24, + "grad_norm": 4.669321537017822, + "learning_rate": 4.226666666666667e-07, + "logits/chosen": 1.3906035423278809, + "logits/rejected": 1.64999520778656, + "logps/chosen": -128.77316284179688, + "logps/rejected": -130.9954376220703, + "loss": 0.6716189384460449, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11641445010900497, + "rewards/margins": 0.048517655581235886, + "rewards/rejected": 0.06789680570363998, + "step": 1550 + }, + { + "epoch": 1.248, + "grad_norm": 4.713964462280273, + "learning_rate": 4.1822222222222217e-07, + "logits/chosen": 1.3268989324569702, + "logits/rejected": 1.3860762119293213, + "logps/chosen": -131.51779174804688, + "logps/rejected": -149.37356567382812, + "loss": 0.6767086029052735, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.08011795580387115, + "rewards/margins": 0.03772367164492607, + "rewards/rejected": 0.04239428788423538, + "step": 1560 + }, + { + "epoch": 1.256, + "grad_norm": 4.6052775382995605, + "learning_rate": 4.1377777777777776e-07, + "logits/chosen": 1.6550222635269165, + "logits/rejected": 1.5474750995635986, + "logps/chosen": -119.24078369140625, + "logps/rejected": -144.82949829101562, + "loss": 0.6784813404083252, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0944647416472435, + "rewards/margins": 0.03385248780250549, + "rewards/rejected": 0.06061224266886711, + "step": 1570 + }, + { + "epoch": 1.264, + "grad_norm": 5.244394302368164, + "learning_rate": 4.093333333333333e-07, + "logits/chosen": 1.4200581312179565, + "logits/rejected": 1.524652123451233, + "logps/chosen": -149.26693725585938, + "logps/rejected": -171.10025024414062, + "loss": 0.6608034133911133, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11365441232919693, + "rewards/margins": 0.07590361684560776, + "rewards/rejected": 0.037750788033008575, + "step": 1580 + }, + { + "epoch": 1.272, + "grad_norm": 4.911222457885742, + "learning_rate": 4.048888888888889e-07, + "logits/chosen": 1.685605764389038, + "logits/rejected": 1.758301019668579, + "logps/chosen": -155.50318908691406, + "logps/rejected": -157.9464874267578, + "loss": 0.6672821521759034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0965135246515274, + "rewards/margins": 0.05731017515063286, + "rewards/rejected": 0.03920333832502365, + "step": 1590 + }, + { + "epoch": 1.28, + "grad_norm": 5.968125820159912, + "learning_rate": 4.004444444444444e-07, + "logits/chosen": 1.4200689792633057, + "logits/rejected": 1.4647656679153442, + "logps/chosen": -133.22052001953125, + "logps/rejected": -133.8321533203125, + "loss": 0.6618384838104248, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09781745076179504, + "rewards/margins": 0.07378478348255157, + "rewards/rejected": 0.02403266355395317, + "step": 1600 + }, + { + "epoch": 1.28, + "eval_logits/chosen": 1.501773476600647, + "eval_logits/rejected": 1.5282105207443237, + "eval_logps/chosen": -152.49317932128906, + "eval_logps/rejected": -147.2423553466797, + "eval_loss": 0.6881668567657471, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.08359722793102264, + "eval_rewards/margins": 0.0175489354878664, + "eval_rewards/rejected": 0.0660482868552208, + "eval_runtime": 90.2344, + "eval_samples_per_second": 5.541, + "eval_steps_per_second": 2.771, + "step": 1600 + }, + { + "epoch": 1.288, + "grad_norm": 4.5057783126831055, + "learning_rate": 3.96e-07, + "logits/chosen": 1.5745240449905396, + "logits/rejected": 1.6804988384246826, + "logps/chosen": -140.4354248046875, + "logps/rejected": -145.98095703125, + "loss": 0.6589378833770752, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.10474289953708649, + "rewards/margins": 0.07502223551273346, + "rewards/rejected": 0.029720673337578773, + "step": 1610 + }, + { + "epoch": 1.296, + "grad_norm": 4.290356159210205, + "learning_rate": 3.9155555555555553e-07, + "logits/chosen": 1.472800850868225, + "logits/rejected": 1.5744824409484863, + "logps/chosen": -155.6610107421875, + "logps/rejected": -159.48797607421875, + "loss": 0.6584812641143799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.12055446207523346, + "rewards/margins": 0.07605487108230591, + "rewards/rejected": 0.04449959844350815, + "step": 1620 + }, + { + "epoch": 1.304, + "grad_norm": 6.041454792022705, + "learning_rate": 3.871111111111111e-07, + "logits/chosen": 1.6146243810653687, + "logits/rejected": 1.5308036804199219, + "logps/chosen": -138.68113708496094, + "logps/rejected": -167.34625244140625, + "loss": 0.675632095336914, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.09587542712688446, + "rewards/margins": 0.0477941557765007, + "rewards/rejected": 0.04808126017451286, + "step": 1630 + }, + { + "epoch": 1.312, + "grad_norm": 4.511700630187988, + "learning_rate": 3.8266666666666665e-07, + "logits/chosen": 1.5492980480194092, + "logits/rejected": 1.4730138778686523, + "logps/chosen": -155.36436462402344, + "logps/rejected": -137.7958221435547, + "loss": 0.670874547958374, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07881785929203033, + "rewards/margins": 0.05401306599378586, + "rewards/rejected": 0.024804789572954178, + "step": 1640 + }, + { + "epoch": 1.32, + "grad_norm": 4.88837194442749, + "learning_rate": 3.7822222222222224e-07, + "logits/chosen": 1.4858735799789429, + "logits/rejected": 1.5098966360092163, + "logps/chosen": -135.9697723388672, + "logps/rejected": -133.1214141845703, + "loss": 0.6685511589050293, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09598390758037567, + "rewards/margins": 0.053858526051044464, + "rewards/rejected": 0.04212538152933121, + "step": 1650 + }, + { + "epoch": 1.328, + "grad_norm": 6.195035934448242, + "learning_rate": 3.7377777777777777e-07, + "logits/chosen": 1.6547601222991943, + "logits/rejected": 1.4603536128997803, + "logps/chosen": -145.89651489257812, + "logps/rejected": -133.7212677001953, + "loss": 0.6622809410095215, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12225265800952911, + "rewards/margins": 0.06909220665693283, + "rewards/rejected": 0.05316043645143509, + "step": 1660 + }, + { + "epoch": 1.336, + "grad_norm": 3.8046131134033203, + "learning_rate": 3.693333333333333e-07, + "logits/chosen": 1.5132381916046143, + "logits/rejected": 1.4982823133468628, + "logps/chosen": -150.82089233398438, + "logps/rejected": -148.30099487304688, + "loss": 0.6719642162322998, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11414922773838043, + "rewards/margins": 0.05024232715368271, + "rewards/rejected": 0.06390689313411713, + "step": 1670 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 4.39717960357666, + "learning_rate": 3.6488888888888884e-07, + "logits/chosen": 1.5617916584014893, + "logits/rejected": 1.682511329650879, + "logps/chosen": -158.3753204345703, + "logps/rejected": -146.37451171875, + "loss": 0.6737432956695557, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.11844941228628159, + "rewards/margins": 0.04444308206439018, + "rewards/rejected": 0.0740063264966011, + "step": 1680 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 5.2524261474609375, + "learning_rate": 3.604444444444444e-07, + "logits/chosen": 1.5695984363555908, + "logits/rejected": 1.610656499862671, + "logps/chosen": -151.31857299804688, + "logps/rejected": -151.91409301757812, + "loss": 0.6611621856689454, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14522945880889893, + "rewards/margins": 0.07014746963977814, + "rewards/rejected": 0.07508201897144318, + "step": 1690 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 4.1307501792907715, + "learning_rate": 3.5599999999999996e-07, + "logits/chosen": 1.483229398727417, + "logits/rejected": 1.510617733001709, + "logps/chosen": -148.93728637695312, + "logps/rejected": -150.54031372070312, + "loss": 0.6745404243469239, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.10079844295978546, + "rewards/margins": 0.044623635709285736, + "rewards/rejected": 0.05617480352520943, + "step": 1700 + }, + { + "epoch": 1.3599999999999999, + "eval_logits/chosen": 1.4972316026687622, + "eval_logits/rejected": 1.523227572441101, + "eval_logps/chosen": -152.4461669921875, + "eval_logps/rejected": -147.20535278320312, + "eval_loss": 0.6881544589996338, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.08829746395349503, + "eval_rewards/margins": 0.01854766719043255, + "eval_rewards/rejected": 0.06974979490041733, + "eval_runtime": 2076.8971, + "eval_samples_per_second": 0.241, + "eval_steps_per_second": 0.12, + "step": 1700 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 5.462865352630615, + "learning_rate": 3.5155555555555554e-07, + "logits/chosen": 1.5469191074371338, + "logits/rejected": 1.5952446460723877, + "logps/chosen": -141.61489868164062, + "logps/rejected": -177.96774291992188, + "loss": 0.6638422012329102, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.11491324007511139, + "rewards/margins": 0.06743086874485016, + "rewards/rejected": 0.04748237505555153, + "step": 1710 + }, + { + "epoch": 1.376, + "grad_norm": 5.499386310577393, + "learning_rate": 3.471111111111111e-07, + "logits/chosen": 1.5340317487716675, + "logits/rejected": 1.5532381534576416, + "logps/chosen": -147.91448974609375, + "logps/rejected": -149.66990661621094, + "loss": 0.6894158363342285, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07543136924505234, + "rewards/margins": 0.013522538356482983, + "rewards/rejected": 0.061908822506666183, + "step": 1720 + }, + { + "epoch": 1.384, + "grad_norm": 4.518467426300049, + "learning_rate": 3.4266666666666666e-07, + "logits/chosen": 1.6533176898956299, + "logits/rejected": 1.7667814493179321, + "logps/chosen": -159.10305786132812, + "logps/rejected": -158.16673278808594, + "loss": 0.6499703407287598, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14074134826660156, + "rewards/margins": 0.09790968149900436, + "rewards/rejected": 0.042831674218177795, + "step": 1730 + }, + { + "epoch": 1.392, + "grad_norm": 5.075318813323975, + "learning_rate": 3.382222222222222e-07, + "logits/chosen": 1.610741376876831, + "logits/rejected": 1.5383796691894531, + "logps/chosen": -161.1984405517578, + "logps/rejected": -176.49668884277344, + "loss": 0.6678359508514404, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.11577492952346802, + "rewards/margins": 0.05837785452604294, + "rewards/rejected": 0.05739706754684448, + "step": 1740 + }, + { + "epoch": 1.4, + "grad_norm": 5.038666725158691, + "learning_rate": 3.337777777777778e-07, + "logits/chosen": 1.5075687170028687, + "logits/rejected": 1.5709255933761597, + "logps/chosen": -137.76171875, + "logps/rejected": -143.53628540039062, + "loss": 0.6760771751403809, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.10021932423114777, + "rewards/margins": 0.04277960956096649, + "rewards/rejected": 0.05743972212076187, + "step": 1750 + }, + { + "epoch": 1.408, + "grad_norm": 5.631906032562256, + "learning_rate": 3.293333333333333e-07, + "logits/chosen": 1.7244329452514648, + "logits/rejected": 1.8332182168960571, + "logps/chosen": -168.69371032714844, + "logps/rejected": -175.52243041992188, + "loss": 0.6608580589294434, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12150160223245621, + "rewards/margins": 0.07380715757608414, + "rewards/rejected": 0.04769443720579147, + "step": 1760 + }, + { + "epoch": 1.416, + "grad_norm": 5.159383296966553, + "learning_rate": 3.248888888888889e-07, + "logits/chosen": 1.6496635675430298, + "logits/rejected": 1.5933442115783691, + "logps/chosen": -159.64492797851562, + "logps/rejected": -138.42958068847656, + "loss": 0.6662045001983643, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11035974323749542, + "rewards/margins": 0.06124384328722954, + "rewards/rejected": 0.04911590367555618, + "step": 1770 + }, + { + "epoch": 1.424, + "grad_norm": 4.214277744293213, + "learning_rate": 3.204444444444444e-07, + "logits/chosen": 1.5003747940063477, + "logits/rejected": 1.2975776195526123, + "logps/chosen": -149.29920959472656, + "logps/rejected": -149.67440795898438, + "loss": 0.6661914348602295, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.10157595574855804, + "rewards/margins": 0.06117083504796028, + "rewards/rejected": 0.04040512815117836, + "step": 1780 + }, + { + "epoch": 1.432, + "grad_norm": 4.933855056762695, + "learning_rate": 3.1599999999999997e-07, + "logits/chosen": 1.6081959009170532, + "logits/rejected": 1.6354488134384155, + "logps/chosen": -155.25970458984375, + "logps/rejected": -149.46218872070312, + "loss": 0.6604723453521728, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.14066801965236664, + "rewards/margins": 0.07809358835220337, + "rewards/rejected": 0.06257440894842148, + "step": 1790 + }, + { + "epoch": 1.44, + "grad_norm": 5.134099960327148, + "learning_rate": 3.115555555555555e-07, + "logits/chosen": 1.6613690853118896, + "logits/rejected": 1.7091057300567627, + "logps/chosen": -164.70701599121094, + "logps/rejected": -157.0238494873047, + "loss": 0.6582140922546387, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12183503806591034, + "rewards/margins": 0.07757680863142014, + "rewards/rejected": 0.044258248060941696, + "step": 1800 + }, + { + "epoch": 1.44, + "eval_logits/chosen": 1.4925261735916138, + "eval_logits/rejected": 1.5181974172592163, + "eval_logps/chosen": -152.43206787109375, + "eval_logps/rejected": -147.19375610351562, + "eval_loss": 0.6884378790855408, + "eval_rewards/accuracies": 0.5180000066757202, + "eval_rewards/chosen": 0.08970824629068375, + "eval_rewards/margins": 0.018797704949975014, + "eval_rewards/rejected": 0.07091052830219269, + "eval_runtime": 90.3798, + "eval_samples_per_second": 5.532, + "eval_steps_per_second": 2.766, + "step": 1800 + }, + { + "epoch": 1.448, + "grad_norm": 4.911464214324951, + "learning_rate": 3.071111111111111e-07, + "logits/chosen": 1.5579628944396973, + "logits/rejected": 1.4952296018600464, + "logps/chosen": -150.1460418701172, + "logps/rejected": -133.44647216796875, + "loss": 0.6783469200134278, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.1408417820930481, + "rewards/margins": 0.03822038695216179, + "rewards/rejected": 0.1026213988661766, + "step": 1810 + }, + { + "epoch": 1.456, + "grad_norm": 4.400730609893799, + "learning_rate": 3.026666666666666e-07, + "logits/chosen": 1.7586734294891357, + "logits/rejected": 1.6157715320587158, + "logps/chosen": -148.05960083007812, + "logps/rejected": -145.48837280273438, + "loss": 0.6609668731689453, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1438443809747696, + "rewards/margins": 0.07730484008789062, + "rewards/rejected": 0.06653954088687897, + "step": 1820 + }, + { + "epoch": 1.464, + "grad_norm": 4.201021671295166, + "learning_rate": 2.982222222222222e-07, + "logits/chosen": 1.5160504579544067, + "logits/rejected": 1.521612286567688, + "logps/chosen": -162.4706573486328, + "logps/rejected": -141.2475128173828, + "loss": 0.6658499717712403, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.14357298612594604, + "rewards/margins": 0.06365186721086502, + "rewards/rejected": 0.07992113381624222, + "step": 1830 + }, + { + "epoch": 1.472, + "grad_norm": 4.233860015869141, + "learning_rate": 2.937777777777778e-07, + "logits/chosen": 1.5524044036865234, + "logits/rejected": 1.4785308837890625, + "logps/chosen": -147.02334594726562, + "logps/rejected": -139.936279296875, + "loss": 0.6650140762329102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10469619184732437, + "rewards/margins": 0.06748644262552261, + "rewards/rejected": 0.037209756672382355, + "step": 1840 + }, + { + "epoch": 1.48, + "grad_norm": 3.9541995525360107, + "learning_rate": 2.8933333333333333e-07, + "logits/chosen": 1.6252977848052979, + "logits/rejected": 1.674830675125122, + "logps/chosen": -141.0922393798828, + "logps/rejected": -147.82305908203125, + "loss": 0.6732684135437011, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11090433597564697, + "rewards/margins": 0.04710138589143753, + "rewards/rejected": 0.06380295008420944, + "step": 1850 + }, + { + "epoch": 1.488, + "grad_norm": 5.4466471672058105, + "learning_rate": 2.848888888888889e-07, + "logits/chosen": 1.6136564016342163, + "logits/rejected": 1.6527297496795654, + "logps/chosen": -152.11404418945312, + "logps/rejected": -117.46043395996094, + "loss": 0.6697136402130127, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1206357479095459, + "rewards/margins": 0.05328177288174629, + "rewards/rejected": 0.0673539787530899, + "step": 1860 + }, + { + "epoch": 1.496, + "grad_norm": 4.790223121643066, + "learning_rate": 2.8044444444444445e-07, + "logits/chosen": 1.6939268112182617, + "logits/rejected": 1.6044652462005615, + "logps/chosen": -160.64529418945312, + "logps/rejected": -154.56561279296875, + "loss": 0.6666451454162597, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1110156923532486, + "rewards/margins": 0.06152622774243355, + "rewards/rejected": 0.04948946088552475, + "step": 1870 + }, + { + "epoch": 1.504, + "grad_norm": 4.825891494750977, + "learning_rate": 2.7600000000000004e-07, + "logits/chosen": 1.6585792303085327, + "logits/rejected": 1.7031543254852295, + "logps/chosen": -147.00076293945312, + "logps/rejected": -160.07797241210938, + "loss": 0.6680663585662842, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.12477920949459076, + "rewards/margins": 0.06180337816476822, + "rewards/rejected": 0.06297583878040314, + "step": 1880 + }, + { + "epoch": 1.512, + "grad_norm": 4.7607879638671875, + "learning_rate": 2.715555555555555e-07, + "logits/chosen": 1.546024203300476, + "logits/rejected": 1.534863829612732, + "logps/chosen": -159.22036743164062, + "logps/rejected": -157.2753143310547, + "loss": 0.6581204414367676, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1385352909564972, + "rewards/margins": 0.0793139860033989, + "rewards/rejected": 0.0592212975025177, + "step": 1890 + }, + { + "epoch": 1.52, + "grad_norm": 5.009489059448242, + "learning_rate": 2.671111111111111e-07, + "logits/chosen": 1.6235746145248413, + "logits/rejected": 1.749132513999939, + "logps/chosen": -153.6427764892578, + "logps/rejected": -129.64744567871094, + "loss": 0.6795202255249023, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.12408678233623505, + "rewards/margins": 0.0351543165743351, + "rewards/rejected": 0.08893246948719025, + "step": 1900 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 1.494035243988037, + "eval_logits/rejected": 1.5192769765853882, + "eval_logps/chosen": -152.30662536621094, + "eval_logps/rejected": -147.08494567871094, + "eval_loss": 0.688173234462738, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.10225009173154831, + "eval_rewards/margins": 0.020460575819015503, + "eval_rewards/rejected": 0.08178950846195221, + "eval_runtime": 90.2907, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 1900 + }, + { + "epoch": 1.528, + "grad_norm": 4.748835563659668, + "learning_rate": 2.6266666666666664e-07, + "logits/chosen": 1.6174519062042236, + "logits/rejected": 1.5858701467514038, + "logps/chosen": -128.00579833984375, + "logps/rejected": -147.90884399414062, + "loss": 0.6726502418518067, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11665438115596771, + "rewards/margins": 0.04572301730513573, + "rewards/rejected": 0.07093136012554169, + "step": 1910 + }, + { + "epoch": 1.536, + "grad_norm": 4.940587520599365, + "learning_rate": 2.582222222222222e-07, + "logits/chosen": 1.6492948532104492, + "logits/rejected": 1.6153085231781006, + "logps/chosen": -157.8480224609375, + "logps/rejected": -154.41123962402344, + "loss": 0.6731919765472412, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09920360893011093, + "rewards/margins": 0.05006008595228195, + "rewards/rejected": 0.04914351552724838, + "step": 1920 + }, + { + "epoch": 1.544, + "grad_norm": 5.234673500061035, + "learning_rate": 2.5377777777777776e-07, + "logits/chosen": 1.7610938549041748, + "logits/rejected": 1.7243057489395142, + "logps/chosen": -170.20077514648438, + "logps/rejected": -171.45242309570312, + "loss": 0.6730650424957275, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11712609231472015, + "rewards/margins": 0.04995986074209213, + "rewards/rejected": 0.06716623157262802, + "step": 1930 + }, + { + "epoch": 1.552, + "grad_norm": 4.774282455444336, + "learning_rate": 2.493333333333333e-07, + "logits/chosen": 1.5651425123214722, + "logits/rejected": 1.6542021036148071, + "logps/chosen": -155.97225952148438, + "logps/rejected": -151.70547485351562, + "loss": 0.6627981185913085, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1584210842847824, + "rewards/margins": 0.07277830690145493, + "rewards/rejected": 0.08564277738332748, + "step": 1940 + }, + { + "epoch": 1.56, + "grad_norm": 4.362725257873535, + "learning_rate": 2.448888888888889e-07, + "logits/chosen": 1.646812081336975, + "logits/rejected": 1.5191072225570679, + "logps/chosen": -159.91146850585938, + "logps/rejected": -153.39175415039062, + "loss": 0.676693344116211, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1165633574128151, + "rewards/margins": 0.043364234268665314, + "rewards/rejected": 0.07319913059473038, + "step": 1950 + }, + { + "epoch": 1.568, + "grad_norm": 6.111082077026367, + "learning_rate": 2.404444444444444e-07, + "logits/chosen": 1.3722110986709595, + "logits/rejected": 1.4659714698791504, + "logps/chosen": -136.6461639404297, + "logps/rejected": -157.80828857421875, + "loss": 0.6549744606018066, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14050449430942535, + "rewards/margins": 0.09741021692752838, + "rewards/rejected": 0.04309428110718727, + "step": 1960 + }, + { + "epoch": 1.576, + "grad_norm": 4.771009922027588, + "learning_rate": 2.3599999999999997e-07, + "logits/chosen": 1.4619197845458984, + "logits/rejected": 1.6102012395858765, + "logps/chosen": -147.86558532714844, + "logps/rejected": -138.744140625, + "loss": 0.6633531093597412, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11970734596252441, + "rewards/margins": 0.06974931806325912, + "rewards/rejected": 0.04995802417397499, + "step": 1970 + }, + { + "epoch": 1.584, + "grad_norm": 4.225287437438965, + "learning_rate": 2.3155555555555553e-07, + "logits/chosen": 1.570460557937622, + "logits/rejected": 1.6326253414154053, + "logps/chosen": -143.7037353515625, + "logps/rejected": -146.02279663085938, + "loss": 0.6649062156677246, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.13935089111328125, + "rewards/margins": 0.0673290267586708, + "rewards/rejected": 0.07202187180519104, + "step": 1980 + }, + { + "epoch": 1.592, + "grad_norm": 3.609804153442383, + "learning_rate": 2.2711111111111112e-07, + "logits/chosen": 1.3968725204467773, + "logits/rejected": 1.5177648067474365, + "logps/chosen": -115.10206604003906, + "logps/rejected": -121.81781005859375, + "loss": 0.6878479957580567, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.11384274810552597, + "rewards/margins": 0.01629973202943802, + "rewards/rejected": 0.09754300117492676, + "step": 1990 + }, + { + "epoch": 1.6, + "grad_norm": 4.79855489730835, + "learning_rate": 2.2266666666666668e-07, + "logits/chosen": 1.7946197986602783, + "logits/rejected": 1.779532790184021, + "logps/chosen": -142.3692626953125, + "logps/rejected": -148.74429321289062, + "loss": 0.6723564624786377, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11757278442382812, + "rewards/margins": 0.04625899717211723, + "rewards/rejected": 0.0713137835264206, + "step": 2000 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 1.4949489831924438, + "eval_logits/rejected": 1.5201067924499512, + "eval_logps/chosen": -152.3157958984375, + "eval_logps/rejected": -147.0997314453125, + "eval_loss": 0.6879469156265259, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": 0.10133373737335205, + "eval_rewards/margins": 0.021023308858275414, + "eval_rewards/rejected": 0.08031044155359268, + "eval_runtime": 90.2805, + "eval_samples_per_second": 5.538, + "eval_steps_per_second": 2.769, + "step": 2000 + }, + { + "epoch": 1.608, + "grad_norm": 6.524471759796143, + "learning_rate": 2.1822222222222224e-07, + "logits/chosen": 1.5327484607696533, + "logits/rejected": 1.6004886627197266, + "logps/chosen": -163.94483947753906, + "logps/rejected": -163.67176818847656, + "loss": 0.6695352554321289, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.16861538589000702, + "rewards/margins": 0.05443992465734482, + "rewards/rejected": 0.1141754612326622, + "step": 2010 + }, + { + "epoch": 1.616, + "grad_norm": 5.132118225097656, + "learning_rate": 2.1377777777777777e-07, + "logits/chosen": 1.4321272373199463, + "logits/rejected": 1.5530569553375244, + "logps/chosen": -142.76473999023438, + "logps/rejected": -157.69161987304688, + "loss": 0.6764227390289307, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.15017978847026825, + "rewards/margins": 0.04101533442735672, + "rewards/rejected": 0.10916446149349213, + "step": 2020 + }, + { + "epoch": 1.624, + "grad_norm": 5.326941967010498, + "learning_rate": 2.0933333333333333e-07, + "logits/chosen": 1.586363673210144, + "logits/rejected": 1.6176955699920654, + "logps/chosen": -149.9454803466797, + "logps/rejected": -126.28958892822266, + "loss": 0.6754706859588623, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.13434644043445587, + "rewards/margins": 0.04313293471932411, + "rewards/rejected": 0.09121349453926086, + "step": 2030 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 5.685925483703613, + "learning_rate": 2.048888888888889e-07, + "logits/chosen": 1.581106424331665, + "logits/rejected": 1.4789012670516968, + "logps/chosen": -156.9187469482422, + "logps/rejected": -159.40908813476562, + "loss": 0.6753536224365234, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.10917127132415771, + "rewards/margins": 0.04896850138902664, + "rewards/rejected": 0.06020277738571167, + "step": 2040 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 4.94968318939209, + "learning_rate": 2.0044444444444445e-07, + "logits/chosen": 1.642589807510376, + "logits/rejected": 1.6963341236114502, + "logps/chosen": -155.6983642578125, + "logps/rejected": -156.7867431640625, + "loss": 0.6885885238647461, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.12209658324718475, + "rewards/margins": 0.016716431826353073, + "rewards/rejected": 0.10538016259670258, + "step": 2050 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 4.533969879150391, + "learning_rate": 1.96e-07, + "logits/chosen": 1.5925997495651245, + "logits/rejected": 1.4471170902252197, + "logps/chosen": -118.26934814453125, + "logps/rejected": -143.85903930664062, + "loss": 0.6437589168548584, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.14988769590854645, + "rewards/margins": 0.11566541343927383, + "rewards/rejected": 0.03422228619456291, + "step": 2060 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 4.945888519287109, + "learning_rate": 1.9155555555555554e-07, + "logits/chosen": 1.6543266773223877, + "logits/rejected": 1.5584715604782104, + "logps/chosen": -156.47389221191406, + "logps/rejected": -139.50567626953125, + "loss": 0.6912104606628418, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.11081911623477936, + "rewards/margins": 0.010200846008956432, + "rewards/rejected": 0.10061826556921005, + "step": 2070 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 4.676812648773193, + "learning_rate": 1.871111111111111e-07, + "logits/chosen": 1.6702083349227905, + "logits/rejected": 1.585174322128296, + "logps/chosen": -158.23495483398438, + "logps/rejected": -148.28843688964844, + "loss": 0.6550227165222168, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.16143694519996643, + "rewards/margins": 0.08484812080860138, + "rewards/rejected": 0.07658880203962326, + "step": 2080 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 4.585787773132324, + "learning_rate": 1.8266666666666666e-07, + "logits/chosen": 1.5290788412094116, + "logits/rejected": 1.421924114227295, + "logps/chosen": -152.25558471679688, + "logps/rejected": -153.72178649902344, + "loss": 0.6806746482849121, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.11421312391757965, + "rewards/margins": 0.03305097296833992, + "rewards/rejected": 0.08116213977336884, + "step": 2090 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 4.75665283203125, + "learning_rate": 1.7822222222222222e-07, + "logits/chosen": 1.7162408828735352, + "logits/rejected": 1.6277885437011719, + "logps/chosen": -163.5832977294922, + "logps/rejected": -148.44378662109375, + "loss": 0.6706692218780518, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.17963996529579163, + "rewards/margins": 0.05455024167895317, + "rewards/rejected": 0.12508971989154816, + "step": 2100 + }, + { + "epoch": 1.6800000000000002, + "eval_logits/chosen": 1.5044771432876587, + "eval_logits/rejected": 1.5299365520477295, + "eval_logps/chosen": -152.16799926757812, + "eval_logps/rejected": -146.96514892578125, + "eval_loss": 0.6872401237487793, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": 0.11611522734165192, + "eval_rewards/margins": 0.022344600409269333, + "eval_rewards/rejected": 0.09377063810825348, + "eval_runtime": 90.1889, + "eval_samples_per_second": 5.544, + "eval_steps_per_second": 2.772, + "step": 2100 + }, + { + "epoch": 1.688, + "grad_norm": 5.027055263519287, + "learning_rate": 1.7377777777777778e-07, + "logits/chosen": 1.550410509109497, + "logits/rejected": 1.491328239440918, + "logps/chosen": -165.79141235351562, + "logps/rejected": -155.51358032226562, + "loss": 0.6698966026306152, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1318608820438385, + "rewards/margins": 0.05974091216921806, + "rewards/rejected": 0.07211998105049133, + "step": 2110 + }, + { + "epoch": 1.696, + "grad_norm": 5.497100353240967, + "learning_rate": 1.6933333333333334e-07, + "logits/chosen": 1.6706756353378296, + "logits/rejected": 1.6537840366363525, + "logps/chosen": -131.91165161132812, + "logps/rejected": -137.20498657226562, + "loss": 0.6716497898101806, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.13379232585430145, + "rewards/margins": 0.05359635502099991, + "rewards/rejected": 0.08019598573446274, + "step": 2120 + }, + { + "epoch": 1.704, + "grad_norm": 8.849369049072266, + "learning_rate": 1.6488888888888887e-07, + "logits/chosen": 1.6430184841156006, + "logits/rejected": 1.6333353519439697, + "logps/chosen": -164.01890563964844, + "logps/rejected": -162.87362670898438, + "loss": 0.6649856090545654, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1533871591091156, + "rewards/margins": 0.069211944937706, + "rewards/rejected": 0.0841752141714096, + "step": 2130 + }, + { + "epoch": 1.712, + "grad_norm": 4.965246677398682, + "learning_rate": 1.6044444444444443e-07, + "logits/chosen": 1.3473514318466187, + "logits/rejected": 1.4646275043487549, + "logps/chosen": -138.31454467773438, + "logps/rejected": -145.73532104492188, + "loss": 0.6746968746185302, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.13360588252544403, + "rewards/margins": 0.043482352048158646, + "rewards/rejected": 0.09012351930141449, + "step": 2140 + }, + { + "epoch": 1.72, + "grad_norm": 5.954956531524658, + "learning_rate": 1.56e-07, + "logits/chosen": 1.67049241065979, + "logits/rejected": 1.7067524194717407, + "logps/chosen": -156.033935546875, + "logps/rejected": -153.3822021484375, + "loss": 0.6812377452850342, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15508751571178436, + "rewards/margins": 0.03148316591978073, + "rewards/rejected": 0.12360434234142303, + "step": 2150 + }, + { + "epoch": 1.728, + "grad_norm": 3.328672170639038, + "learning_rate": 1.5155555555555555e-07, + "logits/chosen": 1.4824349880218506, + "logits/rejected": 1.475740909576416, + "logps/chosen": -136.86688232421875, + "logps/rejected": -138.93450927734375, + "loss": 0.6671774864196778, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12285050004720688, + "rewards/margins": 0.05716937035322189, + "rewards/rejected": 0.06568112224340439, + "step": 2160 + }, + { + "epoch": 1.736, + "grad_norm": 4.780362606048584, + "learning_rate": 1.4711111111111111e-07, + "logits/chosen": 1.5829228162765503, + "logits/rejected": 1.553118348121643, + "logps/chosen": -151.36331176757812, + "logps/rejected": -148.54202270507812, + "loss": 0.661128568649292, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.17800253629684448, + "rewards/margins": 0.07710476219654083, + "rewards/rejected": 0.10089776664972305, + "step": 2170 + }, + { + "epoch": 1.744, + "grad_norm": 4.95118522644043, + "learning_rate": 1.4266666666666665e-07, + "logits/chosen": 1.370544672012329, + "logits/rejected": 1.4272708892822266, + "logps/chosen": -142.42710876464844, + "logps/rejected": -168.6506805419922, + "loss": 0.6768136024475098, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.12570957839488983, + "rewards/margins": 0.04121888428926468, + "rewards/rejected": 0.08449070900678635, + "step": 2180 + }, + { + "epoch": 1.752, + "grad_norm": 4.103761196136475, + "learning_rate": 1.382222222222222e-07, + "logits/chosen": 1.6573619842529297, + "logits/rejected": 1.6316673755645752, + "logps/chosen": -169.3119354248047, + "logps/rejected": -164.17633056640625, + "loss": 0.6686764717102051, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.11812909692525864, + "rewards/margins": 0.06238695979118347, + "rewards/rejected": 0.05574214458465576, + "step": 2190 + }, + { + "epoch": 1.76, + "grad_norm": 4.561870574951172, + "learning_rate": 1.3377777777777777e-07, + "logits/chosen": 1.5772711038589478, + "logits/rejected": 1.3478977680206299, + "logps/chosen": -132.41693115234375, + "logps/rejected": -134.36935424804688, + "loss": 0.6745004653930664, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.15194594860076904, + "rewards/margins": 0.04931178689002991, + "rewards/rejected": 0.10263414680957794, + "step": 2200 + }, + { + "epoch": 1.76, + "eval_logits/chosen": 1.50482976436615, + "eval_logits/rejected": 1.5303761959075928, + "eval_logps/chosen": -152.19476318359375, + "eval_logps/rejected": -146.9914093017578, + "eval_loss": 0.6872152090072632, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": 0.11343776434659958, + "eval_rewards/margins": 0.02229386195540428, + "eval_rewards/rejected": 0.0911439061164856, + "eval_runtime": 90.3738, + "eval_samples_per_second": 5.533, + "eval_steps_per_second": 2.766, + "step": 2200 + }, + { + "epoch": 1.768, + "grad_norm": 5.253188610076904, + "learning_rate": 1.2933333333333333e-07, + "logits/chosen": 1.602463960647583, + "logits/rejected": 1.6365470886230469, + "logps/chosen": -151.4491729736328, + "logps/rejected": -139.6726531982422, + "loss": 0.6595804214477539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1349448263645172, + "rewards/margins": 0.07438500225543976, + "rewards/rejected": 0.06055985763669014, + "step": 2210 + }, + { + "epoch": 1.776, + "grad_norm": 5.627497673034668, + "learning_rate": 1.2488888888888889e-07, + "logits/chosen": 1.5839112997055054, + "logits/rejected": 1.4787819385528564, + "logps/chosen": -134.94741821289062, + "logps/rejected": -163.97610473632812, + "loss": 0.6709693908691406, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.15531916916370392, + "rewards/margins": 0.05516272038221359, + "rewards/rejected": 0.10015644878149033, + "step": 2220 + }, + { + "epoch": 1.784, + "grad_norm": 4.092975616455078, + "learning_rate": 1.2044444444444445e-07, + "logits/chosen": 1.367531657218933, + "logits/rejected": 1.5928542613983154, + "logps/chosen": -128.33090209960938, + "logps/rejected": -130.70223999023438, + "loss": 0.6782794952392578, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.12403953075408936, + "rewards/margins": 0.03512664884328842, + "rewards/rejected": 0.08891288191080093, + "step": 2230 + }, + { + "epoch": 1.792, + "grad_norm": 3.851191997528076, + "learning_rate": 1.16e-07, + "logits/chosen": 1.5335030555725098, + "logits/rejected": 1.6405471563339233, + "logps/chosen": -142.96331787109375, + "logps/rejected": -159.10667419433594, + "loss": 0.675669002532959, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12597718834877014, + "rewards/margins": 0.04118332266807556, + "rewards/rejected": 0.08479384332895279, + "step": 2240 + }, + { + "epoch": 1.8, + "grad_norm": 5.128774166107178, + "learning_rate": 1.1155555555555555e-07, + "logits/chosen": 1.6666786670684814, + "logits/rejected": 1.6352249383926392, + "logps/chosen": -160.48880004882812, + "logps/rejected": -149.51486206054688, + "loss": 0.676695442199707, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13730794191360474, + "rewards/margins": 0.03863085061311722, + "rewards/rejected": 0.09867707639932632, + "step": 2250 + }, + { + "epoch": 1.808, + "grad_norm": 5.3966169357299805, + "learning_rate": 1.0711111111111111e-07, + "logits/chosen": 1.480333924293518, + "logits/rejected": 1.597484827041626, + "logps/chosen": -124.16390228271484, + "logps/rejected": -147.24319458007812, + "loss": 0.6672614097595215, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13824795186519623, + "rewards/margins": 0.05965462327003479, + "rewards/rejected": 0.07859332859516144, + "step": 2260 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 5.315938949584961, + "learning_rate": 1.0266666666666666e-07, + "logits/chosen": 1.4455833435058594, + "logits/rejected": 1.517375111579895, + "logps/chosen": -130.0419158935547, + "logps/rejected": -122.60333251953125, + "loss": 0.6733921527862549, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1244196742773056, + "rewards/margins": 0.05231797695159912, + "rewards/rejected": 0.07210170477628708, + "step": 2270 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 5.050210952758789, + "learning_rate": 9.822222222222222e-08, + "logits/chosen": 1.711755394935608, + "logits/rejected": 1.5550199747085571, + "logps/chosen": -165.57754516601562, + "logps/rejected": -143.4929656982422, + "loss": 0.6610856533050538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.12436368316411972, + "rewards/margins": 0.07134710252285004, + "rewards/rejected": 0.05301658436655998, + "step": 2280 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 4.6738457679748535, + "learning_rate": 9.377777777777778e-08, + "logits/chosen": 1.4832783937454224, + "logits/rejected": 1.3279205560684204, + "logps/chosen": -145.33587646484375, + "logps/rejected": -130.48712158203125, + "loss": 0.6677139282226563, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.08904106914997101, + "rewards/margins": 0.06454581767320633, + "rewards/rejected": 0.024495262652635574, + "step": 2290 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 4.347452163696289, + "learning_rate": 8.933333333333333e-08, + "logits/chosen": 1.5897009372711182, + "logits/rejected": 1.5665786266326904, + "logps/chosen": -156.43614196777344, + "logps/rejected": -140.20364379882812, + "loss": 0.684497880935669, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11093449592590332, + "rewards/margins": 0.023888718336820602, + "rewards/rejected": 0.08704578131437302, + "step": 2300 + }, + { + "epoch": 1.8399999999999999, + "eval_logits/chosen": 1.5041522979736328, + "eval_logits/rejected": 1.5298234224319458, + "eval_logps/chosen": -152.20681762695312, + "eval_logps/rejected": -147.0045166015625, + "eval_loss": 0.687169075012207, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": 0.11223345249891281, + "eval_rewards/margins": 0.022400878369808197, + "eval_rewards/rejected": 0.08983256667852402, + "eval_runtime": 90.2573, + "eval_samples_per_second": 5.54, + "eval_steps_per_second": 2.77, + "step": 2300 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 4.292654991149902, + "learning_rate": 8.488888888888889e-08, + "logits/chosen": 1.7366397380828857, + "logits/rejected": 1.7629550695419312, + "logps/chosen": -165.0113067626953, + "logps/rejected": -153.10745239257812, + "loss": 0.6697476387023926, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.18020913004875183, + "rewards/margins": 0.053330183029174805, + "rewards/rejected": 0.12687894701957703, + "step": 2310 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 4.576894760131836, + "learning_rate": 8.044444444444445e-08, + "logits/chosen": 1.6899020671844482, + "logits/rejected": 1.5166491270065308, + "logps/chosen": -138.1096649169922, + "logps/rejected": -125.38822174072266, + "loss": 0.6785052299499512, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.13225102424621582, + "rewards/margins": 0.03827885538339615, + "rewards/rejected": 0.09397216141223907, + "step": 2320 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 5.70767068862915, + "learning_rate": 7.599999999999999e-08, + "logits/chosen": 1.5233170986175537, + "logits/rejected": 1.5336878299713135, + "logps/chosen": -144.0012969970703, + "logps/rejected": -156.4627685546875, + "loss": 0.670370626449585, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.12371464818716049, + "rewards/margins": 0.05864514783024788, + "rewards/rejected": 0.0650695189833641, + "step": 2330 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 4.895171642303467, + "learning_rate": 7.155555555555555e-08, + "logits/chosen": 1.4882056713104248, + "logits/rejected": 1.4597115516662598, + "logps/chosen": -166.4456329345703, + "logps/rejected": -156.53567504882812, + "loss": 0.6758975505828857, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1247052326798439, + "rewards/margins": 0.041960421949625015, + "rewards/rejected": 0.08274482190608978, + "step": 2340 + }, + { + "epoch": 1.88, + "grad_norm": 5.97705602645874, + "learning_rate": 6.71111111111111e-08, + "logits/chosen": 1.6275503635406494, + "logits/rejected": 1.791870355606079, + "logps/chosen": -146.13681030273438, + "logps/rejected": -155.450439453125, + "loss": 0.6827160358428955, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10786732286214828, + "rewards/margins": 0.030300844460725784, + "rewards/rejected": 0.0775664821267128, + "step": 2350 + }, + { + "epoch": 1.888, + "grad_norm": 5.01773738861084, + "learning_rate": 6.266666666666666e-08, + "logits/chosen": 1.3919562101364136, + "logits/rejected": 1.3836013078689575, + "logps/chosen": -160.868408203125, + "logps/rejected": -152.0878448486328, + "loss": 0.6812721729278565, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.11709713935852051, + "rewards/margins": 0.03227861970663071, + "rewards/rejected": 0.0848185196518898, + "step": 2360 + }, + { + "epoch": 1.896, + "grad_norm": 4.818210601806641, + "learning_rate": 5.822222222222222e-08, + "logits/chosen": 1.6691443920135498, + "logits/rejected": 1.642247200012207, + "logps/chosen": -179.80433654785156, + "logps/rejected": -163.50428771972656, + "loss": 0.6700118541717529, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.14921119809150696, + "rewards/margins": 0.060837097465991974, + "rewards/rejected": 0.08837412297725677, + "step": 2370 + }, + { + "epoch": 1.904, + "grad_norm": 4.891122817993164, + "learning_rate": 5.377777777777778e-08, + "logits/chosen": 1.6384000778198242, + "logits/rejected": 1.6921117305755615, + "logps/chosen": -150.27743530273438, + "logps/rejected": -148.6507568359375, + "loss": 0.6937178611755371, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.11294616758823395, + "rewards/margins": 0.008084317669272423, + "rewards/rejected": 0.10486185550689697, + "step": 2380 + }, + { + "epoch": 1.912, + "grad_norm": 5.576618194580078, + "learning_rate": 4.933333333333333e-08, + "logits/chosen": 1.5864824056625366, + "logits/rejected": 1.502429723739624, + "logps/chosen": -158.6403045654297, + "logps/rejected": -142.84228515625, + "loss": 0.6831792831420899, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12857118248939514, + "rewards/margins": 0.030134152621030807, + "rewards/rejected": 0.09843702614307404, + "step": 2390 + }, + { + "epoch": 1.92, + "grad_norm": 4.503939151763916, + "learning_rate": 4.4888888888888885e-08, + "logits/chosen": 1.499786138534546, + "logits/rejected": 1.6668217182159424, + "logps/chosen": -148.1971893310547, + "logps/rejected": -140.85711669921875, + "loss": 0.6556127071380615, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1358988732099533, + "rewards/margins": 0.08561079949140549, + "rewards/rejected": 0.05028806999325752, + "step": 2400 + }, + { + "epoch": 1.92, + "eval_logits/chosen": 1.5030263662338257, + "eval_logits/rejected": 1.5287361145019531, + "eval_logps/chosen": -152.25106811523438, + "eval_logps/rejected": -147.04556274414062, + "eval_loss": 0.6872794032096863, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.10780756920576096, + "eval_rewards/margins": 0.022079555317759514, + "eval_rewards/rejected": 0.08572802692651749, + "eval_runtime": 90.3547, + "eval_samples_per_second": 5.534, + "eval_steps_per_second": 2.767, + "step": 2400 + }, + { + "epoch": 1.928, + "grad_norm": 4.352348804473877, + "learning_rate": 4.044444444444444e-08, + "logits/chosen": 1.471025824546814, + "logits/rejected": 1.6015453338623047, + "logps/chosen": -151.64999389648438, + "logps/rejected": -168.42857360839844, + "loss": 0.6811740875244141, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.11081109941005707, + "rewards/margins": 0.034671980887651443, + "rewards/rejected": 0.07613912224769592, + "step": 2410 + }, + { + "epoch": 1.936, + "grad_norm": 5.441567897796631, + "learning_rate": 3.6e-08, + "logits/chosen": 1.6943010091781616, + "logits/rejected": 1.682918906211853, + "logps/chosen": -142.38153076171875, + "logps/rejected": -129.25502014160156, + "loss": 0.6731202125549316, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.13154248893260956, + "rewards/margins": 0.04856497049331665, + "rewards/rejected": 0.08297751843929291, + "step": 2420 + }, + { + "epoch": 1.944, + "grad_norm": 5.634176254272461, + "learning_rate": 3.155555555555556e-08, + "logits/chosen": 1.6014600992202759, + "logits/rejected": 1.594496488571167, + "logps/chosen": -143.04586791992188, + "logps/rejected": -139.0958709716797, + "loss": 0.6659110069274903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.14549754559993744, + "rewards/margins": 0.06534677743911743, + "rewards/rejected": 0.08015076816082001, + "step": 2430 + }, + { + "epoch": 1.952, + "grad_norm": 4.919292449951172, + "learning_rate": 2.7111111111111108e-08, + "logits/chosen": 1.5626763105392456, + "logits/rejected": 1.71872878074646, + "logps/chosen": -149.76551818847656, + "logps/rejected": -145.3653564453125, + "loss": 0.6616805076599122, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1465718150138855, + "rewards/margins": 0.07126794755458832, + "rewards/rejected": 0.07530387490987778, + "step": 2440 + }, + { + "epoch": 1.96, + "grad_norm": 4.744785785675049, + "learning_rate": 2.2666666666666668e-08, + "logits/chosen": 1.6668106317520142, + "logits/rejected": 1.5912516117095947, + "logps/chosen": -148.5892791748047, + "logps/rejected": -161.57980346679688, + "loss": 0.6620726585388184, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15803922712802887, + "rewards/margins": 0.07078298181295395, + "rewards/rejected": 0.08725622296333313, + "step": 2450 + }, + { + "epoch": 1.968, + "grad_norm": 4.632877349853516, + "learning_rate": 1.822222222222222e-08, + "logits/chosen": 1.5637186765670776, + "logits/rejected": 1.5719751119613647, + "logps/chosen": -154.7703857421875, + "logps/rejected": -159.79837036132812, + "loss": 0.6780555248260498, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.09817321598529816, + "rewards/margins": 0.03681803494691849, + "rewards/rejected": 0.06135518103837967, + "step": 2460 + }, + { + "epoch": 1.976, + "grad_norm": 3.7417750358581543, + "learning_rate": 1.3777777777777778e-08, + "logits/chosen": 1.7216196060180664, + "logits/rejected": 1.6309057474136353, + "logps/chosen": -167.3726348876953, + "logps/rejected": -159.16720581054688, + "loss": 0.6757784843444824, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.13760563731193542, + "rewards/margins": 0.043277300894260406, + "rewards/rejected": 0.09432832896709442, + "step": 2470 + }, + { + "epoch": 1.984, + "grad_norm": 4.262117385864258, + "learning_rate": 9.333333333333334e-09, + "logits/chosen": 1.7302201986312866, + "logits/rejected": 1.7856807708740234, + "logps/chosen": -145.80654907226562, + "logps/rejected": -143.59242248535156, + "loss": 0.6701615333557129, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.15805938839912415, + "rewards/margins": 0.05617127567529678, + "rewards/rejected": 0.10188809782266617, + "step": 2480 + }, + { + "epoch": 1.992, + "grad_norm": 4.367166519165039, + "learning_rate": 4.888888888888888e-09, + "logits/chosen": 1.6408073902130127, + "logits/rejected": 1.7056671380996704, + "logps/chosen": -143.609130859375, + "logps/rejected": -156.66224670410156, + "loss": 0.6833744049072266, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12927643954753876, + "rewards/margins": 0.025437816977500916, + "rewards/rejected": 0.10383862257003784, + "step": 2490 + }, + { + "epoch": 2.0, + "grad_norm": 5.572624206542969, + "learning_rate": 4.4444444444444443e-10, + "logits/chosen": 1.6557143926620483, + "logits/rejected": 1.6632550954818726, + "logps/chosen": -176.3168182373047, + "logps/rejected": -142.21820068359375, + "loss": 0.6654477119445801, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.14149871468544006, + "rewards/margins": 0.06656529754400253, + "rewards/rejected": 0.07493340224027634, + "step": 2500 + }, + { + "epoch": 2.0, + "eval_logits/chosen": 1.5025110244750977, + "eval_logits/rejected": 1.5280849933624268, + "eval_logps/chosen": -152.26426696777344, + "eval_logps/rejected": -147.06167602539062, + "eval_loss": 0.6871456503868103, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": 0.10648718476295471, + "eval_rewards/margins": 0.022370221093297005, + "eval_rewards/rejected": 0.08411695808172226, + "eval_runtime": 90.3311, + "eval_samples_per_second": 5.535, + "eval_steps_per_second": 2.768, + "step": 2500 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_10k/lora/checkpoint-2500/training_args.bin b/v5/DPO/DPO_10k/lora/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..104933ebf9c17ba9c2c1c1d39a0d26ccafdfe373 --- /dev/null +++ b/v5/DPO/DPO_10k/lora/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677b288b67816c6ab7a9dcdd40d26bcb142fa3ad3ad050eaeeb4b73a1ba4b498 +size 6161 diff --git a/v5/DPO/DPO_1k/DPO_1k/README.md b/v5/DPO/DPO_1k/DPO_1k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_1k/DPO_1k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_1k/DPO_1k/adapter_config.json b/v5/DPO/DPO_1k/DPO_1k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..51de60011de8c53a3622d49c2b13f6ead1f71e90 --- /dev/null +++ b/v5/DPO/DPO_1k/DPO_1k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "o_proj", + "gate_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_1k/DPO_1k/adapter_model.safetensors b/v5/DPO/DPO_1k/DPO_1k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5e8b8f5cc12190e5d65d64650e503f45c30cfc9f --- /dev/null +++ b/v5/DPO/DPO_1k/DPO_1k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd39ce893511c42bb66fcc45902600954d24ae57a8646fa31ae8e964732c757a +size 180385008 diff --git a/v5/DPO/DPO_1k/MDPO_1k/chat_template.jinja b/v5/DPO/DPO_1k/MDPO_1k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_1k/MDPO_1k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_1k/MDPO_1k/config.json b/v5/DPO/DPO_1k/MDPO_1k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..269c2ffa2c365f594cb5e44218192c94b419a0cb --- /dev/null +++ b/v5/DPO/DPO_1k/MDPO_1k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/DPO/DPO_1k/MDPO_1k/generation_config.json b/v5/DPO/DPO_1k/MDPO_1k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c9c2224cd391437f7236b3f36305dd39a63ab0a --- /dev/null +++ b/v5/DPO/DPO_1k/MDPO_1k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.0.0" +} diff --git a/v5/DPO/DPO_1k/MDPO_1k/model.safetensors b/v5/DPO/DPO_1k/MDPO_1k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2906bcfa4d8ac85ccf8d4446e6355c0e69b6e022 --- /dev/null +++ b/v5/DPO/DPO_1k/MDPO_1k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edd895ac7ff8bcf90ad0106802dbc8b720ec889affabfe078a7ddf685b9a3436 +size 2471645464 diff --git a/v5/DPO/DPO_1k/MDPO_1k/tokenizer.json b/v5/DPO/DPO_1k/MDPO_1k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_1k/MDPO_1k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_1k/MDPO_1k/tokenizer_config.json b/v5/DPO/DPO_1k/MDPO_1k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_1k/MDPO_1k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_1k/lora/README.md b/v5/DPO/DPO_1k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3f8d384a8a076c0b0c84fa58bad75314903bea4f --- /dev/null +++ b/v5/DPO/DPO_1k/lora/README.md @@ -0,0 +1,69 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- trl +- dpo +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/n743noad) + + +This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290). + +### Framework versions + +- TRL: 0.27.2 +- Transformers: 5.0.0 +- Pytorch: 2.8.0+cu128 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite DPO as: + +```bibtex +@inproceedings{rafailov2023direct, + title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}}, + author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn}, + year = 2023, + booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023}, + url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html}, + editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine}, +} +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/README.md b/v5/DPO/DPO_1k/lora/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/adapter_config.json b/v5/DPO/DPO_1k/lora/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..51de60011de8c53a3622d49c2b13f6ead1f71e90 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "o_proj", + "gate_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/adapter_model.safetensors b/v5/DPO/DPO_1k/lora/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c98cc6f3246de954829e6afd727c9c3b361af43c --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7445dcff104a7c314016de678fc90196369d0fb3341fa8d2c816e1d037855f9b +size 180385008 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/chat_template.jinja b/v5/DPO/DPO_1k/lora/checkpoint-240/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/optimizer.pt b/v5/DPO/DPO_1k/lora/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d30552560c1e20a55249ab225309216e8ed1cee6 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dcb80568cf52dfb441739a36345f62e3cc9bb0cd3fc08fbf32de4657fdf9e23 +size 360902475 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/rng_state.pth b/v5/DPO/DPO_1k/lora/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..13e11a54e352d8a7149df1f88c1b023ee9973959 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7affab63b271ed0f59a5b53056fc0a581226a41dcdf2fc2b80b669e7c3cf714 +size 14645 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/scaler.pt b/v5/DPO/DPO_1k/lora/checkpoint-240/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..619ebc02cb5597beed962ff71e9932cca592a8c0 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed396b33f40ccbc2f37eb48310cad33237a6875ad3a8dce2065905ee9e935266 +size 1383 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/scheduler.pt b/v5/DPO/DPO_1k/lora/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c1ddb21f44b82ec093eefe15e12f5bb5a6dbd2d --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ed05362824e1cd1d1691a5f141079f4cf8290607e906f44075db1229c5a7ca +size 1465 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/tokenizer.json b/v5/DPO/DPO_1k/lora/checkpoint-240/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/tokenizer_config.json b/v5/DPO/DPO_1k/lora/checkpoint-240/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/trainer_state.json b/v5/DPO/DPO_1k/lora/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e849b7ba48bdb043ffb7b013435d47baea5a7fcb --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/trainer_state.json @@ -0,0 +1,778 @@ +{ + "best_global_step": 60, + "best_metric": 0.5139999985694885, + "best_model_checkpoint": "output/lora/checkpoint-60", + "epoch": 1.92, + "eval_steps": 10, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 4.44759464263916, + "learning_rate": 3.6e-07, + "logits/chosen": 1.7029424905776978, + "logits/rejected": 1.5582908391952515, + "logps/chosen": -138.26773071289062, + "logps/rejected": -131.23216247558594, + "loss": 0.6930624485015869, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.0004225396551191807, + "rewards/margins": 0.0001746034249663353, + "rewards/rejected": 0.0002479362883605063, + "step": 10 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.537629246711731, + "eval_logits/rejected": 1.5689224004745483, + "eval_logps/chosen": -153.32151794433594, + "eval_logps/rejected": -147.88926696777344, + "eval_loss": 0.6934499144554138, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.000762743002269417, + "eval_rewards/margins": -0.0005942528950981796, + "eval_rewards/rejected": 0.001356995664536953, + "eval_runtime": 89.836, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 2.783, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 4.250877857208252, + "learning_rate": 7.599999999999999e-07, + "logits/chosen": 1.5865893363952637, + "logits/rejected": 1.6983562707901, + "logps/chosen": -146.0574188232422, + "logps/rejected": -146.00059509277344, + "loss": 0.692898178100586, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002089472021907568, + "rewards/margins": 0.000508568249642849, + "rewards/rejected": 0.0015809036558493972, + "step": 20 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.5378667116165161, + "eval_logits/rejected": 1.5691591501235962, + "eval_logps/chosen": -153.29969787597656, + "eval_logps/rejected": -147.86961364746094, + "eval_loss": 0.6933443546295166, + "eval_rewards/accuracies": 0.4659999907016754, + "eval_rewards/chosen": 0.0029440198559314013, + "eval_rewards/margins": -0.00037939148023724556, + "eval_rewards/rejected": 0.003323411336168647, + "eval_runtime": 91.2074, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 3.7776143550872803, + "learning_rate": 9.82222222222222e-07, + "logits/chosen": 1.574183702468872, + "logits/rejected": 1.7243797779083252, + "logps/chosen": -145.04452514648438, + "logps/rejected": -157.22335815429688, + "loss": 0.6925838470458985, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.006991167552769184, + "rewards/margins": 0.0011441993992775679, + "rewards/rejected": 0.005846967454999685, + "step": 30 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.5403894186019897, + "eval_logits/rejected": 1.5716638565063477, + "eval_logps/chosen": -153.24818420410156, + "eval_logps/rejected": -147.82077026367188, + "eval_loss": 0.6932132244110107, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.008096706122159958, + "eval_rewards/margins": -0.0001120842425734736, + "eval_rewards/rejected": 0.00820879079401493, + "eval_runtime": 91.1688, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 4.8561787605285645, + "learning_rate": 9.377777777777777e-07, + "logits/chosen": 1.5448085069656372, + "logits/rejected": 1.4517648220062256, + "logps/chosen": -137.44406127929688, + "logps/rejected": -146.0561065673828, + "loss": 0.6921479225158691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.012806238606572151, + "rewards/margins": 0.002016632352024317, + "rewards/rejected": 0.010789604857563972, + "step": 40 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5443732738494873, + "eval_logits/rejected": 1.575553297996521, + "eval_logps/chosen": -153.18209838867188, + "eval_logps/rejected": -147.76028442382812, + "eval_loss": 0.6929393410682678, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.014704804867506027, + "eval_rewards/margins": 0.00044921261724084616, + "eval_rewards/rejected": 0.014255593530833721, + "eval_runtime": 91.2058, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 4.834971904754639, + "learning_rate": 8.933333333333333e-07, + "logits/chosen": 1.5464718341827393, + "logits/rejected": 1.7828128337860107, + "logps/chosen": -161.72267150878906, + "logps/rejected": -148.73696899414062, + "loss": 0.6917208194732666, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.021360795944929123, + "rewards/margins": 0.002904644003137946, + "rewards/rejected": 0.018456149846315384, + "step": 50 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.5483616590499878, + "eval_logits/rejected": 1.579562783241272, + "eval_logps/chosen": -153.08152770996094, + "eval_logps/rejected": -147.66171264648438, + "eval_loss": 0.6928565502166748, + "eval_rewards/accuracies": 0.5, + "eval_rewards/chosen": 0.024760283529758453, + "eval_rewards/margins": 0.0006479774019680917, + "eval_rewards/rejected": 0.024112308397889137, + "eval_runtime": 91.2661, + "eval_samples_per_second": 5.478, + "eval_steps_per_second": 2.739, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 4.839973449707031, + "learning_rate": 8.488888888888888e-07, + "logits/chosen": 1.6227805614471436, + "logits/rejected": 1.7581478357315063, + "logps/chosen": -149.3035430908203, + "logps/rejected": -172.68246459960938, + "loss": 0.6937461853027344, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.02874893881380558, + "rewards/margins": -0.001109059201553464, + "rewards/rejected": 0.029857998713850975, + "step": 60 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.5483386516571045, + "eval_logits/rejected": 1.5794395208358765, + "eval_logps/chosen": -153.0557098388672, + "eval_logps/rejected": -147.6452178955078, + "eval_loss": 0.6923965811729431, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.02734413929283619, + "eval_rewards/margins": 0.0015814845683053136, + "eval_rewards/rejected": 0.02576265297830105, + "eval_runtime": 91.2114, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 60 + }, + { + "epoch": 0.56, + "grad_norm": 4.953029155731201, + "learning_rate": 8.044444444444444e-07, + "logits/chosen": 1.7308895587921143, + "logits/rejected": 1.7446597814559937, + "logps/chosen": -156.3403778076172, + "logps/rejected": -158.07266235351562, + "loss": 0.693772268295288, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.028584053739905357, + "rewards/margins": -0.0011613942915573716, + "rewards/rejected": 0.029745448380708694, + "step": 70 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.5475271940231323, + "eval_logits/rejected": 1.5785510540008545, + "eval_logps/chosen": -153.0685577392578, + "eval_logps/rejected": -147.6594696044922, + "eval_loss": 0.6923277974128723, + "eval_rewards/accuracies": 0.5120000243186951, + "eval_rewards/chosen": 0.026057977229356766, + "eval_rewards/margins": 0.0017198917921632528, + "eval_rewards/rejected": 0.02433808706700802, + "eval_runtime": 91.2304, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.74, + "step": 70 + }, + { + "epoch": 0.64, + "grad_norm": 5.434020042419434, + "learning_rate": 7.599999999999999e-07, + "logits/chosen": 1.5336188077926636, + "logits/rejected": 1.6496741771697998, + "logps/chosen": -139.77798461914062, + "logps/rejected": -141.71542358398438, + "loss": 0.6929593086242676, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.02469942346215248, + "rewards/margins": 0.000432062050094828, + "rewards/rejected": 0.02426736056804657, + "step": 80 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.5445293188095093, + "eval_logits/rejected": 1.575512170791626, + "eval_logps/chosen": -153.11724853515625, + "eval_logps/rejected": -147.7042694091797, + "eval_loss": 0.6925179958343506, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.02118886075913906, + "eval_rewards/margins": 0.0013303803279995918, + "eval_rewards/rejected": 0.019858481362462044, + "eval_runtime": 91.2262, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.74, + "step": 80 + }, + { + "epoch": 0.72, + "grad_norm": Infinity, + "learning_rate": 7.155555555555555e-07, + "logits/chosen": 1.6694380044937134, + "logits/rejected": 1.6528278589248657, + "logps/chosen": -150.8935089111328, + "logps/rejected": -156.09933471679688, + "loss": 0.6924211502075195, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.02094501070678234, + "rewards/margins": 0.001518061151728034, + "rewards/rejected": 0.019426951184868813, + "step": 90 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.5404151678085327, + "eval_logits/rejected": 1.571384072303772, + "eval_logps/chosen": -153.1779327392578, + "eval_logps/rejected": -147.76097106933594, + "eval_loss": 0.6927171349525452, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.015120184049010277, + "eval_rewards/margins": 0.000932602328248322, + "eval_rewards/rejected": 0.014187579974532127, + "eval_runtime": 91.2138, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 90 + }, + { + "epoch": 0.8, + "grad_norm": 4.617976665496826, + "learning_rate": 6.711111111111111e-07, + "logits/chosen": 1.5269482135772705, + "logits/rejected": 1.656995177268982, + "logps/chosen": -151.65155029296875, + "logps/rejected": -157.01368713378906, + "loss": 0.6927920341491699, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.013498455286026001, + "rewards/margins": 0.0007692074286751449, + "rewards/rejected": 0.012729247100651264, + "step": 100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.5384068489074707, + "eval_logits/rejected": 1.5692557096481323, + "eval_logps/chosen": -153.2022247314453, + "eval_logps/rejected": -147.78414916992188, + "eval_loss": 0.6927769184112549, + "eval_rewards/accuracies": 0.5040000081062317, + "eval_rewards/chosen": 0.01269307266920805, + "eval_rewards/margins": 0.0008234258857555687, + "eval_rewards/rejected": 0.011869647540152073, + "eval_runtime": 91.1817, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 100 + }, + { + "epoch": 0.88, + "grad_norm": 5.000879287719727, + "learning_rate": 6.266666666666667e-07, + "logits/chosen": 1.4631853103637695, + "logits/rejected": 1.3939930200576782, + "logps/chosen": -135.74972534179688, + "logps/rejected": -128.0284423828125, + "loss": 0.6943031311035156, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.007034818641841412, + "rewards/margins": -0.002210928127169609, + "rewards/rejected": 0.009245747700333595, + "step": 110 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.5367003679275513, + "eval_logits/rejected": 1.5675381422042847, + "eval_logps/chosen": -153.22845458984375, + "eval_logps/rejected": -147.80810546875, + "eval_loss": 0.6928940415382385, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": 0.010068614035844803, + "eval_rewards/margins": 0.0005964106530882418, + "eval_rewards/rejected": 0.009472202509641647, + "eval_runtime": 91.1595, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.742, + "step": 110 + }, + { + "epoch": 0.96, + "grad_norm": 4.761650085449219, + "learning_rate": 5.822222222222222e-07, + "logits/chosen": 1.612473726272583, + "logits/rejected": 1.707841157913208, + "logps/chosen": -140.61358642578125, + "logps/rejected": -134.6945037841797, + "loss": 0.6929790019989014, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01286519318819046, + "rewards/margins": 0.0003966711519751698, + "rewards/rejected": 0.012468521483242512, + "step": 120 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.5394755601882935, + "eval_logits/rejected": 1.5703034400939941, + "eval_logps/chosen": -153.20404052734375, + "eval_logps/rejected": -147.78768920898438, + "eval_loss": 0.692690908908844, + "eval_rewards/accuracies": 0.4880000054836273, + "eval_rewards/chosen": 0.012509736232459545, + "eval_rewards/margins": 0.0009925522608682513, + "eval_rewards/rejected": 0.011517184786498547, + "eval_runtime": 91.2364, + "eval_samples_per_second": 5.48, + "eval_steps_per_second": 2.74, + "step": 120 + }, + { + "epoch": 1.04, + "grad_norm": 3.74716854095459, + "learning_rate": 5.377777777777778e-07, + "logits/chosen": 1.6066267490386963, + "logits/rejected": 1.5948982238769531, + "logps/chosen": -139.20384216308594, + "logps/rejected": -124.77787017822266, + "loss": 0.6889309406280517, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.019050434231758118, + "rewards/margins": 0.008547642268240452, + "rewards/rejected": 0.010502791032195091, + "step": 130 + }, + { + "epoch": 1.04, + "eval_logits/chosen": 1.5407383441925049, + "eval_logits/rejected": 1.5717073678970337, + "eval_logps/chosen": -153.18846130371094, + "eval_logps/rejected": -147.76727294921875, + "eval_loss": 0.6929307579994202, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": 0.014068227261304855, + "eval_rewards/margins": 0.0005092988139949739, + "eval_rewards/rejected": 0.013558929786086082, + "eval_runtime": 91.1473, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 130 + }, + { + "epoch": 1.12, + "grad_norm": 3.819817066192627, + "learning_rate": 4.933333333333333e-07, + "logits/chosen": 1.7416331768035889, + "logits/rejected": 1.7998367547988892, + "logps/chosen": -139.13510131835938, + "logps/rejected": -132.06640625, + "loss": 0.683869743347168, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.026805799454450607, + "rewards/margins": 0.01872934028506279, + "rewards/rejected": 0.008076457306742668, + "step": 140 + }, + { + "epoch": 1.12, + "eval_logits/chosen": 1.5413432121276855, + "eval_logits/rejected": 1.5723364353179932, + "eval_logps/chosen": -153.16770935058594, + "eval_logps/rejected": -147.75100708007812, + "eval_loss": 0.6927105188369751, + "eval_rewards/accuracies": 0.4959999918937683, + "eval_rewards/chosen": 0.016144035384058952, + "eval_rewards/margins": 0.00095889694057405, + "eval_rewards/rejected": 0.015185139141976833, + "eval_runtime": 91.2001, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 140 + }, + { + "epoch": 1.2, + "grad_norm": 4.6151814460754395, + "learning_rate": 4.4888888888888885e-07, + "logits/chosen": 1.691542625427246, + "logits/rejected": 1.6202027797698975, + "logps/chosen": -141.56210327148438, + "logps/rejected": -150.57546997070312, + "loss": 0.6834945201873779, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.029597634449601173, + "rewards/margins": 0.019486434757709503, + "rewards/rejected": 0.010111198760569096, + "step": 150 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 1.542131781578064, + "eval_logits/rejected": 1.5730611085891724, + "eval_logps/chosen": -153.1396942138672, + "eval_logps/rejected": -147.72340393066406, + "eval_loss": 0.6926940679550171, + "eval_rewards/accuracies": 0.5040000081062317, + "eval_rewards/chosen": 0.018946224823594093, + "eval_rewards/margins": 0.001002778299152851, + "eval_rewards/rejected": 0.017943447455763817, + "eval_runtime": 91.2002, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 150 + }, + { + "epoch": 1.28, + "grad_norm": 4.492705821990967, + "learning_rate": 4.044444444444444e-07, + "logits/chosen": 1.4716907739639282, + "logits/rejected": 1.6224310398101807, + "logps/chosen": -137.88778686523438, + "logps/rejected": -156.01174926757812, + "loss": 0.6802837371826171, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.035625211894512177, + "rewards/margins": 0.026028087362647057, + "rewards/rejected": 0.009597120806574821, + "step": 160 + }, + { + "epoch": 1.28, + "eval_logits/chosen": 1.5431921482086182, + "eval_logits/rejected": 1.5740454196929932, + "eval_logps/chosen": -153.10897827148438, + "eval_logps/rejected": -147.69815063476562, + "eval_loss": 0.6924295425415039, + "eval_rewards/accuracies": 0.49799999594688416, + "eval_rewards/chosen": 0.02201567403972149, + "eval_rewards/margins": 0.001546767307445407, + "eval_rewards/rejected": 0.020468907430768013, + "eval_runtime": 91.1488, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 160 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 4.527454376220703, + "learning_rate": 3.6e-07, + "logits/chosen": 1.4853137731552124, + "logits/rejected": 1.6064825057983398, + "logps/chosen": -123.4184341430664, + "logps/rejected": -160.12344360351562, + "loss": 0.6809058666229248, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.032256536185741425, + "rewards/margins": 0.02484096586704254, + "rewards/rejected": 0.007415571250021458, + "step": 170 + }, + { + "epoch": 1.3599999999999999, + "eval_logits/chosen": 1.5430748462677002, + "eval_logits/rejected": 1.5740865468978882, + "eval_logps/chosen": -153.10487365722656, + "eval_logps/rejected": -147.6937255859375, + "eval_loss": 0.6924483776092529, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.022427691146731377, + "eval_rewards/margins": 0.0015162257477641106, + "eval_rewards/rejected": 0.020911462604999542, + "eval_runtime": 91.1525, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.743, + "step": 170 + }, + { + "epoch": 1.44, + "grad_norm": 4.013235092163086, + "learning_rate": 3.1555555555555554e-07, + "logits/chosen": 1.6032909154891968, + "logits/rejected": 1.6782621145248413, + "logps/chosen": -154.54408264160156, + "logps/rejected": -143.55628967285156, + "loss": 0.6827951908111572, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.034204840660095215, + "rewards/margins": 0.020985398441553116, + "rewards/rejected": 0.013219443149864674, + "step": 180 + }, + { + "epoch": 1.44, + "eval_logits/chosen": 1.5442792177200317, + "eval_logits/rejected": 1.5750905275344849, + "eval_logps/chosen": -153.09341430664062, + "eval_logps/rejected": -147.68096923828125, + "eval_loss": 0.6925176978111267, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.023571064695715904, + "eval_rewards/margins": 0.0013834317214787006, + "eval_rewards/rejected": 0.02218763343989849, + "eval_runtime": 91.1553, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.743, + "step": 180 + }, + { + "epoch": 1.52, + "grad_norm": 5.767404556274414, + "learning_rate": 2.7111111111111114e-07, + "logits/chosen": 1.623437523841858, + "logits/rejected": 1.6042674779891968, + "logps/chosen": -153.09120178222656, + "logps/rejected": -147.37147521972656, + "loss": 0.6806243896484375, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.032211557030677795, + "rewards/margins": 0.025478612631559372, + "rewards/rejected": 0.006732943467795849, + "step": 190 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 1.5439079999923706, + "eval_logits/rejected": 1.5748034715652466, + "eval_logps/chosen": -153.10072326660156, + "eval_logps/rejected": -147.69134521484375, + "eval_loss": 0.6923675537109375, + "eval_rewards/accuracies": 0.5120000243186951, + "eval_rewards/chosen": 0.022842643782496452, + "eval_rewards/margins": 0.0016915848245844245, + "eval_rewards/rejected": 0.021151060238480568, + "eval_runtime": 91.1994, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 190 + }, + { + "epoch": 1.6, + "grad_norm": 3.8343236446380615, + "learning_rate": 2.2666666666666663e-07, + "logits/chosen": 1.6447292566299438, + "logits/rejected": 1.744441032409668, + "logps/chosen": -174.34786987304688, + "logps/rejected": -151.64242553710938, + "loss": 0.6813366413116455, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.03669542819261551, + "rewards/margins": 0.02391393855214119, + "rewards/rejected": 0.012781488709151745, + "step": 200 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 1.544162631034851, + "eval_logits/rejected": 1.5750305652618408, + "eval_logps/chosen": -153.08941650390625, + "eval_logps/rejected": -147.67959594726562, + "eval_loss": 0.6923924088478088, + "eval_rewards/accuracies": 0.5019999742507935, + "eval_rewards/chosen": 0.023971842601895332, + "eval_rewards/margins": 0.0016475645825266838, + "eval_rewards/rejected": 0.022324278950691223, + "eval_runtime": 91.1391, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 200 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 5.134785175323486, + "learning_rate": 1.8222222222222223e-07, + "logits/chosen": 1.5754644870758057, + "logits/rejected": 1.6172332763671875, + "logps/chosen": -136.99713134765625, + "logps/rejected": -135.82577514648438, + "loss": 0.6822806358337402, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.0363207571208477, + "rewards/margins": 0.022009989246726036, + "rewards/rejected": 0.014310772530734539, + "step": 210 + }, + { + "epoch": 1.6800000000000002, + "eval_logits/chosen": 1.5441726446151733, + "eval_logits/rejected": 1.5750277042388916, + "eval_logps/chosen": -153.08071899414062, + "eval_logps/rejected": -147.67050170898438, + "eval_loss": 0.6924174427986145, + "eval_rewards/accuracies": 0.5080000162124634, + "eval_rewards/chosen": 0.024840930476784706, + "eval_rewards/margins": 0.0016068487893790007, + "eval_rewards/rejected": 0.023234082385897636, + "eval_runtime": 91.2024, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 210 + }, + { + "epoch": 1.76, + "grad_norm": 3.7789998054504395, + "learning_rate": 1.3777777777777778e-07, + "logits/chosen": 1.7328720092773438, + "logits/rejected": 1.736034631729126, + "logps/chosen": -153.88792419433594, + "logps/rejected": -158.17237854003906, + "loss": 0.6818790435791016, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035219158977270126, + "rewards/margins": 0.022797629237174988, + "rewards/rejected": 0.012421531602740288, + "step": 220 + }, + { + "epoch": 1.76, + "eval_logits/chosen": 1.5447088479995728, + "eval_logits/rejected": 1.5754626989364624, + "eval_logps/chosen": -153.07162475585938, + "eval_logps/rejected": -147.66404724121094, + "eval_loss": 0.6922869682312012, + "eval_rewards/accuracies": 0.5099999904632568, + "eval_rewards/chosen": 0.025751180946826935, + "eval_rewards/margins": 0.0018704932881519198, + "eval_rewards/rejected": 0.02388068474829197, + "eval_runtime": 91.223, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.741, + "step": 220 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 3.6572909355163574, + "learning_rate": 9.333333333333334e-08, + "logits/chosen": 1.492131233215332, + "logits/rejected": 1.4000813961029053, + "logps/chosen": -137.4458770751953, + "logps/rejected": -131.2755584716797, + "loss": 0.6839917659759521, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.035144560039043427, + "rewards/margins": 0.018620768561959267, + "rewards/rejected": 0.01652379147708416, + "step": 230 + }, + { + "epoch": 1.8399999999999999, + "eval_logits/chosen": 1.544948935508728, + "eval_logits/rejected": 1.5758661031723022, + "eval_logps/chosen": -153.06626892089844, + "eval_logps/rejected": -147.65304565429688, + "eval_loss": 0.6925700902938843, + "eval_rewards/accuracies": 0.492000013589859, + "eval_rewards/chosen": 0.026287397369742393, + "eval_rewards/margins": 0.0013078682823106647, + "eval_rewards/rejected": 0.024979526177048683, + "eval_runtime": 91.1532, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.743, + "step": 230 + }, + { + "epoch": 1.92, + "grad_norm": 3.7037456035614014, + "learning_rate": 4.888888888888889e-08, + "logits/chosen": 1.7049148082733154, + "logits/rejected": 1.8360675573349, + "logps/chosen": -142.6393280029297, + "logps/rejected": -149.64599609375, + "loss": 0.6830495834350586, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.03778035193681717, + "rewards/margins": 0.020435160025954247, + "rewards/rejected": 0.017345190048217773, + "step": 240 + }, + { + "epoch": 1.92, + "eval_logits/chosen": 1.5447055101394653, + "eval_logits/rejected": 1.5756163597106934, + "eval_logps/chosen": -153.0759735107422, + "eval_logps/rejected": -147.6631622314453, + "eval_loss": 0.6925495862960815, + "eval_rewards/accuracies": 0.49799999594688416, + "eval_rewards/chosen": 0.025318272411823273, + "eval_rewards/margins": 0.0013493854785338044, + "eval_rewards/rejected": 0.023968886584043503, + "eval_runtime": 91.1717, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 240 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-240/training_args.bin b/v5/DPO/DPO_1k/lora/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3e8b84f5a426c49e45b1af860330babc4d518a29 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab57d46930a716685750bae4e83770b275a25633ed865bb4a8a708a9b1519f5 +size 6161 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/README.md b/v5/DPO/DPO_1k/lora/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/adapter_config.json b/v5/DPO/DPO_1k/lora/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..51de60011de8c53a3622d49c2b13f6ead1f71e90 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "o_proj", + "gate_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/adapter_model.safetensors b/v5/DPO/DPO_1k/lora/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20ce212d5e404b971f29c3ec83c1f704e4164903 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52edcdc392af8dbe11d53d838ee5110a8a78a9d9edb52dc2e752ab15cc474b65 +size 180385008 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/chat_template.jinja b/v5/DPO/DPO_1k/lora/checkpoint-250/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/optimizer.pt b/v5/DPO/DPO_1k/lora/checkpoint-250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bea0ca2fc013c43cd71a66c0ba3a926c89961961 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3086e7ad2490f77798cb691df8a62aaa211285dabde77ec5b11c5a35d1300f0a +size 360902475 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/rng_state.pth b/v5/DPO/DPO_1k/lora/checkpoint-250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..68c0411dd375a388cbc8c58bea912cb904778ab8 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1786ad2057a678cc204dadc7fc5d1a4f939be477df219f770c7d40e9270281 +size 14645 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/scaler.pt b/v5/DPO/DPO_1k/lora/checkpoint-250/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..68659f39c4f88d1b5253dd302793916e92350e45 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb547c94abc5a4f322bc71734536e7309a72044f721aac24196aeb3c52f27927 +size 1383 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/scheduler.pt b/v5/DPO/DPO_1k/lora/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c387f17359656becf35803ddf7be25bfecd4b131 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c52ccf6b82b9be2bc8b1bba903d7058c74f2f80e62e76a2c4fab50d59f2a4eb9 +size 1465 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/tokenizer.json b/v5/DPO/DPO_1k/lora/checkpoint-250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/tokenizer_config.json b/v5/DPO/DPO_1k/lora/checkpoint-250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/trainer_state.json b/v5/DPO/DPO_1k/lora/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..32d428da5be974065657795d204804891cf795b7 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/trainer_state.json @@ -0,0 +1,809 @@ +{ + "best_global_step": 60, + "best_metric": 0.5139999985694885, + "best_model_checkpoint": "output/lora/checkpoint-60", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 4.44759464263916, + "learning_rate": 3.6e-07, + "logits/chosen": 1.7029424905776978, + "logits/rejected": 1.5582908391952515, + "logps/chosen": -138.26773071289062, + "logps/rejected": -131.23216247558594, + "loss": 0.6930624485015869, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.0004225396551191807, + "rewards/margins": 0.0001746034249663353, + "rewards/rejected": 0.0002479362883605063, + "step": 10 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.537629246711731, + "eval_logits/rejected": 1.5689224004745483, + "eval_logps/chosen": -153.32151794433594, + "eval_logps/rejected": -147.88926696777344, + "eval_loss": 0.6934499144554138, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.000762743002269417, + "eval_rewards/margins": -0.0005942528950981796, + "eval_rewards/rejected": 0.001356995664536953, + "eval_runtime": 89.836, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 2.783, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 4.250877857208252, + "learning_rate": 7.599999999999999e-07, + "logits/chosen": 1.5865893363952637, + "logits/rejected": 1.6983562707901, + "logps/chosen": -146.0574188232422, + "logps/rejected": -146.00059509277344, + "loss": 0.692898178100586, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002089472021907568, + "rewards/margins": 0.000508568249642849, + "rewards/rejected": 0.0015809036558493972, + "step": 20 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.5378667116165161, + "eval_logits/rejected": 1.5691591501235962, + "eval_logps/chosen": -153.29969787597656, + "eval_logps/rejected": -147.86961364746094, + "eval_loss": 0.6933443546295166, + "eval_rewards/accuracies": 0.4659999907016754, + "eval_rewards/chosen": 0.0029440198559314013, + "eval_rewards/margins": -0.00037939148023724556, + "eval_rewards/rejected": 0.003323411336168647, + "eval_runtime": 91.2074, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 3.7776143550872803, + "learning_rate": 9.82222222222222e-07, + "logits/chosen": 1.574183702468872, + "logits/rejected": 1.7243797779083252, + "logps/chosen": -145.04452514648438, + "logps/rejected": -157.22335815429688, + "loss": 0.6925838470458985, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.006991167552769184, + "rewards/margins": 0.0011441993992775679, + "rewards/rejected": 0.005846967454999685, + "step": 30 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.5403894186019897, + "eval_logits/rejected": 1.5716638565063477, + "eval_logps/chosen": -153.24818420410156, + "eval_logps/rejected": -147.82077026367188, + "eval_loss": 0.6932132244110107, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.008096706122159958, + "eval_rewards/margins": -0.0001120842425734736, + "eval_rewards/rejected": 0.00820879079401493, + "eval_runtime": 91.1688, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 4.8561787605285645, + "learning_rate": 9.377777777777777e-07, + "logits/chosen": 1.5448085069656372, + "logits/rejected": 1.4517648220062256, + "logps/chosen": -137.44406127929688, + "logps/rejected": -146.0561065673828, + "loss": 0.6921479225158691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.012806238606572151, + "rewards/margins": 0.002016632352024317, + "rewards/rejected": 0.010789604857563972, + "step": 40 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5443732738494873, + "eval_logits/rejected": 1.575553297996521, + "eval_logps/chosen": -153.18209838867188, + "eval_logps/rejected": -147.76028442382812, + "eval_loss": 0.6929393410682678, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.014704804867506027, + "eval_rewards/margins": 0.00044921261724084616, + "eval_rewards/rejected": 0.014255593530833721, + "eval_runtime": 91.2058, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 4.834971904754639, + "learning_rate": 8.933333333333333e-07, + "logits/chosen": 1.5464718341827393, + "logits/rejected": 1.7828128337860107, + "logps/chosen": -161.72267150878906, + "logps/rejected": -148.73696899414062, + "loss": 0.6917208194732666, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.021360795944929123, + "rewards/margins": 0.002904644003137946, + "rewards/rejected": 0.018456149846315384, + "step": 50 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.5483616590499878, + "eval_logits/rejected": 1.579562783241272, + "eval_logps/chosen": -153.08152770996094, + "eval_logps/rejected": -147.66171264648438, + "eval_loss": 0.6928565502166748, + "eval_rewards/accuracies": 0.5, + "eval_rewards/chosen": 0.024760283529758453, + "eval_rewards/margins": 0.0006479774019680917, + "eval_rewards/rejected": 0.024112308397889137, + "eval_runtime": 91.2661, + "eval_samples_per_second": 5.478, + "eval_steps_per_second": 2.739, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 4.839973449707031, + "learning_rate": 8.488888888888888e-07, + "logits/chosen": 1.6227805614471436, + "logits/rejected": 1.7581478357315063, + "logps/chosen": -149.3035430908203, + "logps/rejected": -172.68246459960938, + "loss": 0.6937461853027344, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.02874893881380558, + "rewards/margins": -0.001109059201553464, + "rewards/rejected": 0.029857998713850975, + "step": 60 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.5483386516571045, + "eval_logits/rejected": 1.5794395208358765, + "eval_logps/chosen": -153.0557098388672, + "eval_logps/rejected": -147.6452178955078, + "eval_loss": 0.6923965811729431, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.02734413929283619, + "eval_rewards/margins": 0.0015814845683053136, + "eval_rewards/rejected": 0.02576265297830105, + "eval_runtime": 91.2114, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 60 + }, + { + "epoch": 0.56, + "grad_norm": 4.953029155731201, + "learning_rate": 8.044444444444444e-07, + "logits/chosen": 1.7308895587921143, + "logits/rejected": 1.7446597814559937, + "logps/chosen": -156.3403778076172, + "logps/rejected": -158.07266235351562, + "loss": 0.693772268295288, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.028584053739905357, + "rewards/margins": -0.0011613942915573716, + "rewards/rejected": 0.029745448380708694, + "step": 70 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.5475271940231323, + "eval_logits/rejected": 1.5785510540008545, + "eval_logps/chosen": -153.0685577392578, + "eval_logps/rejected": -147.6594696044922, + "eval_loss": 0.6923277974128723, + "eval_rewards/accuracies": 0.5120000243186951, + "eval_rewards/chosen": 0.026057977229356766, + "eval_rewards/margins": 0.0017198917921632528, + "eval_rewards/rejected": 0.02433808706700802, + "eval_runtime": 91.2304, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.74, + "step": 70 + }, + { + "epoch": 0.64, + "grad_norm": 5.434020042419434, + "learning_rate": 7.599999999999999e-07, + "logits/chosen": 1.5336188077926636, + "logits/rejected": 1.6496741771697998, + "logps/chosen": -139.77798461914062, + "logps/rejected": -141.71542358398438, + "loss": 0.6929593086242676, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.02469942346215248, + "rewards/margins": 0.000432062050094828, + "rewards/rejected": 0.02426736056804657, + "step": 80 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.5445293188095093, + "eval_logits/rejected": 1.575512170791626, + "eval_logps/chosen": -153.11724853515625, + "eval_logps/rejected": -147.7042694091797, + "eval_loss": 0.6925179958343506, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.02118886075913906, + "eval_rewards/margins": 0.0013303803279995918, + "eval_rewards/rejected": 0.019858481362462044, + "eval_runtime": 91.2262, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.74, + "step": 80 + }, + { + "epoch": 0.72, + "grad_norm": Infinity, + "learning_rate": 7.155555555555555e-07, + "logits/chosen": 1.6694380044937134, + "logits/rejected": 1.6528278589248657, + "logps/chosen": -150.8935089111328, + "logps/rejected": -156.09933471679688, + "loss": 0.6924211502075195, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.02094501070678234, + "rewards/margins": 0.001518061151728034, + "rewards/rejected": 0.019426951184868813, + "step": 90 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.5404151678085327, + "eval_logits/rejected": 1.571384072303772, + "eval_logps/chosen": -153.1779327392578, + "eval_logps/rejected": -147.76097106933594, + "eval_loss": 0.6927171349525452, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.015120184049010277, + "eval_rewards/margins": 0.000932602328248322, + "eval_rewards/rejected": 0.014187579974532127, + "eval_runtime": 91.2138, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 90 + }, + { + "epoch": 0.8, + "grad_norm": 4.617976665496826, + "learning_rate": 6.711111111111111e-07, + "logits/chosen": 1.5269482135772705, + "logits/rejected": 1.656995177268982, + "logps/chosen": -151.65155029296875, + "logps/rejected": -157.01368713378906, + "loss": 0.6927920341491699, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.013498455286026001, + "rewards/margins": 0.0007692074286751449, + "rewards/rejected": 0.012729247100651264, + "step": 100 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.5384068489074707, + "eval_logits/rejected": 1.5692557096481323, + "eval_logps/chosen": -153.2022247314453, + "eval_logps/rejected": -147.78414916992188, + "eval_loss": 0.6927769184112549, + "eval_rewards/accuracies": 0.5040000081062317, + "eval_rewards/chosen": 0.01269307266920805, + "eval_rewards/margins": 0.0008234258857555687, + "eval_rewards/rejected": 0.011869647540152073, + "eval_runtime": 91.1817, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 100 + }, + { + "epoch": 0.88, + "grad_norm": 5.000879287719727, + "learning_rate": 6.266666666666667e-07, + "logits/chosen": 1.4631853103637695, + "logits/rejected": 1.3939930200576782, + "logps/chosen": -135.74972534179688, + "logps/rejected": -128.0284423828125, + "loss": 0.6943031311035156, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.007034818641841412, + "rewards/margins": -0.002210928127169609, + "rewards/rejected": 0.009245747700333595, + "step": 110 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.5367003679275513, + "eval_logits/rejected": 1.5675381422042847, + "eval_logps/chosen": -153.22845458984375, + "eval_logps/rejected": -147.80810546875, + "eval_loss": 0.6928940415382385, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": 0.010068614035844803, + "eval_rewards/margins": 0.0005964106530882418, + "eval_rewards/rejected": 0.009472202509641647, + "eval_runtime": 91.1595, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.742, + "step": 110 + }, + { + "epoch": 0.96, + "grad_norm": 4.761650085449219, + "learning_rate": 5.822222222222222e-07, + "logits/chosen": 1.612473726272583, + "logits/rejected": 1.707841157913208, + "logps/chosen": -140.61358642578125, + "logps/rejected": -134.6945037841797, + "loss": 0.6929790019989014, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01286519318819046, + "rewards/margins": 0.0003966711519751698, + "rewards/rejected": 0.012468521483242512, + "step": 120 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.5394755601882935, + "eval_logits/rejected": 1.5703034400939941, + "eval_logps/chosen": -153.20404052734375, + "eval_logps/rejected": -147.78768920898438, + "eval_loss": 0.692690908908844, + "eval_rewards/accuracies": 0.4880000054836273, + "eval_rewards/chosen": 0.012509736232459545, + "eval_rewards/margins": 0.0009925522608682513, + "eval_rewards/rejected": 0.011517184786498547, + "eval_runtime": 91.2364, + "eval_samples_per_second": 5.48, + "eval_steps_per_second": 2.74, + "step": 120 + }, + { + "epoch": 1.04, + "grad_norm": 3.74716854095459, + "learning_rate": 5.377777777777778e-07, + "logits/chosen": 1.6066267490386963, + "logits/rejected": 1.5948982238769531, + "logps/chosen": -139.20384216308594, + "logps/rejected": -124.77787017822266, + "loss": 0.6889309406280517, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.019050434231758118, + "rewards/margins": 0.008547642268240452, + "rewards/rejected": 0.010502791032195091, + "step": 130 + }, + { + "epoch": 1.04, + "eval_logits/chosen": 1.5407383441925049, + "eval_logits/rejected": 1.5717073678970337, + "eval_logps/chosen": -153.18846130371094, + "eval_logps/rejected": -147.76727294921875, + "eval_loss": 0.6929307579994202, + "eval_rewards/accuracies": 0.4699999988079071, + "eval_rewards/chosen": 0.014068227261304855, + "eval_rewards/margins": 0.0005092988139949739, + "eval_rewards/rejected": 0.013558929786086082, + "eval_runtime": 91.1473, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 130 + }, + { + "epoch": 1.12, + "grad_norm": 3.819817066192627, + "learning_rate": 4.933333333333333e-07, + "logits/chosen": 1.7416331768035889, + "logits/rejected": 1.7998367547988892, + "logps/chosen": -139.13510131835938, + "logps/rejected": -132.06640625, + "loss": 0.683869743347168, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.026805799454450607, + "rewards/margins": 0.01872934028506279, + "rewards/rejected": 0.008076457306742668, + "step": 140 + }, + { + "epoch": 1.12, + "eval_logits/chosen": 1.5413432121276855, + "eval_logits/rejected": 1.5723364353179932, + "eval_logps/chosen": -153.16770935058594, + "eval_logps/rejected": -147.75100708007812, + "eval_loss": 0.6927105188369751, + "eval_rewards/accuracies": 0.4959999918937683, + "eval_rewards/chosen": 0.016144035384058952, + "eval_rewards/margins": 0.00095889694057405, + "eval_rewards/rejected": 0.015185139141976833, + "eval_runtime": 91.2001, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 140 + }, + { + "epoch": 1.2, + "grad_norm": 4.6151814460754395, + "learning_rate": 4.4888888888888885e-07, + "logits/chosen": 1.691542625427246, + "logits/rejected": 1.6202027797698975, + "logps/chosen": -141.56210327148438, + "logps/rejected": -150.57546997070312, + "loss": 0.6834945201873779, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.029597634449601173, + "rewards/margins": 0.019486434757709503, + "rewards/rejected": 0.010111198760569096, + "step": 150 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 1.542131781578064, + "eval_logits/rejected": 1.5730611085891724, + "eval_logps/chosen": -153.1396942138672, + "eval_logps/rejected": -147.72340393066406, + "eval_loss": 0.6926940679550171, + "eval_rewards/accuracies": 0.5040000081062317, + "eval_rewards/chosen": 0.018946224823594093, + "eval_rewards/margins": 0.001002778299152851, + "eval_rewards/rejected": 0.017943447455763817, + "eval_runtime": 91.2002, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 150 + }, + { + "epoch": 1.28, + "grad_norm": 4.492705821990967, + "learning_rate": 4.044444444444444e-07, + "logits/chosen": 1.4716907739639282, + "logits/rejected": 1.6224310398101807, + "logps/chosen": -137.88778686523438, + "logps/rejected": -156.01174926757812, + "loss": 0.6802837371826171, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.035625211894512177, + "rewards/margins": 0.026028087362647057, + "rewards/rejected": 0.009597120806574821, + "step": 160 + }, + { + "epoch": 1.28, + "eval_logits/chosen": 1.5431921482086182, + "eval_logits/rejected": 1.5740454196929932, + "eval_logps/chosen": -153.10897827148438, + "eval_logps/rejected": -147.69815063476562, + "eval_loss": 0.6924295425415039, + "eval_rewards/accuracies": 0.49799999594688416, + "eval_rewards/chosen": 0.02201567403972149, + "eval_rewards/margins": 0.001546767307445407, + "eval_rewards/rejected": 0.020468907430768013, + "eval_runtime": 91.1488, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 160 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 4.527454376220703, + "learning_rate": 3.6e-07, + "logits/chosen": 1.4853137731552124, + "logits/rejected": 1.6064825057983398, + "logps/chosen": -123.4184341430664, + "logps/rejected": -160.12344360351562, + "loss": 0.6809058666229248, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.032256536185741425, + "rewards/margins": 0.02484096586704254, + "rewards/rejected": 0.007415571250021458, + "step": 170 + }, + { + "epoch": 1.3599999999999999, + "eval_logits/chosen": 1.5430748462677002, + "eval_logits/rejected": 1.5740865468978882, + "eval_logps/chosen": -153.10487365722656, + "eval_logps/rejected": -147.6937255859375, + "eval_loss": 0.6924483776092529, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.022427691146731377, + "eval_rewards/margins": 0.0015162257477641106, + "eval_rewards/rejected": 0.020911462604999542, + "eval_runtime": 91.1525, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.743, + "step": 170 + }, + { + "epoch": 1.44, + "grad_norm": 4.013235092163086, + "learning_rate": 3.1555555555555554e-07, + "logits/chosen": 1.6032909154891968, + "logits/rejected": 1.6782621145248413, + "logps/chosen": -154.54408264160156, + "logps/rejected": -143.55628967285156, + "loss": 0.6827951908111572, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.034204840660095215, + "rewards/margins": 0.020985398441553116, + "rewards/rejected": 0.013219443149864674, + "step": 180 + }, + { + "epoch": 1.44, + "eval_logits/chosen": 1.5442792177200317, + "eval_logits/rejected": 1.5750905275344849, + "eval_logps/chosen": -153.09341430664062, + "eval_logps/rejected": -147.68096923828125, + "eval_loss": 0.6925176978111267, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.023571064695715904, + "eval_rewards/margins": 0.0013834317214787006, + "eval_rewards/rejected": 0.02218763343989849, + "eval_runtime": 91.1553, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.743, + "step": 180 + }, + { + "epoch": 1.52, + "grad_norm": 5.767404556274414, + "learning_rate": 2.7111111111111114e-07, + "logits/chosen": 1.623437523841858, + "logits/rejected": 1.6042674779891968, + "logps/chosen": -153.09120178222656, + "logps/rejected": -147.37147521972656, + "loss": 0.6806243896484375, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.032211557030677795, + "rewards/margins": 0.025478612631559372, + "rewards/rejected": 0.006732943467795849, + "step": 190 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 1.5439079999923706, + "eval_logits/rejected": 1.5748034715652466, + "eval_logps/chosen": -153.10072326660156, + "eval_logps/rejected": -147.69134521484375, + "eval_loss": 0.6923675537109375, + "eval_rewards/accuracies": 0.5120000243186951, + "eval_rewards/chosen": 0.022842643782496452, + "eval_rewards/margins": 0.0016915848245844245, + "eval_rewards/rejected": 0.021151060238480568, + "eval_runtime": 91.1994, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 190 + }, + { + "epoch": 1.6, + "grad_norm": 3.8343236446380615, + "learning_rate": 2.2666666666666663e-07, + "logits/chosen": 1.6447292566299438, + "logits/rejected": 1.744441032409668, + "logps/chosen": -174.34786987304688, + "logps/rejected": -151.64242553710938, + "loss": 0.6813366413116455, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.03669542819261551, + "rewards/margins": 0.02391393855214119, + "rewards/rejected": 0.012781488709151745, + "step": 200 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 1.544162631034851, + "eval_logits/rejected": 1.5750305652618408, + "eval_logps/chosen": -153.08941650390625, + "eval_logps/rejected": -147.67959594726562, + "eval_loss": 0.6923924088478088, + "eval_rewards/accuracies": 0.5019999742507935, + "eval_rewards/chosen": 0.023971842601895332, + "eval_rewards/margins": 0.0016475645825266838, + "eval_rewards/rejected": 0.022324278950691223, + "eval_runtime": 91.1391, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 200 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 5.134785175323486, + "learning_rate": 1.8222222222222223e-07, + "logits/chosen": 1.5754644870758057, + "logits/rejected": 1.6172332763671875, + "logps/chosen": -136.99713134765625, + "logps/rejected": -135.82577514648438, + "loss": 0.6822806358337402, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.0363207571208477, + "rewards/margins": 0.022009989246726036, + "rewards/rejected": 0.014310772530734539, + "step": 210 + }, + { + "epoch": 1.6800000000000002, + "eval_logits/chosen": 1.5441726446151733, + "eval_logits/rejected": 1.5750277042388916, + "eval_logps/chosen": -153.08071899414062, + "eval_logps/rejected": -147.67050170898438, + "eval_loss": 0.6924174427986145, + "eval_rewards/accuracies": 0.5080000162124634, + "eval_rewards/chosen": 0.024840930476784706, + "eval_rewards/margins": 0.0016068487893790007, + "eval_rewards/rejected": 0.023234082385897636, + "eval_runtime": 91.2024, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 210 + }, + { + "epoch": 1.76, + "grad_norm": 3.7789998054504395, + "learning_rate": 1.3777777777777778e-07, + "logits/chosen": 1.7328720092773438, + "logits/rejected": 1.736034631729126, + "logps/chosen": -153.88792419433594, + "logps/rejected": -158.17237854003906, + "loss": 0.6818790435791016, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.035219158977270126, + "rewards/margins": 0.022797629237174988, + "rewards/rejected": 0.012421531602740288, + "step": 220 + }, + { + "epoch": 1.76, + "eval_logits/chosen": 1.5447088479995728, + "eval_logits/rejected": 1.5754626989364624, + "eval_logps/chosen": -153.07162475585938, + "eval_logps/rejected": -147.66404724121094, + "eval_loss": 0.6922869682312012, + "eval_rewards/accuracies": 0.5099999904632568, + "eval_rewards/chosen": 0.025751180946826935, + "eval_rewards/margins": 0.0018704932881519198, + "eval_rewards/rejected": 0.02388068474829197, + "eval_runtime": 91.223, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.741, + "step": 220 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 3.6572909355163574, + "learning_rate": 9.333333333333334e-08, + "logits/chosen": 1.492131233215332, + "logits/rejected": 1.4000813961029053, + "logps/chosen": -137.4458770751953, + "logps/rejected": -131.2755584716797, + "loss": 0.6839917659759521, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.035144560039043427, + "rewards/margins": 0.018620768561959267, + "rewards/rejected": 0.01652379147708416, + "step": 230 + }, + { + "epoch": 1.8399999999999999, + "eval_logits/chosen": 1.544948935508728, + "eval_logits/rejected": 1.5758661031723022, + "eval_logps/chosen": -153.06626892089844, + "eval_logps/rejected": -147.65304565429688, + "eval_loss": 0.6925700902938843, + "eval_rewards/accuracies": 0.492000013589859, + "eval_rewards/chosen": 0.026287397369742393, + "eval_rewards/margins": 0.0013078682823106647, + "eval_rewards/rejected": 0.024979526177048683, + "eval_runtime": 91.1532, + "eval_samples_per_second": 5.485, + "eval_steps_per_second": 2.743, + "step": 230 + }, + { + "epoch": 1.92, + "grad_norm": 3.7037456035614014, + "learning_rate": 4.888888888888889e-08, + "logits/chosen": 1.7049148082733154, + "logits/rejected": 1.8360675573349, + "logps/chosen": -142.6393280029297, + "logps/rejected": -149.64599609375, + "loss": 0.6830495834350586, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.03778035193681717, + "rewards/margins": 0.020435160025954247, + "rewards/rejected": 0.017345190048217773, + "step": 240 + }, + { + "epoch": 1.92, + "eval_logits/chosen": 1.5447055101394653, + "eval_logits/rejected": 1.5756163597106934, + "eval_logps/chosen": -153.0759735107422, + "eval_logps/rejected": -147.6631622314453, + "eval_loss": 0.6925495862960815, + "eval_rewards/accuracies": 0.49799999594688416, + "eval_rewards/chosen": 0.025318272411823273, + "eval_rewards/margins": 0.0013493854785338044, + "eval_rewards/rejected": 0.023968886584043503, + "eval_runtime": 91.1717, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 240 + }, + { + "epoch": 2.0, + "grad_norm": 3.520871162414551, + "learning_rate": 4.444444444444444e-09, + "logits/chosen": 1.5210078954696655, + "logits/rejected": 1.6691532135009766, + "logps/chosen": -143.4422149658203, + "logps/rejected": -163.37901306152344, + "loss": 0.68167724609375, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.04191447049379349, + "rewards/margins": 0.023286914452910423, + "rewards/rejected": 0.018627556040883064, + "step": 250 + }, + { + "epoch": 2.0, + "eval_logits/chosen": 1.5446418523788452, + "eval_logits/rejected": 1.5754319429397583, + "eval_logps/chosen": -153.07318115234375, + "eval_logps/rejected": -147.66468811035156, + "eval_loss": 0.692334771156311, + "eval_rewards/accuracies": 0.47999998927116394, + "eval_rewards/chosen": 0.025596633553504944, + "eval_rewards/margins": 0.001780977239832282, + "eval_rewards/rejected": 0.02381565421819687, + "eval_runtime": 91.17, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 250 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-250/training_args.bin b/v5/DPO/DPO_1k/lora/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3e8b84f5a426c49e45b1af860330babc4d518a29 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab57d46930a716685750bae4e83770b275a25633ed865bb4a8a708a9b1519f5 +size 6161 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/README.md b/v5/DPO/DPO_1k/lora/checkpoint-60/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/adapter_config.json b/v5/DPO/DPO_1k/lora/checkpoint-60/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..51de60011de8c53a3622d49c2b13f6ead1f71e90 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "up_proj", + "q_proj", + "down_proj", + "o_proj", + "gate_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/adapter_model.safetensors b/v5/DPO/DPO_1k/lora/checkpoint-60/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5e8b8f5cc12190e5d65d64650e503f45c30cfc9f --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd39ce893511c42bb66fcc45902600954d24ae57a8646fa31ae8e964732c757a +size 180385008 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/chat_template.jinja b/v5/DPO/DPO_1k/lora/checkpoint-60/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/optimizer.pt b/v5/DPO/DPO_1k/lora/checkpoint-60/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b058f826241486d68d7312b3e00681c287bb390 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2998a5c9d26731c3db0bcb594dc483be22ed1eb6e2d06ee22825865969c8e8c6 +size 360902475 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/rng_state.pth b/v5/DPO/DPO_1k/lora/checkpoint-60/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a787d39b15181d020a94083fddcfaff5ca9eaeca --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480389ce7f683504c393112df2c8045b3bbba2e7bfbed923d3dbd1ed09e2f087 +size 14645 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/scaler.pt b/v5/DPO/DPO_1k/lora/checkpoint-60/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b6fc79a8e7d247a9de49e04221d1d69711f7d38 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c146ae1cf47c9929c1f0cc98e903ce1070f0c3ea64421f26971b053d42844b7 +size 1383 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/scheduler.pt b/v5/DPO/DPO_1k/lora/checkpoint-60/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff1e79a35a9446e871cff9a585244471042af5f4 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3c847e08808d5191a68d394c035f606d08d03f39f90970ab74bbb024c1403c2 +size 1465 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/tokenizer.json b/v5/DPO/DPO_1k/lora/checkpoint-60/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/tokenizer_config.json b/v5/DPO/DPO_1k/lora/checkpoint-60/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/trainer_state.json b/v5/DPO/DPO_1k/lora/checkpoint-60/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c092da5c09aa22cd1ec48c422e6035ce8a136ac8 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/trainer_state.json @@ -0,0 +1,220 @@ +{ + "best_global_step": 60, + "best_metric": 0.5139999985694885, + "best_model_checkpoint": "output/lora/checkpoint-60", + "epoch": 0.48, + "eval_steps": 10, + "global_step": 60, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 4.44759464263916, + "learning_rate": 3.6e-07, + "logits/chosen": 1.7029424905776978, + "logits/rejected": 1.5582908391952515, + "logps/chosen": -138.26773071289062, + "logps/rejected": -131.23216247558594, + "loss": 0.6930624485015869, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.0004225396551191807, + "rewards/margins": 0.0001746034249663353, + "rewards/rejected": 0.0002479362883605063, + "step": 10 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.537629246711731, + "eval_logits/rejected": 1.5689224004745483, + "eval_logps/chosen": -153.32151794433594, + "eval_logps/rejected": -147.88926696777344, + "eval_loss": 0.6934499144554138, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.000762743002269417, + "eval_rewards/margins": -0.0005942528950981796, + "eval_rewards/rejected": 0.001356995664536953, + "eval_runtime": 89.836, + "eval_samples_per_second": 5.566, + "eval_steps_per_second": 2.783, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 4.250877857208252, + "learning_rate": 7.599999999999999e-07, + "logits/chosen": 1.5865893363952637, + "logits/rejected": 1.6983562707901, + "logps/chosen": -146.0574188232422, + "logps/rejected": -146.00059509277344, + "loss": 0.692898178100586, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.002089472021907568, + "rewards/margins": 0.000508568249642849, + "rewards/rejected": 0.0015809036558493972, + "step": 20 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.5378667116165161, + "eval_logits/rejected": 1.5691591501235962, + "eval_logps/chosen": -153.29969787597656, + "eval_logps/rejected": -147.86961364746094, + "eval_loss": 0.6933443546295166, + "eval_rewards/accuracies": 0.4659999907016754, + "eval_rewards/chosen": 0.0029440198559314013, + "eval_rewards/margins": -0.00037939148023724556, + "eval_rewards/rejected": 0.003323411336168647, + "eval_runtime": 91.2074, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 3.7776143550872803, + "learning_rate": 9.82222222222222e-07, + "logits/chosen": 1.574183702468872, + "logits/rejected": 1.7243797779083252, + "logps/chosen": -145.04452514648438, + "logps/rejected": -157.22335815429688, + "loss": 0.6925838470458985, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.006991167552769184, + "rewards/margins": 0.0011441993992775679, + "rewards/rejected": 0.005846967454999685, + "step": 30 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.5403894186019897, + "eval_logits/rejected": 1.5716638565063477, + "eval_logps/chosen": -153.24818420410156, + "eval_logps/rejected": -147.82077026367188, + "eval_loss": 0.6932132244110107, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.008096706122159958, + "eval_rewards/margins": -0.0001120842425734736, + "eval_rewards/rejected": 0.00820879079401493, + "eval_runtime": 91.1688, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 4.8561787605285645, + "learning_rate": 9.377777777777777e-07, + "logits/chosen": 1.5448085069656372, + "logits/rejected": 1.4517648220062256, + "logps/chosen": -137.44406127929688, + "logps/rejected": -146.0561065673828, + "loss": 0.6921479225158691, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.012806238606572151, + "rewards/margins": 0.002016632352024317, + "rewards/rejected": 0.010789604857563972, + "step": 40 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5443732738494873, + "eval_logits/rejected": 1.575553297996521, + "eval_logps/chosen": -153.18209838867188, + "eval_logps/rejected": -147.76028442382812, + "eval_loss": 0.6929393410682678, + "eval_rewards/accuracies": 0.4860000014305115, + "eval_rewards/chosen": 0.014704804867506027, + "eval_rewards/margins": 0.00044921261724084616, + "eval_rewards/rejected": 0.014255593530833721, + "eval_runtime": 91.2058, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 4.834971904754639, + "learning_rate": 8.933333333333333e-07, + "logits/chosen": 1.5464718341827393, + "logits/rejected": 1.7828128337860107, + "logps/chosen": -161.72267150878906, + "logps/rejected": -148.73696899414062, + "loss": 0.6917208194732666, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.021360795944929123, + "rewards/margins": 0.002904644003137946, + "rewards/rejected": 0.018456149846315384, + "step": 50 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.5483616590499878, + "eval_logits/rejected": 1.579562783241272, + "eval_logps/chosen": -153.08152770996094, + "eval_logps/rejected": -147.66171264648438, + "eval_loss": 0.6928565502166748, + "eval_rewards/accuracies": 0.5, + "eval_rewards/chosen": 0.024760283529758453, + "eval_rewards/margins": 0.0006479774019680917, + "eval_rewards/rejected": 0.024112308397889137, + "eval_runtime": 91.2661, + "eval_samples_per_second": 5.478, + "eval_steps_per_second": 2.739, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 4.839973449707031, + "learning_rate": 8.488888888888888e-07, + "logits/chosen": 1.6227805614471436, + "logits/rejected": 1.7581478357315063, + "logps/chosen": -149.3035430908203, + "logps/rejected": -172.68246459960938, + "loss": 0.6937461853027344, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.02874893881380558, + "rewards/margins": -0.001109059201553464, + "rewards/rejected": 0.029857998713850975, + "step": 60 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.5483386516571045, + "eval_logits/rejected": 1.5794395208358765, + "eval_logps/chosen": -153.0557098388672, + "eval_logps/rejected": -147.6452178955078, + "eval_loss": 0.6923965811729431, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.02734413929283619, + "eval_rewards/margins": 0.0015814845683053136, + "eval_rewards/rejected": 0.02576265297830105, + "eval_runtime": 91.2114, + "eval_samples_per_second": 5.482, + "eval_steps_per_second": 2.741, + "step": 60 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_1k/lora/checkpoint-60/training_args.bin b/v5/DPO/DPO_1k/lora/checkpoint-60/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3e8b84f5a426c49e45b1af860330babc4d518a29 --- /dev/null +++ b/v5/DPO/DPO_1k/lora/checkpoint-60/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab57d46930a716685750bae4e83770b275a25633ed865bb4a8a708a9b1519f5 +size 6161 diff --git a/v5/DPO/DPO_5k/DPO_5k/README.md b/v5/DPO/DPO_5k/DPO_5k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_5k/DPO_5k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_5k/DPO_5k/adapter_config.json b/v5/DPO/DPO_5k/DPO_5k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11512c97a55a2d441704a9e11460444b5019509a --- /dev/null +++ b/v5/DPO/DPO_5k/DPO_5k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "o_proj", + "up_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_5k/DPO_5k/adapter_model.safetensors b/v5/DPO/DPO_5k/DPO_5k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23653a35bcc9bc5cd525755e201dccde768c178a --- /dev/null +++ b/v5/DPO/DPO_5k/DPO_5k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d1766c6c831e4681b14adf5735152cfcbab91d1a5b7ea384c85a5e149eb6ea7 +size 180385008 diff --git a/v5/DPO/DPO_5k/MDPO_5k/chat_template.jinja b/v5/DPO/DPO_5k/MDPO_5k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_5k/MDPO_5k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_5k/MDPO_5k/config.json b/v5/DPO/DPO_5k/MDPO_5k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..269c2ffa2c365f594cb5e44218192c94b419a0cb --- /dev/null +++ b/v5/DPO/DPO_5k/MDPO_5k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/DPO/DPO_5k/MDPO_5k/generation_config.json b/v5/DPO/DPO_5k/MDPO_5k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c9c2224cd391437f7236b3f36305dd39a63ab0a --- /dev/null +++ b/v5/DPO/DPO_5k/MDPO_5k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.0.0" +} diff --git a/v5/DPO/DPO_5k/MDPO_5k/model.safetensors b/v5/DPO/DPO_5k/MDPO_5k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ec3aecf4fbb1fc5610f2e0506e2f538dad012e32 --- /dev/null +++ b/v5/DPO/DPO_5k/MDPO_5k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205030395b2e2ab21f84976d15e0c3713dd86fe884f0fdb046e92a0c5cf6ceaf +size 2471645464 diff --git a/v5/DPO/DPO_5k/MDPO_5k/tokenizer.json b/v5/DPO/DPO_5k/MDPO_5k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_5k/MDPO_5k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_5k/MDPO_5k/tokenizer_config.json b/v5/DPO/DPO_5k/MDPO_5k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_5k/MDPO_5k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_5k/lora/README.md b/v5/DPO/DPO_5k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b7624946b6d2a419468e2b62306de9407746ee2b --- /dev/null +++ b/v5/DPO/DPO_5k/lora/README.md @@ -0,0 +1,69 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- dpo +- trl +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/eeyi5nal) + + +This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290). + +### Framework versions + +- TRL: 0.27.2 +- Transformers: 5.0.0 +- Pytorch: 2.8.0+cu128 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite DPO as: + +```bibtex +@inproceedings{rafailov2023direct, + title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}}, + author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn}, + year = 2023, + booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023}, + url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html}, + editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine}, +} +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/README.md b/v5/DPO/DPO_5k/lora/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/adapter_config.json b/v5/DPO/DPO_5k/lora/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11512c97a55a2d441704a9e11460444b5019509a --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "o_proj", + "up_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/adapter_model.safetensors b/v5/DPO/DPO_5k/lora/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..08fe26327f11746e1049b4c45b42e1fea0633783 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fe590a083489903dab61203710473505fb4c13598b7d99c40d5fe0b17eda305 +size 180385008 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/chat_template.jinja b/v5/DPO/DPO_5k/lora/checkpoint-1200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/optimizer.pt b/v5/DPO/DPO_5k/lora/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef1eb4a4d068be8f49133ff47ab7248d81bff9f3 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7254ef547ba3e7ef0dd4ecdbae6aa90d1c58ac90c4dc626011b7858ce0a4342c +size 360902475 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/rng_state.pth b/v5/DPO/DPO_5k/lora/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..13e11a54e352d8a7149df1f88c1b023ee9973959 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7affab63b271ed0f59a5b53056fc0a581226a41dcdf2fc2b80b669e7c3cf714 +size 14645 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/scaler.pt b/v5/DPO/DPO_5k/lora/checkpoint-1200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ddf3c67a64291c7187398acabef416a6a749c2fa --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f91c7dd1fae1ee6ff1b2dab3bd764f4d736f55614ac14e9fbc0db3b8cce0005c +size 1383 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/scheduler.pt b/v5/DPO/DPO_5k/lora/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba746ef9944773e775356a0d566bc70a78f4d9e5 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ab944ae2bef569be1ce08fdec0a24028fb2810c66879e124797940deaf67de0 +size 1465 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/tokenizer.json b/v5/DPO/DPO_5k/lora/checkpoint-1200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/tokenizer_config.json b/v5/DPO/DPO_5k/lora/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/trainer_state.json b/v5/DPO/DPO_5k/lora/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..23112d7161a9285979ceb0b06467761050ed6058 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/trainer_state.json @@ -0,0 +1,2218 @@ +{ + "best_global_step": 300, + "best_metric": 0.5440000295639038, + "best_model_checkpoint": "output/lora/checkpoint-300", + "epoch": 1.92, + "eval_steps": 50, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 4.368600368499756, + "learning_rate": 7.2e-08, + "logits/chosen": 1.684491515159607, + "logits/rejected": 1.6000019311904907, + "logps/chosen": -145.20462036132812, + "logps/rejected": -150.64056396484375, + "loss": 0.6933496475219727, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.00038564440910704434, + "rewards/margins": -0.0003992128185927868, + "rewards/rejected": 1.356836855848087e-05, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 5.3214850425720215, + "learning_rate": 1.5199999999999998e-07, + "logits/chosen": 1.3753983974456787, + "logits/rejected": 1.4558300971984863, + "logps/chosen": -120.09315490722656, + "logps/rejected": -133.41905212402344, + "loss": 0.6935864925384522, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.0006091356044635177, + "rewards/margins": -0.0008693885756656528, + "rewards/rejected": 0.0002602529712021351, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 4.661340713500977, + "learning_rate": 2.32e-07, + "logits/chosen": 1.5848007202148438, + "logits/rejected": 1.744507074356079, + "logps/chosen": -161.58753967285156, + "logps/rejected": -178.603271484375, + "loss": 0.6937230110168457, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.002343270927667618, + "rewards/margins": -0.001140077132731676, + "rewards/rejected": -0.0012031936785206199, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 5.193538188934326, + "learning_rate": 3.12e-07, + "logits/chosen": 1.6050277948379517, + "logits/rejected": 1.534880518913269, + "logps/chosen": -151.981689453125, + "logps/rejected": -150.1208038330078, + "loss": 0.6932186126708985, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0018907630583271384, + "rewards/margins": -0.00012836574751418084, + "rewards/rejected": -0.0017623973544687033, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 4.683797359466553, + "learning_rate": 3.92e-07, + "logits/chosen": 1.734514594078064, + "logits/rejected": 1.7892601490020752, + "logps/chosen": -169.11004638671875, + "logps/rejected": -156.22427368164062, + "loss": 0.692991828918457, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.00029434924363158643, + "rewards/margins": 0.0003237081109546125, + "rewards/rejected": -2.9358878236962482e-05, + "step": 50 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.5363190174102783, + "eval_logits/rejected": 1.567551612854004, + "eval_logps/chosen": -153.31736755371094, + "eval_logps/rejected": -147.88914489746094, + "eval_loss": 0.6932514905929565, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.0011768279364332557, + "eval_rewards/margins": -0.00019350247748661786, + "eval_rewards/rejected": 0.0013703303411602974, + "eval_runtime": 91.1759, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 4.66879940032959, + "learning_rate": 4.7199999999999994e-07, + "logits/chosen": 1.8444726467132568, + "logits/rejected": 1.8203474283218384, + "logps/chosen": -158.23243713378906, + "logps/rejected": -149.02316284179688, + "loss": 0.6930979251861572, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.002846779767423868, + "rewards/margins": 0.00011160141002619639, + "rewards/rejected": 0.0027351784519851208, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 3.876270294189453, + "learning_rate": 5.520000000000001e-07, + "logits/chosen": 1.8535444736480713, + "logits/rejected": 1.7816137075424194, + "logps/chosen": -157.98268127441406, + "logps/rejected": -164.66925048828125, + "loss": 0.6924784183502197, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.006634838879108429, + "rewards/margins": 0.0013595198979601264, + "rewards/rejected": 0.005275317933410406, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 4.669241428375244, + "learning_rate": 6.319999999999999e-07, + "logits/chosen": 1.5538957118988037, + "logits/rejected": 1.5381535291671753, + "logps/chosen": -145.74713134765625, + "logps/rejected": -137.40780639648438, + "loss": 0.6929487705230712, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009571035392582417, + "rewards/margins": 0.000421993900090456, + "rewards/rejected": 0.009149041026830673, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 4.3166022300720215, + "learning_rate": 7.119999999999999e-07, + "logits/chosen": 1.5454356670379639, + "logits/rejected": 1.5363503694534302, + "logps/chosen": -162.4505157470703, + "logps/rejected": -159.41574096679688, + "loss": 0.6919499397277832, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.011575761251151562, + "rewards/margins": 0.0024396872613579035, + "rewards/rejected": 0.009136073291301727, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 4.640413284301758, + "learning_rate": 7.92e-07, + "logits/chosen": 1.6131670475006104, + "logits/rejected": 1.673753023147583, + "logps/chosen": -142.8424530029297, + "logps/rejected": -165.93234252929688, + "loss": 0.6919528007507324, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.014428429305553436, + "rewards/margins": 0.0024503350723534822, + "rewards/rejected": 0.011978095397353172, + "step": 100 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.5355972051620483, + "eval_logits/rejected": 1.5665204524993896, + "eval_logps/chosen": -153.1815948486328, + "eval_logps/rejected": -147.7570037841797, + "eval_loss": 0.6931047439575195, + "eval_rewards/accuracies": 0.46000000834465027, + "eval_rewards/chosen": 0.014755296520888805, + "eval_rewards/margins": 0.00017206119082402438, + "eval_rewards/rejected": 0.01458323560655117, + "eval_runtime": 91.1022, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 2.744, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 4.072097301483154, + "learning_rate": 8.72e-07, + "logits/chosen": 1.5775041580200195, + "logits/rejected": 1.6383779048919678, + "logps/chosen": -143.57952880859375, + "logps/rejected": -137.651611328125, + "loss": 0.6926439762115478, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.01751135289669037, + "rewards/margins": 0.0010686519090086222, + "rewards/rejected": 0.01644269935786724, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 4.417011260986328, + "learning_rate": 9.52e-07, + "logits/chosen": 1.5828511714935303, + "logits/rejected": 1.6531331539154053, + "logps/chosen": -142.53514099121094, + "logps/rejected": -142.88226318359375, + "loss": 0.6947136402130127, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.012874701991677284, + "rewards/margins": -0.0030476213432848454, + "rewards/rejected": 0.015922321006655693, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 5.043814182281494, + "learning_rate": 9.964444444444445e-07, + "logits/chosen": 1.7005653381347656, + "logits/rejected": 1.8352782726287842, + "logps/chosen": -155.3563995361328, + "logps/rejected": -151.04742431640625, + "loss": 0.693049955368042, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004944052547216415, + "rewards/margins": 0.000274717720458284, + "rewards/rejected": 0.004669335670769215, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 4.280579566955566, + "learning_rate": 9.875555555555555e-07, + "logits/chosen": 1.4931142330169678, + "logits/rejected": 1.5797803401947021, + "logps/chosen": -134.4127655029297, + "logps/rejected": -132.8173065185547, + "loss": 0.6921967983245849, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -4.65536504634656e-05, + "rewards/margins": 0.001995303900912404, + "rewards/rejected": -0.002041857223957777, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 3.553212881088257, + "learning_rate": 9.786666666666666e-07, + "logits/chosen": 1.7535524368286133, + "logits/rejected": 1.7318353652954102, + "logps/chosen": -141.84011840820312, + "logps/rejected": -140.5338592529297, + "loss": 0.6925621032714844, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.006464059464633465, + "rewards/margins": 0.0012660837965086102, + "rewards/rejected": 0.005197975784540176, + "step": 150 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.527831792831421, + "eval_logits/rejected": 1.55862557888031, + "eval_logps/chosen": -153.29705810546875, + "eval_logps/rejected": -147.86842346191406, + "eval_loss": 0.6933275461196899, + "eval_rewards/accuracies": 0.49399998784065247, + "eval_rewards/chosen": 0.003206671681255102, + "eval_rewards/margins": -0.000235457657254301, + "eval_rewards/rejected": 0.0034421291202306747, + "eval_runtime": 91.0798, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 2.745, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 5.0327606201171875, + "learning_rate": 9.697777777777776e-07, + "logits/chosen": 1.7338924407958984, + "logits/rejected": 1.6693298816680908, + "logps/chosen": -163.9834747314453, + "logps/rejected": -147.54122924804688, + "loss": 0.6920580387115478, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.010300886817276478, + "rewards/margins": 0.0022598577197641134, + "rewards/rejected": 0.00804102886468172, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 4.203429222106934, + "learning_rate": 9.608888888888888e-07, + "logits/chosen": 1.548438310623169, + "logits/rejected": 1.608687400817871, + "logps/chosen": -140.65548706054688, + "logps/rejected": -124.45481872558594, + "loss": 0.693133544921875, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.011312992312014103, + "rewards/margins": 0.00010961303632939234, + "rewards/rejected": 0.011203380301594734, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 3.8275039196014404, + "learning_rate": 9.52e-07, + "logits/chosen": 1.5688340663909912, + "logits/rejected": 1.5681618452072144, + "logps/chosen": -153.26898193359375, + "logps/rejected": -154.38824462890625, + "loss": 0.6913642883300781, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.018832791596651077, + "rewards/margins": 0.0036639694590121508, + "rewards/rejected": 0.015168821439146996, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 4.507416725158691, + "learning_rate": 9.431111111111111e-07, + "logits/chosen": 1.6990807056427002, + "logits/rejected": 1.646045446395874, + "logps/chosen": -156.6995849609375, + "logps/rejected": -155.2141876220703, + "loss": 0.6934223651885987, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.027501707896590233, + "rewards/margins": -0.00040446725324727595, + "rewards/rejected": 0.027906173840165138, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 4.723247051239014, + "learning_rate": 9.342222222222221e-07, + "logits/chosen": 1.4298136234283447, + "logits/rejected": 1.6043508052825928, + "logps/chosen": -126.81380462646484, + "logps/rejected": -133.35108947753906, + "loss": 0.688706636428833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03327309712767601, + "rewards/margins": 0.009079854004085064, + "rewards/rejected": 0.024193240329623222, + "step": 200 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5382241010665894, + "eval_logits/rejected": 1.5689103603363037, + "eval_logps/chosen": -152.9728240966797, + "eval_logps/rejected": -147.55966186523438, + "eval_loss": 0.6926390528678894, + "eval_rewards/accuracies": 0.4880000054836273, + "eval_rewards/chosen": 0.03563162684440613, + "eval_rewards/margins": 0.0013116379268467426, + "eval_rewards/rejected": 0.03431998938322067, + "eval_runtime": 91.1085, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 2.744, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 5.396594047546387, + "learning_rate": 9.253333333333333e-07, + "logits/chosen": 1.7294985055923462, + "logits/rejected": 1.6115707159042358, + "logps/chosen": -150.76341247558594, + "logps/rejected": -126.10733795166016, + "loss": 0.6902324199676514, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.043607715517282486, + "rewards/margins": 0.0060890489257872105, + "rewards/rejected": 0.037518661469221115, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 4.06983757019043, + "learning_rate": 9.164444444444443e-07, + "logits/chosen": 1.7748816013336182, + "logits/rejected": 1.7431707382202148, + "logps/chosen": -163.86878967285156, + "logps/rejected": -142.68081665039062, + "loss": 0.6923216342926025, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.057915735989809036, + "rewards/margins": 0.0020550203043967485, + "rewards/rejected": 0.055860716849565506, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 3.652050256729126, + "learning_rate": 9.075555555555555e-07, + "logits/chosen": 1.6430679559707642, + "logits/rejected": 1.6094305515289307, + "logps/chosen": -148.5363006591797, + "logps/rejected": -153.50338745117188, + "loss": 0.6937844753265381, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05856321379542351, + "rewards/margins": -0.0007888395339250565, + "rewards/rejected": 0.05935205891728401, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 3.4950904846191406, + "learning_rate": 8.986666666666666e-07, + "logits/chosen": 1.6158307790756226, + "logits/rejected": 1.7254810333251953, + "logps/chosen": -157.8291473388672, + "logps/rejected": -164.51071166992188, + "loss": 0.6928259372711182, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06535087525844574, + "rewards/margins": 0.001135659171268344, + "rewards/rejected": 0.06421522051095963, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 2.940080165863037, + "learning_rate": 8.897777777777777e-07, + "logits/chosen": 1.5965789556503296, + "logits/rejected": 1.649510145187378, + "logps/chosen": -137.41818237304688, + "logps/rejected": -150.51309204101562, + "loss": 0.6896316051483155, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0644104927778244, + "rewards/margins": 0.007390809245407581, + "rewards/rejected": 0.057019688189029694, + "step": 250 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.55246102809906, + "eval_logits/rejected": 1.5829427242279053, + "eval_logps/chosen": -152.7037811279297, + "eval_logps/rejected": -147.31491088867188, + "eval_loss": 0.6915441751480103, + "eval_rewards/accuracies": 0.5199999809265137, + "eval_rewards/chosen": 0.06253667175769806, + "eval_rewards/margins": 0.0037424375768750906, + "eval_rewards/rejected": 0.058794230222702026, + "eval_runtime": 90.9689, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 2.748, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 4.24291467666626, + "learning_rate": 8.808888888888889e-07, + "logits/chosen": 1.640729546546936, + "logits/rejected": 1.6604511737823486, + "logps/chosen": -144.95303344726562, + "logps/rejected": -149.94384765625, + "loss": 0.691684627532959, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07415647059679031, + "rewards/margins": 0.0033445146400481462, + "rewards/rejected": 0.07081194967031479, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 4.855024814605713, + "learning_rate": 8.72e-07, + "logits/chosen": 1.6059837341308594, + "logits/rejected": 1.7337257862091064, + "logps/chosen": -157.46888732910156, + "logps/rejected": -150.69549560546875, + "loss": 0.6913710117340088, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0714256763458252, + "rewards/margins": 0.00412519508972764, + "rewards/rejected": 0.06730048358440399, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 3.2214746475219727, + "learning_rate": 8.631111111111111e-07, + "logits/chosen": 1.7127368450164795, + "logits/rejected": 1.7540982961654663, + "logps/chosen": -131.8098602294922, + "logps/rejected": -150.68472290039062, + "loss": 0.6942379474639893, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.07171601057052612, + "rewards/margins": -0.0016212888294830918, + "rewards/rejected": 0.07333729416131973, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 4.169992446899414, + "learning_rate": 8.542222222222222e-07, + "logits/chosen": 1.6667410135269165, + "logits/rejected": 1.6612989902496338, + "logps/chosen": -154.6461639404297, + "logps/rejected": -148.51638793945312, + "loss": 0.6932468891143799, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06825742870569229, + "rewards/margins": 0.0004094833566341549, + "rewards/rejected": 0.06784794479608536, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 3.3903329372406006, + "learning_rate": 8.453333333333334e-07, + "logits/chosen": 1.5482908487319946, + "logits/rejected": 1.5708004236221313, + "logps/chosen": -138.28341674804688, + "logps/rejected": -140.89016723632812, + "loss": 0.6920734405517578, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05976419895887375, + "rewards/margins": 0.002482531126588583, + "rewards/rejected": 0.057281672954559326, + "step": 300 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.55509614944458, + "eval_logits/rejected": 1.5857810974121094, + "eval_logps/chosen": -152.74725341796875, + "eval_logps/rejected": -147.36598205566406, + "eval_loss": 0.6911302208900452, + "eval_rewards/accuracies": 0.5440000295639038, + "eval_rewards/chosen": 0.05818922445178032, + "eval_rewards/margins": 0.004501740448176861, + "eval_rewards/rejected": 0.053687483072280884, + "eval_runtime": 90.9977, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 2.747, + "step": 300 + }, + { + "epoch": 0.496, + "grad_norm": 4.032289981842041, + "learning_rate": 8.364444444444443e-07, + "logits/chosen": 1.7680352926254272, + "logits/rejected": 1.8532991409301758, + "logps/chosen": -159.21726989746094, + "logps/rejected": -138.13540649414062, + "loss": 0.6892982959747315, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.06391827762126923, + "rewards/margins": 0.00805785320699215, + "rewards/rejected": 0.05586041882634163, + "step": 310 + }, + { + "epoch": 0.512, + "grad_norm": 4.574601650238037, + "learning_rate": 8.275555555555555e-07, + "logits/chosen": 1.785130500793457, + "logits/rejected": 1.6803003549575806, + "logps/chosen": -150.91397094726562, + "logps/rejected": -142.10299682617188, + "loss": 0.6916579723358154, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.05742686986923218, + "rewards/margins": 0.0033137183636426926, + "rewards/rejected": 0.054113149642944336, + "step": 320 + }, + { + "epoch": 0.528, + "grad_norm": 4.799781322479248, + "learning_rate": 8.186666666666666e-07, + "logits/chosen": 1.5050979852676392, + "logits/rejected": 1.5341730117797852, + "logps/chosen": -136.2999725341797, + "logps/rejected": -155.4806365966797, + "loss": 0.6880038261413575, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.06376932561397552, + "rewards/margins": 0.010802140459418297, + "rewards/rejected": 0.05296717956662178, + "step": 330 + }, + { + "epoch": 0.544, + "grad_norm": 3.9436914920806885, + "learning_rate": 8.097777777777778e-07, + "logits/chosen": 1.5927613973617554, + "logits/rejected": 1.6387131214141846, + "logps/chosen": -142.1040496826172, + "logps/rejected": -119.8014907836914, + "loss": 0.6890507698059082, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.06471486389636993, + "rewards/margins": 0.008592360652983189, + "rewards/rejected": 0.05612250417470932, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 4.886368751525879, + "learning_rate": 8.008888888888888e-07, + "logits/chosen": 1.6515939235687256, + "logits/rejected": 1.43025541305542, + "logps/chosen": -137.16024780273438, + "logps/rejected": -133.55267333984375, + "loss": 0.6931482791900635, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06017603352665901, + "rewards/margins": 0.0006265616975724697, + "rewards/rejected": 0.059549469500780106, + "step": 350 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.5533874034881592, + "eval_logits/rejected": 1.5838865041732788, + "eval_logps/chosen": -152.75637817382812, + "eval_logps/rejected": -147.3810577392578, + "eval_loss": 0.6908727884292603, + "eval_rewards/accuracies": 0.5220000147819519, + "eval_rewards/chosen": 0.05727628618478775, + "eval_rewards/margins": 0.005099303554743528, + "eval_rewards/rejected": 0.05217698588967323, + "eval_runtime": 91.1217, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 2.744, + "step": 350 + }, + { + "epoch": 0.576, + "grad_norm": 5.395493030548096, + "learning_rate": 7.92e-07, + "logits/chosen": 1.6785246133804321, + "logits/rejected": 1.7938287258148193, + "logps/chosen": -162.50350952148438, + "logps/rejected": -169.7019500732422, + "loss": 0.6897931098937988, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06360206753015518, + "rewards/margins": 0.007130332291126251, + "rewards/rejected": 0.05647173523902893, + "step": 360 + }, + { + "epoch": 0.592, + "grad_norm": 4.3724541664123535, + "learning_rate": 7.831111111111111e-07, + "logits/chosen": 1.7033554315567017, + "logits/rejected": 1.7527239322662354, + "logps/chosen": -174.69869995117188, + "logps/rejected": -175.63180541992188, + "loss": 0.6901405334472657, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.059598714113235474, + "rewards/margins": 0.006548056844621897, + "rewards/rejected": 0.053050655871629715, + "step": 370 + }, + { + "epoch": 0.608, + "grad_norm": 4.353290557861328, + "learning_rate": 7.742222222222222e-07, + "logits/chosen": 1.6846504211425781, + "logits/rejected": 1.7877483367919922, + "logps/chosen": -136.30889892578125, + "logps/rejected": -159.4036407470703, + "loss": 0.690678882598877, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.050548993051052094, + "rewards/margins": 0.005496628116816282, + "rewards/rejected": 0.04505236819386482, + "step": 380 + }, + { + "epoch": 0.624, + "grad_norm": 4.879935264587402, + "learning_rate": 7.653333333333333e-07, + "logits/chosen": 1.4606497287750244, + "logits/rejected": 1.7344859838485718, + "logps/chosen": -130.42169189453125, + "logps/rejected": -166.63027954101562, + "loss": 0.6936720848083496, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.046336062252521515, + "rewards/margins": -0.0004482082149479538, + "rewards/rejected": 0.04678427055478096, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 4.271523475646973, + "learning_rate": 7.564444444444445e-07, + "logits/chosen": 1.6960214376449585, + "logits/rejected": 1.6826406717300415, + "logps/chosen": -149.68972778320312, + "logps/rejected": -160.5981903076172, + "loss": 0.6915639400482178, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.037508051842451096, + "rewards/margins": 0.0035654257517307997, + "rewards/rejected": 0.033942628651857376, + "step": 400 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.5374687910079956, + "eval_logits/rejected": 1.567594051361084, + "eval_logps/chosen": -152.97109985351562, + "eval_logps/rejected": -147.59535217285156, + "eval_loss": 0.6909184455871582, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.035805922001600266, + "eval_rewards/margins": 0.0050564357079565525, + "eval_rewards/rejected": 0.03074948862195015, + "eval_runtime": 91.0668, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 2.745, + "step": 400 + }, + { + "epoch": 0.656, + "grad_norm": 5.176839828491211, + "learning_rate": 7.475555555555555e-07, + "logits/chosen": 1.5886105298995972, + "logits/rejected": 1.6644985675811768, + "logps/chosen": -156.63021850585938, + "logps/rejected": -145.19676208496094, + "loss": 0.6877344131469727, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.03953630477190018, + "rewards/margins": 0.011391694657504559, + "rewards/rejected": 0.028144609183073044, + "step": 410 + }, + { + "epoch": 0.672, + "grad_norm": 4.446777820587158, + "learning_rate": 7.386666666666666e-07, + "logits/chosen": 1.625689148902893, + "logits/rejected": 1.6569058895111084, + "logps/chosen": -131.517578125, + "logps/rejected": -131.78964233398438, + "loss": 0.6866260528564453, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04599715396761894, + "rewards/margins": 0.013597942888736725, + "rewards/rejected": 0.03239920735359192, + "step": 420 + }, + { + "epoch": 0.688, + "grad_norm": 4.535679817199707, + "learning_rate": 7.297777777777777e-07, + "logits/chosen": 1.734668493270874, + "logits/rejected": 1.7557735443115234, + "logps/chosen": -139.6569366455078, + "logps/rejected": -140.00808715820312, + "loss": 0.6939756393432617, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.044145092368125916, + "rewards/margins": -0.000974461785517633, + "rewards/rejected": 0.045119550079107285, + "step": 430 + }, + { + "epoch": 0.704, + "grad_norm": 3.674733877182007, + "learning_rate": 7.208888888888889e-07, + "logits/chosen": 1.7089004516601562, + "logits/rejected": 1.7358248233795166, + "logps/chosen": -155.2206573486328, + "logps/rejected": -144.6761932373047, + "loss": 0.6933175563812256, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.043650850653648376, + "rewards/margins": 0.00010928641859209165, + "rewards/rejected": 0.0435415655374527, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 5.13329553604126, + "learning_rate": 7.119999999999999e-07, + "logits/chosen": 1.5127164125442505, + "logits/rejected": 1.6689109802246094, + "logps/chosen": -150.4583282470703, + "logps/rejected": -147.11000061035156, + "loss": 0.6905129909515381, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.030837291851639748, + "rewards/margins": 0.006133320741355419, + "rewards/rejected": 0.024703968316316605, + "step": 450 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.5396348237991333, + "eval_logits/rejected": 1.5696207284927368, + "eval_logps/chosen": -152.88949584960938, + "eval_logps/rejected": -147.522216796875, + "eval_loss": 0.6905900239944458, + "eval_rewards/accuracies": 0.515999972820282, + "eval_rewards/chosen": 0.04396428167819977, + "eval_rewards/margins": 0.00590139627456665, + "eval_rewards/rejected": 0.03806288540363312, + "eval_runtime": 91.0404, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 2.746, + "step": 450 + }, + { + "epoch": 0.736, + "grad_norm": 5.364885330200195, + "learning_rate": 7.031111111111111e-07, + "logits/chosen": 1.5854089260101318, + "logits/rejected": 1.6501134634017944, + "logps/chosen": -137.01268005371094, + "logps/rejected": -160.6153106689453, + "loss": 0.6898775577545166, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.03512059897184372, + "rewards/margins": 0.0070413872599601746, + "rewards/rejected": 0.028079207986593246, + "step": 460 + }, + { + "epoch": 0.752, + "grad_norm": 4.035534381866455, + "learning_rate": 6.942222222222222e-07, + "logits/chosen": 1.4868170022964478, + "logits/rejected": 1.6255724430084229, + "logps/chosen": -142.16026306152344, + "logps/rejected": -128.42076110839844, + "loss": 0.6882358074188233, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0400058776140213, + "rewards/margins": 0.010345013812184334, + "rewards/rejected": 0.029660871252417564, + "step": 470 + }, + { + "epoch": 0.768, + "grad_norm": 4.675441265106201, + "learning_rate": 6.853333333333333e-07, + "logits/chosen": 1.7639102935791016, + "logits/rejected": 1.8540350198745728, + "logps/chosen": -170.15591430664062, + "logps/rejected": -163.63043212890625, + "loss": 0.6906004428863526, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04748428240418434, + "rewards/margins": 0.005763492546975613, + "rewards/rejected": 0.041720788925886154, + "step": 480 + }, + { + "epoch": 0.784, + "grad_norm": 4.720729827880859, + "learning_rate": 6.764444444444444e-07, + "logits/chosen": 1.7105178833007812, + "logits/rejected": 1.640414834022522, + "logps/chosen": -165.30238342285156, + "logps/rejected": -163.19874572753906, + "loss": 0.6960553169250489, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.029804859310388565, + "rewards/margins": -0.004881127271801233, + "rewards/rejected": 0.03468598425388336, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 4.234210968017578, + "learning_rate": 6.675555555555556e-07, + "logits/chosen": 1.6047271490097046, + "logits/rejected": 1.7314249277114868, + "logps/chosen": -136.36837768554688, + "logps/rejected": -139.07589721679688, + "loss": 0.6871551513671875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03045068122446537, + "rewards/margins": 0.012581512331962585, + "rewards/rejected": 0.017869170755147934, + "step": 500 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.5329995155334473, + "eval_logits/rejected": 1.562899112701416, + "eval_logps/chosen": -153.01869201660156, + "eval_logps/rejected": -147.65211486816406, + "eval_loss": 0.6905782222747803, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": 0.031044049188494682, + "eval_rewards/margins": 0.005970073863863945, + "eval_rewards/rejected": 0.025073975324630737, + "eval_runtime": 90.9818, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 2.748, + "step": 500 + }, + { + "epoch": 0.816, + "grad_norm": 3.8755218982696533, + "learning_rate": 6.586666666666666e-07, + "logits/chosen": 1.7628934383392334, + "logits/rejected": 1.669203758239746, + "logps/chosen": -165.7239227294922, + "logps/rejected": -151.4439239501953, + "loss": 0.6865688323974609, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04186190292239189, + "rewards/margins": 0.013743969611823559, + "rewards/rejected": 0.028117936104536057, + "step": 510 + }, + { + "epoch": 0.832, + "grad_norm": 4.301093101501465, + "learning_rate": 6.497777777777778e-07, + "logits/chosen": 1.7004272937774658, + "logits/rejected": 1.6940956115722656, + "logps/chosen": -149.85525512695312, + "logps/rejected": -178.81362915039062, + "loss": 0.6878121376037598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.040519606322050095, + "rewards/margins": 0.011822985485196114, + "rewards/rejected": 0.02869662083685398, + "step": 520 + }, + { + "epoch": 0.848, + "grad_norm": 5.140315055847168, + "learning_rate": 6.408888888888888e-07, + "logits/chosen": 1.5849007368087769, + "logits/rejected": 1.6338441371917725, + "logps/chosen": -143.96917724609375, + "logps/rejected": -135.54281616210938, + "loss": 0.684235954284668, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.036442749202251434, + "rewards/margins": 0.0188708808273077, + "rewards/rejected": 0.017571870237588882, + "step": 530 + }, + { + "epoch": 0.864, + "grad_norm": 4.218395709991455, + "learning_rate": 6.319999999999999e-07, + "logits/chosen": 1.4655095338821411, + "logits/rejected": 1.6130340099334717, + "logps/chosen": -128.4731903076172, + "logps/rejected": -149.0254364013672, + "loss": 0.6945743560791016, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.017684206366539, + "rewards/margins": -0.0021602497436106205, + "rewards/rejected": 0.019844455644488335, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 4.782381057739258, + "learning_rate": 6.23111111111111e-07, + "logits/chosen": 1.7393369674682617, + "logits/rejected": 1.806014060974121, + "logps/chosen": -174.6748046875, + "logps/rejected": -184.66175842285156, + "loss": 0.6872735977172851, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.03625861555337906, + "rewards/margins": 0.012612670660018921, + "rewards/rejected": 0.02364594303071499, + "step": 550 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.523414134979248, + "eval_logits/rejected": 1.553091287612915, + "eval_logps/chosen": -153.1641387939453, + "eval_logps/rejected": -147.79656982421875, + "eval_loss": 0.6907312273979187, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.016499562188982964, + "eval_rewards/margins": 0.005870947614312172, + "eval_rewards/rejected": 0.010628614574670792, + "eval_runtime": 91.1697, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 550 + }, + { + "epoch": 0.896, + "grad_norm": 4.337022304534912, + "learning_rate": 6.142222222222222e-07, + "logits/chosen": 1.5208700895309448, + "logits/rejected": 1.433935523033142, + "logps/chosen": -145.02027893066406, + "logps/rejected": -151.54751586914062, + "loss": 0.6930646419525146, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020919274538755417, + "rewards/margins": 0.000952291302382946, + "rewards/rejected": 0.019966980442404747, + "step": 560 + }, + { + "epoch": 0.912, + "grad_norm": 4.181249618530273, + "learning_rate": 6.053333333333332e-07, + "logits/chosen": 1.669150710105896, + "logits/rejected": 1.6721159219741821, + "logps/chosen": -149.44937133789062, + "logps/rejected": -136.69595336914062, + "loss": 0.6915061950683594, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.021581020206212997, + "rewards/margins": 0.003878307295963168, + "rewards/rejected": 0.017702709883451462, + "step": 570 + }, + { + "epoch": 0.928, + "grad_norm": 5.015549659729004, + "learning_rate": 5.964444444444444e-07, + "logits/chosen": 1.7415902614593506, + "logits/rejected": 1.7086597681045532, + "logps/chosen": -167.40968322753906, + "logps/rejected": -136.7971649169922, + "loss": 0.6886765480041503, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.015860076993703842, + "rewards/margins": 0.009648093953728676, + "rewards/rejected": 0.006211983505636454, + "step": 580 + }, + { + "epoch": 0.944, + "grad_norm": 5.115492343902588, + "learning_rate": 5.875555555555556e-07, + "logits/chosen": 1.6168180704116821, + "logits/rejected": 1.549253225326538, + "logps/chosen": -176.7733612060547, + "logps/rejected": -155.52088928222656, + "loss": 0.6812242984771728, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02709801122546196, + "rewards/margins": 0.024875756353139877, + "rewards/rejected": 0.002222254639491439, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 4.916851043701172, + "learning_rate": 5.786666666666667e-07, + "logits/chosen": 1.6100847721099854, + "logits/rejected": 1.6456801891326904, + "logps/chosen": -162.9514923095703, + "logps/rejected": -154.86952209472656, + "loss": 0.6924587726593018, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.022737273946404457, + "rewards/margins": 0.002692488022148609, + "rewards/rejected": 0.020044784992933273, + "step": 600 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.5220483541488647, + "eval_logits/rejected": 1.5517752170562744, + "eval_logps/chosen": -153.1302490234375, + "eval_logps/rejected": -147.76690673828125, + "eval_loss": 0.6905914545059204, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.019888723269104958, + "eval_rewards/margins": 0.006294028367847204, + "eval_rewards/rejected": 0.01359469536691904, + "eval_runtime": 90.9556, + "eval_samples_per_second": 5.497, + "eval_steps_per_second": 2.749, + "step": 600 + }, + { + "epoch": 0.976, + "grad_norm": 4.3061909675598145, + "learning_rate": 5.697777777777778e-07, + "logits/chosen": 1.5751426219940186, + "logits/rejected": 1.5668468475341797, + "logps/chosen": -122.49625396728516, + "logps/rejected": -121.03358459472656, + "loss": 0.6872867584228516, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.041438184678554535, + "rewards/margins": 0.012499396689236164, + "rewards/rejected": 0.028938788920640945, + "step": 610 + }, + { + "epoch": 0.992, + "grad_norm": 5.183097839355469, + "learning_rate": 5.608888888888889e-07, + "logits/chosen": 1.279733419418335, + "logits/rejected": 1.3740136623382568, + "logps/chosen": -124.96076965332031, + "logps/rejected": -138.65638732910156, + "loss": 0.6843455314636231, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.021548787131905556, + "rewards/margins": 0.019731000065803528, + "rewards/rejected": 0.0018177882302552462, + "step": 620 + }, + { + "epoch": 1.008, + "grad_norm": 3.398516893386841, + "learning_rate": 5.520000000000001e-07, + "logits/chosen": 1.5581461191177368, + "logits/rejected": 1.4368339776992798, + "logps/chosen": -152.03562927246094, + "logps/rejected": -141.5266571044922, + "loss": 0.6839772701263428, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.03745534271001816, + "rewards/margins": 0.019791865721344948, + "rewards/rejected": 0.01766347326338291, + "step": 630 + }, + { + "epoch": 1.024, + "grad_norm": 4.76785135269165, + "learning_rate": 5.43111111111111e-07, + "logits/chosen": 1.5052144527435303, + "logits/rejected": 1.5997518301010132, + "logps/chosen": -146.1188507080078, + "logps/rejected": -150.7531280517578, + "loss": 0.679119062423706, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04266165569424629, + "rewards/margins": 0.02940245531499386, + "rewards/rejected": 0.013259200379252434, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 5.084848403930664, + "learning_rate": 5.342222222222222e-07, + "logits/chosen": 1.6985509395599365, + "logits/rejected": 1.782636284828186, + "logps/chosen": -147.97036743164062, + "logps/rejected": -160.01596069335938, + "loss": 0.680629301071167, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.04756547138094902, + "rewards/margins": 0.026726100593805313, + "rewards/rejected": 0.020839370787143707, + "step": 650 + }, + { + "epoch": 1.04, + "eval_logits/chosen": 1.5202165842056274, + "eval_logits/rejected": 1.549627661705017, + "eval_logps/chosen": -153.07310485839844, + "eval_logps/rejected": -147.71853637695312, + "eval_loss": 0.6902864575386047, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.025603344663977623, + "eval_rewards/margins": 0.007172676268965006, + "eval_rewards/rejected": 0.018430663272738457, + "eval_runtime": 90.938, + "eval_samples_per_second": 5.498, + "eval_steps_per_second": 2.749, + "step": 650 + }, + { + "epoch": 1.056, + "grad_norm": 4.315349578857422, + "learning_rate": 5.253333333333333e-07, + "logits/chosen": 1.4403737783432007, + "logits/rejected": 1.5927050113677979, + "logps/chosen": -140.49227905273438, + "logps/rejected": -150.4305877685547, + "loss": 0.6848072052001953, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.028764763846993446, + "rewards/margins": 0.01788407936692238, + "rewards/rejected": 0.010880683548748493, + "step": 660 + }, + { + "epoch": 1.072, + "grad_norm": 5.036780834197998, + "learning_rate": 5.164444444444444e-07, + "logits/chosen": 1.5576122999191284, + "logits/rejected": 1.5609056949615479, + "logps/chosen": -152.83335876464844, + "logps/rejected": -153.91134643554688, + "loss": 0.677812910079956, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.042785800993442535, + "rewards/margins": 0.03232557699084282, + "rewards/rejected": 0.010460222139954567, + "step": 670 + }, + { + "epoch": 1.088, + "grad_norm": 4.167594909667969, + "learning_rate": 5.075555555555555e-07, + "logits/chosen": 1.6192195415496826, + "logits/rejected": 1.5309553146362305, + "logps/chosen": -128.08506774902344, + "logps/rejected": -130.7288360595703, + "loss": 0.6876038551330567, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.029848307371139526, + "rewards/margins": 0.012140638194978237, + "rewards/rejected": 0.017707668244838715, + "step": 680 + }, + { + "epoch": 1.104, + "grad_norm": 4.746713638305664, + "learning_rate": 4.986666666666666e-07, + "logits/chosen": 1.7024202346801758, + "logits/rejected": 1.7695732116699219, + "logps/chosen": -175.9063720703125, + "logps/rejected": -174.35122680664062, + "loss": 0.6774589538574218, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05260822921991348, + "rewards/margins": 0.032807059586048126, + "rewards/rejected": 0.019801167771220207, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 5.154026508331299, + "learning_rate": 4.897777777777778e-07, + "logits/chosen": 1.6061038970947266, + "logits/rejected": 1.6249440908432007, + "logps/chosen": -145.95343017578125, + "logps/rejected": -139.83033752441406, + "loss": 0.6705130577087403, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.04269074648618698, + "rewards/margins": 0.04763682931661606, + "rewards/rejected": -0.0049460758455097675, + "step": 700 + }, + { + "epoch": 1.12, + "eval_logits/chosen": 1.52028489112854, + "eval_logits/rejected": 1.549724817276001, + "eval_logps/chosen": -153.03794860839844, + "eval_logps/rejected": -147.68222045898438, + "eval_loss": 0.690391480922699, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.029118061065673828, + "eval_rewards/margins": 0.007054829970002174, + "eval_rewards/rejected": 0.022063229233026505, + "eval_runtime": 91.06, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 700 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 3.8743929862976074, + "learning_rate": 4.808888888888888e-07, + "logits/chosen": 1.7265201807022095, + "logits/rejected": 1.663900375366211, + "logps/chosen": -175.18624877929688, + "logps/rejected": -144.47412109375, + "loss": 0.6695387840270997, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06449539959430695, + "rewards/margins": 0.04969433322548866, + "rewards/rejected": 0.014801068231463432, + "step": 710 + }, + { + "epoch": 1.152, + "grad_norm": 4.586221218109131, + "learning_rate": 4.7199999999999994e-07, + "logits/chosen": 1.5080691576004028, + "logits/rejected": 1.5206321477890015, + "logps/chosen": -139.26808166503906, + "logps/rejected": -158.47634887695312, + "loss": 0.6808416843414307, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05986329913139343, + "rewards/margins": 0.025978583842515945, + "rewards/rejected": 0.03388471156358719, + "step": 720 + }, + { + "epoch": 1.168, + "grad_norm": 5.567377090454102, + "learning_rate": 4.6311111111111106e-07, + "logits/chosen": 1.6352875232696533, + "logits/rejected": 1.5908689498901367, + "logps/chosen": -160.4707489013672, + "logps/rejected": -163.84034729003906, + "loss": 0.6816823959350586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0627724900841713, + "rewards/margins": 0.024768764153122902, + "rewards/rejected": 0.038003724068403244, + "step": 730 + }, + { + "epoch": 1.184, + "grad_norm": 4.378120422363281, + "learning_rate": 4.5422222222222223e-07, + "logits/chosen": 1.7893062829971313, + "logits/rejected": 1.8099231719970703, + "logps/chosen": -147.02523803710938, + "logps/rejected": -155.3527374267578, + "loss": 0.6705075740814209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07738201320171356, + "rewards/margins": 0.047473303973674774, + "rewards/rejected": 0.029908711090683937, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 3.968580961227417, + "learning_rate": 4.4533333333333335e-07, + "logits/chosen": 1.6924632787704468, + "logits/rejected": 1.6529285907745361, + "logps/chosen": -165.9091339111328, + "logps/rejected": -143.5700225830078, + "loss": 0.6665099143981934, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05950168892741203, + "rewards/margins": 0.05627988651394844, + "rewards/rejected": 0.0032218091655522585, + "step": 750 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 1.5162402391433716, + "eval_logits/rejected": 1.545271635055542, + "eval_logps/chosen": -152.985107421875, + "eval_logps/rejected": -147.642578125, + "eval_loss": 0.6899175643920898, + "eval_rewards/accuracies": 0.5199999809265137, + "eval_rewards/chosen": 0.034404147416353226, + "eval_rewards/margins": 0.008377066813409328, + "eval_rewards/rejected": 0.026027081534266472, + "eval_runtime": 91.0476, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 2.746, + "step": 750 + }, + { + "epoch": 1.216, + "grad_norm": 4.258547782897949, + "learning_rate": 4.3644444444444447e-07, + "logits/chosen": 1.5382895469665527, + "logits/rejected": 1.5183976888656616, + "logps/chosen": -155.6405029296875, + "logps/rejected": -137.9111785888672, + "loss": 0.675658893585205, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04784867540001869, + "rewards/margins": 0.036956787109375, + "rewards/rejected": 0.010891887359321117, + "step": 760 + }, + { + "epoch": 1.232, + "grad_norm": 4.439138889312744, + "learning_rate": 4.2755555555555554e-07, + "logits/chosen": 1.60821533203125, + "logits/rejected": 1.6539623737335205, + "logps/chosen": -147.12428283691406, + "logps/rejected": -126.96165466308594, + "loss": 0.6754384994506836, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.061006199568510056, + "rewards/margins": 0.037146344780921936, + "rewards/rejected": 0.023859847337007523, + "step": 770 + }, + { + "epoch": 1.248, + "grad_norm": 4.03473424911499, + "learning_rate": 4.1866666666666666e-07, + "logits/chosen": 1.728029489517212, + "logits/rejected": 1.8161824941635132, + "logps/chosen": -170.85220336914062, + "logps/rejected": -184.57485961914062, + "loss": 0.6782269954681397, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.058054376393556595, + "rewards/margins": 0.032068002969026566, + "rewards/rejected": 0.025986377149820328, + "step": 780 + }, + { + "epoch": 1.264, + "grad_norm": 3.655622959136963, + "learning_rate": 4.097777777777778e-07, + "logits/chosen": 1.7169599533081055, + "logits/rejected": 1.5931851863861084, + "logps/chosen": -147.65869140625, + "logps/rejected": -140.3924560546875, + "loss": 0.6885969638824463, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04908771440386772, + "rewards/margins": 0.011032032780349255, + "rewards/rejected": 0.03805568441748619, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 4.527826309204102, + "learning_rate": 4.008888888888889e-07, + "logits/chosen": 1.4913551807403564, + "logits/rejected": 1.5887629985809326, + "logps/chosen": -142.08096313476562, + "logps/rejected": -137.82150268554688, + "loss": 0.6758649826049805, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.060299746692180634, + "rewards/margins": 0.03790009766817093, + "rewards/rejected": 0.022399652749300003, + "step": 800 + }, + { + "epoch": 1.28, + "eval_logits/chosen": 1.518919825553894, + "eval_logits/rejected": 1.5479708909988403, + "eval_logps/chosen": -152.8915557861328, + "eval_logps/rejected": -147.5607452392578, + "eval_loss": 0.6894462704658508, + "eval_rewards/accuracies": 0.5120000243186951, + "eval_rewards/chosen": 0.04375747963786125, + "eval_rewards/margins": 0.009548054076731205, + "eval_rewards/rejected": 0.03420942649245262, + "eval_runtime": 91.0636, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 800 + }, + { + "epoch": 1.296, + "grad_norm": 4.380261421203613, + "learning_rate": 3.92e-07, + "logits/chosen": 1.4656002521514893, + "logits/rejected": 1.5140694379806519, + "logps/chosen": -164.12106323242188, + "logps/rejected": -142.29354858398438, + "loss": 0.681245994567871, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.062036655843257904, + "rewards/margins": 0.025710636749863625, + "rewards/rejected": 0.03632602095603943, + "step": 810 + }, + { + "epoch": 1.312, + "grad_norm": 5.250955581665039, + "learning_rate": 3.831111111111111e-07, + "logits/chosen": 1.367980718612671, + "logits/rejected": 1.5478198528289795, + "logps/chosen": -140.98110961914062, + "logps/rejected": -138.2202911376953, + "loss": 0.6749699592590332, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.04839102178812027, + "rewards/margins": 0.03896191343665123, + "rewards/rejected": 0.00942910648882389, + "step": 820 + }, + { + "epoch": 1.328, + "grad_norm": 4.680771827697754, + "learning_rate": 3.742222222222222e-07, + "logits/chosen": 1.5844545364379883, + "logits/rejected": 1.6176366806030273, + "logps/chosen": -145.24916076660156, + "logps/rejected": -133.55044555664062, + "loss": 0.6737788677215576, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07001302391290665, + "rewards/margins": 0.04148613661527634, + "rewards/rejected": 0.02852689102292061, + "step": 830 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 5.02022123336792, + "learning_rate": 3.653333333333333e-07, + "logits/chosen": 1.640952467918396, + "logits/rejected": 1.5505130290985107, + "logps/chosen": -137.98764038085938, + "logps/rejected": -125.2115249633789, + "loss": 0.6780137062072754, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.08490036427974701, + "rewards/margins": 0.03257036954164505, + "rewards/rejected": 0.05232999473810196, + "step": 840 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 4.4920525550842285, + "learning_rate": 3.5644444444444444e-07, + "logits/chosen": 1.5515003204345703, + "logits/rejected": 1.5913991928100586, + "logps/chosen": -132.81130981445312, + "logps/rejected": -137.0764923095703, + "loss": 0.6755705356597901, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07374271750450134, + "rewards/margins": 0.03724803403019905, + "rewards/rejected": 0.03649468347430229, + "step": 850 + }, + { + "epoch": 1.3599999999999999, + "eval_logits/chosen": 1.5257093906402588, + "eval_logits/rejected": 1.554529070854187, + "eval_logps/chosen": -152.75559997558594, + "eval_logps/rejected": -147.43365478515625, + "eval_loss": 0.6891025304794312, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.05735430866479874, + "eval_rewards/margins": 0.010435618460178375, + "eval_rewards/rejected": 0.04691869020462036, + "eval_runtime": 91.0584, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 850 + }, + { + "epoch": 1.376, + "grad_norm": 4.146276473999023, + "learning_rate": 3.4755555555555556e-07, + "logits/chosen": 1.48005211353302, + "logits/rejected": 1.6272552013397217, + "logps/chosen": -147.8726043701172, + "logps/rejected": -128.67236328125, + "loss": 0.6757299423217773, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.07889878004789352, + "rewards/margins": 0.03726017475128174, + "rewards/rejected": 0.041638605296611786, + "step": 860 + }, + { + "epoch": 1.392, + "grad_norm": 4.959987163543701, + "learning_rate": 3.386666666666667e-07, + "logits/chosen": 1.6901594400405884, + "logits/rejected": 1.6778481006622314, + "logps/chosen": -155.42953491210938, + "logps/rejected": -155.63232421875, + "loss": 0.6713708400726318, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.07203079760074615, + "rewards/margins": 0.04540194571018219, + "rewards/rejected": 0.026628846302628517, + "step": 870 + }, + { + "epoch": 1.408, + "grad_norm": 4.350543975830078, + "learning_rate": 3.2977777777777775e-07, + "logits/chosen": 1.6911184787750244, + "logits/rejected": 1.6407556533813477, + "logps/chosen": -138.36717224121094, + "logps/rejected": -148.633056640625, + "loss": 0.6685135364532471, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.08504179120063782, + "rewards/margins": 0.051595449447631836, + "rewards/rejected": 0.03344634547829628, + "step": 880 + }, + { + "epoch": 1.424, + "grad_norm": 4.636990070343018, + "learning_rate": 3.2088888888888887e-07, + "logits/chosen": 1.6906163692474365, + "logits/rejected": 1.5781571865081787, + "logps/chosen": -155.74984741210938, + "logps/rejected": -144.20733642578125, + "loss": 0.667927598953247, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09189029037952423, + "rewards/margins": 0.05316054821014404, + "rewards/rejected": 0.03872973471879959, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 3.7753641605377197, + "learning_rate": 3.12e-07, + "logits/chosen": 1.4137648344039917, + "logits/rejected": 1.5492877960205078, + "logps/chosen": -136.68003845214844, + "logps/rejected": -167.0130157470703, + "loss": 0.6825860977172852, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.07187347114086151, + "rewards/margins": 0.02330738678574562, + "rewards/rejected": 0.04856608435511589, + "step": 900 + }, + { + "epoch": 1.44, + "eval_logits/chosen": 1.5240751504898071, + "eval_logits/rejected": 1.5527867078781128, + "eval_logps/chosen": -152.74700927734375, + "eval_logps/rejected": -147.4285888671875, + "eval_loss": 0.6890708208084106, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.05821174010634422, + "eval_rewards/margins": 0.010788210667669773, + "eval_rewards/rejected": 0.047423530369997025, + "eval_runtime": 90.9575, + "eval_samples_per_second": 5.497, + "eval_steps_per_second": 2.749, + "step": 900 + }, + { + "epoch": 1.456, + "grad_norm": 4.123291492462158, + "learning_rate": 3.031111111111111e-07, + "logits/chosen": 1.6012052297592163, + "logits/rejected": 1.553945779800415, + "logps/chosen": -132.0549774169922, + "logps/rejected": -128.9656982421875, + "loss": 0.6811869621276856, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06877180188894272, + "rewards/margins": 0.02542707696557045, + "rewards/rejected": 0.04334472864866257, + "step": 910 + }, + { + "epoch": 1.472, + "grad_norm": 4.015886306762695, + "learning_rate": 2.9422222222222223e-07, + "logits/chosen": 1.7002413272857666, + "logits/rejected": 1.7617849111557007, + "logps/chosen": -138.71337890625, + "logps/rejected": -153.83413696289062, + "loss": 0.6822823524475098, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0791015774011612, + "rewards/margins": 0.023635607212781906, + "rewards/rejected": 0.05546595901250839, + "step": 920 + }, + { + "epoch": 1.488, + "grad_norm": 4.11157751083374, + "learning_rate": 2.853333333333333e-07, + "logits/chosen": 1.5699821710586548, + "logits/rejected": 1.6132333278656006, + "logps/chosen": -131.09051513671875, + "logps/rejected": -121.78560638427734, + "loss": 0.6841318130493164, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06592516601085663, + "rewards/margins": 0.02018926665186882, + "rewards/rejected": 0.04573589563369751, + "step": 930 + }, + { + "epoch": 1.504, + "grad_norm": 4.930734634399414, + "learning_rate": 2.764444444444444e-07, + "logits/chosen": 1.6620748043060303, + "logits/rejected": 1.6714975833892822, + "logps/chosen": -148.5550537109375, + "logps/rejected": -152.5007781982422, + "loss": 0.6815722942352295, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07848857343196869, + "rewards/margins": 0.025429734960198402, + "rewards/rejected": 0.053058840334415436, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 5.25533390045166, + "learning_rate": 2.6755555555555553e-07, + "logits/chosen": 1.7330894470214844, + "logits/rejected": 1.7220786809921265, + "logps/chosen": -164.49046325683594, + "logps/rejected": -163.75827026367188, + "loss": 0.6790725708007812, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.07422961294651031, + "rewards/margins": 0.02951725758612156, + "rewards/rejected": 0.044712357223033905, + "step": 950 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 1.5236308574676514, + "eval_logits/rejected": 1.552416205406189, + "eval_logps/chosen": -152.75738525390625, + "eval_logps/rejected": -147.43734741210938, + "eval_loss": 0.6891617178916931, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.05717539042234421, + "eval_rewards/margins": 0.010625330731272697, + "eval_rewards/rejected": 0.04655005782842636, + "eval_runtime": 90.991, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 2.748, + "step": 950 + }, + { + "epoch": 1.536, + "grad_norm": 4.0215373039245605, + "learning_rate": 2.5866666666666665e-07, + "logits/chosen": 1.7435601949691772, + "logits/rejected": 1.7673505544662476, + "logps/chosen": -176.63160705566406, + "logps/rejected": -160.36460876464844, + "loss": 0.6746974945068359, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0737513080239296, + "rewards/margins": 0.0406021922826767, + "rewards/rejected": 0.0331491120159626, + "step": 960 + }, + { + "epoch": 1.552, + "grad_norm": 4.616787433624268, + "learning_rate": 2.4977777777777777e-07, + "logits/chosen": 1.555336356163025, + "logits/rejected": 1.5377695560455322, + "logps/chosen": -144.1389923095703, + "logps/rejected": -159.7912139892578, + "loss": 0.680583906173706, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06359116733074188, + "rewards/margins": 0.027758348733186722, + "rewards/rejected": 0.03583281859755516, + "step": 970 + }, + { + "epoch": 1.568, + "grad_norm": 4.0511579513549805, + "learning_rate": 2.408888888888889e-07, + "logits/chosen": 1.5684994459152222, + "logits/rejected": 1.6245285272598267, + "logps/chosen": -156.8944549560547, + "logps/rejected": -148.4877166748047, + "loss": 0.6721893787384033, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.07115811854600906, + "rewards/margins": 0.04428264498710632, + "rewards/rejected": 0.026875469833612442, + "step": 980 + }, + { + "epoch": 1.584, + "grad_norm": 5.074681758880615, + "learning_rate": 2.32e-07, + "logits/chosen": 1.62103271484375, + "logits/rejected": 1.6313188076019287, + "logps/chosen": -143.34356689453125, + "logps/rejected": -140.15875244140625, + "loss": 0.6764451503753662, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06751976162195206, + "rewards/margins": 0.03728804737329483, + "rewards/rejected": 0.03023170866072178, + "step": 990 + }, + { + "epoch": 1.6, + "grad_norm": 5.334754943847656, + "learning_rate": 2.231111111111111e-07, + "logits/chosen": 1.6107559204101562, + "logits/rejected": 1.4455175399780273, + "logps/chosen": -148.34190368652344, + "logps/rejected": -142.6099090576172, + "loss": 0.6792704105377197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04247204214334488, + "rewards/margins": 0.029789533466100693, + "rewards/rejected": 0.012682514265179634, + "step": 1000 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 1.5214487314224243, + "eval_logits/rejected": 1.5501652956008911, + "eval_logps/chosen": -152.7904510498047, + "eval_logps/rejected": -147.47216796875, + "eval_loss": 0.6890739798545837, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": 0.053869131952524185, + "eval_rewards/margins": 0.010802755132317543, + "eval_rewards/rejected": 0.04306638240814209, + "eval_runtime": 91.0136, + "eval_samples_per_second": 5.494, + "eval_steps_per_second": 2.747, + "step": 1000 + }, + { + "epoch": 1.616, + "grad_norm": 4.7201948165893555, + "learning_rate": 2.1422222222222223e-07, + "logits/chosen": 1.5747811794281006, + "logits/rejected": 1.7228872776031494, + "logps/chosen": -149.9728546142578, + "logps/rejected": -170.13587951660156, + "loss": 0.6746612071990967, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07195371389389038, + "rewards/margins": 0.03933341056108475, + "rewards/rejected": 0.03262030705809593, + "step": 1010 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 3.240947723388672, + "learning_rate": 2.0533333333333332e-07, + "logits/chosen": 1.547649621963501, + "logits/rejected": 1.5779509544372559, + "logps/chosen": -118.39051818847656, + "logps/rejected": -139.14511108398438, + "loss": 0.6755857944488526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04843420907855034, + "rewards/margins": 0.03727344423532486, + "rewards/rejected": 0.011160760186612606, + "step": 1020 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.555476188659668, + "learning_rate": 1.9644444444444444e-07, + "logits/chosen": 1.6104564666748047, + "logits/rejected": 1.6452620029449463, + "logps/chosen": -147.3595733642578, + "logps/rejected": -138.03305053710938, + "loss": 0.6876121520996094, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.057530391961336136, + "rewards/margins": 0.013370493426918983, + "rewards/rejected": 0.0441599003970623, + "step": 1030 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 4.597150802612305, + "learning_rate": 1.8755555555555556e-07, + "logits/chosen": 1.5648972988128662, + "logits/rejected": 1.4944285154342651, + "logps/chosen": -139.306396484375, + "logps/rejected": -154.79624938964844, + "loss": 0.678040885925293, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05991698056459427, + "rewards/margins": 0.034894365817308426, + "rewards/rejected": 0.025022611021995544, + "step": 1040 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.591237545013428, + "learning_rate": 1.7866666666666665e-07, + "logits/chosen": 1.6720062494277954, + "logits/rejected": 1.6317745447158813, + "logps/chosen": -140.55374145507812, + "logps/rejected": -166.04254150390625, + "loss": 0.6782684803009034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.056691985577344894, + "rewards/margins": 0.03214425593614578, + "rewards/rejected": 0.024547729641199112, + "step": 1050 + }, + { + "epoch": 1.6800000000000002, + "eval_logits/chosen": 1.516204833984375, + "eval_logits/rejected": 1.545017957687378, + "eval_logps/chosen": -152.87672424316406, + "eval_logps/rejected": -147.5528106689453, + "eval_loss": 0.689375638961792, + "eval_rewards/accuracies": 0.5180000066757202, + "eval_rewards/chosen": 0.04524260014295578, + "eval_rewards/margins": 0.010239595547318459, + "eval_rewards/rejected": 0.03500300645828247, + "eval_runtime": 91.0411, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 2.746, + "step": 1050 + }, + { + "epoch": 1.696, + "grad_norm": 4.6675896644592285, + "learning_rate": 1.6977777777777777e-07, + "logits/chosen": 1.3496609926223755, + "logits/rejected": 1.4482405185699463, + "logps/chosen": -145.14395141601562, + "logps/rejected": -147.69923400878906, + "loss": 0.6717345714569092, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05466550588607788, + "rewards/margins": 0.04511015862226486, + "rewards/rejected": 0.009555344469845295, + "step": 1060 + }, + { + "epoch": 1.712, + "grad_norm": 5.049563884735107, + "learning_rate": 1.608888888888889e-07, + "logits/chosen": 1.8515784740447998, + "logits/rejected": 1.6993423700332642, + "logps/chosen": -159.16888427734375, + "logps/rejected": -181.34690856933594, + "loss": 0.6704267978668212, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07823075354099274, + "rewards/margins": 0.04844868183135986, + "rewards/rejected": 0.029782067984342575, + "step": 1070 + }, + { + "epoch": 1.728, + "grad_norm": 3.619241714477539, + "learning_rate": 1.5199999999999998e-07, + "logits/chosen": 1.6090354919433594, + "logits/rejected": 1.626612901687622, + "logps/chosen": -154.02679443359375, + "logps/rejected": -142.3501434326172, + "loss": 0.6738579273223877, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.07384214550256729, + "rewards/margins": 0.04149458929896355, + "rewards/rejected": 0.032347556203603745, + "step": 1080 + }, + { + "epoch": 1.744, + "grad_norm": 4.203415393829346, + "learning_rate": 1.431111111111111e-07, + "logits/chosen": 1.6636466979980469, + "logits/rejected": 1.7839996814727783, + "logps/chosen": -151.83984375, + "logps/rejected": -156.90054321289062, + "loss": 0.6809844017028809, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07428060472011566, + "rewards/margins": 0.02610836923122406, + "rewards/rejected": 0.0481722429394722, + "step": 1090 + }, + { + "epoch": 1.76, + "grad_norm": 5.443735122680664, + "learning_rate": 1.342222222222222e-07, + "logits/chosen": 1.6737174987792969, + "logits/rejected": 1.6681289672851562, + "logps/chosen": -170.30580139160156, + "logps/rejected": -123.48567199707031, + "loss": 0.6710718631744385, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.052741169929504395, + "rewards/margins": 0.04700620472431183, + "rewards/rejected": 0.0057349675334990025, + "step": 1100 + }, + { + "epoch": 1.76, + "eval_logits/chosen": 1.5129518508911133, + "eval_logits/rejected": 1.5416380167007446, + "eval_logps/chosen": -152.91659545898438, + "eval_logps/rejected": -147.5920867919922, + "eval_loss": 0.6894625425338745, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.0412554033100605, + "eval_rewards/margins": 0.010181105695664883, + "eval_rewards/rejected": 0.031074294820427895, + "eval_runtime": 91.2168, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.741, + "step": 1100 + }, + { + "epoch": 1.776, + "grad_norm": 4.157502174377441, + "learning_rate": 1.2533333333333332e-07, + "logits/chosen": 1.6970571279525757, + "logits/rejected": 1.6661456823349, + "logps/chosen": -155.0122528076172, + "logps/rejected": -157.14593505859375, + "loss": 0.6756897926330566, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06435829401016235, + "rewards/margins": 0.03779596835374832, + "rewards/rejected": 0.02656233310699463, + "step": 1110 + }, + { + "epoch": 1.792, + "grad_norm": 3.509523868560791, + "learning_rate": 1.1644444444444444e-07, + "logits/chosen": 1.5473051071166992, + "logits/rejected": 1.6796495914459229, + "logps/chosen": -142.53172302246094, + "logps/rejected": -155.2112579345703, + "loss": 0.6751244068145752, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06569470465183258, + "rewards/margins": 0.03866187483072281, + "rewards/rejected": 0.02703283168375492, + "step": 1120 + }, + { + "epoch": 1.808, + "grad_norm": 4.258018970489502, + "learning_rate": 1.0755555555555556e-07, + "logits/chosen": 1.6674772500991821, + "logits/rejected": 1.740517258644104, + "logps/chosen": -143.2890625, + "logps/rejected": -144.5006103515625, + "loss": 0.6744827270507813, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.049980729818344116, + "rewards/margins": 0.039941221475601196, + "rewards/rejected": 0.01003950648009777, + "step": 1130 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 4.8917388916015625, + "learning_rate": 9.866666666666666e-08, + "logits/chosen": 1.4226986169815063, + "logits/rejected": 1.497089147567749, + "logps/chosen": -132.8553009033203, + "logps/rejected": -132.77996826171875, + "loss": 0.6772085189819336, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.046212293207645416, + "rewards/margins": 0.03522529453039169, + "rewards/rejected": 0.010986998677253723, + "step": 1140 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 5.875709533691406, + "learning_rate": 8.977777777777777e-08, + "logits/chosen": 1.5243072509765625, + "logits/rejected": 1.6050777435302734, + "logps/chosen": -145.8072967529297, + "logps/rejected": -157.0274200439453, + "loss": 0.6806241035461426, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05913332849740982, + "rewards/margins": 0.02726324275135994, + "rewards/rejected": 0.03187008947134018, + "step": 1150 + }, + { + "epoch": 1.8399999999999999, + "eval_logits/chosen": 1.5127122402191162, + "eval_logits/rejected": 1.5414215326309204, + "eval_logps/chosen": -152.91351318359375, + "eval_logps/rejected": -147.59005737304688, + "eval_loss": 0.6894447207450867, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.04156311973929405, + "eval_rewards/margins": 0.010284863412380219, + "eval_rewards/rejected": 0.031278256326913834, + "eval_runtime": 91.0625, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 1150 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 4.677998065948486, + "learning_rate": 8.088888888888888e-08, + "logits/chosen": 1.4877736568450928, + "logits/rejected": 1.5148862600326538, + "logps/chosen": -125.41712951660156, + "logps/rejected": -151.0792999267578, + "loss": 0.6776061058044434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.052109260112047195, + "rewards/margins": 0.03345213457942009, + "rewards/rejected": 0.018657123669981956, + "step": 1160 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 6.006388187408447, + "learning_rate": 7.2e-08, + "logits/chosen": 1.7018489837646484, + "logits/rejected": 1.6689262390136719, + "logps/chosen": -148.28671264648438, + "logps/rejected": -147.13995361328125, + "loss": 0.6749439239501953, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.056884974241256714, + "rewards/margins": 0.038883306086063385, + "rewards/rejected": 0.01800166629254818, + "step": 1170 + }, + { + "epoch": 1.888, + "grad_norm": 4.416358947753906, + "learning_rate": 6.311111111111112e-08, + "logits/chosen": 1.5913951396942139, + "logits/rejected": 1.6728503704071045, + "logps/chosen": -161.4684295654297, + "logps/rejected": -155.4317626953125, + "loss": 0.67286696434021, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.08693180978298187, + "rewards/margins": 0.04347502067685127, + "rewards/rejected": 0.0434567965567112, + "step": 1180 + }, + { + "epoch": 1.904, + "grad_norm": 4.496129035949707, + "learning_rate": 5.4222222222222216e-08, + "logits/chosen": 1.66684091091156, + "logits/rejected": 1.563511848449707, + "logps/chosen": -146.62295532226562, + "logps/rejected": -129.34072875976562, + "loss": 0.6853626251220704, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.05792286992073059, + "rewards/margins": 0.018264759331941605, + "rewards/rejected": 0.03965810686349869, + "step": 1190 + }, + { + "epoch": 1.92, + "grad_norm": 4.628019332885742, + "learning_rate": 4.5333333333333336e-08, + "logits/chosen": 1.7843306064605713, + "logits/rejected": 1.620391607284546, + "logps/chosen": -143.50228881835938, + "logps/rejected": -154.37442016601562, + "loss": 0.6730541706085205, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06739021092653275, + "rewards/margins": 0.043615005910396576, + "rewards/rejected": 0.023775208741426468, + "step": 1200 + }, + { + "epoch": 1.92, + "eval_logits/chosen": 1.5122698545455933, + "eval_logits/rejected": 1.5408285856246948, + "eval_logps/chosen": -152.92884826660156, + "eval_logps/rejected": -147.60997009277344, + "eval_loss": 0.6892240643501282, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": 0.04003090038895607, + "eval_rewards/margins": 0.010743732564151287, + "eval_rewards/rejected": 0.02928716316819191, + "eval_runtime": 91.1446, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 1200 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1200/training_args.bin b/v5/DPO/DPO_5k/lora/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cb6e403b06e05c65a94488c31a57e3448aa1a628 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2949ac3bd9315d3a45a1d086fec2301ea7bd1dab4938d70cfd24209203d51940 +size 6161 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/README.md b/v5/DPO/DPO_5k/lora/checkpoint-1250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/adapter_config.json b/v5/DPO/DPO_5k/lora/checkpoint-1250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11512c97a55a2d441704a9e11460444b5019509a --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "o_proj", + "up_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/adapter_model.safetensors b/v5/DPO/DPO_5k/lora/checkpoint-1250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c3cd1c0ce468f61c6ba15dfbb443f28c927d78ef --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02e5c699db44dc76dd8166101fd9c2e76e9874861fab843e913db54c28ece8d9 +size 180385008 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/chat_template.jinja b/v5/DPO/DPO_5k/lora/checkpoint-1250/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/optimizer.pt b/v5/DPO/DPO_5k/lora/checkpoint-1250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d859c582021369c627f6caf443518a4ad8e85928 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0596481ef539c3bc75d557aea1f45d989e1fcc6d0763f167f08b5a5dda762eff +size 360902475 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/rng_state.pth b/v5/DPO/DPO_5k/lora/checkpoint-1250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..68c0411dd375a388cbc8c58bea912cb904778ab8 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1786ad2057a678cc204dadc7fc5d1a4f939be477df219f770c7d40e9270281 +size 14645 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/scaler.pt b/v5/DPO/DPO_5k/lora/checkpoint-1250/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8acc02ea691ea976779c9243524c7365b0897dc --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f99a1a4a55270b7355105d1bb34b386a930d3e5cdd5ac90325bf3e1a7d3c6b2 +size 1383 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/scheduler.pt b/v5/DPO/DPO_5k/lora/checkpoint-1250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2b7a902ed4f8b0f93c5d60d693446ebebf82e20 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e145664f01136a1555e0d5f604d325102b2c261ef5f13db13e0f9d51cdea4ef7 +size 1465 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/tokenizer.json b/v5/DPO/DPO_5k/lora/checkpoint-1250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/tokenizer_config.json b/v5/DPO/DPO_5k/lora/checkpoint-1250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/trainer_state.json b/v5/DPO/DPO_5k/lora/checkpoint-1250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f6fbccbcde9b83ba0ee044a5b64412ff7fae4bb2 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/trainer_state.json @@ -0,0 +1,2309 @@ +{ + "best_global_step": 300, + "best_metric": 0.5440000295639038, + "best_model_checkpoint": "output/lora/checkpoint-300", + "epoch": 2.0, + "eval_steps": 50, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 4.368600368499756, + "learning_rate": 7.2e-08, + "logits/chosen": 1.684491515159607, + "logits/rejected": 1.6000019311904907, + "logps/chosen": -145.20462036132812, + "logps/rejected": -150.64056396484375, + "loss": 0.6933496475219727, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.00038564440910704434, + "rewards/margins": -0.0003992128185927868, + "rewards/rejected": 1.356836855848087e-05, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 5.3214850425720215, + "learning_rate": 1.5199999999999998e-07, + "logits/chosen": 1.3753983974456787, + "logits/rejected": 1.4558300971984863, + "logps/chosen": -120.09315490722656, + "logps/rejected": -133.41905212402344, + "loss": 0.6935864925384522, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.0006091356044635177, + "rewards/margins": -0.0008693885756656528, + "rewards/rejected": 0.0002602529712021351, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 4.661340713500977, + "learning_rate": 2.32e-07, + "logits/chosen": 1.5848007202148438, + "logits/rejected": 1.744507074356079, + "logps/chosen": -161.58753967285156, + "logps/rejected": -178.603271484375, + "loss": 0.6937230110168457, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.002343270927667618, + "rewards/margins": -0.001140077132731676, + "rewards/rejected": -0.0012031936785206199, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 5.193538188934326, + "learning_rate": 3.12e-07, + "logits/chosen": 1.6050277948379517, + "logits/rejected": 1.534880518913269, + "logps/chosen": -151.981689453125, + "logps/rejected": -150.1208038330078, + "loss": 0.6932186126708985, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0018907630583271384, + "rewards/margins": -0.00012836574751418084, + "rewards/rejected": -0.0017623973544687033, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 4.683797359466553, + "learning_rate": 3.92e-07, + "logits/chosen": 1.734514594078064, + "logits/rejected": 1.7892601490020752, + "logps/chosen": -169.11004638671875, + "logps/rejected": -156.22427368164062, + "loss": 0.692991828918457, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.00029434924363158643, + "rewards/margins": 0.0003237081109546125, + "rewards/rejected": -2.9358878236962482e-05, + "step": 50 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.5363190174102783, + "eval_logits/rejected": 1.567551612854004, + "eval_logps/chosen": -153.31736755371094, + "eval_logps/rejected": -147.88914489746094, + "eval_loss": 0.6932514905929565, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.0011768279364332557, + "eval_rewards/margins": -0.00019350247748661786, + "eval_rewards/rejected": 0.0013703303411602974, + "eval_runtime": 91.1759, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 4.66879940032959, + "learning_rate": 4.7199999999999994e-07, + "logits/chosen": 1.8444726467132568, + "logits/rejected": 1.8203474283218384, + "logps/chosen": -158.23243713378906, + "logps/rejected": -149.02316284179688, + "loss": 0.6930979251861572, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.002846779767423868, + "rewards/margins": 0.00011160141002619639, + "rewards/rejected": 0.0027351784519851208, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 3.876270294189453, + "learning_rate": 5.520000000000001e-07, + "logits/chosen": 1.8535444736480713, + "logits/rejected": 1.7816137075424194, + "logps/chosen": -157.98268127441406, + "logps/rejected": -164.66925048828125, + "loss": 0.6924784183502197, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.006634838879108429, + "rewards/margins": 0.0013595198979601264, + "rewards/rejected": 0.005275317933410406, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 4.669241428375244, + "learning_rate": 6.319999999999999e-07, + "logits/chosen": 1.5538957118988037, + "logits/rejected": 1.5381535291671753, + "logps/chosen": -145.74713134765625, + "logps/rejected": -137.40780639648438, + "loss": 0.6929487705230712, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009571035392582417, + "rewards/margins": 0.000421993900090456, + "rewards/rejected": 0.009149041026830673, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 4.3166022300720215, + "learning_rate": 7.119999999999999e-07, + "logits/chosen": 1.5454356670379639, + "logits/rejected": 1.5363503694534302, + "logps/chosen": -162.4505157470703, + "logps/rejected": -159.41574096679688, + "loss": 0.6919499397277832, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.011575761251151562, + "rewards/margins": 0.0024396872613579035, + "rewards/rejected": 0.009136073291301727, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 4.640413284301758, + "learning_rate": 7.92e-07, + "logits/chosen": 1.6131670475006104, + "logits/rejected": 1.673753023147583, + "logps/chosen": -142.8424530029297, + "logps/rejected": -165.93234252929688, + "loss": 0.6919528007507324, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.014428429305553436, + "rewards/margins": 0.0024503350723534822, + "rewards/rejected": 0.011978095397353172, + "step": 100 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.5355972051620483, + "eval_logits/rejected": 1.5665204524993896, + "eval_logps/chosen": -153.1815948486328, + "eval_logps/rejected": -147.7570037841797, + "eval_loss": 0.6931047439575195, + "eval_rewards/accuracies": 0.46000000834465027, + "eval_rewards/chosen": 0.014755296520888805, + "eval_rewards/margins": 0.00017206119082402438, + "eval_rewards/rejected": 0.01458323560655117, + "eval_runtime": 91.1022, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 2.744, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 4.072097301483154, + "learning_rate": 8.72e-07, + "logits/chosen": 1.5775041580200195, + "logits/rejected": 1.6383779048919678, + "logps/chosen": -143.57952880859375, + "logps/rejected": -137.651611328125, + "loss": 0.6926439762115478, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.01751135289669037, + "rewards/margins": 0.0010686519090086222, + "rewards/rejected": 0.01644269935786724, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 4.417011260986328, + "learning_rate": 9.52e-07, + "logits/chosen": 1.5828511714935303, + "logits/rejected": 1.6531331539154053, + "logps/chosen": -142.53514099121094, + "logps/rejected": -142.88226318359375, + "loss": 0.6947136402130127, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.012874701991677284, + "rewards/margins": -0.0030476213432848454, + "rewards/rejected": 0.015922321006655693, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 5.043814182281494, + "learning_rate": 9.964444444444445e-07, + "logits/chosen": 1.7005653381347656, + "logits/rejected": 1.8352782726287842, + "logps/chosen": -155.3563995361328, + "logps/rejected": -151.04742431640625, + "loss": 0.693049955368042, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004944052547216415, + "rewards/margins": 0.000274717720458284, + "rewards/rejected": 0.004669335670769215, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 4.280579566955566, + "learning_rate": 9.875555555555555e-07, + "logits/chosen": 1.4931142330169678, + "logits/rejected": 1.5797803401947021, + "logps/chosen": -134.4127655029297, + "logps/rejected": -132.8173065185547, + "loss": 0.6921967983245849, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -4.65536504634656e-05, + "rewards/margins": 0.001995303900912404, + "rewards/rejected": -0.002041857223957777, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 3.553212881088257, + "learning_rate": 9.786666666666666e-07, + "logits/chosen": 1.7535524368286133, + "logits/rejected": 1.7318353652954102, + "logps/chosen": -141.84011840820312, + "logps/rejected": -140.5338592529297, + "loss": 0.6925621032714844, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.006464059464633465, + "rewards/margins": 0.0012660837965086102, + "rewards/rejected": 0.005197975784540176, + "step": 150 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.527831792831421, + "eval_logits/rejected": 1.55862557888031, + "eval_logps/chosen": -153.29705810546875, + "eval_logps/rejected": -147.86842346191406, + "eval_loss": 0.6933275461196899, + "eval_rewards/accuracies": 0.49399998784065247, + "eval_rewards/chosen": 0.003206671681255102, + "eval_rewards/margins": -0.000235457657254301, + "eval_rewards/rejected": 0.0034421291202306747, + "eval_runtime": 91.0798, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 2.745, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 5.0327606201171875, + "learning_rate": 9.697777777777776e-07, + "logits/chosen": 1.7338924407958984, + "logits/rejected": 1.6693298816680908, + "logps/chosen": -163.9834747314453, + "logps/rejected": -147.54122924804688, + "loss": 0.6920580387115478, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.010300886817276478, + "rewards/margins": 0.0022598577197641134, + "rewards/rejected": 0.00804102886468172, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 4.203429222106934, + "learning_rate": 9.608888888888888e-07, + "logits/chosen": 1.548438310623169, + "logits/rejected": 1.608687400817871, + "logps/chosen": -140.65548706054688, + "logps/rejected": -124.45481872558594, + "loss": 0.693133544921875, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.011312992312014103, + "rewards/margins": 0.00010961303632939234, + "rewards/rejected": 0.011203380301594734, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 3.8275039196014404, + "learning_rate": 9.52e-07, + "logits/chosen": 1.5688340663909912, + "logits/rejected": 1.5681618452072144, + "logps/chosen": -153.26898193359375, + "logps/rejected": -154.38824462890625, + "loss": 0.6913642883300781, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.018832791596651077, + "rewards/margins": 0.0036639694590121508, + "rewards/rejected": 0.015168821439146996, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 4.507416725158691, + "learning_rate": 9.431111111111111e-07, + "logits/chosen": 1.6990807056427002, + "logits/rejected": 1.646045446395874, + "logps/chosen": -156.6995849609375, + "logps/rejected": -155.2141876220703, + "loss": 0.6934223651885987, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.027501707896590233, + "rewards/margins": -0.00040446725324727595, + "rewards/rejected": 0.027906173840165138, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 4.723247051239014, + "learning_rate": 9.342222222222221e-07, + "logits/chosen": 1.4298136234283447, + "logits/rejected": 1.6043508052825928, + "logps/chosen": -126.81380462646484, + "logps/rejected": -133.35108947753906, + "loss": 0.688706636428833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03327309712767601, + "rewards/margins": 0.009079854004085064, + "rewards/rejected": 0.024193240329623222, + "step": 200 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5382241010665894, + "eval_logits/rejected": 1.5689103603363037, + "eval_logps/chosen": -152.9728240966797, + "eval_logps/rejected": -147.55966186523438, + "eval_loss": 0.6926390528678894, + "eval_rewards/accuracies": 0.4880000054836273, + "eval_rewards/chosen": 0.03563162684440613, + "eval_rewards/margins": 0.0013116379268467426, + "eval_rewards/rejected": 0.03431998938322067, + "eval_runtime": 91.1085, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 2.744, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 5.396594047546387, + "learning_rate": 9.253333333333333e-07, + "logits/chosen": 1.7294985055923462, + "logits/rejected": 1.6115707159042358, + "logps/chosen": -150.76341247558594, + "logps/rejected": -126.10733795166016, + "loss": 0.6902324199676514, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.043607715517282486, + "rewards/margins": 0.0060890489257872105, + "rewards/rejected": 0.037518661469221115, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 4.06983757019043, + "learning_rate": 9.164444444444443e-07, + "logits/chosen": 1.7748816013336182, + "logits/rejected": 1.7431707382202148, + "logps/chosen": -163.86878967285156, + "logps/rejected": -142.68081665039062, + "loss": 0.6923216342926025, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.057915735989809036, + "rewards/margins": 0.0020550203043967485, + "rewards/rejected": 0.055860716849565506, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 3.652050256729126, + "learning_rate": 9.075555555555555e-07, + "logits/chosen": 1.6430679559707642, + "logits/rejected": 1.6094305515289307, + "logps/chosen": -148.5363006591797, + "logps/rejected": -153.50338745117188, + "loss": 0.6937844753265381, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05856321379542351, + "rewards/margins": -0.0007888395339250565, + "rewards/rejected": 0.05935205891728401, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 3.4950904846191406, + "learning_rate": 8.986666666666666e-07, + "logits/chosen": 1.6158307790756226, + "logits/rejected": 1.7254810333251953, + "logps/chosen": -157.8291473388672, + "logps/rejected": -164.51071166992188, + "loss": 0.6928259372711182, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06535087525844574, + "rewards/margins": 0.001135659171268344, + "rewards/rejected": 0.06421522051095963, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 2.940080165863037, + "learning_rate": 8.897777777777777e-07, + "logits/chosen": 1.5965789556503296, + "logits/rejected": 1.649510145187378, + "logps/chosen": -137.41818237304688, + "logps/rejected": -150.51309204101562, + "loss": 0.6896316051483155, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0644104927778244, + "rewards/margins": 0.007390809245407581, + "rewards/rejected": 0.057019688189029694, + "step": 250 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.55246102809906, + "eval_logits/rejected": 1.5829427242279053, + "eval_logps/chosen": -152.7037811279297, + "eval_logps/rejected": -147.31491088867188, + "eval_loss": 0.6915441751480103, + "eval_rewards/accuracies": 0.5199999809265137, + "eval_rewards/chosen": 0.06253667175769806, + "eval_rewards/margins": 0.0037424375768750906, + "eval_rewards/rejected": 0.058794230222702026, + "eval_runtime": 90.9689, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 2.748, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 4.24291467666626, + "learning_rate": 8.808888888888889e-07, + "logits/chosen": 1.640729546546936, + "logits/rejected": 1.6604511737823486, + "logps/chosen": -144.95303344726562, + "logps/rejected": -149.94384765625, + "loss": 0.691684627532959, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07415647059679031, + "rewards/margins": 0.0033445146400481462, + "rewards/rejected": 0.07081194967031479, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 4.855024814605713, + "learning_rate": 8.72e-07, + "logits/chosen": 1.6059837341308594, + "logits/rejected": 1.7337257862091064, + "logps/chosen": -157.46888732910156, + "logps/rejected": -150.69549560546875, + "loss": 0.6913710117340088, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0714256763458252, + "rewards/margins": 0.00412519508972764, + "rewards/rejected": 0.06730048358440399, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 3.2214746475219727, + "learning_rate": 8.631111111111111e-07, + "logits/chosen": 1.7127368450164795, + "logits/rejected": 1.7540982961654663, + "logps/chosen": -131.8098602294922, + "logps/rejected": -150.68472290039062, + "loss": 0.6942379474639893, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.07171601057052612, + "rewards/margins": -0.0016212888294830918, + "rewards/rejected": 0.07333729416131973, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 4.169992446899414, + "learning_rate": 8.542222222222222e-07, + "logits/chosen": 1.6667410135269165, + "logits/rejected": 1.6612989902496338, + "logps/chosen": -154.6461639404297, + "logps/rejected": -148.51638793945312, + "loss": 0.6932468891143799, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06825742870569229, + "rewards/margins": 0.0004094833566341549, + "rewards/rejected": 0.06784794479608536, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 3.3903329372406006, + "learning_rate": 8.453333333333334e-07, + "logits/chosen": 1.5482908487319946, + "logits/rejected": 1.5708004236221313, + "logps/chosen": -138.28341674804688, + "logps/rejected": -140.89016723632812, + "loss": 0.6920734405517578, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05976419895887375, + "rewards/margins": 0.002482531126588583, + "rewards/rejected": 0.057281672954559326, + "step": 300 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.55509614944458, + "eval_logits/rejected": 1.5857810974121094, + "eval_logps/chosen": -152.74725341796875, + "eval_logps/rejected": -147.36598205566406, + "eval_loss": 0.6911302208900452, + "eval_rewards/accuracies": 0.5440000295639038, + "eval_rewards/chosen": 0.05818922445178032, + "eval_rewards/margins": 0.004501740448176861, + "eval_rewards/rejected": 0.053687483072280884, + "eval_runtime": 90.9977, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 2.747, + "step": 300 + }, + { + "epoch": 0.496, + "grad_norm": 4.032289981842041, + "learning_rate": 8.364444444444443e-07, + "logits/chosen": 1.7680352926254272, + "logits/rejected": 1.8532991409301758, + "logps/chosen": -159.21726989746094, + "logps/rejected": -138.13540649414062, + "loss": 0.6892982959747315, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.06391827762126923, + "rewards/margins": 0.00805785320699215, + "rewards/rejected": 0.05586041882634163, + "step": 310 + }, + { + "epoch": 0.512, + "grad_norm": 4.574601650238037, + "learning_rate": 8.275555555555555e-07, + "logits/chosen": 1.785130500793457, + "logits/rejected": 1.6803003549575806, + "logps/chosen": -150.91397094726562, + "logps/rejected": -142.10299682617188, + "loss": 0.6916579723358154, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.05742686986923218, + "rewards/margins": 0.0033137183636426926, + "rewards/rejected": 0.054113149642944336, + "step": 320 + }, + { + "epoch": 0.528, + "grad_norm": 4.799781322479248, + "learning_rate": 8.186666666666666e-07, + "logits/chosen": 1.5050979852676392, + "logits/rejected": 1.5341730117797852, + "logps/chosen": -136.2999725341797, + "logps/rejected": -155.4806365966797, + "loss": 0.6880038261413575, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.06376932561397552, + "rewards/margins": 0.010802140459418297, + "rewards/rejected": 0.05296717956662178, + "step": 330 + }, + { + "epoch": 0.544, + "grad_norm": 3.9436914920806885, + "learning_rate": 8.097777777777778e-07, + "logits/chosen": 1.5927613973617554, + "logits/rejected": 1.6387131214141846, + "logps/chosen": -142.1040496826172, + "logps/rejected": -119.8014907836914, + "loss": 0.6890507698059082, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.06471486389636993, + "rewards/margins": 0.008592360652983189, + "rewards/rejected": 0.05612250417470932, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 4.886368751525879, + "learning_rate": 8.008888888888888e-07, + "logits/chosen": 1.6515939235687256, + "logits/rejected": 1.43025541305542, + "logps/chosen": -137.16024780273438, + "logps/rejected": -133.55267333984375, + "loss": 0.6931482791900635, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06017603352665901, + "rewards/margins": 0.0006265616975724697, + "rewards/rejected": 0.059549469500780106, + "step": 350 + }, + { + "epoch": 0.56, + "eval_logits/chosen": 1.5533874034881592, + "eval_logits/rejected": 1.5838865041732788, + "eval_logps/chosen": -152.75637817382812, + "eval_logps/rejected": -147.3810577392578, + "eval_loss": 0.6908727884292603, + "eval_rewards/accuracies": 0.5220000147819519, + "eval_rewards/chosen": 0.05727628618478775, + "eval_rewards/margins": 0.005099303554743528, + "eval_rewards/rejected": 0.05217698588967323, + "eval_runtime": 91.1217, + "eval_samples_per_second": 5.487, + "eval_steps_per_second": 2.744, + "step": 350 + }, + { + "epoch": 0.576, + "grad_norm": 5.395493030548096, + "learning_rate": 7.92e-07, + "logits/chosen": 1.6785246133804321, + "logits/rejected": 1.7938287258148193, + "logps/chosen": -162.50350952148438, + "logps/rejected": -169.7019500732422, + "loss": 0.6897931098937988, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06360206753015518, + "rewards/margins": 0.007130332291126251, + "rewards/rejected": 0.05647173523902893, + "step": 360 + }, + { + "epoch": 0.592, + "grad_norm": 4.3724541664123535, + "learning_rate": 7.831111111111111e-07, + "logits/chosen": 1.7033554315567017, + "logits/rejected": 1.7527239322662354, + "logps/chosen": -174.69869995117188, + "logps/rejected": -175.63180541992188, + "loss": 0.6901405334472657, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.059598714113235474, + "rewards/margins": 0.006548056844621897, + "rewards/rejected": 0.053050655871629715, + "step": 370 + }, + { + "epoch": 0.608, + "grad_norm": 4.353290557861328, + "learning_rate": 7.742222222222222e-07, + "logits/chosen": 1.6846504211425781, + "logits/rejected": 1.7877483367919922, + "logps/chosen": -136.30889892578125, + "logps/rejected": -159.4036407470703, + "loss": 0.690678882598877, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.050548993051052094, + "rewards/margins": 0.005496628116816282, + "rewards/rejected": 0.04505236819386482, + "step": 380 + }, + { + "epoch": 0.624, + "grad_norm": 4.879935264587402, + "learning_rate": 7.653333333333333e-07, + "logits/chosen": 1.4606497287750244, + "logits/rejected": 1.7344859838485718, + "logps/chosen": -130.42169189453125, + "logps/rejected": -166.63027954101562, + "loss": 0.6936720848083496, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.046336062252521515, + "rewards/margins": -0.0004482082149479538, + "rewards/rejected": 0.04678427055478096, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 4.271523475646973, + "learning_rate": 7.564444444444445e-07, + "logits/chosen": 1.6960214376449585, + "logits/rejected": 1.6826406717300415, + "logps/chosen": -149.68972778320312, + "logps/rejected": -160.5981903076172, + "loss": 0.6915639400482178, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.037508051842451096, + "rewards/margins": 0.0035654257517307997, + "rewards/rejected": 0.033942628651857376, + "step": 400 + }, + { + "epoch": 0.64, + "eval_logits/chosen": 1.5374687910079956, + "eval_logits/rejected": 1.567594051361084, + "eval_logps/chosen": -152.97109985351562, + "eval_logps/rejected": -147.59535217285156, + "eval_loss": 0.6909184455871582, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.035805922001600266, + "eval_rewards/margins": 0.0050564357079565525, + "eval_rewards/rejected": 0.03074948862195015, + "eval_runtime": 91.0668, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 2.745, + "step": 400 + }, + { + "epoch": 0.656, + "grad_norm": 5.176839828491211, + "learning_rate": 7.475555555555555e-07, + "logits/chosen": 1.5886105298995972, + "logits/rejected": 1.6644985675811768, + "logps/chosen": -156.63021850585938, + "logps/rejected": -145.19676208496094, + "loss": 0.6877344131469727, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.03953630477190018, + "rewards/margins": 0.011391694657504559, + "rewards/rejected": 0.028144609183073044, + "step": 410 + }, + { + "epoch": 0.672, + "grad_norm": 4.446777820587158, + "learning_rate": 7.386666666666666e-07, + "logits/chosen": 1.625689148902893, + "logits/rejected": 1.6569058895111084, + "logps/chosen": -131.517578125, + "logps/rejected": -131.78964233398438, + "loss": 0.6866260528564453, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04599715396761894, + "rewards/margins": 0.013597942888736725, + "rewards/rejected": 0.03239920735359192, + "step": 420 + }, + { + "epoch": 0.688, + "grad_norm": 4.535679817199707, + "learning_rate": 7.297777777777777e-07, + "logits/chosen": 1.734668493270874, + "logits/rejected": 1.7557735443115234, + "logps/chosen": -139.6569366455078, + "logps/rejected": -140.00808715820312, + "loss": 0.6939756393432617, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.044145092368125916, + "rewards/margins": -0.000974461785517633, + "rewards/rejected": 0.045119550079107285, + "step": 430 + }, + { + "epoch": 0.704, + "grad_norm": 3.674733877182007, + "learning_rate": 7.208888888888889e-07, + "logits/chosen": 1.7089004516601562, + "logits/rejected": 1.7358248233795166, + "logps/chosen": -155.2206573486328, + "logps/rejected": -144.6761932373047, + "loss": 0.6933175563812256, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.043650850653648376, + "rewards/margins": 0.00010928641859209165, + "rewards/rejected": 0.0435415655374527, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 5.13329553604126, + "learning_rate": 7.119999999999999e-07, + "logits/chosen": 1.5127164125442505, + "logits/rejected": 1.6689109802246094, + "logps/chosen": -150.4583282470703, + "logps/rejected": -147.11000061035156, + "loss": 0.6905129909515381, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.030837291851639748, + "rewards/margins": 0.006133320741355419, + "rewards/rejected": 0.024703968316316605, + "step": 450 + }, + { + "epoch": 0.72, + "eval_logits/chosen": 1.5396348237991333, + "eval_logits/rejected": 1.5696207284927368, + "eval_logps/chosen": -152.88949584960938, + "eval_logps/rejected": -147.522216796875, + "eval_loss": 0.6905900239944458, + "eval_rewards/accuracies": 0.515999972820282, + "eval_rewards/chosen": 0.04396428167819977, + "eval_rewards/margins": 0.00590139627456665, + "eval_rewards/rejected": 0.03806288540363312, + "eval_runtime": 91.0404, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 2.746, + "step": 450 + }, + { + "epoch": 0.736, + "grad_norm": 5.364885330200195, + "learning_rate": 7.031111111111111e-07, + "logits/chosen": 1.5854089260101318, + "logits/rejected": 1.6501134634017944, + "logps/chosen": -137.01268005371094, + "logps/rejected": -160.6153106689453, + "loss": 0.6898775577545166, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.03512059897184372, + "rewards/margins": 0.0070413872599601746, + "rewards/rejected": 0.028079207986593246, + "step": 460 + }, + { + "epoch": 0.752, + "grad_norm": 4.035534381866455, + "learning_rate": 6.942222222222222e-07, + "logits/chosen": 1.4868170022964478, + "logits/rejected": 1.6255724430084229, + "logps/chosen": -142.16026306152344, + "logps/rejected": -128.42076110839844, + "loss": 0.6882358074188233, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0400058776140213, + "rewards/margins": 0.010345013812184334, + "rewards/rejected": 0.029660871252417564, + "step": 470 + }, + { + "epoch": 0.768, + "grad_norm": 4.675441265106201, + "learning_rate": 6.853333333333333e-07, + "logits/chosen": 1.7639102935791016, + "logits/rejected": 1.8540350198745728, + "logps/chosen": -170.15591430664062, + "logps/rejected": -163.63043212890625, + "loss": 0.6906004428863526, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04748428240418434, + "rewards/margins": 0.005763492546975613, + "rewards/rejected": 0.041720788925886154, + "step": 480 + }, + { + "epoch": 0.784, + "grad_norm": 4.720729827880859, + "learning_rate": 6.764444444444444e-07, + "logits/chosen": 1.7105178833007812, + "logits/rejected": 1.640414834022522, + "logps/chosen": -165.30238342285156, + "logps/rejected": -163.19874572753906, + "loss": 0.6960553169250489, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.029804859310388565, + "rewards/margins": -0.004881127271801233, + "rewards/rejected": 0.03468598425388336, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 4.234210968017578, + "learning_rate": 6.675555555555556e-07, + "logits/chosen": 1.6047271490097046, + "logits/rejected": 1.7314249277114868, + "logps/chosen": -136.36837768554688, + "logps/rejected": -139.07589721679688, + "loss": 0.6871551513671875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03045068122446537, + "rewards/margins": 0.012581512331962585, + "rewards/rejected": 0.017869170755147934, + "step": 500 + }, + { + "epoch": 0.8, + "eval_logits/chosen": 1.5329995155334473, + "eval_logits/rejected": 1.562899112701416, + "eval_logps/chosen": -153.01869201660156, + "eval_logps/rejected": -147.65211486816406, + "eval_loss": 0.6905782222747803, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": 0.031044049188494682, + "eval_rewards/margins": 0.005970073863863945, + "eval_rewards/rejected": 0.025073975324630737, + "eval_runtime": 90.9818, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 2.748, + "step": 500 + }, + { + "epoch": 0.816, + "grad_norm": 3.8755218982696533, + "learning_rate": 6.586666666666666e-07, + "logits/chosen": 1.7628934383392334, + "logits/rejected": 1.669203758239746, + "logps/chosen": -165.7239227294922, + "logps/rejected": -151.4439239501953, + "loss": 0.6865688323974609, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04186190292239189, + "rewards/margins": 0.013743969611823559, + "rewards/rejected": 0.028117936104536057, + "step": 510 + }, + { + "epoch": 0.832, + "grad_norm": 4.301093101501465, + "learning_rate": 6.497777777777778e-07, + "logits/chosen": 1.7004272937774658, + "logits/rejected": 1.6940956115722656, + "logps/chosen": -149.85525512695312, + "logps/rejected": -178.81362915039062, + "loss": 0.6878121376037598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.040519606322050095, + "rewards/margins": 0.011822985485196114, + "rewards/rejected": 0.02869662083685398, + "step": 520 + }, + { + "epoch": 0.848, + "grad_norm": 5.140315055847168, + "learning_rate": 6.408888888888888e-07, + "logits/chosen": 1.5849007368087769, + "logits/rejected": 1.6338441371917725, + "logps/chosen": -143.96917724609375, + "logps/rejected": -135.54281616210938, + "loss": 0.684235954284668, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.036442749202251434, + "rewards/margins": 0.0188708808273077, + "rewards/rejected": 0.017571870237588882, + "step": 530 + }, + { + "epoch": 0.864, + "grad_norm": 4.218395709991455, + "learning_rate": 6.319999999999999e-07, + "logits/chosen": 1.4655095338821411, + "logits/rejected": 1.6130340099334717, + "logps/chosen": -128.4731903076172, + "logps/rejected": -149.0254364013672, + "loss": 0.6945743560791016, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.017684206366539, + "rewards/margins": -0.0021602497436106205, + "rewards/rejected": 0.019844455644488335, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 4.782381057739258, + "learning_rate": 6.23111111111111e-07, + "logits/chosen": 1.7393369674682617, + "logits/rejected": 1.806014060974121, + "logps/chosen": -174.6748046875, + "logps/rejected": -184.66175842285156, + "loss": 0.6872735977172851, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.03625861555337906, + "rewards/margins": 0.012612670660018921, + "rewards/rejected": 0.02364594303071499, + "step": 550 + }, + { + "epoch": 0.88, + "eval_logits/chosen": 1.523414134979248, + "eval_logits/rejected": 1.553091287612915, + "eval_logps/chosen": -153.1641387939453, + "eval_logps/rejected": -147.79656982421875, + "eval_loss": 0.6907312273979187, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.016499562188982964, + "eval_rewards/margins": 0.005870947614312172, + "eval_rewards/rejected": 0.010628614574670792, + "eval_runtime": 91.1697, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 550 + }, + { + "epoch": 0.896, + "grad_norm": 4.337022304534912, + "learning_rate": 6.142222222222222e-07, + "logits/chosen": 1.5208700895309448, + "logits/rejected": 1.433935523033142, + "logps/chosen": -145.02027893066406, + "logps/rejected": -151.54751586914062, + "loss": 0.6930646419525146, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020919274538755417, + "rewards/margins": 0.000952291302382946, + "rewards/rejected": 0.019966980442404747, + "step": 560 + }, + { + "epoch": 0.912, + "grad_norm": 4.181249618530273, + "learning_rate": 6.053333333333332e-07, + "logits/chosen": 1.669150710105896, + "logits/rejected": 1.6721159219741821, + "logps/chosen": -149.44937133789062, + "logps/rejected": -136.69595336914062, + "loss": 0.6915061950683594, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.021581020206212997, + "rewards/margins": 0.003878307295963168, + "rewards/rejected": 0.017702709883451462, + "step": 570 + }, + { + "epoch": 0.928, + "grad_norm": 5.015549659729004, + "learning_rate": 5.964444444444444e-07, + "logits/chosen": 1.7415902614593506, + "logits/rejected": 1.7086597681045532, + "logps/chosen": -167.40968322753906, + "logps/rejected": -136.7971649169922, + "loss": 0.6886765480041503, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.015860076993703842, + "rewards/margins": 0.009648093953728676, + "rewards/rejected": 0.006211983505636454, + "step": 580 + }, + { + "epoch": 0.944, + "grad_norm": 5.115492343902588, + "learning_rate": 5.875555555555556e-07, + "logits/chosen": 1.6168180704116821, + "logits/rejected": 1.549253225326538, + "logps/chosen": -176.7733612060547, + "logps/rejected": -155.52088928222656, + "loss": 0.6812242984771728, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02709801122546196, + "rewards/margins": 0.024875756353139877, + "rewards/rejected": 0.002222254639491439, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 4.916851043701172, + "learning_rate": 5.786666666666667e-07, + "logits/chosen": 1.6100847721099854, + "logits/rejected": 1.6456801891326904, + "logps/chosen": -162.9514923095703, + "logps/rejected": -154.86952209472656, + "loss": 0.6924587726593018, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.022737273946404457, + "rewards/margins": 0.002692488022148609, + "rewards/rejected": 0.020044784992933273, + "step": 600 + }, + { + "epoch": 0.96, + "eval_logits/chosen": 1.5220483541488647, + "eval_logits/rejected": 1.5517752170562744, + "eval_logps/chosen": -153.1302490234375, + "eval_logps/rejected": -147.76690673828125, + "eval_loss": 0.6905914545059204, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.019888723269104958, + "eval_rewards/margins": 0.006294028367847204, + "eval_rewards/rejected": 0.01359469536691904, + "eval_runtime": 90.9556, + "eval_samples_per_second": 5.497, + "eval_steps_per_second": 2.749, + "step": 600 + }, + { + "epoch": 0.976, + "grad_norm": 4.3061909675598145, + "learning_rate": 5.697777777777778e-07, + "logits/chosen": 1.5751426219940186, + "logits/rejected": 1.5668468475341797, + "logps/chosen": -122.49625396728516, + "logps/rejected": -121.03358459472656, + "loss": 0.6872867584228516, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.041438184678554535, + "rewards/margins": 0.012499396689236164, + "rewards/rejected": 0.028938788920640945, + "step": 610 + }, + { + "epoch": 0.992, + "grad_norm": 5.183097839355469, + "learning_rate": 5.608888888888889e-07, + "logits/chosen": 1.279733419418335, + "logits/rejected": 1.3740136623382568, + "logps/chosen": -124.96076965332031, + "logps/rejected": -138.65638732910156, + "loss": 0.6843455314636231, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.021548787131905556, + "rewards/margins": 0.019731000065803528, + "rewards/rejected": 0.0018177882302552462, + "step": 620 + }, + { + "epoch": 1.008, + "grad_norm": 3.398516893386841, + "learning_rate": 5.520000000000001e-07, + "logits/chosen": 1.5581461191177368, + "logits/rejected": 1.4368339776992798, + "logps/chosen": -152.03562927246094, + "logps/rejected": -141.5266571044922, + "loss": 0.6839772701263428, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.03745534271001816, + "rewards/margins": 0.019791865721344948, + "rewards/rejected": 0.01766347326338291, + "step": 630 + }, + { + "epoch": 1.024, + "grad_norm": 4.76785135269165, + "learning_rate": 5.43111111111111e-07, + "logits/chosen": 1.5052144527435303, + "logits/rejected": 1.5997518301010132, + "logps/chosen": -146.1188507080078, + "logps/rejected": -150.7531280517578, + "loss": 0.679119062423706, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04266165569424629, + "rewards/margins": 0.02940245531499386, + "rewards/rejected": 0.013259200379252434, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 5.084848403930664, + "learning_rate": 5.342222222222222e-07, + "logits/chosen": 1.6985509395599365, + "logits/rejected": 1.782636284828186, + "logps/chosen": -147.97036743164062, + "logps/rejected": -160.01596069335938, + "loss": 0.680629301071167, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.04756547138094902, + "rewards/margins": 0.026726100593805313, + "rewards/rejected": 0.020839370787143707, + "step": 650 + }, + { + "epoch": 1.04, + "eval_logits/chosen": 1.5202165842056274, + "eval_logits/rejected": 1.549627661705017, + "eval_logps/chosen": -153.07310485839844, + "eval_logps/rejected": -147.71853637695312, + "eval_loss": 0.6902864575386047, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.025603344663977623, + "eval_rewards/margins": 0.007172676268965006, + "eval_rewards/rejected": 0.018430663272738457, + "eval_runtime": 90.938, + "eval_samples_per_second": 5.498, + "eval_steps_per_second": 2.749, + "step": 650 + }, + { + "epoch": 1.056, + "grad_norm": 4.315349578857422, + "learning_rate": 5.253333333333333e-07, + "logits/chosen": 1.4403737783432007, + "logits/rejected": 1.5927050113677979, + "logps/chosen": -140.49227905273438, + "logps/rejected": -150.4305877685547, + "loss": 0.6848072052001953, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.028764763846993446, + "rewards/margins": 0.01788407936692238, + "rewards/rejected": 0.010880683548748493, + "step": 660 + }, + { + "epoch": 1.072, + "grad_norm": 5.036780834197998, + "learning_rate": 5.164444444444444e-07, + "logits/chosen": 1.5576122999191284, + "logits/rejected": 1.5609056949615479, + "logps/chosen": -152.83335876464844, + "logps/rejected": -153.91134643554688, + "loss": 0.677812910079956, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.042785800993442535, + "rewards/margins": 0.03232557699084282, + "rewards/rejected": 0.010460222139954567, + "step": 670 + }, + { + "epoch": 1.088, + "grad_norm": 4.167594909667969, + "learning_rate": 5.075555555555555e-07, + "logits/chosen": 1.6192195415496826, + "logits/rejected": 1.5309553146362305, + "logps/chosen": -128.08506774902344, + "logps/rejected": -130.7288360595703, + "loss": 0.6876038551330567, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.029848307371139526, + "rewards/margins": 0.012140638194978237, + "rewards/rejected": 0.017707668244838715, + "step": 680 + }, + { + "epoch": 1.104, + "grad_norm": 4.746713638305664, + "learning_rate": 4.986666666666666e-07, + "logits/chosen": 1.7024202346801758, + "logits/rejected": 1.7695732116699219, + "logps/chosen": -175.9063720703125, + "logps/rejected": -174.35122680664062, + "loss": 0.6774589538574218, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05260822921991348, + "rewards/margins": 0.032807059586048126, + "rewards/rejected": 0.019801167771220207, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 5.154026508331299, + "learning_rate": 4.897777777777778e-07, + "logits/chosen": 1.6061038970947266, + "logits/rejected": 1.6249440908432007, + "logps/chosen": -145.95343017578125, + "logps/rejected": -139.83033752441406, + "loss": 0.6705130577087403, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.04269074648618698, + "rewards/margins": 0.04763682931661606, + "rewards/rejected": -0.0049460758455097675, + "step": 700 + }, + { + "epoch": 1.12, + "eval_logits/chosen": 1.52028489112854, + "eval_logits/rejected": 1.549724817276001, + "eval_logps/chosen": -153.03794860839844, + "eval_logps/rejected": -147.68222045898438, + "eval_loss": 0.690391480922699, + "eval_rewards/accuracies": 0.5139999985694885, + "eval_rewards/chosen": 0.029118061065673828, + "eval_rewards/margins": 0.007054829970002174, + "eval_rewards/rejected": 0.022063229233026505, + "eval_runtime": 91.06, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 700 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 3.8743929862976074, + "learning_rate": 4.808888888888888e-07, + "logits/chosen": 1.7265201807022095, + "logits/rejected": 1.663900375366211, + "logps/chosen": -175.18624877929688, + "logps/rejected": -144.47412109375, + "loss": 0.6695387840270997, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06449539959430695, + "rewards/margins": 0.04969433322548866, + "rewards/rejected": 0.014801068231463432, + "step": 710 + }, + { + "epoch": 1.152, + "grad_norm": 4.586221218109131, + "learning_rate": 4.7199999999999994e-07, + "logits/chosen": 1.5080691576004028, + "logits/rejected": 1.5206321477890015, + "logps/chosen": -139.26808166503906, + "logps/rejected": -158.47634887695312, + "loss": 0.6808416843414307, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05986329913139343, + "rewards/margins": 0.025978583842515945, + "rewards/rejected": 0.03388471156358719, + "step": 720 + }, + { + "epoch": 1.168, + "grad_norm": 5.567377090454102, + "learning_rate": 4.6311111111111106e-07, + "logits/chosen": 1.6352875232696533, + "logits/rejected": 1.5908689498901367, + "logps/chosen": -160.4707489013672, + "logps/rejected": -163.84034729003906, + "loss": 0.6816823959350586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0627724900841713, + "rewards/margins": 0.024768764153122902, + "rewards/rejected": 0.038003724068403244, + "step": 730 + }, + { + "epoch": 1.184, + "grad_norm": 4.378120422363281, + "learning_rate": 4.5422222222222223e-07, + "logits/chosen": 1.7893062829971313, + "logits/rejected": 1.8099231719970703, + "logps/chosen": -147.02523803710938, + "logps/rejected": -155.3527374267578, + "loss": 0.6705075740814209, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07738201320171356, + "rewards/margins": 0.047473303973674774, + "rewards/rejected": 0.029908711090683937, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 3.968580961227417, + "learning_rate": 4.4533333333333335e-07, + "logits/chosen": 1.6924632787704468, + "logits/rejected": 1.6529285907745361, + "logps/chosen": -165.9091339111328, + "logps/rejected": -143.5700225830078, + "loss": 0.6665099143981934, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05950168892741203, + "rewards/margins": 0.05627988651394844, + "rewards/rejected": 0.0032218091655522585, + "step": 750 + }, + { + "epoch": 1.2, + "eval_logits/chosen": 1.5162402391433716, + "eval_logits/rejected": 1.545271635055542, + "eval_logps/chosen": -152.985107421875, + "eval_logps/rejected": -147.642578125, + "eval_loss": 0.6899175643920898, + "eval_rewards/accuracies": 0.5199999809265137, + "eval_rewards/chosen": 0.034404147416353226, + "eval_rewards/margins": 0.008377066813409328, + "eval_rewards/rejected": 0.026027081534266472, + "eval_runtime": 91.0476, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 2.746, + "step": 750 + }, + { + "epoch": 1.216, + "grad_norm": 4.258547782897949, + "learning_rate": 4.3644444444444447e-07, + "logits/chosen": 1.5382895469665527, + "logits/rejected": 1.5183976888656616, + "logps/chosen": -155.6405029296875, + "logps/rejected": -137.9111785888672, + "loss": 0.675658893585205, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04784867540001869, + "rewards/margins": 0.036956787109375, + "rewards/rejected": 0.010891887359321117, + "step": 760 + }, + { + "epoch": 1.232, + "grad_norm": 4.439138889312744, + "learning_rate": 4.2755555555555554e-07, + "logits/chosen": 1.60821533203125, + "logits/rejected": 1.6539623737335205, + "logps/chosen": -147.12428283691406, + "logps/rejected": -126.96165466308594, + "loss": 0.6754384994506836, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.061006199568510056, + "rewards/margins": 0.037146344780921936, + "rewards/rejected": 0.023859847337007523, + "step": 770 + }, + { + "epoch": 1.248, + "grad_norm": 4.03473424911499, + "learning_rate": 4.1866666666666666e-07, + "logits/chosen": 1.728029489517212, + "logits/rejected": 1.8161824941635132, + "logps/chosen": -170.85220336914062, + "logps/rejected": -184.57485961914062, + "loss": 0.6782269954681397, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.058054376393556595, + "rewards/margins": 0.032068002969026566, + "rewards/rejected": 0.025986377149820328, + "step": 780 + }, + { + "epoch": 1.264, + "grad_norm": 3.655622959136963, + "learning_rate": 4.097777777777778e-07, + "logits/chosen": 1.7169599533081055, + "logits/rejected": 1.5931851863861084, + "logps/chosen": -147.65869140625, + "logps/rejected": -140.3924560546875, + "loss": 0.6885969638824463, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04908771440386772, + "rewards/margins": 0.011032032780349255, + "rewards/rejected": 0.03805568441748619, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 4.527826309204102, + "learning_rate": 4.008888888888889e-07, + "logits/chosen": 1.4913551807403564, + "logits/rejected": 1.5887629985809326, + "logps/chosen": -142.08096313476562, + "logps/rejected": -137.82150268554688, + "loss": 0.6758649826049805, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.060299746692180634, + "rewards/margins": 0.03790009766817093, + "rewards/rejected": 0.022399652749300003, + "step": 800 + }, + { + "epoch": 1.28, + "eval_logits/chosen": 1.518919825553894, + "eval_logits/rejected": 1.5479708909988403, + "eval_logps/chosen": -152.8915557861328, + "eval_logps/rejected": -147.5607452392578, + "eval_loss": 0.6894462704658508, + "eval_rewards/accuracies": 0.5120000243186951, + "eval_rewards/chosen": 0.04375747963786125, + "eval_rewards/margins": 0.009548054076731205, + "eval_rewards/rejected": 0.03420942649245262, + "eval_runtime": 91.0636, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 800 + }, + { + "epoch": 1.296, + "grad_norm": 4.380261421203613, + "learning_rate": 3.92e-07, + "logits/chosen": 1.4656002521514893, + "logits/rejected": 1.5140694379806519, + "logps/chosen": -164.12106323242188, + "logps/rejected": -142.29354858398438, + "loss": 0.681245994567871, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.062036655843257904, + "rewards/margins": 0.025710636749863625, + "rewards/rejected": 0.03632602095603943, + "step": 810 + }, + { + "epoch": 1.312, + "grad_norm": 5.250955581665039, + "learning_rate": 3.831111111111111e-07, + "logits/chosen": 1.367980718612671, + "logits/rejected": 1.5478198528289795, + "logps/chosen": -140.98110961914062, + "logps/rejected": -138.2202911376953, + "loss": 0.6749699592590332, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.04839102178812027, + "rewards/margins": 0.03896191343665123, + "rewards/rejected": 0.00942910648882389, + "step": 820 + }, + { + "epoch": 1.328, + "grad_norm": 4.680771827697754, + "learning_rate": 3.742222222222222e-07, + "logits/chosen": 1.5844545364379883, + "logits/rejected": 1.6176366806030273, + "logps/chosen": -145.24916076660156, + "logps/rejected": -133.55044555664062, + "loss": 0.6737788677215576, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07001302391290665, + "rewards/margins": 0.04148613661527634, + "rewards/rejected": 0.02852689102292061, + "step": 830 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 5.02022123336792, + "learning_rate": 3.653333333333333e-07, + "logits/chosen": 1.640952467918396, + "logits/rejected": 1.5505130290985107, + "logps/chosen": -137.98764038085938, + "logps/rejected": -125.2115249633789, + "loss": 0.6780137062072754, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.08490036427974701, + "rewards/margins": 0.03257036954164505, + "rewards/rejected": 0.05232999473810196, + "step": 840 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 4.4920525550842285, + "learning_rate": 3.5644444444444444e-07, + "logits/chosen": 1.5515003204345703, + "logits/rejected": 1.5913991928100586, + "logps/chosen": -132.81130981445312, + "logps/rejected": -137.0764923095703, + "loss": 0.6755705356597901, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07374271750450134, + "rewards/margins": 0.03724803403019905, + "rewards/rejected": 0.03649468347430229, + "step": 850 + }, + { + "epoch": 1.3599999999999999, + "eval_logits/chosen": 1.5257093906402588, + "eval_logits/rejected": 1.554529070854187, + "eval_logps/chosen": -152.75559997558594, + "eval_logps/rejected": -147.43365478515625, + "eval_loss": 0.6891025304794312, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.05735430866479874, + "eval_rewards/margins": 0.010435618460178375, + "eval_rewards/rejected": 0.04691869020462036, + "eval_runtime": 91.0584, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 850 + }, + { + "epoch": 1.376, + "grad_norm": 4.146276473999023, + "learning_rate": 3.4755555555555556e-07, + "logits/chosen": 1.48005211353302, + "logits/rejected": 1.6272552013397217, + "logps/chosen": -147.8726043701172, + "logps/rejected": -128.67236328125, + "loss": 0.6757299423217773, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.07889878004789352, + "rewards/margins": 0.03726017475128174, + "rewards/rejected": 0.041638605296611786, + "step": 860 + }, + { + "epoch": 1.392, + "grad_norm": 4.959987163543701, + "learning_rate": 3.386666666666667e-07, + "logits/chosen": 1.6901594400405884, + "logits/rejected": 1.6778481006622314, + "logps/chosen": -155.42953491210938, + "logps/rejected": -155.63232421875, + "loss": 0.6713708400726318, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.07203079760074615, + "rewards/margins": 0.04540194571018219, + "rewards/rejected": 0.026628846302628517, + "step": 870 + }, + { + "epoch": 1.408, + "grad_norm": 4.350543975830078, + "learning_rate": 3.2977777777777775e-07, + "logits/chosen": 1.6911184787750244, + "logits/rejected": 1.6407556533813477, + "logps/chosen": -138.36717224121094, + "logps/rejected": -148.633056640625, + "loss": 0.6685135364532471, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.08504179120063782, + "rewards/margins": 0.051595449447631836, + "rewards/rejected": 0.03344634547829628, + "step": 880 + }, + { + "epoch": 1.424, + "grad_norm": 4.636990070343018, + "learning_rate": 3.2088888888888887e-07, + "logits/chosen": 1.6906163692474365, + "logits/rejected": 1.5781571865081787, + "logps/chosen": -155.74984741210938, + "logps/rejected": -144.20733642578125, + "loss": 0.667927598953247, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09189029037952423, + "rewards/margins": 0.05316054821014404, + "rewards/rejected": 0.03872973471879959, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 3.7753641605377197, + "learning_rate": 3.12e-07, + "logits/chosen": 1.4137648344039917, + "logits/rejected": 1.5492877960205078, + "logps/chosen": -136.68003845214844, + "logps/rejected": -167.0130157470703, + "loss": 0.6825860977172852, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.07187347114086151, + "rewards/margins": 0.02330738678574562, + "rewards/rejected": 0.04856608435511589, + "step": 900 + }, + { + "epoch": 1.44, + "eval_logits/chosen": 1.5240751504898071, + "eval_logits/rejected": 1.5527867078781128, + "eval_logps/chosen": -152.74700927734375, + "eval_logps/rejected": -147.4285888671875, + "eval_loss": 0.6890708208084106, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": 0.05821174010634422, + "eval_rewards/margins": 0.010788210667669773, + "eval_rewards/rejected": 0.047423530369997025, + "eval_runtime": 90.9575, + "eval_samples_per_second": 5.497, + "eval_steps_per_second": 2.749, + "step": 900 + }, + { + "epoch": 1.456, + "grad_norm": 4.123291492462158, + "learning_rate": 3.031111111111111e-07, + "logits/chosen": 1.6012052297592163, + "logits/rejected": 1.553945779800415, + "logps/chosen": -132.0549774169922, + "logps/rejected": -128.9656982421875, + "loss": 0.6811869621276856, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06877180188894272, + "rewards/margins": 0.02542707696557045, + "rewards/rejected": 0.04334472864866257, + "step": 910 + }, + { + "epoch": 1.472, + "grad_norm": 4.015886306762695, + "learning_rate": 2.9422222222222223e-07, + "logits/chosen": 1.7002413272857666, + "logits/rejected": 1.7617849111557007, + "logps/chosen": -138.71337890625, + "logps/rejected": -153.83413696289062, + "loss": 0.6822823524475098, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0791015774011612, + "rewards/margins": 0.023635607212781906, + "rewards/rejected": 0.05546595901250839, + "step": 920 + }, + { + "epoch": 1.488, + "grad_norm": 4.11157751083374, + "learning_rate": 2.853333333333333e-07, + "logits/chosen": 1.5699821710586548, + "logits/rejected": 1.6132333278656006, + "logps/chosen": -131.09051513671875, + "logps/rejected": -121.78560638427734, + "loss": 0.6841318130493164, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.06592516601085663, + "rewards/margins": 0.02018926665186882, + "rewards/rejected": 0.04573589563369751, + "step": 930 + }, + { + "epoch": 1.504, + "grad_norm": 4.930734634399414, + "learning_rate": 2.764444444444444e-07, + "logits/chosen": 1.6620748043060303, + "logits/rejected": 1.6714975833892822, + "logps/chosen": -148.5550537109375, + "logps/rejected": -152.5007781982422, + "loss": 0.6815722942352295, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07848857343196869, + "rewards/margins": 0.025429734960198402, + "rewards/rejected": 0.053058840334415436, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 5.25533390045166, + "learning_rate": 2.6755555555555553e-07, + "logits/chosen": 1.7330894470214844, + "logits/rejected": 1.7220786809921265, + "logps/chosen": -164.49046325683594, + "logps/rejected": -163.75827026367188, + "loss": 0.6790725708007812, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.07422961294651031, + "rewards/margins": 0.02951725758612156, + "rewards/rejected": 0.044712357223033905, + "step": 950 + }, + { + "epoch": 1.52, + "eval_logits/chosen": 1.5236308574676514, + "eval_logits/rejected": 1.552416205406189, + "eval_logps/chosen": -152.75738525390625, + "eval_logps/rejected": -147.43734741210938, + "eval_loss": 0.6891617178916931, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": 0.05717539042234421, + "eval_rewards/margins": 0.010625330731272697, + "eval_rewards/rejected": 0.04655005782842636, + "eval_runtime": 90.991, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 2.748, + "step": 950 + }, + { + "epoch": 1.536, + "grad_norm": 4.0215373039245605, + "learning_rate": 2.5866666666666665e-07, + "logits/chosen": 1.7435601949691772, + "logits/rejected": 1.7673505544662476, + "logps/chosen": -176.63160705566406, + "logps/rejected": -160.36460876464844, + "loss": 0.6746974945068359, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0737513080239296, + "rewards/margins": 0.0406021922826767, + "rewards/rejected": 0.0331491120159626, + "step": 960 + }, + { + "epoch": 1.552, + "grad_norm": 4.616787433624268, + "learning_rate": 2.4977777777777777e-07, + "logits/chosen": 1.555336356163025, + "logits/rejected": 1.5377695560455322, + "logps/chosen": -144.1389923095703, + "logps/rejected": -159.7912139892578, + "loss": 0.680583906173706, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06359116733074188, + "rewards/margins": 0.027758348733186722, + "rewards/rejected": 0.03583281859755516, + "step": 970 + }, + { + "epoch": 1.568, + "grad_norm": 4.0511579513549805, + "learning_rate": 2.408888888888889e-07, + "logits/chosen": 1.5684994459152222, + "logits/rejected": 1.6245285272598267, + "logps/chosen": -156.8944549560547, + "logps/rejected": -148.4877166748047, + "loss": 0.6721893787384033, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.07115811854600906, + "rewards/margins": 0.04428264498710632, + "rewards/rejected": 0.026875469833612442, + "step": 980 + }, + { + "epoch": 1.584, + "grad_norm": 5.074681758880615, + "learning_rate": 2.32e-07, + "logits/chosen": 1.62103271484375, + "logits/rejected": 1.6313188076019287, + "logps/chosen": -143.34356689453125, + "logps/rejected": -140.15875244140625, + "loss": 0.6764451503753662, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06751976162195206, + "rewards/margins": 0.03728804737329483, + "rewards/rejected": 0.03023170866072178, + "step": 990 + }, + { + "epoch": 1.6, + "grad_norm": 5.334754943847656, + "learning_rate": 2.231111111111111e-07, + "logits/chosen": 1.6107559204101562, + "logits/rejected": 1.4455175399780273, + "logps/chosen": -148.34190368652344, + "logps/rejected": -142.6099090576172, + "loss": 0.6792704105377197, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04247204214334488, + "rewards/margins": 0.029789533466100693, + "rewards/rejected": 0.012682514265179634, + "step": 1000 + }, + { + "epoch": 1.6, + "eval_logits/chosen": 1.5214487314224243, + "eval_logits/rejected": 1.5501652956008911, + "eval_logps/chosen": -152.7904510498047, + "eval_logps/rejected": -147.47216796875, + "eval_loss": 0.6890739798545837, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": 0.053869131952524185, + "eval_rewards/margins": 0.010802755132317543, + "eval_rewards/rejected": 0.04306638240814209, + "eval_runtime": 91.0136, + "eval_samples_per_second": 5.494, + "eval_steps_per_second": 2.747, + "step": 1000 + }, + { + "epoch": 1.616, + "grad_norm": 4.7201948165893555, + "learning_rate": 2.1422222222222223e-07, + "logits/chosen": 1.5747811794281006, + "logits/rejected": 1.7228872776031494, + "logps/chosen": -149.9728546142578, + "logps/rejected": -170.13587951660156, + "loss": 0.6746612071990967, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.07195371389389038, + "rewards/margins": 0.03933341056108475, + "rewards/rejected": 0.03262030705809593, + "step": 1010 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 3.240947723388672, + "learning_rate": 2.0533333333333332e-07, + "logits/chosen": 1.547649621963501, + "logits/rejected": 1.5779509544372559, + "logps/chosen": -118.39051818847656, + "logps/rejected": -139.14511108398438, + "loss": 0.6755857944488526, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04843420907855034, + "rewards/margins": 0.03727344423532486, + "rewards/rejected": 0.011160760186612606, + "step": 1020 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 5.555476188659668, + "learning_rate": 1.9644444444444444e-07, + "logits/chosen": 1.6104564666748047, + "logits/rejected": 1.6452620029449463, + "logps/chosen": -147.3595733642578, + "logps/rejected": -138.03305053710938, + "loss": 0.6876121520996094, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.057530391961336136, + "rewards/margins": 0.013370493426918983, + "rewards/rejected": 0.0441599003970623, + "step": 1030 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 4.597150802612305, + "learning_rate": 1.8755555555555556e-07, + "logits/chosen": 1.5648972988128662, + "logits/rejected": 1.4944285154342651, + "logps/chosen": -139.306396484375, + "logps/rejected": -154.79624938964844, + "loss": 0.678040885925293, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05991698056459427, + "rewards/margins": 0.034894365817308426, + "rewards/rejected": 0.025022611021995544, + "step": 1040 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 6.591237545013428, + "learning_rate": 1.7866666666666665e-07, + "logits/chosen": 1.6720062494277954, + "logits/rejected": 1.6317745447158813, + "logps/chosen": -140.55374145507812, + "logps/rejected": -166.04254150390625, + "loss": 0.6782684803009034, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.056691985577344894, + "rewards/margins": 0.03214425593614578, + "rewards/rejected": 0.024547729641199112, + "step": 1050 + }, + { + "epoch": 1.6800000000000002, + "eval_logits/chosen": 1.516204833984375, + "eval_logits/rejected": 1.545017957687378, + "eval_logps/chosen": -152.87672424316406, + "eval_logps/rejected": -147.5528106689453, + "eval_loss": 0.689375638961792, + "eval_rewards/accuracies": 0.5180000066757202, + "eval_rewards/chosen": 0.04524260014295578, + "eval_rewards/margins": 0.010239595547318459, + "eval_rewards/rejected": 0.03500300645828247, + "eval_runtime": 91.0411, + "eval_samples_per_second": 5.492, + "eval_steps_per_second": 2.746, + "step": 1050 + }, + { + "epoch": 1.696, + "grad_norm": 4.6675896644592285, + "learning_rate": 1.6977777777777777e-07, + "logits/chosen": 1.3496609926223755, + "logits/rejected": 1.4482405185699463, + "logps/chosen": -145.14395141601562, + "logps/rejected": -147.69923400878906, + "loss": 0.6717345714569092, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05466550588607788, + "rewards/margins": 0.04511015862226486, + "rewards/rejected": 0.009555344469845295, + "step": 1060 + }, + { + "epoch": 1.712, + "grad_norm": 5.049563884735107, + "learning_rate": 1.608888888888889e-07, + "logits/chosen": 1.8515784740447998, + "logits/rejected": 1.6993423700332642, + "logps/chosen": -159.16888427734375, + "logps/rejected": -181.34690856933594, + "loss": 0.6704267978668212, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07823075354099274, + "rewards/margins": 0.04844868183135986, + "rewards/rejected": 0.029782067984342575, + "step": 1070 + }, + { + "epoch": 1.728, + "grad_norm": 3.619241714477539, + "learning_rate": 1.5199999999999998e-07, + "logits/chosen": 1.6090354919433594, + "logits/rejected": 1.626612901687622, + "logps/chosen": -154.02679443359375, + "logps/rejected": -142.3501434326172, + "loss": 0.6738579273223877, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.07384214550256729, + "rewards/margins": 0.04149458929896355, + "rewards/rejected": 0.032347556203603745, + "step": 1080 + }, + { + "epoch": 1.744, + "grad_norm": 4.203415393829346, + "learning_rate": 1.431111111111111e-07, + "logits/chosen": 1.6636466979980469, + "logits/rejected": 1.7839996814727783, + "logps/chosen": -151.83984375, + "logps/rejected": -156.90054321289062, + "loss": 0.6809844017028809, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07428060472011566, + "rewards/margins": 0.02610836923122406, + "rewards/rejected": 0.0481722429394722, + "step": 1090 + }, + { + "epoch": 1.76, + "grad_norm": 5.443735122680664, + "learning_rate": 1.342222222222222e-07, + "logits/chosen": 1.6737174987792969, + "logits/rejected": 1.6681289672851562, + "logps/chosen": -170.30580139160156, + "logps/rejected": -123.48567199707031, + "loss": 0.6710718631744385, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.052741169929504395, + "rewards/margins": 0.04700620472431183, + "rewards/rejected": 0.0057349675334990025, + "step": 1100 + }, + { + "epoch": 1.76, + "eval_logits/chosen": 1.5129518508911133, + "eval_logits/rejected": 1.5416380167007446, + "eval_logps/chosen": -152.91659545898438, + "eval_logps/rejected": -147.5920867919922, + "eval_loss": 0.6894625425338745, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.0412554033100605, + "eval_rewards/margins": 0.010181105695664883, + "eval_rewards/rejected": 0.031074294820427895, + "eval_runtime": 91.2168, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 2.741, + "step": 1100 + }, + { + "epoch": 1.776, + "grad_norm": 4.157502174377441, + "learning_rate": 1.2533333333333332e-07, + "logits/chosen": 1.6970571279525757, + "logits/rejected": 1.6661456823349, + "logps/chosen": -155.0122528076172, + "logps/rejected": -157.14593505859375, + "loss": 0.6756897926330566, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.06435829401016235, + "rewards/margins": 0.03779596835374832, + "rewards/rejected": 0.02656233310699463, + "step": 1110 + }, + { + "epoch": 1.792, + "grad_norm": 3.509523868560791, + "learning_rate": 1.1644444444444444e-07, + "logits/chosen": 1.5473051071166992, + "logits/rejected": 1.6796495914459229, + "logps/chosen": -142.53172302246094, + "logps/rejected": -155.2112579345703, + "loss": 0.6751244068145752, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06569470465183258, + "rewards/margins": 0.03866187483072281, + "rewards/rejected": 0.02703283168375492, + "step": 1120 + }, + { + "epoch": 1.808, + "grad_norm": 4.258018970489502, + "learning_rate": 1.0755555555555556e-07, + "logits/chosen": 1.6674772500991821, + "logits/rejected": 1.740517258644104, + "logps/chosen": -143.2890625, + "logps/rejected": -144.5006103515625, + "loss": 0.6744827270507813, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.049980729818344116, + "rewards/margins": 0.039941221475601196, + "rewards/rejected": 0.01003950648009777, + "step": 1130 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 4.8917388916015625, + "learning_rate": 9.866666666666666e-08, + "logits/chosen": 1.4226986169815063, + "logits/rejected": 1.497089147567749, + "logps/chosen": -132.8553009033203, + "logps/rejected": -132.77996826171875, + "loss": 0.6772085189819336, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.046212293207645416, + "rewards/margins": 0.03522529453039169, + "rewards/rejected": 0.010986998677253723, + "step": 1140 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 5.875709533691406, + "learning_rate": 8.977777777777777e-08, + "logits/chosen": 1.5243072509765625, + "logits/rejected": 1.6050777435302734, + "logps/chosen": -145.8072967529297, + "logps/rejected": -157.0274200439453, + "loss": 0.6806241035461426, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05913332849740982, + "rewards/margins": 0.02726324275135994, + "rewards/rejected": 0.03187008947134018, + "step": 1150 + }, + { + "epoch": 1.8399999999999999, + "eval_logits/chosen": 1.5127122402191162, + "eval_logits/rejected": 1.5414215326309204, + "eval_logps/chosen": -152.91351318359375, + "eval_logps/rejected": -147.59005737304688, + "eval_loss": 0.6894447207450867, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": 0.04156311973929405, + "eval_rewards/margins": 0.010284863412380219, + "eval_rewards/rejected": 0.031278256326913834, + "eval_runtime": 91.0625, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.745, + "step": 1150 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 4.677998065948486, + "learning_rate": 8.088888888888888e-08, + "logits/chosen": 1.4877736568450928, + "logits/rejected": 1.5148862600326538, + "logps/chosen": -125.41712951660156, + "logps/rejected": -151.0792999267578, + "loss": 0.6776061058044434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.052109260112047195, + "rewards/margins": 0.03345213457942009, + "rewards/rejected": 0.018657123669981956, + "step": 1160 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 6.006388187408447, + "learning_rate": 7.2e-08, + "logits/chosen": 1.7018489837646484, + "logits/rejected": 1.6689262390136719, + "logps/chosen": -148.28671264648438, + "logps/rejected": -147.13995361328125, + "loss": 0.6749439239501953, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.056884974241256714, + "rewards/margins": 0.038883306086063385, + "rewards/rejected": 0.01800166629254818, + "step": 1170 + }, + { + "epoch": 1.888, + "grad_norm": 4.416358947753906, + "learning_rate": 6.311111111111112e-08, + "logits/chosen": 1.5913951396942139, + "logits/rejected": 1.6728503704071045, + "logps/chosen": -161.4684295654297, + "logps/rejected": -155.4317626953125, + "loss": 0.67286696434021, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.08693180978298187, + "rewards/margins": 0.04347502067685127, + "rewards/rejected": 0.0434567965567112, + "step": 1180 + }, + { + "epoch": 1.904, + "grad_norm": 4.496129035949707, + "learning_rate": 5.4222222222222216e-08, + "logits/chosen": 1.66684091091156, + "logits/rejected": 1.563511848449707, + "logps/chosen": -146.62295532226562, + "logps/rejected": -129.34072875976562, + "loss": 0.6853626251220704, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.05792286992073059, + "rewards/margins": 0.018264759331941605, + "rewards/rejected": 0.03965810686349869, + "step": 1190 + }, + { + "epoch": 1.92, + "grad_norm": 4.628019332885742, + "learning_rate": 4.5333333333333336e-08, + "logits/chosen": 1.7843306064605713, + "logits/rejected": 1.620391607284546, + "logps/chosen": -143.50228881835938, + "logps/rejected": -154.37442016601562, + "loss": 0.6730541706085205, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.06739021092653275, + "rewards/margins": 0.043615005910396576, + "rewards/rejected": 0.023775208741426468, + "step": 1200 + }, + { + "epoch": 1.92, + "eval_logits/chosen": 1.5122698545455933, + "eval_logits/rejected": 1.5408285856246948, + "eval_logps/chosen": -152.92884826660156, + "eval_logps/rejected": -147.60997009277344, + "eval_loss": 0.6892240643501282, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": 0.04003090038895607, + "eval_rewards/margins": 0.010743732564151287, + "eval_rewards/rejected": 0.02928716316819191, + "eval_runtime": 91.1446, + "eval_samples_per_second": 5.486, + "eval_steps_per_second": 2.743, + "step": 1200 + }, + { + "epoch": 1.936, + "grad_norm": 4.912255764007568, + "learning_rate": 3.644444444444444e-08, + "logits/chosen": 1.5628576278686523, + "logits/rejected": 1.6063191890716553, + "logps/chosen": -142.47982788085938, + "logps/rejected": -144.87289428710938, + "loss": 0.6799941062927246, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.05753699690103531, + "rewards/margins": 0.028712283819913864, + "rewards/rejected": 0.028824711218476295, + "step": 1210 + }, + { + "epoch": 1.952, + "grad_norm": 4.883981704711914, + "learning_rate": 2.7555555555555555e-08, + "logits/chosen": 1.459054946899414, + "logits/rejected": 1.6200172901153564, + "logps/chosen": -136.9459228515625, + "logps/rejected": -164.0298309326172, + "loss": 0.6825344562530518, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.04566190391778946, + "rewards/margins": 0.023402264341711998, + "rewards/rejected": 0.02225964143872261, + "step": 1220 + }, + { + "epoch": 1.968, + "grad_norm": 4.902575969696045, + "learning_rate": 1.866666666666667e-08, + "logits/chosen": 1.7254364490509033, + "logits/rejected": 1.588733434677124, + "logps/chosen": -162.27706909179688, + "logps/rejected": -145.51052856445312, + "loss": 0.6757502079010009, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05863445997238159, + "rewards/margins": 0.03770970553159714, + "rewards/rejected": 0.020924758166074753, + "step": 1230 + }, + { + "epoch": 1.984, + "grad_norm": 3.9780874252319336, + "learning_rate": 9.777777777777777e-09, + "logits/chosen": 1.567268967628479, + "logits/rejected": 1.4752018451690674, + "logps/chosen": -146.47994995117188, + "logps/rejected": -141.8123016357422, + "loss": 0.6737892627716064, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.06928073614835739, + "rewards/margins": 0.04141712933778763, + "rewards/rejected": 0.02786361239850521, + "step": 1240 + }, + { + "epoch": 2.0, + "grad_norm": 4.694293022155762, + "learning_rate": 8.888888888888889e-10, + "logits/chosen": 1.6857208013534546, + "logits/rejected": 1.648901343345642, + "logps/chosen": -178.7536163330078, + "logps/rejected": -179.25527954101562, + "loss": 0.6749695301055908, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.05160802602767944, + "rewards/margins": 0.03985036164522171, + "rewards/rejected": 0.011757662519812584, + "step": 1250 + }, + { + "epoch": 2.0, + "eval_logits/chosen": 1.5123344659805298, + "eval_logits/rejected": 1.5409704446792603, + "eval_logps/chosen": -152.93345642089844, + "eval_logps/rejected": -147.61294555664062, + "eval_loss": 0.6893199682235718, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": 0.03956935182213783, + "eval_rewards/margins": 0.010580410249531269, + "eval_rewards/rejected": 0.02898894064128399, + "eval_runtime": 91.0523, + "eval_samples_per_second": 5.491, + "eval_steps_per_second": 2.746, + "step": 1250 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-1250/training_args.bin b/v5/DPO/DPO_5k/lora/checkpoint-1250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cb6e403b06e05c65a94488c31a57e3448aa1a628 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-1250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2949ac3bd9315d3a45a1d086fec2301ea7bd1dab4938d70cfd24209203d51940 +size 6161 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/README.md b/v5/DPO/DPO_5k/lora/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..250500f798b4682b2cd2c35cd1fc366677c215dd --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- dpo +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/adapter_config.json b/v5/DPO/DPO_5k/lora/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..11512c97a55a2d441704a9e11460444b5019509a --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "o_proj", + "up_proj", + "v_proj", + "k_proj", + "q_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/adapter_model.safetensors b/v5/DPO/DPO_5k/lora/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23653a35bcc9bc5cd525755e201dccde768c178a --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d1766c6c831e4681b14adf5735152cfcbab91d1a5b7ea384c85a5e149eb6ea7 +size 180385008 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/chat_template.jinja b/v5/DPO/DPO_5k/lora/checkpoint-300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/optimizer.pt b/v5/DPO/DPO_5k/lora/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff2abf18ebcbd8a4cbb509a859745ac0d3aca5fd --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb8bf3e35172eb41f0788a222ffcd4647084512407555304d095396a1fec689e +size 360902475 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/rng_state.pth b/v5/DPO/DPO_5k/lora/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a787d39b15181d020a94083fddcfaff5ca9eaeca --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:480389ce7f683504c393112df2c8045b3bbba2e7bfbed923d3dbd1ed09e2f087 +size 14645 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/scaler.pt b/v5/DPO/DPO_5k/lora/checkpoint-300/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..71791c5267763e63e87c2a333d975d350199ec89 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddee6ac37719064ff2852a986d303329a21232e06866785b3779dd0020c6b090 +size 1383 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/scheduler.pt b/v5/DPO/DPO_5k/lora/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..34826f96dc0a5b799dc58fe53d683ccbd7349068 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95a41dd9891fa5b90b75c52502ecb493479a9c58e923ba3009b415ebb53cda2 +size 1465 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/tokenizer.json b/v5/DPO/DPO_5k/lora/checkpoint-300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/tokenizer_config.json b/v5/DPO/DPO_5k/lora/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/trainer_state.json b/v5/DPO/DPO_5k/lora/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1c92a80332ccd37ef8b8fe30024f4dbb273e8a --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/trainer_state.json @@ -0,0 +1,580 @@ +{ + "best_global_step": 300, + "best_metric": 0.5440000295639038, + "best_model_checkpoint": "output/lora/checkpoint-300", + "epoch": 0.48, + "eval_steps": 50, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 4.368600368499756, + "learning_rate": 7.2e-08, + "logits/chosen": 1.684491515159607, + "logits/rejected": 1.6000019311904907, + "logps/chosen": -145.20462036132812, + "logps/rejected": -150.64056396484375, + "loss": 0.6933496475219727, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.00038564440910704434, + "rewards/margins": -0.0003992128185927868, + "rewards/rejected": 1.356836855848087e-05, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 5.3214850425720215, + "learning_rate": 1.5199999999999998e-07, + "logits/chosen": 1.3753983974456787, + "logits/rejected": 1.4558300971984863, + "logps/chosen": -120.09315490722656, + "logps/rejected": -133.41905212402344, + "loss": 0.6935864925384522, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.0006091356044635177, + "rewards/margins": -0.0008693885756656528, + "rewards/rejected": 0.0002602529712021351, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 4.661340713500977, + "learning_rate": 2.32e-07, + "logits/chosen": 1.5848007202148438, + "logits/rejected": 1.744507074356079, + "logps/chosen": -161.58753967285156, + "logps/rejected": -178.603271484375, + "loss": 0.6937230110168457, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.002343270927667618, + "rewards/margins": -0.001140077132731676, + "rewards/rejected": -0.0012031936785206199, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 5.193538188934326, + "learning_rate": 3.12e-07, + "logits/chosen": 1.6050277948379517, + "logits/rejected": 1.534880518913269, + "logps/chosen": -151.981689453125, + "logps/rejected": -150.1208038330078, + "loss": 0.6932186126708985, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0018907630583271384, + "rewards/margins": -0.00012836574751418084, + "rewards/rejected": -0.0017623973544687033, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 4.683797359466553, + "learning_rate": 3.92e-07, + "logits/chosen": 1.734514594078064, + "logits/rejected": 1.7892601490020752, + "logps/chosen": -169.11004638671875, + "logps/rejected": -156.22427368164062, + "loss": 0.692991828918457, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.00029434924363158643, + "rewards/margins": 0.0003237081109546125, + "rewards/rejected": -2.9358878236962482e-05, + "step": 50 + }, + { + "epoch": 0.08, + "eval_logits/chosen": 1.5363190174102783, + "eval_logits/rejected": 1.567551612854004, + "eval_logps/chosen": -153.31736755371094, + "eval_logps/rejected": -147.88914489746094, + "eval_loss": 0.6932514905929565, + "eval_rewards/accuracies": 0.47600001096725464, + "eval_rewards/chosen": 0.0011768279364332557, + "eval_rewards/margins": -0.00019350247748661786, + "eval_rewards/rejected": 0.0013703303411602974, + "eval_runtime": 91.1759, + "eval_samples_per_second": 5.484, + "eval_steps_per_second": 2.742, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 4.66879940032959, + "learning_rate": 4.7199999999999994e-07, + "logits/chosen": 1.8444726467132568, + "logits/rejected": 1.8203474283218384, + "logps/chosen": -158.23243713378906, + "logps/rejected": -149.02316284179688, + "loss": 0.6930979251861572, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.002846779767423868, + "rewards/margins": 0.00011160141002619639, + "rewards/rejected": 0.0027351784519851208, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 3.876270294189453, + "learning_rate": 5.520000000000001e-07, + "logits/chosen": 1.8535444736480713, + "logits/rejected": 1.7816137075424194, + "logps/chosen": -157.98268127441406, + "logps/rejected": -164.66925048828125, + "loss": 0.6924784183502197, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.006634838879108429, + "rewards/margins": 0.0013595198979601264, + "rewards/rejected": 0.005275317933410406, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 4.669241428375244, + "learning_rate": 6.319999999999999e-07, + "logits/chosen": 1.5538957118988037, + "logits/rejected": 1.5381535291671753, + "logps/chosen": -145.74713134765625, + "logps/rejected": -137.40780639648438, + "loss": 0.6929487705230712, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.009571035392582417, + "rewards/margins": 0.000421993900090456, + "rewards/rejected": 0.009149041026830673, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 4.3166022300720215, + "learning_rate": 7.119999999999999e-07, + "logits/chosen": 1.5454356670379639, + "logits/rejected": 1.5363503694534302, + "logps/chosen": -162.4505157470703, + "logps/rejected": -159.41574096679688, + "loss": 0.6919499397277832, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.011575761251151562, + "rewards/margins": 0.0024396872613579035, + "rewards/rejected": 0.009136073291301727, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 4.640413284301758, + "learning_rate": 7.92e-07, + "logits/chosen": 1.6131670475006104, + "logits/rejected": 1.673753023147583, + "logps/chosen": -142.8424530029297, + "logps/rejected": -165.93234252929688, + "loss": 0.6919528007507324, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.014428429305553436, + "rewards/margins": 0.0024503350723534822, + "rewards/rejected": 0.011978095397353172, + "step": 100 + }, + { + "epoch": 0.16, + "eval_logits/chosen": 1.5355972051620483, + "eval_logits/rejected": 1.5665204524993896, + "eval_logps/chosen": -153.1815948486328, + "eval_logps/rejected": -147.7570037841797, + "eval_loss": 0.6931047439575195, + "eval_rewards/accuracies": 0.46000000834465027, + "eval_rewards/chosen": 0.014755296520888805, + "eval_rewards/margins": 0.00017206119082402438, + "eval_rewards/rejected": 0.01458323560655117, + "eval_runtime": 91.1022, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 2.744, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 4.072097301483154, + "learning_rate": 8.72e-07, + "logits/chosen": 1.5775041580200195, + "logits/rejected": 1.6383779048919678, + "logps/chosen": -143.57952880859375, + "logps/rejected": -137.651611328125, + "loss": 0.6926439762115478, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.01751135289669037, + "rewards/margins": 0.0010686519090086222, + "rewards/rejected": 0.01644269935786724, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 4.417011260986328, + "learning_rate": 9.52e-07, + "logits/chosen": 1.5828511714935303, + "logits/rejected": 1.6531331539154053, + "logps/chosen": -142.53514099121094, + "logps/rejected": -142.88226318359375, + "loss": 0.6947136402130127, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.012874701991677284, + "rewards/margins": -0.0030476213432848454, + "rewards/rejected": 0.015922321006655693, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 5.043814182281494, + "learning_rate": 9.964444444444445e-07, + "logits/chosen": 1.7005653381347656, + "logits/rejected": 1.8352782726287842, + "logps/chosen": -155.3563995361328, + "logps/rejected": -151.04742431640625, + "loss": 0.693049955368042, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004944052547216415, + "rewards/margins": 0.000274717720458284, + "rewards/rejected": 0.004669335670769215, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 4.280579566955566, + "learning_rate": 9.875555555555555e-07, + "logits/chosen": 1.4931142330169678, + "logits/rejected": 1.5797803401947021, + "logps/chosen": -134.4127655029297, + "logps/rejected": -132.8173065185547, + "loss": 0.6921967983245849, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -4.65536504634656e-05, + "rewards/margins": 0.001995303900912404, + "rewards/rejected": -0.002041857223957777, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 3.553212881088257, + "learning_rate": 9.786666666666666e-07, + "logits/chosen": 1.7535524368286133, + "logits/rejected": 1.7318353652954102, + "logps/chosen": -141.84011840820312, + "logps/rejected": -140.5338592529297, + "loss": 0.6925621032714844, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.006464059464633465, + "rewards/margins": 0.0012660837965086102, + "rewards/rejected": 0.005197975784540176, + "step": 150 + }, + { + "epoch": 0.24, + "eval_logits/chosen": 1.527831792831421, + "eval_logits/rejected": 1.55862557888031, + "eval_logps/chosen": -153.29705810546875, + "eval_logps/rejected": -147.86842346191406, + "eval_loss": 0.6933275461196899, + "eval_rewards/accuracies": 0.49399998784065247, + "eval_rewards/chosen": 0.003206671681255102, + "eval_rewards/margins": -0.000235457657254301, + "eval_rewards/rejected": 0.0034421291202306747, + "eval_runtime": 91.0798, + "eval_samples_per_second": 5.49, + "eval_steps_per_second": 2.745, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 5.0327606201171875, + "learning_rate": 9.697777777777776e-07, + "logits/chosen": 1.7338924407958984, + "logits/rejected": 1.6693298816680908, + "logps/chosen": -163.9834747314453, + "logps/rejected": -147.54122924804688, + "loss": 0.6920580387115478, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.010300886817276478, + "rewards/margins": 0.0022598577197641134, + "rewards/rejected": 0.00804102886468172, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 4.203429222106934, + "learning_rate": 9.608888888888888e-07, + "logits/chosen": 1.548438310623169, + "logits/rejected": 1.608687400817871, + "logps/chosen": -140.65548706054688, + "logps/rejected": -124.45481872558594, + "loss": 0.693133544921875, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.011312992312014103, + "rewards/margins": 0.00010961303632939234, + "rewards/rejected": 0.011203380301594734, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 3.8275039196014404, + "learning_rate": 9.52e-07, + "logits/chosen": 1.5688340663909912, + "logits/rejected": 1.5681618452072144, + "logps/chosen": -153.26898193359375, + "logps/rejected": -154.38824462890625, + "loss": 0.6913642883300781, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.018832791596651077, + "rewards/margins": 0.0036639694590121508, + "rewards/rejected": 0.015168821439146996, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 4.507416725158691, + "learning_rate": 9.431111111111111e-07, + "logits/chosen": 1.6990807056427002, + "logits/rejected": 1.646045446395874, + "logps/chosen": -156.6995849609375, + "logps/rejected": -155.2141876220703, + "loss": 0.6934223651885987, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.027501707896590233, + "rewards/margins": -0.00040446725324727595, + "rewards/rejected": 0.027906173840165138, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 4.723247051239014, + "learning_rate": 9.342222222222221e-07, + "logits/chosen": 1.4298136234283447, + "logits/rejected": 1.6043508052825928, + "logps/chosen": -126.81380462646484, + "logps/rejected": -133.35108947753906, + "loss": 0.688706636428833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.03327309712767601, + "rewards/margins": 0.009079854004085064, + "rewards/rejected": 0.024193240329623222, + "step": 200 + }, + { + "epoch": 0.32, + "eval_logits/chosen": 1.5382241010665894, + "eval_logits/rejected": 1.5689103603363037, + "eval_logps/chosen": -152.9728240966797, + "eval_logps/rejected": -147.55966186523438, + "eval_loss": 0.6926390528678894, + "eval_rewards/accuracies": 0.4880000054836273, + "eval_rewards/chosen": 0.03563162684440613, + "eval_rewards/margins": 0.0013116379268467426, + "eval_rewards/rejected": 0.03431998938322067, + "eval_runtime": 91.1085, + "eval_samples_per_second": 5.488, + "eval_steps_per_second": 2.744, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 5.396594047546387, + "learning_rate": 9.253333333333333e-07, + "logits/chosen": 1.7294985055923462, + "logits/rejected": 1.6115707159042358, + "logps/chosen": -150.76341247558594, + "logps/rejected": -126.10733795166016, + "loss": 0.6902324199676514, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.043607715517282486, + "rewards/margins": 0.0060890489257872105, + "rewards/rejected": 0.037518661469221115, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 4.06983757019043, + "learning_rate": 9.164444444444443e-07, + "logits/chosen": 1.7748816013336182, + "logits/rejected": 1.7431707382202148, + "logps/chosen": -163.86878967285156, + "logps/rejected": -142.68081665039062, + "loss": 0.6923216342926025, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.057915735989809036, + "rewards/margins": 0.0020550203043967485, + "rewards/rejected": 0.055860716849565506, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 3.652050256729126, + "learning_rate": 9.075555555555555e-07, + "logits/chosen": 1.6430679559707642, + "logits/rejected": 1.6094305515289307, + "logps/chosen": -148.5363006591797, + "logps/rejected": -153.50338745117188, + "loss": 0.6937844753265381, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05856321379542351, + "rewards/margins": -0.0007888395339250565, + "rewards/rejected": 0.05935205891728401, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 3.4950904846191406, + "learning_rate": 8.986666666666666e-07, + "logits/chosen": 1.6158307790756226, + "logits/rejected": 1.7254810333251953, + "logps/chosen": -157.8291473388672, + "logps/rejected": -164.51071166992188, + "loss": 0.6928259372711182, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06535087525844574, + "rewards/margins": 0.001135659171268344, + "rewards/rejected": 0.06421522051095963, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 2.940080165863037, + "learning_rate": 8.897777777777777e-07, + "logits/chosen": 1.5965789556503296, + "logits/rejected": 1.649510145187378, + "logps/chosen": -137.41818237304688, + "logps/rejected": -150.51309204101562, + "loss": 0.6896316051483155, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0644104927778244, + "rewards/margins": 0.007390809245407581, + "rewards/rejected": 0.057019688189029694, + "step": 250 + }, + { + "epoch": 0.4, + "eval_logits/chosen": 1.55246102809906, + "eval_logits/rejected": 1.5829427242279053, + "eval_logps/chosen": -152.7037811279297, + "eval_logps/rejected": -147.31491088867188, + "eval_loss": 0.6915441751480103, + "eval_rewards/accuracies": 0.5199999809265137, + "eval_rewards/chosen": 0.06253667175769806, + "eval_rewards/margins": 0.0037424375768750906, + "eval_rewards/rejected": 0.058794230222702026, + "eval_runtime": 90.9689, + "eval_samples_per_second": 5.496, + "eval_steps_per_second": 2.748, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 4.24291467666626, + "learning_rate": 8.808888888888889e-07, + "logits/chosen": 1.640729546546936, + "logits/rejected": 1.6604511737823486, + "logps/chosen": -144.95303344726562, + "logps/rejected": -149.94384765625, + "loss": 0.691684627532959, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07415647059679031, + "rewards/margins": 0.0033445146400481462, + "rewards/rejected": 0.07081194967031479, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 4.855024814605713, + "learning_rate": 8.72e-07, + "logits/chosen": 1.6059837341308594, + "logits/rejected": 1.7337257862091064, + "logps/chosen": -157.46888732910156, + "logps/rejected": -150.69549560546875, + "loss": 0.6913710117340088, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0714256763458252, + "rewards/margins": 0.00412519508972764, + "rewards/rejected": 0.06730048358440399, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 3.2214746475219727, + "learning_rate": 8.631111111111111e-07, + "logits/chosen": 1.7127368450164795, + "logits/rejected": 1.7540982961654663, + "logps/chosen": -131.8098602294922, + "logps/rejected": -150.68472290039062, + "loss": 0.6942379474639893, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.07171601057052612, + "rewards/margins": -0.0016212888294830918, + "rewards/rejected": 0.07333729416131973, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 4.169992446899414, + "learning_rate": 8.542222222222222e-07, + "logits/chosen": 1.6667410135269165, + "logits/rejected": 1.6612989902496338, + "logps/chosen": -154.6461639404297, + "logps/rejected": -148.51638793945312, + "loss": 0.6932468891143799, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.06825742870569229, + "rewards/margins": 0.0004094833566341549, + "rewards/rejected": 0.06784794479608536, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 3.3903329372406006, + "learning_rate": 8.453333333333334e-07, + "logits/chosen": 1.5482908487319946, + "logits/rejected": 1.5708004236221313, + "logps/chosen": -138.28341674804688, + "logps/rejected": -140.89016723632812, + "loss": 0.6920734405517578, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.05976419895887375, + "rewards/margins": 0.002482531126588583, + "rewards/rejected": 0.057281672954559326, + "step": 300 + }, + { + "epoch": 0.48, + "eval_logits/chosen": 1.55509614944458, + "eval_logits/rejected": 1.5857810974121094, + "eval_logps/chosen": -152.74725341796875, + "eval_logps/rejected": -147.36598205566406, + "eval_loss": 0.6911302208900452, + "eval_rewards/accuracies": 0.5440000295639038, + "eval_rewards/chosen": 0.05818922445178032, + "eval_rewards/margins": 0.004501740448176861, + "eval_rewards/rejected": 0.053687483072280884, + "eval_runtime": 90.9977, + "eval_samples_per_second": 5.495, + "eval_steps_per_second": 2.747, + "step": 300 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/DPO/DPO_5k/lora/checkpoint-300/training_args.bin b/v5/DPO/DPO_5k/lora/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..cb6e403b06e05c65a94488c31a57e3448aa1a628 --- /dev/null +++ b/v5/DPO/DPO_5k/lora/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2949ac3bd9315d3a45a1d086fec2301ea7bd1dab4938d70cfd24209203d51940 +size 6161 diff --git a/v5/DPO/gen-output/DPO_10k/data-00000-of-00001.arrow b/v5/DPO/gen-output/DPO_10k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..41d41eb28f0597cdbe50874b97d8e56fb48848a1 --- /dev/null +++ b/v5/DPO/gen-output/DPO_10k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66d88d5c1469483d67f72ee8beeefbf79feef5ceb9833187b84833eebd8bd372 +size 981088 diff --git a/v5/DPO/gen-output/DPO_10k/dataset_info.json b/v5/DPO/gen-output/DPO_10k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/DPO/gen-output/DPO_10k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/DPO/gen-output/DPO_10k/state.json b/v5/DPO/gen-output/DPO_10k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..07739923c454fad8a27ef221d598aad1bbf3ce7f --- /dev/null +++ b/v5/DPO/gen-output/DPO_10k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "5b754269039c4624", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/DPO/gen-output/DPO_1k/data-00000-of-00001.arrow b/v5/DPO/gen-output/DPO_1k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..e538a294e3a858289f2118dbc01e4496316119eb --- /dev/null +++ b/v5/DPO/gen-output/DPO_1k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:791240d22713993e7278395f8eaec7e439099724dec69cd3ec566c7c5b5d4dcf +size 992144 diff --git a/v5/DPO/gen-output/DPO_1k/dataset_info.json b/v5/DPO/gen-output/DPO_1k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/DPO/gen-output/DPO_1k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/DPO/gen-output/DPO_1k/state.json b/v5/DPO/gen-output/DPO_1k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d2111de4b05dee03f3e4d844fdba0df9f7242ed1 --- /dev/null +++ b/v5/DPO/gen-output/DPO_1k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "3158922e70fa5fb8", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/DPO/gen-output/DPO_5k/data-00000-of-00001.arrow b/v5/DPO/gen-output/DPO_5k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..7068d0e6a384aaed9174fa3d126134ec8c3430ff --- /dev/null +++ b/v5/DPO/gen-output/DPO_5k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63679d9b80c4281e464030593c748373838c148c7264df148abb77657aab9700 +size 988928 diff --git a/v5/DPO/gen-output/DPO_5k/dataset_info.json b/v5/DPO/gen-output/DPO_5k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/DPO/gen-output/DPO_5k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/DPO/gen-output/DPO_5k/state.json b/v5/DPO/gen-output/DPO_5k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..12a1a42ce1a04e88ed5b7c3aaa42f6677a8a789b --- /dev/null +++ b/v5/DPO/gen-output/DPO_5k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "7a3c26e57069c487", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/KTO/KTO_10k/KTO_10k/README.md b/v5/KTO/KTO_10k/KTO_10k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_10k/KTO_10k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_10k/KTO_10k/adapter_config.json b/v5/KTO/KTO_10k/KTO_10k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e59b48ef11325fd83a0fa60f4e367a1bcacba7d --- /dev/null +++ b/v5/KTO/KTO_10k/KTO_10k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "down_proj", + "k_proj", + "o_proj", + "v_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_10k/KTO_10k/adapter_model.safetensors b/v5/KTO/KTO_10k/KTO_10k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee8e6ee6513cb5b6f8380ee3a0ef08bb3def554e --- /dev/null +++ b/v5/KTO/KTO_10k/KTO_10k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ce8e22e8f9cfaea0c749b838ca5acd7ffeaa4277b6674abe9d93a82c75a3c8 +size 180385008 diff --git a/v5/KTO/KTO_10k/MKTO_10k/chat_template.jinja b/v5/KTO/KTO_10k/MKTO_10k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_10k/MKTO_10k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_10k/MKTO_10k/config.json b/v5/KTO/KTO_10k/MKTO_10k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ec18bc1ae18922052a57480e28401f3b9c6b84 --- /dev/null +++ b/v5/KTO/KTO_10k/MKTO_10k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/KTO/KTO_10k/MKTO_10k/generation_config.json b/v5/KTO/KTO_10k/MKTO_10k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..898a2e748a41a4b7a931b754f8abbdac02039fae --- /dev/null +++ b/v5/KTO/KTO_10k/MKTO_10k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.2.0" +} diff --git a/v5/KTO/KTO_10k/MKTO_10k/model.safetensors b/v5/KTO/KTO_10k/MKTO_10k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6314da446166665245a32af7c38ca25dc134daeb --- /dev/null +++ b/v5/KTO/KTO_10k/MKTO_10k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f46dde50c6d5d4237edc55ec9de8941a2d657cb457afd3f30e0b190399d01490 +size 2471645464 diff --git a/v5/KTO/KTO_10k/MKTO_10k/tokenizer.json b/v5/KTO/KTO_10k/MKTO_10k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_10k/MKTO_10k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_10k/MKTO_10k/tokenizer_config.json b/v5/KTO/KTO_10k/MKTO_10k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_10k/MKTO_10k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_10k/lora/README.md b/v5/KTO/KTO_10k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..536c4141dbbc75a3867b5f8d58b1f3df9cd2b2fd --- /dev/null +++ b/v5/KTO/KTO_10k/lora/README.md @@ -0,0 +1,66 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- kto +- trl +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/gbx4j95o) + + +This model was trained with KTO, a method introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306). + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 5.2.0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite KTO as: + +```bibtex +@article{ethayarajh2024kto, + title = {{KTO: Model Alignment as Prospect Theoretic Optimization}}, + author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela}, + year = 2024, + eprint = {arXiv:2402.01306}, +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/README.md b/v5/KTO/KTO_10k/lora/checkpoint-2300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/adapter_config.json b/v5/KTO/KTO_10k/lora/checkpoint-2300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e59b48ef11325fd83a0fa60f4e367a1bcacba7d --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "down_proj", + "k_proj", + "o_proj", + "v_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/adapter_model.safetensors b/v5/KTO/KTO_10k/lora/checkpoint-2300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3b203b1b47e0e1d9d202aba7e65f41d896029265 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fed67bad451cf294be622ffd2f333ee9a188ec9b3a6a4c91423685b4f1c6b16 +size 180385008 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/chat_template.jinja b/v5/KTO/KTO_10k/lora/checkpoint-2300/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/optimizer.pt b/v5/KTO/KTO_10k/lora/checkpoint-2300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6eeb37c6b2ca7eb726881cae0eeee4cd21b9b08 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e31b9bc88dc4b4eb442eaf087b7134a2936b4b9e1ed1bd8d3098411637b96850 +size 360902475 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/rng_state.pth b/v5/KTO/KTO_10k/lora/checkpoint-2300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..80f98e729a85d79fa77e6570e00c15b63087b058 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a804dd9b4962bc1e7c8e5b51c83ce95f04ab0a366340b47fc4849e7d4ecffd6d +size 14645 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/scaler.pt b/v5/KTO/KTO_10k/lora/checkpoint-2300/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..dfa38287bdec0a64548a1c269e4aa937bef2edec --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd9b4a9496356e2b9baa77ee7dd16dce4e644d0c2b24f1f2ed619fc446f84f7f +size 1383 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/scheduler.pt b/v5/KTO/KTO_10k/lora/checkpoint-2300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ad993ea8daba03e2c85a33cf30c2302cf9ed1a2 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:741c2b0245ef6d59508f07f70d0ec7cf197583bcb90c7f4bf14dc438f1019647 +size 1465 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/tokenizer.json b/v5/KTO/KTO_10k/lora/checkpoint-2300/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/tokenizer_config.json b/v5/KTO/KTO_10k/lora/checkpoint-2300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/trainer_state.json b/v5/KTO/KTO_10k/lora/checkpoint-2300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0f5d399f7cea1ac576deb9a029e79b6626d3824d --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/trainer_state.json @@ -0,0 +1,3852 @@ +{ + "best_global_step": 2300, + "best_metric": 0.12928588867187502, + "best_model_checkpoint": "output/lora/checkpoint-2300", + "epoch": 1.8399999999999999, + "eval_steps": 100, + "global_step": 2300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.4994136691093445, + "kl": 0.010484933853149414, + "learning_rate": 1.8e-07, + "logits/chosen": 29687939.2, + "logits/rejected": 31342233.6, + "logps/chosen": -148.9648681640625, + "logps/rejected": -128.8302734375, + "loss": 0.500147819519043, + "rewards/chosen": -0.0003900241805240512, + "rewards/margins": -0.001183443213813007, + "rewards/rejected": 0.0007934190332889556, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 0.42419925332069397, + "kl": 0.018610835075378418, + "learning_rate": 3.8e-07, + "logits/chosen": 53382841.6, + "logits/rejected": 52884211.2, + "logps/chosen": -140.02025146484374, + "logps/rejected": -151.92236328125, + "loss": 0.49989566802978513, + "rewards/chosen": 0.0010854244232177735, + "rewards/margins": 0.0008347129682078958, + "rewards/rejected": 0.0002507114550098777, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 0.41562652587890625, + "kl": 0.00999913178384304, + "learning_rate": 5.800000000000001e-07, + "logits/chosen": 34145190.4, + "logits/rejected": 34195894.4, + "logps/chosen": -131.7357177734375, + "logps/rejected": -140.3759033203125, + "loss": 0.49987101554870605, + "rewards/chosen": 0.00029232501983642576, + "rewards/margins": 0.0010309695731848477, + "rewards/rejected": -0.0007386445533484221, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 0.35795858502388, + "kl": 0.01658189296722412, + "learning_rate": 7.8e-07, + "logits/chosen": 43262694.4, + "logits/rejected": 43904278.4, + "logps/chosen": -144.2994140625, + "logps/rejected": -146.0284423828125, + "loss": 0.5001150608062744, + "rewards/chosen": -0.00019423491321504116, + "rewards/margins": -0.0009199525695294142, + "rewards/rejected": 0.000725717656314373, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.322542279958725, + "kl": 0.016057539731264114, + "learning_rate": 9.800000000000001e-07, + "logits/chosen": 43062272.0, + "logits/rejected": 44864710.4, + "logps/chosen": -141.009814453125, + "logps/rejected": -154.3311279296875, + "loss": 0.4999659538269043, + "rewards/chosen": 4.65535675175488e-05, + "rewards/margins": 0.00027224536752328276, + "rewards/rejected": -0.00022569180000573397, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 0.27746379375457764, + "kl": 0.0211088415235281, + "learning_rate": 1.1800000000000001e-06, + "logits/chosen": 36592531.2, + "logits/rejected": 34114694.4, + "logps/chosen": -105.72940673828126, + "logps/rejected": -114.016015625, + "loss": 0.4998314380645752, + "rewards/chosen": 0.0008930303156375885, + "rewards/margins": 0.0013488865923136472, + "rewards/rejected": -0.0004558562766760588, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 0.380987286567688, + "kl": 0.014461040496826172, + "learning_rate": 1.3800000000000001e-06, + "logits/chosen": 47752102.4, + "logits/rejected": 46858576.0, + "logps/chosen": -165.7050048828125, + "logps/rejected": -175.17645263671875, + "loss": 0.49965806007385255, + "rewards/chosen": -0.007297745347023011, + "rewards/margins": 0.002736319601535796, + "rewards/rejected": -0.010034064948558807, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 0.4557498097419739, + "kl": 0.016758393496274948, + "learning_rate": 1.5800000000000001e-06, + "logits/chosen": 40700441.6, + "logits/rejected": 40753952.0, + "logps/chosen": -154.99173583984376, + "logps/rejected": -163.616552734375, + "loss": 0.49955191612243655, + "rewards/chosen": -0.007268477231264114, + "rewards/margins": 0.0035857379436492927, + "rewards/rejected": -0.010854215174913407, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 0.3776198923587799, + "kl": 0.04920945316553116, + "learning_rate": 1.7800000000000001e-06, + "logits/chosen": 43998704.0, + "logits/rejected": 44111488.0, + "logps/chosen": -169.890185546875, + "logps/rejected": -159.26253662109374, + "loss": 0.4991014003753662, + "rewards/chosen": -0.0037218812853097917, + "rewards/margins": 0.007189888134598732, + "rewards/rejected": -0.010911769419908523, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 0.45459961891174316, + "kl": 0.10221505165100098, + "learning_rate": 1.98e-06, + "logits/chosen": 27590704.0, + "logits/rejected": 27196054.4, + "logps/chosen": -134.2844970703125, + "logps/rejected": -164.56478271484374, + "loss": 0.4994335651397705, + "rewards/chosen": 0.001446514017879963, + "rewards/margins": 0.004533729329705239, + "rewards/rejected": -0.0030872153118252756, + "step": 100 + }, + { + "epoch": 0.08, + "eval_kl": 0.0926995798945427, + "eval_logits/chosen": 38615707.648, + "eval_logits/rejected": 38522241.024, + "eval_logps/chosen": -154.3604375, + "eval_logps/rejected": -148.682875, + "eval_loss": 0.4999313950538635, + "eval_rewards/chosen": -0.0034661414623260497, + "eval_rewards/margins": 0.0005488345623016356, + "eval_rewards/rejected": -0.004014976024627685, + "eval_runtime": 216.3934, + "eval_samples_per_second": 4.621, + "eval_steps_per_second": 2.311, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 0.3749667704105377, + "kl": 0.09545516967773438, + "learning_rate": 2.1800000000000003e-06, + "logits/chosen": 37966393.6, + "logits/rejected": 37751027.2, + "logps/chosen": -130.5518798828125, + "logps/rejected": -135.6833740234375, + "loss": 0.4993227481842041, + "rewards/chosen": 0.001455230824649334, + "rewards/margins": 0.005419917218387127, + "rewards/rejected": -0.003964686393737793, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 0.36912816762924194, + "kl": 0.14255723357200623, + "learning_rate": 2.38e-06, + "logits/chosen": 47479664.0, + "logits/rejected": 47101081.6, + "logps/chosen": -162.19322509765624, + "logps/rejected": -133.80028076171874, + "loss": 0.5003850936889649, + "rewards/chosen": -0.0034813500940799715, + "rewards/margins": -0.0030801778659224513, + "rewards/rejected": -0.0004011722281575203, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 0.3060654103755951, + "kl": 0.3212381601333618, + "learning_rate": 2.5800000000000003e-06, + "logits/chosen": 39886729.6, + "logits/rejected": 38994944.0, + "logps/chosen": -161.928857421875, + "logps/rejected": -140.0421630859375, + "loss": 0.5001925468444824, + "rewards/chosen": 0.024501633644104005, + "rewards/margins": -0.001541826128959655, + "rewards/rejected": 0.02604345977306366, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 0.3445453345775604, + "kl": 0.48165637254714966, + "learning_rate": 2.7800000000000005e-06, + "logits/chosen": 40894547.2, + "logits/rejected": 42894540.8, + "logps/chosen": -142.19818115234375, + "logps/rejected": -157.58607177734376, + "loss": 0.5001253128051758, + "rewards/chosen": 0.04530414342880249, + "rewards/margins": -0.001001721620559691, + "rewards/rejected": 0.04630586504936218, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 0.3646848797798157, + "kl": 0.5575106143951416, + "learning_rate": 2.9800000000000003e-06, + "logits/chosen": 41080057.6, + "logits/rejected": 42315260.8, + "logps/chosen": -129.9904052734375, + "logps/rejected": -117.11707763671875, + "loss": 0.49814538955688475, + "rewards/chosen": 0.059499716758728026, + "rewards/margins": 0.014832848310470582, + "rewards/rejected": 0.044666868448257444, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 0.37343886494636536, + "kl": 0.7937558889389038, + "learning_rate": 3.1800000000000005e-06, + "logits/chosen": 34726796.8, + "logits/rejected": 35066944.0, + "logps/chosen": -143.1036376953125, + "logps/rejected": -146.66500244140624, + "loss": 0.4996847152709961, + "rewards/chosen": 0.07830544710159301, + "rewards/margins": 0.0025246202945709145, + "rewards/rejected": 0.0757808268070221, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 0.3172762095928192, + "kl": 0.9795322418212891, + "learning_rate": 3.3800000000000007e-06, + "logits/chosen": 37869379.2, + "logits/rejected": 40011753.6, + "logps/chosen": -137.67252197265626, + "logps/rejected": -149.25455322265626, + "loss": 0.4995111465454102, + "rewards/chosen": 0.0999127209186554, + "rewards/margins": 0.003919076919555661, + "rewards/rejected": 0.09599364399909974, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 0.459634393453598, + "kl": 1.297642707824707, + "learning_rate": 3.58e-06, + "logits/chosen": 44220444.8, + "logits/rejected": 45226771.2, + "logps/chosen": -144.420849609375, + "logps/rejected": -170.05146484375, + "loss": 0.5002459049224853, + "rewards/chosen": 0.12877843379974366, + "rewards/margins": -0.0019718408584594727, + "rewards/rejected": 0.13075027465820313, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 0.347683310508728, + "kl": 1.2592600584030151, + "learning_rate": 3.7800000000000002e-06, + "logits/chosen": 41769235.2, + "logits/rejected": 43331692.8, + "logps/chosen": -123.3504150390625, + "logps/rejected": -136.4183837890625, + "loss": 0.4997075080871582, + "rewards/chosen": 0.12709956169128417, + "rewards/margins": 0.0023471236228942705, + "rewards/rejected": 0.1247524380683899, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 0.46408534049987793, + "kl": 1.3921682834625244, + "learning_rate": 3.980000000000001e-06, + "logits/chosen": 47041785.6, + "logits/rejected": 48364675.2, + "logps/chosen": -150.128076171875, + "logps/rejected": -171.89765625, + "loss": 0.49930601119995116, + "rewards/chosen": 0.14199190139770507, + "rewards/margins": 0.005550038814544661, + "rewards/rejected": 0.1364418625831604, + "step": 200 + }, + { + "epoch": 0.16, + "eval_kl": 1.0393632650375366, + "eval_logits/chosen": 39075643.392, + "eval_logits/rejected": 38930210.816, + "eval_logps/chosen": -153.263515625, + "eval_logps/rejected": -147.659890625, + "eval_loss": 0.49900853633880615, + "eval_rewards/chosen": 0.10622586059570313, + "eval_rewards/margins": 0.007942695617675785, + "eval_rewards/rejected": 0.09828316497802735, + "eval_runtime": 215.9673, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 2.315, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 0.39666494727134705, + "kl": 0.7951234579086304, + "learning_rate": 4.18e-06, + "logits/chosen": 33959907.2, + "logits/rejected": 33986992.0, + "logps/chosen": -139.88677978515625, + "logps/rejected": -131.93973388671876, + "loss": 0.5003408432006836, + "rewards/chosen": 0.06579458713531494, + "rewards/margins": -0.002647107839584356, + "rewards/rejected": 0.0684416949748993, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 0.3799448311328888, + "kl": 0.6490715146064758, + "learning_rate": 4.38e-06, + "logits/chosen": 35468355.2, + "logits/rejected": 36302822.4, + "logps/chosen": -101.356298828125, + "logps/rejected": -125.962353515625, + "loss": 0.5001154899597168, + "rewards/chosen": 0.0492926150560379, + "rewards/margins": -0.0009777992963790894, + "rewards/rejected": 0.05027041435241699, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 0.41211405396461487, + "kl": 0.3460121750831604, + "learning_rate": 4.58e-06, + "logits/chosen": 47615702.4, + "logits/rejected": 46232614.4, + "logps/chosen": -185.3808837890625, + "logps/rejected": -163.7504638671875, + "loss": 0.5009187698364258, + "rewards/chosen": -0.0020151469856500626, + "rewards/margins": -0.007613314315676689, + "rewards/rejected": 0.005598167330026627, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 0.40270859003067017, + "kl": 0.5220479369163513, + "learning_rate": 4.78e-06, + "logits/chosen": 48030569.6, + "logits/rejected": 48140400.0, + "logps/chosen": -176.74349365234374, + "logps/rejected": -166.65750732421876, + "loss": 0.5001285076141357, + "rewards/chosen": 0.028535887598991394, + "rewards/margins": -0.001354834437370299, + "rewards/rejected": 0.029890722036361693, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 0.4905576705932617, + "kl": 0.5900261402130127, + "learning_rate": 4.980000000000001e-06, + "logits/chosen": 37097190.4, + "logits/rejected": 35081888.0, + "logps/chosen": -176.0585205078125, + "logps/rejected": -145.11353759765626, + "loss": 0.4949470520019531, + "rewards/chosen": 0.06773759722709656, + "rewards/margins": 0.040507239103317265, + "rewards/rejected": 0.027230358123779295, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 0.30912280082702637, + "kl": 0.5255872011184692, + "learning_rate": 4.980000000000001e-06, + "logits/chosen": 30562265.6, + "logits/rejected": 29522019.2, + "logps/chosen": -128.9729248046875, + "logps/rejected": -131.62899169921874, + "loss": 0.4973008155822754, + "rewards/chosen": 0.040848633646965025, + "rewards/margins": 0.021623241901397704, + "rewards/rejected": 0.01922539174556732, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 0.5176734328269958, + "kl": 0.9013652801513672, + "learning_rate": 4.957777777777778e-06, + "logits/chosen": 39767792.0, + "logits/rejected": 39945158.4, + "logps/chosen": -156.84248046875, + "logps/rejected": -151.7102294921875, + "loss": 0.4969136714935303, + "rewards/chosen": 0.07821747660636902, + "rewards/margins": 0.024683624505996704, + "rewards/rejected": 0.053533852100372314, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 0.4220561385154724, + "kl": 0.8817802667617798, + "learning_rate": 4.935555555555556e-06, + "logits/chosen": 33369977.6, + "logits/rejected": 27383606.4, + "logps/chosen": -167.6235595703125, + "logps/rejected": -139.73486328125, + "loss": 0.5022628784179688, + "rewards/chosen": 0.054727953672409055, + "rewards/margins": -0.018271952867507935, + "rewards/rejected": 0.07299990653991699, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 0.33811691403388977, + "kl": 1.448921799659729, + "learning_rate": 4.9133333333333334e-06, + "logits/chosen": 31531936.0, + "logits/rejected": 30661184.0, + "logps/chosen": -145.08800048828124, + "logps/rejected": -147.349755859375, + "loss": 0.49300565719604494, + "rewards/chosen": 0.1612391948699951, + "rewards/margins": 0.05618309974670409, + "rewards/rejected": 0.10505609512329102, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 0.5129542350769043, + "kl": 1.6933104991912842, + "learning_rate": 4.891111111111111e-06, + "logits/chosen": 42485971.2, + "logits/rejected": 42720950.4, + "logps/chosen": -167.75079345703125, + "logps/rejected": -179.53148193359374, + "loss": 0.4963071823120117, + "rewards/chosen": 0.118367600440979, + "rewards/margins": 0.03180532455444336, + "rewards/rejected": 0.08656227588653564, + "step": 300 + }, + { + "epoch": 0.24, + "eval_kl": 1.5601574182510376, + "eval_logits/chosen": 38297956.352, + "eval_logits/rejected": 38117695.488, + "eval_logps/chosen": -153.006140625, + "eval_logps/rejected": -147.429, + "eval_loss": 0.49868252873420715, + "eval_rewards/chosen": 0.13196340942382812, + "eval_rewards/margins": 0.010592102050781246, + "eval_rewards/rejected": 0.12137130737304687, + "eval_runtime": 217.0741, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 2.303, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 0.3847499489784241, + "kl": 1.3948395252227783, + "learning_rate": 4.8688888888888895e-06, + "logits/chosen": 33896211.2, + "logits/rejected": 34871568.0, + "logps/chosen": -145.9845458984375, + "logps/rejected": -154.91959228515626, + "loss": 0.5030938625335694, + "rewards/chosen": 0.08795046210289001, + "rewards/margins": -0.024919158220291143, + "rewards/rejected": 0.11286962032318115, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 0.618556797504425, + "kl": 0.49630022048950195, + "learning_rate": 4.846666666666667e-06, + "logits/chosen": 37342124.8, + "logits/rejected": 35182000.0, + "logps/chosen": -160.91566162109376, + "logps/rejected": -134.513427734375, + "loss": 0.5022326946258545, + "rewards/chosen": -0.059583669900894164, + "rewards/margins": -0.018618279695510866, + "rewards/rejected": -0.0409653902053833, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 0.382318377494812, + "kl": 0.8811993598937988, + "learning_rate": 4.824444444444445e-06, + "logits/chosen": 46995257.6, + "logits/rejected": 44221206.4, + "logps/chosen": -153.2612060546875, + "logps/rejected": -144.4525634765625, + "loss": 0.4899014949798584, + "rewards/chosen": 0.058102655410766604, + "rewards/margins": 0.08179453760385513, + "rewards/rejected": -0.023691882193088532, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 0.4012068510055542, + "kl": 0.9655236005783081, + "learning_rate": 4.802222222222222e-06, + "logits/chosen": 39877590.4, + "logits/rejected": 40850240.0, + "logps/chosen": -134.43511962890625, + "logps/rejected": -143.74300537109374, + "loss": 0.5008483409881592, + "rewards/chosen": 0.046630316972732545, + "rewards/margins": -0.009023183584213258, + "rewards/rejected": 0.055653500556945804, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 0.4055842161178589, + "kl": 1.7407032251358032, + "learning_rate": 4.78e-06, + "logits/chosen": 37863616.0, + "logits/rejected": 36761936.0, + "logps/chosen": -133.8212646484375, + "logps/rejected": -169.326318359375, + "loss": 0.5016643524169921, + "rewards/chosen": 0.14738692045211793, + "rewards/margins": -0.01331337690353393, + "rewards/rejected": 0.16070029735565186, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 0.4029492139816284, + "kl": 1.1468133926391602, + "learning_rate": 4.7577777777777784e-06, + "logits/chosen": 41317878.4, + "logits/rejected": 38904140.8, + "logps/chosen": -147.33363037109376, + "logps/rejected": -112.39573974609375, + "loss": 0.49462456703186036, + "rewards/chosen": 0.09142228960990906, + "rewards/margins": 0.04296924769878387, + "rewards/rejected": 0.048453041911125184, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 0.39963042736053467, + "kl": 1.3335682153701782, + "learning_rate": 4.735555555555556e-06, + "logits/chosen": 38361622.4, + "logits/rejected": 38506108.8, + "logps/chosen": -146.35006103515624, + "logps/rejected": -150.335205078125, + "loss": 0.5048986434936523, + "rewards/chosen": 0.06511063575744629, + "rewards/margins": -0.04016592502593995, + "rewards/rejected": 0.10527656078338624, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 0.5386641025543213, + "kl": 1.9048980474472046, + "learning_rate": 4.713333333333334e-06, + "logits/chosen": 34626476.8, + "logits/rejected": 35537760.0, + "logps/chosen": -154.9567626953125, + "logps/rejected": -166.59052734375, + "loss": 0.5030035495758056, + "rewards/chosen": 0.13892955780029298, + "rewards/margins": -0.02978687286376952, + "rewards/rejected": 0.1687164306640625, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 0.3963494896888733, + "kl": 1.7998809814453125, + "learning_rate": 4.691111111111111e-06, + "logits/chosen": 31470185.6, + "logits/rejected": 30747776.0, + "logps/chosen": -174.68343505859374, + "logps/rejected": -149.20201416015624, + "loss": 0.4925515174865723, + "rewards/chosen": 0.17315468788146973, + "rewards/margins": 0.05992317199707031, + "rewards/rejected": 0.11323151588439942, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 0.40272921323776245, + "kl": 1.6665403842926025, + "learning_rate": 4.66888888888889e-06, + "logits/chosen": 43372483.2, + "logits/rejected": 41547449.6, + "logps/chosen": -148.32398681640626, + "logps/rejected": -134.78739013671876, + "loss": 0.49486651420593264, + "rewards/chosen": 0.149322509765625, + "rewards/margins": 0.040551638603210455, + "rewards/rejected": 0.10877087116241455, + "step": 400 + }, + { + "epoch": 0.32, + "eval_kl": 1.792982578277588, + "eval_logits/chosen": 38918168.576, + "eval_logits/rejected": 38725652.48, + "eval_logps/chosen": -152.730328125, + "eval_logps/rejected": -147.293078125, + "eval_loss": 0.4969332814216614, + "eval_rewards/chosen": 0.15954458618164064, + "eval_rewards/margins": 0.024580596923828135, + "eval_rewards/rejected": 0.1349639892578125, + "eval_runtime": 216.6464, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 0.3303642272949219, + "kl": 2.137221336364746, + "learning_rate": 4.646666666666667e-06, + "logits/chosen": 43939001.6, + "logits/rejected": 41818220.8, + "logps/chosen": -146.33731689453126, + "logps/rejected": -147.7433349609375, + "loss": 0.4917384147644043, + "rewards/chosen": 0.22513296604156494, + "rewards/margins": 0.06730514764785767, + "rewards/rejected": 0.15782781839370727, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 0.5785346031188965, + "kl": 1.536816120147705, + "learning_rate": 4.624444444444445e-06, + "logits/chosen": 34265174.4, + "logits/rejected": 32297750.4, + "logps/chosen": -161.92572021484375, + "logps/rejected": -130.8744384765625, + "loss": 0.4967160701751709, + "rewards/chosen": 0.12509127855300903, + "rewards/margins": 0.025565683841705322, + "rewards/rejected": 0.0995255947113037, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 0.39299893379211426, + "kl": 2.4938416481018066, + "learning_rate": 4.602222222222223e-06, + "logits/chosen": 37429766.4, + "logits/rejected": 33713158.4, + "logps/chosen": -168.366845703125, + "logps/rejected": -117.99913330078125, + "loss": 0.4935178279876709, + "rewards/chosen": 0.2566863536834717, + "rewards/margins": 0.051660680770874046, + "rewards/rejected": 0.20502567291259766, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 0.6378316879272461, + "kl": 3.6217243671417236, + "learning_rate": 4.58e-06, + "logits/chosen": 43531513.6, + "logits/rejected": 45458550.4, + "logps/chosen": -145.77152099609376, + "logps/rejected": -166.725390625, + "loss": 0.5008945465087891, + "rewards/chosen": 0.3571479320526123, + "rewards/margins": -0.007279539108276389, + "rewards/rejected": 0.3644274711608887, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 0.38800859451293945, + "kl": 3.8835651874542236, + "learning_rate": 4.557777777777778e-06, + "logits/chosen": 35328048.0, + "logits/rejected": 38813721.6, + "logps/chosen": -95.403271484375, + "logps/rejected": -151.7120849609375, + "loss": 0.50106782913208, + "rewards/chosen": 0.38196592330932616, + "rewards/margins": -0.00870509147644044, + "rewards/rejected": 0.3906710147857666, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 0.454421728849411, + "kl": 4.79476261138916, + "learning_rate": 4.535555555555555e-06, + "logits/chosen": 47006140.8, + "logits/rejected": 45068256.0, + "logps/chosen": -160.01910400390625, + "logps/rejected": -175.923046875, + "loss": 0.5052647590637207, + "rewards/chosen": 0.45828795433044434, + "rewards/margins": -0.042376470565795854, + "rewards/rejected": 0.5006644248962402, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 0.846814751625061, + "kl": 3.439274311065674, + "learning_rate": 4.513333333333333e-06, + "logits/chosen": 55978662.4, + "logits/rejected": 53112982.4, + "logps/chosen": -170.11988525390626, + "logps/rejected": -174.0812744140625, + "loss": 0.4981950283050537, + "rewards/chosen": 0.3498707294464111, + "rewards/margins": 0.014589142799377453, + "rewards/rejected": 0.33528158664703367, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 0.46414715051651, + "kl": 2.857430934906006, + "learning_rate": 4.4911111111111115e-06, + "logits/chosen": 44121936.0, + "logits/rejected": 43484160.0, + "logps/chosen": -149.39083251953124, + "logps/rejected": -159.20223388671874, + "loss": 0.4906013011932373, + "rewards/chosen": 0.3135632276535034, + "rewards/margins": 0.0760336399078369, + "rewards/rejected": 0.23752958774566652, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 0.31783437728881836, + "kl": 2.6989314556121826, + "learning_rate": 4.468888888888889e-06, + "logits/chosen": 29722166.4, + "logits/rejected": 27615270.4, + "logps/chosen": -146.17584228515625, + "logps/rejected": -145.2918212890625, + "loss": 0.4850144863128662, + "rewards/chosen": 0.30660200119018555, + "rewards/margins": 0.12175897359848023, + "rewards/rejected": 0.18484302759170532, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 0.4939253032207489, + "kl": 4.809385299682617, + "learning_rate": 4.446666666666667e-06, + "logits/chosen": 45245225.6, + "logits/rejected": 41364572.8, + "logps/chosen": -177.41658935546874, + "logps/rejected": -128.30506591796876, + "loss": 0.48539199829101565, + "rewards/chosen": 0.5390491962432862, + "rewards/margins": 0.1182609081268311, + "rewards/rejected": 0.4207882881164551, + "step": 500 + }, + { + "epoch": 0.4, + "eval_kl": 4.020763874053955, + "eval_logits/chosen": 40241844.224, + "eval_logits/rejected": 39968325.632, + "eval_logps/chosen": -150.122734375, + "eval_logps/rejected": -144.86325, + "eval_loss": 0.4947924017906189, + "eval_rewards/chosen": 0.4203052978515625, + "eval_rewards/margins": 0.042358032226562536, + "eval_rewards/rejected": 0.377947265625, + "eval_runtime": 216.7408, + "eval_samples_per_second": 4.614, + "eval_steps_per_second": 2.307, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 0.414318323135376, + "kl": 3.33302640914917, + "learning_rate": 4.424444444444444e-06, + "logits/chosen": 47571436.8, + "logits/rejected": 49124124.8, + "logps/chosen": -143.7648681640625, + "logps/rejected": -157.2596435546875, + "loss": 0.4965871810913086, + "rewards/chosen": 0.34320816993713377, + "rewards/margins": 0.027533125877380327, + "rewards/rejected": 0.31567504405975344, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 0.38320228457450867, + "kl": 4.671795845031738, + "learning_rate": 4.402222222222223e-06, + "logits/chosen": 46508307.2, + "logits/rejected": 45490304.0, + "logps/chosen": -154.68175048828124, + "logps/rejected": -160.55111083984374, + "loss": 0.4931188106536865, + "rewards/chosen": 0.47988028526306153, + "rewards/margins": 0.05947685241699219, + "rewards/rejected": 0.42040343284606935, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 0.4373217821121216, + "kl": 3.6891350746154785, + "learning_rate": 4.38e-06, + "logits/chosen": 42301033.6, + "logits/rejected": 42527356.8, + "logps/chosen": -138.6637939453125, + "logps/rejected": -173.32967529296874, + "loss": 0.5058313369750976, + "rewards/chosen": 0.33178033828735354, + "rewards/margins": -0.047040796279907204, + "rewards/rejected": 0.37882113456726074, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 0.6072640419006348, + "kl": 4.442656517028809, + "learning_rate": 4.357777777777778e-06, + "logits/chosen": 34522003.2, + "logits/rejected": 34255187.2, + "logps/chosen": -147.196533203125, + "logps/rejected": -154.7218505859375, + "loss": 0.4857354640960693, + "rewards/chosen": 0.49022879600524905, + "rewards/margins": 0.11430189609527591, + "rewards/rejected": 0.37592689990997313, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 0.4359336793422699, + "kl": 3.4759514331817627, + "learning_rate": 4.3355555555555565e-06, + "logits/chosen": 41427052.8, + "logits/rejected": 42907648.0, + "logps/chosen": -152.25201416015625, + "logps/rejected": -165.486767578125, + "loss": 0.49396610260009766, + "rewards/chosen": 0.345978832244873, + "rewards/margins": 0.05630025863647459, + "rewards/rejected": 0.28967857360839844, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 0.43716976046562195, + "kl": 3.152191638946533, + "learning_rate": 4.313333333333334e-06, + "logits/chosen": 35663577.6, + "logits/rejected": 34092796.8, + "logps/chosen": -161.36358642578125, + "logps/rejected": -148.691259765625, + "loss": 0.49653072357177735, + "rewards/chosen": 0.27045164108276365, + "rewards/margins": 0.027580332756042464, + "rewards/rejected": 0.24287130832672119, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 0.387523889541626, + "kl": 2.6372287273406982, + "learning_rate": 4.291111111111112e-06, + "logits/chosen": 41382582.4, + "logits/rejected": 40126329.6, + "logps/chosen": -147.36217041015624, + "logps/rejected": -132.27440185546874, + "loss": 0.48532447814941404, + "rewards/chosen": 0.2873492479324341, + "rewards/margins": 0.12220915555953982, + "rewards/rejected": 0.1651400923728943, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 0.4191218316555023, + "kl": 3.158555507659912, + "learning_rate": 4.268888888888889e-06, + "logits/chosen": 47333145.6, + "logits/rejected": 46272729.6, + "logps/chosen": -147.29794921875, + "logps/rejected": -157.137255859375, + "loss": 0.4924956798553467, + "rewards/chosen": 0.26015233993530273, + "rewards/margins": 0.0591968059539795, + "rewards/rejected": 0.20095553398132324, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 0.4541790783405304, + "kl": 3.111989974975586, + "learning_rate": 4.246666666666667e-06, + "logits/chosen": 29866240.0, + "logits/rejected": 30473120.0, + "logps/chosen": -128.02447509765625, + "logps/rejected": -133.55704345703126, + "loss": 0.4842988967895508, + "rewards/chosen": 0.3343390941619873, + "rewards/margins": 0.12930448055267335, + "rewards/rejected": 0.20503461360931396, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 0.565047025680542, + "kl": 2.7821693420410156, + "learning_rate": 4.2244444444444446e-06, + "logits/chosen": 28686153.6, + "logits/rejected": 31275964.8, + "logps/chosen": -147.65833740234376, + "logps/rejected": -149.1046630859375, + "loss": 0.5113170146942139, + "rewards/chosen": 0.13800346851348877, + "rewards/margins": -0.09322352409362794, + "rewards/rejected": 0.2312269926071167, + "step": 600 + }, + { + "epoch": 0.48, + "eval_kl": 2.7389280796051025, + "eval_logits/chosen": 38005252.096, + "eval_logits/rejected": 37846036.48, + "eval_logps/chosen": -152.185953125, + "eval_logps/rejected": -146.89521875, + "eval_loss": 0.4950157403945923, + "eval_rewards/chosen": 0.21398320007324217, + "eval_rewards/margins": 0.03923100280761718, + "eval_rewards/rejected": 0.174752197265625, + "eval_runtime": 217.558, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 0.4400153160095215, + "kl": 3.6884047985076904, + "learning_rate": 4.202222222222222e-06, + "logits/chosen": 45485420.8, + "logits/rejected": 41585961.6, + "logps/chosen": -193.09393310546875, + "logps/rejected": -168.6018310546875, + "loss": 0.47965612411499026, + "rewards/chosen": 0.346639347076416, + "rewards/margins": 0.19525065422058108, + "rewards/rejected": 0.15138869285583495, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 0.47579634189605713, + "kl": 2.8855841159820557, + "learning_rate": 4.18e-06, + "logits/chosen": 26868339.2, + "logits/rejected": 25530107.2, + "logps/chosen": -139.2341552734375, + "logps/rejected": -135.46981201171874, + "loss": 0.48549280166625974, + "rewards/chosen": 0.2853414058685303, + "rewards/margins": 0.12018097639083863, + "rewards/rejected": 0.16516042947769166, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 0.4894777536392212, + "kl": 3.817617893218994, + "learning_rate": 4.157777777777778e-06, + "logits/chosen": 38511724.8, + "logits/rejected": 41062003.2, + "logps/chosen": -139.717919921875, + "logps/rejected": -172.0173828125, + "loss": 0.5030189037322998, + "rewards/chosen": 0.3242809772491455, + "rewards/margins": -0.024120402336120617, + "rewards/rejected": 0.3484013795852661, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 0.5884784460067749, + "kl": 3.3272690773010254, + "learning_rate": 4.135555555555556e-06, + "logits/chosen": 40902281.6, + "logits/rejected": 39306883.2, + "logps/chosen": -189.29173583984374, + "logps/rejected": -149.8311279296875, + "loss": 0.4905113220214844, + "rewards/chosen": 0.27348809242248534, + "rewards/margins": 0.07974576950073242, + "rewards/rejected": 0.19374232292175292, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 0.489397794008255, + "kl": 4.352996349334717, + "learning_rate": 4.1133333333333335e-06, + "logits/chosen": 43261625.6, + "logits/rejected": 41635296.0, + "logps/chosen": -125.68609619140625, + "logps/rejected": -132.724267578125, + "loss": 0.49439477920532227, + "rewards/chosen": 0.3973216533660889, + "rewards/margins": 0.04381968975067141, + "rewards/rejected": 0.35350196361541747, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 0.36593517661094666, + "kl": 3.356546401977539, + "learning_rate": 4.091111111111111e-06, + "logits/chosen": 56701203.2, + "logits/rejected": 55284249.6, + "logps/chosen": -168.565625, + "logps/rejected": -132.68575439453124, + "loss": 0.48746094703674314, + "rewards/chosen": 0.3356959581375122, + "rewards/margins": 0.09914519786834713, + "rewards/rejected": 0.23655076026916505, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 0.47609221935272217, + "kl": 3.9726402759552, + "learning_rate": 4.0688888888888896e-06, + "logits/chosen": 42420092.8, + "logits/rejected": 42645120.0, + "logps/chosen": -181.13988037109374, + "logps/rejected": -183.315185546875, + "loss": 0.4789764881134033, + "rewards/chosen": 0.3791257381439209, + "rewards/margins": 0.18321629762649538, + "rewards/rejected": 0.19590944051742554, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 0.479322224855423, + "kl": 2.822577953338623, + "learning_rate": 4.046666666666667e-06, + "logits/chosen": 41487219.2, + "logits/rejected": 40422083.2, + "logps/chosen": -144.38018798828125, + "logps/rejected": -126.14996337890625, + "loss": 0.49282026290893555, + "rewards/chosen": 0.22584574222564696, + "rewards/margins": 0.055848944187164296, + "rewards/rejected": 0.16999679803848267, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 0.3670179843902588, + "kl": 4.148089408874512, + "learning_rate": 4.024444444444445e-06, + "logits/chosen": 42715072.0, + "logits/rejected": 40623603.2, + "logps/chosen": -156.69072265625, + "logps/rejected": -175.76126708984376, + "loss": 0.4938004970550537, + "rewards/chosen": 0.39667787551879885, + "rewards/margins": 0.05049760341644288, + "rewards/rejected": 0.34618027210235597, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 0.41768333315849304, + "kl": 3.2628045082092285, + "learning_rate": 4.002222222222222e-06, + "logits/chosen": 37950355.2, + "logits/rejected": 34915990.4, + "logps/chosen": -155.3704833984375, + "logps/rejected": -143.3675537109375, + "loss": 0.49524383544921874, + "rewards/chosen": 0.2646515369415283, + "rewards/margins": 0.04396252632141112, + "rewards/rejected": 0.2206890106201172, + "step": 700 + }, + { + "epoch": 0.56, + "eval_kl": 3.236727237701416, + "eval_logits/chosen": 38033387.52, + "eval_logits/rejected": 37810647.04, + "eval_logps/chosen": -151.62878125, + "eval_logps/rejected": -146.479140625, + "eval_loss": 0.49332940578460693, + "eval_rewards/chosen": 0.269699462890625, + "eval_rewards/margins": 0.05334155273437502, + "eval_rewards/rejected": 0.21635791015625, + "eval_runtime": 218.185, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.292, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 0.40857982635498047, + "kl": 4.429306983947754, + "learning_rate": 3.980000000000001e-06, + "logits/chosen": 40884387.2, + "logits/rejected": 39080608.0, + "logps/chosen": -172.00384521484375, + "logps/rejected": -133.983837890625, + "loss": 0.4770909309387207, + "rewards/chosen": 0.518680477142334, + "rewards/margins": 0.18963458538055422, + "rewards/rejected": 0.3290458917617798, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 0.3682423233985901, + "kl": 3.0005943775177, + "learning_rate": 3.9577777777777785e-06, + "logits/chosen": 33681516.8, + "logits/rejected": 34946268.8, + "logps/chosen": -120.9507080078125, + "logps/rejected": -123.20738525390625, + "loss": 0.5031956672668457, + "rewards/chosen": 0.11274595260620117, + "rewards/margins": -0.05625311136245727, + "rewards/rejected": 0.16899906396865844, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 0.37147852778434753, + "kl": 3.7080981731414795, + "learning_rate": 3.935555555555556e-06, + "logits/chosen": 36905440.0, + "logits/rejected": 34017891.2, + "logps/chosen": -152.36019287109374, + "logps/rejected": -154.4943115234375, + "loss": 0.49049901962280273, + "rewards/chosen": 0.3831493616104126, + "rewards/margins": 0.07779901027679442, + "rewards/rejected": 0.30535035133361815, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 0.48657310009002686, + "kl": 4.199930191040039, + "learning_rate": 3.913333333333334e-06, + "logits/chosen": 34702265.6, + "logits/rejected": 33570732.8, + "logps/chosen": -156.0489501953125, + "logps/rejected": -182.53206787109374, + "loss": 0.4937909603118896, + "rewards/chosen": 0.3453744649887085, + "rewards/margins": 0.06571738719940184, + "rewards/rejected": 0.27965707778930665, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 0.2791324257850647, + "kl": 3.1397013664245605, + "learning_rate": 3.891111111111111e-06, + "logits/chosen": 38985750.4, + "logits/rejected": 38637244.8, + "logps/chosen": -159.29842529296874, + "logps/rejected": -183.90196533203124, + "loss": 0.5016417980194092, + "rewards/chosen": 0.11977872848510743, + "rewards/margins": 0.02148157954216004, + "rewards/rejected": 0.09829714894294739, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 0.4719991683959961, + "kl": 1.940172553062439, + "learning_rate": 3.868888888888889e-06, + "logits/chosen": 36901945.6, + "logits/rejected": 37816726.4, + "logps/chosen": -115.81767578125, + "logps/rejected": -124.9832763671875, + "loss": 0.48836345672607423, + "rewards/chosen": 0.1182823657989502, + "rewards/margins": 0.09493236243724823, + "rewards/rejected": 0.023350003361701965, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 0.46570995450019836, + "kl": 1.71030592918396, + "learning_rate": 3.8466666666666665e-06, + "logits/chosen": 45056902.4, + "logits/rejected": 45886822.4, + "logps/chosen": -142.8737060546875, + "logps/rejected": -152.980029296875, + "loss": 0.4849842071533203, + "rewards/chosen": 0.044194817543029785, + "rewards/margins": 0.13042356967926025, + "rewards/rejected": -0.08622875213623046, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 0.48490992188453674, + "kl": 0.8850091695785522, + "learning_rate": 3.824444444444444e-06, + "logits/chosen": 41897849.6, + "logits/rejected": 42659980.8, + "logps/chosen": -163.0940673828125, + "logps/rejected": -145.29967041015624, + "loss": 0.4809276103973389, + "rewards/chosen": -0.029845520853996277, + "rewards/margins": 0.16276139318943023, + "rewards/rejected": -0.1926069140434265, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 0.42963773012161255, + "kl": 1.2656173706054688, + "learning_rate": 3.8022222222222226e-06, + "logits/chosen": 31615977.6, + "logits/rejected": 27643244.8, + "logps/chosen": -146.9314453125, + "logps/rejected": -139.28326416015625, + "loss": 0.4754499912261963, + "rewards/chosen": -0.14652204513549805, + "rewards/margins": 0.23166158199310305, + "rewards/rejected": -0.3781836271286011, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 0.5815227627754211, + "kl": 1.1006227731704712, + "learning_rate": 3.7800000000000002e-06, + "logits/chosen": 28676291.2, + "logits/rejected": 28398739.2, + "logps/chosen": -149.7789306640625, + "logps/rejected": -134.99962158203124, + "loss": 0.5007228374481201, + "rewards/chosen": -0.2438833236694336, + "rewards/margins": -0.03309731483459474, + "rewards/rejected": -0.21078600883483886, + "step": 800 + }, + { + "epoch": 0.64, + "eval_kl": 1.4775981903076172, + "eval_logits/chosen": 34909265.92, + "eval_logits/rejected": 34874159.104, + "eval_logps/chosen": -156.25446875, + "eval_logps/rejected": -151.0355, + "eval_loss": 0.4928078353404999, + "eval_rewards/chosen": -0.19286886596679687, + "eval_rewards/margins": 0.04640672302246096, + "eval_rewards/rejected": -0.23927558898925783, + "eval_runtime": 217.2245, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 2.302, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 0.518290102481842, + "kl": 2.0555756092071533, + "learning_rate": 3.757777777777778e-06, + "logits/chosen": 39001305.6, + "logits/rejected": 39306675.2, + "logps/chosen": -159.09794921875, + "logps/rejected": -157.2656982421875, + "loss": 0.48754167556762695, + "rewards/chosen": 0.07401522397994995, + "rewards/margins": 0.1124086320400238, + "rewards/rejected": -0.03839340806007385, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 0.4529527425765991, + "kl": 1.2279353141784668, + "learning_rate": 3.7355555555555555e-06, + "logits/chosen": 24645734.4, + "logits/rejected": 24314422.4, + "logps/chosen": -157.53839111328125, + "logps/rejected": -129.3763427734375, + "loss": 0.5034448146820069, + "rewards/chosen": -0.31603260040283204, + "rewards/margins": -0.04990806579589846, + "rewards/rejected": -0.2661245346069336, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 0.5347335934638977, + "kl": 2.0512425899505615, + "learning_rate": 3.713333333333334e-06, + "logits/chosen": 31296140.8, + "logits/rejected": 29981024.0, + "logps/chosen": -160.424072265625, + "logps/rejected": -127.09144287109375, + "loss": 0.49833097457885744, + "rewards/chosen": -0.05236924290657043, + "rewards/margins": 0.004509323835372926, + "rewards/rejected": -0.05687856674194336, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 0.4843555688858032, + "kl": 1.5582542419433594, + "learning_rate": 3.6911111111111115e-06, + "logits/chosen": 39700444.8, + "logits/rejected": 39422995.2, + "logps/chosen": -151.80198974609374, + "logps/rejected": -151.66746826171874, + "loss": 0.4924652099609375, + "rewards/chosen": -0.040176278352737425, + "rewards/margins": 0.038288170099258424, + "rewards/rejected": -0.07846444845199585, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 0.5929602384567261, + "kl": 2.925257921218872, + "learning_rate": 3.668888888888889e-06, + "logits/chosen": 39676166.4, + "logits/rejected": 38709782.4, + "logps/chosen": -169.22021484375, + "logps/rejected": -189.6208251953125, + "loss": 0.507749605178833, + "rewards/chosen": -0.05675660371780396, + "rewards/margins": -0.11356353759765625, + "rewards/rejected": 0.05680693387985229, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 0.47086119651794434, + "kl": 2.162543296813965, + "learning_rate": 3.6466666666666668e-06, + "logits/chosen": 31780547.2, + "logits/rejected": 29934672.0, + "logps/chosen": -165.53729248046875, + "logps/rejected": -144.7294921875, + "loss": 0.48428568840026853, + "rewards/chosen": -0.0063173860311508175, + "rewards/margins": 0.12325810492038727, + "rewards/rejected": -0.12957549095153809, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 0.5226603150367737, + "kl": 1.0791276693344116, + "learning_rate": 3.624444444444445e-06, + "logits/chosen": 36146592.0, + "logits/rejected": 34014483.2, + "logps/chosen": -139.47996826171874, + "logps/rejected": -147.29366455078124, + "loss": 0.4861104965209961, + "rewards/chosen": -0.3762362003326416, + "rewards/margins": 0.0545970916748047, + "rewards/rejected": -0.4308332920074463, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 0.456878662109375, + "kl": 1.0787068605422974, + "learning_rate": 3.6022222222222224e-06, + "logits/chosen": 31733673.6, + "logits/rejected": 30545894.4, + "logps/chosen": -166.9651123046875, + "logps/rejected": -136.55260009765624, + "loss": 0.4926890850067139, + "rewards/chosen": -0.2618767499923706, + "rewards/margins": 0.09881234169006348, + "rewards/rejected": -0.36068909168243407, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 0.39478904008865356, + "kl": 0.7077828645706177, + "learning_rate": 3.58e-06, + "logits/chosen": 42203872.0, + "logits/rejected": 40975161.6, + "logps/chosen": -135.72105712890624, + "logps/rejected": -136.8107421875, + "loss": 0.4823348045349121, + "rewards/chosen": -0.3193112850189209, + "rewards/margins": 0.17958507537841795, + "rewards/rejected": -0.49889636039733887, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 0.4868822395801544, + "kl": 1.0201635360717773, + "learning_rate": 3.5577777777777785e-06, + "logits/chosen": 37633750.4, + "logits/rejected": 38356057.6, + "logps/chosen": -176.8873779296875, + "logps/rejected": -164.86949462890624, + "loss": 0.49372262954711915, + "rewards/chosen": -0.5601509571075439, + "rewards/margins": -0.027371644973754883, + "rewards/rejected": -0.532779312133789, + "step": 900 + }, + { + "epoch": 0.72, + "eval_kl": 0.7019873857498169, + "eval_logits/chosen": 32590643.2, + "eval_logits/rejected": 32688842.752, + "eval_logps/chosen": -159.868109375, + "eval_logps/rejected": -154.68375, + "eval_loss": 0.49165070056915283, + "eval_rewards/chosen": -0.5542342529296875, + "eval_rewards/margins": 0.049869018554687505, + "eval_rewards/rejected": -0.604103271484375, + "eval_runtime": 218.2133, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.291, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 0.4481786787509918, + "kl": 0.8257962465286255, + "learning_rate": 3.535555555555556e-06, + "logits/chosen": 32320790.4, + "logits/rejected": 32438003.2, + "logps/chosen": -168.3318603515625, + "logps/rejected": -174.481884765625, + "loss": 0.4958535671234131, + "rewards/chosen": -0.454105281829834, + "rewards/margins": 0.1731292247772217, + "rewards/rejected": -0.6272345066070557, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 0.41489994525909424, + "kl": 0.9689595103263855, + "learning_rate": 3.5133333333333337e-06, + "logits/chosen": 33867084.8, + "logits/rejected": 31248268.8, + "logps/chosen": -136.1980224609375, + "logps/rejected": -151.76387939453124, + "loss": 0.4944427490234375, + "rewards/chosen": -0.40467538833618166, + "rewards/margins": 0.14626178741455076, + "rewards/rejected": -0.5509371757507324, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 0.460254430770874, + "kl": 1.2024842500686646, + "learning_rate": 3.4911111111111113e-06, + "logits/chosen": 32133318.4, + "logits/rejected": 32185379.2, + "logps/chosen": -155.15115966796876, + "logps/rejected": -149.83077392578124, + "loss": 0.48492116928100587, + "rewards/chosen": -0.21543638706207274, + "rewards/margins": 0.15435693264007572, + "rewards/rejected": -0.36979331970214846, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 0.34393665194511414, + "kl": 1.7038171291351318, + "learning_rate": 3.4688888888888894e-06, + "logits/chosen": 27802694.4, + "logits/rejected": 25992144.0, + "logps/chosen": -136.5869140625, + "logps/rejected": -152.3591064453125, + "loss": 0.482952356338501, + "rewards/chosen": -0.24357478618621825, + "rewards/margins": 0.1968345880508423, + "rewards/rejected": -0.44040937423706056, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 0.3970324397087097, + "kl": 2.81396484375, + "learning_rate": 3.446666666666667e-06, + "logits/chosen": 40630716.8, + "logits/rejected": 43665993.6, + "logps/chosen": -184.17490234375, + "logps/rejected": -158.86982421875, + "loss": 0.48198614120483396, + "rewards/chosen": -0.03642080426216125, + "rewards/margins": 0.11629058718681336, + "rewards/rejected": -0.1527113914489746, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 0.5192223787307739, + "kl": 2.0976433753967285, + "learning_rate": 3.4244444444444446e-06, + "logits/chosen": 37736128.0, + "logits/rejected": 39264816.0, + "logps/chosen": -137.5658203125, + "logps/rejected": -150.39862060546875, + "loss": 0.5065433502197265, + "rewards/chosen": -0.08580412864685058, + "rewards/margins": -0.038575989007949826, + "rewards/rejected": -0.047228139638900754, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 0.4041205942630768, + "kl": 1.893617033958435, + "learning_rate": 3.4022222222222222e-06, + "logits/chosen": 32113164.8, + "logits/rejected": 30382905.6, + "logps/chosen": -135.3074462890625, + "logps/rejected": -119.05921630859375, + "loss": 0.4857178688049316, + "rewards/chosen": -0.022751623392105104, + "rewards/margins": 0.15494421124458313, + "rewards/rejected": -0.17769583463668823, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 0.6198734641075134, + "kl": 3.3098182678222656, + "learning_rate": 3.3800000000000007e-06, + "logits/chosen": 39511980.8, + "logits/rejected": 39711251.2, + "logps/chosen": -138.0878662109375, + "logps/rejected": -157.733740234375, + "loss": 0.4792346000671387, + "rewards/chosen": 0.2472997188568115, + "rewards/margins": 0.20102626085281372, + "rewards/rejected": 0.046273458003997806, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 0.5357170104980469, + "kl": 1.1110466718673706, + "learning_rate": 3.3577777777777783e-06, + "logits/chosen": 31936358.4, + "logits/rejected": 31202771.2, + "logps/chosen": -127.25572509765625, + "logps/rejected": -136.280859375, + "loss": 0.4806090831756592, + "rewards/chosen": -0.14191631078720093, + "rewards/margins": 0.16535891294479368, + "rewards/rejected": -0.3072752237319946, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 0.351481556892395, + "kl": 1.9038498401641846, + "learning_rate": 3.335555555555556e-06, + "logits/chosen": 31806704.0, + "logits/rejected": 32803180.8, + "logps/chosen": -159.603076171875, + "logps/rejected": -122.312548828125, + "loss": 0.492017126083374, + "rewards/chosen": -0.0802042841911316, + "rewards/margins": 0.04814127683639527, + "rewards/rejected": -0.12834556102752687, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_kl": 1.7515510320663452, + "eval_logits/chosen": 34055507.968, + "eval_logits/rejected": 34077693.952, + "eval_logps/chosen": -156.563640625, + "eval_logps/rejected": -151.51715625, + "eval_loss": 0.49078983068466187, + "eval_rewards/chosen": -0.223786865234375, + "eval_rewards/margins": 0.06365646362304686, + "eval_rewards/rejected": -0.28744332885742185, + "eval_runtime": 216.801, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 0.5972615480422974, + "kl": 2.554426431655884, + "learning_rate": 3.3133333333333335e-06, + "logits/chosen": 27570451.2, + "logits/rejected": 30221734.4, + "logps/chosen": -138.31341552734375, + "logps/rejected": -188.19471435546876, + "loss": 0.49091529846191406, + "rewards/chosen": -0.16074006557464598, + "rewards/margins": 0.05920815467834473, + "rewards/rejected": -0.2199482202529907, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 0.5644449591636658, + "kl": 2.1006593704223633, + "learning_rate": 3.2911111111111116e-06, + "logits/chosen": 28881091.2, + "logits/rejected": 27956883.2, + "logps/chosen": -154.400244140625, + "logps/rejected": -139.5636474609375, + "loss": 0.4885709762573242, + "rewards/chosen": -0.201019549369812, + "rewards/margins": 0.14239611625671386, + "rewards/rejected": -0.34341566562652587, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 0.45909127593040466, + "kl": 2.598879337310791, + "learning_rate": 3.268888888888889e-06, + "logits/chosen": 34286569.6, + "logits/rejected": 33405510.4, + "logps/chosen": -155.141162109375, + "logps/rejected": -153.61441650390626, + "loss": 0.47780580520629884, + "rewards/chosen": -0.026760125160217287, + "rewards/margins": 0.17421259880065917, + "rewards/rejected": -0.20097272396087645, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 0.5554538369178772, + "kl": 2.448212146759033, + "learning_rate": 3.2466666666666668e-06, + "logits/chosen": 27163843.2, + "logits/rejected": 26525179.2, + "logps/chosen": -192.26566162109376, + "logps/rejected": -115.9719970703125, + "loss": 0.5046756744384766, + "rewards/chosen": -0.2157804250717163, + "rewards/margins": -0.09053788185119627, + "rewards/rejected": -0.12524254322052003, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 0.535012423992157, + "kl": 1.9927467107772827, + "learning_rate": 3.2244444444444444e-06, + "logits/chosen": 29665126.4, + "logits/rejected": 27342956.8, + "logps/chosen": -129.9255859375, + "logps/rejected": -160.8191162109375, + "loss": 0.4967525005340576, + "rewards/chosen": -0.1964455485343933, + "rewards/margins": 0.04946266412734987, + "rewards/rejected": -0.24590821266174318, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 0.4275314509868622, + "kl": 1.8325145244598389, + "learning_rate": 3.202222222222223e-06, + "logits/chosen": 38900185.6, + "logits/rejected": 36465756.8, + "logps/chosen": -173.99815673828124, + "logps/rejected": -189.41026611328124, + "loss": 0.5163179874420166, + "rewards/chosen": -0.341221284866333, + "rewards/margins": -0.10804271697998047, + "rewards/rejected": -0.23317856788635255, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 0.6077404022216797, + "kl": 1.2542213201522827, + "learning_rate": 3.1800000000000005e-06, + "logits/chosen": 30413292.8, + "logits/rejected": 31516124.8, + "logps/chosen": -146.35406494140625, + "logps/rejected": -171.661767578125, + "loss": 0.4878209590911865, + "rewards/chosen": -0.4057271957397461, + "rewards/margins": 0.26455159187316896, + "rewards/rejected": -0.670278787612915, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 0.3333641290664673, + "kl": 0.8504716753959656, + "learning_rate": 3.157777777777778e-06, + "logits/chosen": 33478700.8, + "logits/rejected": 35287001.6, + "logps/chosen": -159.8537109375, + "logps/rejected": -140.7849609375, + "loss": 0.5070839405059815, + "rewards/chosen": -0.5834408283233643, + "rewards/margins": -0.13464021682739263, + "rewards/rejected": -0.44880061149597167, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 0.5745656490325928, + "kl": 0.5619686245918274, + "learning_rate": 3.1355555555555557e-06, + "logits/chosen": 25375937.6, + "logits/rejected": 24836934.4, + "logps/chosen": -150.52469482421876, + "logps/rejected": -119.2016357421875, + "loss": 0.5091001033782959, + "rewards/chosen": -0.6672951698303222, + "rewards/margins": -0.07665328979492181, + "rewards/rejected": -0.5906418800354004, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 0.5211692452430725, + "kl": 0.7837439775466919, + "learning_rate": 3.1133333333333337e-06, + "logits/chosen": 32551961.6, + "logits/rejected": 31246240.0, + "logps/chosen": -141.6515625, + "logps/rejected": -178.54500732421874, + "loss": 0.4797823429107666, + "rewards/chosen": -0.5657515525817871, + "rewards/margins": 0.2927797317504883, + "rewards/rejected": -0.8585312843322754, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_kl": 1.0299346446990967, + "eval_logits/chosen": 32200159.232, + "eval_logits/rejected": 32317042.688, + "eval_logps/chosen": -159.071015625, + "eval_logps/rejected": -154.04371875, + "eval_loss": 0.4902701675891876, + "eval_rewards/chosen": -0.4745252380371094, + "eval_rewards/margins": 0.0655753479003906, + "eval_rewards/rejected": -0.5401005859375, + "eval_runtime": 218.2058, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.291, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 0.5113421678543091, + "kl": 0.8989810943603516, + "learning_rate": 3.0911111111111114e-06, + "logits/chosen": 35005900.8, + "logits/rejected": 37000198.4, + "logps/chosen": -121.57982177734375, + "logps/rejected": -188.24630126953124, + "loss": 0.5006334781646729, + "rewards/chosen": -0.5451927661895752, + "rewards/margins": -0.006183815002441473, + "rewards/rejected": -0.5390089511871338, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 0.5302984714508057, + "kl": 1.4579006433486938, + "learning_rate": 3.068888888888889e-06, + "logits/chosen": 44859440.0, + "logits/rejected": 44371523.2, + "logps/chosen": -152.48531494140624, + "logps/rejected": -166.46834716796874, + "loss": 0.47098937034606936, + "rewards/chosen": -0.30156469345092773, + "rewards/margins": 0.31780052185058594, + "rewards/rejected": -0.6193652153015137, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 0.513234555721283, + "kl": 1.1729340553283691, + "learning_rate": 3.0466666666666666e-06, + "logits/chosen": 34804921.6, + "logits/rejected": 33861536.0, + "logps/chosen": -184.123681640625, + "logps/rejected": -151.5656005859375, + "loss": 0.48264274597167967, + "rewards/chosen": -0.47753205299377444, + "rewards/margins": 0.19833426475524896, + "rewards/rejected": -0.6758663177490234, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 0.3625507354736328, + "kl": 1.0059670209884644, + "learning_rate": 3.024444444444445e-06, + "logits/chosen": 21122052.8, + "logits/rejected": 21075241.6, + "logps/chosen": -109.50086669921875, + "logps/rejected": -147.45343017578125, + "loss": 0.4898237705230713, + "rewards/chosen": -0.47812538146972655, + "rewards/margins": 0.0868696689605713, + "rewards/rejected": -0.5649950504302979, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 0.45654433965682983, + "kl": 0.5379985570907593, + "learning_rate": 3.0022222222222227e-06, + "logits/chosen": 41951734.4, + "logits/rejected": 39602163.2, + "logps/chosen": -224.10224609375, + "logps/rejected": -172.74881591796876, + "loss": 0.4965188980102539, + "rewards/chosen": -1.0065871238708497, + "rewards/margins": 0.07849445343017569, + "rewards/rejected": -1.0850815773010254, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 0.4175960123538971, + "kl": 0.7397834062576294, + "learning_rate": 2.9800000000000003e-06, + "logits/chosen": 25937947.2, + "logits/rejected": 23997976.0, + "logps/chosen": -139.0525634765625, + "logps/rejected": -172.82266845703126, + "loss": 0.46753606796264646, + "rewards/chosen": -0.6469098091125488, + "rewards/margins": 0.5304314613342286, + "rewards/rejected": -1.1773412704467774, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 0.6416438221931458, + "kl": 1.0863409042358398, + "learning_rate": 2.957777777777778e-06, + "logits/chosen": 27452134.4, + "logits/rejected": 28027673.6, + "logps/chosen": -141.15223388671876, + "logps/rejected": -165.84906005859375, + "loss": 0.492138671875, + "rewards/chosen": -0.9567946434020996, + "rewards/margins": -0.08803501129150393, + "rewards/rejected": -0.8687596321105957, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 0.4831324517726898, + "kl": 0.9691106081008911, + "learning_rate": 2.935555555555556e-06, + "logits/chosen": 29437625.6, + "logits/rejected": 27526915.2, + "logps/chosen": -141.1181884765625, + "logps/rejected": -170.26187744140626, + "loss": 0.48889832496643065, + "rewards/chosen": -0.5727379322052002, + "rewards/margins": 0.27677369117736816, + "rewards/rejected": -0.8495116233825684, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 0.4612303078174591, + "kl": 0.5348154306411743, + "learning_rate": 2.9133333333333335e-06, + "logits/chosen": 29246425.6, + "logits/rejected": 24839766.4, + "logps/chosen": -184.67042236328126, + "logps/rejected": -187.88712158203126, + "loss": 0.4651207447052002, + "rewards/chosen": -0.6563633918762207, + "rewards/margins": 0.6019566535949706, + "rewards/rejected": -1.2583200454711914, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 0.4897603392601013, + "kl": 0.6600741147994995, + "learning_rate": 2.891111111111111e-06, + "logits/chosen": 29597916.8, + "logits/rejected": 28004249.6, + "logps/chosen": -146.8487060546875, + "logps/rejected": -160.24705810546874, + "loss": 0.4915929794311523, + "rewards/chosen": -0.6603847980499268, + "rewards/margins": 0.15054879188537595, + "rewards/rejected": -0.8109335899353027, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_kl": 0.6639278531074524, + "eval_logits/chosen": 30372524.032, + "eval_logits/rejected": 30547062.784, + "eval_logps/chosen": -161.525796875, + "eval_logps/rejected": -156.5376875, + "eval_loss": 0.4897628426551819, + "eval_rewards/chosen": -0.720002685546875, + "eval_rewards/margins": 0.06949468994140628, + "eval_rewards/rejected": -0.7894973754882812, + "eval_runtime": 218.5755, + "eval_samples_per_second": 4.575, + "eval_steps_per_second": 2.288, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 0.48334673047065735, + "kl": 0.7732948064804077, + "learning_rate": 2.868888888888889e-06, + "logits/chosen": 21258230.4, + "logits/rejected": 23340913.6, + "logps/chosen": -139.87337646484374, + "logps/rejected": -144.701416015625, + "loss": 0.5003955841064454, + "rewards/chosen": -0.8982287406921386, + "rewards/margins": 0.038875579833984375, + "rewards/rejected": -0.937104320526123, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 0.492876261472702, + "kl": 1.0231386423110962, + "learning_rate": 2.8466666666666672e-06, + "logits/chosen": 29408150.4, + "logits/rejected": 26554420.8, + "logps/chosen": -184.746484375, + "logps/rejected": -195.84049072265626, + "loss": 0.49897193908691406, + "rewards/chosen": -0.7237229824066163, + "rewards/margins": 0.09356503486633294, + "rewards/rejected": -0.8172880172729492, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 0.4466714560985565, + "kl": 1.3907277584075928, + "learning_rate": 2.824444444444445e-06, + "logits/chosen": 26329916.8, + "logits/rejected": 26271193.6, + "logps/chosen": -150.485302734375, + "logps/rejected": -159.1055908203125, + "loss": 0.48776721954345703, + "rewards/chosen": -0.38318867683410646, + "rewards/margins": 0.07780303955078122, + "rewards/rejected": -0.4609917163848877, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 0.4158620834350586, + "kl": 1.1157363653182983, + "learning_rate": 2.8022222222222225e-06, + "logits/chosen": 27694662.4, + "logits/rejected": 27987369.6, + "logps/chosen": -125.29927978515624, + "logps/rejected": -122.66854248046874, + "loss": 0.49160265922546387, + "rewards/chosen": -0.3763638734817505, + "rewards/margins": 0.043813061714172374, + "rewards/rejected": -0.42017693519592286, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 0.5007547736167908, + "kl": 1.0412095785140991, + "learning_rate": 2.7800000000000005e-06, + "logits/chosen": 19932609.6, + "logits/rejected": 20719350.4, + "logps/chosen": -115.5499267578125, + "logps/rejected": -133.51761474609376, + "loss": 0.4874756336212158, + "rewards/chosen": -0.3205535411834717, + "rewards/margins": 0.1453540325164795, + "rewards/rejected": -0.46590757369995117, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 0.5230170488357544, + "kl": 1.1391807794570923, + "learning_rate": 2.757777777777778e-06, + "logits/chosen": 24849233.6, + "logits/rejected": 25524444.8, + "logps/chosen": -152.53955078125, + "logps/rejected": -136.38438720703124, + "loss": 0.4610313892364502, + "rewards/chosen": -0.35785841941833496, + "rewards/margins": 0.3967602729797364, + "rewards/rejected": -0.7546186923980713, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 0.44898638129234314, + "kl": 1.0509991645812988, + "learning_rate": 2.7355555555555557e-06, + "logits/chosen": 45757600.0, + "logits/rejected": 44022054.4, + "logps/chosen": -142.66414794921874, + "logps/rejected": -156.66207275390624, + "loss": 0.48042120933532717, + "rewards/chosen": -0.2633040189743042, + "rewards/margins": 0.21041772365570066, + "rewards/rejected": -0.4737217426300049, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 0.38083699345588684, + "kl": 1.3006069660186768, + "learning_rate": 2.7133333333333333e-06, + "logits/chosen": 27252787.2, + "logits/rejected": 26214958.4, + "logps/chosen": -137.90665283203126, + "logps/rejected": -148.19869384765624, + "loss": 0.4850759029388428, + "rewards/chosen": -0.6168015956878662, + "rewards/margins": 0.1662153244018555, + "rewards/rejected": -0.7830169200897217, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 0.3815905749797821, + "kl": 1.5258519649505615, + "learning_rate": 2.6911111111111114e-06, + "logits/chosen": 36098556.8, + "logits/rejected": 37203574.4, + "logps/chosen": -146.7918212890625, + "logps/rejected": -150.63316650390624, + "loss": 0.48316545486450196, + "rewards/chosen": -0.2494358777999878, + "rewards/margins": 0.2103111505508423, + "rewards/rejected": -0.4597470283508301, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 0.45923149585723877, + "kl": 0.7379667162895203, + "learning_rate": 2.6688888888888894e-06, + "logits/chosen": 35448102.4, + "logits/rejected": 36526304.0, + "logps/chosen": -146.106689453125, + "logps/rejected": -163.4516357421875, + "loss": 0.47162642478942873, + "rewards/chosen": -0.5096414089202881, + "rewards/margins": 0.40263419151306157, + "rewards/rejected": -0.9122756004333497, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_kl": 1.307568073272705, + "eval_logits/chosen": 31229739.008, + "eval_logits/rejected": 31346253.824, + "eval_logps/chosen": -159.37909375, + "eval_logps/rejected": -154.479625, + "eval_loss": 0.4888923466205597, + "eval_rewards/chosen": -0.5053312683105469, + "eval_rewards/margins": 0.07835940551757814, + "eval_rewards/rejected": -0.583690673828125, + "eval_runtime": 218.3314, + "eval_samples_per_second": 4.58, + "eval_steps_per_second": 2.29, + "step": 1300 + }, + { + "epoch": 1.048, + "grad_norm": 0.45518290996551514, + "kl": 0.6098345518112183, + "learning_rate": 2.646666666666667e-06, + "logits/chosen": 28600944.0, + "logits/rejected": 24958267.2, + "logps/chosen": -110.2119384765625, + "logps/rejected": -123.31849365234375, + "loss": 0.4656740665435791, + "rewards/chosen": -0.44736084938049314, + "rewards/margins": 0.48334193229675293, + "rewards/rejected": -0.9307027816772461, + "step": 1310 + }, + { + "epoch": 1.056, + "grad_norm": 0.4651184678077698, + "kl": 2.3091390132904053, + "learning_rate": 2.6244444444444446e-06, + "logits/chosen": 41654179.2, + "logits/rejected": 39032467.2, + "logps/chosen": -170.0236083984375, + "logps/rejected": -183.32196044921875, + "loss": 0.48522496223449707, + "rewards/chosen": -0.43915767669677735, + "rewards/margins": 0.38542776107788085, + "rewards/rejected": -0.8245854377746582, + "step": 1320 + }, + { + "epoch": 1.064, + "grad_norm": 0.5705190896987915, + "kl": 2.221473217010498, + "learning_rate": 2.6022222222222227e-06, + "logits/chosen": 37688201.6, + "logits/rejected": 36830240.0, + "logps/chosen": -155.06568603515626, + "logps/rejected": -168.05982666015626, + "loss": 0.4541748046875, + "rewards/chosen": -0.014664022624492646, + "rewards/margins": 0.44051638394594195, + "rewards/rejected": -0.45518040657043457, + "step": 1330 + }, + { + "epoch": 1.072, + "grad_norm": 0.5728591680526733, + "kl": 2.2642099857330322, + "learning_rate": 2.5800000000000003e-06, + "logits/chosen": 36680921.6, + "logits/rejected": 36696492.8, + "logps/chosen": -169.67305908203124, + "logps/rejected": -163.4990478515625, + "loss": 0.4542436122894287, + "rewards/chosen": 0.017992374300956727, + "rewards/margins": 0.4525547713041306, + "rewards/rejected": -0.43456239700317384, + "step": 1340 + }, + { + "epoch": 1.08, + "grad_norm": 0.5952552556991577, + "kl": 2.2592105865478516, + "learning_rate": 2.557777777777778e-06, + "logits/chosen": 23545342.4, + "logits/rejected": 21279006.4, + "logps/chosen": -138.18184814453124, + "logps/rejected": -171.426416015625, + "loss": 0.47548651695251465, + "rewards/chosen": -0.38828775882720945, + "rewards/margins": 0.30096304416656494, + "rewards/rejected": -0.6892508029937744, + "step": 1350 + }, + { + "epoch": 1.088, + "grad_norm": 0.45678818225860596, + "kl": 3.0089876651763916, + "learning_rate": 2.5355555555555555e-06, + "logits/chosen": 34513894.4, + "logits/rejected": 33449926.4, + "logps/chosen": -128.89869384765626, + "logps/rejected": -136.55162353515624, + "loss": 0.4745296001434326, + "rewards/chosen": 0.16677324771881102, + "rewards/margins": 0.25756397247314455, + "rewards/rejected": -0.0907907247543335, + "step": 1360 + }, + { + "epoch": 1.096, + "grad_norm": 0.5198411345481873, + "kl": 4.8974103927612305, + "learning_rate": 2.5133333333333336e-06, + "logits/chosen": 42754617.6, + "logits/rejected": 43195552.0, + "logps/chosen": -159.500830078125, + "logps/rejected": -132.177587890625, + "loss": 0.48738694190979004, + "rewards/chosen": 0.2657592296600342, + "rewards/margins": 0.10389068126678466, + "rewards/rejected": 0.16186854839324952, + "step": 1370 + }, + { + "epoch": 1.104, + "grad_norm": 0.4035385549068451, + "kl": 4.419563293457031, + "learning_rate": 2.491111111111111e-06, + "logits/chosen": 35998966.4, + "logits/rejected": 36727424.0, + "logps/chosen": -159.00057373046874, + "logps/rejected": -138.2158935546875, + "loss": 0.4853508949279785, + "rewards/chosen": 0.31732945442199706, + "rewards/margins": 0.10866012573242187, + "rewards/rejected": 0.2086693286895752, + "step": 1380 + }, + { + "epoch": 1.112, + "grad_norm": 0.48067033290863037, + "kl": 2.848634958267212, + "learning_rate": 2.468888888888889e-06, + "logits/chosen": 39241859.2, + "logits/rejected": 40267868.8, + "logps/chosen": -141.302490234375, + "logps/rejected": -158.81297607421874, + "loss": 0.47108969688415525, + "rewards/chosen": 0.13487266302108764, + "rewards/margins": 0.21125618219375608, + "rewards/rejected": -0.07638351917266846, + "step": 1390 + }, + { + "epoch": 1.12, + "grad_norm": 0.5766560435295105, + "kl": 3.4840214252471924, + "learning_rate": 2.446666666666667e-06, + "logits/chosen": 37769334.4, + "logits/rejected": 39194985.6, + "logps/chosen": -128.72220458984376, + "logps/rejected": -119.84296875, + "loss": 0.4526535987854004, + "rewards/chosen": 0.18631891012191773, + "rewards/margins": 0.41424218416213987, + "rewards/rejected": -0.22792327404022217, + "step": 1400 + }, + { + "epoch": 1.12, + "eval_kl": 2.8408021926879883, + "eval_logits/chosen": 34422374.4, + "eval_logits/rejected": 34374045.696, + "eval_logps/chosen": -154.737546875, + "eval_logps/rejected": -149.911859375, + "eval_loss": 0.48892152309417725, + "eval_rewards/chosen": -0.04117748260498047, + "eval_rewards/margins": 0.08573676300048827, + "eval_rewards/rejected": -0.12691424560546874, + "eval_runtime": 217.9065, + "eval_samples_per_second": 4.589, + "eval_steps_per_second": 2.295, + "step": 1400 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.572714626789093, + "kl": 3.2733802795410156, + "learning_rate": 2.4244444444444444e-06, + "logits/chosen": 31887904.0, + "logits/rejected": 31864508.8, + "logps/chosen": -143.56092529296876, + "logps/rejected": -149.83736572265624, + "loss": 0.47499790191650393, + "rewards/chosen": 0.03257654905319214, + "rewards/margins": 0.27403136491775515, + "rewards/rejected": -0.241454815864563, + "step": 1410 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.41845089197158813, + "kl": 3.525341749191284, + "learning_rate": 2.4022222222222225e-06, + "logits/chosen": 34721577.6, + "logits/rejected": 34714211.2, + "logps/chosen": -135.9596923828125, + "logps/rejected": -152.25648193359376, + "loss": 0.4618217945098877, + "rewards/chosen": 0.27119529247283936, + "rewards/margins": 0.4753966093063354, + "rewards/rejected": -0.2042013168334961, + "step": 1420 + }, + { + "epoch": 1.144, + "grad_norm": 0.48100632429122925, + "kl": 4.161208152770996, + "learning_rate": 2.38e-06, + "logits/chosen": 41846537.6, + "logits/rejected": 41130585.6, + "logps/chosen": -141.89105224609375, + "logps/rejected": -170.82178955078126, + "loss": 0.46498618125915525, + "rewards/chosen": 0.3817573547363281, + "rewards/margins": 0.3280399918556213, + "rewards/rejected": 0.05371736288070679, + "step": 1430 + }, + { + "epoch": 1.152, + "grad_norm": 0.5169075131416321, + "kl": 3.003018617630005, + "learning_rate": 2.357777777777778e-06, + "logits/chosen": 39127420.8, + "logits/rejected": 39187852.8, + "logps/chosen": -121.814013671875, + "logps/rejected": -137.307861328125, + "loss": 0.46971497535705564, + "rewards/chosen": 0.28073878288269044, + "rewards/margins": 0.24493391513824464, + "rewards/rejected": 0.0358048677444458, + "step": 1440 + }, + { + "epoch": 1.16, + "grad_norm": 0.6531253457069397, + "kl": 4.080590724945068, + "learning_rate": 2.3355555555555557e-06, + "logits/chosen": 45846016.0, + "logits/rejected": 43724588.8, + "logps/chosen": -147.2013671875, + "logps/rejected": -174.9349365234375, + "loss": 0.4397883892059326, + "rewards/chosen": 0.4346614837646484, + "rewards/margins": 0.6019469738006591, + "rewards/rejected": -0.16728549003601073, + "step": 1450 + }, + { + "epoch": 1.168, + "grad_norm": 0.5618774890899658, + "kl": 2.632253885269165, + "learning_rate": 2.3133333333333333e-06, + "logits/chosen": 29858144.0, + "logits/rejected": 30215900.8, + "logps/chosen": -140.875927734375, + "logps/rejected": -135.17412109375, + "loss": 0.4855960369110107, + "rewards/chosen": -0.033119755983352664, + "rewards/margins": 0.22187880873680116, + "rewards/rejected": -0.2549985647201538, + "step": 1460 + }, + { + "epoch": 1.176, + "grad_norm": 0.5618053674697876, + "kl": 3.8460822105407715, + "learning_rate": 2.2911111111111114e-06, + "logits/chosen": 33933001.6, + "logits/rejected": 33417001.6, + "logps/chosen": -98.64682006835938, + "logps/rejected": -126.3620849609375, + "loss": 0.4694389343261719, + "rewards/chosen": 0.3202403783798218, + "rewards/margins": 0.30994352102279665, + "rewards/rejected": 0.010296857357025147, + "step": 1470 + }, + { + "epoch": 1.184, + "grad_norm": 0.49939385056495667, + "kl": 3.8765969276428223, + "learning_rate": 2.268888888888889e-06, + "logits/chosen": 45282752.0, + "logits/rejected": 45168672.0, + "logps/chosen": -182.675830078125, + "logps/rejected": -164.2170166015625, + "loss": 0.4553979396820068, + "rewards/chosen": 0.2684901714324951, + "rewards/margins": 0.3095468133687973, + "rewards/rejected": -0.041056641936302186, + "step": 1480 + }, + { + "epoch": 1.192, + "grad_norm": 0.4833432734012604, + "kl": 3.6524147987365723, + "learning_rate": 2.2466666666666666e-06, + "logits/chosen": 44457616.0, + "logits/rejected": 45819459.2, + "logps/chosen": -174.73040771484375, + "logps/rejected": -167.15947265625, + "loss": 0.4712826728820801, + "rewards/chosen": 0.22984566688537597, + "rewards/margins": 0.2501527413725853, + "rewards/rejected": -0.02030707448720932, + "step": 1490 + }, + { + "epoch": 1.2, + "grad_norm": 0.5918501019477844, + "kl": 3.85776948928833, + "learning_rate": 2.2244444444444447e-06, + "logits/chosen": 35046502.4, + "logits/rejected": 33714608.0, + "logps/chosen": -171.797265625, + "logps/rejected": -147.64517822265626, + "loss": 0.4212610721588135, + "rewards/chosen": 0.4938655376434326, + "rewards/margins": 0.7197992086410523, + "rewards/rejected": -0.22593367099761963, + "step": 1500 + }, + { + "epoch": 1.2, + "eval_kl": 3.5718271732330322, + "eval_logits/chosen": 36649410.56, + "eval_logits/rejected": 36563263.488, + "eval_logps/chosen": -152.817171875, + "eval_logps/rejected": -148.086484375, + "eval_loss": 0.4883860945701599, + "eval_rewards/chosen": 0.15086082458496095, + "eval_rewards/margins": 0.09523786926269531, + "eval_rewards/rejected": 0.055622955322265626, + "eval_runtime": 217.5749, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 1500 + }, + { + "epoch": 1.208, + "grad_norm": 0.5604835748672485, + "kl": 2.5207982063293457, + "learning_rate": 2.2022222222222227e-06, + "logits/chosen": 28183353.6, + "logits/rejected": 26171800.0, + "logps/chosen": -127.34248046875, + "logps/rejected": -135.6572265625, + "loss": 0.44087018966674807, + "rewards/chosen": 0.20389485359191895, + "rewards/margins": 0.5874947786331177, + "rewards/rejected": -0.3835999250411987, + "step": 1510 + }, + { + "epoch": 1.216, + "grad_norm": 0.6260045170783997, + "kl": 4.300113677978516, + "learning_rate": 2.1800000000000003e-06, + "logits/chosen": 37477926.4, + "logits/rejected": 37180899.2, + "logps/chosen": -153.624072265625, + "logps/rejected": -153.7638916015625, + "loss": 0.4450747013092041, + "rewards/chosen": 0.40005855560302733, + "rewards/margins": 0.5518900513648987, + "rewards/rejected": -0.15183149576187133, + "step": 1520 + }, + { + "epoch": 1.224, + "grad_norm": 0.5863284468650818, + "kl": 3.6325111389160156, + "learning_rate": 2.157777777777778e-06, + "logits/chosen": 31546803.2, + "logits/rejected": 27054540.8, + "logps/chosen": -164.37142333984374, + "logps/rejected": -141.66112060546874, + "loss": 0.4467916488647461, + "rewards/chosen": 0.37994205951690674, + "rewards/margins": 0.499565863609314, + "rewards/rejected": -0.11962380409240722, + "step": 1530 + }, + { + "epoch": 1.232, + "grad_norm": 0.42624586820602417, + "kl": 4.787026882171631, + "learning_rate": 2.1355555555555555e-06, + "logits/chosen": 29693196.8, + "logits/rejected": 28872902.4, + "logps/chosen": -140.95440673828125, + "logps/rejected": -148.26785888671876, + "loss": 0.43952031135559083, + "rewards/chosen": 0.574599027633667, + "rewards/margins": 0.5613519787788391, + "rewards/rejected": 0.013247048854827881, + "step": 1540 + }, + { + "epoch": 1.24, + "grad_norm": 0.5493943095207214, + "kl": 3.9344754219055176, + "learning_rate": 2.1133333333333336e-06, + "logits/chosen": 40305772.8, + "logits/rejected": 40673481.6, + "logps/chosen": -165.5770751953125, + "logps/rejected": -181.73116455078124, + "loss": 0.4579936981201172, + "rewards/chosen": 0.33573935031890867, + "rewards/margins": 0.4691450238227844, + "rewards/rejected": -0.13340567350387572, + "step": 1550 + }, + { + "epoch": 1.248, + "grad_norm": 0.46218565106391907, + "kl": 4.996054172515869, + "learning_rate": 2.091111111111111e-06, + "logits/chosen": 34034489.6, + "logits/rejected": 34783824.0, + "logps/chosen": -142.5915283203125, + "logps/rejected": -154.66793212890624, + "loss": 0.4685988426208496, + "rewards/chosen": 0.42725467681884766, + "rewards/margins": 0.2892191410064697, + "rewards/rejected": 0.13803553581237793, + "step": 1560 + }, + { + "epoch": 1.256, + "grad_norm": 0.6164036989212036, + "kl": 3.2075297832489014, + "learning_rate": 2.0688888888888892e-06, + "logits/chosen": 37389939.2, + "logits/rejected": 35211542.4, + "logps/chosen": -157.27457275390626, + "logps/rejected": -135.26968994140626, + "loss": 0.4538430690765381, + "rewards/chosen": 0.30452628135681153, + "rewards/margins": 0.4211171746253968, + "rewards/rejected": -0.11659089326858521, + "step": 1570 + }, + { + "epoch": 1.264, + "grad_norm": 0.4599875807762146, + "kl": 4.0805768966674805, + "learning_rate": 2.046666666666667e-06, + "logits/chosen": 47580665.6, + "logits/rejected": 44842764.8, + "logps/chosen": -147.82244873046875, + "logps/rejected": -144.87330322265626, + "loss": 0.4251837253570557, + "rewards/chosen": 0.6019775867462158, + "rewards/margins": 0.6677440404891968, + "rewards/rejected": -0.06576645374298096, + "step": 1580 + }, + { + "epoch": 1.272, + "grad_norm": 0.5166321992874146, + "kl": 3.547309160232544, + "learning_rate": 2.024444444444445e-06, + "logits/chosen": 39700006.4, + "logits/rejected": 39951171.2, + "logps/chosen": -131.4615478515625, + "logps/rejected": -144.446240234375, + "loss": 0.4551235675811768, + "rewards/chosen": 0.3439887046813965, + "rewards/margins": 0.35865890979766846, + "rewards/rejected": -0.014670205116271973, + "step": 1590 + }, + { + "epoch": 1.28, + "grad_norm": 0.515184760093689, + "kl": 5.392228126525879, + "learning_rate": 2.0022222222222225e-06, + "logits/chosen": 37336246.4, + "logits/rejected": 35750588.8, + "logps/chosen": -129.5730224609375, + "logps/rejected": -167.88192138671874, + "loss": 0.46866717338562014, + "rewards/chosen": 0.5722126007080078, + "rewards/margins": 0.2670687913894654, + "rewards/rejected": 0.30514380931854246, + "step": 1600 + }, + { + "epoch": 1.28, + "eval_kl": 4.125787258148193, + "eval_logits/chosen": 36980781.056, + "eval_logits/rejected": 36865613.824, + "eval_logps/chosen": -152.2105625, + "eval_logps/rejected": -147.532890625, + "eval_loss": 0.4878697693347931, + "eval_rewards/chosen": 0.21152139282226562, + "eval_rewards/margins": 0.1005374526977539, + "eval_rewards/rejected": 0.11098394012451172, + "eval_runtime": 217.1057, + "eval_samples_per_second": 4.606, + "eval_steps_per_second": 2.303, + "step": 1600 + }, + { + "epoch": 1.288, + "grad_norm": 0.33879461884498596, + "kl": 3.9892711639404297, + "learning_rate": 1.98e-06, + "logits/chosen": 38520425.6, + "logits/rejected": 37891657.6, + "logps/chosen": -144.33753662109376, + "logps/rejected": -113.0916748046875, + "loss": 0.4574281215667725, + "rewards/chosen": 0.3910325288772583, + "rewards/margins": 0.4121716648340225, + "rewards/rejected": -0.02113913595676422, + "step": 1610 + }, + { + "epoch": 1.296, + "grad_norm": 0.4744361937046051, + "kl": 4.163745403289795, + "learning_rate": 1.9577777777777777e-06, + "logits/chosen": 37167372.8, + "logits/rejected": 37068976.0, + "logps/chosen": -144.5396484375, + "logps/rejected": -151.2422119140625, + "loss": 0.4770793914794922, + "rewards/chosen": 0.24615283012390138, + "rewards/margins": 0.2315782740712166, + "rewards/rejected": 0.014574556052684784, + "step": 1620 + }, + { + "epoch": 1.304, + "grad_norm": 0.6805797815322876, + "kl": 4.433796405792236, + "learning_rate": 1.9355555555555558e-06, + "logits/chosen": 33666192.0, + "logits/rejected": 33794051.2, + "logps/chosen": -152.936279296875, + "logps/rejected": -167.15966796875, + "loss": 0.4641437530517578, + "rewards/chosen": 0.3409790754318237, + "rewards/margins": 0.22917660474777218, + "rewards/rejected": 0.11180247068405151, + "step": 1630 + }, + { + "epoch": 1.312, + "grad_norm": 0.4908677637577057, + "kl": 4.329981803894043, + "learning_rate": 1.9133333333333334e-06, + "logits/chosen": 28728278.4, + "logits/rejected": 29006166.4, + "logps/chosen": -171.674267578125, + "logps/rejected": -151.65748291015626, + "loss": 0.4317901611328125, + "rewards/chosen": 0.4740726947784424, + "rewards/margins": 0.6063881039619445, + "rewards/rejected": -0.1323154091835022, + "step": 1640 + }, + { + "epoch": 1.32, + "grad_norm": 0.5628176927566528, + "kl": 3.917667865753174, + "learning_rate": 1.8911111111111114e-06, + "logits/chosen": 41002073.6, + "logits/rejected": 38901564.8, + "logps/chosen": -146.1817138671875, + "logps/rejected": -137.590380859375, + "loss": 0.44250779151916503, + "rewards/chosen": 0.36355061531066896, + "rewards/margins": 0.5350786447525024, + "rewards/rejected": -0.17152802944183348, + "step": 1650 + }, + { + "epoch": 1.328, + "grad_norm": 0.4226330816745758, + "kl": 4.557765007019043, + "learning_rate": 1.868888888888889e-06, + "logits/chosen": 41738940.8, + "logits/rejected": 39382457.6, + "logps/chosen": -144.0579833984375, + "logps/rejected": -149.93160400390624, + "loss": 0.4471259117126465, + "rewards/chosen": 0.4530649662017822, + "rewards/margins": 0.5140628039836883, + "rewards/rejected": -0.06099783778190613, + "step": 1660 + }, + { + "epoch": 1.336, + "grad_norm": 0.726682186126709, + "kl": 2.606447219848633, + "learning_rate": 1.8466666666666668e-06, + "logits/chosen": 28843798.4, + "logits/rejected": 28256953.6, + "logps/chosen": -162.8355224609375, + "logps/rejected": -135.0196044921875, + "loss": 0.4602541923522949, + "rewards/chosen": 0.034113740921020506, + "rewards/margins": 0.3491029262542724, + "rewards/rejected": -0.31498918533325193, + "step": 1670 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.4371758997440338, + "kl": 3.8837997913360596, + "learning_rate": 1.8244444444444445e-06, + "logits/chosen": 30105264.0, + "logits/rejected": 30200156.8, + "logps/chosen": -169.60362548828124, + "logps/rejected": -121.60936279296875, + "loss": 0.4653130054473877, + "rewards/chosen": 0.13300797939300538, + "rewards/margins": 0.2890047550201416, + "rewards/rejected": -0.15599677562713624, + "step": 1680 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.7084026336669922, + "kl": 3.9908013343811035, + "learning_rate": 1.8022222222222225e-06, + "logits/chosen": 40353881.6, + "logits/rejected": 41124192.0, + "logps/chosen": -145.57255859375, + "logps/rejected": -169.683984375, + "loss": 0.46573567390441895, + "rewards/chosen": 0.37704455852508545, + "rewards/margins": 0.30847471952438354, + "rewards/rejected": 0.0685698390007019, + "step": 1690 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.560882031917572, + "kl": 3.73456072807312, + "learning_rate": 1.7800000000000001e-06, + "logits/chosen": 31647622.4, + "logits/rejected": 32944563.2, + "logps/chosen": -96.58458251953125, + "logps/rejected": -156.17933349609376, + "loss": 0.47052454948425293, + "rewards/chosen": 0.26383423805236816, + "rewards/margins": 0.31988897919654846, + "rewards/rejected": -0.0560547411441803, + "step": 1700 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 4.715727806091309, + "eval_logits/chosen": 37336121.344, + "eval_logits/rejected": 37191081.984, + "eval_logps/chosen": -151.06928125, + "eval_logps/rejected": -146.466265625, + "eval_loss": 0.4871442914009094, + "eval_rewards/chosen": 0.3256500549316406, + "eval_rewards/margins": 0.1080040283203125, + "eval_rewards/rejected": 0.21764602661132812, + "eval_runtime": 217.8394, + "eval_samples_per_second": 4.591, + "eval_steps_per_second": 2.295, + "step": 1700 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.5111773014068604, + "kl": 5.973706245422363, + "learning_rate": 1.757777777777778e-06, + "logits/chosen": 43906630.4, + "logits/rejected": 41141516.8, + "logps/chosen": -158.88671875, + "logps/rejected": -179.5316650390625, + "loss": 0.4583888530731201, + "rewards/chosen": 0.5715279579162598, + "rewards/margins": 0.4317225098609924, + "rewards/rejected": 0.13980544805526735, + "step": 1710 + }, + { + "epoch": 1.376, + "grad_norm": 0.4663240611553192, + "kl": 4.6347246170043945, + "learning_rate": 1.7355555555555555e-06, + "logits/chosen": 49689798.4, + "logits/rejected": 46981590.4, + "logps/chosen": -171.759765625, + "logps/rejected": -179.522802734375, + "loss": 0.47397675514221194, + "rewards/chosen": 0.18588199615478515, + "rewards/margins": 0.39475393295288086, + "rewards/rejected": -0.2088719367980957, + "step": 1720 + }, + { + "epoch": 1.384, + "grad_norm": 0.5763538479804993, + "kl": 4.0374932289123535, + "learning_rate": 1.7133333333333336e-06, + "logits/chosen": 41851731.2, + "logits/rejected": 41044272.0, + "logps/chosen": -149.54090576171876, + "logps/rejected": -163.9952880859375, + "loss": 0.4446412563323975, + "rewards/chosen": 0.29855611324310305, + "rewards/margins": 0.5403349876403809, + "rewards/rejected": -0.24177887439727783, + "step": 1730 + }, + { + "epoch": 1.392, + "grad_norm": 0.40721967816352844, + "kl": 4.595529556274414, + "learning_rate": 1.6911111111111112e-06, + "logits/chosen": 27241497.6, + "logits/rejected": 25061027.2, + "logps/chosen": -144.4770751953125, + "logps/rejected": -148.99678955078124, + "loss": 0.441908073425293, + "rewards/chosen": 0.47647829055786134, + "rewards/margins": 0.662132203578949, + "rewards/rejected": -0.18565391302108764, + "step": 1740 + }, + { + "epoch": 1.4, + "grad_norm": 0.5112435221672058, + "kl": 6.786820411682129, + "learning_rate": 1.668888888888889e-06, + "logits/chosen": 43438329.6, + "logits/rejected": 42274822.4, + "logps/chosen": -173.9513916015625, + "logps/rejected": -128.84437255859376, + "loss": 0.44019775390625, + "rewards/chosen": 0.8855677604675293, + "rewards/margins": 0.5187113761901856, + "rewards/rejected": 0.36685638427734374, + "step": 1750 + }, + { + "epoch": 1.408, + "grad_norm": 0.6391093134880066, + "kl": 3.4943454265594482, + "learning_rate": 1.6466666666666666e-06, + "logits/chosen": 46858697.6, + "logits/rejected": 47063660.8, + "logps/chosen": -143.279248046875, + "logps/rejected": -161.62069091796874, + "loss": 0.4441429615020752, + "rewards/chosen": 0.3917685985565186, + "rewards/margins": 0.5121995925903321, + "rewards/rejected": -0.12043099403381348, + "step": 1760 + }, + { + "epoch": 1.416, + "grad_norm": 0.5220089554786682, + "kl": 5.633955955505371, + "learning_rate": 1.6244444444444447e-06, + "logits/chosen": 45115747.2, + "logits/rejected": 43860156.8, + "logps/chosen": -153.03194580078124, + "logps/rejected": -162.67841796875, + "loss": 0.4590646743774414, + "rewards/chosen": 0.6448601245880127, + "rewards/margins": 0.43718719482421875, + "rewards/rejected": 0.20767292976379395, + "step": 1770 + }, + { + "epoch": 1.424, + "grad_norm": 0.5118013024330139, + "kl": 4.387326240539551, + "learning_rate": 1.6022222222222223e-06, + "logits/chosen": 40869132.8, + "logits/rejected": 39574160.0, + "logps/chosen": -138.70467529296874, + "logps/rejected": -177.04256591796874, + "loss": 0.4694656848907471, + "rewards/chosen": 0.32769317626953126, + "rewards/margins": 0.32016055583953856, + "rewards/rejected": 0.0075326204299926754, + "step": 1780 + }, + { + "epoch": 1.432, + "grad_norm": 0.7699334025382996, + "kl": 5.964260578155518, + "learning_rate": 1.5800000000000001e-06, + "logits/chosen": 33789193.6, + "logits/rejected": 32414848.0, + "logps/chosen": -144.8453857421875, + "logps/rejected": -156.7958740234375, + "loss": 0.43700370788574217, + "rewards/chosen": 0.7253459453582763, + "rewards/margins": 0.5568214774131774, + "rewards/rejected": 0.16852446794509887, + "step": 1790 + }, + { + "epoch": 1.44, + "grad_norm": 0.569644570350647, + "kl": 4.964392185211182, + "learning_rate": 1.5577777777777777e-06, + "logits/chosen": 41654611.2, + "logits/rejected": 42416057.6, + "logps/chosen": -150.742919921875, + "logps/rejected": -167.503076171875, + "loss": 0.46422877311706545, + "rewards/chosen": 0.4968874931335449, + "rewards/margins": 0.40884148478508, + "rewards/rejected": 0.08804600834846496, + "step": 1800 + }, + { + "epoch": 1.44, + "eval_kl": 4.746038436889648, + "eval_logits/chosen": 38021050.368, + "eval_logits/rejected": 37875998.72, + "eval_logps/chosen": -150.81178125, + "eval_logps/rejected": -146.26340625, + "eval_loss": 0.48678913712501526, + "eval_rewards/chosen": 0.3514009094238281, + "eval_rewards/margins": 0.11346868896484374, + "eval_rewards/rejected": 0.23793222045898438, + "eval_runtime": 217.6136, + "eval_samples_per_second": 4.595, + "eval_steps_per_second": 2.298, + "step": 1800 + }, + { + "epoch": 1.448, + "grad_norm": 0.5262874960899353, + "kl": 4.751598358154297, + "learning_rate": 1.5355555555555558e-06, + "logits/chosen": 34054140.8, + "logits/rejected": 33053715.2, + "logps/chosen": -159.1825439453125, + "logps/rejected": -150.52947998046875, + "loss": 0.45218782424926757, + "rewards/chosen": 0.4885563850402832, + "rewards/margins": 0.42950677275657656, + "rewards/rejected": 0.059049612283706664, + "step": 1810 + }, + { + "epoch": 1.456, + "grad_norm": 0.5098588466644287, + "kl": 4.014686584472656, + "learning_rate": 1.5133333333333334e-06, + "logits/chosen": 40354809.6, + "logits/rejected": 39480486.4, + "logps/chosen": -145.28863525390625, + "logps/rejected": -135.333837890625, + "loss": 0.43676166534423827, + "rewards/chosen": 0.4947031021118164, + "rewards/margins": 0.6355077743530273, + "rewards/rejected": -0.14080467224121093, + "step": 1820 + }, + { + "epoch": 1.464, + "grad_norm": 0.448231041431427, + "kl": 5.823625564575195, + "learning_rate": 1.4911111111111112e-06, + "logits/chosen": 47668928.0, + "logits/rejected": 45680892.8, + "logps/chosen": -143.58907470703124, + "logps/rejected": -157.80914306640625, + "loss": 0.44497880935668943, + "rewards/chosen": 0.631040382385254, + "rewards/margins": 0.497272527217865, + "rewards/rejected": 0.13376785516738893, + "step": 1830 + }, + { + "epoch": 1.472, + "grad_norm": 0.5563249588012695, + "kl": 5.55691385269165, + "learning_rate": 1.468888888888889e-06, + "logits/chosen": 30804796.8, + "logits/rejected": 30690835.2, + "logps/chosen": -124.4900146484375, + "logps/rejected": -133.585205078125, + "loss": 0.444712495803833, + "rewards/chosen": 0.6877860069274903, + "rewards/margins": 0.4855673313140869, + "rewards/rejected": 0.20221867561340331, + "step": 1840 + }, + { + "epoch": 1.48, + "grad_norm": 0.6280549168586731, + "kl": 4.7264180183410645, + "learning_rate": 1.4466666666666669e-06, + "logits/chosen": 29776838.4, + "logits/rejected": 31885555.2, + "logps/chosen": -145.07203369140626, + "logps/rejected": -149.7443115234375, + "loss": 0.4761053562164307, + "rewards/chosen": 0.3966336488723755, + "rewards/margins": 0.2293717384338379, + "rewards/rejected": 0.1672619104385376, + "step": 1850 + }, + { + "epoch": 1.488, + "grad_norm": 0.5127621293067932, + "kl": 6.063115119934082, + "learning_rate": 1.4244444444444447e-06, + "logits/chosen": 46517398.4, + "logits/rejected": 43311113.6, + "logps/chosen": -189.47987060546876, + "logps/rejected": -170.6665283203125, + "loss": 0.42708525657653806, + "rewards/chosen": 0.7080463409423828, + "rewards/margins": 0.7631270289421082, + "rewards/rejected": -0.05508068799972534, + "step": 1860 + }, + { + "epoch": 1.496, + "grad_norm": 0.5822389125823975, + "kl": 4.272950649261475, + "learning_rate": 1.4022222222222223e-06, + "logits/chosen": 25695438.4, + "logits/rejected": 24746280.0, + "logps/chosen": -137.86600341796876, + "logps/rejected": -137.41337890625, + "loss": 0.4546250343322754, + "rewards/chosen": 0.42215428352355955, + "rewards/margins": 0.4513491034507751, + "rewards/rejected": -0.029194819927215575, + "step": 1870 + }, + { + "epoch": 1.504, + "grad_norm": 0.5575308799743652, + "kl": 5.757713794708252, + "learning_rate": 1.3800000000000001e-06, + "logits/chosen": 39392422.4, + "logits/rejected": 40771721.6, + "logps/chosen": -136.75103759765625, + "logps/rejected": -172.27413330078124, + "loss": 0.464780330657959, + "rewards/chosen": 0.6209693908691406, + "rewards/margins": 0.2982433319091797, + "rewards/rejected": 0.3227260589599609, + "step": 1880 + }, + { + "epoch": 1.512, + "grad_norm": 0.7086930274963379, + "kl": 5.1618475914001465, + "learning_rate": 1.357777777777778e-06, + "logits/chosen": 40175395.2, + "logits/rejected": 39745542.4, + "logps/chosen": -187.0331298828125, + "logps/rejected": -151.17127685546876, + "loss": 0.45111641883850095, + "rewards/chosen": 0.4993483543395996, + "rewards/margins": 0.43962204456329346, + "rewards/rejected": 0.05972630977630615, + "step": 1890 + }, + { + "epoch": 1.52, + "grad_norm": 0.5889289379119873, + "kl": 7.153553009033203, + "learning_rate": 1.3355555555555558e-06, + "logits/chosen": 43437193.6, + "logits/rejected": 41387232.0, + "logps/chosen": -122.5697265625, + "logps/rejected": -132.28131103515625, + "loss": 0.4659090042114258, + "rewards/chosen": 0.7089588165283203, + "rewards/margins": 0.31116189956665036, + "rewards/rejected": 0.3977969169616699, + "step": 1900 + }, + { + "epoch": 1.52, + "eval_kl": 4.760587692260742, + "eval_logits/chosen": 38344187.904, + "eval_logits/rejected": 38206853.12, + "eval_logps/chosen": -150.743921875, + "eval_logps/rejected": -146.278265625, + "eval_loss": 0.4858584403991699, + "eval_rewards/chosen": 0.3581858215332031, + "eval_rewards/margins": 0.12174023437499998, + "eval_rewards/rejected": 0.23644558715820313, + "eval_runtime": 218.092, + "eval_samples_per_second": 4.585, + "eval_steps_per_second": 2.293, + "step": 1900 + }, + { + "epoch": 1.528, + "grad_norm": 0.4249567687511444, + "kl": 6.2131242752075195, + "learning_rate": 1.3133333333333334e-06, + "logits/chosen": 57013689.6, + "logits/rejected": 56808352.0, + "logps/chosen": -164.70875244140626, + "logps/rejected": -132.06815185546876, + "loss": 0.4498802661895752, + "rewards/chosen": 0.7213836669921875, + "rewards/margins": 0.4230734348297119, + "rewards/rejected": 0.2983102321624756, + "step": 1910 + }, + { + "epoch": 1.536, + "grad_norm": 0.549889862537384, + "kl": 6.712057590484619, + "learning_rate": 1.2911111111111112e-06, + "logits/chosen": 42846454.4, + "logits/rejected": 42999248.0, + "logps/chosen": -177.64530029296876, + "logps/rejected": -184.1470458984375, + "loss": 0.43409342765808107, + "rewards/chosen": 0.728582763671875, + "rewards/margins": 0.6158596277236938, + "rewards/rejected": 0.11272313594818115, + "step": 1920 + }, + { + "epoch": 1.544, + "grad_norm": 0.5649115443229675, + "kl": 4.235246658325195, + "learning_rate": 1.268888888888889e-06, + "logits/chosen": 41943692.8, + "logits/rejected": 41250208.0, + "logps/chosen": -142.4334228515625, + "logps/rejected": -127.446923828125, + "loss": 0.4541294574737549, + "rewards/chosen": 0.42052087783813474, + "rewards/margins": 0.38021968901157377, + "rewards/rejected": 0.040301188826560974, + "step": 1930 + }, + { + "epoch": 1.552, + "grad_norm": 0.42543667554855347, + "kl": 5.350770473480225, + "learning_rate": 1.2466666666666667e-06, + "logits/chosen": 43400393.6, + "logits/rejected": 40400710.4, + "logps/chosen": -155.03704833984375, + "logps/rejected": -177.0060791015625, + "loss": 0.46230545043945315, + "rewards/chosen": 0.562045955657959, + "rewards/margins": 0.3403463363647461, + "rewards/rejected": 0.22169961929321289, + "step": 1940 + }, + { + "epoch": 1.56, + "grad_norm": 0.5134297609329224, + "kl": 4.304908752441406, + "learning_rate": 1.2244444444444445e-06, + "logits/chosen": 36589369.6, + "logits/rejected": 34298777.6, + "logps/chosen": -154.47265625, + "logps/rejected": -144.5512939453125, + "loss": 0.4730405330657959, + "rewards/chosen": 0.354435133934021, + "rewards/margins": 0.25211869478225707, + "rewards/rejected": 0.10231643915176392, + "step": 1950 + }, + { + "epoch": 1.568, + "grad_norm": 0.46365997195243835, + "kl": 6.416839599609375, + "learning_rate": 1.2022222222222223e-06, + "logits/chosen": 40177619.2, + "logits/rejected": 39313078.4, + "logps/chosen": -168.4650146484375, + "logps/rejected": -134.8406494140625, + "loss": 0.42948031425476074, + "rewards/chosen": 0.8725629806518554, + "rewards/margins": 0.6291991949081421, + "rewards/rejected": 0.24336378574371337, + "step": 1960 + }, + { + "epoch": 1.576, + "grad_norm": 0.4326087534427643, + "kl": 3.9851531982421875, + "learning_rate": 1.1800000000000001e-06, + "logits/chosen": 32055046.4, + "logits/rejected": 32860064.0, + "logps/chosen": -120.6618408203125, + "logps/rejected": -125.195166015625, + "loss": 0.4756101131439209, + "rewards/chosen": 0.14163222312927246, + "rewards/margins": 0.17141112685203552, + "rewards/rejected": -0.02977890372276306, + "step": 1970 + }, + { + "epoch": 1.584, + "grad_norm": 0.4545738697052002, + "kl": 5.087003231048584, + "learning_rate": 1.1577777777777778e-06, + "logits/chosen": 35810121.6, + "logits/rejected": 32762137.6, + "logps/chosen": -149.88338623046874, + "logps/rejected": -155.4554443359375, + "loss": 0.45157780647277834, + "rewards/chosen": 0.6308297634124755, + "rewards/margins": 0.421593952178955, + "rewards/rejected": 0.2092358112335205, + "step": 1980 + }, + { + "epoch": 1.592, + "grad_norm": 0.5136933326721191, + "kl": 6.449606418609619, + "learning_rate": 1.1355555555555558e-06, + "logits/chosen": 35638320.0, + "logits/rejected": 33466137.6, + "logps/chosen": -152.856396484375, + "logps/rejected": -183.138525390625, + "loss": 0.4553223133087158, + "rewards/chosen": 0.6646287918090821, + "rewards/margins": 0.44561595916748054, + "rewards/rejected": 0.21901283264160157, + "step": 1990 + }, + { + "epoch": 1.6, + "grad_norm": 0.3154851198196411, + "kl": 5.555708885192871, + "learning_rate": 1.1133333333333334e-06, + "logits/chosen": 39483830.4, + "logits/rejected": 38031814.4, + "logps/chosen": -155.9849609375, + "logps/rejected": -183.7311279296875, + "loss": 0.4708412647247314, + "rewards/chosen": 0.4511248588562012, + "rewards/margins": 0.33574488162994387, + "rewards/rejected": 0.11537997722625733, + "step": 2000 + }, + { + "epoch": 1.6, + "eval_kl": 4.071971416473389, + "eval_logits/chosen": 37006954.496, + "eval_logits/rejected": 36902076.416, + "eval_logps/chosen": -152.112921875, + "eval_logps/rejected": -147.68203125, + "eval_loss": 0.48551830649375916, + "eval_rewards/chosen": 0.22128521728515624, + "eval_rewards/margins": 0.12521478271484374, + "eval_rewards/rejected": 0.0960704345703125, + "eval_runtime": 218.7826, + "eval_samples_per_second": 4.571, + "eval_steps_per_second": 2.285, + "step": 2000 + }, + { + "epoch": 1.608, + "grad_norm": 0.5318990349769592, + "kl": 3.604361057281494, + "learning_rate": 1.0911111111111112e-06, + "logits/chosen": 36286432.0, + "logits/rejected": 37285971.2, + "logps/chosen": -113.43314208984376, + "logps/rejected": -124.557763671875, + "loss": 0.464507007598877, + "rewards/chosen": 0.35673577785491944, + "rewards/margins": 0.2908350646495819, + "rewards/rejected": 0.06590071320533752, + "step": 2010 + }, + { + "epoch": 1.616, + "grad_norm": 0.544118344783783, + "kl": 3.670469284057617, + "learning_rate": 1.068888888888889e-06, + "logits/chosen": 45186656.0, + "logits/rejected": 45977584.0, + "logps/chosen": -139.073681640625, + "logps/rejected": -153.00821533203126, + "loss": 0.4433259963989258, + "rewards/chosen": 0.42419872283935545, + "rewards/margins": 0.5132439255714416, + "rewards/rejected": -0.08904520273208619, + "step": 2020 + }, + { + "epoch": 1.624, + "grad_norm": 0.5683560967445374, + "kl": 2.751370906829834, + "learning_rate": 1.0466666666666669e-06, + "logits/chosen": 42249769.6, + "logits/rejected": 43098508.8, + "logps/chosen": -158.5843017578125, + "logps/rejected": -144.552197265625, + "loss": 0.4399724960327148, + "rewards/chosen": 0.42113256454467773, + "rewards/margins": 0.5389934659004212, + "rewards/rejected": -0.11786090135574341, + "step": 2030 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.5233821272850037, + "kl": 2.9688010215759277, + "learning_rate": 1.0244444444444445e-06, + "logits/chosen": 31636636.8, + "logits/rejected": 27467308.8, + "logps/chosen": -143.64234619140626, + "logps/rejected": -139.1869140625, + "loss": 0.442952299118042, + "rewards/chosen": 0.18238863945007325, + "rewards/margins": 0.5509385347366333, + "rewards/rejected": -0.36854989528656007, + "step": 2040 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.6982712745666504, + "kl": 2.9157755374908447, + "learning_rate": 1.0022222222222223e-06, + "logits/chosen": 29222934.4, + "logits/rejected": 28866115.2, + "logps/chosen": -145.965185546875, + "logps/rejected": -134.71815185546876, + "loss": 0.46454343795776365, + "rewards/chosen": 0.13749135732650758, + "rewards/margins": 0.32013021707534794, + "rewards/rejected": -0.18263885974884034, + "step": 2050 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.5256077647209167, + "kl": 5.147567272186279, + "learning_rate": 9.800000000000001e-07, + "logits/chosen": 39800131.2, + "logits/rejected": 40214745.6, + "logps/chosen": -153.8827880859375, + "logps/rejected": -155.6109375, + "loss": 0.44734792709350585, + "rewards/chosen": 0.595530891418457, + "rewards/margins": 0.46844919919967654, + "rewards/rejected": 0.12708169221878052, + "step": 2060 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.5332716703414917, + "kl": 2.908353328704834, + "learning_rate": 9.57777777777778e-07, + "logits/chosen": 25019286.4, + "logits/rejected": 26015075.2, + "logps/chosen": -153.6703857421875, + "logps/rejected": -128.41676025390626, + "loss": 0.47053236961364747, + "rewards/chosen": 0.07076652646064759, + "rewards/margins": 0.24093337655067443, + "rewards/rejected": -0.17016685009002686, + "step": 2070 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.6310443878173828, + "kl": 3.8460822105407715, + "learning_rate": 9.355555555555557e-07, + "logits/chosen": 30489289.6, + "logits/rejected": 29658633.6, + "logps/chosen": -157.56915283203125, + "logps/rejected": -126.709619140625, + "loss": 0.47043805122375487, + "rewards/chosen": 0.23312182426452638, + "rewards/margins": 0.2518186703324318, + "rewards/rejected": -0.018696846067905427, + "step": 2080 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.6265914440155029, + "kl": 2.628873109817505, + "learning_rate": 9.133333333333334e-07, + "logits/chosen": 38132934.4, + "logits/rejected": 37865273.6, + "logps/chosen": -150.04840087890625, + "logps/rejected": -152.45029296875, + "loss": 0.4601998805999756, + "rewards/chosen": 0.1351819634437561, + "rewards/margins": 0.29193094968795774, + "rewards/rejected": -0.15674898624420167, + "step": 2090 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.7189147472381592, + "kl": 4.683299541473389, + "learning_rate": 8.911111111111112e-07, + "logits/chosen": 39250364.8, + "logits/rejected": 37806796.8, + "logps/chosen": -166.75601806640626, + "logps/rejected": -189.86864013671874, + "loss": 0.47463297843933105, + "rewards/chosen": 0.1896621823310852, + "rewards/margins": 0.1576364517211914, + "rewards/rejected": 0.0320257306098938, + "step": 2100 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 3.463610887527466, + "eval_logits/chosen": 35089735.68, + "eval_logits/rejected": 35033989.12, + "eval_logps/chosen": -153.677015625, + "eval_logps/rejected": -149.26678125, + "eval_loss": 0.4853117763996124, + "eval_rewards/chosen": 0.06487516784667968, + "eval_rewards/margins": 0.1272816162109375, + "eval_rewards/rejected": -0.062406448364257815, + "eval_runtime": 217.5278, + "eval_samples_per_second": 4.597, + "eval_steps_per_second": 2.299, + "step": 2100 + }, + { + "epoch": 1.688, + "grad_norm": 0.6290340423583984, + "kl": 4.315595626831055, + "learning_rate": 8.68888888888889e-07, + "logits/chosen": 31404550.4, + "logits/rejected": 29785132.8, + "logps/chosen": -161.92845458984374, + "logps/rejected": -143.69949951171876, + "loss": 0.4567877292633057, + "rewards/chosen": 0.3545663356781006, + "rewards/margins": 0.38114327788352964, + "rewards/rejected": -0.026576942205429076, + "step": 2110 + }, + { + "epoch": 1.696, + "grad_norm": 0.5943707227706909, + "kl": 2.6047091484069824, + "learning_rate": 8.466666666666668e-07, + "logits/chosen": 35495555.2, + "logits/rejected": 33073846.4, + "logps/chosen": -137.365478515625, + "logps/rejected": -147.0032958984375, + "loss": 0.4664917469024658, + "rewards/chosen": -0.16478813886642457, + "rewards/margins": 0.23700910806655884, + "rewards/rejected": -0.4017972469329834, + "step": 2120 + }, + { + "epoch": 1.704, + "grad_norm": 0.528068482875824, + "kl": 3.3957126140594482, + "learning_rate": 8.244444444444445e-07, + "logits/chosen": 31661328.0, + "logits/rejected": 30206838.4, + "logps/chosen": -162.5927978515625, + "logps/rejected": -134.8358642578125, + "loss": 0.46093249320983887, + "rewards/chosen": 0.17535465955734253, + "rewards/margins": 0.3643703818321228, + "rewards/rejected": -0.18901572227478028, + "step": 2130 + }, + { + "epoch": 1.712, + "grad_norm": 0.46279260516166687, + "kl": 2.3419876098632812, + "learning_rate": 8.022222222222223e-07, + "logits/chosen": 42605856.0, + "logits/rejected": 40943014.4, + "logps/chosen": -132.12677001953125, + "logps/rejected": -135.65135498046874, + "loss": 0.45622806549072265, + "rewards/chosen": 0.04011918306350708, + "rewards/margins": 0.4230758786201477, + "rewards/rejected": -0.38295669555664064, + "step": 2140 + }, + { + "epoch": 1.72, + "grad_norm": 0.5701712369918823, + "kl": 4.020439147949219, + "learning_rate": 7.8e-07, + "logits/chosen": 38577043.2, + "logits/rejected": 39374691.2, + "logps/chosen": -170.36212158203125, + "logps/rejected": -161.6606201171875, + "loss": 0.45922436714172366, + "rewards/chosen": 0.09237505197525024, + "rewards/margins": 0.30426751375198363, + "rewards/rejected": -0.2118924617767334, + "step": 2150 + }, + { + "epoch": 1.728, + "grad_norm": 0.5047979354858398, + "kl": 2.7111480236053467, + "learning_rate": 7.577777777777779e-07, + "logits/chosen": 33372905.6, + "logits/rejected": 33328956.8, + "logps/chosen": -163.45003662109374, + "logps/rejected": -172.47750244140624, + "loss": 0.465222692489624, + "rewards/chosen": 0.034078240394592285, + "rewards/margins": 0.4608752965927124, + "rewards/rejected": -0.42679705619812014, + "step": 2160 + }, + { + "epoch": 1.736, + "grad_norm": 0.43878477811813354, + "kl": 2.7707526683807373, + "learning_rate": 7.355555555555556e-07, + "logits/chosen": 34746457.6, + "logits/rejected": 31707171.2, + "logps/chosen": -131.509716796875, + "logps/rejected": -149.54019775390626, + "loss": 0.46474738121032716, + "rewards/chosen": 0.06415605545043945, + "rewards/margins": 0.39272706508636473, + "rewards/rejected": -0.3285710096359253, + "step": 2170 + }, + { + "epoch": 1.744, + "grad_norm": 0.5701454877853394, + "kl": 2.9775280952453613, + "learning_rate": 7.133333333333334e-07, + "logits/chosen": 31648003.2, + "logits/rejected": 31955971.2, + "logps/chosen": -151.3242919921875, + "logps/rejected": -148.98272705078125, + "loss": 0.4547208309173584, + "rewards/chosen": 0.16725053787231445, + "rewards/margins": 0.45224099159240727, + "rewards/rejected": -0.2849904537200928, + "step": 2180 + }, + { + "epoch": 1.752, + "grad_norm": 0.36173462867736816, + "kl": 3.2897815704345703, + "learning_rate": 6.911111111111111e-07, + "logits/chosen": 28085676.8, + "logits/rejected": 25880251.2, + "logps/chosen": -133.48956298828125, + "logps/rejected": -151.6326904296875, + "loss": 0.45902628898620607, + "rewards/chosen": 0.06616134643554687, + "rewards/margins": 0.43392994403839114, + "rewards/rejected": -0.36776859760284425, + "step": 2190 + }, + { + "epoch": 1.76, + "grad_norm": 0.4635393023490906, + "kl": 4.791772365570068, + "learning_rate": 6.68888888888889e-07, + "logits/chosen": 39854400.0, + "logits/rejected": 43266041.6, + "logps/chosen": -180.304736328125, + "logps/rejected": -158.53660888671874, + "loss": 0.44127936363220216, + "rewards/chosen": 0.3505941152572632, + "rewards/margins": 0.4699846982955933, + "rewards/rejected": -0.11939058303833008, + "step": 2200 + }, + { + "epoch": 1.76, + "eval_kl": 3.170285701751709, + "eval_logits/chosen": 34094372.864, + "eval_logits/rejected": 34072373.248, + "eval_logps/chosen": -154.5265625, + "eval_logps/rejected": -150.1290625, + "eval_loss": 0.48520490527153015, + "eval_rewards/chosen": -0.020077211380004883, + "eval_rewards/margins": 0.12855766105651856, + "eval_rewards/rejected": -0.14863487243652343, + "eval_runtime": 217.5482, + "eval_samples_per_second": 4.597, + "eval_steps_per_second": 2.298, + "step": 2200 + }, + { + "epoch": 1.768, + "grad_norm": 0.6011971831321716, + "kl": 3.542525053024292, + "learning_rate": 6.466666666666667e-07, + "logits/chosen": 36977337.6, + "logits/rejected": 38393292.8, + "logps/chosen": -135.9339599609375, + "logps/rejected": -150.27783203125, + "loss": 0.4888314723968506, + "rewards/chosen": 0.07737842798233033, + "rewards/margins": 0.11252884268760682, + "rewards/rejected": -0.03515041470527649, + "step": 2210 + }, + { + "epoch": 1.776, + "grad_norm": 0.5993504524230957, + "kl": 3.1693031787872314, + "learning_rate": 6.244444444444445e-07, + "logits/chosen": 30479276.8, + "logits/rejected": 29417516.8, + "logps/chosen": -132.84188232421874, + "logps/rejected": -119.2605224609375, + "loss": 0.46071271896362304, + "rewards/chosen": 0.22380545139312744, + "rewards/margins": 0.42163221836090087, + "rewards/rejected": -0.19782676696777343, + "step": 2220 + }, + { + "epoch": 1.784, + "grad_norm": 0.6010851263999939, + "kl": 5.0478620529174805, + "learning_rate": 6.022222222222223e-07, + "logits/chosen": 38324892.8, + "logits/rejected": 37791113.6, + "logps/chosen": -135.21259765625, + "logps/rejected": -157.90794677734374, + "loss": 0.44549560546875, + "rewards/chosen": 0.5348263740539551, + "rewards/margins": 0.5059731423854827, + "rewards/rejected": 0.02885323166847229, + "step": 2230 + }, + { + "epoch": 1.792, + "grad_norm": 0.6057806611061096, + "kl": 2.268434762954712, + "learning_rate": 5.800000000000001e-07, + "logits/chosen": 31744332.8, + "logits/rejected": 30286156.8, + "logps/chosen": -124.35537109375, + "logps/rejected": -136.41668701171875, + "loss": 0.44939751625061036, + "rewards/chosen": 0.14811928272247316, + "rewards/margins": 0.4689765214920044, + "rewards/rejected": -0.32085723876953126, + "step": 2240 + }, + { + "epoch": 1.8, + "grad_norm": 0.4038269817829132, + "kl": 3.5659327507019043, + "learning_rate": 5.577777777777779e-07, + "logits/chosen": 31686905.6, + "logits/rejected": 32978937.6, + "logps/chosen": -156.446826171875, + "logps/rejected": -122.14853515625, + "loss": 0.45820083618164065, + "rewards/chosen": 0.2354212760925293, + "rewards/margins": 0.3473649501800537, + "rewards/rejected": -0.11194367408752441, + "step": 2250 + }, + { + "epoch": 1.808, + "grad_norm": 0.7359764575958252, + "kl": 4.224055290222168, + "learning_rate": 5.355555555555556e-07, + "logits/chosen": 28065552.0, + "logits/rejected": 29463145.6, + "logps/chosen": -135.04927978515624, + "logps/rejected": -187.2267333984375, + "loss": 0.4682769298553467, + "rewards/chosen": 0.16567325592041016, + "rewards/margins": 0.28882311582565307, + "rewards/rejected": -0.12314985990524292, + "step": 2260 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.6170231699943542, + "kl": 3.4734268188476562, + "learning_rate": 5.133333333333334e-07, + "logits/chosen": 28978995.2, + "logits/rejected": 28156457.6, + "logps/chosen": -151.6166015625, + "logps/rejected": -139.56959228515626, + "loss": 0.4609676837921143, + "rewards/chosen": 0.0773462176322937, + "rewards/margins": 0.42135525941848756, + "rewards/rejected": -0.34400904178619385, + "step": 2270 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.5378606915473938, + "kl": 4.157721519470215, + "learning_rate": 4.911111111111112e-07, + "logits/chosen": 35223811.2, + "logits/rejected": 34011692.8, + "logps/chosen": -152.407470703125, + "logps/rejected": -153.23616943359374, + "loss": 0.4477241516113281, + "rewards/chosen": 0.24660811424255372, + "rewards/margins": 0.40975589752197267, + "rewards/rejected": -0.16314778327941895, + "step": 2280 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6521180868148804, + "kl": 3.4054481983184814, + "learning_rate": 4.688888888888889e-07, + "logits/chosen": 27037875.2, + "logits/rejected": 27271260.8, + "logps/chosen": -189.535595703125, + "logps/rejected": -115.60257568359376, + "loss": 0.47844581604003905, + "rewards/chosen": 0.05722663402557373, + "rewards/margins": 0.14552825689315796, + "rewards/rejected": -0.08830162286758422, + "step": 2290 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.6569052338600159, + "kl": 3.141408681869507, + "learning_rate": 4.466666666666667e-07, + "logits/chosen": 29871747.2, + "logits/rejected": 26977977.6, + "logps/chosen": -127.3733154296875, + "logps/rejected": -161.679931640625, + "loss": 0.464168119430542, + "rewards/chosen": 0.05878195762634277, + "rewards/margins": 0.390771484375, + "rewards/rejected": -0.3319895267486572, + "step": 2300 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.0008513927459717, + "eval_logits/chosen": 33053526.016, + "eval_logits/rejected": 33058701.312, + "eval_logps/chosen": -155.151625, + "eval_logps/rejected": -150.761421875, + "eval_loss": 0.4851257801055908, + "eval_rewards/chosen": -0.08258457946777344, + "eval_rewards/margins": 0.12928588867187502, + "eval_rewards/rejected": -0.21187046813964844, + "eval_runtime": 217.6558, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 2.297, + "step": 2300 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2300/training_args.bin b/v5/KTO/KTO_10k/lora/checkpoint-2300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a05f8383f95df104b573dd06fde1a6093711cd3 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:531e42fed31d279deeb217d9e592c58b0a48be16b726c4baaff52e99873e947a +size 5521 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/README.md b/v5/KTO/KTO_10k/lora/checkpoint-2400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/adapter_config.json b/v5/KTO/KTO_10k/lora/checkpoint-2400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e59b48ef11325fd83a0fa60f4e367a1bcacba7d --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "down_proj", + "k_proj", + "o_proj", + "v_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/adapter_model.safetensors b/v5/KTO/KTO_10k/lora/checkpoint-2400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8252139e6bdb3764fc46bd1f94118801f82b368d --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c2827c90e78c37bbcd97e34864b565ec667dc0d7775f964a9855b10e9fd9da5 +size 180385008 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/chat_template.jinja b/v5/KTO/KTO_10k/lora/checkpoint-2400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/optimizer.pt b/v5/KTO/KTO_10k/lora/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c32e829df4ae77b8463075e18dce00fdf5a13620 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6df1d36c2e32a6efd5ce47daae9c4eb66efa8a293ca8a8ba7d53f580a547c4ab +size 360902475 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/rng_state.pth b/v5/KTO/KTO_10k/lora/checkpoint-2400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1062af8f13c012194172bd08b0cf5acef1661de7 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe17a5fe328cdb084ec10afd49d2cb7bb9035b6736206385c2c19bf9f66c41 +size 14645 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/scaler.pt b/v5/KTO/KTO_10k/lora/checkpoint-2400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c83c039e83183ab3a0678557983f51465fbdff40 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7da7c5085795b13d2bf0030671cbddb9f62ae43221bf1424a3830d4cf8c19012 +size 1383 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/scheduler.pt b/v5/KTO/KTO_10k/lora/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1d88691e65d0d4106236e713fa0a62666cce623 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8bab5b2796588f9c66d131ee61764a065b38d35768afb8af36c5b0459d64a79 +size 1465 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/tokenizer.json b/v5/KTO/KTO_10k/lora/checkpoint-2400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/tokenizer_config.json b/v5/KTO/KTO_10k/lora/checkpoint-2400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/trainer_state.json b/v5/KTO/KTO_10k/lora/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b86eb1f8f9193b408f19e8e63091f73cccdfe03e --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/trainer_state.json @@ -0,0 +1,4018 @@ +{ + "best_global_step": 2400, + "best_metric": 0.13059149169921874, + "best_model_checkpoint": "output/lora/checkpoint-2400", + "epoch": 1.92, + "eval_steps": 100, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.4994136691093445, + "kl": 0.010484933853149414, + "learning_rate": 1.8e-07, + "logits/chosen": 29687939.2, + "logits/rejected": 31342233.6, + "logps/chosen": -148.9648681640625, + "logps/rejected": -128.8302734375, + "loss": 0.500147819519043, + "rewards/chosen": -0.0003900241805240512, + "rewards/margins": -0.001183443213813007, + "rewards/rejected": 0.0007934190332889556, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 0.42419925332069397, + "kl": 0.018610835075378418, + "learning_rate": 3.8e-07, + "logits/chosen": 53382841.6, + "logits/rejected": 52884211.2, + "logps/chosen": -140.02025146484374, + "logps/rejected": -151.92236328125, + "loss": 0.49989566802978513, + "rewards/chosen": 0.0010854244232177735, + "rewards/margins": 0.0008347129682078958, + "rewards/rejected": 0.0002507114550098777, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 0.41562652587890625, + "kl": 0.00999913178384304, + "learning_rate": 5.800000000000001e-07, + "logits/chosen": 34145190.4, + "logits/rejected": 34195894.4, + "logps/chosen": -131.7357177734375, + "logps/rejected": -140.3759033203125, + "loss": 0.49987101554870605, + "rewards/chosen": 0.00029232501983642576, + "rewards/margins": 0.0010309695731848477, + "rewards/rejected": -0.0007386445533484221, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 0.35795858502388, + "kl": 0.01658189296722412, + "learning_rate": 7.8e-07, + "logits/chosen": 43262694.4, + "logits/rejected": 43904278.4, + "logps/chosen": -144.2994140625, + "logps/rejected": -146.0284423828125, + "loss": 0.5001150608062744, + "rewards/chosen": -0.00019423491321504116, + "rewards/margins": -0.0009199525695294142, + "rewards/rejected": 0.000725717656314373, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.322542279958725, + "kl": 0.016057539731264114, + "learning_rate": 9.800000000000001e-07, + "logits/chosen": 43062272.0, + "logits/rejected": 44864710.4, + "logps/chosen": -141.009814453125, + "logps/rejected": -154.3311279296875, + "loss": 0.4999659538269043, + "rewards/chosen": 4.65535675175488e-05, + "rewards/margins": 0.00027224536752328276, + "rewards/rejected": -0.00022569180000573397, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 0.27746379375457764, + "kl": 0.0211088415235281, + "learning_rate": 1.1800000000000001e-06, + "logits/chosen": 36592531.2, + "logits/rejected": 34114694.4, + "logps/chosen": -105.72940673828126, + "logps/rejected": -114.016015625, + "loss": 0.4998314380645752, + "rewards/chosen": 0.0008930303156375885, + "rewards/margins": 0.0013488865923136472, + "rewards/rejected": -0.0004558562766760588, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 0.380987286567688, + "kl": 0.014461040496826172, + "learning_rate": 1.3800000000000001e-06, + "logits/chosen": 47752102.4, + "logits/rejected": 46858576.0, + "logps/chosen": -165.7050048828125, + "logps/rejected": -175.17645263671875, + "loss": 0.49965806007385255, + "rewards/chosen": -0.007297745347023011, + "rewards/margins": 0.002736319601535796, + "rewards/rejected": -0.010034064948558807, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 0.4557498097419739, + "kl": 0.016758393496274948, + "learning_rate": 1.5800000000000001e-06, + "logits/chosen": 40700441.6, + "logits/rejected": 40753952.0, + "logps/chosen": -154.99173583984376, + "logps/rejected": -163.616552734375, + "loss": 0.49955191612243655, + "rewards/chosen": -0.007268477231264114, + "rewards/margins": 0.0035857379436492927, + "rewards/rejected": -0.010854215174913407, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 0.3776198923587799, + "kl": 0.04920945316553116, + "learning_rate": 1.7800000000000001e-06, + "logits/chosen": 43998704.0, + "logits/rejected": 44111488.0, + "logps/chosen": -169.890185546875, + "logps/rejected": -159.26253662109374, + "loss": 0.4991014003753662, + "rewards/chosen": -0.0037218812853097917, + "rewards/margins": 0.007189888134598732, + "rewards/rejected": -0.010911769419908523, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 0.45459961891174316, + "kl": 0.10221505165100098, + "learning_rate": 1.98e-06, + "logits/chosen": 27590704.0, + "logits/rejected": 27196054.4, + "logps/chosen": -134.2844970703125, + "logps/rejected": -164.56478271484374, + "loss": 0.4994335651397705, + "rewards/chosen": 0.001446514017879963, + "rewards/margins": 0.004533729329705239, + "rewards/rejected": -0.0030872153118252756, + "step": 100 + }, + { + "epoch": 0.08, + "eval_kl": 0.0926995798945427, + "eval_logits/chosen": 38615707.648, + "eval_logits/rejected": 38522241.024, + "eval_logps/chosen": -154.3604375, + "eval_logps/rejected": -148.682875, + "eval_loss": 0.4999313950538635, + "eval_rewards/chosen": -0.0034661414623260497, + "eval_rewards/margins": 0.0005488345623016356, + "eval_rewards/rejected": -0.004014976024627685, + "eval_runtime": 216.3934, + "eval_samples_per_second": 4.621, + "eval_steps_per_second": 2.311, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 0.3749667704105377, + "kl": 0.09545516967773438, + "learning_rate": 2.1800000000000003e-06, + "logits/chosen": 37966393.6, + "logits/rejected": 37751027.2, + "logps/chosen": -130.5518798828125, + "logps/rejected": -135.6833740234375, + "loss": 0.4993227481842041, + "rewards/chosen": 0.001455230824649334, + "rewards/margins": 0.005419917218387127, + "rewards/rejected": -0.003964686393737793, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 0.36912816762924194, + "kl": 0.14255723357200623, + "learning_rate": 2.38e-06, + "logits/chosen": 47479664.0, + "logits/rejected": 47101081.6, + "logps/chosen": -162.19322509765624, + "logps/rejected": -133.80028076171874, + "loss": 0.5003850936889649, + "rewards/chosen": -0.0034813500940799715, + "rewards/margins": -0.0030801778659224513, + "rewards/rejected": -0.0004011722281575203, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 0.3060654103755951, + "kl": 0.3212381601333618, + "learning_rate": 2.5800000000000003e-06, + "logits/chosen": 39886729.6, + "logits/rejected": 38994944.0, + "logps/chosen": -161.928857421875, + "logps/rejected": -140.0421630859375, + "loss": 0.5001925468444824, + "rewards/chosen": 0.024501633644104005, + "rewards/margins": -0.001541826128959655, + "rewards/rejected": 0.02604345977306366, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 0.3445453345775604, + "kl": 0.48165637254714966, + "learning_rate": 2.7800000000000005e-06, + "logits/chosen": 40894547.2, + "logits/rejected": 42894540.8, + "logps/chosen": -142.19818115234375, + "logps/rejected": -157.58607177734376, + "loss": 0.5001253128051758, + "rewards/chosen": 0.04530414342880249, + "rewards/margins": -0.001001721620559691, + "rewards/rejected": 0.04630586504936218, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 0.3646848797798157, + "kl": 0.5575106143951416, + "learning_rate": 2.9800000000000003e-06, + "logits/chosen": 41080057.6, + "logits/rejected": 42315260.8, + "logps/chosen": -129.9904052734375, + "logps/rejected": -117.11707763671875, + "loss": 0.49814538955688475, + "rewards/chosen": 0.059499716758728026, + "rewards/margins": 0.014832848310470582, + "rewards/rejected": 0.044666868448257444, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 0.37343886494636536, + "kl": 0.7937558889389038, + "learning_rate": 3.1800000000000005e-06, + "logits/chosen": 34726796.8, + "logits/rejected": 35066944.0, + "logps/chosen": -143.1036376953125, + "logps/rejected": -146.66500244140624, + "loss": 0.4996847152709961, + "rewards/chosen": 0.07830544710159301, + "rewards/margins": 0.0025246202945709145, + "rewards/rejected": 0.0757808268070221, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 0.3172762095928192, + "kl": 0.9795322418212891, + "learning_rate": 3.3800000000000007e-06, + "logits/chosen": 37869379.2, + "logits/rejected": 40011753.6, + "logps/chosen": -137.67252197265626, + "logps/rejected": -149.25455322265626, + "loss": 0.4995111465454102, + "rewards/chosen": 0.0999127209186554, + "rewards/margins": 0.003919076919555661, + "rewards/rejected": 0.09599364399909974, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 0.459634393453598, + "kl": 1.297642707824707, + "learning_rate": 3.58e-06, + "logits/chosen": 44220444.8, + "logits/rejected": 45226771.2, + "logps/chosen": -144.420849609375, + "logps/rejected": -170.05146484375, + "loss": 0.5002459049224853, + "rewards/chosen": 0.12877843379974366, + "rewards/margins": -0.0019718408584594727, + "rewards/rejected": 0.13075027465820313, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 0.347683310508728, + "kl": 1.2592600584030151, + "learning_rate": 3.7800000000000002e-06, + "logits/chosen": 41769235.2, + "logits/rejected": 43331692.8, + "logps/chosen": -123.3504150390625, + "logps/rejected": -136.4183837890625, + "loss": 0.4997075080871582, + "rewards/chosen": 0.12709956169128417, + "rewards/margins": 0.0023471236228942705, + "rewards/rejected": 0.1247524380683899, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 0.46408534049987793, + "kl": 1.3921682834625244, + "learning_rate": 3.980000000000001e-06, + "logits/chosen": 47041785.6, + "logits/rejected": 48364675.2, + "logps/chosen": -150.128076171875, + "logps/rejected": -171.89765625, + "loss": 0.49930601119995116, + "rewards/chosen": 0.14199190139770507, + "rewards/margins": 0.005550038814544661, + "rewards/rejected": 0.1364418625831604, + "step": 200 + }, + { + "epoch": 0.16, + "eval_kl": 1.0393632650375366, + "eval_logits/chosen": 39075643.392, + "eval_logits/rejected": 38930210.816, + "eval_logps/chosen": -153.263515625, + "eval_logps/rejected": -147.659890625, + "eval_loss": 0.49900853633880615, + "eval_rewards/chosen": 0.10622586059570313, + "eval_rewards/margins": 0.007942695617675785, + "eval_rewards/rejected": 0.09828316497802735, + "eval_runtime": 215.9673, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 2.315, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 0.39666494727134705, + "kl": 0.7951234579086304, + "learning_rate": 4.18e-06, + "logits/chosen": 33959907.2, + "logits/rejected": 33986992.0, + "logps/chosen": -139.88677978515625, + "logps/rejected": -131.93973388671876, + "loss": 0.5003408432006836, + "rewards/chosen": 0.06579458713531494, + "rewards/margins": -0.002647107839584356, + "rewards/rejected": 0.0684416949748993, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 0.3799448311328888, + "kl": 0.6490715146064758, + "learning_rate": 4.38e-06, + "logits/chosen": 35468355.2, + "logits/rejected": 36302822.4, + "logps/chosen": -101.356298828125, + "logps/rejected": -125.962353515625, + "loss": 0.5001154899597168, + "rewards/chosen": 0.0492926150560379, + "rewards/margins": -0.0009777992963790894, + "rewards/rejected": 0.05027041435241699, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 0.41211405396461487, + "kl": 0.3460121750831604, + "learning_rate": 4.58e-06, + "logits/chosen": 47615702.4, + "logits/rejected": 46232614.4, + "logps/chosen": -185.3808837890625, + "logps/rejected": -163.7504638671875, + "loss": 0.5009187698364258, + "rewards/chosen": -0.0020151469856500626, + "rewards/margins": -0.007613314315676689, + "rewards/rejected": 0.005598167330026627, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 0.40270859003067017, + "kl": 0.5220479369163513, + "learning_rate": 4.78e-06, + "logits/chosen": 48030569.6, + "logits/rejected": 48140400.0, + "logps/chosen": -176.74349365234374, + "logps/rejected": -166.65750732421876, + "loss": 0.5001285076141357, + "rewards/chosen": 0.028535887598991394, + "rewards/margins": -0.001354834437370299, + "rewards/rejected": 0.029890722036361693, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 0.4905576705932617, + "kl": 0.5900261402130127, + "learning_rate": 4.980000000000001e-06, + "logits/chosen": 37097190.4, + "logits/rejected": 35081888.0, + "logps/chosen": -176.0585205078125, + "logps/rejected": -145.11353759765626, + "loss": 0.4949470520019531, + "rewards/chosen": 0.06773759722709656, + "rewards/margins": 0.040507239103317265, + "rewards/rejected": 0.027230358123779295, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 0.30912280082702637, + "kl": 0.5255872011184692, + "learning_rate": 4.980000000000001e-06, + "logits/chosen": 30562265.6, + "logits/rejected": 29522019.2, + "logps/chosen": -128.9729248046875, + "logps/rejected": -131.62899169921874, + "loss": 0.4973008155822754, + "rewards/chosen": 0.040848633646965025, + "rewards/margins": 0.021623241901397704, + "rewards/rejected": 0.01922539174556732, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 0.5176734328269958, + "kl": 0.9013652801513672, + "learning_rate": 4.957777777777778e-06, + "logits/chosen": 39767792.0, + "logits/rejected": 39945158.4, + "logps/chosen": -156.84248046875, + "logps/rejected": -151.7102294921875, + "loss": 0.4969136714935303, + "rewards/chosen": 0.07821747660636902, + "rewards/margins": 0.024683624505996704, + "rewards/rejected": 0.053533852100372314, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 0.4220561385154724, + "kl": 0.8817802667617798, + "learning_rate": 4.935555555555556e-06, + "logits/chosen": 33369977.6, + "logits/rejected": 27383606.4, + "logps/chosen": -167.6235595703125, + "logps/rejected": -139.73486328125, + "loss": 0.5022628784179688, + "rewards/chosen": 0.054727953672409055, + "rewards/margins": -0.018271952867507935, + "rewards/rejected": 0.07299990653991699, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 0.33811691403388977, + "kl": 1.448921799659729, + "learning_rate": 4.9133333333333334e-06, + "logits/chosen": 31531936.0, + "logits/rejected": 30661184.0, + "logps/chosen": -145.08800048828124, + "logps/rejected": -147.349755859375, + "loss": 0.49300565719604494, + "rewards/chosen": 0.1612391948699951, + "rewards/margins": 0.05618309974670409, + "rewards/rejected": 0.10505609512329102, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 0.5129542350769043, + "kl": 1.6933104991912842, + "learning_rate": 4.891111111111111e-06, + "logits/chosen": 42485971.2, + "logits/rejected": 42720950.4, + "logps/chosen": -167.75079345703125, + "logps/rejected": -179.53148193359374, + "loss": 0.4963071823120117, + "rewards/chosen": 0.118367600440979, + "rewards/margins": 0.03180532455444336, + "rewards/rejected": 0.08656227588653564, + "step": 300 + }, + { + "epoch": 0.24, + "eval_kl": 1.5601574182510376, + "eval_logits/chosen": 38297956.352, + "eval_logits/rejected": 38117695.488, + "eval_logps/chosen": -153.006140625, + "eval_logps/rejected": -147.429, + "eval_loss": 0.49868252873420715, + "eval_rewards/chosen": 0.13196340942382812, + "eval_rewards/margins": 0.010592102050781246, + "eval_rewards/rejected": 0.12137130737304687, + "eval_runtime": 217.0741, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 2.303, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 0.3847499489784241, + "kl": 1.3948395252227783, + "learning_rate": 4.8688888888888895e-06, + "logits/chosen": 33896211.2, + "logits/rejected": 34871568.0, + "logps/chosen": -145.9845458984375, + "logps/rejected": -154.91959228515626, + "loss": 0.5030938625335694, + "rewards/chosen": 0.08795046210289001, + "rewards/margins": -0.024919158220291143, + "rewards/rejected": 0.11286962032318115, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 0.618556797504425, + "kl": 0.49630022048950195, + "learning_rate": 4.846666666666667e-06, + "logits/chosen": 37342124.8, + "logits/rejected": 35182000.0, + "logps/chosen": -160.91566162109376, + "logps/rejected": -134.513427734375, + "loss": 0.5022326946258545, + "rewards/chosen": -0.059583669900894164, + "rewards/margins": -0.018618279695510866, + "rewards/rejected": -0.0409653902053833, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 0.382318377494812, + "kl": 0.8811993598937988, + "learning_rate": 4.824444444444445e-06, + "logits/chosen": 46995257.6, + "logits/rejected": 44221206.4, + "logps/chosen": -153.2612060546875, + "logps/rejected": -144.4525634765625, + "loss": 0.4899014949798584, + "rewards/chosen": 0.058102655410766604, + "rewards/margins": 0.08179453760385513, + "rewards/rejected": -0.023691882193088532, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 0.4012068510055542, + "kl": 0.9655236005783081, + "learning_rate": 4.802222222222222e-06, + "logits/chosen": 39877590.4, + "logits/rejected": 40850240.0, + "logps/chosen": -134.43511962890625, + "logps/rejected": -143.74300537109374, + "loss": 0.5008483409881592, + "rewards/chosen": 0.046630316972732545, + "rewards/margins": -0.009023183584213258, + "rewards/rejected": 0.055653500556945804, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 0.4055842161178589, + "kl": 1.7407032251358032, + "learning_rate": 4.78e-06, + "logits/chosen": 37863616.0, + "logits/rejected": 36761936.0, + "logps/chosen": -133.8212646484375, + "logps/rejected": -169.326318359375, + "loss": 0.5016643524169921, + "rewards/chosen": 0.14738692045211793, + "rewards/margins": -0.01331337690353393, + "rewards/rejected": 0.16070029735565186, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 0.4029492139816284, + "kl": 1.1468133926391602, + "learning_rate": 4.7577777777777784e-06, + "logits/chosen": 41317878.4, + "logits/rejected": 38904140.8, + "logps/chosen": -147.33363037109376, + "logps/rejected": -112.39573974609375, + "loss": 0.49462456703186036, + "rewards/chosen": 0.09142228960990906, + "rewards/margins": 0.04296924769878387, + "rewards/rejected": 0.048453041911125184, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 0.39963042736053467, + "kl": 1.3335682153701782, + "learning_rate": 4.735555555555556e-06, + "logits/chosen": 38361622.4, + "logits/rejected": 38506108.8, + "logps/chosen": -146.35006103515624, + "logps/rejected": -150.335205078125, + "loss": 0.5048986434936523, + "rewards/chosen": 0.06511063575744629, + "rewards/margins": -0.04016592502593995, + "rewards/rejected": 0.10527656078338624, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 0.5386641025543213, + "kl": 1.9048980474472046, + "learning_rate": 4.713333333333334e-06, + "logits/chosen": 34626476.8, + "logits/rejected": 35537760.0, + "logps/chosen": -154.9567626953125, + "logps/rejected": -166.59052734375, + "loss": 0.5030035495758056, + "rewards/chosen": 0.13892955780029298, + "rewards/margins": -0.02978687286376952, + "rewards/rejected": 0.1687164306640625, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 0.3963494896888733, + "kl": 1.7998809814453125, + "learning_rate": 4.691111111111111e-06, + "logits/chosen": 31470185.6, + "logits/rejected": 30747776.0, + "logps/chosen": -174.68343505859374, + "logps/rejected": -149.20201416015624, + "loss": 0.4925515174865723, + "rewards/chosen": 0.17315468788146973, + "rewards/margins": 0.05992317199707031, + "rewards/rejected": 0.11323151588439942, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 0.40272921323776245, + "kl": 1.6665403842926025, + "learning_rate": 4.66888888888889e-06, + "logits/chosen": 43372483.2, + "logits/rejected": 41547449.6, + "logps/chosen": -148.32398681640626, + "logps/rejected": -134.78739013671876, + "loss": 0.49486651420593264, + "rewards/chosen": 0.149322509765625, + "rewards/margins": 0.040551638603210455, + "rewards/rejected": 0.10877087116241455, + "step": 400 + }, + { + "epoch": 0.32, + "eval_kl": 1.792982578277588, + "eval_logits/chosen": 38918168.576, + "eval_logits/rejected": 38725652.48, + "eval_logps/chosen": -152.730328125, + "eval_logps/rejected": -147.293078125, + "eval_loss": 0.4969332814216614, + "eval_rewards/chosen": 0.15954458618164064, + "eval_rewards/margins": 0.024580596923828135, + "eval_rewards/rejected": 0.1349639892578125, + "eval_runtime": 216.6464, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 0.3303642272949219, + "kl": 2.137221336364746, + "learning_rate": 4.646666666666667e-06, + "logits/chosen": 43939001.6, + "logits/rejected": 41818220.8, + "logps/chosen": -146.33731689453126, + "logps/rejected": -147.7433349609375, + "loss": 0.4917384147644043, + "rewards/chosen": 0.22513296604156494, + "rewards/margins": 0.06730514764785767, + "rewards/rejected": 0.15782781839370727, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 0.5785346031188965, + "kl": 1.536816120147705, + "learning_rate": 4.624444444444445e-06, + "logits/chosen": 34265174.4, + "logits/rejected": 32297750.4, + "logps/chosen": -161.92572021484375, + "logps/rejected": -130.8744384765625, + "loss": 0.4967160701751709, + "rewards/chosen": 0.12509127855300903, + "rewards/margins": 0.025565683841705322, + "rewards/rejected": 0.0995255947113037, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 0.39299893379211426, + "kl": 2.4938416481018066, + "learning_rate": 4.602222222222223e-06, + "logits/chosen": 37429766.4, + "logits/rejected": 33713158.4, + "logps/chosen": -168.366845703125, + "logps/rejected": -117.99913330078125, + "loss": 0.4935178279876709, + "rewards/chosen": 0.2566863536834717, + "rewards/margins": 0.051660680770874046, + "rewards/rejected": 0.20502567291259766, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 0.6378316879272461, + "kl": 3.6217243671417236, + "learning_rate": 4.58e-06, + "logits/chosen": 43531513.6, + "logits/rejected": 45458550.4, + "logps/chosen": -145.77152099609376, + "logps/rejected": -166.725390625, + "loss": 0.5008945465087891, + "rewards/chosen": 0.3571479320526123, + "rewards/margins": -0.007279539108276389, + "rewards/rejected": 0.3644274711608887, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 0.38800859451293945, + "kl": 3.8835651874542236, + "learning_rate": 4.557777777777778e-06, + "logits/chosen": 35328048.0, + "logits/rejected": 38813721.6, + "logps/chosen": -95.403271484375, + "logps/rejected": -151.7120849609375, + "loss": 0.50106782913208, + "rewards/chosen": 0.38196592330932616, + "rewards/margins": -0.00870509147644044, + "rewards/rejected": 0.3906710147857666, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 0.454421728849411, + "kl": 4.79476261138916, + "learning_rate": 4.535555555555555e-06, + "logits/chosen": 47006140.8, + "logits/rejected": 45068256.0, + "logps/chosen": -160.01910400390625, + "logps/rejected": -175.923046875, + "loss": 0.5052647590637207, + "rewards/chosen": 0.45828795433044434, + "rewards/margins": -0.042376470565795854, + "rewards/rejected": 0.5006644248962402, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 0.846814751625061, + "kl": 3.439274311065674, + "learning_rate": 4.513333333333333e-06, + "logits/chosen": 55978662.4, + "logits/rejected": 53112982.4, + "logps/chosen": -170.11988525390626, + "logps/rejected": -174.0812744140625, + "loss": 0.4981950283050537, + "rewards/chosen": 0.3498707294464111, + "rewards/margins": 0.014589142799377453, + "rewards/rejected": 0.33528158664703367, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 0.46414715051651, + "kl": 2.857430934906006, + "learning_rate": 4.4911111111111115e-06, + "logits/chosen": 44121936.0, + "logits/rejected": 43484160.0, + "logps/chosen": -149.39083251953124, + "logps/rejected": -159.20223388671874, + "loss": 0.4906013011932373, + "rewards/chosen": 0.3135632276535034, + "rewards/margins": 0.0760336399078369, + "rewards/rejected": 0.23752958774566652, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 0.31783437728881836, + "kl": 2.6989314556121826, + "learning_rate": 4.468888888888889e-06, + "logits/chosen": 29722166.4, + "logits/rejected": 27615270.4, + "logps/chosen": -146.17584228515625, + "logps/rejected": -145.2918212890625, + "loss": 0.4850144863128662, + "rewards/chosen": 0.30660200119018555, + "rewards/margins": 0.12175897359848023, + "rewards/rejected": 0.18484302759170532, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 0.4939253032207489, + "kl": 4.809385299682617, + "learning_rate": 4.446666666666667e-06, + "logits/chosen": 45245225.6, + "logits/rejected": 41364572.8, + "logps/chosen": -177.41658935546874, + "logps/rejected": -128.30506591796876, + "loss": 0.48539199829101565, + "rewards/chosen": 0.5390491962432862, + "rewards/margins": 0.1182609081268311, + "rewards/rejected": 0.4207882881164551, + "step": 500 + }, + { + "epoch": 0.4, + "eval_kl": 4.020763874053955, + "eval_logits/chosen": 40241844.224, + "eval_logits/rejected": 39968325.632, + "eval_logps/chosen": -150.122734375, + "eval_logps/rejected": -144.86325, + "eval_loss": 0.4947924017906189, + "eval_rewards/chosen": 0.4203052978515625, + "eval_rewards/margins": 0.042358032226562536, + "eval_rewards/rejected": 0.377947265625, + "eval_runtime": 216.7408, + "eval_samples_per_second": 4.614, + "eval_steps_per_second": 2.307, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 0.414318323135376, + "kl": 3.33302640914917, + "learning_rate": 4.424444444444444e-06, + "logits/chosen": 47571436.8, + "logits/rejected": 49124124.8, + "logps/chosen": -143.7648681640625, + "logps/rejected": -157.2596435546875, + "loss": 0.4965871810913086, + "rewards/chosen": 0.34320816993713377, + "rewards/margins": 0.027533125877380327, + "rewards/rejected": 0.31567504405975344, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 0.38320228457450867, + "kl": 4.671795845031738, + "learning_rate": 4.402222222222223e-06, + "logits/chosen": 46508307.2, + "logits/rejected": 45490304.0, + "logps/chosen": -154.68175048828124, + "logps/rejected": -160.55111083984374, + "loss": 0.4931188106536865, + "rewards/chosen": 0.47988028526306153, + "rewards/margins": 0.05947685241699219, + "rewards/rejected": 0.42040343284606935, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 0.4373217821121216, + "kl": 3.6891350746154785, + "learning_rate": 4.38e-06, + "logits/chosen": 42301033.6, + "logits/rejected": 42527356.8, + "logps/chosen": -138.6637939453125, + "logps/rejected": -173.32967529296874, + "loss": 0.5058313369750976, + "rewards/chosen": 0.33178033828735354, + "rewards/margins": -0.047040796279907204, + "rewards/rejected": 0.37882113456726074, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 0.6072640419006348, + "kl": 4.442656517028809, + "learning_rate": 4.357777777777778e-06, + "logits/chosen": 34522003.2, + "logits/rejected": 34255187.2, + "logps/chosen": -147.196533203125, + "logps/rejected": -154.7218505859375, + "loss": 0.4857354640960693, + "rewards/chosen": 0.49022879600524905, + "rewards/margins": 0.11430189609527591, + "rewards/rejected": 0.37592689990997313, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 0.4359336793422699, + "kl": 3.4759514331817627, + "learning_rate": 4.3355555555555565e-06, + "logits/chosen": 41427052.8, + "logits/rejected": 42907648.0, + "logps/chosen": -152.25201416015625, + "logps/rejected": -165.486767578125, + "loss": 0.49396610260009766, + "rewards/chosen": 0.345978832244873, + "rewards/margins": 0.05630025863647459, + "rewards/rejected": 0.28967857360839844, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 0.43716976046562195, + "kl": 3.152191638946533, + "learning_rate": 4.313333333333334e-06, + "logits/chosen": 35663577.6, + "logits/rejected": 34092796.8, + "logps/chosen": -161.36358642578125, + "logps/rejected": -148.691259765625, + "loss": 0.49653072357177735, + "rewards/chosen": 0.27045164108276365, + "rewards/margins": 0.027580332756042464, + "rewards/rejected": 0.24287130832672119, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 0.387523889541626, + "kl": 2.6372287273406982, + "learning_rate": 4.291111111111112e-06, + "logits/chosen": 41382582.4, + "logits/rejected": 40126329.6, + "logps/chosen": -147.36217041015624, + "logps/rejected": -132.27440185546874, + "loss": 0.48532447814941404, + "rewards/chosen": 0.2873492479324341, + "rewards/margins": 0.12220915555953982, + "rewards/rejected": 0.1651400923728943, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 0.4191218316555023, + "kl": 3.158555507659912, + "learning_rate": 4.268888888888889e-06, + "logits/chosen": 47333145.6, + "logits/rejected": 46272729.6, + "logps/chosen": -147.29794921875, + "logps/rejected": -157.137255859375, + "loss": 0.4924956798553467, + "rewards/chosen": 0.26015233993530273, + "rewards/margins": 0.0591968059539795, + "rewards/rejected": 0.20095553398132324, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 0.4541790783405304, + "kl": 3.111989974975586, + "learning_rate": 4.246666666666667e-06, + "logits/chosen": 29866240.0, + "logits/rejected": 30473120.0, + "logps/chosen": -128.02447509765625, + "logps/rejected": -133.55704345703126, + "loss": 0.4842988967895508, + "rewards/chosen": 0.3343390941619873, + "rewards/margins": 0.12930448055267335, + "rewards/rejected": 0.20503461360931396, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 0.565047025680542, + "kl": 2.7821693420410156, + "learning_rate": 4.2244444444444446e-06, + "logits/chosen": 28686153.6, + "logits/rejected": 31275964.8, + "logps/chosen": -147.65833740234376, + "logps/rejected": -149.1046630859375, + "loss": 0.5113170146942139, + "rewards/chosen": 0.13800346851348877, + "rewards/margins": -0.09322352409362794, + "rewards/rejected": 0.2312269926071167, + "step": 600 + }, + { + "epoch": 0.48, + "eval_kl": 2.7389280796051025, + "eval_logits/chosen": 38005252.096, + "eval_logits/rejected": 37846036.48, + "eval_logps/chosen": -152.185953125, + "eval_logps/rejected": -146.89521875, + "eval_loss": 0.4950157403945923, + "eval_rewards/chosen": 0.21398320007324217, + "eval_rewards/margins": 0.03923100280761718, + "eval_rewards/rejected": 0.174752197265625, + "eval_runtime": 217.558, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 0.4400153160095215, + "kl": 3.6884047985076904, + "learning_rate": 4.202222222222222e-06, + "logits/chosen": 45485420.8, + "logits/rejected": 41585961.6, + "logps/chosen": -193.09393310546875, + "logps/rejected": -168.6018310546875, + "loss": 0.47965612411499026, + "rewards/chosen": 0.346639347076416, + "rewards/margins": 0.19525065422058108, + "rewards/rejected": 0.15138869285583495, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 0.47579634189605713, + "kl": 2.8855841159820557, + "learning_rate": 4.18e-06, + "logits/chosen": 26868339.2, + "logits/rejected": 25530107.2, + "logps/chosen": -139.2341552734375, + "logps/rejected": -135.46981201171874, + "loss": 0.48549280166625974, + "rewards/chosen": 0.2853414058685303, + "rewards/margins": 0.12018097639083863, + "rewards/rejected": 0.16516042947769166, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 0.4894777536392212, + "kl": 3.817617893218994, + "learning_rate": 4.157777777777778e-06, + "logits/chosen": 38511724.8, + "logits/rejected": 41062003.2, + "logps/chosen": -139.717919921875, + "logps/rejected": -172.0173828125, + "loss": 0.5030189037322998, + "rewards/chosen": 0.3242809772491455, + "rewards/margins": -0.024120402336120617, + "rewards/rejected": 0.3484013795852661, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 0.5884784460067749, + "kl": 3.3272690773010254, + "learning_rate": 4.135555555555556e-06, + "logits/chosen": 40902281.6, + "logits/rejected": 39306883.2, + "logps/chosen": -189.29173583984374, + "logps/rejected": -149.8311279296875, + "loss": 0.4905113220214844, + "rewards/chosen": 0.27348809242248534, + "rewards/margins": 0.07974576950073242, + "rewards/rejected": 0.19374232292175292, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 0.489397794008255, + "kl": 4.352996349334717, + "learning_rate": 4.1133333333333335e-06, + "logits/chosen": 43261625.6, + "logits/rejected": 41635296.0, + "logps/chosen": -125.68609619140625, + "logps/rejected": -132.724267578125, + "loss": 0.49439477920532227, + "rewards/chosen": 0.3973216533660889, + "rewards/margins": 0.04381968975067141, + "rewards/rejected": 0.35350196361541747, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 0.36593517661094666, + "kl": 3.356546401977539, + "learning_rate": 4.091111111111111e-06, + "logits/chosen": 56701203.2, + "logits/rejected": 55284249.6, + "logps/chosen": -168.565625, + "logps/rejected": -132.68575439453124, + "loss": 0.48746094703674314, + "rewards/chosen": 0.3356959581375122, + "rewards/margins": 0.09914519786834713, + "rewards/rejected": 0.23655076026916505, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 0.47609221935272217, + "kl": 3.9726402759552, + "learning_rate": 4.0688888888888896e-06, + "logits/chosen": 42420092.8, + "logits/rejected": 42645120.0, + "logps/chosen": -181.13988037109374, + "logps/rejected": -183.315185546875, + "loss": 0.4789764881134033, + "rewards/chosen": 0.3791257381439209, + "rewards/margins": 0.18321629762649538, + "rewards/rejected": 0.19590944051742554, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 0.479322224855423, + "kl": 2.822577953338623, + "learning_rate": 4.046666666666667e-06, + "logits/chosen": 41487219.2, + "logits/rejected": 40422083.2, + "logps/chosen": -144.38018798828125, + "logps/rejected": -126.14996337890625, + "loss": 0.49282026290893555, + "rewards/chosen": 0.22584574222564696, + "rewards/margins": 0.055848944187164296, + "rewards/rejected": 0.16999679803848267, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 0.3670179843902588, + "kl": 4.148089408874512, + "learning_rate": 4.024444444444445e-06, + "logits/chosen": 42715072.0, + "logits/rejected": 40623603.2, + "logps/chosen": -156.69072265625, + "logps/rejected": -175.76126708984376, + "loss": 0.4938004970550537, + "rewards/chosen": 0.39667787551879885, + "rewards/margins": 0.05049760341644288, + "rewards/rejected": 0.34618027210235597, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 0.41768333315849304, + "kl": 3.2628045082092285, + "learning_rate": 4.002222222222222e-06, + "logits/chosen": 37950355.2, + "logits/rejected": 34915990.4, + "logps/chosen": -155.3704833984375, + "logps/rejected": -143.3675537109375, + "loss": 0.49524383544921874, + "rewards/chosen": 0.2646515369415283, + "rewards/margins": 0.04396252632141112, + "rewards/rejected": 0.2206890106201172, + "step": 700 + }, + { + "epoch": 0.56, + "eval_kl": 3.236727237701416, + "eval_logits/chosen": 38033387.52, + "eval_logits/rejected": 37810647.04, + "eval_logps/chosen": -151.62878125, + "eval_logps/rejected": -146.479140625, + "eval_loss": 0.49332940578460693, + "eval_rewards/chosen": 0.269699462890625, + "eval_rewards/margins": 0.05334155273437502, + "eval_rewards/rejected": 0.21635791015625, + "eval_runtime": 218.185, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.292, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 0.40857982635498047, + "kl": 4.429306983947754, + "learning_rate": 3.980000000000001e-06, + "logits/chosen": 40884387.2, + "logits/rejected": 39080608.0, + "logps/chosen": -172.00384521484375, + "logps/rejected": -133.983837890625, + "loss": 0.4770909309387207, + "rewards/chosen": 0.518680477142334, + "rewards/margins": 0.18963458538055422, + "rewards/rejected": 0.3290458917617798, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 0.3682423233985901, + "kl": 3.0005943775177, + "learning_rate": 3.9577777777777785e-06, + "logits/chosen": 33681516.8, + "logits/rejected": 34946268.8, + "logps/chosen": -120.9507080078125, + "logps/rejected": -123.20738525390625, + "loss": 0.5031956672668457, + "rewards/chosen": 0.11274595260620117, + "rewards/margins": -0.05625311136245727, + "rewards/rejected": 0.16899906396865844, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 0.37147852778434753, + "kl": 3.7080981731414795, + "learning_rate": 3.935555555555556e-06, + "logits/chosen": 36905440.0, + "logits/rejected": 34017891.2, + "logps/chosen": -152.36019287109374, + "logps/rejected": -154.4943115234375, + "loss": 0.49049901962280273, + "rewards/chosen": 0.3831493616104126, + "rewards/margins": 0.07779901027679442, + "rewards/rejected": 0.30535035133361815, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 0.48657310009002686, + "kl": 4.199930191040039, + "learning_rate": 3.913333333333334e-06, + "logits/chosen": 34702265.6, + "logits/rejected": 33570732.8, + "logps/chosen": -156.0489501953125, + "logps/rejected": -182.53206787109374, + "loss": 0.4937909603118896, + "rewards/chosen": 0.3453744649887085, + "rewards/margins": 0.06571738719940184, + "rewards/rejected": 0.27965707778930665, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 0.2791324257850647, + "kl": 3.1397013664245605, + "learning_rate": 3.891111111111111e-06, + "logits/chosen": 38985750.4, + "logits/rejected": 38637244.8, + "logps/chosen": -159.29842529296874, + "logps/rejected": -183.90196533203124, + "loss": 0.5016417980194092, + "rewards/chosen": 0.11977872848510743, + "rewards/margins": 0.02148157954216004, + "rewards/rejected": 0.09829714894294739, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 0.4719991683959961, + "kl": 1.940172553062439, + "learning_rate": 3.868888888888889e-06, + "logits/chosen": 36901945.6, + "logits/rejected": 37816726.4, + "logps/chosen": -115.81767578125, + "logps/rejected": -124.9832763671875, + "loss": 0.48836345672607423, + "rewards/chosen": 0.1182823657989502, + "rewards/margins": 0.09493236243724823, + "rewards/rejected": 0.023350003361701965, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 0.46570995450019836, + "kl": 1.71030592918396, + "learning_rate": 3.8466666666666665e-06, + "logits/chosen": 45056902.4, + "logits/rejected": 45886822.4, + "logps/chosen": -142.8737060546875, + "logps/rejected": -152.980029296875, + "loss": 0.4849842071533203, + "rewards/chosen": 0.044194817543029785, + "rewards/margins": 0.13042356967926025, + "rewards/rejected": -0.08622875213623046, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 0.48490992188453674, + "kl": 0.8850091695785522, + "learning_rate": 3.824444444444444e-06, + "logits/chosen": 41897849.6, + "logits/rejected": 42659980.8, + "logps/chosen": -163.0940673828125, + "logps/rejected": -145.29967041015624, + "loss": 0.4809276103973389, + "rewards/chosen": -0.029845520853996277, + "rewards/margins": 0.16276139318943023, + "rewards/rejected": -0.1926069140434265, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 0.42963773012161255, + "kl": 1.2656173706054688, + "learning_rate": 3.8022222222222226e-06, + "logits/chosen": 31615977.6, + "logits/rejected": 27643244.8, + "logps/chosen": -146.9314453125, + "logps/rejected": -139.28326416015625, + "loss": 0.4754499912261963, + "rewards/chosen": -0.14652204513549805, + "rewards/margins": 0.23166158199310305, + "rewards/rejected": -0.3781836271286011, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 0.5815227627754211, + "kl": 1.1006227731704712, + "learning_rate": 3.7800000000000002e-06, + "logits/chosen": 28676291.2, + "logits/rejected": 28398739.2, + "logps/chosen": -149.7789306640625, + "logps/rejected": -134.99962158203124, + "loss": 0.5007228374481201, + "rewards/chosen": -0.2438833236694336, + "rewards/margins": -0.03309731483459474, + "rewards/rejected": -0.21078600883483886, + "step": 800 + }, + { + "epoch": 0.64, + "eval_kl": 1.4775981903076172, + "eval_logits/chosen": 34909265.92, + "eval_logits/rejected": 34874159.104, + "eval_logps/chosen": -156.25446875, + "eval_logps/rejected": -151.0355, + "eval_loss": 0.4928078353404999, + "eval_rewards/chosen": -0.19286886596679687, + "eval_rewards/margins": 0.04640672302246096, + "eval_rewards/rejected": -0.23927558898925783, + "eval_runtime": 217.2245, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 2.302, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 0.518290102481842, + "kl": 2.0555756092071533, + "learning_rate": 3.757777777777778e-06, + "logits/chosen": 39001305.6, + "logits/rejected": 39306675.2, + "logps/chosen": -159.09794921875, + "logps/rejected": -157.2656982421875, + "loss": 0.48754167556762695, + "rewards/chosen": 0.07401522397994995, + "rewards/margins": 0.1124086320400238, + "rewards/rejected": -0.03839340806007385, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 0.4529527425765991, + "kl": 1.2279353141784668, + "learning_rate": 3.7355555555555555e-06, + "logits/chosen": 24645734.4, + "logits/rejected": 24314422.4, + "logps/chosen": -157.53839111328125, + "logps/rejected": -129.3763427734375, + "loss": 0.5034448146820069, + "rewards/chosen": -0.31603260040283204, + "rewards/margins": -0.04990806579589846, + "rewards/rejected": -0.2661245346069336, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 0.5347335934638977, + "kl": 2.0512425899505615, + "learning_rate": 3.713333333333334e-06, + "logits/chosen": 31296140.8, + "logits/rejected": 29981024.0, + "logps/chosen": -160.424072265625, + "logps/rejected": -127.09144287109375, + "loss": 0.49833097457885744, + "rewards/chosen": -0.05236924290657043, + "rewards/margins": 0.004509323835372926, + "rewards/rejected": -0.05687856674194336, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 0.4843555688858032, + "kl": 1.5582542419433594, + "learning_rate": 3.6911111111111115e-06, + "logits/chosen": 39700444.8, + "logits/rejected": 39422995.2, + "logps/chosen": -151.80198974609374, + "logps/rejected": -151.66746826171874, + "loss": 0.4924652099609375, + "rewards/chosen": -0.040176278352737425, + "rewards/margins": 0.038288170099258424, + "rewards/rejected": -0.07846444845199585, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 0.5929602384567261, + "kl": 2.925257921218872, + "learning_rate": 3.668888888888889e-06, + "logits/chosen": 39676166.4, + "logits/rejected": 38709782.4, + "logps/chosen": -169.22021484375, + "logps/rejected": -189.6208251953125, + "loss": 0.507749605178833, + "rewards/chosen": -0.05675660371780396, + "rewards/margins": -0.11356353759765625, + "rewards/rejected": 0.05680693387985229, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 0.47086119651794434, + "kl": 2.162543296813965, + "learning_rate": 3.6466666666666668e-06, + "logits/chosen": 31780547.2, + "logits/rejected": 29934672.0, + "logps/chosen": -165.53729248046875, + "logps/rejected": -144.7294921875, + "loss": 0.48428568840026853, + "rewards/chosen": -0.0063173860311508175, + "rewards/margins": 0.12325810492038727, + "rewards/rejected": -0.12957549095153809, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 0.5226603150367737, + "kl": 1.0791276693344116, + "learning_rate": 3.624444444444445e-06, + "logits/chosen": 36146592.0, + "logits/rejected": 34014483.2, + "logps/chosen": -139.47996826171874, + "logps/rejected": -147.29366455078124, + "loss": 0.4861104965209961, + "rewards/chosen": -0.3762362003326416, + "rewards/margins": 0.0545970916748047, + "rewards/rejected": -0.4308332920074463, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 0.456878662109375, + "kl": 1.0787068605422974, + "learning_rate": 3.6022222222222224e-06, + "logits/chosen": 31733673.6, + "logits/rejected": 30545894.4, + "logps/chosen": -166.9651123046875, + "logps/rejected": -136.55260009765624, + "loss": 0.4926890850067139, + "rewards/chosen": -0.2618767499923706, + "rewards/margins": 0.09881234169006348, + "rewards/rejected": -0.36068909168243407, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 0.39478904008865356, + "kl": 0.7077828645706177, + "learning_rate": 3.58e-06, + "logits/chosen": 42203872.0, + "logits/rejected": 40975161.6, + "logps/chosen": -135.72105712890624, + "logps/rejected": -136.8107421875, + "loss": 0.4823348045349121, + "rewards/chosen": -0.3193112850189209, + "rewards/margins": 0.17958507537841795, + "rewards/rejected": -0.49889636039733887, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 0.4868822395801544, + "kl": 1.0201635360717773, + "learning_rate": 3.5577777777777785e-06, + "logits/chosen": 37633750.4, + "logits/rejected": 38356057.6, + "logps/chosen": -176.8873779296875, + "logps/rejected": -164.86949462890624, + "loss": 0.49372262954711915, + "rewards/chosen": -0.5601509571075439, + "rewards/margins": -0.027371644973754883, + "rewards/rejected": -0.532779312133789, + "step": 900 + }, + { + "epoch": 0.72, + "eval_kl": 0.7019873857498169, + "eval_logits/chosen": 32590643.2, + "eval_logits/rejected": 32688842.752, + "eval_logps/chosen": -159.868109375, + "eval_logps/rejected": -154.68375, + "eval_loss": 0.49165070056915283, + "eval_rewards/chosen": -0.5542342529296875, + "eval_rewards/margins": 0.049869018554687505, + "eval_rewards/rejected": -0.604103271484375, + "eval_runtime": 218.2133, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.291, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 0.4481786787509918, + "kl": 0.8257962465286255, + "learning_rate": 3.535555555555556e-06, + "logits/chosen": 32320790.4, + "logits/rejected": 32438003.2, + "logps/chosen": -168.3318603515625, + "logps/rejected": -174.481884765625, + "loss": 0.4958535671234131, + "rewards/chosen": -0.454105281829834, + "rewards/margins": 0.1731292247772217, + "rewards/rejected": -0.6272345066070557, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 0.41489994525909424, + "kl": 0.9689595103263855, + "learning_rate": 3.5133333333333337e-06, + "logits/chosen": 33867084.8, + "logits/rejected": 31248268.8, + "logps/chosen": -136.1980224609375, + "logps/rejected": -151.76387939453124, + "loss": 0.4944427490234375, + "rewards/chosen": -0.40467538833618166, + "rewards/margins": 0.14626178741455076, + "rewards/rejected": -0.5509371757507324, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 0.460254430770874, + "kl": 1.2024842500686646, + "learning_rate": 3.4911111111111113e-06, + "logits/chosen": 32133318.4, + "logits/rejected": 32185379.2, + "logps/chosen": -155.15115966796876, + "logps/rejected": -149.83077392578124, + "loss": 0.48492116928100587, + "rewards/chosen": -0.21543638706207274, + "rewards/margins": 0.15435693264007572, + "rewards/rejected": -0.36979331970214846, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 0.34393665194511414, + "kl": 1.7038171291351318, + "learning_rate": 3.4688888888888894e-06, + "logits/chosen": 27802694.4, + "logits/rejected": 25992144.0, + "logps/chosen": -136.5869140625, + "logps/rejected": -152.3591064453125, + "loss": 0.482952356338501, + "rewards/chosen": -0.24357478618621825, + "rewards/margins": 0.1968345880508423, + "rewards/rejected": -0.44040937423706056, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 0.3970324397087097, + "kl": 2.81396484375, + "learning_rate": 3.446666666666667e-06, + "logits/chosen": 40630716.8, + "logits/rejected": 43665993.6, + "logps/chosen": -184.17490234375, + "logps/rejected": -158.86982421875, + "loss": 0.48198614120483396, + "rewards/chosen": -0.03642080426216125, + "rewards/margins": 0.11629058718681336, + "rewards/rejected": -0.1527113914489746, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 0.5192223787307739, + "kl": 2.0976433753967285, + "learning_rate": 3.4244444444444446e-06, + "logits/chosen": 37736128.0, + "logits/rejected": 39264816.0, + "logps/chosen": -137.5658203125, + "logps/rejected": -150.39862060546875, + "loss": 0.5065433502197265, + "rewards/chosen": -0.08580412864685058, + "rewards/margins": -0.038575989007949826, + "rewards/rejected": -0.047228139638900754, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 0.4041205942630768, + "kl": 1.893617033958435, + "learning_rate": 3.4022222222222222e-06, + "logits/chosen": 32113164.8, + "logits/rejected": 30382905.6, + "logps/chosen": -135.3074462890625, + "logps/rejected": -119.05921630859375, + "loss": 0.4857178688049316, + "rewards/chosen": -0.022751623392105104, + "rewards/margins": 0.15494421124458313, + "rewards/rejected": -0.17769583463668823, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 0.6198734641075134, + "kl": 3.3098182678222656, + "learning_rate": 3.3800000000000007e-06, + "logits/chosen": 39511980.8, + "logits/rejected": 39711251.2, + "logps/chosen": -138.0878662109375, + "logps/rejected": -157.733740234375, + "loss": 0.4792346000671387, + "rewards/chosen": 0.2472997188568115, + "rewards/margins": 0.20102626085281372, + "rewards/rejected": 0.046273458003997806, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 0.5357170104980469, + "kl": 1.1110466718673706, + "learning_rate": 3.3577777777777783e-06, + "logits/chosen": 31936358.4, + "logits/rejected": 31202771.2, + "logps/chosen": -127.25572509765625, + "logps/rejected": -136.280859375, + "loss": 0.4806090831756592, + "rewards/chosen": -0.14191631078720093, + "rewards/margins": 0.16535891294479368, + "rewards/rejected": -0.3072752237319946, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 0.351481556892395, + "kl": 1.9038498401641846, + "learning_rate": 3.335555555555556e-06, + "logits/chosen": 31806704.0, + "logits/rejected": 32803180.8, + "logps/chosen": -159.603076171875, + "logps/rejected": -122.312548828125, + "loss": 0.492017126083374, + "rewards/chosen": -0.0802042841911316, + "rewards/margins": 0.04814127683639527, + "rewards/rejected": -0.12834556102752687, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_kl": 1.7515510320663452, + "eval_logits/chosen": 34055507.968, + "eval_logits/rejected": 34077693.952, + "eval_logps/chosen": -156.563640625, + "eval_logps/rejected": -151.51715625, + "eval_loss": 0.49078983068466187, + "eval_rewards/chosen": -0.223786865234375, + "eval_rewards/margins": 0.06365646362304686, + "eval_rewards/rejected": -0.28744332885742185, + "eval_runtime": 216.801, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 0.5972615480422974, + "kl": 2.554426431655884, + "learning_rate": 3.3133333333333335e-06, + "logits/chosen": 27570451.2, + "logits/rejected": 30221734.4, + "logps/chosen": -138.31341552734375, + "logps/rejected": -188.19471435546876, + "loss": 0.49091529846191406, + "rewards/chosen": -0.16074006557464598, + "rewards/margins": 0.05920815467834473, + "rewards/rejected": -0.2199482202529907, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 0.5644449591636658, + "kl": 2.1006593704223633, + "learning_rate": 3.2911111111111116e-06, + "logits/chosen": 28881091.2, + "logits/rejected": 27956883.2, + "logps/chosen": -154.400244140625, + "logps/rejected": -139.5636474609375, + "loss": 0.4885709762573242, + "rewards/chosen": -0.201019549369812, + "rewards/margins": 0.14239611625671386, + "rewards/rejected": -0.34341566562652587, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 0.45909127593040466, + "kl": 2.598879337310791, + "learning_rate": 3.268888888888889e-06, + "logits/chosen": 34286569.6, + "logits/rejected": 33405510.4, + "logps/chosen": -155.141162109375, + "logps/rejected": -153.61441650390626, + "loss": 0.47780580520629884, + "rewards/chosen": -0.026760125160217287, + "rewards/margins": 0.17421259880065917, + "rewards/rejected": -0.20097272396087645, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 0.5554538369178772, + "kl": 2.448212146759033, + "learning_rate": 3.2466666666666668e-06, + "logits/chosen": 27163843.2, + "logits/rejected": 26525179.2, + "logps/chosen": -192.26566162109376, + "logps/rejected": -115.9719970703125, + "loss": 0.5046756744384766, + "rewards/chosen": -0.2157804250717163, + "rewards/margins": -0.09053788185119627, + "rewards/rejected": -0.12524254322052003, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 0.535012423992157, + "kl": 1.9927467107772827, + "learning_rate": 3.2244444444444444e-06, + "logits/chosen": 29665126.4, + "logits/rejected": 27342956.8, + "logps/chosen": -129.9255859375, + "logps/rejected": -160.8191162109375, + "loss": 0.4967525005340576, + "rewards/chosen": -0.1964455485343933, + "rewards/margins": 0.04946266412734987, + "rewards/rejected": -0.24590821266174318, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 0.4275314509868622, + "kl": 1.8325145244598389, + "learning_rate": 3.202222222222223e-06, + "logits/chosen": 38900185.6, + "logits/rejected": 36465756.8, + "logps/chosen": -173.99815673828124, + "logps/rejected": -189.41026611328124, + "loss": 0.5163179874420166, + "rewards/chosen": -0.341221284866333, + "rewards/margins": -0.10804271697998047, + "rewards/rejected": -0.23317856788635255, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 0.6077404022216797, + "kl": 1.2542213201522827, + "learning_rate": 3.1800000000000005e-06, + "logits/chosen": 30413292.8, + "logits/rejected": 31516124.8, + "logps/chosen": -146.35406494140625, + "logps/rejected": -171.661767578125, + "loss": 0.4878209590911865, + "rewards/chosen": -0.4057271957397461, + "rewards/margins": 0.26455159187316896, + "rewards/rejected": -0.670278787612915, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 0.3333641290664673, + "kl": 0.8504716753959656, + "learning_rate": 3.157777777777778e-06, + "logits/chosen": 33478700.8, + "logits/rejected": 35287001.6, + "logps/chosen": -159.8537109375, + "logps/rejected": -140.7849609375, + "loss": 0.5070839405059815, + "rewards/chosen": -0.5834408283233643, + "rewards/margins": -0.13464021682739263, + "rewards/rejected": -0.44880061149597167, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 0.5745656490325928, + "kl": 0.5619686245918274, + "learning_rate": 3.1355555555555557e-06, + "logits/chosen": 25375937.6, + "logits/rejected": 24836934.4, + "logps/chosen": -150.52469482421876, + "logps/rejected": -119.2016357421875, + "loss": 0.5091001033782959, + "rewards/chosen": -0.6672951698303222, + "rewards/margins": -0.07665328979492181, + "rewards/rejected": -0.5906418800354004, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 0.5211692452430725, + "kl": 0.7837439775466919, + "learning_rate": 3.1133333333333337e-06, + "logits/chosen": 32551961.6, + "logits/rejected": 31246240.0, + "logps/chosen": -141.6515625, + "logps/rejected": -178.54500732421874, + "loss": 0.4797823429107666, + "rewards/chosen": -0.5657515525817871, + "rewards/margins": 0.2927797317504883, + "rewards/rejected": -0.8585312843322754, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_kl": 1.0299346446990967, + "eval_logits/chosen": 32200159.232, + "eval_logits/rejected": 32317042.688, + "eval_logps/chosen": -159.071015625, + "eval_logps/rejected": -154.04371875, + "eval_loss": 0.4902701675891876, + "eval_rewards/chosen": -0.4745252380371094, + "eval_rewards/margins": 0.0655753479003906, + "eval_rewards/rejected": -0.5401005859375, + "eval_runtime": 218.2058, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.291, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 0.5113421678543091, + "kl": 0.8989810943603516, + "learning_rate": 3.0911111111111114e-06, + "logits/chosen": 35005900.8, + "logits/rejected": 37000198.4, + "logps/chosen": -121.57982177734375, + "logps/rejected": -188.24630126953124, + "loss": 0.5006334781646729, + "rewards/chosen": -0.5451927661895752, + "rewards/margins": -0.006183815002441473, + "rewards/rejected": -0.5390089511871338, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 0.5302984714508057, + "kl": 1.4579006433486938, + "learning_rate": 3.068888888888889e-06, + "logits/chosen": 44859440.0, + "logits/rejected": 44371523.2, + "logps/chosen": -152.48531494140624, + "logps/rejected": -166.46834716796874, + "loss": 0.47098937034606936, + "rewards/chosen": -0.30156469345092773, + "rewards/margins": 0.31780052185058594, + "rewards/rejected": -0.6193652153015137, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 0.513234555721283, + "kl": 1.1729340553283691, + "learning_rate": 3.0466666666666666e-06, + "logits/chosen": 34804921.6, + "logits/rejected": 33861536.0, + "logps/chosen": -184.123681640625, + "logps/rejected": -151.5656005859375, + "loss": 0.48264274597167967, + "rewards/chosen": -0.47753205299377444, + "rewards/margins": 0.19833426475524896, + "rewards/rejected": -0.6758663177490234, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 0.3625507354736328, + "kl": 1.0059670209884644, + "learning_rate": 3.024444444444445e-06, + "logits/chosen": 21122052.8, + "logits/rejected": 21075241.6, + "logps/chosen": -109.50086669921875, + "logps/rejected": -147.45343017578125, + "loss": 0.4898237705230713, + "rewards/chosen": -0.47812538146972655, + "rewards/margins": 0.0868696689605713, + "rewards/rejected": -0.5649950504302979, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 0.45654433965682983, + "kl": 0.5379985570907593, + "learning_rate": 3.0022222222222227e-06, + "logits/chosen": 41951734.4, + "logits/rejected": 39602163.2, + "logps/chosen": -224.10224609375, + "logps/rejected": -172.74881591796876, + "loss": 0.4965188980102539, + "rewards/chosen": -1.0065871238708497, + "rewards/margins": 0.07849445343017569, + "rewards/rejected": -1.0850815773010254, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 0.4175960123538971, + "kl": 0.7397834062576294, + "learning_rate": 2.9800000000000003e-06, + "logits/chosen": 25937947.2, + "logits/rejected": 23997976.0, + "logps/chosen": -139.0525634765625, + "logps/rejected": -172.82266845703126, + "loss": 0.46753606796264646, + "rewards/chosen": -0.6469098091125488, + "rewards/margins": 0.5304314613342286, + "rewards/rejected": -1.1773412704467774, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 0.6416438221931458, + "kl": 1.0863409042358398, + "learning_rate": 2.957777777777778e-06, + "logits/chosen": 27452134.4, + "logits/rejected": 28027673.6, + "logps/chosen": -141.15223388671876, + "logps/rejected": -165.84906005859375, + "loss": 0.492138671875, + "rewards/chosen": -0.9567946434020996, + "rewards/margins": -0.08803501129150393, + "rewards/rejected": -0.8687596321105957, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 0.4831324517726898, + "kl": 0.9691106081008911, + "learning_rate": 2.935555555555556e-06, + "logits/chosen": 29437625.6, + "logits/rejected": 27526915.2, + "logps/chosen": -141.1181884765625, + "logps/rejected": -170.26187744140626, + "loss": 0.48889832496643065, + "rewards/chosen": -0.5727379322052002, + "rewards/margins": 0.27677369117736816, + "rewards/rejected": -0.8495116233825684, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 0.4612303078174591, + "kl": 0.5348154306411743, + "learning_rate": 2.9133333333333335e-06, + "logits/chosen": 29246425.6, + "logits/rejected": 24839766.4, + "logps/chosen": -184.67042236328126, + "logps/rejected": -187.88712158203126, + "loss": 0.4651207447052002, + "rewards/chosen": -0.6563633918762207, + "rewards/margins": 0.6019566535949706, + "rewards/rejected": -1.2583200454711914, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 0.4897603392601013, + "kl": 0.6600741147994995, + "learning_rate": 2.891111111111111e-06, + "logits/chosen": 29597916.8, + "logits/rejected": 28004249.6, + "logps/chosen": -146.8487060546875, + "logps/rejected": -160.24705810546874, + "loss": 0.4915929794311523, + "rewards/chosen": -0.6603847980499268, + "rewards/margins": 0.15054879188537595, + "rewards/rejected": -0.8109335899353027, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_kl": 0.6639278531074524, + "eval_logits/chosen": 30372524.032, + "eval_logits/rejected": 30547062.784, + "eval_logps/chosen": -161.525796875, + "eval_logps/rejected": -156.5376875, + "eval_loss": 0.4897628426551819, + "eval_rewards/chosen": -0.720002685546875, + "eval_rewards/margins": 0.06949468994140628, + "eval_rewards/rejected": -0.7894973754882812, + "eval_runtime": 218.5755, + "eval_samples_per_second": 4.575, + "eval_steps_per_second": 2.288, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 0.48334673047065735, + "kl": 0.7732948064804077, + "learning_rate": 2.868888888888889e-06, + "logits/chosen": 21258230.4, + "logits/rejected": 23340913.6, + "logps/chosen": -139.87337646484374, + "logps/rejected": -144.701416015625, + "loss": 0.5003955841064454, + "rewards/chosen": -0.8982287406921386, + "rewards/margins": 0.038875579833984375, + "rewards/rejected": -0.937104320526123, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 0.492876261472702, + "kl": 1.0231386423110962, + "learning_rate": 2.8466666666666672e-06, + "logits/chosen": 29408150.4, + "logits/rejected": 26554420.8, + "logps/chosen": -184.746484375, + "logps/rejected": -195.84049072265626, + "loss": 0.49897193908691406, + "rewards/chosen": -0.7237229824066163, + "rewards/margins": 0.09356503486633294, + "rewards/rejected": -0.8172880172729492, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 0.4466714560985565, + "kl": 1.3907277584075928, + "learning_rate": 2.824444444444445e-06, + "logits/chosen": 26329916.8, + "logits/rejected": 26271193.6, + "logps/chosen": -150.485302734375, + "logps/rejected": -159.1055908203125, + "loss": 0.48776721954345703, + "rewards/chosen": -0.38318867683410646, + "rewards/margins": 0.07780303955078122, + "rewards/rejected": -0.4609917163848877, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 0.4158620834350586, + "kl": 1.1157363653182983, + "learning_rate": 2.8022222222222225e-06, + "logits/chosen": 27694662.4, + "logits/rejected": 27987369.6, + "logps/chosen": -125.29927978515624, + "logps/rejected": -122.66854248046874, + "loss": 0.49160265922546387, + "rewards/chosen": -0.3763638734817505, + "rewards/margins": 0.043813061714172374, + "rewards/rejected": -0.42017693519592286, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 0.5007547736167908, + "kl": 1.0412095785140991, + "learning_rate": 2.7800000000000005e-06, + "logits/chosen": 19932609.6, + "logits/rejected": 20719350.4, + "logps/chosen": -115.5499267578125, + "logps/rejected": -133.51761474609376, + "loss": 0.4874756336212158, + "rewards/chosen": -0.3205535411834717, + "rewards/margins": 0.1453540325164795, + "rewards/rejected": -0.46590757369995117, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 0.5230170488357544, + "kl": 1.1391807794570923, + "learning_rate": 2.757777777777778e-06, + "logits/chosen": 24849233.6, + "logits/rejected": 25524444.8, + "logps/chosen": -152.53955078125, + "logps/rejected": -136.38438720703124, + "loss": 0.4610313892364502, + "rewards/chosen": -0.35785841941833496, + "rewards/margins": 0.3967602729797364, + "rewards/rejected": -0.7546186923980713, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 0.44898638129234314, + "kl": 1.0509991645812988, + "learning_rate": 2.7355555555555557e-06, + "logits/chosen": 45757600.0, + "logits/rejected": 44022054.4, + "logps/chosen": -142.66414794921874, + "logps/rejected": -156.66207275390624, + "loss": 0.48042120933532717, + "rewards/chosen": -0.2633040189743042, + "rewards/margins": 0.21041772365570066, + "rewards/rejected": -0.4737217426300049, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 0.38083699345588684, + "kl": 1.3006069660186768, + "learning_rate": 2.7133333333333333e-06, + "logits/chosen": 27252787.2, + "logits/rejected": 26214958.4, + "logps/chosen": -137.90665283203126, + "logps/rejected": -148.19869384765624, + "loss": 0.4850759029388428, + "rewards/chosen": -0.6168015956878662, + "rewards/margins": 0.1662153244018555, + "rewards/rejected": -0.7830169200897217, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 0.3815905749797821, + "kl": 1.5258519649505615, + "learning_rate": 2.6911111111111114e-06, + "logits/chosen": 36098556.8, + "logits/rejected": 37203574.4, + "logps/chosen": -146.7918212890625, + "logps/rejected": -150.63316650390624, + "loss": 0.48316545486450196, + "rewards/chosen": -0.2494358777999878, + "rewards/margins": 0.2103111505508423, + "rewards/rejected": -0.4597470283508301, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 0.45923149585723877, + "kl": 0.7379667162895203, + "learning_rate": 2.6688888888888894e-06, + "logits/chosen": 35448102.4, + "logits/rejected": 36526304.0, + "logps/chosen": -146.106689453125, + "logps/rejected": -163.4516357421875, + "loss": 0.47162642478942873, + "rewards/chosen": -0.5096414089202881, + "rewards/margins": 0.40263419151306157, + "rewards/rejected": -0.9122756004333497, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_kl": 1.307568073272705, + "eval_logits/chosen": 31229739.008, + "eval_logits/rejected": 31346253.824, + "eval_logps/chosen": -159.37909375, + "eval_logps/rejected": -154.479625, + "eval_loss": 0.4888923466205597, + "eval_rewards/chosen": -0.5053312683105469, + "eval_rewards/margins": 0.07835940551757814, + "eval_rewards/rejected": -0.583690673828125, + "eval_runtime": 218.3314, + "eval_samples_per_second": 4.58, + "eval_steps_per_second": 2.29, + "step": 1300 + }, + { + "epoch": 1.048, + "grad_norm": 0.45518290996551514, + "kl": 0.6098345518112183, + "learning_rate": 2.646666666666667e-06, + "logits/chosen": 28600944.0, + "logits/rejected": 24958267.2, + "logps/chosen": -110.2119384765625, + "logps/rejected": -123.31849365234375, + "loss": 0.4656740665435791, + "rewards/chosen": -0.44736084938049314, + "rewards/margins": 0.48334193229675293, + "rewards/rejected": -0.9307027816772461, + "step": 1310 + }, + { + "epoch": 1.056, + "grad_norm": 0.4651184678077698, + "kl": 2.3091390132904053, + "learning_rate": 2.6244444444444446e-06, + "logits/chosen": 41654179.2, + "logits/rejected": 39032467.2, + "logps/chosen": -170.0236083984375, + "logps/rejected": -183.32196044921875, + "loss": 0.48522496223449707, + "rewards/chosen": -0.43915767669677735, + "rewards/margins": 0.38542776107788085, + "rewards/rejected": -0.8245854377746582, + "step": 1320 + }, + { + "epoch": 1.064, + "grad_norm": 0.5705190896987915, + "kl": 2.221473217010498, + "learning_rate": 2.6022222222222227e-06, + "logits/chosen": 37688201.6, + "logits/rejected": 36830240.0, + "logps/chosen": -155.06568603515626, + "logps/rejected": -168.05982666015626, + "loss": 0.4541748046875, + "rewards/chosen": -0.014664022624492646, + "rewards/margins": 0.44051638394594195, + "rewards/rejected": -0.45518040657043457, + "step": 1330 + }, + { + "epoch": 1.072, + "grad_norm": 0.5728591680526733, + "kl": 2.2642099857330322, + "learning_rate": 2.5800000000000003e-06, + "logits/chosen": 36680921.6, + "logits/rejected": 36696492.8, + "logps/chosen": -169.67305908203124, + "logps/rejected": -163.4990478515625, + "loss": 0.4542436122894287, + "rewards/chosen": 0.017992374300956727, + "rewards/margins": 0.4525547713041306, + "rewards/rejected": -0.43456239700317384, + "step": 1340 + }, + { + "epoch": 1.08, + "grad_norm": 0.5952552556991577, + "kl": 2.2592105865478516, + "learning_rate": 2.557777777777778e-06, + "logits/chosen": 23545342.4, + "logits/rejected": 21279006.4, + "logps/chosen": -138.18184814453124, + "logps/rejected": -171.426416015625, + "loss": 0.47548651695251465, + "rewards/chosen": -0.38828775882720945, + "rewards/margins": 0.30096304416656494, + "rewards/rejected": -0.6892508029937744, + "step": 1350 + }, + { + "epoch": 1.088, + "grad_norm": 0.45678818225860596, + "kl": 3.0089876651763916, + "learning_rate": 2.5355555555555555e-06, + "logits/chosen": 34513894.4, + "logits/rejected": 33449926.4, + "logps/chosen": -128.89869384765626, + "logps/rejected": -136.55162353515624, + "loss": 0.4745296001434326, + "rewards/chosen": 0.16677324771881102, + "rewards/margins": 0.25756397247314455, + "rewards/rejected": -0.0907907247543335, + "step": 1360 + }, + { + "epoch": 1.096, + "grad_norm": 0.5198411345481873, + "kl": 4.8974103927612305, + "learning_rate": 2.5133333333333336e-06, + "logits/chosen": 42754617.6, + "logits/rejected": 43195552.0, + "logps/chosen": -159.500830078125, + "logps/rejected": -132.177587890625, + "loss": 0.48738694190979004, + "rewards/chosen": 0.2657592296600342, + "rewards/margins": 0.10389068126678466, + "rewards/rejected": 0.16186854839324952, + "step": 1370 + }, + { + "epoch": 1.104, + "grad_norm": 0.4035385549068451, + "kl": 4.419563293457031, + "learning_rate": 2.491111111111111e-06, + "logits/chosen": 35998966.4, + "logits/rejected": 36727424.0, + "logps/chosen": -159.00057373046874, + "logps/rejected": -138.2158935546875, + "loss": 0.4853508949279785, + "rewards/chosen": 0.31732945442199706, + "rewards/margins": 0.10866012573242187, + "rewards/rejected": 0.2086693286895752, + "step": 1380 + }, + { + "epoch": 1.112, + "grad_norm": 0.48067033290863037, + "kl": 2.848634958267212, + "learning_rate": 2.468888888888889e-06, + "logits/chosen": 39241859.2, + "logits/rejected": 40267868.8, + "logps/chosen": -141.302490234375, + "logps/rejected": -158.81297607421874, + "loss": 0.47108969688415525, + "rewards/chosen": 0.13487266302108764, + "rewards/margins": 0.21125618219375608, + "rewards/rejected": -0.07638351917266846, + "step": 1390 + }, + { + "epoch": 1.12, + "grad_norm": 0.5766560435295105, + "kl": 3.4840214252471924, + "learning_rate": 2.446666666666667e-06, + "logits/chosen": 37769334.4, + "logits/rejected": 39194985.6, + "logps/chosen": -128.72220458984376, + "logps/rejected": -119.84296875, + "loss": 0.4526535987854004, + "rewards/chosen": 0.18631891012191773, + "rewards/margins": 0.41424218416213987, + "rewards/rejected": -0.22792327404022217, + "step": 1400 + }, + { + "epoch": 1.12, + "eval_kl": 2.8408021926879883, + "eval_logits/chosen": 34422374.4, + "eval_logits/rejected": 34374045.696, + "eval_logps/chosen": -154.737546875, + "eval_logps/rejected": -149.911859375, + "eval_loss": 0.48892152309417725, + "eval_rewards/chosen": -0.04117748260498047, + "eval_rewards/margins": 0.08573676300048827, + "eval_rewards/rejected": -0.12691424560546874, + "eval_runtime": 217.9065, + "eval_samples_per_second": 4.589, + "eval_steps_per_second": 2.295, + "step": 1400 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.572714626789093, + "kl": 3.2733802795410156, + "learning_rate": 2.4244444444444444e-06, + "logits/chosen": 31887904.0, + "logits/rejected": 31864508.8, + "logps/chosen": -143.56092529296876, + "logps/rejected": -149.83736572265624, + "loss": 0.47499790191650393, + "rewards/chosen": 0.03257654905319214, + "rewards/margins": 0.27403136491775515, + "rewards/rejected": -0.241454815864563, + "step": 1410 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.41845089197158813, + "kl": 3.525341749191284, + "learning_rate": 2.4022222222222225e-06, + "logits/chosen": 34721577.6, + "logits/rejected": 34714211.2, + "logps/chosen": -135.9596923828125, + "logps/rejected": -152.25648193359376, + "loss": 0.4618217945098877, + "rewards/chosen": 0.27119529247283936, + "rewards/margins": 0.4753966093063354, + "rewards/rejected": -0.2042013168334961, + "step": 1420 + }, + { + "epoch": 1.144, + "grad_norm": 0.48100632429122925, + "kl": 4.161208152770996, + "learning_rate": 2.38e-06, + "logits/chosen": 41846537.6, + "logits/rejected": 41130585.6, + "logps/chosen": -141.89105224609375, + "logps/rejected": -170.82178955078126, + "loss": 0.46498618125915525, + "rewards/chosen": 0.3817573547363281, + "rewards/margins": 0.3280399918556213, + "rewards/rejected": 0.05371736288070679, + "step": 1430 + }, + { + "epoch": 1.152, + "grad_norm": 0.5169075131416321, + "kl": 3.003018617630005, + "learning_rate": 2.357777777777778e-06, + "logits/chosen": 39127420.8, + "logits/rejected": 39187852.8, + "logps/chosen": -121.814013671875, + "logps/rejected": -137.307861328125, + "loss": 0.46971497535705564, + "rewards/chosen": 0.28073878288269044, + "rewards/margins": 0.24493391513824464, + "rewards/rejected": 0.0358048677444458, + "step": 1440 + }, + { + "epoch": 1.16, + "grad_norm": 0.6531253457069397, + "kl": 4.080590724945068, + "learning_rate": 2.3355555555555557e-06, + "logits/chosen": 45846016.0, + "logits/rejected": 43724588.8, + "logps/chosen": -147.2013671875, + "logps/rejected": -174.9349365234375, + "loss": 0.4397883892059326, + "rewards/chosen": 0.4346614837646484, + "rewards/margins": 0.6019469738006591, + "rewards/rejected": -0.16728549003601073, + "step": 1450 + }, + { + "epoch": 1.168, + "grad_norm": 0.5618774890899658, + "kl": 2.632253885269165, + "learning_rate": 2.3133333333333333e-06, + "logits/chosen": 29858144.0, + "logits/rejected": 30215900.8, + "logps/chosen": -140.875927734375, + "logps/rejected": -135.17412109375, + "loss": 0.4855960369110107, + "rewards/chosen": -0.033119755983352664, + "rewards/margins": 0.22187880873680116, + "rewards/rejected": -0.2549985647201538, + "step": 1460 + }, + { + "epoch": 1.176, + "grad_norm": 0.5618053674697876, + "kl": 3.8460822105407715, + "learning_rate": 2.2911111111111114e-06, + "logits/chosen": 33933001.6, + "logits/rejected": 33417001.6, + "logps/chosen": -98.64682006835938, + "logps/rejected": -126.3620849609375, + "loss": 0.4694389343261719, + "rewards/chosen": 0.3202403783798218, + "rewards/margins": 0.30994352102279665, + "rewards/rejected": 0.010296857357025147, + "step": 1470 + }, + { + "epoch": 1.184, + "grad_norm": 0.49939385056495667, + "kl": 3.8765969276428223, + "learning_rate": 2.268888888888889e-06, + "logits/chosen": 45282752.0, + "logits/rejected": 45168672.0, + "logps/chosen": -182.675830078125, + "logps/rejected": -164.2170166015625, + "loss": 0.4553979396820068, + "rewards/chosen": 0.2684901714324951, + "rewards/margins": 0.3095468133687973, + "rewards/rejected": -0.041056641936302186, + "step": 1480 + }, + { + "epoch": 1.192, + "grad_norm": 0.4833432734012604, + "kl": 3.6524147987365723, + "learning_rate": 2.2466666666666666e-06, + "logits/chosen": 44457616.0, + "logits/rejected": 45819459.2, + "logps/chosen": -174.73040771484375, + "logps/rejected": -167.15947265625, + "loss": 0.4712826728820801, + "rewards/chosen": 0.22984566688537597, + "rewards/margins": 0.2501527413725853, + "rewards/rejected": -0.02030707448720932, + "step": 1490 + }, + { + "epoch": 1.2, + "grad_norm": 0.5918501019477844, + "kl": 3.85776948928833, + "learning_rate": 2.2244444444444447e-06, + "logits/chosen": 35046502.4, + "logits/rejected": 33714608.0, + "logps/chosen": -171.797265625, + "logps/rejected": -147.64517822265626, + "loss": 0.4212610721588135, + "rewards/chosen": 0.4938655376434326, + "rewards/margins": 0.7197992086410523, + "rewards/rejected": -0.22593367099761963, + "step": 1500 + }, + { + "epoch": 1.2, + "eval_kl": 3.5718271732330322, + "eval_logits/chosen": 36649410.56, + "eval_logits/rejected": 36563263.488, + "eval_logps/chosen": -152.817171875, + "eval_logps/rejected": -148.086484375, + "eval_loss": 0.4883860945701599, + "eval_rewards/chosen": 0.15086082458496095, + "eval_rewards/margins": 0.09523786926269531, + "eval_rewards/rejected": 0.055622955322265626, + "eval_runtime": 217.5749, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 1500 + }, + { + "epoch": 1.208, + "grad_norm": 0.5604835748672485, + "kl": 2.5207982063293457, + "learning_rate": 2.2022222222222227e-06, + "logits/chosen": 28183353.6, + "logits/rejected": 26171800.0, + "logps/chosen": -127.34248046875, + "logps/rejected": -135.6572265625, + "loss": 0.44087018966674807, + "rewards/chosen": 0.20389485359191895, + "rewards/margins": 0.5874947786331177, + "rewards/rejected": -0.3835999250411987, + "step": 1510 + }, + { + "epoch": 1.216, + "grad_norm": 0.6260045170783997, + "kl": 4.300113677978516, + "learning_rate": 2.1800000000000003e-06, + "logits/chosen": 37477926.4, + "logits/rejected": 37180899.2, + "logps/chosen": -153.624072265625, + "logps/rejected": -153.7638916015625, + "loss": 0.4450747013092041, + "rewards/chosen": 0.40005855560302733, + "rewards/margins": 0.5518900513648987, + "rewards/rejected": -0.15183149576187133, + "step": 1520 + }, + { + "epoch": 1.224, + "grad_norm": 0.5863284468650818, + "kl": 3.6325111389160156, + "learning_rate": 2.157777777777778e-06, + "logits/chosen": 31546803.2, + "logits/rejected": 27054540.8, + "logps/chosen": -164.37142333984374, + "logps/rejected": -141.66112060546874, + "loss": 0.4467916488647461, + "rewards/chosen": 0.37994205951690674, + "rewards/margins": 0.499565863609314, + "rewards/rejected": -0.11962380409240722, + "step": 1530 + }, + { + "epoch": 1.232, + "grad_norm": 0.42624586820602417, + "kl": 4.787026882171631, + "learning_rate": 2.1355555555555555e-06, + "logits/chosen": 29693196.8, + "logits/rejected": 28872902.4, + "logps/chosen": -140.95440673828125, + "logps/rejected": -148.26785888671876, + "loss": 0.43952031135559083, + "rewards/chosen": 0.574599027633667, + "rewards/margins": 0.5613519787788391, + "rewards/rejected": 0.013247048854827881, + "step": 1540 + }, + { + "epoch": 1.24, + "grad_norm": 0.5493943095207214, + "kl": 3.9344754219055176, + "learning_rate": 2.1133333333333336e-06, + "logits/chosen": 40305772.8, + "logits/rejected": 40673481.6, + "logps/chosen": -165.5770751953125, + "logps/rejected": -181.73116455078124, + "loss": 0.4579936981201172, + "rewards/chosen": 0.33573935031890867, + "rewards/margins": 0.4691450238227844, + "rewards/rejected": -0.13340567350387572, + "step": 1550 + }, + { + "epoch": 1.248, + "grad_norm": 0.46218565106391907, + "kl": 4.996054172515869, + "learning_rate": 2.091111111111111e-06, + "logits/chosen": 34034489.6, + "logits/rejected": 34783824.0, + "logps/chosen": -142.5915283203125, + "logps/rejected": -154.66793212890624, + "loss": 0.4685988426208496, + "rewards/chosen": 0.42725467681884766, + "rewards/margins": 0.2892191410064697, + "rewards/rejected": 0.13803553581237793, + "step": 1560 + }, + { + "epoch": 1.256, + "grad_norm": 0.6164036989212036, + "kl": 3.2075297832489014, + "learning_rate": 2.0688888888888892e-06, + "logits/chosen": 37389939.2, + "logits/rejected": 35211542.4, + "logps/chosen": -157.27457275390626, + "logps/rejected": -135.26968994140626, + "loss": 0.4538430690765381, + "rewards/chosen": 0.30452628135681153, + "rewards/margins": 0.4211171746253968, + "rewards/rejected": -0.11659089326858521, + "step": 1570 + }, + { + "epoch": 1.264, + "grad_norm": 0.4599875807762146, + "kl": 4.0805768966674805, + "learning_rate": 2.046666666666667e-06, + "logits/chosen": 47580665.6, + "logits/rejected": 44842764.8, + "logps/chosen": -147.82244873046875, + "logps/rejected": -144.87330322265626, + "loss": 0.4251837253570557, + "rewards/chosen": 0.6019775867462158, + "rewards/margins": 0.6677440404891968, + "rewards/rejected": -0.06576645374298096, + "step": 1580 + }, + { + "epoch": 1.272, + "grad_norm": 0.5166321992874146, + "kl": 3.547309160232544, + "learning_rate": 2.024444444444445e-06, + "logits/chosen": 39700006.4, + "logits/rejected": 39951171.2, + "logps/chosen": -131.4615478515625, + "logps/rejected": -144.446240234375, + "loss": 0.4551235675811768, + "rewards/chosen": 0.3439887046813965, + "rewards/margins": 0.35865890979766846, + "rewards/rejected": -0.014670205116271973, + "step": 1590 + }, + { + "epoch": 1.28, + "grad_norm": 0.515184760093689, + "kl": 5.392228126525879, + "learning_rate": 2.0022222222222225e-06, + "logits/chosen": 37336246.4, + "logits/rejected": 35750588.8, + "logps/chosen": -129.5730224609375, + "logps/rejected": -167.88192138671874, + "loss": 0.46866717338562014, + "rewards/chosen": 0.5722126007080078, + "rewards/margins": 0.2670687913894654, + "rewards/rejected": 0.30514380931854246, + "step": 1600 + }, + { + "epoch": 1.28, + "eval_kl": 4.125787258148193, + "eval_logits/chosen": 36980781.056, + "eval_logits/rejected": 36865613.824, + "eval_logps/chosen": -152.2105625, + "eval_logps/rejected": -147.532890625, + "eval_loss": 0.4878697693347931, + "eval_rewards/chosen": 0.21152139282226562, + "eval_rewards/margins": 0.1005374526977539, + "eval_rewards/rejected": 0.11098394012451172, + "eval_runtime": 217.1057, + "eval_samples_per_second": 4.606, + "eval_steps_per_second": 2.303, + "step": 1600 + }, + { + "epoch": 1.288, + "grad_norm": 0.33879461884498596, + "kl": 3.9892711639404297, + "learning_rate": 1.98e-06, + "logits/chosen": 38520425.6, + "logits/rejected": 37891657.6, + "logps/chosen": -144.33753662109376, + "logps/rejected": -113.0916748046875, + "loss": 0.4574281215667725, + "rewards/chosen": 0.3910325288772583, + "rewards/margins": 0.4121716648340225, + "rewards/rejected": -0.02113913595676422, + "step": 1610 + }, + { + "epoch": 1.296, + "grad_norm": 0.4744361937046051, + "kl": 4.163745403289795, + "learning_rate": 1.9577777777777777e-06, + "logits/chosen": 37167372.8, + "logits/rejected": 37068976.0, + "logps/chosen": -144.5396484375, + "logps/rejected": -151.2422119140625, + "loss": 0.4770793914794922, + "rewards/chosen": 0.24615283012390138, + "rewards/margins": 0.2315782740712166, + "rewards/rejected": 0.014574556052684784, + "step": 1620 + }, + { + "epoch": 1.304, + "grad_norm": 0.6805797815322876, + "kl": 4.433796405792236, + "learning_rate": 1.9355555555555558e-06, + "logits/chosen": 33666192.0, + "logits/rejected": 33794051.2, + "logps/chosen": -152.936279296875, + "logps/rejected": -167.15966796875, + "loss": 0.4641437530517578, + "rewards/chosen": 0.3409790754318237, + "rewards/margins": 0.22917660474777218, + "rewards/rejected": 0.11180247068405151, + "step": 1630 + }, + { + "epoch": 1.312, + "grad_norm": 0.4908677637577057, + "kl": 4.329981803894043, + "learning_rate": 1.9133333333333334e-06, + "logits/chosen": 28728278.4, + "logits/rejected": 29006166.4, + "logps/chosen": -171.674267578125, + "logps/rejected": -151.65748291015626, + "loss": 0.4317901611328125, + "rewards/chosen": 0.4740726947784424, + "rewards/margins": 0.6063881039619445, + "rewards/rejected": -0.1323154091835022, + "step": 1640 + }, + { + "epoch": 1.32, + "grad_norm": 0.5628176927566528, + "kl": 3.917667865753174, + "learning_rate": 1.8911111111111114e-06, + "logits/chosen": 41002073.6, + "logits/rejected": 38901564.8, + "logps/chosen": -146.1817138671875, + "logps/rejected": -137.590380859375, + "loss": 0.44250779151916503, + "rewards/chosen": 0.36355061531066896, + "rewards/margins": 0.5350786447525024, + "rewards/rejected": -0.17152802944183348, + "step": 1650 + }, + { + "epoch": 1.328, + "grad_norm": 0.4226330816745758, + "kl": 4.557765007019043, + "learning_rate": 1.868888888888889e-06, + "logits/chosen": 41738940.8, + "logits/rejected": 39382457.6, + "logps/chosen": -144.0579833984375, + "logps/rejected": -149.93160400390624, + "loss": 0.4471259117126465, + "rewards/chosen": 0.4530649662017822, + "rewards/margins": 0.5140628039836883, + "rewards/rejected": -0.06099783778190613, + "step": 1660 + }, + { + "epoch": 1.336, + "grad_norm": 0.726682186126709, + "kl": 2.606447219848633, + "learning_rate": 1.8466666666666668e-06, + "logits/chosen": 28843798.4, + "logits/rejected": 28256953.6, + "logps/chosen": -162.8355224609375, + "logps/rejected": -135.0196044921875, + "loss": 0.4602541923522949, + "rewards/chosen": 0.034113740921020506, + "rewards/margins": 0.3491029262542724, + "rewards/rejected": -0.31498918533325193, + "step": 1670 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.4371758997440338, + "kl": 3.8837997913360596, + "learning_rate": 1.8244444444444445e-06, + "logits/chosen": 30105264.0, + "logits/rejected": 30200156.8, + "logps/chosen": -169.60362548828124, + "logps/rejected": -121.60936279296875, + "loss": 0.4653130054473877, + "rewards/chosen": 0.13300797939300538, + "rewards/margins": 0.2890047550201416, + "rewards/rejected": -0.15599677562713624, + "step": 1680 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.7084026336669922, + "kl": 3.9908013343811035, + "learning_rate": 1.8022222222222225e-06, + "logits/chosen": 40353881.6, + "logits/rejected": 41124192.0, + "logps/chosen": -145.57255859375, + "logps/rejected": -169.683984375, + "loss": 0.46573567390441895, + "rewards/chosen": 0.37704455852508545, + "rewards/margins": 0.30847471952438354, + "rewards/rejected": 0.0685698390007019, + "step": 1690 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.560882031917572, + "kl": 3.73456072807312, + "learning_rate": 1.7800000000000001e-06, + "logits/chosen": 31647622.4, + "logits/rejected": 32944563.2, + "logps/chosen": -96.58458251953125, + "logps/rejected": -156.17933349609376, + "loss": 0.47052454948425293, + "rewards/chosen": 0.26383423805236816, + "rewards/margins": 0.31988897919654846, + "rewards/rejected": -0.0560547411441803, + "step": 1700 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 4.715727806091309, + "eval_logits/chosen": 37336121.344, + "eval_logits/rejected": 37191081.984, + "eval_logps/chosen": -151.06928125, + "eval_logps/rejected": -146.466265625, + "eval_loss": 0.4871442914009094, + "eval_rewards/chosen": 0.3256500549316406, + "eval_rewards/margins": 0.1080040283203125, + "eval_rewards/rejected": 0.21764602661132812, + "eval_runtime": 217.8394, + "eval_samples_per_second": 4.591, + "eval_steps_per_second": 2.295, + "step": 1700 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.5111773014068604, + "kl": 5.973706245422363, + "learning_rate": 1.757777777777778e-06, + "logits/chosen": 43906630.4, + "logits/rejected": 41141516.8, + "logps/chosen": -158.88671875, + "logps/rejected": -179.5316650390625, + "loss": 0.4583888530731201, + "rewards/chosen": 0.5715279579162598, + "rewards/margins": 0.4317225098609924, + "rewards/rejected": 0.13980544805526735, + "step": 1710 + }, + { + "epoch": 1.376, + "grad_norm": 0.4663240611553192, + "kl": 4.6347246170043945, + "learning_rate": 1.7355555555555555e-06, + "logits/chosen": 49689798.4, + "logits/rejected": 46981590.4, + "logps/chosen": -171.759765625, + "logps/rejected": -179.522802734375, + "loss": 0.47397675514221194, + "rewards/chosen": 0.18588199615478515, + "rewards/margins": 0.39475393295288086, + "rewards/rejected": -0.2088719367980957, + "step": 1720 + }, + { + "epoch": 1.384, + "grad_norm": 0.5763538479804993, + "kl": 4.0374932289123535, + "learning_rate": 1.7133333333333336e-06, + "logits/chosen": 41851731.2, + "logits/rejected": 41044272.0, + "logps/chosen": -149.54090576171876, + "logps/rejected": -163.9952880859375, + "loss": 0.4446412563323975, + "rewards/chosen": 0.29855611324310305, + "rewards/margins": 0.5403349876403809, + "rewards/rejected": -0.24177887439727783, + "step": 1730 + }, + { + "epoch": 1.392, + "grad_norm": 0.40721967816352844, + "kl": 4.595529556274414, + "learning_rate": 1.6911111111111112e-06, + "logits/chosen": 27241497.6, + "logits/rejected": 25061027.2, + "logps/chosen": -144.4770751953125, + "logps/rejected": -148.99678955078124, + "loss": 0.441908073425293, + "rewards/chosen": 0.47647829055786134, + "rewards/margins": 0.662132203578949, + "rewards/rejected": -0.18565391302108764, + "step": 1740 + }, + { + "epoch": 1.4, + "grad_norm": 0.5112435221672058, + "kl": 6.786820411682129, + "learning_rate": 1.668888888888889e-06, + "logits/chosen": 43438329.6, + "logits/rejected": 42274822.4, + "logps/chosen": -173.9513916015625, + "logps/rejected": -128.84437255859376, + "loss": 0.44019775390625, + "rewards/chosen": 0.8855677604675293, + "rewards/margins": 0.5187113761901856, + "rewards/rejected": 0.36685638427734374, + "step": 1750 + }, + { + "epoch": 1.408, + "grad_norm": 0.6391093134880066, + "kl": 3.4943454265594482, + "learning_rate": 1.6466666666666666e-06, + "logits/chosen": 46858697.6, + "logits/rejected": 47063660.8, + "logps/chosen": -143.279248046875, + "logps/rejected": -161.62069091796874, + "loss": 0.4441429615020752, + "rewards/chosen": 0.3917685985565186, + "rewards/margins": 0.5121995925903321, + "rewards/rejected": -0.12043099403381348, + "step": 1760 + }, + { + "epoch": 1.416, + "grad_norm": 0.5220089554786682, + "kl": 5.633955955505371, + "learning_rate": 1.6244444444444447e-06, + "logits/chosen": 45115747.2, + "logits/rejected": 43860156.8, + "logps/chosen": -153.03194580078124, + "logps/rejected": -162.67841796875, + "loss": 0.4590646743774414, + "rewards/chosen": 0.6448601245880127, + "rewards/margins": 0.43718719482421875, + "rewards/rejected": 0.20767292976379395, + "step": 1770 + }, + { + "epoch": 1.424, + "grad_norm": 0.5118013024330139, + "kl": 4.387326240539551, + "learning_rate": 1.6022222222222223e-06, + "logits/chosen": 40869132.8, + "logits/rejected": 39574160.0, + "logps/chosen": -138.70467529296874, + "logps/rejected": -177.04256591796874, + "loss": 0.4694656848907471, + "rewards/chosen": 0.32769317626953126, + "rewards/margins": 0.32016055583953856, + "rewards/rejected": 0.0075326204299926754, + "step": 1780 + }, + { + "epoch": 1.432, + "grad_norm": 0.7699334025382996, + "kl": 5.964260578155518, + "learning_rate": 1.5800000000000001e-06, + "logits/chosen": 33789193.6, + "logits/rejected": 32414848.0, + "logps/chosen": -144.8453857421875, + "logps/rejected": -156.7958740234375, + "loss": 0.43700370788574217, + "rewards/chosen": 0.7253459453582763, + "rewards/margins": 0.5568214774131774, + "rewards/rejected": 0.16852446794509887, + "step": 1790 + }, + { + "epoch": 1.44, + "grad_norm": 0.569644570350647, + "kl": 4.964392185211182, + "learning_rate": 1.5577777777777777e-06, + "logits/chosen": 41654611.2, + "logits/rejected": 42416057.6, + "logps/chosen": -150.742919921875, + "logps/rejected": -167.503076171875, + "loss": 0.46422877311706545, + "rewards/chosen": 0.4968874931335449, + "rewards/margins": 0.40884148478508, + "rewards/rejected": 0.08804600834846496, + "step": 1800 + }, + { + "epoch": 1.44, + "eval_kl": 4.746038436889648, + "eval_logits/chosen": 38021050.368, + "eval_logits/rejected": 37875998.72, + "eval_logps/chosen": -150.81178125, + "eval_logps/rejected": -146.26340625, + "eval_loss": 0.48678913712501526, + "eval_rewards/chosen": 0.3514009094238281, + "eval_rewards/margins": 0.11346868896484374, + "eval_rewards/rejected": 0.23793222045898438, + "eval_runtime": 217.6136, + "eval_samples_per_second": 4.595, + "eval_steps_per_second": 2.298, + "step": 1800 + }, + { + "epoch": 1.448, + "grad_norm": 0.5262874960899353, + "kl": 4.751598358154297, + "learning_rate": 1.5355555555555558e-06, + "logits/chosen": 34054140.8, + "logits/rejected": 33053715.2, + "logps/chosen": -159.1825439453125, + "logps/rejected": -150.52947998046875, + "loss": 0.45218782424926757, + "rewards/chosen": 0.4885563850402832, + "rewards/margins": 0.42950677275657656, + "rewards/rejected": 0.059049612283706664, + "step": 1810 + }, + { + "epoch": 1.456, + "grad_norm": 0.5098588466644287, + "kl": 4.014686584472656, + "learning_rate": 1.5133333333333334e-06, + "logits/chosen": 40354809.6, + "logits/rejected": 39480486.4, + "logps/chosen": -145.28863525390625, + "logps/rejected": -135.333837890625, + "loss": 0.43676166534423827, + "rewards/chosen": 0.4947031021118164, + "rewards/margins": 0.6355077743530273, + "rewards/rejected": -0.14080467224121093, + "step": 1820 + }, + { + "epoch": 1.464, + "grad_norm": 0.448231041431427, + "kl": 5.823625564575195, + "learning_rate": 1.4911111111111112e-06, + "logits/chosen": 47668928.0, + "logits/rejected": 45680892.8, + "logps/chosen": -143.58907470703124, + "logps/rejected": -157.80914306640625, + "loss": 0.44497880935668943, + "rewards/chosen": 0.631040382385254, + "rewards/margins": 0.497272527217865, + "rewards/rejected": 0.13376785516738893, + "step": 1830 + }, + { + "epoch": 1.472, + "grad_norm": 0.5563249588012695, + "kl": 5.55691385269165, + "learning_rate": 1.468888888888889e-06, + "logits/chosen": 30804796.8, + "logits/rejected": 30690835.2, + "logps/chosen": -124.4900146484375, + "logps/rejected": -133.585205078125, + "loss": 0.444712495803833, + "rewards/chosen": 0.6877860069274903, + "rewards/margins": 0.4855673313140869, + "rewards/rejected": 0.20221867561340331, + "step": 1840 + }, + { + "epoch": 1.48, + "grad_norm": 0.6280549168586731, + "kl": 4.7264180183410645, + "learning_rate": 1.4466666666666669e-06, + "logits/chosen": 29776838.4, + "logits/rejected": 31885555.2, + "logps/chosen": -145.07203369140626, + "logps/rejected": -149.7443115234375, + "loss": 0.4761053562164307, + "rewards/chosen": 0.3966336488723755, + "rewards/margins": 0.2293717384338379, + "rewards/rejected": 0.1672619104385376, + "step": 1850 + }, + { + "epoch": 1.488, + "grad_norm": 0.5127621293067932, + "kl": 6.063115119934082, + "learning_rate": 1.4244444444444447e-06, + "logits/chosen": 46517398.4, + "logits/rejected": 43311113.6, + "logps/chosen": -189.47987060546876, + "logps/rejected": -170.6665283203125, + "loss": 0.42708525657653806, + "rewards/chosen": 0.7080463409423828, + "rewards/margins": 0.7631270289421082, + "rewards/rejected": -0.05508068799972534, + "step": 1860 + }, + { + "epoch": 1.496, + "grad_norm": 0.5822389125823975, + "kl": 4.272950649261475, + "learning_rate": 1.4022222222222223e-06, + "logits/chosen": 25695438.4, + "logits/rejected": 24746280.0, + "logps/chosen": -137.86600341796876, + "logps/rejected": -137.41337890625, + "loss": 0.4546250343322754, + "rewards/chosen": 0.42215428352355955, + "rewards/margins": 0.4513491034507751, + "rewards/rejected": -0.029194819927215575, + "step": 1870 + }, + { + "epoch": 1.504, + "grad_norm": 0.5575308799743652, + "kl": 5.757713794708252, + "learning_rate": 1.3800000000000001e-06, + "logits/chosen": 39392422.4, + "logits/rejected": 40771721.6, + "logps/chosen": -136.75103759765625, + "logps/rejected": -172.27413330078124, + "loss": 0.464780330657959, + "rewards/chosen": 0.6209693908691406, + "rewards/margins": 0.2982433319091797, + "rewards/rejected": 0.3227260589599609, + "step": 1880 + }, + { + "epoch": 1.512, + "grad_norm": 0.7086930274963379, + "kl": 5.1618475914001465, + "learning_rate": 1.357777777777778e-06, + "logits/chosen": 40175395.2, + "logits/rejected": 39745542.4, + "logps/chosen": -187.0331298828125, + "logps/rejected": -151.17127685546876, + "loss": 0.45111641883850095, + "rewards/chosen": 0.4993483543395996, + "rewards/margins": 0.43962204456329346, + "rewards/rejected": 0.05972630977630615, + "step": 1890 + }, + { + "epoch": 1.52, + "grad_norm": 0.5889289379119873, + "kl": 7.153553009033203, + "learning_rate": 1.3355555555555558e-06, + "logits/chosen": 43437193.6, + "logits/rejected": 41387232.0, + "logps/chosen": -122.5697265625, + "logps/rejected": -132.28131103515625, + "loss": 0.4659090042114258, + "rewards/chosen": 0.7089588165283203, + "rewards/margins": 0.31116189956665036, + "rewards/rejected": 0.3977969169616699, + "step": 1900 + }, + { + "epoch": 1.52, + "eval_kl": 4.760587692260742, + "eval_logits/chosen": 38344187.904, + "eval_logits/rejected": 38206853.12, + "eval_logps/chosen": -150.743921875, + "eval_logps/rejected": -146.278265625, + "eval_loss": 0.4858584403991699, + "eval_rewards/chosen": 0.3581858215332031, + "eval_rewards/margins": 0.12174023437499998, + "eval_rewards/rejected": 0.23644558715820313, + "eval_runtime": 218.092, + "eval_samples_per_second": 4.585, + "eval_steps_per_second": 2.293, + "step": 1900 + }, + { + "epoch": 1.528, + "grad_norm": 0.4249567687511444, + "kl": 6.2131242752075195, + "learning_rate": 1.3133333333333334e-06, + "logits/chosen": 57013689.6, + "logits/rejected": 56808352.0, + "logps/chosen": -164.70875244140626, + "logps/rejected": -132.06815185546876, + "loss": 0.4498802661895752, + "rewards/chosen": 0.7213836669921875, + "rewards/margins": 0.4230734348297119, + "rewards/rejected": 0.2983102321624756, + "step": 1910 + }, + { + "epoch": 1.536, + "grad_norm": 0.549889862537384, + "kl": 6.712057590484619, + "learning_rate": 1.2911111111111112e-06, + "logits/chosen": 42846454.4, + "logits/rejected": 42999248.0, + "logps/chosen": -177.64530029296876, + "logps/rejected": -184.1470458984375, + "loss": 0.43409342765808107, + "rewards/chosen": 0.728582763671875, + "rewards/margins": 0.6158596277236938, + "rewards/rejected": 0.11272313594818115, + "step": 1920 + }, + { + "epoch": 1.544, + "grad_norm": 0.5649115443229675, + "kl": 4.235246658325195, + "learning_rate": 1.268888888888889e-06, + "logits/chosen": 41943692.8, + "logits/rejected": 41250208.0, + "logps/chosen": -142.4334228515625, + "logps/rejected": -127.446923828125, + "loss": 0.4541294574737549, + "rewards/chosen": 0.42052087783813474, + "rewards/margins": 0.38021968901157377, + "rewards/rejected": 0.040301188826560974, + "step": 1930 + }, + { + "epoch": 1.552, + "grad_norm": 0.42543667554855347, + "kl": 5.350770473480225, + "learning_rate": 1.2466666666666667e-06, + "logits/chosen": 43400393.6, + "logits/rejected": 40400710.4, + "logps/chosen": -155.03704833984375, + "logps/rejected": -177.0060791015625, + "loss": 0.46230545043945315, + "rewards/chosen": 0.562045955657959, + "rewards/margins": 0.3403463363647461, + "rewards/rejected": 0.22169961929321289, + "step": 1940 + }, + { + "epoch": 1.56, + "grad_norm": 0.5134297609329224, + "kl": 4.304908752441406, + "learning_rate": 1.2244444444444445e-06, + "logits/chosen": 36589369.6, + "logits/rejected": 34298777.6, + "logps/chosen": -154.47265625, + "logps/rejected": -144.5512939453125, + "loss": 0.4730405330657959, + "rewards/chosen": 0.354435133934021, + "rewards/margins": 0.25211869478225707, + "rewards/rejected": 0.10231643915176392, + "step": 1950 + }, + { + "epoch": 1.568, + "grad_norm": 0.46365997195243835, + "kl": 6.416839599609375, + "learning_rate": 1.2022222222222223e-06, + "logits/chosen": 40177619.2, + "logits/rejected": 39313078.4, + "logps/chosen": -168.4650146484375, + "logps/rejected": -134.8406494140625, + "loss": 0.42948031425476074, + "rewards/chosen": 0.8725629806518554, + "rewards/margins": 0.6291991949081421, + "rewards/rejected": 0.24336378574371337, + "step": 1960 + }, + { + "epoch": 1.576, + "grad_norm": 0.4326087534427643, + "kl": 3.9851531982421875, + "learning_rate": 1.1800000000000001e-06, + "logits/chosen": 32055046.4, + "logits/rejected": 32860064.0, + "logps/chosen": -120.6618408203125, + "logps/rejected": -125.195166015625, + "loss": 0.4756101131439209, + "rewards/chosen": 0.14163222312927246, + "rewards/margins": 0.17141112685203552, + "rewards/rejected": -0.02977890372276306, + "step": 1970 + }, + { + "epoch": 1.584, + "grad_norm": 0.4545738697052002, + "kl": 5.087003231048584, + "learning_rate": 1.1577777777777778e-06, + "logits/chosen": 35810121.6, + "logits/rejected": 32762137.6, + "logps/chosen": -149.88338623046874, + "logps/rejected": -155.4554443359375, + "loss": 0.45157780647277834, + "rewards/chosen": 0.6308297634124755, + "rewards/margins": 0.421593952178955, + "rewards/rejected": 0.2092358112335205, + "step": 1980 + }, + { + "epoch": 1.592, + "grad_norm": 0.5136933326721191, + "kl": 6.449606418609619, + "learning_rate": 1.1355555555555558e-06, + "logits/chosen": 35638320.0, + "logits/rejected": 33466137.6, + "logps/chosen": -152.856396484375, + "logps/rejected": -183.138525390625, + "loss": 0.4553223133087158, + "rewards/chosen": 0.6646287918090821, + "rewards/margins": 0.44561595916748054, + "rewards/rejected": 0.21901283264160157, + "step": 1990 + }, + { + "epoch": 1.6, + "grad_norm": 0.3154851198196411, + "kl": 5.555708885192871, + "learning_rate": 1.1133333333333334e-06, + "logits/chosen": 39483830.4, + "logits/rejected": 38031814.4, + "logps/chosen": -155.9849609375, + "logps/rejected": -183.7311279296875, + "loss": 0.4708412647247314, + "rewards/chosen": 0.4511248588562012, + "rewards/margins": 0.33574488162994387, + "rewards/rejected": 0.11537997722625733, + "step": 2000 + }, + { + "epoch": 1.6, + "eval_kl": 4.071971416473389, + "eval_logits/chosen": 37006954.496, + "eval_logits/rejected": 36902076.416, + "eval_logps/chosen": -152.112921875, + "eval_logps/rejected": -147.68203125, + "eval_loss": 0.48551830649375916, + "eval_rewards/chosen": 0.22128521728515624, + "eval_rewards/margins": 0.12521478271484374, + "eval_rewards/rejected": 0.0960704345703125, + "eval_runtime": 218.7826, + "eval_samples_per_second": 4.571, + "eval_steps_per_second": 2.285, + "step": 2000 + }, + { + "epoch": 1.608, + "grad_norm": 0.5318990349769592, + "kl": 3.604361057281494, + "learning_rate": 1.0911111111111112e-06, + "logits/chosen": 36286432.0, + "logits/rejected": 37285971.2, + "logps/chosen": -113.43314208984376, + "logps/rejected": -124.557763671875, + "loss": 0.464507007598877, + "rewards/chosen": 0.35673577785491944, + "rewards/margins": 0.2908350646495819, + "rewards/rejected": 0.06590071320533752, + "step": 2010 + }, + { + "epoch": 1.616, + "grad_norm": 0.544118344783783, + "kl": 3.670469284057617, + "learning_rate": 1.068888888888889e-06, + "logits/chosen": 45186656.0, + "logits/rejected": 45977584.0, + "logps/chosen": -139.073681640625, + "logps/rejected": -153.00821533203126, + "loss": 0.4433259963989258, + "rewards/chosen": 0.42419872283935545, + "rewards/margins": 0.5132439255714416, + "rewards/rejected": -0.08904520273208619, + "step": 2020 + }, + { + "epoch": 1.624, + "grad_norm": 0.5683560967445374, + "kl": 2.751370906829834, + "learning_rate": 1.0466666666666669e-06, + "logits/chosen": 42249769.6, + "logits/rejected": 43098508.8, + "logps/chosen": -158.5843017578125, + "logps/rejected": -144.552197265625, + "loss": 0.4399724960327148, + "rewards/chosen": 0.42113256454467773, + "rewards/margins": 0.5389934659004212, + "rewards/rejected": -0.11786090135574341, + "step": 2030 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.5233821272850037, + "kl": 2.9688010215759277, + "learning_rate": 1.0244444444444445e-06, + "logits/chosen": 31636636.8, + "logits/rejected": 27467308.8, + "logps/chosen": -143.64234619140626, + "logps/rejected": -139.1869140625, + "loss": 0.442952299118042, + "rewards/chosen": 0.18238863945007325, + "rewards/margins": 0.5509385347366333, + "rewards/rejected": -0.36854989528656007, + "step": 2040 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.6982712745666504, + "kl": 2.9157755374908447, + "learning_rate": 1.0022222222222223e-06, + "logits/chosen": 29222934.4, + "logits/rejected": 28866115.2, + "logps/chosen": -145.965185546875, + "logps/rejected": -134.71815185546876, + "loss": 0.46454343795776365, + "rewards/chosen": 0.13749135732650758, + "rewards/margins": 0.32013021707534794, + "rewards/rejected": -0.18263885974884034, + "step": 2050 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.5256077647209167, + "kl": 5.147567272186279, + "learning_rate": 9.800000000000001e-07, + "logits/chosen": 39800131.2, + "logits/rejected": 40214745.6, + "logps/chosen": -153.8827880859375, + "logps/rejected": -155.6109375, + "loss": 0.44734792709350585, + "rewards/chosen": 0.595530891418457, + "rewards/margins": 0.46844919919967654, + "rewards/rejected": 0.12708169221878052, + "step": 2060 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.5332716703414917, + "kl": 2.908353328704834, + "learning_rate": 9.57777777777778e-07, + "logits/chosen": 25019286.4, + "logits/rejected": 26015075.2, + "logps/chosen": -153.6703857421875, + "logps/rejected": -128.41676025390626, + "loss": 0.47053236961364747, + "rewards/chosen": 0.07076652646064759, + "rewards/margins": 0.24093337655067443, + "rewards/rejected": -0.17016685009002686, + "step": 2070 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.6310443878173828, + "kl": 3.8460822105407715, + "learning_rate": 9.355555555555557e-07, + "logits/chosen": 30489289.6, + "logits/rejected": 29658633.6, + "logps/chosen": -157.56915283203125, + "logps/rejected": -126.709619140625, + "loss": 0.47043805122375487, + "rewards/chosen": 0.23312182426452638, + "rewards/margins": 0.2518186703324318, + "rewards/rejected": -0.018696846067905427, + "step": 2080 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.6265914440155029, + "kl": 2.628873109817505, + "learning_rate": 9.133333333333334e-07, + "logits/chosen": 38132934.4, + "logits/rejected": 37865273.6, + "logps/chosen": -150.04840087890625, + "logps/rejected": -152.45029296875, + "loss": 0.4601998805999756, + "rewards/chosen": 0.1351819634437561, + "rewards/margins": 0.29193094968795774, + "rewards/rejected": -0.15674898624420167, + "step": 2090 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.7189147472381592, + "kl": 4.683299541473389, + "learning_rate": 8.911111111111112e-07, + "logits/chosen": 39250364.8, + "logits/rejected": 37806796.8, + "logps/chosen": -166.75601806640626, + "logps/rejected": -189.86864013671874, + "loss": 0.47463297843933105, + "rewards/chosen": 0.1896621823310852, + "rewards/margins": 0.1576364517211914, + "rewards/rejected": 0.0320257306098938, + "step": 2100 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 3.463610887527466, + "eval_logits/chosen": 35089735.68, + "eval_logits/rejected": 35033989.12, + "eval_logps/chosen": -153.677015625, + "eval_logps/rejected": -149.26678125, + "eval_loss": 0.4853117763996124, + "eval_rewards/chosen": 0.06487516784667968, + "eval_rewards/margins": 0.1272816162109375, + "eval_rewards/rejected": -0.062406448364257815, + "eval_runtime": 217.5278, + "eval_samples_per_second": 4.597, + "eval_steps_per_second": 2.299, + "step": 2100 + }, + { + "epoch": 1.688, + "grad_norm": 0.6290340423583984, + "kl": 4.315595626831055, + "learning_rate": 8.68888888888889e-07, + "logits/chosen": 31404550.4, + "logits/rejected": 29785132.8, + "logps/chosen": -161.92845458984374, + "logps/rejected": -143.69949951171876, + "loss": 0.4567877292633057, + "rewards/chosen": 0.3545663356781006, + "rewards/margins": 0.38114327788352964, + "rewards/rejected": -0.026576942205429076, + "step": 2110 + }, + { + "epoch": 1.696, + "grad_norm": 0.5943707227706909, + "kl": 2.6047091484069824, + "learning_rate": 8.466666666666668e-07, + "logits/chosen": 35495555.2, + "logits/rejected": 33073846.4, + "logps/chosen": -137.365478515625, + "logps/rejected": -147.0032958984375, + "loss": 0.4664917469024658, + "rewards/chosen": -0.16478813886642457, + "rewards/margins": 0.23700910806655884, + "rewards/rejected": -0.4017972469329834, + "step": 2120 + }, + { + "epoch": 1.704, + "grad_norm": 0.528068482875824, + "kl": 3.3957126140594482, + "learning_rate": 8.244444444444445e-07, + "logits/chosen": 31661328.0, + "logits/rejected": 30206838.4, + "logps/chosen": -162.5927978515625, + "logps/rejected": -134.8358642578125, + "loss": 0.46093249320983887, + "rewards/chosen": 0.17535465955734253, + "rewards/margins": 0.3643703818321228, + "rewards/rejected": -0.18901572227478028, + "step": 2130 + }, + { + "epoch": 1.712, + "grad_norm": 0.46279260516166687, + "kl": 2.3419876098632812, + "learning_rate": 8.022222222222223e-07, + "logits/chosen": 42605856.0, + "logits/rejected": 40943014.4, + "logps/chosen": -132.12677001953125, + "logps/rejected": -135.65135498046874, + "loss": 0.45622806549072265, + "rewards/chosen": 0.04011918306350708, + "rewards/margins": 0.4230758786201477, + "rewards/rejected": -0.38295669555664064, + "step": 2140 + }, + { + "epoch": 1.72, + "grad_norm": 0.5701712369918823, + "kl": 4.020439147949219, + "learning_rate": 7.8e-07, + "logits/chosen": 38577043.2, + "logits/rejected": 39374691.2, + "logps/chosen": -170.36212158203125, + "logps/rejected": -161.6606201171875, + "loss": 0.45922436714172366, + "rewards/chosen": 0.09237505197525024, + "rewards/margins": 0.30426751375198363, + "rewards/rejected": -0.2118924617767334, + "step": 2150 + }, + { + "epoch": 1.728, + "grad_norm": 0.5047979354858398, + "kl": 2.7111480236053467, + "learning_rate": 7.577777777777779e-07, + "logits/chosen": 33372905.6, + "logits/rejected": 33328956.8, + "logps/chosen": -163.45003662109374, + "logps/rejected": -172.47750244140624, + "loss": 0.465222692489624, + "rewards/chosen": 0.034078240394592285, + "rewards/margins": 0.4608752965927124, + "rewards/rejected": -0.42679705619812014, + "step": 2160 + }, + { + "epoch": 1.736, + "grad_norm": 0.43878477811813354, + "kl": 2.7707526683807373, + "learning_rate": 7.355555555555556e-07, + "logits/chosen": 34746457.6, + "logits/rejected": 31707171.2, + "logps/chosen": -131.509716796875, + "logps/rejected": -149.54019775390626, + "loss": 0.46474738121032716, + "rewards/chosen": 0.06415605545043945, + "rewards/margins": 0.39272706508636473, + "rewards/rejected": -0.3285710096359253, + "step": 2170 + }, + { + "epoch": 1.744, + "grad_norm": 0.5701454877853394, + "kl": 2.9775280952453613, + "learning_rate": 7.133333333333334e-07, + "logits/chosen": 31648003.2, + "logits/rejected": 31955971.2, + "logps/chosen": -151.3242919921875, + "logps/rejected": -148.98272705078125, + "loss": 0.4547208309173584, + "rewards/chosen": 0.16725053787231445, + "rewards/margins": 0.45224099159240727, + "rewards/rejected": -0.2849904537200928, + "step": 2180 + }, + { + "epoch": 1.752, + "grad_norm": 0.36173462867736816, + "kl": 3.2897815704345703, + "learning_rate": 6.911111111111111e-07, + "logits/chosen": 28085676.8, + "logits/rejected": 25880251.2, + "logps/chosen": -133.48956298828125, + "logps/rejected": -151.6326904296875, + "loss": 0.45902628898620607, + "rewards/chosen": 0.06616134643554687, + "rewards/margins": 0.43392994403839114, + "rewards/rejected": -0.36776859760284425, + "step": 2190 + }, + { + "epoch": 1.76, + "grad_norm": 0.4635393023490906, + "kl": 4.791772365570068, + "learning_rate": 6.68888888888889e-07, + "logits/chosen": 39854400.0, + "logits/rejected": 43266041.6, + "logps/chosen": -180.304736328125, + "logps/rejected": -158.53660888671874, + "loss": 0.44127936363220216, + "rewards/chosen": 0.3505941152572632, + "rewards/margins": 0.4699846982955933, + "rewards/rejected": -0.11939058303833008, + "step": 2200 + }, + { + "epoch": 1.76, + "eval_kl": 3.170285701751709, + "eval_logits/chosen": 34094372.864, + "eval_logits/rejected": 34072373.248, + "eval_logps/chosen": -154.5265625, + "eval_logps/rejected": -150.1290625, + "eval_loss": 0.48520490527153015, + "eval_rewards/chosen": -0.020077211380004883, + "eval_rewards/margins": 0.12855766105651856, + "eval_rewards/rejected": -0.14863487243652343, + "eval_runtime": 217.5482, + "eval_samples_per_second": 4.597, + "eval_steps_per_second": 2.298, + "step": 2200 + }, + { + "epoch": 1.768, + "grad_norm": 0.6011971831321716, + "kl": 3.542525053024292, + "learning_rate": 6.466666666666667e-07, + "logits/chosen": 36977337.6, + "logits/rejected": 38393292.8, + "logps/chosen": -135.9339599609375, + "logps/rejected": -150.27783203125, + "loss": 0.4888314723968506, + "rewards/chosen": 0.07737842798233033, + "rewards/margins": 0.11252884268760682, + "rewards/rejected": -0.03515041470527649, + "step": 2210 + }, + { + "epoch": 1.776, + "grad_norm": 0.5993504524230957, + "kl": 3.1693031787872314, + "learning_rate": 6.244444444444445e-07, + "logits/chosen": 30479276.8, + "logits/rejected": 29417516.8, + "logps/chosen": -132.84188232421874, + "logps/rejected": -119.2605224609375, + "loss": 0.46071271896362304, + "rewards/chosen": 0.22380545139312744, + "rewards/margins": 0.42163221836090087, + "rewards/rejected": -0.19782676696777343, + "step": 2220 + }, + { + "epoch": 1.784, + "grad_norm": 0.6010851263999939, + "kl": 5.0478620529174805, + "learning_rate": 6.022222222222223e-07, + "logits/chosen": 38324892.8, + "logits/rejected": 37791113.6, + "logps/chosen": -135.21259765625, + "logps/rejected": -157.90794677734374, + "loss": 0.44549560546875, + "rewards/chosen": 0.5348263740539551, + "rewards/margins": 0.5059731423854827, + "rewards/rejected": 0.02885323166847229, + "step": 2230 + }, + { + "epoch": 1.792, + "grad_norm": 0.6057806611061096, + "kl": 2.268434762954712, + "learning_rate": 5.800000000000001e-07, + "logits/chosen": 31744332.8, + "logits/rejected": 30286156.8, + "logps/chosen": -124.35537109375, + "logps/rejected": -136.41668701171875, + "loss": 0.44939751625061036, + "rewards/chosen": 0.14811928272247316, + "rewards/margins": 0.4689765214920044, + "rewards/rejected": -0.32085723876953126, + "step": 2240 + }, + { + "epoch": 1.8, + "grad_norm": 0.4038269817829132, + "kl": 3.5659327507019043, + "learning_rate": 5.577777777777779e-07, + "logits/chosen": 31686905.6, + "logits/rejected": 32978937.6, + "logps/chosen": -156.446826171875, + "logps/rejected": -122.14853515625, + "loss": 0.45820083618164065, + "rewards/chosen": 0.2354212760925293, + "rewards/margins": 0.3473649501800537, + "rewards/rejected": -0.11194367408752441, + "step": 2250 + }, + { + "epoch": 1.808, + "grad_norm": 0.7359764575958252, + "kl": 4.224055290222168, + "learning_rate": 5.355555555555556e-07, + "logits/chosen": 28065552.0, + "logits/rejected": 29463145.6, + "logps/chosen": -135.04927978515624, + "logps/rejected": -187.2267333984375, + "loss": 0.4682769298553467, + "rewards/chosen": 0.16567325592041016, + "rewards/margins": 0.28882311582565307, + "rewards/rejected": -0.12314985990524292, + "step": 2260 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.6170231699943542, + "kl": 3.4734268188476562, + "learning_rate": 5.133333333333334e-07, + "logits/chosen": 28978995.2, + "logits/rejected": 28156457.6, + "logps/chosen": -151.6166015625, + "logps/rejected": -139.56959228515626, + "loss": 0.4609676837921143, + "rewards/chosen": 0.0773462176322937, + "rewards/margins": 0.42135525941848756, + "rewards/rejected": -0.34400904178619385, + "step": 2270 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.5378606915473938, + "kl": 4.157721519470215, + "learning_rate": 4.911111111111112e-07, + "logits/chosen": 35223811.2, + "logits/rejected": 34011692.8, + "logps/chosen": -152.407470703125, + "logps/rejected": -153.23616943359374, + "loss": 0.4477241516113281, + "rewards/chosen": 0.24660811424255372, + "rewards/margins": 0.40975589752197267, + "rewards/rejected": -0.16314778327941895, + "step": 2280 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6521180868148804, + "kl": 3.4054481983184814, + "learning_rate": 4.688888888888889e-07, + "logits/chosen": 27037875.2, + "logits/rejected": 27271260.8, + "logps/chosen": -189.535595703125, + "logps/rejected": -115.60257568359376, + "loss": 0.47844581604003905, + "rewards/chosen": 0.05722663402557373, + "rewards/margins": 0.14552825689315796, + "rewards/rejected": -0.08830162286758422, + "step": 2290 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.6569052338600159, + "kl": 3.141408681869507, + "learning_rate": 4.466666666666667e-07, + "logits/chosen": 29871747.2, + "logits/rejected": 26977977.6, + "logps/chosen": -127.3733154296875, + "logps/rejected": -161.679931640625, + "loss": 0.464168119430542, + "rewards/chosen": 0.05878195762634277, + "rewards/margins": 0.390771484375, + "rewards/rejected": -0.3319895267486572, + "step": 2300 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.0008513927459717, + "eval_logits/chosen": 33053526.016, + "eval_logits/rejected": 33058701.312, + "eval_logps/chosen": -155.151625, + "eval_logps/rejected": -150.761421875, + "eval_loss": 0.4851257801055908, + "eval_rewards/chosen": -0.08258457946777344, + "eval_rewards/margins": 0.12928588867187502, + "eval_rewards/rejected": -0.21187046813964844, + "eval_runtime": 217.6558, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 2.297, + "step": 2300 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.4972571134567261, + "kl": 3.18681001663208, + "learning_rate": 4.2444444444444447e-07, + "logits/chosen": 39937222.4, + "logits/rejected": 36850470.4, + "logps/chosen": -171.2765625, + "logps/rejected": -189.08856201171875, + "loss": 0.4898653507232666, + "rewards/chosen": -0.06906133890151978, + "rewards/margins": 0.13194493055343628, + "rewards/rejected": -0.20100626945495606, + "step": 2310 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.6851227879524231, + "kl": 3.3901615142822266, + "learning_rate": 4.0222222222222224e-07, + "logits/chosen": 32688179.2, + "logits/rejected": 33617596.8, + "logps/chosen": -140.2035888671875, + "logps/rejected": -168.17904052734374, + "loss": 0.4512950420379639, + "rewards/chosen": 0.20932047367095946, + "rewards/margins": 0.5313280582427978, + "rewards/rejected": -0.32200758457183837, + "step": 2320 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.40994900465011597, + "kl": 2.513051748275757, + "learning_rate": 3.8e-07, + "logits/chosen": 36529526.4, + "logits/rejected": 38165760.0, + "logps/chosen": -154.76318359375, + "logps/rejected": -137.9931640625, + "loss": 0.48479623794555665, + "rewards/chosen": -0.07438920736312866, + "rewards/margins": 0.09523203372955323, + "rewards/rejected": -0.1696212410926819, + "step": 2330 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.7032968997955322, + "kl": 2.689875841140747, + "learning_rate": 3.577777777777778e-07, + "logits/chosen": 27584521.6, + "logits/rejected": 27075878.4, + "logps/chosen": -143.9324951171875, + "logps/rejected": -115.68133544921875, + "loss": 0.47793827056884763, + "rewards/chosen": -0.008073312044143677, + "rewards/margins": 0.23053821921348572, + "rewards/rejected": -0.2386115312576294, + "step": 2340 + }, + { + "epoch": 1.88, + "grad_norm": 0.6183348298072815, + "kl": 2.5839390754699707, + "learning_rate": 3.3555555555555556e-07, + "logits/chosen": 33576460.8, + "logits/rejected": 31820140.8, + "logps/chosen": -136.1211669921875, + "logps/rejected": -175.8531005859375, + "loss": 0.4491901397705078, + "rewards/chosen": -0.012712603807449341, + "rewards/margins": 0.5766294658184051, + "rewards/rejected": -0.5893420696258544, + "step": 2350 + }, + { + "epoch": 1.888, + "grad_norm": 0.5491041541099548, + "kl": 2.9139962196350098, + "learning_rate": 3.1333333333333333e-07, + "logits/chosen": 36256649.6, + "logits/rejected": 37576332.8, + "logps/chosen": -116.727734375, + "logps/rejected": -184.8653564453125, + "loss": 0.48710999488830564, + "rewards/chosen": -0.05998457670211792, + "rewards/margins": 0.14093006849288942, + "rewards/rejected": -0.20091464519500732, + "step": 2360 + }, + { + "epoch": 1.896, + "grad_norm": 0.6111120581626892, + "kl": 3.809281587600708, + "learning_rate": 2.9111111111111116e-07, + "logits/chosen": 46606099.2, + "logits/rejected": 45940393.6, + "logps/chosen": -147.1123779296875, + "logps/rejected": -163.34154052734374, + "loss": 0.4475499153137207, + "rewards/chosen": 0.2357264995574951, + "rewards/margins": 0.5424099922180176, + "rewards/rejected": -0.30668349266052247, + "step": 2370 + }, + { + "epoch": 1.904, + "grad_norm": 0.6118114590644836, + "kl": 3.6302967071533203, + "learning_rate": 2.6888888888888893e-07, + "logits/chosen": 36722233.6, + "logits/rejected": 35542681.6, + "logps/chosen": -177.38359375, + "logps/rejected": -147.4840087890625, + "loss": 0.45297937393188475, + "rewards/chosen": 0.19647778272628785, + "rewards/margins": 0.4641849398612976, + "rewards/rejected": -0.26770715713500975, + "step": 2380 + }, + { + "epoch": 1.912, + "grad_norm": 0.4131172001361847, + "kl": 3.6233086585998535, + "learning_rate": 2.466666666666667e-07, + "logits/chosen": 24529001.6, + "logits/rejected": 24586342.4, + "logps/chosen": -102.13927001953125, + "logps/rejected": -141.7262451171875, + "loss": 0.47483372688293457, + "rewards/chosen": 0.25803461074829104, + "rewards/margins": 0.2503116071224213, + "rewards/rejected": 0.007723003625869751, + "step": 2390 + }, + { + "epoch": 1.92, + "grad_norm": 0.5676055550575256, + "kl": 3.1186647415161133, + "learning_rate": 2.2444444444444445e-07, + "logits/chosen": 48083148.8, + "logits/rejected": 45228051.2, + "logps/chosen": -215.27216796875, + "logps/rejected": -166.07578125, + "loss": 0.46457977294921876, + "rewards/chosen": -0.12358083724975585, + "rewards/margins": 0.29419794082641604, + "rewards/rejected": -0.41777877807617186, + "step": 2400 + }, + { + "epoch": 1.92, + "eval_kl": 2.721818685531616, + "eval_logits/chosen": 32252489.728, + "eval_logits/rejected": 32286822.4, + "eval_logps/chosen": -155.964, + "eval_logps/rejected": -151.586875, + "eval_loss": 0.4850628674030304, + "eval_rewards/chosen": -0.16382406616210937, + "eval_rewards/margins": 0.13059149169921874, + "eval_rewards/rejected": -0.2944155578613281, + "eval_runtime": 218.1028, + "eval_samples_per_second": 4.585, + "eval_steps_per_second": 2.292, + "step": 2400 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2400/training_args.bin b/v5/KTO/KTO_10k/lora/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a05f8383f95df104b573dd06fde1a6093711cd3 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:531e42fed31d279deeb217d9e592c58b0a48be16b726c4baaff52e99873e947a +size 5521 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/README.md b/v5/KTO/KTO_10k/lora/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/adapter_config.json b/v5/KTO/KTO_10k/lora/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e59b48ef11325fd83a0fa60f4e367a1bcacba7d --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "down_proj", + "k_proj", + "o_proj", + "v_proj", + "gate_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/adapter_model.safetensors b/v5/KTO/KTO_10k/lora/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ee8e6ee6513cb5b6f8380ee3a0ef08bb3def554e --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ce8e22e8f9cfaea0c749b838ca5acd7ffeaa4277b6674abe9d93a82c75a3c8 +size 180385008 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/chat_template.jinja b/v5/KTO/KTO_10k/lora/checkpoint-2500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/optimizer.pt b/v5/KTO/KTO_10k/lora/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..03850a07faf7d704b747def6883a5992f1016022 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d9d30f8ddc68b1121b2fbf1ea4fc27643c4943ba3a6c4a41bf626d0babdff3 +size 360902475 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/rng_state.pth b/v5/KTO/KTO_10k/lora/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2275456c138024633720ad98c6c63acbe9777008 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a95215f64b02d62fb58ace326ad670f1d16eb1761f7fa3b3478d43d2b8d6108 +size 14645 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/scaler.pt b/v5/KTO/KTO_10k/lora/checkpoint-2500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6bccf2b99239cf26ef4ea2b6a5f9f897042b61f --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861ce13e6ca091acee9a68ebfc5ca38479baf4b537c37b3949f071f77b81e9f0 +size 1383 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/scheduler.pt b/v5/KTO/KTO_10k/lora/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d255c93ba4dfc0bf737b1db2e9c2a9df36e51095 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d439a7e6a3841969148f0fdc43676e377f6e9b7b68c3674146bb8ea24e3705 +size 1465 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/tokenizer.json b/v5/KTO/KTO_10k/lora/checkpoint-2500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/tokenizer_config.json b/v5/KTO/KTO_10k/lora/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/trainer_state.json b/v5/KTO/KTO_10k/lora/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..874f652c7019490a99bf34652c8ff4b9d14be881 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/trainer_state.json @@ -0,0 +1,4184 @@ +{ + "best_global_step": 2500, + "best_metric": 0.13117916870117188, + "best_model_checkpoint": "output/lora/checkpoint-2500", + "epoch": 2.0, + "eval_steps": 100, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.4994136691093445, + "kl": 0.010484933853149414, + "learning_rate": 1.8e-07, + "logits/chosen": 29687939.2, + "logits/rejected": 31342233.6, + "logps/chosen": -148.9648681640625, + "logps/rejected": -128.8302734375, + "loss": 0.500147819519043, + "rewards/chosen": -0.0003900241805240512, + "rewards/margins": -0.001183443213813007, + "rewards/rejected": 0.0007934190332889556, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 0.42419925332069397, + "kl": 0.018610835075378418, + "learning_rate": 3.8e-07, + "logits/chosen": 53382841.6, + "logits/rejected": 52884211.2, + "logps/chosen": -140.02025146484374, + "logps/rejected": -151.92236328125, + "loss": 0.49989566802978513, + "rewards/chosen": 0.0010854244232177735, + "rewards/margins": 0.0008347129682078958, + "rewards/rejected": 0.0002507114550098777, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 0.41562652587890625, + "kl": 0.00999913178384304, + "learning_rate": 5.800000000000001e-07, + "logits/chosen": 34145190.4, + "logits/rejected": 34195894.4, + "logps/chosen": -131.7357177734375, + "logps/rejected": -140.3759033203125, + "loss": 0.49987101554870605, + "rewards/chosen": 0.00029232501983642576, + "rewards/margins": 0.0010309695731848477, + "rewards/rejected": -0.0007386445533484221, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 0.35795858502388, + "kl": 0.01658189296722412, + "learning_rate": 7.8e-07, + "logits/chosen": 43262694.4, + "logits/rejected": 43904278.4, + "logps/chosen": -144.2994140625, + "logps/rejected": -146.0284423828125, + "loss": 0.5001150608062744, + "rewards/chosen": -0.00019423491321504116, + "rewards/margins": -0.0009199525695294142, + "rewards/rejected": 0.000725717656314373, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.322542279958725, + "kl": 0.016057539731264114, + "learning_rate": 9.800000000000001e-07, + "logits/chosen": 43062272.0, + "logits/rejected": 44864710.4, + "logps/chosen": -141.009814453125, + "logps/rejected": -154.3311279296875, + "loss": 0.4999659538269043, + "rewards/chosen": 4.65535675175488e-05, + "rewards/margins": 0.00027224536752328276, + "rewards/rejected": -0.00022569180000573397, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 0.27746379375457764, + "kl": 0.0211088415235281, + "learning_rate": 1.1800000000000001e-06, + "logits/chosen": 36592531.2, + "logits/rejected": 34114694.4, + "logps/chosen": -105.72940673828126, + "logps/rejected": -114.016015625, + "loss": 0.4998314380645752, + "rewards/chosen": 0.0008930303156375885, + "rewards/margins": 0.0013488865923136472, + "rewards/rejected": -0.0004558562766760588, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 0.380987286567688, + "kl": 0.014461040496826172, + "learning_rate": 1.3800000000000001e-06, + "logits/chosen": 47752102.4, + "logits/rejected": 46858576.0, + "logps/chosen": -165.7050048828125, + "logps/rejected": -175.17645263671875, + "loss": 0.49965806007385255, + "rewards/chosen": -0.007297745347023011, + "rewards/margins": 0.002736319601535796, + "rewards/rejected": -0.010034064948558807, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 0.4557498097419739, + "kl": 0.016758393496274948, + "learning_rate": 1.5800000000000001e-06, + "logits/chosen": 40700441.6, + "logits/rejected": 40753952.0, + "logps/chosen": -154.99173583984376, + "logps/rejected": -163.616552734375, + "loss": 0.49955191612243655, + "rewards/chosen": -0.007268477231264114, + "rewards/margins": 0.0035857379436492927, + "rewards/rejected": -0.010854215174913407, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 0.3776198923587799, + "kl": 0.04920945316553116, + "learning_rate": 1.7800000000000001e-06, + "logits/chosen": 43998704.0, + "logits/rejected": 44111488.0, + "logps/chosen": -169.890185546875, + "logps/rejected": -159.26253662109374, + "loss": 0.4991014003753662, + "rewards/chosen": -0.0037218812853097917, + "rewards/margins": 0.007189888134598732, + "rewards/rejected": -0.010911769419908523, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 0.45459961891174316, + "kl": 0.10221505165100098, + "learning_rate": 1.98e-06, + "logits/chosen": 27590704.0, + "logits/rejected": 27196054.4, + "logps/chosen": -134.2844970703125, + "logps/rejected": -164.56478271484374, + "loss": 0.4994335651397705, + "rewards/chosen": 0.001446514017879963, + "rewards/margins": 0.004533729329705239, + "rewards/rejected": -0.0030872153118252756, + "step": 100 + }, + { + "epoch": 0.08, + "eval_kl": 0.0926995798945427, + "eval_logits/chosen": 38615707.648, + "eval_logits/rejected": 38522241.024, + "eval_logps/chosen": -154.3604375, + "eval_logps/rejected": -148.682875, + "eval_loss": 0.4999313950538635, + "eval_rewards/chosen": -0.0034661414623260497, + "eval_rewards/margins": 0.0005488345623016356, + "eval_rewards/rejected": -0.004014976024627685, + "eval_runtime": 216.3934, + "eval_samples_per_second": 4.621, + "eval_steps_per_second": 2.311, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 0.3749667704105377, + "kl": 0.09545516967773438, + "learning_rate": 2.1800000000000003e-06, + "logits/chosen": 37966393.6, + "logits/rejected": 37751027.2, + "logps/chosen": -130.5518798828125, + "logps/rejected": -135.6833740234375, + "loss": 0.4993227481842041, + "rewards/chosen": 0.001455230824649334, + "rewards/margins": 0.005419917218387127, + "rewards/rejected": -0.003964686393737793, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 0.36912816762924194, + "kl": 0.14255723357200623, + "learning_rate": 2.38e-06, + "logits/chosen": 47479664.0, + "logits/rejected": 47101081.6, + "logps/chosen": -162.19322509765624, + "logps/rejected": -133.80028076171874, + "loss": 0.5003850936889649, + "rewards/chosen": -0.0034813500940799715, + "rewards/margins": -0.0030801778659224513, + "rewards/rejected": -0.0004011722281575203, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 0.3060654103755951, + "kl": 0.3212381601333618, + "learning_rate": 2.5800000000000003e-06, + "logits/chosen": 39886729.6, + "logits/rejected": 38994944.0, + "logps/chosen": -161.928857421875, + "logps/rejected": -140.0421630859375, + "loss": 0.5001925468444824, + "rewards/chosen": 0.024501633644104005, + "rewards/margins": -0.001541826128959655, + "rewards/rejected": 0.02604345977306366, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 0.3445453345775604, + "kl": 0.48165637254714966, + "learning_rate": 2.7800000000000005e-06, + "logits/chosen": 40894547.2, + "logits/rejected": 42894540.8, + "logps/chosen": -142.19818115234375, + "logps/rejected": -157.58607177734376, + "loss": 0.5001253128051758, + "rewards/chosen": 0.04530414342880249, + "rewards/margins": -0.001001721620559691, + "rewards/rejected": 0.04630586504936218, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 0.3646848797798157, + "kl": 0.5575106143951416, + "learning_rate": 2.9800000000000003e-06, + "logits/chosen": 41080057.6, + "logits/rejected": 42315260.8, + "logps/chosen": -129.9904052734375, + "logps/rejected": -117.11707763671875, + "loss": 0.49814538955688475, + "rewards/chosen": 0.059499716758728026, + "rewards/margins": 0.014832848310470582, + "rewards/rejected": 0.044666868448257444, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 0.37343886494636536, + "kl": 0.7937558889389038, + "learning_rate": 3.1800000000000005e-06, + "logits/chosen": 34726796.8, + "logits/rejected": 35066944.0, + "logps/chosen": -143.1036376953125, + "logps/rejected": -146.66500244140624, + "loss": 0.4996847152709961, + "rewards/chosen": 0.07830544710159301, + "rewards/margins": 0.0025246202945709145, + "rewards/rejected": 0.0757808268070221, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 0.3172762095928192, + "kl": 0.9795322418212891, + "learning_rate": 3.3800000000000007e-06, + "logits/chosen": 37869379.2, + "logits/rejected": 40011753.6, + "logps/chosen": -137.67252197265626, + "logps/rejected": -149.25455322265626, + "loss": 0.4995111465454102, + "rewards/chosen": 0.0999127209186554, + "rewards/margins": 0.003919076919555661, + "rewards/rejected": 0.09599364399909974, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 0.459634393453598, + "kl": 1.297642707824707, + "learning_rate": 3.58e-06, + "logits/chosen": 44220444.8, + "logits/rejected": 45226771.2, + "logps/chosen": -144.420849609375, + "logps/rejected": -170.05146484375, + "loss": 0.5002459049224853, + "rewards/chosen": 0.12877843379974366, + "rewards/margins": -0.0019718408584594727, + "rewards/rejected": 0.13075027465820313, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 0.347683310508728, + "kl": 1.2592600584030151, + "learning_rate": 3.7800000000000002e-06, + "logits/chosen": 41769235.2, + "logits/rejected": 43331692.8, + "logps/chosen": -123.3504150390625, + "logps/rejected": -136.4183837890625, + "loss": 0.4997075080871582, + "rewards/chosen": 0.12709956169128417, + "rewards/margins": 0.0023471236228942705, + "rewards/rejected": 0.1247524380683899, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 0.46408534049987793, + "kl": 1.3921682834625244, + "learning_rate": 3.980000000000001e-06, + "logits/chosen": 47041785.6, + "logits/rejected": 48364675.2, + "logps/chosen": -150.128076171875, + "logps/rejected": -171.89765625, + "loss": 0.49930601119995116, + "rewards/chosen": 0.14199190139770507, + "rewards/margins": 0.005550038814544661, + "rewards/rejected": 0.1364418625831604, + "step": 200 + }, + { + "epoch": 0.16, + "eval_kl": 1.0393632650375366, + "eval_logits/chosen": 39075643.392, + "eval_logits/rejected": 38930210.816, + "eval_logps/chosen": -153.263515625, + "eval_logps/rejected": -147.659890625, + "eval_loss": 0.49900853633880615, + "eval_rewards/chosen": 0.10622586059570313, + "eval_rewards/margins": 0.007942695617675785, + "eval_rewards/rejected": 0.09828316497802735, + "eval_runtime": 215.9673, + "eval_samples_per_second": 4.63, + "eval_steps_per_second": 2.315, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 0.39666494727134705, + "kl": 0.7951234579086304, + "learning_rate": 4.18e-06, + "logits/chosen": 33959907.2, + "logits/rejected": 33986992.0, + "logps/chosen": -139.88677978515625, + "logps/rejected": -131.93973388671876, + "loss": 0.5003408432006836, + "rewards/chosen": 0.06579458713531494, + "rewards/margins": -0.002647107839584356, + "rewards/rejected": 0.0684416949748993, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 0.3799448311328888, + "kl": 0.6490715146064758, + "learning_rate": 4.38e-06, + "logits/chosen": 35468355.2, + "logits/rejected": 36302822.4, + "logps/chosen": -101.356298828125, + "logps/rejected": -125.962353515625, + "loss": 0.5001154899597168, + "rewards/chosen": 0.0492926150560379, + "rewards/margins": -0.0009777992963790894, + "rewards/rejected": 0.05027041435241699, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 0.41211405396461487, + "kl": 0.3460121750831604, + "learning_rate": 4.58e-06, + "logits/chosen": 47615702.4, + "logits/rejected": 46232614.4, + "logps/chosen": -185.3808837890625, + "logps/rejected": -163.7504638671875, + "loss": 0.5009187698364258, + "rewards/chosen": -0.0020151469856500626, + "rewards/margins": -0.007613314315676689, + "rewards/rejected": 0.005598167330026627, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 0.40270859003067017, + "kl": 0.5220479369163513, + "learning_rate": 4.78e-06, + "logits/chosen": 48030569.6, + "logits/rejected": 48140400.0, + "logps/chosen": -176.74349365234374, + "logps/rejected": -166.65750732421876, + "loss": 0.5001285076141357, + "rewards/chosen": 0.028535887598991394, + "rewards/margins": -0.001354834437370299, + "rewards/rejected": 0.029890722036361693, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 0.4905576705932617, + "kl": 0.5900261402130127, + "learning_rate": 4.980000000000001e-06, + "logits/chosen": 37097190.4, + "logits/rejected": 35081888.0, + "logps/chosen": -176.0585205078125, + "logps/rejected": -145.11353759765626, + "loss": 0.4949470520019531, + "rewards/chosen": 0.06773759722709656, + "rewards/margins": 0.040507239103317265, + "rewards/rejected": 0.027230358123779295, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 0.30912280082702637, + "kl": 0.5255872011184692, + "learning_rate": 4.980000000000001e-06, + "logits/chosen": 30562265.6, + "logits/rejected": 29522019.2, + "logps/chosen": -128.9729248046875, + "logps/rejected": -131.62899169921874, + "loss": 0.4973008155822754, + "rewards/chosen": 0.040848633646965025, + "rewards/margins": 0.021623241901397704, + "rewards/rejected": 0.01922539174556732, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 0.5176734328269958, + "kl": 0.9013652801513672, + "learning_rate": 4.957777777777778e-06, + "logits/chosen": 39767792.0, + "logits/rejected": 39945158.4, + "logps/chosen": -156.84248046875, + "logps/rejected": -151.7102294921875, + "loss": 0.4969136714935303, + "rewards/chosen": 0.07821747660636902, + "rewards/margins": 0.024683624505996704, + "rewards/rejected": 0.053533852100372314, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 0.4220561385154724, + "kl": 0.8817802667617798, + "learning_rate": 4.935555555555556e-06, + "logits/chosen": 33369977.6, + "logits/rejected": 27383606.4, + "logps/chosen": -167.6235595703125, + "logps/rejected": -139.73486328125, + "loss": 0.5022628784179688, + "rewards/chosen": 0.054727953672409055, + "rewards/margins": -0.018271952867507935, + "rewards/rejected": 0.07299990653991699, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 0.33811691403388977, + "kl": 1.448921799659729, + "learning_rate": 4.9133333333333334e-06, + "logits/chosen": 31531936.0, + "logits/rejected": 30661184.0, + "logps/chosen": -145.08800048828124, + "logps/rejected": -147.349755859375, + "loss": 0.49300565719604494, + "rewards/chosen": 0.1612391948699951, + "rewards/margins": 0.05618309974670409, + "rewards/rejected": 0.10505609512329102, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 0.5129542350769043, + "kl": 1.6933104991912842, + "learning_rate": 4.891111111111111e-06, + "logits/chosen": 42485971.2, + "logits/rejected": 42720950.4, + "logps/chosen": -167.75079345703125, + "logps/rejected": -179.53148193359374, + "loss": 0.4963071823120117, + "rewards/chosen": 0.118367600440979, + "rewards/margins": 0.03180532455444336, + "rewards/rejected": 0.08656227588653564, + "step": 300 + }, + { + "epoch": 0.24, + "eval_kl": 1.5601574182510376, + "eval_logits/chosen": 38297956.352, + "eval_logits/rejected": 38117695.488, + "eval_logps/chosen": -153.006140625, + "eval_logps/rejected": -147.429, + "eval_loss": 0.49868252873420715, + "eval_rewards/chosen": 0.13196340942382812, + "eval_rewards/margins": 0.010592102050781246, + "eval_rewards/rejected": 0.12137130737304687, + "eval_runtime": 217.0741, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 2.303, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 0.3847499489784241, + "kl": 1.3948395252227783, + "learning_rate": 4.8688888888888895e-06, + "logits/chosen": 33896211.2, + "logits/rejected": 34871568.0, + "logps/chosen": -145.9845458984375, + "logps/rejected": -154.91959228515626, + "loss": 0.5030938625335694, + "rewards/chosen": 0.08795046210289001, + "rewards/margins": -0.024919158220291143, + "rewards/rejected": 0.11286962032318115, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 0.618556797504425, + "kl": 0.49630022048950195, + "learning_rate": 4.846666666666667e-06, + "logits/chosen": 37342124.8, + "logits/rejected": 35182000.0, + "logps/chosen": -160.91566162109376, + "logps/rejected": -134.513427734375, + "loss": 0.5022326946258545, + "rewards/chosen": -0.059583669900894164, + "rewards/margins": -0.018618279695510866, + "rewards/rejected": -0.0409653902053833, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 0.382318377494812, + "kl": 0.8811993598937988, + "learning_rate": 4.824444444444445e-06, + "logits/chosen": 46995257.6, + "logits/rejected": 44221206.4, + "logps/chosen": -153.2612060546875, + "logps/rejected": -144.4525634765625, + "loss": 0.4899014949798584, + "rewards/chosen": 0.058102655410766604, + "rewards/margins": 0.08179453760385513, + "rewards/rejected": -0.023691882193088532, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 0.4012068510055542, + "kl": 0.9655236005783081, + "learning_rate": 4.802222222222222e-06, + "logits/chosen": 39877590.4, + "logits/rejected": 40850240.0, + "logps/chosen": -134.43511962890625, + "logps/rejected": -143.74300537109374, + "loss": 0.5008483409881592, + "rewards/chosen": 0.046630316972732545, + "rewards/margins": -0.009023183584213258, + "rewards/rejected": 0.055653500556945804, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 0.4055842161178589, + "kl": 1.7407032251358032, + "learning_rate": 4.78e-06, + "logits/chosen": 37863616.0, + "logits/rejected": 36761936.0, + "logps/chosen": -133.8212646484375, + "logps/rejected": -169.326318359375, + "loss": 0.5016643524169921, + "rewards/chosen": 0.14738692045211793, + "rewards/margins": -0.01331337690353393, + "rewards/rejected": 0.16070029735565186, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 0.4029492139816284, + "kl": 1.1468133926391602, + "learning_rate": 4.7577777777777784e-06, + "logits/chosen": 41317878.4, + "logits/rejected": 38904140.8, + "logps/chosen": -147.33363037109376, + "logps/rejected": -112.39573974609375, + "loss": 0.49462456703186036, + "rewards/chosen": 0.09142228960990906, + "rewards/margins": 0.04296924769878387, + "rewards/rejected": 0.048453041911125184, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 0.39963042736053467, + "kl": 1.3335682153701782, + "learning_rate": 4.735555555555556e-06, + "logits/chosen": 38361622.4, + "logits/rejected": 38506108.8, + "logps/chosen": -146.35006103515624, + "logps/rejected": -150.335205078125, + "loss": 0.5048986434936523, + "rewards/chosen": 0.06511063575744629, + "rewards/margins": -0.04016592502593995, + "rewards/rejected": 0.10527656078338624, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 0.5386641025543213, + "kl": 1.9048980474472046, + "learning_rate": 4.713333333333334e-06, + "logits/chosen": 34626476.8, + "logits/rejected": 35537760.0, + "logps/chosen": -154.9567626953125, + "logps/rejected": -166.59052734375, + "loss": 0.5030035495758056, + "rewards/chosen": 0.13892955780029298, + "rewards/margins": -0.02978687286376952, + "rewards/rejected": 0.1687164306640625, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 0.3963494896888733, + "kl": 1.7998809814453125, + "learning_rate": 4.691111111111111e-06, + "logits/chosen": 31470185.6, + "logits/rejected": 30747776.0, + "logps/chosen": -174.68343505859374, + "logps/rejected": -149.20201416015624, + "loss": 0.4925515174865723, + "rewards/chosen": 0.17315468788146973, + "rewards/margins": 0.05992317199707031, + "rewards/rejected": 0.11323151588439942, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 0.40272921323776245, + "kl": 1.6665403842926025, + "learning_rate": 4.66888888888889e-06, + "logits/chosen": 43372483.2, + "logits/rejected": 41547449.6, + "logps/chosen": -148.32398681640626, + "logps/rejected": -134.78739013671876, + "loss": 0.49486651420593264, + "rewards/chosen": 0.149322509765625, + "rewards/margins": 0.040551638603210455, + "rewards/rejected": 0.10877087116241455, + "step": 400 + }, + { + "epoch": 0.32, + "eval_kl": 1.792982578277588, + "eval_logits/chosen": 38918168.576, + "eval_logits/rejected": 38725652.48, + "eval_logps/chosen": -152.730328125, + "eval_logps/rejected": -147.293078125, + "eval_loss": 0.4969332814216614, + "eval_rewards/chosen": 0.15954458618164064, + "eval_rewards/margins": 0.024580596923828135, + "eval_rewards/rejected": 0.1349639892578125, + "eval_runtime": 216.6464, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 0.3303642272949219, + "kl": 2.137221336364746, + "learning_rate": 4.646666666666667e-06, + "logits/chosen": 43939001.6, + "logits/rejected": 41818220.8, + "logps/chosen": -146.33731689453126, + "logps/rejected": -147.7433349609375, + "loss": 0.4917384147644043, + "rewards/chosen": 0.22513296604156494, + "rewards/margins": 0.06730514764785767, + "rewards/rejected": 0.15782781839370727, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 0.5785346031188965, + "kl": 1.536816120147705, + "learning_rate": 4.624444444444445e-06, + "logits/chosen": 34265174.4, + "logits/rejected": 32297750.4, + "logps/chosen": -161.92572021484375, + "logps/rejected": -130.8744384765625, + "loss": 0.4967160701751709, + "rewards/chosen": 0.12509127855300903, + "rewards/margins": 0.025565683841705322, + "rewards/rejected": 0.0995255947113037, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 0.39299893379211426, + "kl": 2.4938416481018066, + "learning_rate": 4.602222222222223e-06, + "logits/chosen": 37429766.4, + "logits/rejected": 33713158.4, + "logps/chosen": -168.366845703125, + "logps/rejected": -117.99913330078125, + "loss": 0.4935178279876709, + "rewards/chosen": 0.2566863536834717, + "rewards/margins": 0.051660680770874046, + "rewards/rejected": 0.20502567291259766, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 0.6378316879272461, + "kl": 3.6217243671417236, + "learning_rate": 4.58e-06, + "logits/chosen": 43531513.6, + "logits/rejected": 45458550.4, + "logps/chosen": -145.77152099609376, + "logps/rejected": -166.725390625, + "loss": 0.5008945465087891, + "rewards/chosen": 0.3571479320526123, + "rewards/margins": -0.007279539108276389, + "rewards/rejected": 0.3644274711608887, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 0.38800859451293945, + "kl": 3.8835651874542236, + "learning_rate": 4.557777777777778e-06, + "logits/chosen": 35328048.0, + "logits/rejected": 38813721.6, + "logps/chosen": -95.403271484375, + "logps/rejected": -151.7120849609375, + "loss": 0.50106782913208, + "rewards/chosen": 0.38196592330932616, + "rewards/margins": -0.00870509147644044, + "rewards/rejected": 0.3906710147857666, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 0.454421728849411, + "kl": 4.79476261138916, + "learning_rate": 4.535555555555555e-06, + "logits/chosen": 47006140.8, + "logits/rejected": 45068256.0, + "logps/chosen": -160.01910400390625, + "logps/rejected": -175.923046875, + "loss": 0.5052647590637207, + "rewards/chosen": 0.45828795433044434, + "rewards/margins": -0.042376470565795854, + "rewards/rejected": 0.5006644248962402, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 0.846814751625061, + "kl": 3.439274311065674, + "learning_rate": 4.513333333333333e-06, + "logits/chosen": 55978662.4, + "logits/rejected": 53112982.4, + "logps/chosen": -170.11988525390626, + "logps/rejected": -174.0812744140625, + "loss": 0.4981950283050537, + "rewards/chosen": 0.3498707294464111, + "rewards/margins": 0.014589142799377453, + "rewards/rejected": 0.33528158664703367, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 0.46414715051651, + "kl": 2.857430934906006, + "learning_rate": 4.4911111111111115e-06, + "logits/chosen": 44121936.0, + "logits/rejected": 43484160.0, + "logps/chosen": -149.39083251953124, + "logps/rejected": -159.20223388671874, + "loss": 0.4906013011932373, + "rewards/chosen": 0.3135632276535034, + "rewards/margins": 0.0760336399078369, + "rewards/rejected": 0.23752958774566652, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 0.31783437728881836, + "kl": 2.6989314556121826, + "learning_rate": 4.468888888888889e-06, + "logits/chosen": 29722166.4, + "logits/rejected": 27615270.4, + "logps/chosen": -146.17584228515625, + "logps/rejected": -145.2918212890625, + "loss": 0.4850144863128662, + "rewards/chosen": 0.30660200119018555, + "rewards/margins": 0.12175897359848023, + "rewards/rejected": 0.18484302759170532, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 0.4939253032207489, + "kl": 4.809385299682617, + "learning_rate": 4.446666666666667e-06, + "logits/chosen": 45245225.6, + "logits/rejected": 41364572.8, + "logps/chosen": -177.41658935546874, + "logps/rejected": -128.30506591796876, + "loss": 0.48539199829101565, + "rewards/chosen": 0.5390491962432862, + "rewards/margins": 0.1182609081268311, + "rewards/rejected": 0.4207882881164551, + "step": 500 + }, + { + "epoch": 0.4, + "eval_kl": 4.020763874053955, + "eval_logits/chosen": 40241844.224, + "eval_logits/rejected": 39968325.632, + "eval_logps/chosen": -150.122734375, + "eval_logps/rejected": -144.86325, + "eval_loss": 0.4947924017906189, + "eval_rewards/chosen": 0.4203052978515625, + "eval_rewards/margins": 0.042358032226562536, + "eval_rewards/rejected": 0.377947265625, + "eval_runtime": 216.7408, + "eval_samples_per_second": 4.614, + "eval_steps_per_second": 2.307, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 0.414318323135376, + "kl": 3.33302640914917, + "learning_rate": 4.424444444444444e-06, + "logits/chosen": 47571436.8, + "logits/rejected": 49124124.8, + "logps/chosen": -143.7648681640625, + "logps/rejected": -157.2596435546875, + "loss": 0.4965871810913086, + "rewards/chosen": 0.34320816993713377, + "rewards/margins": 0.027533125877380327, + "rewards/rejected": 0.31567504405975344, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 0.38320228457450867, + "kl": 4.671795845031738, + "learning_rate": 4.402222222222223e-06, + "logits/chosen": 46508307.2, + "logits/rejected": 45490304.0, + "logps/chosen": -154.68175048828124, + "logps/rejected": -160.55111083984374, + "loss": 0.4931188106536865, + "rewards/chosen": 0.47988028526306153, + "rewards/margins": 0.05947685241699219, + "rewards/rejected": 0.42040343284606935, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 0.4373217821121216, + "kl": 3.6891350746154785, + "learning_rate": 4.38e-06, + "logits/chosen": 42301033.6, + "logits/rejected": 42527356.8, + "logps/chosen": -138.6637939453125, + "logps/rejected": -173.32967529296874, + "loss": 0.5058313369750976, + "rewards/chosen": 0.33178033828735354, + "rewards/margins": -0.047040796279907204, + "rewards/rejected": 0.37882113456726074, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 0.6072640419006348, + "kl": 4.442656517028809, + "learning_rate": 4.357777777777778e-06, + "logits/chosen": 34522003.2, + "logits/rejected": 34255187.2, + "logps/chosen": -147.196533203125, + "logps/rejected": -154.7218505859375, + "loss": 0.4857354640960693, + "rewards/chosen": 0.49022879600524905, + "rewards/margins": 0.11430189609527591, + "rewards/rejected": 0.37592689990997313, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 0.4359336793422699, + "kl": 3.4759514331817627, + "learning_rate": 4.3355555555555565e-06, + "logits/chosen": 41427052.8, + "logits/rejected": 42907648.0, + "logps/chosen": -152.25201416015625, + "logps/rejected": -165.486767578125, + "loss": 0.49396610260009766, + "rewards/chosen": 0.345978832244873, + "rewards/margins": 0.05630025863647459, + "rewards/rejected": 0.28967857360839844, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 0.43716976046562195, + "kl": 3.152191638946533, + "learning_rate": 4.313333333333334e-06, + "logits/chosen": 35663577.6, + "logits/rejected": 34092796.8, + "logps/chosen": -161.36358642578125, + "logps/rejected": -148.691259765625, + "loss": 0.49653072357177735, + "rewards/chosen": 0.27045164108276365, + "rewards/margins": 0.027580332756042464, + "rewards/rejected": 0.24287130832672119, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 0.387523889541626, + "kl": 2.6372287273406982, + "learning_rate": 4.291111111111112e-06, + "logits/chosen": 41382582.4, + "logits/rejected": 40126329.6, + "logps/chosen": -147.36217041015624, + "logps/rejected": -132.27440185546874, + "loss": 0.48532447814941404, + "rewards/chosen": 0.2873492479324341, + "rewards/margins": 0.12220915555953982, + "rewards/rejected": 0.1651400923728943, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 0.4191218316555023, + "kl": 3.158555507659912, + "learning_rate": 4.268888888888889e-06, + "logits/chosen": 47333145.6, + "logits/rejected": 46272729.6, + "logps/chosen": -147.29794921875, + "logps/rejected": -157.137255859375, + "loss": 0.4924956798553467, + "rewards/chosen": 0.26015233993530273, + "rewards/margins": 0.0591968059539795, + "rewards/rejected": 0.20095553398132324, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 0.4541790783405304, + "kl": 3.111989974975586, + "learning_rate": 4.246666666666667e-06, + "logits/chosen": 29866240.0, + "logits/rejected": 30473120.0, + "logps/chosen": -128.02447509765625, + "logps/rejected": -133.55704345703126, + "loss": 0.4842988967895508, + "rewards/chosen": 0.3343390941619873, + "rewards/margins": 0.12930448055267335, + "rewards/rejected": 0.20503461360931396, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 0.565047025680542, + "kl": 2.7821693420410156, + "learning_rate": 4.2244444444444446e-06, + "logits/chosen": 28686153.6, + "logits/rejected": 31275964.8, + "logps/chosen": -147.65833740234376, + "logps/rejected": -149.1046630859375, + "loss": 0.5113170146942139, + "rewards/chosen": 0.13800346851348877, + "rewards/margins": -0.09322352409362794, + "rewards/rejected": 0.2312269926071167, + "step": 600 + }, + { + "epoch": 0.48, + "eval_kl": 2.7389280796051025, + "eval_logits/chosen": 38005252.096, + "eval_logits/rejected": 37846036.48, + "eval_logps/chosen": -152.185953125, + "eval_logps/rejected": -146.89521875, + "eval_loss": 0.4950157403945923, + "eval_rewards/chosen": 0.21398320007324217, + "eval_rewards/margins": 0.03923100280761718, + "eval_rewards/rejected": 0.174752197265625, + "eval_runtime": 217.558, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 0.4400153160095215, + "kl": 3.6884047985076904, + "learning_rate": 4.202222222222222e-06, + "logits/chosen": 45485420.8, + "logits/rejected": 41585961.6, + "logps/chosen": -193.09393310546875, + "logps/rejected": -168.6018310546875, + "loss": 0.47965612411499026, + "rewards/chosen": 0.346639347076416, + "rewards/margins": 0.19525065422058108, + "rewards/rejected": 0.15138869285583495, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 0.47579634189605713, + "kl": 2.8855841159820557, + "learning_rate": 4.18e-06, + "logits/chosen": 26868339.2, + "logits/rejected": 25530107.2, + "logps/chosen": -139.2341552734375, + "logps/rejected": -135.46981201171874, + "loss": 0.48549280166625974, + "rewards/chosen": 0.2853414058685303, + "rewards/margins": 0.12018097639083863, + "rewards/rejected": 0.16516042947769166, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 0.4894777536392212, + "kl": 3.817617893218994, + "learning_rate": 4.157777777777778e-06, + "logits/chosen": 38511724.8, + "logits/rejected": 41062003.2, + "logps/chosen": -139.717919921875, + "logps/rejected": -172.0173828125, + "loss": 0.5030189037322998, + "rewards/chosen": 0.3242809772491455, + "rewards/margins": -0.024120402336120617, + "rewards/rejected": 0.3484013795852661, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 0.5884784460067749, + "kl": 3.3272690773010254, + "learning_rate": 4.135555555555556e-06, + "logits/chosen": 40902281.6, + "logits/rejected": 39306883.2, + "logps/chosen": -189.29173583984374, + "logps/rejected": -149.8311279296875, + "loss": 0.4905113220214844, + "rewards/chosen": 0.27348809242248534, + "rewards/margins": 0.07974576950073242, + "rewards/rejected": 0.19374232292175292, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 0.489397794008255, + "kl": 4.352996349334717, + "learning_rate": 4.1133333333333335e-06, + "logits/chosen": 43261625.6, + "logits/rejected": 41635296.0, + "logps/chosen": -125.68609619140625, + "logps/rejected": -132.724267578125, + "loss": 0.49439477920532227, + "rewards/chosen": 0.3973216533660889, + "rewards/margins": 0.04381968975067141, + "rewards/rejected": 0.35350196361541747, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 0.36593517661094666, + "kl": 3.356546401977539, + "learning_rate": 4.091111111111111e-06, + "logits/chosen": 56701203.2, + "logits/rejected": 55284249.6, + "logps/chosen": -168.565625, + "logps/rejected": -132.68575439453124, + "loss": 0.48746094703674314, + "rewards/chosen": 0.3356959581375122, + "rewards/margins": 0.09914519786834713, + "rewards/rejected": 0.23655076026916505, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 0.47609221935272217, + "kl": 3.9726402759552, + "learning_rate": 4.0688888888888896e-06, + "logits/chosen": 42420092.8, + "logits/rejected": 42645120.0, + "logps/chosen": -181.13988037109374, + "logps/rejected": -183.315185546875, + "loss": 0.4789764881134033, + "rewards/chosen": 0.3791257381439209, + "rewards/margins": 0.18321629762649538, + "rewards/rejected": 0.19590944051742554, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 0.479322224855423, + "kl": 2.822577953338623, + "learning_rate": 4.046666666666667e-06, + "logits/chosen": 41487219.2, + "logits/rejected": 40422083.2, + "logps/chosen": -144.38018798828125, + "logps/rejected": -126.14996337890625, + "loss": 0.49282026290893555, + "rewards/chosen": 0.22584574222564696, + "rewards/margins": 0.055848944187164296, + "rewards/rejected": 0.16999679803848267, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 0.3670179843902588, + "kl": 4.148089408874512, + "learning_rate": 4.024444444444445e-06, + "logits/chosen": 42715072.0, + "logits/rejected": 40623603.2, + "logps/chosen": -156.69072265625, + "logps/rejected": -175.76126708984376, + "loss": 0.4938004970550537, + "rewards/chosen": 0.39667787551879885, + "rewards/margins": 0.05049760341644288, + "rewards/rejected": 0.34618027210235597, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 0.41768333315849304, + "kl": 3.2628045082092285, + "learning_rate": 4.002222222222222e-06, + "logits/chosen": 37950355.2, + "logits/rejected": 34915990.4, + "logps/chosen": -155.3704833984375, + "logps/rejected": -143.3675537109375, + "loss": 0.49524383544921874, + "rewards/chosen": 0.2646515369415283, + "rewards/margins": 0.04396252632141112, + "rewards/rejected": 0.2206890106201172, + "step": 700 + }, + { + "epoch": 0.56, + "eval_kl": 3.236727237701416, + "eval_logits/chosen": 38033387.52, + "eval_logits/rejected": 37810647.04, + "eval_logps/chosen": -151.62878125, + "eval_logps/rejected": -146.479140625, + "eval_loss": 0.49332940578460693, + "eval_rewards/chosen": 0.269699462890625, + "eval_rewards/margins": 0.05334155273437502, + "eval_rewards/rejected": 0.21635791015625, + "eval_runtime": 218.185, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.292, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 0.40857982635498047, + "kl": 4.429306983947754, + "learning_rate": 3.980000000000001e-06, + "logits/chosen": 40884387.2, + "logits/rejected": 39080608.0, + "logps/chosen": -172.00384521484375, + "logps/rejected": -133.983837890625, + "loss": 0.4770909309387207, + "rewards/chosen": 0.518680477142334, + "rewards/margins": 0.18963458538055422, + "rewards/rejected": 0.3290458917617798, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 0.3682423233985901, + "kl": 3.0005943775177, + "learning_rate": 3.9577777777777785e-06, + "logits/chosen": 33681516.8, + "logits/rejected": 34946268.8, + "logps/chosen": -120.9507080078125, + "logps/rejected": -123.20738525390625, + "loss": 0.5031956672668457, + "rewards/chosen": 0.11274595260620117, + "rewards/margins": -0.05625311136245727, + "rewards/rejected": 0.16899906396865844, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 0.37147852778434753, + "kl": 3.7080981731414795, + "learning_rate": 3.935555555555556e-06, + "logits/chosen": 36905440.0, + "logits/rejected": 34017891.2, + "logps/chosen": -152.36019287109374, + "logps/rejected": -154.4943115234375, + "loss": 0.49049901962280273, + "rewards/chosen": 0.3831493616104126, + "rewards/margins": 0.07779901027679442, + "rewards/rejected": 0.30535035133361815, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 0.48657310009002686, + "kl": 4.199930191040039, + "learning_rate": 3.913333333333334e-06, + "logits/chosen": 34702265.6, + "logits/rejected": 33570732.8, + "logps/chosen": -156.0489501953125, + "logps/rejected": -182.53206787109374, + "loss": 0.4937909603118896, + "rewards/chosen": 0.3453744649887085, + "rewards/margins": 0.06571738719940184, + "rewards/rejected": 0.27965707778930665, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 0.2791324257850647, + "kl": 3.1397013664245605, + "learning_rate": 3.891111111111111e-06, + "logits/chosen": 38985750.4, + "logits/rejected": 38637244.8, + "logps/chosen": -159.29842529296874, + "logps/rejected": -183.90196533203124, + "loss": 0.5016417980194092, + "rewards/chosen": 0.11977872848510743, + "rewards/margins": 0.02148157954216004, + "rewards/rejected": 0.09829714894294739, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 0.4719991683959961, + "kl": 1.940172553062439, + "learning_rate": 3.868888888888889e-06, + "logits/chosen": 36901945.6, + "logits/rejected": 37816726.4, + "logps/chosen": -115.81767578125, + "logps/rejected": -124.9832763671875, + "loss": 0.48836345672607423, + "rewards/chosen": 0.1182823657989502, + "rewards/margins": 0.09493236243724823, + "rewards/rejected": 0.023350003361701965, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 0.46570995450019836, + "kl": 1.71030592918396, + "learning_rate": 3.8466666666666665e-06, + "logits/chosen": 45056902.4, + "logits/rejected": 45886822.4, + "logps/chosen": -142.8737060546875, + "logps/rejected": -152.980029296875, + "loss": 0.4849842071533203, + "rewards/chosen": 0.044194817543029785, + "rewards/margins": 0.13042356967926025, + "rewards/rejected": -0.08622875213623046, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 0.48490992188453674, + "kl": 0.8850091695785522, + "learning_rate": 3.824444444444444e-06, + "logits/chosen": 41897849.6, + "logits/rejected": 42659980.8, + "logps/chosen": -163.0940673828125, + "logps/rejected": -145.29967041015624, + "loss": 0.4809276103973389, + "rewards/chosen": -0.029845520853996277, + "rewards/margins": 0.16276139318943023, + "rewards/rejected": -0.1926069140434265, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 0.42963773012161255, + "kl": 1.2656173706054688, + "learning_rate": 3.8022222222222226e-06, + "logits/chosen": 31615977.6, + "logits/rejected": 27643244.8, + "logps/chosen": -146.9314453125, + "logps/rejected": -139.28326416015625, + "loss": 0.4754499912261963, + "rewards/chosen": -0.14652204513549805, + "rewards/margins": 0.23166158199310305, + "rewards/rejected": -0.3781836271286011, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 0.5815227627754211, + "kl": 1.1006227731704712, + "learning_rate": 3.7800000000000002e-06, + "logits/chosen": 28676291.2, + "logits/rejected": 28398739.2, + "logps/chosen": -149.7789306640625, + "logps/rejected": -134.99962158203124, + "loss": 0.5007228374481201, + "rewards/chosen": -0.2438833236694336, + "rewards/margins": -0.03309731483459474, + "rewards/rejected": -0.21078600883483886, + "step": 800 + }, + { + "epoch": 0.64, + "eval_kl": 1.4775981903076172, + "eval_logits/chosen": 34909265.92, + "eval_logits/rejected": 34874159.104, + "eval_logps/chosen": -156.25446875, + "eval_logps/rejected": -151.0355, + "eval_loss": 0.4928078353404999, + "eval_rewards/chosen": -0.19286886596679687, + "eval_rewards/margins": 0.04640672302246096, + "eval_rewards/rejected": -0.23927558898925783, + "eval_runtime": 217.2245, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 2.302, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 0.518290102481842, + "kl": 2.0555756092071533, + "learning_rate": 3.757777777777778e-06, + "logits/chosen": 39001305.6, + "logits/rejected": 39306675.2, + "logps/chosen": -159.09794921875, + "logps/rejected": -157.2656982421875, + "loss": 0.48754167556762695, + "rewards/chosen": 0.07401522397994995, + "rewards/margins": 0.1124086320400238, + "rewards/rejected": -0.03839340806007385, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 0.4529527425765991, + "kl": 1.2279353141784668, + "learning_rate": 3.7355555555555555e-06, + "logits/chosen": 24645734.4, + "logits/rejected": 24314422.4, + "logps/chosen": -157.53839111328125, + "logps/rejected": -129.3763427734375, + "loss": 0.5034448146820069, + "rewards/chosen": -0.31603260040283204, + "rewards/margins": -0.04990806579589846, + "rewards/rejected": -0.2661245346069336, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 0.5347335934638977, + "kl": 2.0512425899505615, + "learning_rate": 3.713333333333334e-06, + "logits/chosen": 31296140.8, + "logits/rejected": 29981024.0, + "logps/chosen": -160.424072265625, + "logps/rejected": -127.09144287109375, + "loss": 0.49833097457885744, + "rewards/chosen": -0.05236924290657043, + "rewards/margins": 0.004509323835372926, + "rewards/rejected": -0.05687856674194336, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 0.4843555688858032, + "kl": 1.5582542419433594, + "learning_rate": 3.6911111111111115e-06, + "logits/chosen": 39700444.8, + "logits/rejected": 39422995.2, + "logps/chosen": -151.80198974609374, + "logps/rejected": -151.66746826171874, + "loss": 0.4924652099609375, + "rewards/chosen": -0.040176278352737425, + "rewards/margins": 0.038288170099258424, + "rewards/rejected": -0.07846444845199585, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 0.5929602384567261, + "kl": 2.925257921218872, + "learning_rate": 3.668888888888889e-06, + "logits/chosen": 39676166.4, + "logits/rejected": 38709782.4, + "logps/chosen": -169.22021484375, + "logps/rejected": -189.6208251953125, + "loss": 0.507749605178833, + "rewards/chosen": -0.05675660371780396, + "rewards/margins": -0.11356353759765625, + "rewards/rejected": 0.05680693387985229, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 0.47086119651794434, + "kl": 2.162543296813965, + "learning_rate": 3.6466666666666668e-06, + "logits/chosen": 31780547.2, + "logits/rejected": 29934672.0, + "logps/chosen": -165.53729248046875, + "logps/rejected": -144.7294921875, + "loss": 0.48428568840026853, + "rewards/chosen": -0.0063173860311508175, + "rewards/margins": 0.12325810492038727, + "rewards/rejected": -0.12957549095153809, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 0.5226603150367737, + "kl": 1.0791276693344116, + "learning_rate": 3.624444444444445e-06, + "logits/chosen": 36146592.0, + "logits/rejected": 34014483.2, + "logps/chosen": -139.47996826171874, + "logps/rejected": -147.29366455078124, + "loss": 0.4861104965209961, + "rewards/chosen": -0.3762362003326416, + "rewards/margins": 0.0545970916748047, + "rewards/rejected": -0.4308332920074463, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 0.456878662109375, + "kl": 1.0787068605422974, + "learning_rate": 3.6022222222222224e-06, + "logits/chosen": 31733673.6, + "logits/rejected": 30545894.4, + "logps/chosen": -166.9651123046875, + "logps/rejected": -136.55260009765624, + "loss": 0.4926890850067139, + "rewards/chosen": -0.2618767499923706, + "rewards/margins": 0.09881234169006348, + "rewards/rejected": -0.36068909168243407, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 0.39478904008865356, + "kl": 0.7077828645706177, + "learning_rate": 3.58e-06, + "logits/chosen": 42203872.0, + "logits/rejected": 40975161.6, + "logps/chosen": -135.72105712890624, + "logps/rejected": -136.8107421875, + "loss": 0.4823348045349121, + "rewards/chosen": -0.3193112850189209, + "rewards/margins": 0.17958507537841795, + "rewards/rejected": -0.49889636039733887, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 0.4868822395801544, + "kl": 1.0201635360717773, + "learning_rate": 3.5577777777777785e-06, + "logits/chosen": 37633750.4, + "logits/rejected": 38356057.6, + "logps/chosen": -176.8873779296875, + "logps/rejected": -164.86949462890624, + "loss": 0.49372262954711915, + "rewards/chosen": -0.5601509571075439, + "rewards/margins": -0.027371644973754883, + "rewards/rejected": -0.532779312133789, + "step": 900 + }, + { + "epoch": 0.72, + "eval_kl": 0.7019873857498169, + "eval_logits/chosen": 32590643.2, + "eval_logits/rejected": 32688842.752, + "eval_logps/chosen": -159.868109375, + "eval_logps/rejected": -154.68375, + "eval_loss": 0.49165070056915283, + "eval_rewards/chosen": -0.5542342529296875, + "eval_rewards/margins": 0.049869018554687505, + "eval_rewards/rejected": -0.604103271484375, + "eval_runtime": 218.2133, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.291, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 0.4481786787509918, + "kl": 0.8257962465286255, + "learning_rate": 3.535555555555556e-06, + "logits/chosen": 32320790.4, + "logits/rejected": 32438003.2, + "logps/chosen": -168.3318603515625, + "logps/rejected": -174.481884765625, + "loss": 0.4958535671234131, + "rewards/chosen": -0.454105281829834, + "rewards/margins": 0.1731292247772217, + "rewards/rejected": -0.6272345066070557, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 0.41489994525909424, + "kl": 0.9689595103263855, + "learning_rate": 3.5133333333333337e-06, + "logits/chosen": 33867084.8, + "logits/rejected": 31248268.8, + "logps/chosen": -136.1980224609375, + "logps/rejected": -151.76387939453124, + "loss": 0.4944427490234375, + "rewards/chosen": -0.40467538833618166, + "rewards/margins": 0.14626178741455076, + "rewards/rejected": -0.5509371757507324, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 0.460254430770874, + "kl": 1.2024842500686646, + "learning_rate": 3.4911111111111113e-06, + "logits/chosen": 32133318.4, + "logits/rejected": 32185379.2, + "logps/chosen": -155.15115966796876, + "logps/rejected": -149.83077392578124, + "loss": 0.48492116928100587, + "rewards/chosen": -0.21543638706207274, + "rewards/margins": 0.15435693264007572, + "rewards/rejected": -0.36979331970214846, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 0.34393665194511414, + "kl": 1.7038171291351318, + "learning_rate": 3.4688888888888894e-06, + "logits/chosen": 27802694.4, + "logits/rejected": 25992144.0, + "logps/chosen": -136.5869140625, + "logps/rejected": -152.3591064453125, + "loss": 0.482952356338501, + "rewards/chosen": -0.24357478618621825, + "rewards/margins": 0.1968345880508423, + "rewards/rejected": -0.44040937423706056, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 0.3970324397087097, + "kl": 2.81396484375, + "learning_rate": 3.446666666666667e-06, + "logits/chosen": 40630716.8, + "logits/rejected": 43665993.6, + "logps/chosen": -184.17490234375, + "logps/rejected": -158.86982421875, + "loss": 0.48198614120483396, + "rewards/chosen": -0.03642080426216125, + "rewards/margins": 0.11629058718681336, + "rewards/rejected": -0.1527113914489746, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 0.5192223787307739, + "kl": 2.0976433753967285, + "learning_rate": 3.4244444444444446e-06, + "logits/chosen": 37736128.0, + "logits/rejected": 39264816.0, + "logps/chosen": -137.5658203125, + "logps/rejected": -150.39862060546875, + "loss": 0.5065433502197265, + "rewards/chosen": -0.08580412864685058, + "rewards/margins": -0.038575989007949826, + "rewards/rejected": -0.047228139638900754, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 0.4041205942630768, + "kl": 1.893617033958435, + "learning_rate": 3.4022222222222222e-06, + "logits/chosen": 32113164.8, + "logits/rejected": 30382905.6, + "logps/chosen": -135.3074462890625, + "logps/rejected": -119.05921630859375, + "loss": 0.4857178688049316, + "rewards/chosen": -0.022751623392105104, + "rewards/margins": 0.15494421124458313, + "rewards/rejected": -0.17769583463668823, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 0.6198734641075134, + "kl": 3.3098182678222656, + "learning_rate": 3.3800000000000007e-06, + "logits/chosen": 39511980.8, + "logits/rejected": 39711251.2, + "logps/chosen": -138.0878662109375, + "logps/rejected": -157.733740234375, + "loss": 0.4792346000671387, + "rewards/chosen": 0.2472997188568115, + "rewards/margins": 0.20102626085281372, + "rewards/rejected": 0.046273458003997806, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 0.5357170104980469, + "kl": 1.1110466718673706, + "learning_rate": 3.3577777777777783e-06, + "logits/chosen": 31936358.4, + "logits/rejected": 31202771.2, + "logps/chosen": -127.25572509765625, + "logps/rejected": -136.280859375, + "loss": 0.4806090831756592, + "rewards/chosen": -0.14191631078720093, + "rewards/margins": 0.16535891294479368, + "rewards/rejected": -0.3072752237319946, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 0.351481556892395, + "kl": 1.9038498401641846, + "learning_rate": 3.335555555555556e-06, + "logits/chosen": 31806704.0, + "logits/rejected": 32803180.8, + "logps/chosen": -159.603076171875, + "logps/rejected": -122.312548828125, + "loss": 0.492017126083374, + "rewards/chosen": -0.0802042841911316, + "rewards/margins": 0.04814127683639527, + "rewards/rejected": -0.12834556102752687, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_kl": 1.7515510320663452, + "eval_logits/chosen": 34055507.968, + "eval_logits/rejected": 34077693.952, + "eval_logps/chosen": -156.563640625, + "eval_logps/rejected": -151.51715625, + "eval_loss": 0.49078983068466187, + "eval_rewards/chosen": -0.223786865234375, + "eval_rewards/margins": 0.06365646362304686, + "eval_rewards/rejected": -0.28744332885742185, + "eval_runtime": 216.801, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 0.5972615480422974, + "kl": 2.554426431655884, + "learning_rate": 3.3133333333333335e-06, + "logits/chosen": 27570451.2, + "logits/rejected": 30221734.4, + "logps/chosen": -138.31341552734375, + "logps/rejected": -188.19471435546876, + "loss": 0.49091529846191406, + "rewards/chosen": -0.16074006557464598, + "rewards/margins": 0.05920815467834473, + "rewards/rejected": -0.2199482202529907, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 0.5644449591636658, + "kl": 2.1006593704223633, + "learning_rate": 3.2911111111111116e-06, + "logits/chosen": 28881091.2, + "logits/rejected": 27956883.2, + "logps/chosen": -154.400244140625, + "logps/rejected": -139.5636474609375, + "loss": 0.4885709762573242, + "rewards/chosen": -0.201019549369812, + "rewards/margins": 0.14239611625671386, + "rewards/rejected": -0.34341566562652587, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 0.45909127593040466, + "kl": 2.598879337310791, + "learning_rate": 3.268888888888889e-06, + "logits/chosen": 34286569.6, + "logits/rejected": 33405510.4, + "logps/chosen": -155.141162109375, + "logps/rejected": -153.61441650390626, + "loss": 0.47780580520629884, + "rewards/chosen": -0.026760125160217287, + "rewards/margins": 0.17421259880065917, + "rewards/rejected": -0.20097272396087645, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 0.5554538369178772, + "kl": 2.448212146759033, + "learning_rate": 3.2466666666666668e-06, + "logits/chosen": 27163843.2, + "logits/rejected": 26525179.2, + "logps/chosen": -192.26566162109376, + "logps/rejected": -115.9719970703125, + "loss": 0.5046756744384766, + "rewards/chosen": -0.2157804250717163, + "rewards/margins": -0.09053788185119627, + "rewards/rejected": -0.12524254322052003, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 0.535012423992157, + "kl": 1.9927467107772827, + "learning_rate": 3.2244444444444444e-06, + "logits/chosen": 29665126.4, + "logits/rejected": 27342956.8, + "logps/chosen": -129.9255859375, + "logps/rejected": -160.8191162109375, + "loss": 0.4967525005340576, + "rewards/chosen": -0.1964455485343933, + "rewards/margins": 0.04946266412734987, + "rewards/rejected": -0.24590821266174318, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 0.4275314509868622, + "kl": 1.8325145244598389, + "learning_rate": 3.202222222222223e-06, + "logits/chosen": 38900185.6, + "logits/rejected": 36465756.8, + "logps/chosen": -173.99815673828124, + "logps/rejected": -189.41026611328124, + "loss": 0.5163179874420166, + "rewards/chosen": -0.341221284866333, + "rewards/margins": -0.10804271697998047, + "rewards/rejected": -0.23317856788635255, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 0.6077404022216797, + "kl": 1.2542213201522827, + "learning_rate": 3.1800000000000005e-06, + "logits/chosen": 30413292.8, + "logits/rejected": 31516124.8, + "logps/chosen": -146.35406494140625, + "logps/rejected": -171.661767578125, + "loss": 0.4878209590911865, + "rewards/chosen": -0.4057271957397461, + "rewards/margins": 0.26455159187316896, + "rewards/rejected": -0.670278787612915, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 0.3333641290664673, + "kl": 0.8504716753959656, + "learning_rate": 3.157777777777778e-06, + "logits/chosen": 33478700.8, + "logits/rejected": 35287001.6, + "logps/chosen": -159.8537109375, + "logps/rejected": -140.7849609375, + "loss": 0.5070839405059815, + "rewards/chosen": -0.5834408283233643, + "rewards/margins": -0.13464021682739263, + "rewards/rejected": -0.44880061149597167, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 0.5745656490325928, + "kl": 0.5619686245918274, + "learning_rate": 3.1355555555555557e-06, + "logits/chosen": 25375937.6, + "logits/rejected": 24836934.4, + "logps/chosen": -150.52469482421876, + "logps/rejected": -119.2016357421875, + "loss": 0.5091001033782959, + "rewards/chosen": -0.6672951698303222, + "rewards/margins": -0.07665328979492181, + "rewards/rejected": -0.5906418800354004, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 0.5211692452430725, + "kl": 0.7837439775466919, + "learning_rate": 3.1133333333333337e-06, + "logits/chosen": 32551961.6, + "logits/rejected": 31246240.0, + "logps/chosen": -141.6515625, + "logps/rejected": -178.54500732421874, + "loss": 0.4797823429107666, + "rewards/chosen": -0.5657515525817871, + "rewards/margins": 0.2927797317504883, + "rewards/rejected": -0.8585312843322754, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_kl": 1.0299346446990967, + "eval_logits/chosen": 32200159.232, + "eval_logits/rejected": 32317042.688, + "eval_logps/chosen": -159.071015625, + "eval_logps/rejected": -154.04371875, + "eval_loss": 0.4902701675891876, + "eval_rewards/chosen": -0.4745252380371094, + "eval_rewards/margins": 0.0655753479003906, + "eval_rewards/rejected": -0.5401005859375, + "eval_runtime": 218.2058, + "eval_samples_per_second": 4.583, + "eval_steps_per_second": 2.291, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 0.5113421678543091, + "kl": 0.8989810943603516, + "learning_rate": 3.0911111111111114e-06, + "logits/chosen": 35005900.8, + "logits/rejected": 37000198.4, + "logps/chosen": -121.57982177734375, + "logps/rejected": -188.24630126953124, + "loss": 0.5006334781646729, + "rewards/chosen": -0.5451927661895752, + "rewards/margins": -0.006183815002441473, + "rewards/rejected": -0.5390089511871338, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 0.5302984714508057, + "kl": 1.4579006433486938, + "learning_rate": 3.068888888888889e-06, + "logits/chosen": 44859440.0, + "logits/rejected": 44371523.2, + "logps/chosen": -152.48531494140624, + "logps/rejected": -166.46834716796874, + "loss": 0.47098937034606936, + "rewards/chosen": -0.30156469345092773, + "rewards/margins": 0.31780052185058594, + "rewards/rejected": -0.6193652153015137, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 0.513234555721283, + "kl": 1.1729340553283691, + "learning_rate": 3.0466666666666666e-06, + "logits/chosen": 34804921.6, + "logits/rejected": 33861536.0, + "logps/chosen": -184.123681640625, + "logps/rejected": -151.5656005859375, + "loss": 0.48264274597167967, + "rewards/chosen": -0.47753205299377444, + "rewards/margins": 0.19833426475524896, + "rewards/rejected": -0.6758663177490234, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 0.3625507354736328, + "kl": 1.0059670209884644, + "learning_rate": 3.024444444444445e-06, + "logits/chosen": 21122052.8, + "logits/rejected": 21075241.6, + "logps/chosen": -109.50086669921875, + "logps/rejected": -147.45343017578125, + "loss": 0.4898237705230713, + "rewards/chosen": -0.47812538146972655, + "rewards/margins": 0.0868696689605713, + "rewards/rejected": -0.5649950504302979, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 0.45654433965682983, + "kl": 0.5379985570907593, + "learning_rate": 3.0022222222222227e-06, + "logits/chosen": 41951734.4, + "logits/rejected": 39602163.2, + "logps/chosen": -224.10224609375, + "logps/rejected": -172.74881591796876, + "loss": 0.4965188980102539, + "rewards/chosen": -1.0065871238708497, + "rewards/margins": 0.07849445343017569, + "rewards/rejected": -1.0850815773010254, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 0.4175960123538971, + "kl": 0.7397834062576294, + "learning_rate": 2.9800000000000003e-06, + "logits/chosen": 25937947.2, + "logits/rejected": 23997976.0, + "logps/chosen": -139.0525634765625, + "logps/rejected": -172.82266845703126, + "loss": 0.46753606796264646, + "rewards/chosen": -0.6469098091125488, + "rewards/margins": 0.5304314613342286, + "rewards/rejected": -1.1773412704467774, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 0.6416438221931458, + "kl": 1.0863409042358398, + "learning_rate": 2.957777777777778e-06, + "logits/chosen": 27452134.4, + "logits/rejected": 28027673.6, + "logps/chosen": -141.15223388671876, + "logps/rejected": -165.84906005859375, + "loss": 0.492138671875, + "rewards/chosen": -0.9567946434020996, + "rewards/margins": -0.08803501129150393, + "rewards/rejected": -0.8687596321105957, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 0.4831324517726898, + "kl": 0.9691106081008911, + "learning_rate": 2.935555555555556e-06, + "logits/chosen": 29437625.6, + "logits/rejected": 27526915.2, + "logps/chosen": -141.1181884765625, + "logps/rejected": -170.26187744140626, + "loss": 0.48889832496643065, + "rewards/chosen": -0.5727379322052002, + "rewards/margins": 0.27677369117736816, + "rewards/rejected": -0.8495116233825684, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 0.4612303078174591, + "kl": 0.5348154306411743, + "learning_rate": 2.9133333333333335e-06, + "logits/chosen": 29246425.6, + "logits/rejected": 24839766.4, + "logps/chosen": -184.67042236328126, + "logps/rejected": -187.88712158203126, + "loss": 0.4651207447052002, + "rewards/chosen": -0.6563633918762207, + "rewards/margins": 0.6019566535949706, + "rewards/rejected": -1.2583200454711914, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 0.4897603392601013, + "kl": 0.6600741147994995, + "learning_rate": 2.891111111111111e-06, + "logits/chosen": 29597916.8, + "logits/rejected": 28004249.6, + "logps/chosen": -146.8487060546875, + "logps/rejected": -160.24705810546874, + "loss": 0.4915929794311523, + "rewards/chosen": -0.6603847980499268, + "rewards/margins": 0.15054879188537595, + "rewards/rejected": -0.8109335899353027, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_kl": 0.6639278531074524, + "eval_logits/chosen": 30372524.032, + "eval_logits/rejected": 30547062.784, + "eval_logps/chosen": -161.525796875, + "eval_logps/rejected": -156.5376875, + "eval_loss": 0.4897628426551819, + "eval_rewards/chosen": -0.720002685546875, + "eval_rewards/margins": 0.06949468994140628, + "eval_rewards/rejected": -0.7894973754882812, + "eval_runtime": 218.5755, + "eval_samples_per_second": 4.575, + "eval_steps_per_second": 2.288, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 0.48334673047065735, + "kl": 0.7732948064804077, + "learning_rate": 2.868888888888889e-06, + "logits/chosen": 21258230.4, + "logits/rejected": 23340913.6, + "logps/chosen": -139.87337646484374, + "logps/rejected": -144.701416015625, + "loss": 0.5003955841064454, + "rewards/chosen": -0.8982287406921386, + "rewards/margins": 0.038875579833984375, + "rewards/rejected": -0.937104320526123, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 0.492876261472702, + "kl": 1.0231386423110962, + "learning_rate": 2.8466666666666672e-06, + "logits/chosen": 29408150.4, + "logits/rejected": 26554420.8, + "logps/chosen": -184.746484375, + "logps/rejected": -195.84049072265626, + "loss": 0.49897193908691406, + "rewards/chosen": -0.7237229824066163, + "rewards/margins": 0.09356503486633294, + "rewards/rejected": -0.8172880172729492, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 0.4466714560985565, + "kl": 1.3907277584075928, + "learning_rate": 2.824444444444445e-06, + "logits/chosen": 26329916.8, + "logits/rejected": 26271193.6, + "logps/chosen": -150.485302734375, + "logps/rejected": -159.1055908203125, + "loss": 0.48776721954345703, + "rewards/chosen": -0.38318867683410646, + "rewards/margins": 0.07780303955078122, + "rewards/rejected": -0.4609917163848877, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 0.4158620834350586, + "kl": 1.1157363653182983, + "learning_rate": 2.8022222222222225e-06, + "logits/chosen": 27694662.4, + "logits/rejected": 27987369.6, + "logps/chosen": -125.29927978515624, + "logps/rejected": -122.66854248046874, + "loss": 0.49160265922546387, + "rewards/chosen": -0.3763638734817505, + "rewards/margins": 0.043813061714172374, + "rewards/rejected": -0.42017693519592286, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 0.5007547736167908, + "kl": 1.0412095785140991, + "learning_rate": 2.7800000000000005e-06, + "logits/chosen": 19932609.6, + "logits/rejected": 20719350.4, + "logps/chosen": -115.5499267578125, + "logps/rejected": -133.51761474609376, + "loss": 0.4874756336212158, + "rewards/chosen": -0.3205535411834717, + "rewards/margins": 0.1453540325164795, + "rewards/rejected": -0.46590757369995117, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 0.5230170488357544, + "kl": 1.1391807794570923, + "learning_rate": 2.757777777777778e-06, + "logits/chosen": 24849233.6, + "logits/rejected": 25524444.8, + "logps/chosen": -152.53955078125, + "logps/rejected": -136.38438720703124, + "loss": 0.4610313892364502, + "rewards/chosen": -0.35785841941833496, + "rewards/margins": 0.3967602729797364, + "rewards/rejected": -0.7546186923980713, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 0.44898638129234314, + "kl": 1.0509991645812988, + "learning_rate": 2.7355555555555557e-06, + "logits/chosen": 45757600.0, + "logits/rejected": 44022054.4, + "logps/chosen": -142.66414794921874, + "logps/rejected": -156.66207275390624, + "loss": 0.48042120933532717, + "rewards/chosen": -0.2633040189743042, + "rewards/margins": 0.21041772365570066, + "rewards/rejected": -0.4737217426300049, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 0.38083699345588684, + "kl": 1.3006069660186768, + "learning_rate": 2.7133333333333333e-06, + "logits/chosen": 27252787.2, + "logits/rejected": 26214958.4, + "logps/chosen": -137.90665283203126, + "logps/rejected": -148.19869384765624, + "loss": 0.4850759029388428, + "rewards/chosen": -0.6168015956878662, + "rewards/margins": 0.1662153244018555, + "rewards/rejected": -0.7830169200897217, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 0.3815905749797821, + "kl": 1.5258519649505615, + "learning_rate": 2.6911111111111114e-06, + "logits/chosen": 36098556.8, + "logits/rejected": 37203574.4, + "logps/chosen": -146.7918212890625, + "logps/rejected": -150.63316650390624, + "loss": 0.48316545486450196, + "rewards/chosen": -0.2494358777999878, + "rewards/margins": 0.2103111505508423, + "rewards/rejected": -0.4597470283508301, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 0.45923149585723877, + "kl": 0.7379667162895203, + "learning_rate": 2.6688888888888894e-06, + "logits/chosen": 35448102.4, + "logits/rejected": 36526304.0, + "logps/chosen": -146.106689453125, + "logps/rejected": -163.4516357421875, + "loss": 0.47162642478942873, + "rewards/chosen": -0.5096414089202881, + "rewards/margins": 0.40263419151306157, + "rewards/rejected": -0.9122756004333497, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_kl": 1.307568073272705, + "eval_logits/chosen": 31229739.008, + "eval_logits/rejected": 31346253.824, + "eval_logps/chosen": -159.37909375, + "eval_logps/rejected": -154.479625, + "eval_loss": 0.4888923466205597, + "eval_rewards/chosen": -0.5053312683105469, + "eval_rewards/margins": 0.07835940551757814, + "eval_rewards/rejected": -0.583690673828125, + "eval_runtime": 218.3314, + "eval_samples_per_second": 4.58, + "eval_steps_per_second": 2.29, + "step": 1300 + }, + { + "epoch": 1.048, + "grad_norm": 0.45518290996551514, + "kl": 0.6098345518112183, + "learning_rate": 2.646666666666667e-06, + "logits/chosen": 28600944.0, + "logits/rejected": 24958267.2, + "logps/chosen": -110.2119384765625, + "logps/rejected": -123.31849365234375, + "loss": 0.4656740665435791, + "rewards/chosen": -0.44736084938049314, + "rewards/margins": 0.48334193229675293, + "rewards/rejected": -0.9307027816772461, + "step": 1310 + }, + { + "epoch": 1.056, + "grad_norm": 0.4651184678077698, + "kl": 2.3091390132904053, + "learning_rate": 2.6244444444444446e-06, + "logits/chosen": 41654179.2, + "logits/rejected": 39032467.2, + "logps/chosen": -170.0236083984375, + "logps/rejected": -183.32196044921875, + "loss": 0.48522496223449707, + "rewards/chosen": -0.43915767669677735, + "rewards/margins": 0.38542776107788085, + "rewards/rejected": -0.8245854377746582, + "step": 1320 + }, + { + "epoch": 1.064, + "grad_norm": 0.5705190896987915, + "kl": 2.221473217010498, + "learning_rate": 2.6022222222222227e-06, + "logits/chosen": 37688201.6, + "logits/rejected": 36830240.0, + "logps/chosen": -155.06568603515626, + "logps/rejected": -168.05982666015626, + "loss": 0.4541748046875, + "rewards/chosen": -0.014664022624492646, + "rewards/margins": 0.44051638394594195, + "rewards/rejected": -0.45518040657043457, + "step": 1330 + }, + { + "epoch": 1.072, + "grad_norm": 0.5728591680526733, + "kl": 2.2642099857330322, + "learning_rate": 2.5800000000000003e-06, + "logits/chosen": 36680921.6, + "logits/rejected": 36696492.8, + "logps/chosen": -169.67305908203124, + "logps/rejected": -163.4990478515625, + "loss": 0.4542436122894287, + "rewards/chosen": 0.017992374300956727, + "rewards/margins": 0.4525547713041306, + "rewards/rejected": -0.43456239700317384, + "step": 1340 + }, + { + "epoch": 1.08, + "grad_norm": 0.5952552556991577, + "kl": 2.2592105865478516, + "learning_rate": 2.557777777777778e-06, + "logits/chosen": 23545342.4, + "logits/rejected": 21279006.4, + "logps/chosen": -138.18184814453124, + "logps/rejected": -171.426416015625, + "loss": 0.47548651695251465, + "rewards/chosen": -0.38828775882720945, + "rewards/margins": 0.30096304416656494, + "rewards/rejected": -0.6892508029937744, + "step": 1350 + }, + { + "epoch": 1.088, + "grad_norm": 0.45678818225860596, + "kl": 3.0089876651763916, + "learning_rate": 2.5355555555555555e-06, + "logits/chosen": 34513894.4, + "logits/rejected": 33449926.4, + "logps/chosen": -128.89869384765626, + "logps/rejected": -136.55162353515624, + "loss": 0.4745296001434326, + "rewards/chosen": 0.16677324771881102, + "rewards/margins": 0.25756397247314455, + "rewards/rejected": -0.0907907247543335, + "step": 1360 + }, + { + "epoch": 1.096, + "grad_norm": 0.5198411345481873, + "kl": 4.8974103927612305, + "learning_rate": 2.5133333333333336e-06, + "logits/chosen": 42754617.6, + "logits/rejected": 43195552.0, + "logps/chosen": -159.500830078125, + "logps/rejected": -132.177587890625, + "loss": 0.48738694190979004, + "rewards/chosen": 0.2657592296600342, + "rewards/margins": 0.10389068126678466, + "rewards/rejected": 0.16186854839324952, + "step": 1370 + }, + { + "epoch": 1.104, + "grad_norm": 0.4035385549068451, + "kl": 4.419563293457031, + "learning_rate": 2.491111111111111e-06, + "logits/chosen": 35998966.4, + "logits/rejected": 36727424.0, + "logps/chosen": -159.00057373046874, + "logps/rejected": -138.2158935546875, + "loss": 0.4853508949279785, + "rewards/chosen": 0.31732945442199706, + "rewards/margins": 0.10866012573242187, + "rewards/rejected": 0.2086693286895752, + "step": 1380 + }, + { + "epoch": 1.112, + "grad_norm": 0.48067033290863037, + "kl": 2.848634958267212, + "learning_rate": 2.468888888888889e-06, + "logits/chosen": 39241859.2, + "logits/rejected": 40267868.8, + "logps/chosen": -141.302490234375, + "logps/rejected": -158.81297607421874, + "loss": 0.47108969688415525, + "rewards/chosen": 0.13487266302108764, + "rewards/margins": 0.21125618219375608, + "rewards/rejected": -0.07638351917266846, + "step": 1390 + }, + { + "epoch": 1.12, + "grad_norm": 0.5766560435295105, + "kl": 3.4840214252471924, + "learning_rate": 2.446666666666667e-06, + "logits/chosen": 37769334.4, + "logits/rejected": 39194985.6, + "logps/chosen": -128.72220458984376, + "logps/rejected": -119.84296875, + "loss": 0.4526535987854004, + "rewards/chosen": 0.18631891012191773, + "rewards/margins": 0.41424218416213987, + "rewards/rejected": -0.22792327404022217, + "step": 1400 + }, + { + "epoch": 1.12, + "eval_kl": 2.8408021926879883, + "eval_logits/chosen": 34422374.4, + "eval_logits/rejected": 34374045.696, + "eval_logps/chosen": -154.737546875, + "eval_logps/rejected": -149.911859375, + "eval_loss": 0.48892152309417725, + "eval_rewards/chosen": -0.04117748260498047, + "eval_rewards/margins": 0.08573676300048827, + "eval_rewards/rejected": -0.12691424560546874, + "eval_runtime": 217.9065, + "eval_samples_per_second": 4.589, + "eval_steps_per_second": 2.295, + "step": 1400 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.572714626789093, + "kl": 3.2733802795410156, + "learning_rate": 2.4244444444444444e-06, + "logits/chosen": 31887904.0, + "logits/rejected": 31864508.8, + "logps/chosen": -143.56092529296876, + "logps/rejected": -149.83736572265624, + "loss": 0.47499790191650393, + "rewards/chosen": 0.03257654905319214, + "rewards/margins": 0.27403136491775515, + "rewards/rejected": -0.241454815864563, + "step": 1410 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.41845089197158813, + "kl": 3.525341749191284, + "learning_rate": 2.4022222222222225e-06, + "logits/chosen": 34721577.6, + "logits/rejected": 34714211.2, + "logps/chosen": -135.9596923828125, + "logps/rejected": -152.25648193359376, + "loss": 0.4618217945098877, + "rewards/chosen": 0.27119529247283936, + "rewards/margins": 0.4753966093063354, + "rewards/rejected": -0.2042013168334961, + "step": 1420 + }, + { + "epoch": 1.144, + "grad_norm": 0.48100632429122925, + "kl": 4.161208152770996, + "learning_rate": 2.38e-06, + "logits/chosen": 41846537.6, + "logits/rejected": 41130585.6, + "logps/chosen": -141.89105224609375, + "logps/rejected": -170.82178955078126, + "loss": 0.46498618125915525, + "rewards/chosen": 0.3817573547363281, + "rewards/margins": 0.3280399918556213, + "rewards/rejected": 0.05371736288070679, + "step": 1430 + }, + { + "epoch": 1.152, + "grad_norm": 0.5169075131416321, + "kl": 3.003018617630005, + "learning_rate": 2.357777777777778e-06, + "logits/chosen": 39127420.8, + "logits/rejected": 39187852.8, + "logps/chosen": -121.814013671875, + "logps/rejected": -137.307861328125, + "loss": 0.46971497535705564, + "rewards/chosen": 0.28073878288269044, + "rewards/margins": 0.24493391513824464, + "rewards/rejected": 0.0358048677444458, + "step": 1440 + }, + { + "epoch": 1.16, + "grad_norm": 0.6531253457069397, + "kl": 4.080590724945068, + "learning_rate": 2.3355555555555557e-06, + "logits/chosen": 45846016.0, + "logits/rejected": 43724588.8, + "logps/chosen": -147.2013671875, + "logps/rejected": -174.9349365234375, + "loss": 0.4397883892059326, + "rewards/chosen": 0.4346614837646484, + "rewards/margins": 0.6019469738006591, + "rewards/rejected": -0.16728549003601073, + "step": 1450 + }, + { + "epoch": 1.168, + "grad_norm": 0.5618774890899658, + "kl": 2.632253885269165, + "learning_rate": 2.3133333333333333e-06, + "logits/chosen": 29858144.0, + "logits/rejected": 30215900.8, + "logps/chosen": -140.875927734375, + "logps/rejected": -135.17412109375, + "loss": 0.4855960369110107, + "rewards/chosen": -0.033119755983352664, + "rewards/margins": 0.22187880873680116, + "rewards/rejected": -0.2549985647201538, + "step": 1460 + }, + { + "epoch": 1.176, + "grad_norm": 0.5618053674697876, + "kl": 3.8460822105407715, + "learning_rate": 2.2911111111111114e-06, + "logits/chosen": 33933001.6, + "logits/rejected": 33417001.6, + "logps/chosen": -98.64682006835938, + "logps/rejected": -126.3620849609375, + "loss": 0.4694389343261719, + "rewards/chosen": 0.3202403783798218, + "rewards/margins": 0.30994352102279665, + "rewards/rejected": 0.010296857357025147, + "step": 1470 + }, + { + "epoch": 1.184, + "grad_norm": 0.49939385056495667, + "kl": 3.8765969276428223, + "learning_rate": 2.268888888888889e-06, + "logits/chosen": 45282752.0, + "logits/rejected": 45168672.0, + "logps/chosen": -182.675830078125, + "logps/rejected": -164.2170166015625, + "loss": 0.4553979396820068, + "rewards/chosen": 0.2684901714324951, + "rewards/margins": 0.3095468133687973, + "rewards/rejected": -0.041056641936302186, + "step": 1480 + }, + { + "epoch": 1.192, + "grad_norm": 0.4833432734012604, + "kl": 3.6524147987365723, + "learning_rate": 2.2466666666666666e-06, + "logits/chosen": 44457616.0, + "logits/rejected": 45819459.2, + "logps/chosen": -174.73040771484375, + "logps/rejected": -167.15947265625, + "loss": 0.4712826728820801, + "rewards/chosen": 0.22984566688537597, + "rewards/margins": 0.2501527413725853, + "rewards/rejected": -0.02030707448720932, + "step": 1490 + }, + { + "epoch": 1.2, + "grad_norm": 0.5918501019477844, + "kl": 3.85776948928833, + "learning_rate": 2.2244444444444447e-06, + "logits/chosen": 35046502.4, + "logits/rejected": 33714608.0, + "logps/chosen": -171.797265625, + "logps/rejected": -147.64517822265626, + "loss": 0.4212610721588135, + "rewards/chosen": 0.4938655376434326, + "rewards/margins": 0.7197992086410523, + "rewards/rejected": -0.22593367099761963, + "step": 1500 + }, + { + "epoch": 1.2, + "eval_kl": 3.5718271732330322, + "eval_logits/chosen": 36649410.56, + "eval_logits/rejected": 36563263.488, + "eval_logps/chosen": -152.817171875, + "eval_logps/rejected": -148.086484375, + "eval_loss": 0.4883860945701599, + "eval_rewards/chosen": 0.15086082458496095, + "eval_rewards/margins": 0.09523786926269531, + "eval_rewards/rejected": 0.055622955322265626, + "eval_runtime": 217.5749, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 1500 + }, + { + "epoch": 1.208, + "grad_norm": 0.5604835748672485, + "kl": 2.5207982063293457, + "learning_rate": 2.2022222222222227e-06, + "logits/chosen": 28183353.6, + "logits/rejected": 26171800.0, + "logps/chosen": -127.34248046875, + "logps/rejected": -135.6572265625, + "loss": 0.44087018966674807, + "rewards/chosen": 0.20389485359191895, + "rewards/margins": 0.5874947786331177, + "rewards/rejected": -0.3835999250411987, + "step": 1510 + }, + { + "epoch": 1.216, + "grad_norm": 0.6260045170783997, + "kl": 4.300113677978516, + "learning_rate": 2.1800000000000003e-06, + "logits/chosen": 37477926.4, + "logits/rejected": 37180899.2, + "logps/chosen": -153.624072265625, + "logps/rejected": -153.7638916015625, + "loss": 0.4450747013092041, + "rewards/chosen": 0.40005855560302733, + "rewards/margins": 0.5518900513648987, + "rewards/rejected": -0.15183149576187133, + "step": 1520 + }, + { + "epoch": 1.224, + "grad_norm": 0.5863284468650818, + "kl": 3.6325111389160156, + "learning_rate": 2.157777777777778e-06, + "logits/chosen": 31546803.2, + "logits/rejected": 27054540.8, + "logps/chosen": -164.37142333984374, + "logps/rejected": -141.66112060546874, + "loss": 0.4467916488647461, + "rewards/chosen": 0.37994205951690674, + "rewards/margins": 0.499565863609314, + "rewards/rejected": -0.11962380409240722, + "step": 1530 + }, + { + "epoch": 1.232, + "grad_norm": 0.42624586820602417, + "kl": 4.787026882171631, + "learning_rate": 2.1355555555555555e-06, + "logits/chosen": 29693196.8, + "logits/rejected": 28872902.4, + "logps/chosen": -140.95440673828125, + "logps/rejected": -148.26785888671876, + "loss": 0.43952031135559083, + "rewards/chosen": 0.574599027633667, + "rewards/margins": 0.5613519787788391, + "rewards/rejected": 0.013247048854827881, + "step": 1540 + }, + { + "epoch": 1.24, + "grad_norm": 0.5493943095207214, + "kl": 3.9344754219055176, + "learning_rate": 2.1133333333333336e-06, + "logits/chosen": 40305772.8, + "logits/rejected": 40673481.6, + "logps/chosen": -165.5770751953125, + "logps/rejected": -181.73116455078124, + "loss": 0.4579936981201172, + "rewards/chosen": 0.33573935031890867, + "rewards/margins": 0.4691450238227844, + "rewards/rejected": -0.13340567350387572, + "step": 1550 + }, + { + "epoch": 1.248, + "grad_norm": 0.46218565106391907, + "kl": 4.996054172515869, + "learning_rate": 2.091111111111111e-06, + "logits/chosen": 34034489.6, + "logits/rejected": 34783824.0, + "logps/chosen": -142.5915283203125, + "logps/rejected": -154.66793212890624, + "loss": 0.4685988426208496, + "rewards/chosen": 0.42725467681884766, + "rewards/margins": 0.2892191410064697, + "rewards/rejected": 0.13803553581237793, + "step": 1560 + }, + { + "epoch": 1.256, + "grad_norm": 0.6164036989212036, + "kl": 3.2075297832489014, + "learning_rate": 2.0688888888888892e-06, + "logits/chosen": 37389939.2, + "logits/rejected": 35211542.4, + "logps/chosen": -157.27457275390626, + "logps/rejected": -135.26968994140626, + "loss": 0.4538430690765381, + "rewards/chosen": 0.30452628135681153, + "rewards/margins": 0.4211171746253968, + "rewards/rejected": -0.11659089326858521, + "step": 1570 + }, + { + "epoch": 1.264, + "grad_norm": 0.4599875807762146, + "kl": 4.0805768966674805, + "learning_rate": 2.046666666666667e-06, + "logits/chosen": 47580665.6, + "logits/rejected": 44842764.8, + "logps/chosen": -147.82244873046875, + "logps/rejected": -144.87330322265626, + "loss": 0.4251837253570557, + "rewards/chosen": 0.6019775867462158, + "rewards/margins": 0.6677440404891968, + "rewards/rejected": -0.06576645374298096, + "step": 1580 + }, + { + "epoch": 1.272, + "grad_norm": 0.5166321992874146, + "kl": 3.547309160232544, + "learning_rate": 2.024444444444445e-06, + "logits/chosen": 39700006.4, + "logits/rejected": 39951171.2, + "logps/chosen": -131.4615478515625, + "logps/rejected": -144.446240234375, + "loss": 0.4551235675811768, + "rewards/chosen": 0.3439887046813965, + "rewards/margins": 0.35865890979766846, + "rewards/rejected": -0.014670205116271973, + "step": 1590 + }, + { + "epoch": 1.28, + "grad_norm": 0.515184760093689, + "kl": 5.392228126525879, + "learning_rate": 2.0022222222222225e-06, + "logits/chosen": 37336246.4, + "logits/rejected": 35750588.8, + "logps/chosen": -129.5730224609375, + "logps/rejected": -167.88192138671874, + "loss": 0.46866717338562014, + "rewards/chosen": 0.5722126007080078, + "rewards/margins": 0.2670687913894654, + "rewards/rejected": 0.30514380931854246, + "step": 1600 + }, + { + "epoch": 1.28, + "eval_kl": 4.125787258148193, + "eval_logits/chosen": 36980781.056, + "eval_logits/rejected": 36865613.824, + "eval_logps/chosen": -152.2105625, + "eval_logps/rejected": -147.532890625, + "eval_loss": 0.4878697693347931, + "eval_rewards/chosen": 0.21152139282226562, + "eval_rewards/margins": 0.1005374526977539, + "eval_rewards/rejected": 0.11098394012451172, + "eval_runtime": 217.1057, + "eval_samples_per_second": 4.606, + "eval_steps_per_second": 2.303, + "step": 1600 + }, + { + "epoch": 1.288, + "grad_norm": 0.33879461884498596, + "kl": 3.9892711639404297, + "learning_rate": 1.98e-06, + "logits/chosen": 38520425.6, + "logits/rejected": 37891657.6, + "logps/chosen": -144.33753662109376, + "logps/rejected": -113.0916748046875, + "loss": 0.4574281215667725, + "rewards/chosen": 0.3910325288772583, + "rewards/margins": 0.4121716648340225, + "rewards/rejected": -0.02113913595676422, + "step": 1610 + }, + { + "epoch": 1.296, + "grad_norm": 0.4744361937046051, + "kl": 4.163745403289795, + "learning_rate": 1.9577777777777777e-06, + "logits/chosen": 37167372.8, + "logits/rejected": 37068976.0, + "logps/chosen": -144.5396484375, + "logps/rejected": -151.2422119140625, + "loss": 0.4770793914794922, + "rewards/chosen": 0.24615283012390138, + "rewards/margins": 0.2315782740712166, + "rewards/rejected": 0.014574556052684784, + "step": 1620 + }, + { + "epoch": 1.304, + "grad_norm": 0.6805797815322876, + "kl": 4.433796405792236, + "learning_rate": 1.9355555555555558e-06, + "logits/chosen": 33666192.0, + "logits/rejected": 33794051.2, + "logps/chosen": -152.936279296875, + "logps/rejected": -167.15966796875, + "loss": 0.4641437530517578, + "rewards/chosen": 0.3409790754318237, + "rewards/margins": 0.22917660474777218, + "rewards/rejected": 0.11180247068405151, + "step": 1630 + }, + { + "epoch": 1.312, + "grad_norm": 0.4908677637577057, + "kl": 4.329981803894043, + "learning_rate": 1.9133333333333334e-06, + "logits/chosen": 28728278.4, + "logits/rejected": 29006166.4, + "logps/chosen": -171.674267578125, + "logps/rejected": -151.65748291015626, + "loss": 0.4317901611328125, + "rewards/chosen": 0.4740726947784424, + "rewards/margins": 0.6063881039619445, + "rewards/rejected": -0.1323154091835022, + "step": 1640 + }, + { + "epoch": 1.32, + "grad_norm": 0.5628176927566528, + "kl": 3.917667865753174, + "learning_rate": 1.8911111111111114e-06, + "logits/chosen": 41002073.6, + "logits/rejected": 38901564.8, + "logps/chosen": -146.1817138671875, + "logps/rejected": -137.590380859375, + "loss": 0.44250779151916503, + "rewards/chosen": 0.36355061531066896, + "rewards/margins": 0.5350786447525024, + "rewards/rejected": -0.17152802944183348, + "step": 1650 + }, + { + "epoch": 1.328, + "grad_norm": 0.4226330816745758, + "kl": 4.557765007019043, + "learning_rate": 1.868888888888889e-06, + "logits/chosen": 41738940.8, + "logits/rejected": 39382457.6, + "logps/chosen": -144.0579833984375, + "logps/rejected": -149.93160400390624, + "loss": 0.4471259117126465, + "rewards/chosen": 0.4530649662017822, + "rewards/margins": 0.5140628039836883, + "rewards/rejected": -0.06099783778190613, + "step": 1660 + }, + { + "epoch": 1.336, + "grad_norm": 0.726682186126709, + "kl": 2.606447219848633, + "learning_rate": 1.8466666666666668e-06, + "logits/chosen": 28843798.4, + "logits/rejected": 28256953.6, + "logps/chosen": -162.8355224609375, + "logps/rejected": -135.0196044921875, + "loss": 0.4602541923522949, + "rewards/chosen": 0.034113740921020506, + "rewards/margins": 0.3491029262542724, + "rewards/rejected": -0.31498918533325193, + "step": 1670 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.4371758997440338, + "kl": 3.8837997913360596, + "learning_rate": 1.8244444444444445e-06, + "logits/chosen": 30105264.0, + "logits/rejected": 30200156.8, + "logps/chosen": -169.60362548828124, + "logps/rejected": -121.60936279296875, + "loss": 0.4653130054473877, + "rewards/chosen": 0.13300797939300538, + "rewards/margins": 0.2890047550201416, + "rewards/rejected": -0.15599677562713624, + "step": 1680 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.7084026336669922, + "kl": 3.9908013343811035, + "learning_rate": 1.8022222222222225e-06, + "logits/chosen": 40353881.6, + "logits/rejected": 41124192.0, + "logps/chosen": -145.57255859375, + "logps/rejected": -169.683984375, + "loss": 0.46573567390441895, + "rewards/chosen": 0.37704455852508545, + "rewards/margins": 0.30847471952438354, + "rewards/rejected": 0.0685698390007019, + "step": 1690 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.560882031917572, + "kl": 3.73456072807312, + "learning_rate": 1.7800000000000001e-06, + "logits/chosen": 31647622.4, + "logits/rejected": 32944563.2, + "logps/chosen": -96.58458251953125, + "logps/rejected": -156.17933349609376, + "loss": 0.47052454948425293, + "rewards/chosen": 0.26383423805236816, + "rewards/margins": 0.31988897919654846, + "rewards/rejected": -0.0560547411441803, + "step": 1700 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 4.715727806091309, + "eval_logits/chosen": 37336121.344, + "eval_logits/rejected": 37191081.984, + "eval_logps/chosen": -151.06928125, + "eval_logps/rejected": -146.466265625, + "eval_loss": 0.4871442914009094, + "eval_rewards/chosen": 0.3256500549316406, + "eval_rewards/margins": 0.1080040283203125, + "eval_rewards/rejected": 0.21764602661132812, + "eval_runtime": 217.8394, + "eval_samples_per_second": 4.591, + "eval_steps_per_second": 2.295, + "step": 1700 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.5111773014068604, + "kl": 5.973706245422363, + "learning_rate": 1.757777777777778e-06, + "logits/chosen": 43906630.4, + "logits/rejected": 41141516.8, + "logps/chosen": -158.88671875, + "logps/rejected": -179.5316650390625, + "loss": 0.4583888530731201, + "rewards/chosen": 0.5715279579162598, + "rewards/margins": 0.4317225098609924, + "rewards/rejected": 0.13980544805526735, + "step": 1710 + }, + { + "epoch": 1.376, + "grad_norm": 0.4663240611553192, + "kl": 4.6347246170043945, + "learning_rate": 1.7355555555555555e-06, + "logits/chosen": 49689798.4, + "logits/rejected": 46981590.4, + "logps/chosen": -171.759765625, + "logps/rejected": -179.522802734375, + "loss": 0.47397675514221194, + "rewards/chosen": 0.18588199615478515, + "rewards/margins": 0.39475393295288086, + "rewards/rejected": -0.2088719367980957, + "step": 1720 + }, + { + "epoch": 1.384, + "grad_norm": 0.5763538479804993, + "kl": 4.0374932289123535, + "learning_rate": 1.7133333333333336e-06, + "logits/chosen": 41851731.2, + "logits/rejected": 41044272.0, + "logps/chosen": -149.54090576171876, + "logps/rejected": -163.9952880859375, + "loss": 0.4446412563323975, + "rewards/chosen": 0.29855611324310305, + "rewards/margins": 0.5403349876403809, + "rewards/rejected": -0.24177887439727783, + "step": 1730 + }, + { + "epoch": 1.392, + "grad_norm": 0.40721967816352844, + "kl": 4.595529556274414, + "learning_rate": 1.6911111111111112e-06, + "logits/chosen": 27241497.6, + "logits/rejected": 25061027.2, + "logps/chosen": -144.4770751953125, + "logps/rejected": -148.99678955078124, + "loss": 0.441908073425293, + "rewards/chosen": 0.47647829055786134, + "rewards/margins": 0.662132203578949, + "rewards/rejected": -0.18565391302108764, + "step": 1740 + }, + { + "epoch": 1.4, + "grad_norm": 0.5112435221672058, + "kl": 6.786820411682129, + "learning_rate": 1.668888888888889e-06, + "logits/chosen": 43438329.6, + "logits/rejected": 42274822.4, + "logps/chosen": -173.9513916015625, + "logps/rejected": -128.84437255859376, + "loss": 0.44019775390625, + "rewards/chosen": 0.8855677604675293, + "rewards/margins": 0.5187113761901856, + "rewards/rejected": 0.36685638427734374, + "step": 1750 + }, + { + "epoch": 1.408, + "grad_norm": 0.6391093134880066, + "kl": 3.4943454265594482, + "learning_rate": 1.6466666666666666e-06, + "logits/chosen": 46858697.6, + "logits/rejected": 47063660.8, + "logps/chosen": -143.279248046875, + "logps/rejected": -161.62069091796874, + "loss": 0.4441429615020752, + "rewards/chosen": 0.3917685985565186, + "rewards/margins": 0.5121995925903321, + "rewards/rejected": -0.12043099403381348, + "step": 1760 + }, + { + "epoch": 1.416, + "grad_norm": 0.5220089554786682, + "kl": 5.633955955505371, + "learning_rate": 1.6244444444444447e-06, + "logits/chosen": 45115747.2, + "logits/rejected": 43860156.8, + "logps/chosen": -153.03194580078124, + "logps/rejected": -162.67841796875, + "loss": 0.4590646743774414, + "rewards/chosen": 0.6448601245880127, + "rewards/margins": 0.43718719482421875, + "rewards/rejected": 0.20767292976379395, + "step": 1770 + }, + { + "epoch": 1.424, + "grad_norm": 0.5118013024330139, + "kl": 4.387326240539551, + "learning_rate": 1.6022222222222223e-06, + "logits/chosen": 40869132.8, + "logits/rejected": 39574160.0, + "logps/chosen": -138.70467529296874, + "logps/rejected": -177.04256591796874, + "loss": 0.4694656848907471, + "rewards/chosen": 0.32769317626953126, + "rewards/margins": 0.32016055583953856, + "rewards/rejected": 0.0075326204299926754, + "step": 1780 + }, + { + "epoch": 1.432, + "grad_norm": 0.7699334025382996, + "kl": 5.964260578155518, + "learning_rate": 1.5800000000000001e-06, + "logits/chosen": 33789193.6, + "logits/rejected": 32414848.0, + "logps/chosen": -144.8453857421875, + "logps/rejected": -156.7958740234375, + "loss": 0.43700370788574217, + "rewards/chosen": 0.7253459453582763, + "rewards/margins": 0.5568214774131774, + "rewards/rejected": 0.16852446794509887, + "step": 1790 + }, + { + "epoch": 1.44, + "grad_norm": 0.569644570350647, + "kl": 4.964392185211182, + "learning_rate": 1.5577777777777777e-06, + "logits/chosen": 41654611.2, + "logits/rejected": 42416057.6, + "logps/chosen": -150.742919921875, + "logps/rejected": -167.503076171875, + "loss": 0.46422877311706545, + "rewards/chosen": 0.4968874931335449, + "rewards/margins": 0.40884148478508, + "rewards/rejected": 0.08804600834846496, + "step": 1800 + }, + { + "epoch": 1.44, + "eval_kl": 4.746038436889648, + "eval_logits/chosen": 38021050.368, + "eval_logits/rejected": 37875998.72, + "eval_logps/chosen": -150.81178125, + "eval_logps/rejected": -146.26340625, + "eval_loss": 0.48678913712501526, + "eval_rewards/chosen": 0.3514009094238281, + "eval_rewards/margins": 0.11346868896484374, + "eval_rewards/rejected": 0.23793222045898438, + "eval_runtime": 217.6136, + "eval_samples_per_second": 4.595, + "eval_steps_per_second": 2.298, + "step": 1800 + }, + { + "epoch": 1.448, + "grad_norm": 0.5262874960899353, + "kl": 4.751598358154297, + "learning_rate": 1.5355555555555558e-06, + "logits/chosen": 34054140.8, + "logits/rejected": 33053715.2, + "logps/chosen": -159.1825439453125, + "logps/rejected": -150.52947998046875, + "loss": 0.45218782424926757, + "rewards/chosen": 0.4885563850402832, + "rewards/margins": 0.42950677275657656, + "rewards/rejected": 0.059049612283706664, + "step": 1810 + }, + { + "epoch": 1.456, + "grad_norm": 0.5098588466644287, + "kl": 4.014686584472656, + "learning_rate": 1.5133333333333334e-06, + "logits/chosen": 40354809.6, + "logits/rejected": 39480486.4, + "logps/chosen": -145.28863525390625, + "logps/rejected": -135.333837890625, + "loss": 0.43676166534423827, + "rewards/chosen": 0.4947031021118164, + "rewards/margins": 0.6355077743530273, + "rewards/rejected": -0.14080467224121093, + "step": 1820 + }, + { + "epoch": 1.464, + "grad_norm": 0.448231041431427, + "kl": 5.823625564575195, + "learning_rate": 1.4911111111111112e-06, + "logits/chosen": 47668928.0, + "logits/rejected": 45680892.8, + "logps/chosen": -143.58907470703124, + "logps/rejected": -157.80914306640625, + "loss": 0.44497880935668943, + "rewards/chosen": 0.631040382385254, + "rewards/margins": 0.497272527217865, + "rewards/rejected": 0.13376785516738893, + "step": 1830 + }, + { + "epoch": 1.472, + "grad_norm": 0.5563249588012695, + "kl": 5.55691385269165, + "learning_rate": 1.468888888888889e-06, + "logits/chosen": 30804796.8, + "logits/rejected": 30690835.2, + "logps/chosen": -124.4900146484375, + "logps/rejected": -133.585205078125, + "loss": 0.444712495803833, + "rewards/chosen": 0.6877860069274903, + "rewards/margins": 0.4855673313140869, + "rewards/rejected": 0.20221867561340331, + "step": 1840 + }, + { + "epoch": 1.48, + "grad_norm": 0.6280549168586731, + "kl": 4.7264180183410645, + "learning_rate": 1.4466666666666669e-06, + "logits/chosen": 29776838.4, + "logits/rejected": 31885555.2, + "logps/chosen": -145.07203369140626, + "logps/rejected": -149.7443115234375, + "loss": 0.4761053562164307, + "rewards/chosen": 0.3966336488723755, + "rewards/margins": 0.2293717384338379, + "rewards/rejected": 0.1672619104385376, + "step": 1850 + }, + { + "epoch": 1.488, + "grad_norm": 0.5127621293067932, + "kl": 6.063115119934082, + "learning_rate": 1.4244444444444447e-06, + "logits/chosen": 46517398.4, + "logits/rejected": 43311113.6, + "logps/chosen": -189.47987060546876, + "logps/rejected": -170.6665283203125, + "loss": 0.42708525657653806, + "rewards/chosen": 0.7080463409423828, + "rewards/margins": 0.7631270289421082, + "rewards/rejected": -0.05508068799972534, + "step": 1860 + }, + { + "epoch": 1.496, + "grad_norm": 0.5822389125823975, + "kl": 4.272950649261475, + "learning_rate": 1.4022222222222223e-06, + "logits/chosen": 25695438.4, + "logits/rejected": 24746280.0, + "logps/chosen": -137.86600341796876, + "logps/rejected": -137.41337890625, + "loss": 0.4546250343322754, + "rewards/chosen": 0.42215428352355955, + "rewards/margins": 0.4513491034507751, + "rewards/rejected": -0.029194819927215575, + "step": 1870 + }, + { + "epoch": 1.504, + "grad_norm": 0.5575308799743652, + "kl": 5.757713794708252, + "learning_rate": 1.3800000000000001e-06, + "logits/chosen": 39392422.4, + "logits/rejected": 40771721.6, + "logps/chosen": -136.75103759765625, + "logps/rejected": -172.27413330078124, + "loss": 0.464780330657959, + "rewards/chosen": 0.6209693908691406, + "rewards/margins": 0.2982433319091797, + "rewards/rejected": 0.3227260589599609, + "step": 1880 + }, + { + "epoch": 1.512, + "grad_norm": 0.7086930274963379, + "kl": 5.1618475914001465, + "learning_rate": 1.357777777777778e-06, + "logits/chosen": 40175395.2, + "logits/rejected": 39745542.4, + "logps/chosen": -187.0331298828125, + "logps/rejected": -151.17127685546876, + "loss": 0.45111641883850095, + "rewards/chosen": 0.4993483543395996, + "rewards/margins": 0.43962204456329346, + "rewards/rejected": 0.05972630977630615, + "step": 1890 + }, + { + "epoch": 1.52, + "grad_norm": 0.5889289379119873, + "kl": 7.153553009033203, + "learning_rate": 1.3355555555555558e-06, + "logits/chosen": 43437193.6, + "logits/rejected": 41387232.0, + "logps/chosen": -122.5697265625, + "logps/rejected": -132.28131103515625, + "loss": 0.4659090042114258, + "rewards/chosen": 0.7089588165283203, + "rewards/margins": 0.31116189956665036, + "rewards/rejected": 0.3977969169616699, + "step": 1900 + }, + { + "epoch": 1.52, + "eval_kl": 4.760587692260742, + "eval_logits/chosen": 38344187.904, + "eval_logits/rejected": 38206853.12, + "eval_logps/chosen": -150.743921875, + "eval_logps/rejected": -146.278265625, + "eval_loss": 0.4858584403991699, + "eval_rewards/chosen": 0.3581858215332031, + "eval_rewards/margins": 0.12174023437499998, + "eval_rewards/rejected": 0.23644558715820313, + "eval_runtime": 218.092, + "eval_samples_per_second": 4.585, + "eval_steps_per_second": 2.293, + "step": 1900 + }, + { + "epoch": 1.528, + "grad_norm": 0.4249567687511444, + "kl": 6.2131242752075195, + "learning_rate": 1.3133333333333334e-06, + "logits/chosen": 57013689.6, + "logits/rejected": 56808352.0, + "logps/chosen": -164.70875244140626, + "logps/rejected": -132.06815185546876, + "loss": 0.4498802661895752, + "rewards/chosen": 0.7213836669921875, + "rewards/margins": 0.4230734348297119, + "rewards/rejected": 0.2983102321624756, + "step": 1910 + }, + { + "epoch": 1.536, + "grad_norm": 0.549889862537384, + "kl": 6.712057590484619, + "learning_rate": 1.2911111111111112e-06, + "logits/chosen": 42846454.4, + "logits/rejected": 42999248.0, + "logps/chosen": -177.64530029296876, + "logps/rejected": -184.1470458984375, + "loss": 0.43409342765808107, + "rewards/chosen": 0.728582763671875, + "rewards/margins": 0.6158596277236938, + "rewards/rejected": 0.11272313594818115, + "step": 1920 + }, + { + "epoch": 1.544, + "grad_norm": 0.5649115443229675, + "kl": 4.235246658325195, + "learning_rate": 1.268888888888889e-06, + "logits/chosen": 41943692.8, + "logits/rejected": 41250208.0, + "logps/chosen": -142.4334228515625, + "logps/rejected": -127.446923828125, + "loss": 0.4541294574737549, + "rewards/chosen": 0.42052087783813474, + "rewards/margins": 0.38021968901157377, + "rewards/rejected": 0.040301188826560974, + "step": 1930 + }, + { + "epoch": 1.552, + "grad_norm": 0.42543667554855347, + "kl": 5.350770473480225, + "learning_rate": 1.2466666666666667e-06, + "logits/chosen": 43400393.6, + "logits/rejected": 40400710.4, + "logps/chosen": -155.03704833984375, + "logps/rejected": -177.0060791015625, + "loss": 0.46230545043945315, + "rewards/chosen": 0.562045955657959, + "rewards/margins": 0.3403463363647461, + "rewards/rejected": 0.22169961929321289, + "step": 1940 + }, + { + "epoch": 1.56, + "grad_norm": 0.5134297609329224, + "kl": 4.304908752441406, + "learning_rate": 1.2244444444444445e-06, + "logits/chosen": 36589369.6, + "logits/rejected": 34298777.6, + "logps/chosen": -154.47265625, + "logps/rejected": -144.5512939453125, + "loss": 0.4730405330657959, + "rewards/chosen": 0.354435133934021, + "rewards/margins": 0.25211869478225707, + "rewards/rejected": 0.10231643915176392, + "step": 1950 + }, + { + "epoch": 1.568, + "grad_norm": 0.46365997195243835, + "kl": 6.416839599609375, + "learning_rate": 1.2022222222222223e-06, + "logits/chosen": 40177619.2, + "logits/rejected": 39313078.4, + "logps/chosen": -168.4650146484375, + "logps/rejected": -134.8406494140625, + "loss": 0.42948031425476074, + "rewards/chosen": 0.8725629806518554, + "rewards/margins": 0.6291991949081421, + "rewards/rejected": 0.24336378574371337, + "step": 1960 + }, + { + "epoch": 1.576, + "grad_norm": 0.4326087534427643, + "kl": 3.9851531982421875, + "learning_rate": 1.1800000000000001e-06, + "logits/chosen": 32055046.4, + "logits/rejected": 32860064.0, + "logps/chosen": -120.6618408203125, + "logps/rejected": -125.195166015625, + "loss": 0.4756101131439209, + "rewards/chosen": 0.14163222312927246, + "rewards/margins": 0.17141112685203552, + "rewards/rejected": -0.02977890372276306, + "step": 1970 + }, + { + "epoch": 1.584, + "grad_norm": 0.4545738697052002, + "kl": 5.087003231048584, + "learning_rate": 1.1577777777777778e-06, + "logits/chosen": 35810121.6, + "logits/rejected": 32762137.6, + "logps/chosen": -149.88338623046874, + "logps/rejected": -155.4554443359375, + "loss": 0.45157780647277834, + "rewards/chosen": 0.6308297634124755, + "rewards/margins": 0.421593952178955, + "rewards/rejected": 0.2092358112335205, + "step": 1980 + }, + { + "epoch": 1.592, + "grad_norm": 0.5136933326721191, + "kl": 6.449606418609619, + "learning_rate": 1.1355555555555558e-06, + "logits/chosen": 35638320.0, + "logits/rejected": 33466137.6, + "logps/chosen": -152.856396484375, + "logps/rejected": -183.138525390625, + "loss": 0.4553223133087158, + "rewards/chosen": 0.6646287918090821, + "rewards/margins": 0.44561595916748054, + "rewards/rejected": 0.21901283264160157, + "step": 1990 + }, + { + "epoch": 1.6, + "grad_norm": 0.3154851198196411, + "kl": 5.555708885192871, + "learning_rate": 1.1133333333333334e-06, + "logits/chosen": 39483830.4, + "logits/rejected": 38031814.4, + "logps/chosen": -155.9849609375, + "logps/rejected": -183.7311279296875, + "loss": 0.4708412647247314, + "rewards/chosen": 0.4511248588562012, + "rewards/margins": 0.33574488162994387, + "rewards/rejected": 0.11537997722625733, + "step": 2000 + }, + { + "epoch": 1.6, + "eval_kl": 4.071971416473389, + "eval_logits/chosen": 37006954.496, + "eval_logits/rejected": 36902076.416, + "eval_logps/chosen": -152.112921875, + "eval_logps/rejected": -147.68203125, + "eval_loss": 0.48551830649375916, + "eval_rewards/chosen": 0.22128521728515624, + "eval_rewards/margins": 0.12521478271484374, + "eval_rewards/rejected": 0.0960704345703125, + "eval_runtime": 218.7826, + "eval_samples_per_second": 4.571, + "eval_steps_per_second": 2.285, + "step": 2000 + }, + { + "epoch": 1.608, + "grad_norm": 0.5318990349769592, + "kl": 3.604361057281494, + "learning_rate": 1.0911111111111112e-06, + "logits/chosen": 36286432.0, + "logits/rejected": 37285971.2, + "logps/chosen": -113.43314208984376, + "logps/rejected": -124.557763671875, + "loss": 0.464507007598877, + "rewards/chosen": 0.35673577785491944, + "rewards/margins": 0.2908350646495819, + "rewards/rejected": 0.06590071320533752, + "step": 2010 + }, + { + "epoch": 1.616, + "grad_norm": 0.544118344783783, + "kl": 3.670469284057617, + "learning_rate": 1.068888888888889e-06, + "logits/chosen": 45186656.0, + "logits/rejected": 45977584.0, + "logps/chosen": -139.073681640625, + "logps/rejected": -153.00821533203126, + "loss": 0.4433259963989258, + "rewards/chosen": 0.42419872283935545, + "rewards/margins": 0.5132439255714416, + "rewards/rejected": -0.08904520273208619, + "step": 2020 + }, + { + "epoch": 1.624, + "grad_norm": 0.5683560967445374, + "kl": 2.751370906829834, + "learning_rate": 1.0466666666666669e-06, + "logits/chosen": 42249769.6, + "logits/rejected": 43098508.8, + "logps/chosen": -158.5843017578125, + "logps/rejected": -144.552197265625, + "loss": 0.4399724960327148, + "rewards/chosen": 0.42113256454467773, + "rewards/margins": 0.5389934659004212, + "rewards/rejected": -0.11786090135574341, + "step": 2030 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.5233821272850037, + "kl": 2.9688010215759277, + "learning_rate": 1.0244444444444445e-06, + "logits/chosen": 31636636.8, + "logits/rejected": 27467308.8, + "logps/chosen": -143.64234619140626, + "logps/rejected": -139.1869140625, + "loss": 0.442952299118042, + "rewards/chosen": 0.18238863945007325, + "rewards/margins": 0.5509385347366333, + "rewards/rejected": -0.36854989528656007, + "step": 2040 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.6982712745666504, + "kl": 2.9157755374908447, + "learning_rate": 1.0022222222222223e-06, + "logits/chosen": 29222934.4, + "logits/rejected": 28866115.2, + "logps/chosen": -145.965185546875, + "logps/rejected": -134.71815185546876, + "loss": 0.46454343795776365, + "rewards/chosen": 0.13749135732650758, + "rewards/margins": 0.32013021707534794, + "rewards/rejected": -0.18263885974884034, + "step": 2050 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.5256077647209167, + "kl": 5.147567272186279, + "learning_rate": 9.800000000000001e-07, + "logits/chosen": 39800131.2, + "logits/rejected": 40214745.6, + "logps/chosen": -153.8827880859375, + "logps/rejected": -155.6109375, + "loss": 0.44734792709350585, + "rewards/chosen": 0.595530891418457, + "rewards/margins": 0.46844919919967654, + "rewards/rejected": 0.12708169221878052, + "step": 2060 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.5332716703414917, + "kl": 2.908353328704834, + "learning_rate": 9.57777777777778e-07, + "logits/chosen": 25019286.4, + "logits/rejected": 26015075.2, + "logps/chosen": -153.6703857421875, + "logps/rejected": -128.41676025390626, + "loss": 0.47053236961364747, + "rewards/chosen": 0.07076652646064759, + "rewards/margins": 0.24093337655067443, + "rewards/rejected": -0.17016685009002686, + "step": 2070 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.6310443878173828, + "kl": 3.8460822105407715, + "learning_rate": 9.355555555555557e-07, + "logits/chosen": 30489289.6, + "logits/rejected": 29658633.6, + "logps/chosen": -157.56915283203125, + "logps/rejected": -126.709619140625, + "loss": 0.47043805122375487, + "rewards/chosen": 0.23312182426452638, + "rewards/margins": 0.2518186703324318, + "rewards/rejected": -0.018696846067905427, + "step": 2080 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.6265914440155029, + "kl": 2.628873109817505, + "learning_rate": 9.133333333333334e-07, + "logits/chosen": 38132934.4, + "logits/rejected": 37865273.6, + "logps/chosen": -150.04840087890625, + "logps/rejected": -152.45029296875, + "loss": 0.4601998805999756, + "rewards/chosen": 0.1351819634437561, + "rewards/margins": 0.29193094968795774, + "rewards/rejected": -0.15674898624420167, + "step": 2090 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.7189147472381592, + "kl": 4.683299541473389, + "learning_rate": 8.911111111111112e-07, + "logits/chosen": 39250364.8, + "logits/rejected": 37806796.8, + "logps/chosen": -166.75601806640626, + "logps/rejected": -189.86864013671874, + "loss": 0.47463297843933105, + "rewards/chosen": 0.1896621823310852, + "rewards/margins": 0.1576364517211914, + "rewards/rejected": 0.0320257306098938, + "step": 2100 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 3.463610887527466, + "eval_logits/chosen": 35089735.68, + "eval_logits/rejected": 35033989.12, + "eval_logps/chosen": -153.677015625, + "eval_logps/rejected": -149.26678125, + "eval_loss": 0.4853117763996124, + "eval_rewards/chosen": 0.06487516784667968, + "eval_rewards/margins": 0.1272816162109375, + "eval_rewards/rejected": -0.062406448364257815, + "eval_runtime": 217.5278, + "eval_samples_per_second": 4.597, + "eval_steps_per_second": 2.299, + "step": 2100 + }, + { + "epoch": 1.688, + "grad_norm": 0.6290340423583984, + "kl": 4.315595626831055, + "learning_rate": 8.68888888888889e-07, + "logits/chosen": 31404550.4, + "logits/rejected": 29785132.8, + "logps/chosen": -161.92845458984374, + "logps/rejected": -143.69949951171876, + "loss": 0.4567877292633057, + "rewards/chosen": 0.3545663356781006, + "rewards/margins": 0.38114327788352964, + "rewards/rejected": -0.026576942205429076, + "step": 2110 + }, + { + "epoch": 1.696, + "grad_norm": 0.5943707227706909, + "kl": 2.6047091484069824, + "learning_rate": 8.466666666666668e-07, + "logits/chosen": 35495555.2, + "logits/rejected": 33073846.4, + "logps/chosen": -137.365478515625, + "logps/rejected": -147.0032958984375, + "loss": 0.4664917469024658, + "rewards/chosen": -0.16478813886642457, + "rewards/margins": 0.23700910806655884, + "rewards/rejected": -0.4017972469329834, + "step": 2120 + }, + { + "epoch": 1.704, + "grad_norm": 0.528068482875824, + "kl": 3.3957126140594482, + "learning_rate": 8.244444444444445e-07, + "logits/chosen": 31661328.0, + "logits/rejected": 30206838.4, + "logps/chosen": -162.5927978515625, + "logps/rejected": -134.8358642578125, + "loss": 0.46093249320983887, + "rewards/chosen": 0.17535465955734253, + "rewards/margins": 0.3643703818321228, + "rewards/rejected": -0.18901572227478028, + "step": 2130 + }, + { + "epoch": 1.712, + "grad_norm": 0.46279260516166687, + "kl": 2.3419876098632812, + "learning_rate": 8.022222222222223e-07, + "logits/chosen": 42605856.0, + "logits/rejected": 40943014.4, + "logps/chosen": -132.12677001953125, + "logps/rejected": -135.65135498046874, + "loss": 0.45622806549072265, + "rewards/chosen": 0.04011918306350708, + "rewards/margins": 0.4230758786201477, + "rewards/rejected": -0.38295669555664064, + "step": 2140 + }, + { + "epoch": 1.72, + "grad_norm": 0.5701712369918823, + "kl": 4.020439147949219, + "learning_rate": 7.8e-07, + "logits/chosen": 38577043.2, + "logits/rejected": 39374691.2, + "logps/chosen": -170.36212158203125, + "logps/rejected": -161.6606201171875, + "loss": 0.45922436714172366, + "rewards/chosen": 0.09237505197525024, + "rewards/margins": 0.30426751375198363, + "rewards/rejected": -0.2118924617767334, + "step": 2150 + }, + { + "epoch": 1.728, + "grad_norm": 0.5047979354858398, + "kl": 2.7111480236053467, + "learning_rate": 7.577777777777779e-07, + "logits/chosen": 33372905.6, + "logits/rejected": 33328956.8, + "logps/chosen": -163.45003662109374, + "logps/rejected": -172.47750244140624, + "loss": 0.465222692489624, + "rewards/chosen": 0.034078240394592285, + "rewards/margins": 0.4608752965927124, + "rewards/rejected": -0.42679705619812014, + "step": 2160 + }, + { + "epoch": 1.736, + "grad_norm": 0.43878477811813354, + "kl": 2.7707526683807373, + "learning_rate": 7.355555555555556e-07, + "logits/chosen": 34746457.6, + "logits/rejected": 31707171.2, + "logps/chosen": -131.509716796875, + "logps/rejected": -149.54019775390626, + "loss": 0.46474738121032716, + "rewards/chosen": 0.06415605545043945, + "rewards/margins": 0.39272706508636473, + "rewards/rejected": -0.3285710096359253, + "step": 2170 + }, + { + "epoch": 1.744, + "grad_norm": 0.5701454877853394, + "kl": 2.9775280952453613, + "learning_rate": 7.133333333333334e-07, + "logits/chosen": 31648003.2, + "logits/rejected": 31955971.2, + "logps/chosen": -151.3242919921875, + "logps/rejected": -148.98272705078125, + "loss": 0.4547208309173584, + "rewards/chosen": 0.16725053787231445, + "rewards/margins": 0.45224099159240727, + "rewards/rejected": -0.2849904537200928, + "step": 2180 + }, + { + "epoch": 1.752, + "grad_norm": 0.36173462867736816, + "kl": 3.2897815704345703, + "learning_rate": 6.911111111111111e-07, + "logits/chosen": 28085676.8, + "logits/rejected": 25880251.2, + "logps/chosen": -133.48956298828125, + "logps/rejected": -151.6326904296875, + "loss": 0.45902628898620607, + "rewards/chosen": 0.06616134643554687, + "rewards/margins": 0.43392994403839114, + "rewards/rejected": -0.36776859760284425, + "step": 2190 + }, + { + "epoch": 1.76, + "grad_norm": 0.4635393023490906, + "kl": 4.791772365570068, + "learning_rate": 6.68888888888889e-07, + "logits/chosen": 39854400.0, + "logits/rejected": 43266041.6, + "logps/chosen": -180.304736328125, + "logps/rejected": -158.53660888671874, + "loss": 0.44127936363220216, + "rewards/chosen": 0.3505941152572632, + "rewards/margins": 0.4699846982955933, + "rewards/rejected": -0.11939058303833008, + "step": 2200 + }, + { + "epoch": 1.76, + "eval_kl": 3.170285701751709, + "eval_logits/chosen": 34094372.864, + "eval_logits/rejected": 34072373.248, + "eval_logps/chosen": -154.5265625, + "eval_logps/rejected": -150.1290625, + "eval_loss": 0.48520490527153015, + "eval_rewards/chosen": -0.020077211380004883, + "eval_rewards/margins": 0.12855766105651856, + "eval_rewards/rejected": -0.14863487243652343, + "eval_runtime": 217.5482, + "eval_samples_per_second": 4.597, + "eval_steps_per_second": 2.298, + "step": 2200 + }, + { + "epoch": 1.768, + "grad_norm": 0.6011971831321716, + "kl": 3.542525053024292, + "learning_rate": 6.466666666666667e-07, + "logits/chosen": 36977337.6, + "logits/rejected": 38393292.8, + "logps/chosen": -135.9339599609375, + "logps/rejected": -150.27783203125, + "loss": 0.4888314723968506, + "rewards/chosen": 0.07737842798233033, + "rewards/margins": 0.11252884268760682, + "rewards/rejected": -0.03515041470527649, + "step": 2210 + }, + { + "epoch": 1.776, + "grad_norm": 0.5993504524230957, + "kl": 3.1693031787872314, + "learning_rate": 6.244444444444445e-07, + "logits/chosen": 30479276.8, + "logits/rejected": 29417516.8, + "logps/chosen": -132.84188232421874, + "logps/rejected": -119.2605224609375, + "loss": 0.46071271896362304, + "rewards/chosen": 0.22380545139312744, + "rewards/margins": 0.42163221836090087, + "rewards/rejected": -0.19782676696777343, + "step": 2220 + }, + { + "epoch": 1.784, + "grad_norm": 0.6010851263999939, + "kl": 5.0478620529174805, + "learning_rate": 6.022222222222223e-07, + "logits/chosen": 38324892.8, + "logits/rejected": 37791113.6, + "logps/chosen": -135.21259765625, + "logps/rejected": -157.90794677734374, + "loss": 0.44549560546875, + "rewards/chosen": 0.5348263740539551, + "rewards/margins": 0.5059731423854827, + "rewards/rejected": 0.02885323166847229, + "step": 2230 + }, + { + "epoch": 1.792, + "grad_norm": 0.6057806611061096, + "kl": 2.268434762954712, + "learning_rate": 5.800000000000001e-07, + "logits/chosen": 31744332.8, + "logits/rejected": 30286156.8, + "logps/chosen": -124.35537109375, + "logps/rejected": -136.41668701171875, + "loss": 0.44939751625061036, + "rewards/chosen": 0.14811928272247316, + "rewards/margins": 0.4689765214920044, + "rewards/rejected": -0.32085723876953126, + "step": 2240 + }, + { + "epoch": 1.8, + "grad_norm": 0.4038269817829132, + "kl": 3.5659327507019043, + "learning_rate": 5.577777777777779e-07, + "logits/chosen": 31686905.6, + "logits/rejected": 32978937.6, + "logps/chosen": -156.446826171875, + "logps/rejected": -122.14853515625, + "loss": 0.45820083618164065, + "rewards/chosen": 0.2354212760925293, + "rewards/margins": 0.3473649501800537, + "rewards/rejected": -0.11194367408752441, + "step": 2250 + }, + { + "epoch": 1.808, + "grad_norm": 0.7359764575958252, + "kl": 4.224055290222168, + "learning_rate": 5.355555555555556e-07, + "logits/chosen": 28065552.0, + "logits/rejected": 29463145.6, + "logps/chosen": -135.04927978515624, + "logps/rejected": -187.2267333984375, + "loss": 0.4682769298553467, + "rewards/chosen": 0.16567325592041016, + "rewards/margins": 0.28882311582565307, + "rewards/rejected": -0.12314985990524292, + "step": 2260 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.6170231699943542, + "kl": 3.4734268188476562, + "learning_rate": 5.133333333333334e-07, + "logits/chosen": 28978995.2, + "logits/rejected": 28156457.6, + "logps/chosen": -151.6166015625, + "logps/rejected": -139.56959228515626, + "loss": 0.4609676837921143, + "rewards/chosen": 0.0773462176322937, + "rewards/margins": 0.42135525941848756, + "rewards/rejected": -0.34400904178619385, + "step": 2270 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.5378606915473938, + "kl": 4.157721519470215, + "learning_rate": 4.911111111111112e-07, + "logits/chosen": 35223811.2, + "logits/rejected": 34011692.8, + "logps/chosen": -152.407470703125, + "logps/rejected": -153.23616943359374, + "loss": 0.4477241516113281, + "rewards/chosen": 0.24660811424255372, + "rewards/margins": 0.40975589752197267, + "rewards/rejected": -0.16314778327941895, + "step": 2280 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6521180868148804, + "kl": 3.4054481983184814, + "learning_rate": 4.688888888888889e-07, + "logits/chosen": 27037875.2, + "logits/rejected": 27271260.8, + "logps/chosen": -189.535595703125, + "logps/rejected": -115.60257568359376, + "loss": 0.47844581604003905, + "rewards/chosen": 0.05722663402557373, + "rewards/margins": 0.14552825689315796, + "rewards/rejected": -0.08830162286758422, + "step": 2290 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.6569052338600159, + "kl": 3.141408681869507, + "learning_rate": 4.466666666666667e-07, + "logits/chosen": 29871747.2, + "logits/rejected": 26977977.6, + "logps/chosen": -127.3733154296875, + "logps/rejected": -161.679931640625, + "loss": 0.464168119430542, + "rewards/chosen": 0.05878195762634277, + "rewards/margins": 0.390771484375, + "rewards/rejected": -0.3319895267486572, + "step": 2300 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.0008513927459717, + "eval_logits/chosen": 33053526.016, + "eval_logits/rejected": 33058701.312, + "eval_logps/chosen": -155.151625, + "eval_logps/rejected": -150.761421875, + "eval_loss": 0.4851257801055908, + "eval_rewards/chosen": -0.08258457946777344, + "eval_rewards/margins": 0.12928588867187502, + "eval_rewards/rejected": -0.21187046813964844, + "eval_runtime": 217.6558, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 2.297, + "step": 2300 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.4972571134567261, + "kl": 3.18681001663208, + "learning_rate": 4.2444444444444447e-07, + "logits/chosen": 39937222.4, + "logits/rejected": 36850470.4, + "logps/chosen": -171.2765625, + "logps/rejected": -189.08856201171875, + "loss": 0.4898653507232666, + "rewards/chosen": -0.06906133890151978, + "rewards/margins": 0.13194493055343628, + "rewards/rejected": -0.20100626945495606, + "step": 2310 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.6851227879524231, + "kl": 3.3901615142822266, + "learning_rate": 4.0222222222222224e-07, + "logits/chosen": 32688179.2, + "logits/rejected": 33617596.8, + "logps/chosen": -140.2035888671875, + "logps/rejected": -168.17904052734374, + "loss": 0.4512950420379639, + "rewards/chosen": 0.20932047367095946, + "rewards/margins": 0.5313280582427978, + "rewards/rejected": -0.32200758457183837, + "step": 2320 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.40994900465011597, + "kl": 2.513051748275757, + "learning_rate": 3.8e-07, + "logits/chosen": 36529526.4, + "logits/rejected": 38165760.0, + "logps/chosen": -154.76318359375, + "logps/rejected": -137.9931640625, + "loss": 0.48479623794555665, + "rewards/chosen": -0.07438920736312866, + "rewards/margins": 0.09523203372955323, + "rewards/rejected": -0.1696212410926819, + "step": 2330 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.7032968997955322, + "kl": 2.689875841140747, + "learning_rate": 3.577777777777778e-07, + "logits/chosen": 27584521.6, + "logits/rejected": 27075878.4, + "logps/chosen": -143.9324951171875, + "logps/rejected": -115.68133544921875, + "loss": 0.47793827056884763, + "rewards/chosen": -0.008073312044143677, + "rewards/margins": 0.23053821921348572, + "rewards/rejected": -0.2386115312576294, + "step": 2340 + }, + { + "epoch": 1.88, + "grad_norm": 0.6183348298072815, + "kl": 2.5839390754699707, + "learning_rate": 3.3555555555555556e-07, + "logits/chosen": 33576460.8, + "logits/rejected": 31820140.8, + "logps/chosen": -136.1211669921875, + "logps/rejected": -175.8531005859375, + "loss": 0.4491901397705078, + "rewards/chosen": -0.012712603807449341, + "rewards/margins": 0.5766294658184051, + "rewards/rejected": -0.5893420696258544, + "step": 2350 + }, + { + "epoch": 1.888, + "grad_norm": 0.5491041541099548, + "kl": 2.9139962196350098, + "learning_rate": 3.1333333333333333e-07, + "logits/chosen": 36256649.6, + "logits/rejected": 37576332.8, + "logps/chosen": -116.727734375, + "logps/rejected": -184.8653564453125, + "loss": 0.48710999488830564, + "rewards/chosen": -0.05998457670211792, + "rewards/margins": 0.14093006849288942, + "rewards/rejected": -0.20091464519500732, + "step": 2360 + }, + { + "epoch": 1.896, + "grad_norm": 0.6111120581626892, + "kl": 3.809281587600708, + "learning_rate": 2.9111111111111116e-07, + "logits/chosen": 46606099.2, + "logits/rejected": 45940393.6, + "logps/chosen": -147.1123779296875, + "logps/rejected": -163.34154052734374, + "loss": 0.4475499153137207, + "rewards/chosen": 0.2357264995574951, + "rewards/margins": 0.5424099922180176, + "rewards/rejected": -0.30668349266052247, + "step": 2370 + }, + { + "epoch": 1.904, + "grad_norm": 0.6118114590644836, + "kl": 3.6302967071533203, + "learning_rate": 2.6888888888888893e-07, + "logits/chosen": 36722233.6, + "logits/rejected": 35542681.6, + "logps/chosen": -177.38359375, + "logps/rejected": -147.4840087890625, + "loss": 0.45297937393188475, + "rewards/chosen": 0.19647778272628785, + "rewards/margins": 0.4641849398612976, + "rewards/rejected": -0.26770715713500975, + "step": 2380 + }, + { + "epoch": 1.912, + "grad_norm": 0.4131172001361847, + "kl": 3.6233086585998535, + "learning_rate": 2.466666666666667e-07, + "logits/chosen": 24529001.6, + "logits/rejected": 24586342.4, + "logps/chosen": -102.13927001953125, + "logps/rejected": -141.7262451171875, + "loss": 0.47483372688293457, + "rewards/chosen": 0.25803461074829104, + "rewards/margins": 0.2503116071224213, + "rewards/rejected": 0.007723003625869751, + "step": 2390 + }, + { + "epoch": 1.92, + "grad_norm": 0.5676055550575256, + "kl": 3.1186647415161133, + "learning_rate": 2.2444444444444445e-07, + "logits/chosen": 48083148.8, + "logits/rejected": 45228051.2, + "logps/chosen": -215.27216796875, + "logps/rejected": -166.07578125, + "loss": 0.46457977294921876, + "rewards/chosen": -0.12358083724975585, + "rewards/margins": 0.29419794082641604, + "rewards/rejected": -0.41777877807617186, + "step": 2400 + }, + { + "epoch": 1.92, + "eval_kl": 2.721818685531616, + "eval_logits/chosen": 32252489.728, + "eval_logits/rejected": 32286822.4, + "eval_logps/chosen": -155.964, + "eval_logps/rejected": -151.586875, + "eval_loss": 0.4850628674030304, + "eval_rewards/chosen": -0.16382406616210937, + "eval_rewards/margins": 0.13059149169921874, + "eval_rewards/rejected": -0.2944155578613281, + "eval_runtime": 218.1028, + "eval_samples_per_second": 4.585, + "eval_steps_per_second": 2.292, + "step": 2400 + }, + { + "epoch": 1.928, + "grad_norm": 0.4706978499889374, + "kl": 2.602571725845337, + "learning_rate": 2.0222222222222222e-07, + "logits/chosen": 29532422.4, + "logits/rejected": 27649926.4, + "logps/chosen": -131.91143798828125, + "logps/rejected": -166.29207763671874, + "loss": 0.44156599044799805, + "rewards/chosen": 0.06720277070999145, + "rewards/margins": 0.5914854168891907, + "rewards/rejected": -0.5242826461791992, + "step": 2410 + }, + { + "epoch": 1.936, + "grad_norm": 0.855604887008667, + "kl": 3.3109848499298096, + "learning_rate": 1.8e-07, + "logits/chosen": 31716038.4, + "logits/rejected": 31982652.8, + "logps/chosen": -132.72335205078124, + "logps/rejected": -160.2704833984375, + "loss": 0.46830215454101565, + "rewards/chosen": -0.11390522718429566, + "rewards/margins": 0.19699796438217163, + "rewards/rejected": -0.3109031915664673, + "step": 2420 + }, + { + "epoch": 1.944, + "grad_norm": 0.6688784956932068, + "kl": 4.26909875869751, + "learning_rate": 1.577777777777778e-07, + "logits/chosen": 33701654.4, + "logits/rejected": 31710796.8, + "logps/chosen": -132.8295166015625, + "logps/rejected": -163.1888427734375, + "loss": 0.46475811004638673, + "rewards/chosen": 0.2561303377151489, + "rewards/margins": 0.3983383774757385, + "rewards/rejected": -0.1422080397605896, + "step": 2430 + }, + { + "epoch": 1.952, + "grad_norm": 0.5260452628135681, + "kl": 2.616454839706421, + "learning_rate": 1.3555555555555557e-07, + "logits/chosen": 32118166.4, + "logits/rejected": 27893014.4, + "logps/chosen": -176.9322021484375, + "logps/rejected": -181.63406982421876, + "loss": 0.43079748153686526, + "rewards/chosen": 0.11745535135269165, + "rewards/margins": 0.750469982624054, + "rewards/rejected": -0.6330146312713623, + "step": 2440 + }, + { + "epoch": 1.96, + "grad_norm": 0.5318523645401001, + "kl": 2.737783908843994, + "learning_rate": 1.1333333333333336e-07, + "logits/chosen": 31735584.0, + "logits/rejected": 29941836.8, + "logps/chosen": -139.506494140625, + "logps/rejected": -155.363623046875, + "loss": 0.46228652000427245, + "rewards/chosen": 0.07383685111999512, + "rewards/margins": 0.39642753601074215, + "rewards/rejected": -0.32259068489074705, + "step": 2450 + }, + { + "epoch": 1.968, + "grad_norm": 0.7542982697486877, + "kl": 2.112974166870117, + "learning_rate": 9.111111111111113e-08, + "logits/chosen": 22150737.6, + "logits/rejected": 24037843.2, + "logps/chosen": -134.1532470703125, + "logps/rejected": -141.33638916015624, + "loss": 0.4738462448120117, + "rewards/chosen": -0.3262148857116699, + "rewards/margins": 0.2743866920471192, + "rewards/rejected": -0.6006015777587891, + "step": 2460 + }, + { + "epoch": 1.976, + "grad_norm": 0.6304248571395874, + "kl": 2.5999197959899902, + "learning_rate": 6.888888888888889e-08, + "logits/chosen": 28936220.8, + "logits/rejected": 26130355.2, + "logps/chosen": -180.4091796875, + "logps/rejected": -194.08143310546876, + "loss": 0.4691489219665527, + "rewards/chosen": -0.28999359607696534, + "rewards/margins": 0.35139133930206295, + "rewards/rejected": -0.6413849353790283, + "step": 2470 + }, + { + "epoch": 1.984, + "grad_norm": 0.5210801362991333, + "kl": 3.1966724395751953, + "learning_rate": 4.6666666666666674e-08, + "logits/chosen": 25672940.8, + "logits/rejected": 25634627.2, + "logps/chosen": -146.8095947265625, + "logps/rejected": -157.967236328125, + "loss": 0.45682740211486816, + "rewards/chosen": -0.015616022050380707, + "rewards/margins": 0.3315398350358009, + "rewards/rejected": -0.34715585708618163, + "step": 2480 + }, + { + "epoch": 1.992, + "grad_norm": 0.5444905757904053, + "kl": 2.159494400024414, + "learning_rate": 2.4444444444444447e-08, + "logits/chosen": 26597969.6, + "logits/rejected": 26869990.4, + "logps/chosen": -123.009423828125, + "logps/rejected": -122.718994140625, + "loss": 0.46950640678405764, + "rewards/chosen": -0.14737781286239623, + "rewards/margins": 0.2778432488441467, + "rewards/rejected": -0.42522106170654295, + "step": 2490 + }, + { + "epoch": 2.0, + "grad_norm": 0.6403466463088989, + "kl": 2.2389731407165527, + "learning_rate": 2.2222222222222225e-09, + "logits/chosen": 20057400.0, + "logits/rejected": 20756556.8, + "logps/chosen": -111.44208984375, + "logps/rejected": -131.98221435546876, + "loss": 0.45931458473205566, + "rewards/chosen": 0.09023119211196899, + "rewards/margins": 0.4025987029075623, + "rewards/rejected": -0.3123675107955933, + "step": 2500 + }, + { + "epoch": 2.0, + "eval_kl": 2.718485116958618, + "eval_logits/chosen": 32230203.392, + "eval_logits/rejected": 32266268.672, + "eval_logps/chosen": -155.978375, + "eval_logps/rejected": -151.607109375, + "eval_loss": 0.48499828577041626, + "eval_rewards/chosen": -0.16525991821289063, + "eval_rewards/margins": 0.13117916870117188, + "eval_rewards/rejected": -0.2964390869140625, + "eval_runtime": 218.9257, + "eval_samples_per_second": 4.568, + "eval_steps_per_second": 2.284, + "step": 2500 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_10k/lora/checkpoint-2500/training_args.bin b/v5/KTO/KTO_10k/lora/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..0a05f8383f95df104b573dd06fde1a6093711cd3 --- /dev/null +++ b/v5/KTO/KTO_10k/lora/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:531e42fed31d279deeb217d9e592c58b0a48be16b726c4baaff52e99873e947a +size 5521 diff --git a/v5/KTO/KTO_1k/MKTO_1k/README.md b/v5/KTO/KTO_1k/MKTO_1k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_1k/MKTO_1k/adapter_config.json b/v5/KTO/KTO_1k/MKTO_1k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f8b4eff733eadb76d7879837f7ae2de2f71f0b --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_1k/MKTO_1k/adapter_model.safetensors b/v5/KTO/KTO_1k/MKTO_1k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa4097423723d027fa383e9fe9a1fcaf2bcbbf2c --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:931f374af0255303593352ba5897ec541521f4cc9b7b1065871bda1e4977cf7e +size 180385008 diff --git a/v5/KTO/KTO_1k/MKTO_1k/chat_template.jinja b/v5/KTO/KTO_1k/MKTO_1k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_1k/MKTO_1k/config.json b/v5/KTO/KTO_1k/MKTO_1k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..269c2ffa2c365f594cb5e44218192c94b419a0cb --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/KTO/KTO_1k/MKTO_1k/generation_config.json b/v5/KTO/KTO_1k/MKTO_1k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c9c2224cd391437f7236b3f36305dd39a63ab0a --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.0.0" +} diff --git a/v5/KTO/KTO_1k/MKTO_1k/model.safetensors b/v5/KTO/KTO_1k/MKTO_1k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..92140df5a9d0dd4a59b1ef5b8398f5ce8d852eb5 --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fef5bc25402834dd10aae057635543bbf4773bb5cab786fe64dbd0e860245a4 +size 2471645464 diff --git a/v5/KTO/KTO_1k/MKTO_1k/tokenizer.json b/v5/KTO/KTO_1k/MKTO_1k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_1k/MKTO_1k/tokenizer_config.json b/v5/KTO/KTO_1k/MKTO_1k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_1k/MKTO_1k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_1k/lora/README.md b/v5/KTO/KTO_1k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..15f606baf32b05b7280b5f673744f2b7a7bf968c --- /dev/null +++ b/v5/KTO/KTO_1k/lora/README.md @@ -0,0 +1,67 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- trl +- kto +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/9ceg35pa) + + +This model was trained with KTO, a method introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306). + +### Framework versions + +- TRL: 0.27.2 +- Transformers: 5.0.0 +- Pytorch: 2.8.0+cu128 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite KTO as: + +```bibtex +@article{ethayarajh2024kto, + title = {{KTO: Model Alignment as Prospect Theoretic Optimization}}, + author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela}, + year = 2024, + eprint = {arXiv:2402.01306}, +} +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/README.md b/v5/KTO/KTO_1k/lora/checkpoint-140/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/adapter_config.json b/v5/KTO/KTO_1k/lora/checkpoint-140/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f8b4eff733eadb76d7879837f7ae2de2f71f0b --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/adapter_model.safetensors b/v5/KTO/KTO_1k/lora/checkpoint-140/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fa4097423723d027fa383e9fe9a1fcaf2bcbbf2c --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:931f374af0255303593352ba5897ec541521f4cc9b7b1065871bda1e4977cf7e +size 180385008 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/chat_template.jinja b/v5/KTO/KTO_1k/lora/checkpoint-140/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/optimizer.pt b/v5/KTO/KTO_1k/lora/checkpoint-140/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5d45eb9eaec03c92a52a3689920e2b8b9000531 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0290f650bfbc5c60c5c1f5823ca2ace3fabf4111acc52ebad27e72447ab780dc +size 360902475 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/rng_state.pth b/v5/KTO/KTO_1k/lora/checkpoint-140/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dfc4ab5cd207a974628454f0b08694eb243fe80a --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb6fa1162bdc16736d0c7285916a3c05e1ecb89fc87e670ea5fe3c4e5fb56f48 +size 14645 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/scaler.pt b/v5/KTO/KTO_1k/lora/checkpoint-140/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..773ecf57c02e9caf7e40a912947516fc19eac8f9 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7175e91c3bb2211fef63ff60d6bcb56330dc06ab925f9c1d641d686b2da06845 +size 1383 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/scheduler.pt b/v5/KTO/KTO_1k/lora/checkpoint-140/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..83280841e33aae200cdf54cf61977113e25950bc --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fbf8c91a170c0e61ace8645b7b9ee7fc4f5f8aeaf53a505913e41cc84a10a64 +size 1465 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/tokenizer.json b/v5/KTO/KTO_1k/lora/checkpoint-140/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/tokenizer_config.json b/v5/KTO/KTO_1k/lora/checkpoint-140/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/trainer_state.json b/v5/KTO/KTO_1k/lora/checkpoint-140/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..33244dda949567fce461e5e4a035aa85360c9d1c --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/trainer_state.json @@ -0,0 +1,468 @@ +{ + "best_global_step": 140, + "best_metric": 0.0017091865539550777, + "best_model_checkpoint": "output/lora/checkpoint-140", + "epoch": 1.12, + "eval_steps": 10, + "global_step": 140, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 1.9768778085708618, + "kl": 0.015772342681884766, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": 30694041.6, + "logits/rejected": 32548755.2, + "logps/chosen": -147.53765869140625, + "logps/rejected": -127.52252197265625, + "loss": 0.5001121044158936, + "rewards/chosen": 0.00024262431543320417, + "rewards/margins": -0.0008967638248577714, + "rewards/rejected": 0.0011393881402909755, + "step": 10 + }, + { + "epoch": 0.08, + "eval_kl": 0.027425793930888176, + "eval_logits/chosen": 37136154.624, + "eval_logits/rejected": 37056524.288, + "eval_logps/chosen": -155.775578125, + "eval_logps/rejected": -149.951390625, + "eval_loss": 0.4999745488166809, + "eval_rewards/chosen": 0.0014533588886260986, + "eval_rewards/margins": 0.00020384418964385976, + "eval_rewards/rejected": 0.0012495146989822388, + "eval_runtime": 214.2134, + "eval_samples_per_second": 4.668, + "eval_steps_per_second": 2.334, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 1.7118260860443115, + "kl": 0.03703146055340767, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": 54050995.2, + "logits/rejected": 53405817.6, + "logps/chosen": -139.1049072265625, + "logps/rejected": -150.837353515625, + "loss": 0.5002951145172119, + "rewards/chosen": 0.0013979338109493256, + "rewards/margins": -0.002360067889094353, + "rewards/rejected": 0.0037580017000436784, + "step": 20 + }, + { + "epoch": 0.16, + "eval_kl": 0.025570642203092575, + "eval_logits/chosen": 37107462.144, + "eval_logits/rejected": 37030170.624, + "eval_logps/chosen": -155.797375, + "eval_logps/rejected": -149.971109375, + "eval_loss": 0.5000007748603821, + "eval_rewards/chosen": -0.000728474497795105, + "eval_rewards/margins": -6.261587142944317e-06, + "eval_rewards/rejected": -0.0007222129106521606, + "eval_runtime": 215.2897, + "eval_samples_per_second": 4.645, + "eval_steps_per_second": 2.322, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 1.6561487913131714, + "kl": 0.037951041013002396, + "learning_rate": 4.911111111111112e-06, + "logits/chosen": 35367532.8, + "logits/rejected": 35602668.8, + "logps/chosen": -129.755419921875, + "logps/rejected": -138.6831298828125, + "loss": 0.5000378608703613, + "rewards/chosen": 0.0009906148537993432, + "rewards/margins": -0.000303511694073677, + "rewards/rejected": 0.0012941265478730202, + "step": 30 + }, + { + "epoch": 0.24, + "eval_kl": 0.05881376564502716, + "eval_logits/chosen": 36992643.072, + "eval_logits/rejected": 36911718.4, + "eval_logps/chosen": -155.769390625, + "eval_logps/rejected": -149.94559375, + "eval_loss": 0.49997004866600037, + "eval_rewards/chosen": 0.0020712406635284425, + "eval_rewards/margins": 0.0002397129535675049, + "eval_rewards/rejected": 0.0018315277099609376, + "eval_runtime": 215.487, + "eval_samples_per_second": 4.641, + "eval_steps_per_second": 2.32, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 1.3816574811935425, + "kl": 0.057618238031864166, + "learning_rate": 4.6888888888888895e-06, + "logits/chosen": 43991216.0, + "logits/rejected": 44489225.6, + "logps/chosen": -143.26949462890624, + "logps/rejected": -144.8991943359375, + "loss": 0.4999053955078125, + "rewards/chosen": 0.0019632244482636453, + "rewards/margins": 0.000756874028593302, + "rewards/rejected": 0.0012063504196703433, + "step": 40 + }, + { + "epoch": 0.32, + "eval_kl": 0.026525555178523064, + "eval_logits/chosen": 36892434.432, + "eval_logits/rejected": 36817154.048, + "eval_logps/chosen": -155.863125, + "eval_logps/rejected": -150.033375, + "eval_loss": 0.5000438094139099, + "eval_rewards/chosen": -0.007300267219543457, + "eval_rewards/margins": -0.000350986957550049, + "eval_rewards/rejected": -0.006949280261993408, + "eval_runtime": 215.3307, + "eval_samples_per_second": 4.644, + "eval_steps_per_second": 2.322, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 1.295904517173767, + "kl": 0.06024184077978134, + "learning_rate": 4.4666666666666665e-06, + "logits/chosen": 43765830.4, + "logits/rejected": 45366057.6, + "logps/chosen": -139.9152587890625, + "logps/rejected": -153.5464599609375, + "loss": 0.49985275268554685, + "rewards/chosen": -0.0002844284288585186, + "rewards/margins": 0.0011796096339821815, + "rewards/rejected": -0.0014640380628407, + "step": 50 + }, + { + "epoch": 0.4, + "eval_kl": 0.10677888244390488, + "eval_logits/chosen": 36868976.64, + "eval_logits/rejected": 36787453.952, + "eval_logps/chosen": -155.735203125, + "eval_logps/rejected": -149.917390625, + "eval_loss": 0.49989503622055054, + "eval_rewards/chosen": 0.00549051570892334, + "eval_rewards/margins": 0.0008393177986145018, + "eval_rewards/rejected": 0.004651197910308838, + "eval_runtime": 215.4646, + "eval_samples_per_second": 4.641, + "eval_steps_per_second": 2.321, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 1.1608610153198242, + "kl": 0.14546926319599152, + "learning_rate": 4.244444444444445e-06, + "logits/chosen": 37563235.2, + "logits/rejected": 35019974.4, + "logps/chosen": -104.43680419921876, + "logps/rejected": -112.33837890625, + "loss": 0.49872851371765137, + "rewards/chosen": 0.011330313980579376, + "rewards/margins": 0.010192890465259553, + "rewards/rejected": 0.0011374235153198242, + "step": 60 + }, + { + "epoch": 0.48, + "eval_kl": 0.08179865777492523, + "eval_logits/chosen": 36675063.808, + "eval_logits/rejected": 36601872.384, + "eval_logps/chosen": -155.85546875, + "eval_logps/rejected": -150.03453125, + "eval_loss": 0.4999338984489441, + "eval_rewards/chosen": -0.006535920143127441, + "eval_rewards/margins": 0.0005277481079101563, + "eval_rewards/rejected": -0.007063668251037597, + "eval_runtime": 215.4136, + "eval_samples_per_second": 4.642, + "eval_steps_per_second": 2.321, + "step": 60 + }, + { + "epoch": 0.56, + "grad_norm": 1.5624558925628662, + "kl": 0.12186811119318008, + "learning_rate": 4.022222222222222e-06, + "logits/chosen": 48204668.8, + "logits/rejected": 47227776.0, + "logps/chosen": -164.40889892578124, + "logps/rejected": -174.45804443359376, + "loss": 0.4987171173095703, + "rewards/chosen": -0.029685625433921815, + "rewards/margins": 0.010321748256683347, + "rewards/rejected": -0.04000737369060516, + "step": 70 + }, + { + "epoch": 0.56, + "eval_kl": 0.04982582852244377, + "eval_logits/chosen": 36299350.016, + "eval_logits/rejected": 36241809.408, + "eval_logps/chosen": -156.16525, + "eval_logps/rejected": -150.321625, + "eval_loss": 0.5002162456512451, + "eval_rewards/chosen": -0.03751410675048828, + "eval_rewards/margins": -0.0017404594421386727, + "eval_rewards/rejected": -0.03577364730834961, + "eval_runtime": 215.5371, + "eval_samples_per_second": 4.64, + "eval_steps_per_second": 2.32, + "step": 70 + }, + { + "epoch": 0.64, + "grad_norm": 1.8351110219955444, + "kl": 0.10073833167552948, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": 41128102.4, + "logits/rejected": 41413350.4, + "logps/chosen": -153.54603271484376, + "logps/rejected": -162.3951171875, + "loss": 0.49887776374816895, + "rewards/chosen": -0.02412339448928833, + "rewards/margins": 0.00904244482517242, + "rewards/rejected": -0.03316583931446075, + "step": 80 + }, + { + "epoch": 0.64, + "eval_kl": 0.05352861434221268, + "eval_logits/chosen": 36170702.848, + "eval_logits/rejected": 36119736.32, + "eval_logps/chosen": -156.286734375, + "eval_logps/rejected": -150.43715625, + "eval_loss": 0.5002899765968323, + "eval_rewards/chosen": -0.04966393661499023, + "eval_rewards/margins": -0.0023392181396484357, + "eval_rewards/rejected": -0.047324718475341795, + "eval_runtime": 215.7316, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 2.318, + "step": 80 + }, + { + "epoch": 0.72, + "grad_norm": 1.6580690145492554, + "kl": 0.19566671550273895, + "learning_rate": 3.577777777777778e-06, + "logits/chosen": 44902732.8, + "logits/rejected": 45233840.0, + "logps/chosen": -168.9074462890625, + "logps/rejected": -158.04383544921876, + "loss": 0.4973473072052002, + "rewards/chosen": -0.014560246467590332, + "rewards/margins": 0.021359801292419434, + "rewards/rejected": -0.03592004776000977, + "step": 90 + }, + { + "epoch": 0.72, + "eval_kl": 0.13689810037612915, + "eval_logits/chosen": 36205449.216, + "eval_logits/rejected": 36148674.56, + "eval_logps/chosen": -156.08928125, + "eval_logps/rejected": -150.249953125, + "eval_loss": 0.5001612305641174, + "eval_rewards/chosen": -0.029915948867797852, + "eval_rewards/margins": -0.0013099956512451182, + "eval_rewards/rejected": -0.028605953216552734, + "eval_runtime": 215.6166, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 90 + }, + { + "epoch": 0.8, + "grad_norm": 1.8306394815444946, + "kl": 0.3133309781551361, + "learning_rate": 3.3555555555555557e-06, + "logits/chosen": 28461398.4, + "logits/rejected": 28053212.8, + "logps/chosen": -132.9733154296875, + "logps/rejected": -162.2736572265625, + "loss": 0.4978146553039551, + "rewards/chosen": 0.0008950136601924896, + "rewards/margins": 0.01778259202837944, + "rewards/rejected": -0.01688757836818695, + "step": 100 + }, + { + "epoch": 0.8, + "eval_kl": 0.17236100137233734, + "eval_logits/chosen": 36047056.896, + "eval_logits/rejected": 35990380.544, + "eval_logps/chosen": -156.12503125, + "eval_logps/rejected": -150.2839375, + "eval_loss": 0.5001822710037231, + "eval_rewards/chosen": -0.033492431640625, + "eval_rewards/margins": -0.0014871749877929663, + "eval_rewards/rejected": -0.032005256652832034, + "eval_runtime": 215.521, + "eval_samples_per_second": 4.64, + "eval_steps_per_second": 2.32, + "step": 100 + }, + { + "epoch": 0.88, + "grad_norm": 1.4526598453521729, + "kl": 0.3579270541667938, + "learning_rate": 3.133333333333334e-06, + "logits/chosen": 38711958.4, + "logits/rejected": 38049481.6, + "logps/chosen": -128.5083740234375, + "logps/rejected": -134.143994140625, + "loss": 0.4988682746887207, + "rewards/chosen": 0.013809390366077423, + "rewards/margins": 0.00909285619854927, + "rewards/rejected": 0.0047165341675281525, + "step": 110 + }, + { + "epoch": 0.88, + "eval_kl": 0.16148880124092102, + "eval_logits/chosen": 35890647.04, + "eval_logits/rejected": 35838906.368, + "eval_logps/chosen": -156.25996875, + "eval_logps/rejected": -150.409828125, + "eval_loss": 0.5002931952476501, + "eval_rewards/chosen": -0.046984207153320315, + "eval_rewards/margins": -0.002391498565673833, + "eval_rewards/rejected": -0.04459270858764648, + "eval_runtime": 215.616, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 110 + }, + { + "epoch": 0.96, + "grad_norm": 1.475139856338501, + "kl": 0.5180838108062744, + "learning_rate": 2.9111111111111114e-06, + "logits/chosen": 47944940.8, + "logits/rejected": 47759408.0, + "logps/chosen": -160.9727294921875, + "logps/rejected": -132.9762939453125, + "loss": 0.5010076999664307, + "rewards/chosen": 0.001300615817308426, + "rewards/margins": -0.008088254928588867, + "rewards/rejected": 0.009388870745897292, + "step": 120 + }, + { + "epoch": 0.96, + "eval_kl": 0.22723029553890228, + "eval_logits/chosen": 36017373.184, + "eval_logits/rejected": 35955687.424, + "eval_logps/chosen": -156.066296875, + "eval_logps/rejected": -150.22778125, + "eval_loss": 0.5001496076583862, + "eval_rewards/chosen": -0.027616947174072266, + "eval_rewards/margins": -0.0012296409606933588, + "eval_rewards/rejected": -0.026387306213378908, + "eval_runtime": 215.5436, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 2.32, + "step": 120 + }, + { + "epoch": 1.04, + "grad_norm": 1.4342325925827026, + "kl": 0.511700451374054, + "learning_rate": 2.6888888888888892e-06, + "logits/chosen": 38445190.4, + "logits/rejected": 39013670.4, + "logps/chosen": -158.775146484375, + "logps/rejected": -127.187890625, + "loss": 0.49019808769226075, + "rewards/chosen": 0.06660090684890747, + "rewards/margins": 0.07856814712285996, + "rewards/rejected": -0.011967240273952484, + "step": 130 + }, + { + "epoch": 1.04, + "eval_kl": 0.45817074179649353, + "eval_logits/chosen": 36251000.832, + "eval_logits/rejected": 36169760.768, + "eval_logps/chosen": -155.61546875, + "eval_logps/rejected": -149.795578125, + "eval_loss": 0.49991893768310547, + "eval_rewards/chosen": 0.017465354919433594, + "eval_rewards/margins": 0.0006336040496826185, + "eval_rewards/rejected": 0.016831750869750976, + "eval_runtime": 214.8194, + "eval_samples_per_second": 4.655, + "eval_steps_per_second": 2.328, + "step": 130 + }, + { + "epoch": 1.12, + "grad_norm": 1.4938650131225586, + "kl": 0.7170234322547913, + "learning_rate": 2.466666666666667e-06, + "logits/chosen": 43274342.4, + "logits/rejected": 44117376.0, + "logps/chosen": -137.55440673828124, + "logps/rejected": -140.9193115234375, + "loss": 0.4798906326293945, + "rewards/chosen": 0.1318502902984619, + "rewards/margins": 0.16140162050724027, + "rewards/rejected": -0.02955133020877838, + "step": 140 + }, + { + "epoch": 1.12, + "eval_kl": 0.5524640679359436, + "eval_logits/chosen": 36227063.808, + "eval_logits/rejected": 36138741.76, + "eval_logps/chosen": -155.505078125, + "eval_logps/rejected": -149.6959375, + "eval_loss": 0.4997849762439728, + "eval_rewards/chosen": 0.028504100799560547, + "eval_rewards/margins": 0.0017091865539550777, + "eval_rewards/rejected": 0.02679491424560547, + "eval_runtime": 215.0601, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 2.325, + "step": 140 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-140/training_args.bin b/v5/KTO/KTO_1k/lora/checkpoint-140/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c08165c47630e923463a136aea666c753d71c0a5 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-140/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f0c8776981919c9c02fc9f496de711556353cd990690edb7e08892b3e90ddf +size 5649 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/README.md b/v5/KTO/KTO_1k/lora/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/adapter_config.json b/v5/KTO/KTO_1k/lora/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f8b4eff733eadb76d7879837f7ae2de2f71f0b --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/adapter_model.safetensors b/v5/KTO/KTO_1k/lora/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..baa056f8a857aca44255244c1de90dadb2317b9f --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dda01831d802365570915eff2c3c853f2b7527be22e9feb53a418769694854ca +size 180385008 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/chat_template.jinja b/v5/KTO/KTO_1k/lora/checkpoint-240/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/optimizer.pt b/v5/KTO/KTO_1k/lora/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..002f20cdd5a24c64781a319b3d99501dc9af3783 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82b10639f52759ed07f2a31174c532510459274d8e987fc0b5b81e6a3ddc174 +size 360902475 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/rng_state.pth b/v5/KTO/KTO_1k/lora/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..13e11a54e352d8a7149df1f88c1b023ee9973959 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7affab63b271ed0f59a5b53056fc0a581226a41dcdf2fc2b80b669e7c3cf714 +size 14645 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/scaler.pt b/v5/KTO/KTO_1k/lora/checkpoint-240/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c9041f3b9dc92c4c71cfe27f1badefa3341d514 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:358036b71a9ab45fd32e9d2566e050fa5ce750795c3889b5da2b5cc1df201fc2 +size 1383 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/scheduler.pt b/v5/KTO/KTO_1k/lora/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b988fb4bd0c543f8646151f9de274fc4351f6461 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b795729a3f898bce62577e8a51d80803537f8023d400e5d68c7336616e70a45 +size 1465 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/tokenizer.json b/v5/KTO/KTO_1k/lora/checkpoint-240/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/tokenizer_config.json b/v5/KTO/KTO_1k/lora/checkpoint-240/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/trainer_state.json b/v5/KTO/KTO_1k/lora/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..633f1206fad7aaf7aa4173a3c427f017fcdea879 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/trainer_state.json @@ -0,0 +1,778 @@ +{ + "best_global_step": 140, + "best_metric": 0.0017091865539550777, + "best_model_checkpoint": "output/lora/checkpoint-140", + "epoch": 1.92, + "eval_steps": 10, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 1.9768778085708618, + "kl": 0.015772342681884766, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": 30694041.6, + "logits/rejected": 32548755.2, + "logps/chosen": -147.53765869140625, + "logps/rejected": -127.52252197265625, + "loss": 0.5001121044158936, + "rewards/chosen": 0.00024262431543320417, + "rewards/margins": -0.0008967638248577714, + "rewards/rejected": 0.0011393881402909755, + "step": 10 + }, + { + "epoch": 0.08, + "eval_kl": 0.027425793930888176, + "eval_logits/chosen": 37136154.624, + "eval_logits/rejected": 37056524.288, + "eval_logps/chosen": -155.775578125, + "eval_logps/rejected": -149.951390625, + "eval_loss": 0.4999745488166809, + "eval_rewards/chosen": 0.0014533588886260986, + "eval_rewards/margins": 0.00020384418964385976, + "eval_rewards/rejected": 0.0012495146989822388, + "eval_runtime": 214.2134, + "eval_samples_per_second": 4.668, + "eval_steps_per_second": 2.334, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 1.7118260860443115, + "kl": 0.03703146055340767, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": 54050995.2, + "logits/rejected": 53405817.6, + "logps/chosen": -139.1049072265625, + "logps/rejected": -150.837353515625, + "loss": 0.5002951145172119, + "rewards/chosen": 0.0013979338109493256, + "rewards/margins": -0.002360067889094353, + "rewards/rejected": 0.0037580017000436784, + "step": 20 + }, + { + "epoch": 0.16, + "eval_kl": 0.025570642203092575, + "eval_logits/chosen": 37107462.144, + "eval_logits/rejected": 37030170.624, + "eval_logps/chosen": -155.797375, + "eval_logps/rejected": -149.971109375, + "eval_loss": 0.5000007748603821, + "eval_rewards/chosen": -0.000728474497795105, + "eval_rewards/margins": -6.261587142944317e-06, + "eval_rewards/rejected": -0.0007222129106521606, + "eval_runtime": 215.2897, + "eval_samples_per_second": 4.645, + "eval_steps_per_second": 2.322, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 1.6561487913131714, + "kl": 0.037951041013002396, + "learning_rate": 4.911111111111112e-06, + "logits/chosen": 35367532.8, + "logits/rejected": 35602668.8, + "logps/chosen": -129.755419921875, + "logps/rejected": -138.6831298828125, + "loss": 0.5000378608703613, + "rewards/chosen": 0.0009906148537993432, + "rewards/margins": -0.000303511694073677, + "rewards/rejected": 0.0012941265478730202, + "step": 30 + }, + { + "epoch": 0.24, + "eval_kl": 0.05881376564502716, + "eval_logits/chosen": 36992643.072, + "eval_logits/rejected": 36911718.4, + "eval_logps/chosen": -155.769390625, + "eval_logps/rejected": -149.94559375, + "eval_loss": 0.49997004866600037, + "eval_rewards/chosen": 0.0020712406635284425, + "eval_rewards/margins": 0.0002397129535675049, + "eval_rewards/rejected": 0.0018315277099609376, + "eval_runtime": 215.487, + "eval_samples_per_second": 4.641, + "eval_steps_per_second": 2.32, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 1.3816574811935425, + "kl": 0.057618238031864166, + "learning_rate": 4.6888888888888895e-06, + "logits/chosen": 43991216.0, + "logits/rejected": 44489225.6, + "logps/chosen": -143.26949462890624, + "logps/rejected": -144.8991943359375, + "loss": 0.4999053955078125, + "rewards/chosen": 0.0019632244482636453, + "rewards/margins": 0.000756874028593302, + "rewards/rejected": 0.0012063504196703433, + "step": 40 + }, + { + "epoch": 0.32, + "eval_kl": 0.026525555178523064, + "eval_logits/chosen": 36892434.432, + "eval_logits/rejected": 36817154.048, + "eval_logps/chosen": -155.863125, + "eval_logps/rejected": -150.033375, + "eval_loss": 0.5000438094139099, + "eval_rewards/chosen": -0.007300267219543457, + "eval_rewards/margins": -0.000350986957550049, + "eval_rewards/rejected": -0.006949280261993408, + "eval_runtime": 215.3307, + "eval_samples_per_second": 4.644, + "eval_steps_per_second": 2.322, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 1.295904517173767, + "kl": 0.06024184077978134, + "learning_rate": 4.4666666666666665e-06, + "logits/chosen": 43765830.4, + "logits/rejected": 45366057.6, + "logps/chosen": -139.9152587890625, + "logps/rejected": -153.5464599609375, + "loss": 0.49985275268554685, + "rewards/chosen": -0.0002844284288585186, + "rewards/margins": 0.0011796096339821815, + "rewards/rejected": -0.0014640380628407, + "step": 50 + }, + { + "epoch": 0.4, + "eval_kl": 0.10677888244390488, + "eval_logits/chosen": 36868976.64, + "eval_logits/rejected": 36787453.952, + "eval_logps/chosen": -155.735203125, + "eval_logps/rejected": -149.917390625, + "eval_loss": 0.49989503622055054, + "eval_rewards/chosen": 0.00549051570892334, + "eval_rewards/margins": 0.0008393177986145018, + "eval_rewards/rejected": 0.004651197910308838, + "eval_runtime": 215.4646, + "eval_samples_per_second": 4.641, + "eval_steps_per_second": 2.321, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 1.1608610153198242, + "kl": 0.14546926319599152, + "learning_rate": 4.244444444444445e-06, + "logits/chosen": 37563235.2, + "logits/rejected": 35019974.4, + "logps/chosen": -104.43680419921876, + "logps/rejected": -112.33837890625, + "loss": 0.49872851371765137, + "rewards/chosen": 0.011330313980579376, + "rewards/margins": 0.010192890465259553, + "rewards/rejected": 0.0011374235153198242, + "step": 60 + }, + { + "epoch": 0.48, + "eval_kl": 0.08179865777492523, + "eval_logits/chosen": 36675063.808, + "eval_logits/rejected": 36601872.384, + "eval_logps/chosen": -155.85546875, + "eval_logps/rejected": -150.03453125, + "eval_loss": 0.4999338984489441, + "eval_rewards/chosen": -0.006535920143127441, + "eval_rewards/margins": 0.0005277481079101563, + "eval_rewards/rejected": -0.007063668251037597, + "eval_runtime": 215.4136, + "eval_samples_per_second": 4.642, + "eval_steps_per_second": 2.321, + "step": 60 + }, + { + "epoch": 0.56, + "grad_norm": 1.5624558925628662, + "kl": 0.12186811119318008, + "learning_rate": 4.022222222222222e-06, + "logits/chosen": 48204668.8, + "logits/rejected": 47227776.0, + "logps/chosen": -164.40889892578124, + "logps/rejected": -174.45804443359376, + "loss": 0.4987171173095703, + "rewards/chosen": -0.029685625433921815, + "rewards/margins": 0.010321748256683347, + "rewards/rejected": -0.04000737369060516, + "step": 70 + }, + { + "epoch": 0.56, + "eval_kl": 0.04982582852244377, + "eval_logits/chosen": 36299350.016, + "eval_logits/rejected": 36241809.408, + "eval_logps/chosen": -156.16525, + "eval_logps/rejected": -150.321625, + "eval_loss": 0.5002162456512451, + "eval_rewards/chosen": -0.03751410675048828, + "eval_rewards/margins": -0.0017404594421386727, + "eval_rewards/rejected": -0.03577364730834961, + "eval_runtime": 215.5371, + "eval_samples_per_second": 4.64, + "eval_steps_per_second": 2.32, + "step": 70 + }, + { + "epoch": 0.64, + "grad_norm": 1.8351110219955444, + "kl": 0.10073833167552948, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": 41128102.4, + "logits/rejected": 41413350.4, + "logps/chosen": -153.54603271484376, + "logps/rejected": -162.3951171875, + "loss": 0.49887776374816895, + "rewards/chosen": -0.02412339448928833, + "rewards/margins": 0.00904244482517242, + "rewards/rejected": -0.03316583931446075, + "step": 80 + }, + { + "epoch": 0.64, + "eval_kl": 0.05352861434221268, + "eval_logits/chosen": 36170702.848, + "eval_logits/rejected": 36119736.32, + "eval_logps/chosen": -156.286734375, + "eval_logps/rejected": -150.43715625, + "eval_loss": 0.5002899765968323, + "eval_rewards/chosen": -0.04966393661499023, + "eval_rewards/margins": -0.0023392181396484357, + "eval_rewards/rejected": -0.047324718475341795, + "eval_runtime": 215.7316, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 2.318, + "step": 80 + }, + { + "epoch": 0.72, + "grad_norm": 1.6580690145492554, + "kl": 0.19566671550273895, + "learning_rate": 3.577777777777778e-06, + "logits/chosen": 44902732.8, + "logits/rejected": 45233840.0, + "logps/chosen": -168.9074462890625, + "logps/rejected": -158.04383544921876, + "loss": 0.4973473072052002, + "rewards/chosen": -0.014560246467590332, + "rewards/margins": 0.021359801292419434, + "rewards/rejected": -0.03592004776000977, + "step": 90 + }, + { + "epoch": 0.72, + "eval_kl": 0.13689810037612915, + "eval_logits/chosen": 36205449.216, + "eval_logits/rejected": 36148674.56, + "eval_logps/chosen": -156.08928125, + "eval_logps/rejected": -150.249953125, + "eval_loss": 0.5001612305641174, + "eval_rewards/chosen": -0.029915948867797852, + "eval_rewards/margins": -0.0013099956512451182, + "eval_rewards/rejected": -0.028605953216552734, + "eval_runtime": 215.6166, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 90 + }, + { + "epoch": 0.8, + "grad_norm": 1.8306394815444946, + "kl": 0.3133309781551361, + "learning_rate": 3.3555555555555557e-06, + "logits/chosen": 28461398.4, + "logits/rejected": 28053212.8, + "logps/chosen": -132.9733154296875, + "logps/rejected": -162.2736572265625, + "loss": 0.4978146553039551, + "rewards/chosen": 0.0008950136601924896, + "rewards/margins": 0.01778259202837944, + "rewards/rejected": -0.01688757836818695, + "step": 100 + }, + { + "epoch": 0.8, + "eval_kl": 0.17236100137233734, + "eval_logits/chosen": 36047056.896, + "eval_logits/rejected": 35990380.544, + "eval_logps/chosen": -156.12503125, + "eval_logps/rejected": -150.2839375, + "eval_loss": 0.5001822710037231, + "eval_rewards/chosen": -0.033492431640625, + "eval_rewards/margins": -0.0014871749877929663, + "eval_rewards/rejected": -0.032005256652832034, + "eval_runtime": 215.521, + "eval_samples_per_second": 4.64, + "eval_steps_per_second": 2.32, + "step": 100 + }, + { + "epoch": 0.88, + "grad_norm": 1.4526598453521729, + "kl": 0.3579270541667938, + "learning_rate": 3.133333333333334e-06, + "logits/chosen": 38711958.4, + "logits/rejected": 38049481.6, + "logps/chosen": -128.5083740234375, + "logps/rejected": -134.143994140625, + "loss": 0.4988682746887207, + "rewards/chosen": 0.013809390366077423, + "rewards/margins": 0.00909285619854927, + "rewards/rejected": 0.0047165341675281525, + "step": 110 + }, + { + "epoch": 0.88, + "eval_kl": 0.16148880124092102, + "eval_logits/chosen": 35890647.04, + "eval_logits/rejected": 35838906.368, + "eval_logps/chosen": -156.25996875, + "eval_logps/rejected": -150.409828125, + "eval_loss": 0.5002931952476501, + "eval_rewards/chosen": -0.046984207153320315, + "eval_rewards/margins": -0.002391498565673833, + "eval_rewards/rejected": -0.04459270858764648, + "eval_runtime": 215.616, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 110 + }, + { + "epoch": 0.96, + "grad_norm": 1.475139856338501, + "kl": 0.5180838108062744, + "learning_rate": 2.9111111111111114e-06, + "logits/chosen": 47944940.8, + "logits/rejected": 47759408.0, + "logps/chosen": -160.9727294921875, + "logps/rejected": -132.9762939453125, + "loss": 0.5010076999664307, + "rewards/chosen": 0.001300615817308426, + "rewards/margins": -0.008088254928588867, + "rewards/rejected": 0.009388870745897292, + "step": 120 + }, + { + "epoch": 0.96, + "eval_kl": 0.22723029553890228, + "eval_logits/chosen": 36017373.184, + "eval_logits/rejected": 35955687.424, + "eval_logps/chosen": -156.066296875, + "eval_logps/rejected": -150.22778125, + "eval_loss": 0.5001496076583862, + "eval_rewards/chosen": -0.027616947174072266, + "eval_rewards/margins": -0.0012296409606933588, + "eval_rewards/rejected": -0.026387306213378908, + "eval_runtime": 215.5436, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 2.32, + "step": 120 + }, + { + "epoch": 1.04, + "grad_norm": 1.4342325925827026, + "kl": 0.511700451374054, + "learning_rate": 2.6888888888888892e-06, + "logits/chosen": 38445190.4, + "logits/rejected": 39013670.4, + "logps/chosen": -158.775146484375, + "logps/rejected": -127.187890625, + "loss": 0.49019808769226075, + "rewards/chosen": 0.06660090684890747, + "rewards/margins": 0.07856814712285996, + "rewards/rejected": -0.011967240273952484, + "step": 130 + }, + { + "epoch": 1.04, + "eval_kl": 0.45817074179649353, + "eval_logits/chosen": 36251000.832, + "eval_logits/rejected": 36169760.768, + "eval_logps/chosen": -155.61546875, + "eval_logps/rejected": -149.795578125, + "eval_loss": 0.49991893768310547, + "eval_rewards/chosen": 0.017465354919433594, + "eval_rewards/margins": 0.0006336040496826185, + "eval_rewards/rejected": 0.016831750869750976, + "eval_runtime": 214.8194, + "eval_samples_per_second": 4.655, + "eval_steps_per_second": 2.328, + "step": 130 + }, + { + "epoch": 1.12, + "grad_norm": 1.4938650131225586, + "kl": 0.7170234322547913, + "learning_rate": 2.466666666666667e-06, + "logits/chosen": 43274342.4, + "logits/rejected": 44117376.0, + "logps/chosen": -137.55440673828124, + "logps/rejected": -140.9193115234375, + "loss": 0.4798906326293945, + "rewards/chosen": 0.1318502902984619, + "rewards/margins": 0.16140162050724027, + "rewards/rejected": -0.02955133020877838, + "step": 140 + }, + { + "epoch": 1.12, + "eval_kl": 0.5524640679359436, + "eval_logits/chosen": 36227063.808, + "eval_logits/rejected": 36138741.76, + "eval_logps/chosen": -155.505078125, + "eval_logps/rejected": -149.6959375, + "eval_loss": 0.4997849762439728, + "eval_rewards/chosen": 0.028504100799560547, + "eval_rewards/margins": 0.0017091865539550777, + "eval_rewards/rejected": 0.02679491424560547, + "eval_runtime": 215.0601, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 2.325, + "step": 140 + }, + { + "epoch": 1.2, + "grad_norm": 1.8674030303955078, + "kl": 0.8660959005355835, + "learning_rate": 2.2444444444444445e-06, + "logits/chosen": 41505494.4, + "logits/rejected": 41576633.6, + "logps/chosen": -134.60517578125, + "logps/rejected": -151.21878662109376, + "loss": 0.4738303661346436, + "rewards/chosen": 0.1708309531211853, + "rewards/margins": 0.21031711697578429, + "rewards/rejected": -0.039486163854599, + "step": 150 + }, + { + "epoch": 1.2, + "eval_kl": 0.5169070363044739, + "eval_logits/chosen": 36047007.744, + "eval_logits/rejected": 35964690.432, + "eval_logps/chosen": -155.61309375, + "eval_logps/rejected": -149.796390625, + "eval_loss": 0.4998777508735657, + "eval_rewards/chosen": 0.017700881958007814, + "eval_rewards/margins": 0.0009510231018066417, + "eval_rewards/rejected": 0.016749858856201172, + "eval_runtime": 215.6102, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 150 + }, + { + "epoch": 1.28, + "grad_norm": 1.2205264568328857, + "kl": 0.8254079818725586, + "learning_rate": 2.0222222222222223e-06, + "logits/chosen": 45365196.8, + "logits/rejected": 44416659.2, + "logps/chosen": -146.26240234375, + "logps/rejected": -146.23955078125, + "loss": 0.47802033424377444, + "rewards/chosen": 0.1414048194885254, + "rewards/margins": 0.1766757071018219, + "rewards/rejected": -0.03527088761329651, + "step": 160 + }, + { + "epoch": 1.28, + "eval_kl": 0.46929916739463806, + "eval_logits/chosen": 35877081.088, + "eval_logits/rejected": 35800481.792, + "eval_logps/chosen": -155.73628125, + "eval_logps/rejected": -149.909328125, + "eval_loss": 0.500004231929779, + "eval_rewards/chosen": 0.005384265422821045, + "eval_rewards/margins": -7.338762283325227e-05, + "eval_rewards/rejected": 0.005457653045654297, + "eval_runtime": 215.8067, + "eval_samples_per_second": 4.634, + "eval_steps_per_second": 2.317, + "step": 160 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.7353568077087402, + "kl": 0.7226912379264832, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": 44174476.8, + "logits/rejected": 45570624.0, + "logps/chosen": -141.496435546875, + "logps/rejected": -148.9132080078125, + "loss": 0.4752659797668457, + "rewards/chosen": 0.1527896046638489, + "rewards/margins": 0.19837732315063478, + "rewards/rejected": -0.04558771848678589, + "step": 170 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 0.4444282352924347, + "eval_logits/chosen": 35815936.0, + "eval_logits/rejected": 35742294.016, + "eval_logps/chosen": -155.7953125, + "eval_logps/rejected": -149.969390625, + "eval_loss": 0.499990314245224, + "eval_rewards/chosen": -0.0005206142663955689, + "eval_rewards/margins": 2.925407886505125e-05, + "eval_rewards/rejected": -0.0005498683452606201, + "eval_runtime": 215.3994, + "eval_samples_per_second": 4.643, + "eval_steps_per_second": 2.321, + "step": 170 + }, + { + "epoch": 1.44, + "grad_norm": 1.6643599271774292, + "kl": 0.48263949155807495, + "learning_rate": 1.5777777777777778e-06, + "logits/chosen": 38892889.6, + "logits/rejected": 37222313.6, + "logps/chosen": -119.811962890625, + "logps/rejected": -135.897265625, + "loss": 0.47057647705078126, + "rewards/chosen": 0.10403428077697754, + "rewards/margins": 0.24373894929885864, + "rewards/rejected": -0.1397046685218811, + "step": 180 + }, + { + "epoch": 1.44, + "eval_kl": 0.4670499563217163, + "eval_logits/chosen": 35738546.176, + "eval_logits/rejected": 35665698.816, + "eval_logps/chosen": -155.79365625, + "eval_logps/rejected": -149.968703125, + "eval_loss": 0.49997708201408386, + "eval_rewards/chosen": -0.00035465264320373534, + "eval_rewards/margins": 0.00012645322084426882, + "eval_rewards/rejected": -0.00048110586404800416, + "eval_runtime": 215.7221, + "eval_samples_per_second": 4.636, + "eval_steps_per_second": 2.318, + "step": 180 + }, + { + "epoch": 1.52, + "grad_norm": 1.6640219688415527, + "kl": 0.7541912794113159, + "learning_rate": 1.3555555555555558e-06, + "logits/chosen": 41279955.2, + "logits/rejected": 40511801.6, + "logps/chosen": -122.19615478515625, + "logps/rejected": -145.7674072265625, + "loss": 0.46909322738647463, + "rewards/chosen": 0.12193760871887208, + "rewards/margins": 0.2589147567749024, + "rewards/rejected": -0.13697714805603028, + "step": 190 + }, + { + "epoch": 1.52, + "eval_kl": 0.4091774523258209, + "eval_logits/chosen": 35500843.008, + "eval_logits/rejected": 35437404.16, + "eval_logps/chosen": -155.991421875, + "eval_logps/rejected": -150.15725, + "eval_loss": 0.5000874996185303, + "eval_rewards/chosen": -0.020132492065429688, + "eval_rewards/margins": -0.0007962665557861327, + "eval_rewards/rejected": -0.019336225509643555, + "eval_runtime": 215.236, + "eval_samples_per_second": 4.646, + "eval_steps_per_second": 2.323, + "step": 190 + }, + { + "epoch": 1.6, + "grad_norm": 1.707406759262085, + "kl": 0.5876865983009338, + "learning_rate": 1.1333333333333334e-06, + "logits/chosen": 40565459.2, + "logits/rejected": 41111059.2, + "logps/chosen": -132.433154296875, + "logps/rejected": -156.41756591796874, + "loss": 0.4774333477020264, + "rewards/chosen": 0.058162355422973634, + "rewards/margins": 0.1845989227294922, + "rewards/rejected": -0.12643656730651856, + "step": 200 + }, + { + "epoch": 1.6, + "eval_kl": 0.40361329913139343, + "eval_logits/chosen": 35455897.6, + "eval_logits/rejected": 35396550.656, + "eval_logps/chosen": -156.041859375, + "eval_logps/rejected": -150.2026875, + "eval_loss": 0.5001482963562012, + "eval_rewards/chosen": -0.02517437744140625, + "eval_rewards/margins": -0.0012942276000976576, + "eval_rewards/rejected": -0.023880149841308592, + "eval_runtime": 215.0425, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 2.325, + "step": 200 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.8582910299301147, + "kl": 0.8855921626091003, + "learning_rate": 9.111111111111113e-07, + "logits/chosen": 32328320.0, + "logits/rejected": 31538704.0, + "logps/chosen": -185.6922119140625, + "logps/rejected": -180.88406982421876, + "loss": 0.4729271411895752, + "rewards/chosen": 0.14517008066177367, + "rewards/margins": 0.2178096830844879, + "rewards/rejected": -0.07263960242271424, + "step": 210 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 0.4077625572681427, + "eval_logits/chosen": 35421622.272, + "eval_logits/rejected": 35361173.504, + "eval_logps/chosen": -156.0609375, + "eval_logps/rejected": -150.21765625, + "eval_loss": 0.5001978874206543, + "eval_rewards/chosen": -0.027083938598632812, + "eval_rewards/margins": -0.0017067718505859378, + "eval_rewards/rejected": -0.025377166748046874, + "eval_runtime": 215.0369, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 2.325, + "step": 210 + }, + { + "epoch": 1.76, + "grad_norm": 1.5523241758346558, + "kl": 0.7643419504165649, + "learning_rate": 6.88888888888889e-07, + "logits/chosen": 44855187.2, + "logits/rejected": 43988204.8, + "logps/chosen": -147.70128173828124, + "logps/rejected": -145.88345947265626, + "loss": 0.4781179904937744, + "rewards/chosen": 0.11672601699829102, + "rewards/margins": 0.1772436797618866, + "rewards/rejected": -0.06051766276359558, + "step": 220 + }, + { + "epoch": 1.76, + "eval_kl": 0.42980414628982544, + "eval_logits/chosen": 35399405.568, + "eval_logits/rejected": 35338944.512, + "eval_logps/chosen": -156.039234375, + "eval_logps/rejected": -150.198140625, + "eval_loss": 0.5001705884933472, + "eval_rewards/chosen": -0.024912540435791015, + "eval_rewards/margins": -0.0014874629974365242, + "eval_rewards/rejected": -0.02342507743835449, + "eval_runtime": 214.8075, + "eval_samples_per_second": 4.655, + "eval_steps_per_second": 2.328, + "step": 220 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 1.4052627086639404, + "kl": 0.560142457485199, + "learning_rate": 4.666666666666667e-07, + "logits/chosen": 32892409.6, + "logits/rejected": 31692844.8, + "logps/chosen": -114.99593505859374, + "logps/rejected": -125.1630126953125, + "loss": 0.4770528793334961, + "rewards/chosen": 0.07531413435935974, + "rewards/margins": 0.18859378695487977, + "rewards/rejected": -0.11327965259552002, + "step": 230 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 0.42431339621543884, + "eval_logits/chosen": 35335073.792, + "eval_logits/rejected": 35277471.744, + "eval_logps/chosen": -156.08775, + "eval_logps/rejected": -150.240609375, + "eval_loss": 0.5002422332763672, + "eval_rewards/chosen": -0.02976216125488281, + "eval_rewards/margins": -0.0020921192169189445, + "eval_rewards/rejected": -0.027670042037963867, + "eval_runtime": 214.985, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 2.326, + "step": 230 + }, + { + "epoch": 1.92, + "grad_norm": 1.5062552690505981, + "kl": 1.0432134866714478, + "learning_rate": 2.444444444444445e-07, + "logits/chosen": 40635660.8, + "logits/rejected": 40671884.8, + "logps/chosen": -160.55706787109375, + "logps/rejected": -154.69022216796876, + "loss": 0.48739013671875, + "rewards/chosen": 0.11838672161102295, + "rewards/margins": 0.10130963623523713, + "rewards/rejected": 0.01707708537578583, + "step": 240 + }, + { + "epoch": 1.92, + "eval_kl": 0.41829124093055725, + "eval_logits/chosen": 35317178.368, + "eval_logits/rejected": 35259428.864, + "eval_logps/chosen": -156.105671875, + "eval_logps/rejected": -150.261796875, + "eval_loss": 0.500201940536499, + "eval_rewards/chosen": -0.03155590438842774, + "eval_rewards/margins": -0.0017659969329834012, + "eval_rewards/rejected": -0.029789907455444336, + "eval_runtime": 215.1895, + "eval_samples_per_second": 4.647, + "eval_steps_per_second": 2.324, + "step": 240 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-240/training_args.bin b/v5/KTO/KTO_1k/lora/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c08165c47630e923463a136aea666c753d71c0a5 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f0c8776981919c9c02fc9f496de711556353cd990690edb7e08892b3e90ddf +size 5649 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/README.md b/v5/KTO/KTO_1k/lora/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/adapter_config.json b/v5/KTO/KTO_1k/lora/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f6f8b4eff733eadb76d7879837f7ae2de2f71f0b --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "q_proj", + "o_proj", + "down_proj", + "k_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/adapter_model.safetensors b/v5/KTO/KTO_1k/lora/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9712d9e7f04736675bc00bec1de604f57699cb9a --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3b239bd55af53edec6d484b4d301a06d0d47b890ad745a1f68204dbb7c6ba20 +size 180385008 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/chat_template.jinja b/v5/KTO/KTO_1k/lora/checkpoint-250/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/optimizer.pt b/v5/KTO/KTO_1k/lora/checkpoint-250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..01c4593cfbd494b516af5627e30994696124ed21 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c91f1e34a888d5079bfabd0fcabed1e7b4a1b99b418730847ec73e22e84bb54 +size 360902475 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/rng_state.pth b/v5/KTO/KTO_1k/lora/checkpoint-250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..68c0411dd375a388cbc8c58bea912cb904778ab8 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1786ad2057a678cc204dadc7fc5d1a4f939be477df219f770c7d40e9270281 +size 14645 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/scaler.pt b/v5/KTO/KTO_1k/lora/checkpoint-250/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..849c6fa8080705f2a2c4a4f07a89a8e05bf320fa --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6fca631a6bcdfa2416587314d206a68f40e27a07bc674b76e72a93db4e5058 +size 1383 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/scheduler.pt b/v5/KTO/KTO_1k/lora/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b33f80da6ebe9faee9d44eb9882eb864699291a3 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa68c9756e4b45f9fc5e31507423afdfe40083f2b850885e7dec29d5f3970b1f +size 1465 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/tokenizer.json b/v5/KTO/KTO_1k/lora/checkpoint-250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/tokenizer_config.json b/v5/KTO/KTO_1k/lora/checkpoint-250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/trainer_state.json b/v5/KTO/KTO_1k/lora/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dce27851e13da1df5f4e939d6bca25a2a045b249 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/trainer_state.json @@ -0,0 +1,809 @@ +{ + "best_global_step": 140, + "best_metric": 0.0017091865539550777, + "best_model_checkpoint": "output/lora/checkpoint-140", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 1.9768778085708618, + "kl": 0.015772342681884766, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": 30694041.6, + "logits/rejected": 32548755.2, + "logps/chosen": -147.53765869140625, + "logps/rejected": -127.52252197265625, + "loss": 0.5001121044158936, + "rewards/chosen": 0.00024262431543320417, + "rewards/margins": -0.0008967638248577714, + "rewards/rejected": 0.0011393881402909755, + "step": 10 + }, + { + "epoch": 0.08, + "eval_kl": 0.027425793930888176, + "eval_logits/chosen": 37136154.624, + "eval_logits/rejected": 37056524.288, + "eval_logps/chosen": -155.775578125, + "eval_logps/rejected": -149.951390625, + "eval_loss": 0.4999745488166809, + "eval_rewards/chosen": 0.0014533588886260986, + "eval_rewards/margins": 0.00020384418964385976, + "eval_rewards/rejected": 0.0012495146989822388, + "eval_runtime": 214.2134, + "eval_samples_per_second": 4.668, + "eval_steps_per_second": 2.334, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 1.7118260860443115, + "kl": 0.03703146055340767, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": 54050995.2, + "logits/rejected": 53405817.6, + "logps/chosen": -139.1049072265625, + "logps/rejected": -150.837353515625, + "loss": 0.5002951145172119, + "rewards/chosen": 0.0013979338109493256, + "rewards/margins": -0.002360067889094353, + "rewards/rejected": 0.0037580017000436784, + "step": 20 + }, + { + "epoch": 0.16, + "eval_kl": 0.025570642203092575, + "eval_logits/chosen": 37107462.144, + "eval_logits/rejected": 37030170.624, + "eval_logps/chosen": -155.797375, + "eval_logps/rejected": -149.971109375, + "eval_loss": 0.5000007748603821, + "eval_rewards/chosen": -0.000728474497795105, + "eval_rewards/margins": -6.261587142944317e-06, + "eval_rewards/rejected": -0.0007222129106521606, + "eval_runtime": 215.2897, + "eval_samples_per_second": 4.645, + "eval_steps_per_second": 2.322, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 1.6561487913131714, + "kl": 0.037951041013002396, + "learning_rate": 4.911111111111112e-06, + "logits/chosen": 35367532.8, + "logits/rejected": 35602668.8, + "logps/chosen": -129.755419921875, + "logps/rejected": -138.6831298828125, + "loss": 0.5000378608703613, + "rewards/chosen": 0.0009906148537993432, + "rewards/margins": -0.000303511694073677, + "rewards/rejected": 0.0012941265478730202, + "step": 30 + }, + { + "epoch": 0.24, + "eval_kl": 0.05881376564502716, + "eval_logits/chosen": 36992643.072, + "eval_logits/rejected": 36911718.4, + "eval_logps/chosen": -155.769390625, + "eval_logps/rejected": -149.94559375, + "eval_loss": 0.49997004866600037, + "eval_rewards/chosen": 0.0020712406635284425, + "eval_rewards/margins": 0.0002397129535675049, + "eval_rewards/rejected": 0.0018315277099609376, + "eval_runtime": 215.487, + "eval_samples_per_second": 4.641, + "eval_steps_per_second": 2.32, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 1.3816574811935425, + "kl": 0.057618238031864166, + "learning_rate": 4.6888888888888895e-06, + "logits/chosen": 43991216.0, + "logits/rejected": 44489225.6, + "logps/chosen": -143.26949462890624, + "logps/rejected": -144.8991943359375, + "loss": 0.4999053955078125, + "rewards/chosen": 0.0019632244482636453, + "rewards/margins": 0.000756874028593302, + "rewards/rejected": 0.0012063504196703433, + "step": 40 + }, + { + "epoch": 0.32, + "eval_kl": 0.026525555178523064, + "eval_logits/chosen": 36892434.432, + "eval_logits/rejected": 36817154.048, + "eval_logps/chosen": -155.863125, + "eval_logps/rejected": -150.033375, + "eval_loss": 0.5000438094139099, + "eval_rewards/chosen": -0.007300267219543457, + "eval_rewards/margins": -0.000350986957550049, + "eval_rewards/rejected": -0.006949280261993408, + "eval_runtime": 215.3307, + "eval_samples_per_second": 4.644, + "eval_steps_per_second": 2.322, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 1.295904517173767, + "kl": 0.06024184077978134, + "learning_rate": 4.4666666666666665e-06, + "logits/chosen": 43765830.4, + "logits/rejected": 45366057.6, + "logps/chosen": -139.9152587890625, + "logps/rejected": -153.5464599609375, + "loss": 0.49985275268554685, + "rewards/chosen": -0.0002844284288585186, + "rewards/margins": 0.0011796096339821815, + "rewards/rejected": -0.0014640380628407, + "step": 50 + }, + { + "epoch": 0.4, + "eval_kl": 0.10677888244390488, + "eval_logits/chosen": 36868976.64, + "eval_logits/rejected": 36787453.952, + "eval_logps/chosen": -155.735203125, + "eval_logps/rejected": -149.917390625, + "eval_loss": 0.49989503622055054, + "eval_rewards/chosen": 0.00549051570892334, + "eval_rewards/margins": 0.0008393177986145018, + "eval_rewards/rejected": 0.004651197910308838, + "eval_runtime": 215.4646, + "eval_samples_per_second": 4.641, + "eval_steps_per_second": 2.321, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 1.1608610153198242, + "kl": 0.14546926319599152, + "learning_rate": 4.244444444444445e-06, + "logits/chosen": 37563235.2, + "logits/rejected": 35019974.4, + "logps/chosen": -104.43680419921876, + "logps/rejected": -112.33837890625, + "loss": 0.49872851371765137, + "rewards/chosen": 0.011330313980579376, + "rewards/margins": 0.010192890465259553, + "rewards/rejected": 0.0011374235153198242, + "step": 60 + }, + { + "epoch": 0.48, + "eval_kl": 0.08179865777492523, + "eval_logits/chosen": 36675063.808, + "eval_logits/rejected": 36601872.384, + "eval_logps/chosen": -155.85546875, + "eval_logps/rejected": -150.03453125, + "eval_loss": 0.4999338984489441, + "eval_rewards/chosen": -0.006535920143127441, + "eval_rewards/margins": 0.0005277481079101563, + "eval_rewards/rejected": -0.007063668251037597, + "eval_runtime": 215.4136, + "eval_samples_per_second": 4.642, + "eval_steps_per_second": 2.321, + "step": 60 + }, + { + "epoch": 0.56, + "grad_norm": 1.5624558925628662, + "kl": 0.12186811119318008, + "learning_rate": 4.022222222222222e-06, + "logits/chosen": 48204668.8, + "logits/rejected": 47227776.0, + "logps/chosen": -164.40889892578124, + "logps/rejected": -174.45804443359376, + "loss": 0.4987171173095703, + "rewards/chosen": -0.029685625433921815, + "rewards/margins": 0.010321748256683347, + "rewards/rejected": -0.04000737369060516, + "step": 70 + }, + { + "epoch": 0.56, + "eval_kl": 0.04982582852244377, + "eval_logits/chosen": 36299350.016, + "eval_logits/rejected": 36241809.408, + "eval_logps/chosen": -156.16525, + "eval_logps/rejected": -150.321625, + "eval_loss": 0.5002162456512451, + "eval_rewards/chosen": -0.03751410675048828, + "eval_rewards/margins": -0.0017404594421386727, + "eval_rewards/rejected": -0.03577364730834961, + "eval_runtime": 215.5371, + "eval_samples_per_second": 4.64, + "eval_steps_per_second": 2.32, + "step": 70 + }, + { + "epoch": 0.64, + "grad_norm": 1.8351110219955444, + "kl": 0.10073833167552948, + "learning_rate": 3.8000000000000005e-06, + "logits/chosen": 41128102.4, + "logits/rejected": 41413350.4, + "logps/chosen": -153.54603271484376, + "logps/rejected": -162.3951171875, + "loss": 0.49887776374816895, + "rewards/chosen": -0.02412339448928833, + "rewards/margins": 0.00904244482517242, + "rewards/rejected": -0.03316583931446075, + "step": 80 + }, + { + "epoch": 0.64, + "eval_kl": 0.05352861434221268, + "eval_logits/chosen": 36170702.848, + "eval_logits/rejected": 36119736.32, + "eval_logps/chosen": -156.286734375, + "eval_logps/rejected": -150.43715625, + "eval_loss": 0.5002899765968323, + "eval_rewards/chosen": -0.04966393661499023, + "eval_rewards/margins": -0.0023392181396484357, + "eval_rewards/rejected": -0.047324718475341795, + "eval_runtime": 215.7316, + "eval_samples_per_second": 4.635, + "eval_steps_per_second": 2.318, + "step": 80 + }, + { + "epoch": 0.72, + "grad_norm": 1.6580690145492554, + "kl": 0.19566671550273895, + "learning_rate": 3.577777777777778e-06, + "logits/chosen": 44902732.8, + "logits/rejected": 45233840.0, + "logps/chosen": -168.9074462890625, + "logps/rejected": -158.04383544921876, + "loss": 0.4973473072052002, + "rewards/chosen": -0.014560246467590332, + "rewards/margins": 0.021359801292419434, + "rewards/rejected": -0.03592004776000977, + "step": 90 + }, + { + "epoch": 0.72, + "eval_kl": 0.13689810037612915, + "eval_logits/chosen": 36205449.216, + "eval_logits/rejected": 36148674.56, + "eval_logps/chosen": -156.08928125, + "eval_logps/rejected": -150.249953125, + "eval_loss": 0.5001612305641174, + "eval_rewards/chosen": -0.029915948867797852, + "eval_rewards/margins": -0.0013099956512451182, + "eval_rewards/rejected": -0.028605953216552734, + "eval_runtime": 215.6166, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 90 + }, + { + "epoch": 0.8, + "grad_norm": 1.8306394815444946, + "kl": 0.3133309781551361, + "learning_rate": 3.3555555555555557e-06, + "logits/chosen": 28461398.4, + "logits/rejected": 28053212.8, + "logps/chosen": -132.9733154296875, + "logps/rejected": -162.2736572265625, + "loss": 0.4978146553039551, + "rewards/chosen": 0.0008950136601924896, + "rewards/margins": 0.01778259202837944, + "rewards/rejected": -0.01688757836818695, + "step": 100 + }, + { + "epoch": 0.8, + "eval_kl": 0.17236100137233734, + "eval_logits/chosen": 36047056.896, + "eval_logits/rejected": 35990380.544, + "eval_logps/chosen": -156.12503125, + "eval_logps/rejected": -150.2839375, + "eval_loss": 0.5001822710037231, + "eval_rewards/chosen": -0.033492431640625, + "eval_rewards/margins": -0.0014871749877929663, + "eval_rewards/rejected": -0.032005256652832034, + "eval_runtime": 215.521, + "eval_samples_per_second": 4.64, + "eval_steps_per_second": 2.32, + "step": 100 + }, + { + "epoch": 0.88, + "grad_norm": 1.4526598453521729, + "kl": 0.3579270541667938, + "learning_rate": 3.133333333333334e-06, + "logits/chosen": 38711958.4, + "logits/rejected": 38049481.6, + "logps/chosen": -128.5083740234375, + "logps/rejected": -134.143994140625, + "loss": 0.4988682746887207, + "rewards/chosen": 0.013809390366077423, + "rewards/margins": 0.00909285619854927, + "rewards/rejected": 0.0047165341675281525, + "step": 110 + }, + { + "epoch": 0.88, + "eval_kl": 0.16148880124092102, + "eval_logits/chosen": 35890647.04, + "eval_logits/rejected": 35838906.368, + "eval_logps/chosen": -156.25996875, + "eval_logps/rejected": -150.409828125, + "eval_loss": 0.5002931952476501, + "eval_rewards/chosen": -0.046984207153320315, + "eval_rewards/margins": -0.002391498565673833, + "eval_rewards/rejected": -0.04459270858764648, + "eval_runtime": 215.616, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 110 + }, + { + "epoch": 0.96, + "grad_norm": 1.475139856338501, + "kl": 0.5180838108062744, + "learning_rate": 2.9111111111111114e-06, + "logits/chosen": 47944940.8, + "logits/rejected": 47759408.0, + "logps/chosen": -160.9727294921875, + "logps/rejected": -132.9762939453125, + "loss": 0.5010076999664307, + "rewards/chosen": 0.001300615817308426, + "rewards/margins": -0.008088254928588867, + "rewards/rejected": 0.009388870745897292, + "step": 120 + }, + { + "epoch": 0.96, + "eval_kl": 0.22723029553890228, + "eval_logits/chosen": 36017373.184, + "eval_logits/rejected": 35955687.424, + "eval_logps/chosen": -156.066296875, + "eval_logps/rejected": -150.22778125, + "eval_loss": 0.5001496076583862, + "eval_rewards/chosen": -0.027616947174072266, + "eval_rewards/margins": -0.0012296409606933588, + "eval_rewards/rejected": -0.026387306213378908, + "eval_runtime": 215.5436, + "eval_samples_per_second": 4.639, + "eval_steps_per_second": 2.32, + "step": 120 + }, + { + "epoch": 1.04, + "grad_norm": 1.4342325925827026, + "kl": 0.511700451374054, + "learning_rate": 2.6888888888888892e-06, + "logits/chosen": 38445190.4, + "logits/rejected": 39013670.4, + "logps/chosen": -158.775146484375, + "logps/rejected": -127.187890625, + "loss": 0.49019808769226075, + "rewards/chosen": 0.06660090684890747, + "rewards/margins": 0.07856814712285996, + "rewards/rejected": -0.011967240273952484, + "step": 130 + }, + { + "epoch": 1.04, + "eval_kl": 0.45817074179649353, + "eval_logits/chosen": 36251000.832, + "eval_logits/rejected": 36169760.768, + "eval_logps/chosen": -155.61546875, + "eval_logps/rejected": -149.795578125, + "eval_loss": 0.49991893768310547, + "eval_rewards/chosen": 0.017465354919433594, + "eval_rewards/margins": 0.0006336040496826185, + "eval_rewards/rejected": 0.016831750869750976, + "eval_runtime": 214.8194, + "eval_samples_per_second": 4.655, + "eval_steps_per_second": 2.328, + "step": 130 + }, + { + "epoch": 1.12, + "grad_norm": 1.4938650131225586, + "kl": 0.7170234322547913, + "learning_rate": 2.466666666666667e-06, + "logits/chosen": 43274342.4, + "logits/rejected": 44117376.0, + "logps/chosen": -137.55440673828124, + "logps/rejected": -140.9193115234375, + "loss": 0.4798906326293945, + "rewards/chosen": 0.1318502902984619, + "rewards/margins": 0.16140162050724027, + "rewards/rejected": -0.02955133020877838, + "step": 140 + }, + { + "epoch": 1.12, + "eval_kl": 0.5524640679359436, + "eval_logits/chosen": 36227063.808, + "eval_logits/rejected": 36138741.76, + "eval_logps/chosen": -155.505078125, + "eval_logps/rejected": -149.6959375, + "eval_loss": 0.4997849762439728, + "eval_rewards/chosen": 0.028504100799560547, + "eval_rewards/margins": 0.0017091865539550777, + "eval_rewards/rejected": 0.02679491424560547, + "eval_runtime": 215.0601, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 2.325, + "step": 140 + }, + { + "epoch": 1.2, + "grad_norm": 1.8674030303955078, + "kl": 0.8660959005355835, + "learning_rate": 2.2444444444444445e-06, + "logits/chosen": 41505494.4, + "logits/rejected": 41576633.6, + "logps/chosen": -134.60517578125, + "logps/rejected": -151.21878662109376, + "loss": 0.4738303661346436, + "rewards/chosen": 0.1708309531211853, + "rewards/margins": 0.21031711697578429, + "rewards/rejected": -0.039486163854599, + "step": 150 + }, + { + "epoch": 1.2, + "eval_kl": 0.5169070363044739, + "eval_logits/chosen": 36047007.744, + "eval_logits/rejected": 35964690.432, + "eval_logps/chosen": -155.61309375, + "eval_logps/rejected": -149.796390625, + "eval_loss": 0.4998777508735657, + "eval_rewards/chosen": 0.017700881958007814, + "eval_rewards/margins": 0.0009510231018066417, + "eval_rewards/rejected": 0.016749858856201172, + "eval_runtime": 215.6102, + "eval_samples_per_second": 4.638, + "eval_steps_per_second": 2.319, + "step": 150 + }, + { + "epoch": 1.28, + "grad_norm": 1.2205264568328857, + "kl": 0.8254079818725586, + "learning_rate": 2.0222222222222223e-06, + "logits/chosen": 45365196.8, + "logits/rejected": 44416659.2, + "logps/chosen": -146.26240234375, + "logps/rejected": -146.23955078125, + "loss": 0.47802033424377444, + "rewards/chosen": 0.1414048194885254, + "rewards/margins": 0.1766757071018219, + "rewards/rejected": -0.03527088761329651, + "step": 160 + }, + { + "epoch": 1.28, + "eval_kl": 0.46929916739463806, + "eval_logits/chosen": 35877081.088, + "eval_logits/rejected": 35800481.792, + "eval_logps/chosen": -155.73628125, + "eval_logps/rejected": -149.909328125, + "eval_loss": 0.500004231929779, + "eval_rewards/chosen": 0.005384265422821045, + "eval_rewards/margins": -7.338762283325227e-05, + "eval_rewards/rejected": 0.005457653045654297, + "eval_runtime": 215.8067, + "eval_samples_per_second": 4.634, + "eval_steps_per_second": 2.317, + "step": 160 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.7353568077087402, + "kl": 0.7226912379264832, + "learning_rate": 1.8000000000000001e-06, + "logits/chosen": 44174476.8, + "logits/rejected": 45570624.0, + "logps/chosen": -141.496435546875, + "logps/rejected": -148.9132080078125, + "loss": 0.4752659797668457, + "rewards/chosen": 0.1527896046638489, + "rewards/margins": 0.19837732315063478, + "rewards/rejected": -0.04558771848678589, + "step": 170 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 0.4444282352924347, + "eval_logits/chosen": 35815936.0, + "eval_logits/rejected": 35742294.016, + "eval_logps/chosen": -155.7953125, + "eval_logps/rejected": -149.969390625, + "eval_loss": 0.499990314245224, + "eval_rewards/chosen": -0.0005206142663955689, + "eval_rewards/margins": 2.925407886505125e-05, + "eval_rewards/rejected": -0.0005498683452606201, + "eval_runtime": 215.3994, + "eval_samples_per_second": 4.643, + "eval_steps_per_second": 2.321, + "step": 170 + }, + { + "epoch": 1.44, + "grad_norm": 1.6643599271774292, + "kl": 0.48263949155807495, + "learning_rate": 1.5777777777777778e-06, + "logits/chosen": 38892889.6, + "logits/rejected": 37222313.6, + "logps/chosen": -119.811962890625, + "logps/rejected": -135.897265625, + "loss": 0.47057647705078126, + "rewards/chosen": 0.10403428077697754, + "rewards/margins": 0.24373894929885864, + "rewards/rejected": -0.1397046685218811, + "step": 180 + }, + { + "epoch": 1.44, + "eval_kl": 0.4670499563217163, + "eval_logits/chosen": 35738546.176, + "eval_logits/rejected": 35665698.816, + "eval_logps/chosen": -155.79365625, + "eval_logps/rejected": -149.968703125, + "eval_loss": 0.49997708201408386, + "eval_rewards/chosen": -0.00035465264320373534, + "eval_rewards/margins": 0.00012645322084426882, + "eval_rewards/rejected": -0.00048110586404800416, + "eval_runtime": 215.7221, + "eval_samples_per_second": 4.636, + "eval_steps_per_second": 2.318, + "step": 180 + }, + { + "epoch": 1.52, + "grad_norm": 1.6640219688415527, + "kl": 0.7541912794113159, + "learning_rate": 1.3555555555555558e-06, + "logits/chosen": 41279955.2, + "logits/rejected": 40511801.6, + "logps/chosen": -122.19615478515625, + "logps/rejected": -145.7674072265625, + "loss": 0.46909322738647463, + "rewards/chosen": 0.12193760871887208, + "rewards/margins": 0.2589147567749024, + "rewards/rejected": -0.13697714805603028, + "step": 190 + }, + { + "epoch": 1.52, + "eval_kl": 0.4091774523258209, + "eval_logits/chosen": 35500843.008, + "eval_logits/rejected": 35437404.16, + "eval_logps/chosen": -155.991421875, + "eval_logps/rejected": -150.15725, + "eval_loss": 0.5000874996185303, + "eval_rewards/chosen": -0.020132492065429688, + "eval_rewards/margins": -0.0007962665557861327, + "eval_rewards/rejected": -0.019336225509643555, + "eval_runtime": 215.236, + "eval_samples_per_second": 4.646, + "eval_steps_per_second": 2.323, + "step": 190 + }, + { + "epoch": 1.6, + "grad_norm": 1.707406759262085, + "kl": 0.5876865983009338, + "learning_rate": 1.1333333333333334e-06, + "logits/chosen": 40565459.2, + "logits/rejected": 41111059.2, + "logps/chosen": -132.433154296875, + "logps/rejected": -156.41756591796874, + "loss": 0.4774333477020264, + "rewards/chosen": 0.058162355422973634, + "rewards/margins": 0.1845989227294922, + "rewards/rejected": -0.12643656730651856, + "step": 200 + }, + { + "epoch": 1.6, + "eval_kl": 0.40361329913139343, + "eval_logits/chosen": 35455897.6, + "eval_logits/rejected": 35396550.656, + "eval_logps/chosen": -156.041859375, + "eval_logps/rejected": -150.2026875, + "eval_loss": 0.5001482963562012, + "eval_rewards/chosen": -0.02517437744140625, + "eval_rewards/margins": -0.0012942276000976576, + "eval_rewards/rejected": -0.023880149841308592, + "eval_runtime": 215.0425, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 2.325, + "step": 200 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.8582910299301147, + "kl": 0.8855921626091003, + "learning_rate": 9.111111111111113e-07, + "logits/chosen": 32328320.0, + "logits/rejected": 31538704.0, + "logps/chosen": -185.6922119140625, + "logps/rejected": -180.88406982421876, + "loss": 0.4729271411895752, + "rewards/chosen": 0.14517008066177367, + "rewards/margins": 0.2178096830844879, + "rewards/rejected": -0.07263960242271424, + "step": 210 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 0.4077625572681427, + "eval_logits/chosen": 35421622.272, + "eval_logits/rejected": 35361173.504, + "eval_logps/chosen": -156.0609375, + "eval_logps/rejected": -150.21765625, + "eval_loss": 0.5001978874206543, + "eval_rewards/chosen": -0.027083938598632812, + "eval_rewards/margins": -0.0017067718505859378, + "eval_rewards/rejected": -0.025377166748046874, + "eval_runtime": 215.0369, + "eval_samples_per_second": 4.65, + "eval_steps_per_second": 2.325, + "step": 210 + }, + { + "epoch": 1.76, + "grad_norm": 1.5523241758346558, + "kl": 0.7643419504165649, + "learning_rate": 6.88888888888889e-07, + "logits/chosen": 44855187.2, + "logits/rejected": 43988204.8, + "logps/chosen": -147.70128173828124, + "logps/rejected": -145.88345947265626, + "loss": 0.4781179904937744, + "rewards/chosen": 0.11672601699829102, + "rewards/margins": 0.1772436797618866, + "rewards/rejected": -0.06051766276359558, + "step": 220 + }, + { + "epoch": 1.76, + "eval_kl": 0.42980414628982544, + "eval_logits/chosen": 35399405.568, + "eval_logits/rejected": 35338944.512, + "eval_logps/chosen": -156.039234375, + "eval_logps/rejected": -150.198140625, + "eval_loss": 0.5001705884933472, + "eval_rewards/chosen": -0.024912540435791015, + "eval_rewards/margins": -0.0014874629974365242, + "eval_rewards/rejected": -0.02342507743835449, + "eval_runtime": 214.8075, + "eval_samples_per_second": 4.655, + "eval_steps_per_second": 2.328, + "step": 220 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 1.4052627086639404, + "kl": 0.560142457485199, + "learning_rate": 4.666666666666667e-07, + "logits/chosen": 32892409.6, + "logits/rejected": 31692844.8, + "logps/chosen": -114.99593505859374, + "logps/rejected": -125.1630126953125, + "loss": 0.4770528793334961, + "rewards/chosen": 0.07531413435935974, + "rewards/margins": 0.18859378695487977, + "rewards/rejected": -0.11327965259552002, + "step": 230 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 0.42431339621543884, + "eval_logits/chosen": 35335073.792, + "eval_logits/rejected": 35277471.744, + "eval_logps/chosen": -156.08775, + "eval_logps/rejected": -150.240609375, + "eval_loss": 0.5002422332763672, + "eval_rewards/chosen": -0.02976216125488281, + "eval_rewards/margins": -0.0020921192169189445, + "eval_rewards/rejected": -0.027670042037963867, + "eval_runtime": 214.985, + "eval_samples_per_second": 4.651, + "eval_steps_per_second": 2.326, + "step": 230 + }, + { + "epoch": 1.92, + "grad_norm": 1.5062552690505981, + "kl": 1.0432134866714478, + "learning_rate": 2.444444444444445e-07, + "logits/chosen": 40635660.8, + "logits/rejected": 40671884.8, + "logps/chosen": -160.55706787109375, + "logps/rejected": -154.69022216796876, + "loss": 0.48739013671875, + "rewards/chosen": 0.11838672161102295, + "rewards/margins": 0.10130963623523713, + "rewards/rejected": 0.01707708537578583, + "step": 240 + }, + { + "epoch": 1.92, + "eval_kl": 0.41829124093055725, + "eval_logits/chosen": 35317178.368, + "eval_logits/rejected": 35259428.864, + "eval_logps/chosen": -156.105671875, + "eval_logps/rejected": -150.261796875, + "eval_loss": 0.500201940536499, + "eval_rewards/chosen": -0.03155590438842774, + "eval_rewards/margins": -0.0017659969329834012, + "eval_rewards/rejected": -0.029789907455444336, + "eval_runtime": 215.1895, + "eval_samples_per_second": 4.647, + "eval_steps_per_second": 2.324, + "step": 240 + }, + { + "epoch": 2.0, + "grad_norm": 1.4132362604141235, + "kl": 0.8731099963188171, + "learning_rate": 2.2222222222222224e-08, + "logits/chosen": 47889168.0, + "logits/rejected": 48506752.0, + "logps/chosen": -159.80478515625, + "logps/rejected": -135.6299560546875, + "loss": 0.48304042816162107, + "rewards/chosen": 0.10256694555282593, + "rewards/margins": 0.13703858554363252, + "rewards/rejected": -0.03447163999080658, + "step": 250 + }, + { + "epoch": 2.0, + "eval_kl": 0.42476609349250793, + "eval_logits/chosen": 35321581.568, + "eval_logits/rejected": 35262775.296, + "eval_logps/chosen": -156.09065625, + "eval_logps/rejected": -150.24690625, + "eval_loss": 0.500200092792511, + "eval_rewards/chosen": -0.030054378509521484, + "eval_rewards/margins": -0.0017528228759765632, + "eval_rewards/rejected": -0.02830155563354492, + "eval_runtime": 214.7934, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 250 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_1k/lora/checkpoint-250/training_args.bin b/v5/KTO/KTO_1k/lora/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..c08165c47630e923463a136aea666c753d71c0a5 --- /dev/null +++ b/v5/KTO/KTO_1k/lora/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01f0c8776981919c9c02fc9f496de711556353cd990690edb7e08892b3e90ddf +size 5649 diff --git a/v5/KTO/KTO_20k/KTO_20k/README.md b/v5/KTO/KTO_20k/KTO_20k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_20k/KTO_20k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_20k/KTO_20k/adapter_config.json b/v5/KTO/KTO_20k/KTO_20k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5726eb3a65b963dd94788413b8a63d4accbb95c3 --- /dev/null +++ b/v5/KTO/KTO_20k/KTO_20k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "k_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_20k/KTO_20k/adapter_model.safetensors b/v5/KTO/KTO_20k/KTO_20k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ffdc50f5c8901f45be168f0ba6b75d44562215f3 --- /dev/null +++ b/v5/KTO/KTO_20k/KTO_20k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a58550c550a64ca56e172c92c1d63b43276bcab05d53668b6034492a783b380 +size 180385008 diff --git a/v5/KTO/KTO_20k/MKTO_20k/chat_template.jinja b/v5/KTO/KTO_20k/MKTO_20k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_20k/MKTO_20k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_20k/MKTO_20k/config.json b/v5/KTO/KTO_20k/MKTO_20k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ec18bc1ae18922052a57480e28401f3b9c6b84 --- /dev/null +++ b/v5/KTO/KTO_20k/MKTO_20k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/KTO/KTO_20k/MKTO_20k/generation_config.json b/v5/KTO/KTO_20k/MKTO_20k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..898a2e748a41a4b7a931b754f8abbdac02039fae --- /dev/null +++ b/v5/KTO/KTO_20k/MKTO_20k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.2.0" +} diff --git a/v5/KTO/KTO_20k/MKTO_20k/model.safetensors b/v5/KTO/KTO_20k/MKTO_20k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d051172c90b03e72ebf5127ba1fd6ba8b5758511 --- /dev/null +++ b/v5/KTO/KTO_20k/MKTO_20k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d89523986a5bae229b165f8d340d4b51954d2819711e0c8f2dcd4145d030c428 +size 2471645464 diff --git a/v5/KTO/KTO_20k/MKTO_20k/tokenizer.json b/v5/KTO/KTO_20k/MKTO_20k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_20k/MKTO_20k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_20k/MKTO_20k/tokenizer_config.json b/v5/KTO/KTO_20k/MKTO_20k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_20k/MKTO_20k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_20k/lora/README.md b/v5/KTO/KTO_20k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e14b8a0409cc1375b9f36212e5299f21f43c9cba --- /dev/null +++ b/v5/KTO/KTO_20k/lora/README.md @@ -0,0 +1,66 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- kto +- trl +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/3ldvns74) + + +This model was trained with KTO, a method introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306). + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 5.2.0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite KTO as: + +```bibtex +@article{ethayarajh2024kto, + title = {{KTO: Model Alignment as Prospect Theoretic Optimization}}, + author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela}, + year = 2024, + eprint = {arXiv:2402.01306}, +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/README.md b/v5/KTO/KTO_20k/lora/checkpoint-4600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/adapter_config.json b/v5/KTO/KTO_20k/lora/checkpoint-4600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5726eb3a65b963dd94788413b8a63d4accbb95c3 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "k_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/adapter_model.safetensors b/v5/KTO/KTO_20k/lora/checkpoint-4600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ffdc50f5c8901f45be168f0ba6b75d44562215f3 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a58550c550a64ca56e172c92c1d63b43276bcab05d53668b6034492a783b380 +size 180385008 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/chat_template.jinja b/v5/KTO/KTO_20k/lora/checkpoint-4600/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/optimizer.pt b/v5/KTO/KTO_20k/lora/checkpoint-4600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a1853a72b9c57e128083e956b4606235145c72f --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e565373ec8e5711a7c3b99dc1c121a6e09341d7888e368bc8b3fc4f1c3dd3819 +size 360902475 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/rng_state.pth b/v5/KTO/KTO_20k/lora/checkpoint-4600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..80f98e729a85d79fa77e6570e00c15b63087b058 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a804dd9b4962bc1e7c8e5b51c83ce95f04ab0a366340b47fc4849e7d4ecffd6d +size 14645 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/scaler.pt b/v5/KTO/KTO_20k/lora/checkpoint-4600/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..16c64071594d1239890f9c2446259dcdd385b2e4 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d5f1d0b668240e318afe61a3e255204044d52e6d78b98d08469e3aaa3293711 +size 1383 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/scheduler.pt b/v5/KTO/KTO_20k/lora/checkpoint-4600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..75d192701cd4965993cbe27086d3a1855ee03524 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64a4d1f4318958e5461e0783e92780d386d34e250984ad9254aad65a0f968189 +size 1465 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/tokenizer.json b/v5/KTO/KTO_20k/lora/checkpoint-4600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/tokenizer_config.json b/v5/KTO/KTO_20k/lora/checkpoint-4600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/trainer_state.json b/v5/KTO/KTO_20k/lora/checkpoint-4600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..21cb4ed410ef607bd87f0fabffef581c7a4ec756 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/trainer_state.json @@ -0,0 +1,7302 @@ +{ + "best_global_step": 4600, + "best_metric": 0.2337820434570313, + "best_model_checkpoint": "output/lora/checkpoint-4600", + "epoch": 1.8399999999999999, + "eval_steps": 200, + "global_step": 4600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004, + "grad_norm": 0.5129354000091553, + "kl": 0.01111381035298109, + "learning_rate": 9e-08, + "logits/chosen": 29682550.4, + "logits/rejected": 31339625.6, + "logps/chosen": -148.96693115234376, + "logps/rejected": -128.8356201171875, + "loss": 0.5001067161560059, + "rewards/chosen": -0.0005946397315710783, + "rewards/margins": -0.0008538532070815563, + "rewards/rejected": 0.000259213475510478, + "step": 10 + }, + { + "epoch": 0.008, + "grad_norm": 0.4323230981826782, + "kl": 0.015593004412949085, + "learning_rate": 1.9e-07, + "logits/chosen": 53384144.0, + "logits/rejected": 52884704.0, + "logps/chosen": -140.024853515625, + "logps/rejected": -151.92880859375, + "loss": 0.49987268447875977, + "rewards/chosen": 0.0006237029097974301, + "rewards/margins": 0.0010180996730923652, + "rewards/rejected": -0.0003943967632949352, + "step": 20 + }, + { + "epoch": 0.012, + "grad_norm": 0.4257548451423645, + "kl": 0.014815926551818848, + "learning_rate": 2.9000000000000003e-07, + "logits/chosen": 34151433.6, + "logits/rejected": 34198240.0, + "logps/chosen": -131.73375244140624, + "logps/rejected": -140.37911376953124, + "loss": 0.4998063087463379, + "rewards/chosen": 0.0004901790525764227, + "rewards/margins": 0.0015492869075387715, + "rewards/rejected": -0.0010591078549623489, + "step": 30 + }, + { + "epoch": 0.016, + "grad_norm": 0.36496493220329285, + "kl": 0.02263352833688259, + "learning_rate": 3.9e-07, + "logits/chosen": 43278188.8, + "logits/rejected": 43919286.4, + "logps/chosen": -144.2862060546875, + "logps/rejected": -146.0272705078125, + "loss": 0.4999645233154297, + "rewards/chosen": 0.0011271238327026367, + "rewards/margins": 0.00028378488495945926, + "rewards/rejected": 0.0008433389477431775, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.3303475081920624, + "kl": 0.018513035029172897, + "learning_rate": 4.900000000000001e-07, + "logits/chosen": 43083849.6, + "logits/rejected": 44890182.4, + "logps/chosen": -140.999267578125, + "logps/rejected": -154.3203369140625, + "loss": 0.4999688625335693, + "rewards/chosen": 0.0011019515804946423, + "rewards/margins": 0.0002493190579116345, + "rewards/rejected": 0.0008526325225830078, + "step": 50 + }, + { + "epoch": 0.024, + "grad_norm": 0.2820725739002228, + "kl": 0.01858975924551487, + "learning_rate": 5.900000000000001e-07, + "logits/chosen": 36625328.0, + "logits/rejected": 34144819.2, + "logps/chosen": -105.73199462890625, + "logps/rejected": -114.0021728515625, + "loss": 0.5000367164611816, + "rewards/chosen": 0.0006336641497910022, + "rewards/margins": -0.0002944803796708584, + "rewards/rejected": 0.0009281445294618606, + "step": 60 + }, + { + "epoch": 0.028, + "grad_norm": 0.3881119191646576, + "kl": 0.00938491802662611, + "learning_rate": 6.900000000000001e-07, + "logits/chosen": 47839347.2, + "logits/rejected": 46951110.4, + "logps/chosen": -165.68013916015624, + "logps/rejected": -175.11986083984374, + "loss": 0.5000545501708984, + "rewards/chosen": -0.004812383651733398, + "rewards/margins": -0.0004361916333436959, + "rewards/rejected": -0.004376192018389702, + "step": 70 + }, + { + "epoch": 0.032, + "grad_norm": 0.4655516743659973, + "kl": 0.011602235026657581, + "learning_rate": 7.900000000000001e-07, + "logits/chosen": 40787196.8, + "logits/rejected": 40853612.8, + "logps/chosen": -154.95506591796874, + "logps/rejected": -163.55113525390624, + "loss": 0.4999113082885742, + "rewards/chosen": -0.003601384162902832, + "rewards/margins": 0.0007092095911502838, + "rewards/rejected": -0.004310593754053116, + "step": 80 + }, + { + "epoch": 0.036, + "grad_norm": 0.3819780647754669, + "kl": 0.02207348309457302, + "learning_rate": 8.900000000000001e-07, + "logits/chosen": 44163190.4, + "logits/rejected": 44268889.6, + "logps/chosen": -169.8670166015625, + "logps/rejected": -159.19212646484374, + "loss": 0.4996920108795166, + "rewards/chosen": -0.0014049055054783822, + "rewards/margins": 0.002464146353304386, + "rewards/rejected": -0.003869051858782768, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 0.4587983191013336, + "kl": 0.056853484362363815, + "learning_rate": 9.9e-07, + "logits/chosen": 27709289.6, + "logits/rejected": 27346092.8, + "logps/chosen": -134.2815185546875, + "logps/rejected": -164.53704833984375, + "loss": 0.4997425556182861, + "rewards/chosen": 0.0017462443560361863, + "rewards/margins": 0.0020607755985111, + "rewards/rejected": -0.0003145312424749136, + "step": 100 + }, + { + "epoch": 0.044, + "grad_norm": 0.3832976818084717, + "kl": 0.052884578704833984, + "learning_rate": 1.0900000000000002e-06, + "logits/chosen": 38150864.0, + "logits/rejected": 37954022.4, + "logps/chosen": -130.54158935546874, + "logps/rejected": -135.6479248046875, + "loss": 0.49963693618774413, + "rewards/chosen": 0.002483482100069523, + "rewards/margins": 0.0029049014206975698, + "rewards/rejected": -0.000421419320628047, + "step": 110 + }, + { + "epoch": 0.048, + "grad_norm": 0.3761675953865051, + "kl": 0.06726250797510147, + "learning_rate": 1.19e-06, + "logits/chosen": 47769347.2, + "logits/rejected": 47376777.6, + "logps/chosen": -162.1564208984375, + "logps/rejected": -133.792041015625, + "loss": 0.5000278949737549, + "rewards/chosen": 0.00019950373098254204, + "rewards/margins": -0.00022385641932487488, + "rewards/rejected": 0.0004233601503074169, + "step": 120 + }, + { + "epoch": 0.052, + "grad_norm": 0.3125726580619812, + "kl": 0.15044990181922913, + "learning_rate": 1.2900000000000001e-06, + "logits/chosen": 40041744.0, + "logits/rejected": 39132192.0, + "logps/chosen": -162.06031494140626, + "logps/rejected": -140.18397216796876, + "loss": 0.5000635147094726, + "rewards/chosen": 0.011354871094226837, + "rewards/margins": -0.0005075931549072266, + "rewards/rejected": 0.011862464249134064, + "step": 130 + }, + { + "epoch": 0.056, + "grad_norm": 0.35332098603248596, + "kl": 0.2332003116607666, + "learning_rate": 1.3900000000000002e-06, + "logits/chosen": 40936960.0, + "logits/rejected": 42938476.8, + "logps/chosen": -142.440185546875, + "logps/rejected": -157.8077880859375, + "loss": 0.5003787040710449, + "rewards/chosen": 0.021105077862739564, + "rewards/margins": -0.003029544651508332, + "rewards/rejected": 0.024134622514247896, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 0.37256330251693726, + "kl": 0.25889211893081665, + "learning_rate": 1.4900000000000001e-06, + "logits/chosen": 41140886.4, + "logits/rejected": 42385772.8, + "logps/chosen": -130.3114013671875, + "logps/rejected": -117.353369140625, + "loss": 0.49920454025268557, + "rewards/chosen": 0.02739974558353424, + "rewards/margins": 0.006363460421562196, + "rewards/rejected": 0.021036285161972045, + "step": 150 + }, + { + "epoch": 0.064, + "grad_norm": 0.38222262263298035, + "kl": 0.3707125782966614, + "learning_rate": 1.5900000000000002e-06, + "logits/chosen": 34721360.0, + "logits/rejected": 35081372.8, + "logps/chosen": -143.524462890625, + "logps/rejected": -147.06605224609376, + "loss": 0.49993181228637695, + "rewards/chosen": 0.03622217178344726, + "rewards/margins": 0.0005454152822494465, + "rewards/rejected": 0.035676756501197816, + "step": 160 + }, + { + "epoch": 0.068, + "grad_norm": 0.3260433077812195, + "kl": 0.4656868577003479, + "learning_rate": 1.6900000000000003e-06, + "logits/chosen": 37774179.2, + "logits/rejected": 39969705.6, + "logps/chosen": -138.2024658203125, + "logps/rejected": -149.75228271484374, + "loss": 0.4999128818511963, + "rewards/chosen": 0.04691722691059112, + "rewards/margins": 0.0006973743438720703, + "rewards/rejected": 0.04621985256671905, + "step": 170 + }, + { + "epoch": 0.072, + "grad_norm": 0.47040465474128723, + "kl": 0.6031174659729004, + "learning_rate": 1.79e-06, + "logits/chosen": 44058707.2, + "logits/rejected": 45027283.2, + "logps/chosen": -145.10814208984374, + "logps/rejected": -170.75323486328125, + "loss": 0.5000656604766845, + "rewards/chosen": 0.060049277544021604, + "rewards/margins": -0.0005249440670013483, + "rewards/rejected": 0.06057422161102295, + "step": 180 + }, + { + "epoch": 0.076, + "grad_norm": 0.3483351767063141, + "kl": 0.5900982022285461, + "learning_rate": 1.8900000000000001e-06, + "logits/chosen": 41571993.6, + "logits/rejected": 43139596.8, + "logps/chosen": -124.036376953125, + "logps/rejected": -137.070751953125, + "loss": 0.5001267910003662, + "rewards/chosen": 0.05850306153297424, + "rewards/margins": -0.0010134875774383545, + "rewards/rejected": 0.0595165491104126, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 0.47156763076782227, + "kl": 0.654812753200531, + "learning_rate": 1.9900000000000004e-06, + "logits/chosen": 46899654.4, + "logits/rejected": 48147660.8, + "logps/chosen": -150.88424072265624, + "logps/rejected": -172.6162109375, + "loss": 0.4997762680053711, + "rewards/chosen": 0.06637628674507141, + "rewards/margins": 0.001789605617523199, + "rewards/rejected": 0.06458668112754821, + "step": 200 + }, + { + "epoch": 0.08, + "eval_kl": 0.4785654842853546, + "eval_logits/chosen": 39006478.336, + "eval_logits/rejected": 38887682.048, + "eval_logps/chosen": -153.8359375, + "eval_logps/rejected": -148.1899375, + "eval_loss": 0.49953681230545044, + "eval_rewards/chosen": 0.04898439407348633, + "eval_rewards/margins": 0.00370624542236328, + "eval_rewards/rejected": 0.04527814865112305, + "eval_runtime": 217.7826, + "eval_samples_per_second": 4.592, + "eval_steps_per_second": 2.296, + "step": 200 + }, + { + "epoch": 0.084, + "grad_norm": 0.3955392837524414, + "kl": 0.3641941249370575, + "learning_rate": 2.09e-06, + "logits/chosen": 34072531.2, + "logits/rejected": 34148444.8, + "logps/chosen": -140.24954833984376, + "logps/rejected": -132.308837890625, + "loss": 0.5002529621124268, + "rewards/chosen": 0.02951604127883911, + "rewards/margins": -0.002015212178230287, + "rewards/rejected": 0.0315312534570694, + "step": 210 + }, + { + "epoch": 0.088, + "grad_norm": 0.3798522651195526, + "kl": 0.2939055263996124, + "learning_rate": 2.19e-06, + "logits/chosen": 35659238.4, + "logits/rejected": 36517523.2, + "logps/chosen": -101.61099853515626, + "logps/rejected": -126.22640380859374, + "loss": 0.500004768371582, + "rewards/chosen": 0.023823246359825134, + "rewards/margins": -4.143416881561418e-05, + "rewards/rejected": 0.02386468052864075, + "step": 220 + }, + { + "epoch": 0.092, + "grad_norm": 0.4183703660964966, + "kl": 0.16193707287311554, + "learning_rate": 2.29e-06, + "logits/chosen": 47875980.8, + "logits/rejected": 46433056.0, + "logps/chosen": -185.33223876953124, + "logps/rejected": -163.7679443359375, + "loss": 0.5001229286193848, + "rewards/chosen": 0.002850056067109108, + "rewards/margins": -0.0010004475712776183, + "rewards/rejected": 0.003850503638386726, + "step": 230 + }, + { + "epoch": 0.096, + "grad_norm": 0.41121652722358704, + "kl": 0.2479170858860016, + "learning_rate": 2.39e-06, + "logits/chosen": 48178169.6, + "logits/rejected": 48277104.0, + "logps/chosen": -176.87537841796876, + "logps/rejected": -166.7927978515625, + "loss": 0.5001242637634278, + "rewards/chosen": 0.015346670150756836, + "rewards/margins": -0.0010146483778953556, + "rewards/rejected": 0.01636131852865219, + "step": 240 + }, + { + "epoch": 0.1, + "grad_norm": 0.49889811873435974, + "kl": 0.2833125591278076, + "learning_rate": 2.4900000000000003e-06, + "logits/chosen": 37073097.6, + "logits/rejected": 35158246.4, + "logps/chosen": -176.40118408203125, + "logps/rejected": -145.24967041015626, + "loss": 0.49751877784729004, + "rewards/chosen": 0.03347367346286774, + "rewards/margins": 0.01985820829868317, + "rewards/rejected": 0.01361546516418457, + "step": 250 + }, + { + "epoch": 0.104, + "grad_norm": 0.30958813428878784, + "kl": 0.262741357088089, + "learning_rate": 2.59e-06, + "logits/chosen": 30595280.0, + "logits/rejected": 29650486.4, + "logps/chosen": -129.1654296875, + "logps/rejected": -131.6908935546875, + "loss": 0.4989294528961182, + "rewards/chosen": 0.02159818708896637, + "rewards/margins": 0.008565258979797364, + "rewards/rejected": 0.013032928109169006, + "step": 260 + }, + { + "epoch": 0.108, + "grad_norm": 0.5211781859397888, + "kl": 0.45164403319358826, + "learning_rate": 2.6900000000000005e-06, + "logits/chosen": 39772844.8, + "logits/rejected": 40001942.4, + "logps/chosen": -157.1872314453125, + "logps/rejected": -151.93499755859375, + "loss": 0.498414421081543, + "rewards/chosen": 0.04374273419380188, + "rewards/margins": 0.012685334682464602, + "rewards/rejected": 0.03105739951133728, + "step": 270 + }, + { + "epoch": 0.112, + "grad_norm": 0.4277898073196411, + "kl": 0.4805964529514313, + "learning_rate": 2.7900000000000004e-06, + "logits/chosen": 33355596.8, + "logits/rejected": 27445552.0, + "logps/chosen": -167.81568603515626, + "logps/rejected": -140.00860595703125, + "loss": 0.5012622356414795, + "rewards/chosen": 0.035515934228897095, + "rewards/margins": -0.010110187530517581, + "rewards/rejected": 0.045626121759414676, + "step": 280 + }, + { + "epoch": 0.116, + "grad_norm": 0.34256765246391296, + "kl": 0.7766927480697632, + "learning_rate": 2.89e-06, + "logits/chosen": 31572070.4, + "logits/rejected": 30722460.8, + "logps/chosen": -145.83292236328126, + "logps/rejected": -147.77294921875, + "loss": 0.4969996452331543, + "rewards/chosen": 0.08674753904342651, + "rewards/margins": 0.024010515213012687, + "rewards/rejected": 0.06273702383041382, + "step": 290 + }, + { + "epoch": 0.12, + "grad_norm": 0.5291184782981873, + "kl": 0.9063084721565247, + "learning_rate": 2.99e-06, + "logits/chosen": 42822204.8, + "logits/rejected": 43076217.6, + "logps/chosen": -168.21903076171876, + "logps/rejected": -179.7796630859375, + "loss": 0.49880061149597166, + "rewards/chosen": 0.07154507040977479, + "rewards/margins": 0.009801769256591805, + "rewards/rejected": 0.06174330115318298, + "step": 300 + }, + { + "epoch": 0.124, + "grad_norm": 0.37273386120796204, + "kl": 0.7940840125083923, + "learning_rate": 3.09e-06, + "logits/chosen": 34380668.8, + "logits/rejected": 35391296.0, + "logps/chosen": -146.24031982421874, + "logps/rejected": -155.27100830078126, + "loss": 0.501917552947998, + "rewards/chosen": 0.06237313747406006, + "rewards/margins": -0.015355908870697023, + "rewards/rejected": 0.07772904634475708, + "step": 310 + }, + { + "epoch": 0.128, + "grad_norm": 0.606876015663147, + "kl": 0.3188292682170868, + "learning_rate": 3.1900000000000004e-06, + "logits/chosen": 38069395.2, + "logits/rejected": 35962320.0, + "logps/chosen": -160.331884765625, + "logps/rejected": -134.08292236328126, + "loss": 0.5004048347473145, + "rewards/chosen": -0.0012070264667272568, + "rewards/margins": -0.0032936643809080126, + "rewards/rejected": 0.0020866379141807555, + "step": 320 + }, + { + "epoch": 0.132, + "grad_norm": 0.37304550409317017, + "kl": 0.49732810258865356, + "learning_rate": 3.2900000000000003e-06, + "logits/chosen": 47658496.0, + "logits/rejected": 44939072.0, + "logps/chosen": -153.3678955078125, + "logps/rejected": -144.11009521484374, + "loss": 0.49539766311645506, + "rewards/chosen": 0.047433477640151975, + "rewards/margins": 0.036879205703735346, + "rewards/rejected": 0.010554271936416625, + "step": 330 + }, + { + "epoch": 0.136, + "grad_norm": 0.39304211735725403, + "kl": 0.6668508052825928, + "learning_rate": 3.3900000000000006e-06, + "logits/chosen": 40542966.4, + "logits/rejected": 41501622.4, + "logps/chosen": -134.35115966796874, + "logps/rejected": -143.7405029296875, + "loss": 0.500092887878418, + "rewards/chosen": 0.05502796769142151, + "rewards/margins": -0.0008756637573242229, + "rewards/rejected": 0.05590363144874573, + "step": 340 + }, + { + "epoch": 0.14, + "grad_norm": 0.40085893869400024, + "kl": 1.0664705038070679, + "learning_rate": 3.49e-06, + "logits/chosen": 38259638.4, + "logits/rejected": 37112304.0, + "logps/chosen": -134.30029296875, + "logps/rejected": -169.855859375, + "loss": 0.501033353805542, + "rewards/chosen": 0.09948489665985108, + "rewards/margins": -0.008264517784118644, + "rewards/rejected": 0.10774941444396972, + "step": 350 + }, + { + "epoch": 0.144, + "grad_norm": 0.3823564350605011, + "kl": 0.684799075126648, + "learning_rate": 3.5900000000000004e-06, + "logits/chosen": 41781318.4, + "logits/rejected": 39330675.2, + "logps/chosen": -147.53521728515625, + "logps/rejected": -112.3668701171875, + "loss": 0.4975080966949463, + "rewards/chosen": 0.07126325964927674, + "rewards/margins": 0.019923430681228642, + "rewards/rejected": 0.0513398289680481, + "step": 360 + }, + { + "epoch": 0.148, + "grad_norm": 0.3882247507572174, + "kl": 0.9012428522109985, + "learning_rate": 3.6900000000000002e-06, + "logits/chosen": 38998243.2, + "logits/rejected": 39052438.4, + "logps/chosen": -146.253564453125, + "logps/rejected": -150.4812744140625, + "loss": 0.5019874572753906, + "rewards/chosen": 0.07476127743721009, + "rewards/margins": -0.01590984463691711, + "rewards/rejected": 0.0906711220741272, + "step": 370 + }, + { + "epoch": 0.152, + "grad_norm": 0.6131926774978638, + "kl": 1.3055822849273682, + "learning_rate": 3.79e-06, + "logits/chosen": 34907878.4, + "logits/rejected": 35887766.4, + "logps/chosen": -155.17113037109374, + "logps/rejected": -166.9649658203125, + "loss": 0.501701831817627, + "rewards/chosen": 0.11749210357666015, + "rewards/margins": -0.013779759407043457, + "rewards/rejected": 0.1312718629837036, + "step": 380 + }, + { + "epoch": 0.156, + "grad_norm": 0.40174201130867004, + "kl": 1.3810280561447144, + "learning_rate": 3.89e-06, + "logits/chosen": 32044432.0, + "logits/rejected": 31293644.8, + "logps/chosen": -174.93670654296875, + "logps/rejected": -149.1033935546875, + "loss": 0.49690823554992675, + "rewards/chosen": 0.14782886505126952, + "rewards/margins": 0.024736273288726796, + "rewards/rejected": 0.12309259176254272, + "step": 390 + }, + { + "epoch": 0.16, + "grad_norm": 0.38557207584381104, + "kl": 1.346494197845459, + "learning_rate": 3.990000000000001e-06, + "logits/chosen": 43779216.0, + "logits/rejected": 41962329.6, + "logps/chosen": -148.337939453125, + "logps/rejected": -134.69471435546876, + "loss": 0.49627056121826174, + "rewards/chosen": 0.14792776107788086, + "rewards/margins": 0.029889833927154538, + "rewards/rejected": 0.11803792715072632, + "step": 400 + }, + { + "epoch": 0.16, + "eval_kl": 1.5156359672546387, + "eval_logits/chosen": 39535529.984, + "eval_logits/rejected": 39357890.56, + "eval_logps/chosen": -152.728359375, + "eval_logps/rejected": -147.235078125, + "eval_loss": 0.4976339638233185, + "eval_rewards/chosen": 0.15974119567871095, + "eval_rewards/margins": 0.01897695922851564, + "eval_rewards/rejected": 0.1407642364501953, + "eval_runtime": 217.4122, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 2.3, + "step": 400 + }, + { + "epoch": 0.164, + "grad_norm": 0.3281092047691345, + "kl": 1.7389549016952515, + "learning_rate": 4.09e-06, + "logits/chosen": 44133203.2, + "logits/rejected": 42124723.2, + "logps/chosen": -146.7071044921875, + "logps/rejected": -147.72525634765626, + "loss": 0.4964505672454834, + "rewards/chosen": 0.1881537079811096, + "rewards/margins": 0.028516340255737294, + "rewards/rejected": 0.15963736772537232, + "step": 410 + }, + { + "epoch": 0.168, + "grad_norm": 0.5361565947532654, + "kl": 1.5194333791732788, + "learning_rate": 4.1900000000000005e-06, + "logits/chosen": 35550390.4, + "logits/rejected": 33523052.8, + "logps/chosen": -161.477978515625, + "logps/rejected": -130.558740234375, + "loss": 0.4951611518859863, + "rewards/chosen": 0.16986674070358276, + "rewards/margins": 0.0387694001197815, + "rewards/rejected": 0.13109734058380126, + "step": 420 + }, + { + "epoch": 0.172, + "grad_norm": 0.3820321559906006, + "kl": 2.423292875289917, + "learning_rate": 4.2900000000000004e-06, + "logits/chosen": 38340924.8, + "logits/rejected": 34324185.6, + "logps/chosen": -168.2252685546875, + "logps/rejected": -117.912451171875, + "loss": 0.4928645133972168, + "rewards/chosen": 0.2708438873291016, + "rewards/margins": 0.05714957714080812, + "rewards/rejected": 0.21369431018829346, + "step": 430 + }, + { + "epoch": 0.176, + "grad_norm": 0.5985101461410522, + "kl": 3.3684749603271484, + "learning_rate": 4.39e-06, + "logits/chosen": 43800515.2, + "logits/rejected": 45902227.2, + "logps/chosen": -145.99703369140624, + "logps/rejected": -166.97867431640626, + "loss": 0.5005404472351074, + "rewards/chosen": 0.3345966339111328, + "rewards/margins": -0.004501628875732411, + "rewards/rejected": 0.3390982627868652, + "step": 440 + }, + { + "epoch": 0.18, + "grad_norm": 0.38066738843917847, + "kl": 3.7418315410614014, + "learning_rate": 4.49e-06, + "logits/chosen": 35791923.2, + "logits/rejected": 39352179.2, + "logps/chosen": -95.49920654296875, + "logps/rejected": -151.85885009765624, + "loss": 0.5004417419433593, + "rewards/chosen": 0.3723719596862793, + "rewards/margins": -0.00362257957458495, + "rewards/rejected": 0.37599453926086424, + "step": 450 + }, + { + "epoch": 0.184, + "grad_norm": 0.44641122221946716, + "kl": 4.462111473083496, + "learning_rate": 4.590000000000001e-06, + "logits/chosen": 47153696.0, + "logits/rejected": 45399382.4, + "logps/chosen": -160.3977294921875, + "logps/rejected": -176.209765625, + "loss": 0.5064189434051514, + "rewards/chosen": 0.420426607131958, + "rewards/margins": -0.05156884193420408, + "rewards/rejected": 0.4719954490661621, + "step": 460 + }, + { + "epoch": 0.188, + "grad_norm": 0.8194193840026855, + "kl": 3.437168836593628, + "learning_rate": 4.69e-06, + "logits/chosen": 56483507.2, + "logits/rejected": 53677011.2, + "logps/chosen": -170.1270751953125, + "logps/rejected": -174.05128173828126, + "loss": 0.4986457824707031, + "rewards/chosen": 0.3491526126861572, + "rewards/margins": 0.010871815681457508, + "rewards/rejected": 0.3382807970046997, + "step": 470 + }, + { + "epoch": 0.192, + "grad_norm": 0.4502066373825073, + "kl": 2.8006443977355957, + "learning_rate": 4.79e-06, + "logits/chosen": 44358038.4, + "logits/rejected": 43814537.6, + "logps/chosen": -149.608935546875, + "logps/rejected": -158.89376220703124, + "loss": 0.49710774421691895, + "rewards/chosen": 0.29175291061401365, + "rewards/margins": 0.02337703704833982, + "rewards/rejected": 0.26837587356567383, + "step": 480 + }, + { + "epoch": 0.196, + "grad_norm": 0.3172740638256073, + "kl": 2.634169816970825, + "learning_rate": 4.890000000000001e-06, + "logits/chosen": 30142633.6, + "logits/rejected": 28152640.0, + "logps/chosen": -146.2323974609375, + "logps/rejected": -144.8813720703125, + "loss": 0.49065570831298827, + "rewards/chosen": 0.30094659328460693, + "rewards/margins": 0.07505896091461181, + "rewards/rejected": 0.22588763236999512, + "step": 490 + }, + { + "epoch": 0.2, + "grad_norm": 0.5071095824241638, + "kl": 4.3001179695129395, + "learning_rate": 4.9900000000000005e-06, + "logits/chosen": 45352835.2, + "logits/rejected": 41344652.8, + "logps/chosen": -177.9995361328125, + "logps/rejected": -128.72022705078126, + "loss": 0.48739986419677733, + "rewards/chosen": 0.48075294494628906, + "rewards/margins": 0.10148224830627439, + "rewards/rejected": 0.37927069664001467, + "step": 500 + }, + { + "epoch": 0.204, + "grad_norm": 0.393530011177063, + "kl": 3.5296618938446045, + "learning_rate": 4.9900000000000005e-06, + "logits/chosen": 47881750.4, + "logits/rejected": 49526156.8, + "logps/chosen": -143.6640625, + "logps/rejected": -156.88994140625, + "loss": 0.49992995262145995, + "rewards/chosen": 0.3532871723175049, + "rewards/margins": 0.0006417512893676647, + "rewards/rejected": 0.35264542102813723, + "step": 510 + }, + { + "epoch": 0.208, + "grad_norm": 0.3692869544029236, + "kl": 4.48037576675415, + "learning_rate": 4.978888888888889e-06, + "logits/chosen": 46837849.6, + "logits/rejected": 45857177.6, + "logps/chosen": -154.83260498046874, + "logps/rejected": -160.442333984375, + "loss": 0.495820426940918, + "rewards/chosen": 0.464794921875, + "rewards/margins": 0.03351507186889646, + "rewards/rejected": 0.4312798500061035, + "step": 520 + }, + { + "epoch": 0.212, + "grad_norm": 0.44535931944847107, + "kl": 4.121534824371338, + "learning_rate": 4.967777777777778e-06, + "logits/chosen": 42945225.6, + "logits/rejected": 43357875.2, + "logps/chosen": -138.18310546875, + "logps/rejected": -172.6733154296875, + "loss": 0.5080226421356201, + "rewards/chosen": 0.3798489570617676, + "rewards/margins": -0.06460924148559571, + "rewards/rejected": 0.4444581985473633, + "step": 530 + }, + { + "epoch": 0.216, + "grad_norm": 0.5619053840637207, + "kl": 4.352797031402588, + "learning_rate": 4.956666666666667e-06, + "logits/chosen": 34937552.0, + "logits/rejected": 34883318.4, + "logps/chosen": -147.39837646484375, + "logps/rejected": -154.47596435546876, + "loss": 0.49129457473754884, + "rewards/chosen": 0.4700439929962158, + "rewards/margins": 0.0695285320281982, + "rewards/rejected": 0.4005154609680176, + "step": 540 + }, + { + "epoch": 0.22, + "grad_norm": 0.4256366193294525, + "kl": 3.3400237560272217, + "learning_rate": 4.945555555555557e-06, + "logits/chosen": 41670598.4, + "logits/rejected": 43236768.0, + "logps/chosen": -152.20511474609376, + "logps/rejected": -165.210205078125, + "loss": 0.4960598945617676, + "rewards/chosen": 0.3506686449050903, + "rewards/margins": 0.03333282470703125, + "rewards/rejected": 0.31733582019805906, + "step": 550 + }, + { + "epoch": 0.224, + "grad_norm": 0.42866551876068115, + "kl": 3.0413570404052734, + "learning_rate": 4.934444444444445e-06, + "logits/chosen": 36545302.4, + "logits/rejected": 34813177.6, + "logps/chosen": -161.16314697265625, + "logps/rejected": -148.3569091796875, + "loss": 0.4982303619384766, + "rewards/chosen": 0.2904952049255371, + "rewards/margins": 0.014188337326049794, + "rewards/rejected": 0.2763068675994873, + "step": 560 + }, + { + "epoch": 0.228, + "grad_norm": 0.3665854334831238, + "kl": 2.66752290725708, + "learning_rate": 4.923333333333334e-06, + "logits/chosen": 41975648.0, + "logits/rejected": 40743257.6, + "logps/chosen": -147.2247802734375, + "logps/rejected": -131.81417236328124, + "loss": 0.4888582706451416, + "rewards/chosen": 0.3010892391204834, + "rewards/margins": 0.08992741107940674, + "rewards/rejected": 0.21116182804107667, + "step": 570 + }, + { + "epoch": 0.232, + "grad_norm": 0.42764145135879517, + "kl": 2.8396944999694824, + "learning_rate": 4.912222222222223e-06, + "logits/chosen": 47665238.4, + "logits/rejected": 46761827.2, + "logps/chosen": -147.21837158203124, + "logps/rejected": -156.8475830078125, + "loss": 0.4951943874359131, + "rewards/chosen": 0.2681096315383911, + "rewards/margins": 0.03818519115447999, + "rewards/rejected": 0.22992444038391113, + "step": 580 + }, + { + "epoch": 0.236, + "grad_norm": 0.45218735933303833, + "kl": 2.9479668140411377, + "learning_rate": 4.901111111111112e-06, + "logits/chosen": 30179158.4, + "logits/rejected": 30914195.2, + "logps/chosen": -128.025927734375, + "logps/rejected": -133.37138671875, + "loss": 0.4864190101623535, + "rewards/chosen": 0.3341956615447998, + "rewards/margins": 0.1105940818786621, + "rewards/rejected": 0.2236015796661377, + "step": 590 + }, + { + "epoch": 0.24, + "grad_norm": 0.5611497759819031, + "kl": 2.7078356742858887, + "learning_rate": 4.890000000000001e-06, + "logits/chosen": 29134601.6, + "logits/rejected": 31641536.0, + "logps/chosen": -147.10302734375, + "logps/rejected": -148.70350341796876, + "loss": 0.5095005035400391, + "rewards/chosen": 0.1935347557067871, + "rewards/margins": -0.07780742645263675, + "rewards/rejected": 0.27134218215942385, + "step": 600 + }, + { + "epoch": 0.24, + "eval_kl": 2.6268301010131836, + "eval_logits/chosen": 38577520.64, + "eval_logits/rejected": 38429237.248, + "eval_logps/chosen": -151.8366875, + "eval_logps/rejected": -146.522453125, + "eval_loss": 0.49543091654777527, + "eval_rewards/chosen": 0.24890777587890625, + "eval_rewards/margins": 0.036881546020507805, + "eval_rewards/rejected": 0.21202622985839845, + "eval_runtime": 216.8269, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 2.306, + "step": 600 + }, + { + "epoch": 0.244, + "grad_norm": 0.43945616483688354, + "kl": 3.3616116046905518, + "learning_rate": 4.878888888888889e-06, + "logits/chosen": 45687324.8, + "logits/rejected": 41788883.2, + "logps/chosen": -193.040185546875, + "logps/rejected": -168.3222900390625, + "loss": 0.48148174285888673, + "rewards/chosen": 0.35201478004455566, + "rewards/margins": 0.17267082929611205, + "rewards/rejected": 0.17934395074844361, + "step": 610 + }, + { + "epoch": 0.248, + "grad_norm": 0.47497129440307617, + "kl": 3.055345058441162, + "learning_rate": 4.867777777777778e-06, + "logits/chosen": 27710918.4, + "logits/rejected": 26318662.4, + "logps/chosen": -138.644775390625, + "logps/rejected": -134.76346435546876, + "loss": 0.4866151809692383, + "rewards/chosen": 0.3442774772644043, + "rewards/margins": 0.10848057270050046, + "rewards/rejected": 0.23579690456390381, + "step": 620 + }, + { + "epoch": 0.252, + "grad_norm": 0.4793793559074402, + "kl": 3.6866455078125, + "learning_rate": 4.856666666666667e-06, + "logits/chosen": 39017129.6, + "logits/rejected": 41635366.4, + "logps/chosen": -139.60113525390625, + "logps/rejected": -171.87177734375, + "loss": 0.5033475399017334, + "rewards/chosen": 0.33596067428588866, + "rewards/margins": -0.02700204849243165, + "rewards/rejected": 0.3629627227783203, + "step": 630 + }, + { + "epoch": 0.256, + "grad_norm": 0.5821816921234131, + "kl": 3.2494399547576904, + "learning_rate": 4.845555555555556e-06, + "logits/chosen": 41812588.8, + "logits/rejected": 40030470.4, + "logps/chosen": -188.748583984375, + "logps/rejected": -149.2949951171875, + "loss": 0.4903052806854248, + "rewards/chosen": 0.3278029918670654, + "rewards/margins": 0.08044664859771727, + "rewards/rejected": 0.24735634326934813, + "step": 640 + }, + { + "epoch": 0.26, + "grad_norm": 0.4971711337566376, + "kl": 3.889043092727661, + "learning_rate": 4.834444444444445e-06, + "logits/chosen": 43703507.2, + "logits/rejected": 42196211.2, + "logps/chosen": -125.8577392578125, + "logps/rejected": -132.8236328125, + "loss": 0.4952712535858154, + "rewards/chosen": 0.3801560878753662, + "rewards/margins": 0.03659126758575437, + "rewards/rejected": 0.3435648202896118, + "step": 650 + }, + { + "epoch": 0.264, + "grad_norm": 0.37012672424316406, + "kl": 2.9793968200683594, + "learning_rate": 4.8233333333333335e-06, + "logits/chosen": 57068806.4, + "logits/rejected": 55593145.6, + "logps/chosen": -168.82774658203124, + "logps/rejected": -132.77369384765626, + "loss": 0.4897792339324951, + "rewards/chosen": 0.3094865083694458, + "rewards/margins": 0.08172969818115233, + "rewards/rejected": 0.22775681018829347, + "step": 660 + }, + { + "epoch": 0.268, + "grad_norm": 0.5025138258934021, + "kl": 3.803910493850708, + "learning_rate": 4.812222222222222e-06, + "logits/chosen": 42714249.6, + "logits/rejected": 43013395.2, + "logps/chosen": -180.80955810546874, + "logps/rejected": -182.71279296875, + "loss": 0.48125367164611815, + "rewards/chosen": 0.4121575832366943, + "rewards/margins": 0.15600955486297607, + "rewards/rejected": 0.25614802837371825, + "step": 670 + }, + { + "epoch": 0.272, + "grad_norm": 0.47364944219589233, + "kl": 2.744506597518921, + "learning_rate": 4.801111111111111e-06, + "logits/chosen": 41662313.6, + "logits/rejected": 40533548.8, + "logps/chosen": -143.998193359375, + "logps/rejected": -125.735693359375, + "loss": 0.493405818939209, + "rewards/chosen": 0.2640446662902832, + "rewards/margins": 0.05262007713317873, + "rewards/rejected": 0.2114245891571045, + "step": 680 + }, + { + "epoch": 0.276, + "grad_norm": 0.378603994846344, + "kl": 4.192839622497559, + "learning_rate": 4.79e-06, + "logits/chosen": 42741641.6, + "logits/rejected": 40729804.8, + "logps/chosen": -156.440087890625, + "logps/rejected": -175.48450927734376, + "loss": 0.4940618991851807, + "rewards/chosen": 0.42174320220947265, + "rewards/margins": 0.04788670539855955, + "rewards/rejected": 0.3738564968109131, + "step": 690 + }, + { + "epoch": 0.28, + "grad_norm": 0.4181530773639679, + "kl": 3.237612247467041, + "learning_rate": 4.778888888888889e-06, + "logits/chosen": 38295142.4, + "logits/rejected": 35257382.4, + "logps/chosen": -155.112744140625, + "logps/rejected": -143.0890869140625, + "loss": 0.49517078399658204, + "rewards/chosen": 0.2904268026351929, + "rewards/margins": 0.041890859603881836, + "rewards/rejected": 0.24853594303131105, + "step": 700 + }, + { + "epoch": 0.284, + "grad_norm": 0.41070079803466797, + "kl": 4.423883438110352, + "learning_rate": 4.767777777777778e-06, + "logits/chosen": 40968300.8, + "logits/rejected": 39222742.4, + "logps/chosen": -172.097021484375, + "logps/rejected": -133.7148193359375, + "loss": 0.4814108371734619, + "rewards/chosen": 0.5093639373779297, + "rewards/margins": 0.1534171581268311, + "rewards/rejected": 0.35594677925109863, + "step": 710 + }, + { + "epoch": 0.288, + "grad_norm": 0.374776691198349, + "kl": 3.082357406616211, + "learning_rate": 4.756666666666667e-06, + "logits/chosen": 34210566.4, + "logits/rejected": 35579948.8, + "logps/chosen": -120.33433837890625, + "logps/rejected": -122.443017578125, + "loss": 0.5063377857208252, + "rewards/chosen": 0.1743820548057556, + "rewards/margins": -0.07105478048324584, + "rewards/rejected": 0.24543683528900145, + "step": 720 + }, + { + "epoch": 0.292, + "grad_norm": 0.3790406882762909, + "kl": 3.781698226928711, + "learning_rate": 4.745555555555556e-06, + "logits/chosen": 37423852.8, + "logits/rejected": 34581126.4, + "logps/chosen": -152.25120849609374, + "logps/rejected": -154.15382080078126, + "loss": 0.493280029296875, + "rewards/chosen": 0.39404921531677245, + "rewards/margins": 0.05465142726898192, + "rewards/rejected": 0.3393977880477905, + "step": 730 + }, + { + "epoch": 0.296, + "grad_norm": 0.47025066614151, + "kl": 4.019077301025391, + "learning_rate": 4.734444444444445e-06, + "logits/chosen": 34749833.6, + "logits/rejected": 33696905.6, + "logps/chosen": -156.05477294921874, + "logps/rejected": -182.29608154296875, + "loss": 0.49593114852905273, + "rewards/chosen": 0.34479031562805174, + "rewards/margins": 0.04153461456298824, + "rewards/rejected": 0.3032557010650635, + "step": 740 + }, + { + "epoch": 0.3, + "grad_norm": 0.2891245484352112, + "kl": 2.7223880290985107, + "learning_rate": 4.7233333333333336e-06, + "logits/chosen": 39233993.6, + "logits/rejected": 38955433.6, + "logps/chosen": -159.49801025390624, + "logps/rejected": -183.92698974609374, + "loss": 0.5023125648498535, + "rewards/chosen": 0.09982055425643921, + "rewards/margins": 0.004025018215179449, + "rewards/rejected": 0.09579553604125976, + "step": 750 + }, + { + "epoch": 0.304, + "grad_norm": 0.4833555817604065, + "kl": 1.6150490045547485, + "learning_rate": 4.712222222222222e-06, + "logits/chosen": 36941795.2, + "logits/rejected": 37837513.6, + "logps/chosen": -115.98502197265626, + "logps/rejected": -125.23421630859374, + "loss": 0.487321662902832, + "rewards/chosen": 0.10154855251312256, + "rewards/margins": 0.10329384654760361, + "rewards/rejected": -0.0017452940344810485, + "step": 760 + }, + { + "epoch": 0.308, + "grad_norm": 0.48121944069862366, + "kl": 1.2522486448287964, + "learning_rate": 4.701111111111111e-06, + "logits/chosen": 44757888.0, + "logits/rejected": 45581593.6, + "logps/chosen": -143.2324951171875, + "logps/rejected": -153.2897216796875, + "loss": 0.4856001377105713, + "rewards/chosen": 0.008315862715244293, + "rewards/margins": 0.1255118027329445, + "rewards/rejected": -0.1171959400177002, + "step": 770 + }, + { + "epoch": 0.312, + "grad_norm": 0.4913221001625061, + "kl": 0.6294690370559692, + "learning_rate": 4.69e-06, + "logits/chosen": 41557488.0, + "logits/rejected": 42490796.8, + "logps/chosen": -163.6885498046875, + "logps/rejected": -145.68963623046875, + "loss": 0.48305044174194334, + "rewards/chosen": -0.08929510116577148, + "rewards/margins": 0.14230823516845703, + "rewards/rejected": -0.23160333633422853, + "step": 780 + }, + { + "epoch": 0.316, + "grad_norm": 0.4478524327278137, + "kl": 1.0363415479660034, + "learning_rate": 4.67888888888889e-06, + "logits/chosen": 31436777.6, + "logits/rejected": 27623001.6, + "logps/chosen": -147.31231689453125, + "logps/rejected": -139.38282470703126, + "loss": 0.478118371963501, + "rewards/chosen": -0.18460922241210936, + "rewards/margins": 0.20353126525878906, + "rewards/rejected": -0.3881404876708984, + "step": 790 + }, + { + "epoch": 0.32, + "grad_norm": 0.5910397171974182, + "kl": 0.9226576089859009, + "learning_rate": 4.6677777777777785e-06, + "logits/chosen": 28333865.6, + "logits/rejected": 28280502.4, + "logps/chosen": -150.2372802734375, + "logps/rejected": -135.2026123046875, + "loss": 0.5036224842071533, + "rewards/chosen": -0.28971683979034424, + "rewards/margins": -0.058632898330688465, + "rewards/rejected": -0.23108394145965577, + "step": 800 + }, + { + "epoch": 0.32, + "eval_kl": 1.187766432762146, + "eval_logits/chosen": 34635034.624, + "eval_logits/rejected": 34656247.808, + "eval_logps/chosen": -156.55328125, + "eval_logps/rejected": -151.3069375, + "eval_loss": 0.4932977855205536, + "eval_rewards/chosen": -0.2227491455078125, + "eval_rewards/margins": 0.04367276000976561, + "eval_rewards/rejected": -0.2664219055175781, + "eval_runtime": 216.6151, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 800 + }, + { + "epoch": 0.324, + "grad_norm": 0.5326458811759949, + "kl": 1.5910537242889404, + "learning_rate": 4.656666666666667e-06, + "logits/chosen": 38717907.2, + "logits/rejected": 39118412.8, + "logps/chosen": -159.7564453125, + "logps/rejected": -157.7111572265625, + "loss": 0.49022369384765624, + "rewards/chosen": 0.008166373521089555, + "rewards/margins": 0.09110548570752143, + "rewards/rejected": -0.08293911218643188, + "step": 810 + }, + { + "epoch": 0.328, + "grad_norm": 0.4700395464897156, + "kl": 1.0638387203216553, + "learning_rate": 4.645555555555556e-06, + "logits/chosen": 24480648.0, + "logits/rejected": 24138777.6, + "logps/chosen": -157.56463623046875, + "logps/rejected": -129.4783935546875, + "loss": 0.5027867794036865, + "rewards/chosen": -0.3186595916748047, + "rewards/margins": -0.04232857227325437, + "rewards/rejected": -0.2763310194015503, + "step": 820 + }, + { + "epoch": 0.332, + "grad_norm": 0.5291322469711304, + "kl": 1.9198650121688843, + "learning_rate": 4.634444444444445e-06, + "logits/chosen": 31482019.2, + "logits/rejected": 30204611.2, + "logps/chosen": -160.241064453125, + "logps/rejected": -127.0677490234375, + "loss": 0.49699864387512205, + "rewards/chosen": -0.03406925797462464, + "rewards/margins": 0.020440274477005, + "rewards/rejected": -0.054509532451629636, + "step": 830 + }, + { + "epoch": 0.336, + "grad_norm": 0.5016227960586548, + "kl": 1.5567735433578491, + "learning_rate": 4.623333333333334e-06, + "logits/chosen": 39855260.8, + "logits/rejected": 39533484.8, + "logps/chosen": -151.4185302734375, + "logps/rejected": -151.45257568359375, + "loss": 0.4913910388946533, + "rewards/chosen": -0.0018289029598236085, + "rewards/margins": 0.05514721870422363, + "rewards/rejected": -0.05697612166404724, + "step": 840 + }, + { + "epoch": 0.34, + "grad_norm": 0.5913689732551575, + "kl": 2.8981661796569824, + "learning_rate": 4.6122222222222225e-06, + "logits/chosen": 39872361.6, + "logits/rejected": 38837766.4, + "logps/chosen": -168.816357421875, + "logps/rejected": -189.29534912109375, + "loss": 0.5085726261138916, + "rewards/chosen": -0.016373127698898315, + "rewards/margins": -0.10572689771652222, + "rewards/rejected": 0.0893537700176239, + "step": 850 + }, + { + "epoch": 0.344, + "grad_norm": 0.4795176088809967, + "kl": 1.9737087488174438, + "learning_rate": 4.601111111111112e-06, + "logits/chosen": 31763212.8, + "logits/rejected": 30045996.8, + "logps/chosen": -165.5231689453125, + "logps/rejected": -144.56939697265625, + "loss": 0.4861030101776123, + "rewards/chosen": -0.004905380308628082, + "rewards/margins": 0.10866030305624008, + "rewards/rejected": -0.11356568336486816, + "step": 860 + }, + { + "epoch": 0.348, + "grad_norm": 0.5342633128166199, + "kl": 0.8907498121261597, + "learning_rate": 4.590000000000001e-06, + "logits/chosen": 36357731.2, + "logits/rejected": 34329561.6, + "logps/chosen": -139.342138671875, + "logps/rejected": -147.3021728515625, + "loss": 0.4868171691894531, + "rewards/chosen": -0.3624546766281128, + "rewards/margins": 0.06923034191131588, + "rewards/rejected": -0.4316850185394287, + "step": 870 + }, + { + "epoch": 0.352, + "grad_norm": 0.4725877046585083, + "kl": 0.8254868388175964, + "learning_rate": 4.57888888888889e-06, + "logits/chosen": 31605561.6, + "logits/rejected": 30546553.6, + "logps/chosen": -167.25799560546875, + "logps/rejected": -137.0789794921875, + "loss": 0.4910862922668457, + "rewards/chosen": -0.2911639451980591, + "rewards/margins": 0.12216508388519287, + "rewards/rejected": -0.413329029083252, + "step": 880 + }, + { + "epoch": 0.356, + "grad_norm": 0.40740740299224854, + "kl": 0.5278605222702026, + "learning_rate": 4.5677777777777786e-06, + "logits/chosen": 42018956.8, + "logits/rejected": 41023542.4, + "logps/chosen": -135.9039306640625, + "logps/rejected": -137.1427001953125, + "loss": 0.48065881729125975, + "rewards/chosen": -0.33759872913360595, + "rewards/margins": 0.1944932222366333, + "rewards/rejected": -0.5320919513702392, + "step": 890 + }, + { + "epoch": 0.36, + "grad_norm": 0.506528377532959, + "kl": 0.6414504647254944, + "learning_rate": 4.556666666666667e-06, + "logits/chosen": 37310515.2, + "logits/rejected": 38113939.2, + "logps/chosen": -177.546728515625, + "logps/rejected": -165.55384521484376, + "loss": 0.4943400382995605, + "rewards/chosen": -0.6260869026184082, + "rewards/margins": -0.024873304367065363, + "rewards/rejected": -0.6012135982513428, + "step": 900 + }, + { + "epoch": 0.364, + "grad_norm": 0.4898208975791931, + "kl": 0.6840685606002808, + "learning_rate": 4.545555555555556e-06, + "logits/chosen": 31999705.6, + "logits/rejected": 32116691.2, + "logps/chosen": -168.6631591796875, + "logps/rejected": -174.8232421875, + "loss": 0.49452638626098633, + "rewards/chosen": -0.4872349739074707, + "rewards/margins": 0.17413578033447263, + "rewards/rejected": -0.6613707542419434, + "step": 910 + }, + { + "epoch": 0.368, + "grad_norm": 0.42078927159309387, + "kl": 0.8319946527481079, + "learning_rate": 4.534444444444445e-06, + "logits/chosen": 33578412.8, + "logits/rejected": 30967862.4, + "logps/chosen": -136.42371826171876, + "logps/rejected": -151.80224609375, + "loss": 0.4958657741546631, + "rewards/chosen": -0.4272448539733887, + "rewards/margins": 0.1275300025939941, + "rewards/rejected": -0.5547748565673828, + "step": 920 + }, + { + "epoch": 0.372, + "grad_norm": 0.4784943163394928, + "kl": 1.0423786640167236, + "learning_rate": 4.523333333333334e-06, + "logits/chosen": 32027404.8, + "logits/rejected": 32098508.8, + "logps/chosen": -155.27005615234376, + "logps/rejected": -149.925537109375, + "loss": 0.48459978103637696, + "rewards/chosen": -0.22732582092285156, + "rewards/margins": 0.15194621086120605, + "rewards/rejected": -0.3792720317840576, + "step": 930 + }, + { + "epoch": 0.376, + "grad_norm": 0.3698587119579315, + "kl": 1.5603997707366943, + "learning_rate": 4.512222222222223e-06, + "logits/chosen": 27671084.8, + "logits/rejected": 25976814.4, + "logps/chosen": -136.42069091796876, + "logps/rejected": -152.234033203125, + "loss": 0.4825894355773926, + "rewards/chosen": -0.22695178985595704, + "rewards/margins": 0.20094985961914064, + "rewards/rejected": -0.4279016494750977, + "step": 940 + }, + { + "epoch": 0.38, + "grad_norm": 0.40601083636283875, + "kl": 2.8003079891204834, + "learning_rate": 4.501111111111111e-06, + "logits/chosen": 40676198.4, + "logits/rejected": 43776691.2, + "logps/chosen": -183.81981201171874, + "logps/rejected": -158.6890380859375, + "loss": 0.4809588432312012, + "rewards/chosen": -0.0009134054183959961, + "rewards/margins": 0.13371984958648683, + "rewards/rejected": -0.13463325500488282, + "step": 950 + }, + { + "epoch": 0.384, + "grad_norm": 0.5407139658927917, + "kl": 2.086998224258423, + "learning_rate": 4.49e-06, + "logits/chosen": 37516128.0, + "logits/rejected": 39039443.2, + "logps/chosen": -137.39747314453126, + "logps/rejected": -150.29815673828125, + "loss": 0.5065193176269531, + "rewards/chosen": -0.06897132396697998, + "rewards/margins": -0.031788992881774905, + "rewards/rejected": -0.03718233108520508, + "step": 960 + }, + { + "epoch": 0.388, + "grad_norm": 0.4142342209815979, + "kl": 1.7635319232940674, + "learning_rate": 4.478888888888889e-06, + "logits/chosen": 32190848.0, + "logits/rejected": 30587993.6, + "logps/chosen": -135.15673828125, + "logps/rejected": -118.86663818359375, + "loss": 0.48531031608581543, + "rewards/chosen": -0.007679381966590881, + "rewards/margins": 0.15075800120830538, + "rewards/rejected": -0.15843738317489625, + "step": 970 + }, + { + "epoch": 0.392, + "grad_norm": 0.6221582889556885, + "kl": 3.294914722442627, + "learning_rate": 4.467777777777778e-06, + "logits/chosen": 39407718.4, + "logits/rejected": 39707820.8, + "logps/chosen": -138.0949462890625, + "logps/rejected": -157.43453369140624, + "loss": 0.4826664447784424, + "rewards/chosen": 0.24659197330474852, + "rewards/margins": 0.17039816975593566, + "rewards/rejected": 0.07619380354881286, + "step": 980 + }, + { + "epoch": 0.396, + "grad_norm": 0.5434563755989075, + "kl": 1.082279920578003, + "learning_rate": 4.456666666666667e-06, + "logits/chosen": 31821408.0, + "logits/rejected": 31118764.8, + "logps/chosen": -127.49896240234375, + "logps/rejected": -136.4136962890625, + "loss": 0.48195528984069824, + "rewards/chosen": -0.16623904705047607, + "rewards/margins": 0.15431931018829345, + "rewards/rejected": -0.3205583572387695, + "step": 990 + }, + { + "epoch": 0.4, + "grad_norm": 0.37851211428642273, + "kl": 1.7661035060882568, + "learning_rate": 4.4455555555555554e-06, + "logits/chosen": 31584358.4, + "logits/rejected": 32753641.6, + "logps/chosen": -159.625048828125, + "logps/rejected": -122.3124267578125, + "loss": 0.49321880340576174, + "rewards/chosen": -0.08240060806274414, + "rewards/margins": 0.045932340621948245, + "rewards/rejected": -0.1283329486846924, + "step": 1000 + }, + { + "epoch": 0.4, + "eval_kl": 1.6750891208648682, + "eval_logits/chosen": 33939980.288, + "eval_logits/rejected": 34026332.16, + "eval_logps/chosen": -156.401671875, + "eval_logps/rejected": -151.40996875, + "eval_loss": 0.4904634356498718, + "eval_rewards/chosen": -0.20758909606933593, + "eval_rewards/margins": 0.06913508605957033, + "eval_rewards/rejected": -0.27672418212890626, + "eval_runtime": 216.882, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 2.305, + "step": 1000 + }, + { + "epoch": 0.404, + "grad_norm": 0.6070407032966614, + "kl": 2.5559988021850586, + "learning_rate": 4.434444444444444e-06, + "logits/chosen": 27506348.8, + "logits/rejected": 30075392.0, + "logps/chosen": -138.08692626953126, + "logps/rejected": -187.8861083984375, + "loss": 0.4917243480682373, + "rewards/chosen": -0.13809033632278442, + "rewards/margins": 0.05099650621414184, + "rewards/rejected": -0.18908684253692626, + "step": 1010 + }, + { + "epoch": 0.408, + "grad_norm": 0.5809450149536133, + "kl": 2.151444911956787, + "learning_rate": 4.423333333333334e-06, + "logits/chosen": 28877452.8, + "logits/rejected": 28118339.2, + "logps/chosen": -154.03690185546876, + "logps/rejected": -139.09736328125, + "loss": 0.4888266086578369, + "rewards/chosen": -0.16468460559844972, + "rewards/margins": 0.13210070133209229, + "rewards/rejected": -0.296785306930542, + "step": 1020 + }, + { + "epoch": 0.412, + "grad_norm": 0.4774882197380066, + "kl": 2.590153217315674, + "learning_rate": 4.412222222222223e-06, + "logits/chosen": 33996892.8, + "logits/rejected": 33160086.4, + "logps/chosen": -154.9474609375, + "logps/rejected": -153.31983642578126, + "loss": 0.47945427894592285, + "rewards/chosen": -0.007390469312667847, + "rewards/margins": 0.16412567496299743, + "rewards/rejected": -0.17151614427566528, + "step": 1030 + }, + { + "epoch": 0.416, + "grad_norm": 0.5751529335975647, + "kl": 2.692246913909912, + "learning_rate": 4.4011111111111115e-06, + "logits/chosen": 27060502.4, + "logits/rejected": 26739710.4, + "logps/chosen": -191.5754150390625, + "logps/rejected": -115.40594482421875, + "loss": 0.504734468460083, + "rewards/chosen": -0.14675636291503907, + "rewards/margins": -0.07811862826347352, + "rewards/rejected": -0.06863773465156556, + "step": 1040 + }, + { + "epoch": 0.42, + "grad_norm": 0.5552707314491272, + "kl": 2.183612108230591, + "learning_rate": 4.39e-06, + "logits/chosen": 30119641.6, + "logits/rejected": 27840016.0, + "logps/chosen": -129.14801025390625, + "logps/rejected": -159.921337890625, + "loss": 0.4975595951080322, + "rewards/chosen": -0.11868793964385986, + "rewards/margins": 0.03744263648986816, + "rewards/rejected": -0.15613057613372802, + "step": 1050 + }, + { + "epoch": 0.424, + "grad_norm": 0.4398513436317444, + "kl": 2.019963026046753, + "learning_rate": 4.378888888888889e-06, + "logits/chosen": 39605126.4, + "logits/rejected": 37338668.8, + "logps/chosen": -173.10638427734375, + "logps/rejected": -188.400146484375, + "loss": 0.5177321434020996, + "rewards/chosen": -0.252044153213501, + "rewards/margins": -0.1198780655860901, + "rewards/rejected": -0.1321660876274109, + "step": 1060 + }, + { + "epoch": 0.428, + "grad_norm": 0.6157165765762329, + "kl": 1.4567959308624268, + "learning_rate": 4.367777777777778e-06, + "logits/chosen": 31087238.4, + "logits/rejected": 32085881.6, + "logps/chosen": -145.3509521484375, + "logps/rejected": -170.3815185546875, + "loss": 0.4887071132659912, + "rewards/chosen": -0.3054164171218872, + "rewards/margins": 0.236836838722229, + "rewards/rejected": -0.5422532558441162, + "step": 1070 + }, + { + "epoch": 0.432, + "grad_norm": 0.3502284288406372, + "kl": 0.8787339925765991, + "learning_rate": 4.356666666666667e-06, + "logits/chosen": 34486451.2, + "logits/rejected": 36169574.4, + "logps/chosen": -158.5971923828125, + "logps/rejected": -139.94036865234375, + "loss": 0.5054500579833985, + "rewards/chosen": -0.45778846740722656, + "rewards/margins": -0.09344666004180907, + "rewards/rejected": -0.3643418073654175, + "step": 1080 + }, + { + "epoch": 0.436, + "grad_norm": 0.624359667301178, + "kl": 0.525427520275116, + "learning_rate": 4.3455555555555555e-06, + "logits/chosen": 26498083.2, + "logits/rejected": 25839392.0, + "logps/chosen": -149.42689208984376, + "logps/rejected": -118.26563720703125, + "loss": 0.5076635360717774, + "rewards/chosen": -0.557512617111206, + "rewards/margins": -0.06046972274780271, + "rewards/rejected": -0.4970428943634033, + "step": 1090 + }, + { + "epoch": 0.44, + "grad_norm": 0.549114465713501, + "kl": 0.8173803091049194, + "learning_rate": 4.334444444444445e-06, + "logits/chosen": 34397792.0, + "logits/rejected": 33410729.6, + "logps/chosen": -140.214111328125, + "logps/rejected": -176.107958984375, + "loss": 0.48507490158081057, + "rewards/chosen": -0.4220071792602539, + "rewards/margins": 0.19281878471374514, + "rewards/rejected": -0.614825963973999, + "step": 1100 + }, + { + "epoch": 0.444, + "grad_norm": 0.5036312937736511, + "kl": 0.7975673675537109, + "learning_rate": 4.323333333333334e-06, + "logits/chosen": 36466489.6, + "logits/rejected": 38251465.6, + "logps/chosen": -120.687255859375, + "logps/rejected": -187.36099853515626, + "loss": 0.5016714572906494, + "rewards/chosen": -0.4559361457824707, + "rewards/margins": -0.0054581642150878795, + "rewards/rejected": -0.45047798156738283, + "step": 1110 + }, + { + "epoch": 0.448, + "grad_norm": 0.5358121395111084, + "kl": 1.3897031545639038, + "learning_rate": 4.312222222222223e-06, + "logits/chosen": 46269334.4, + "logits/rejected": 45639856.0, + "logps/chosen": -151.86192626953124, + "logps/rejected": -165.55286865234376, + "loss": 0.4728604793548584, + "rewards/chosen": -0.23922853469848632, + "rewards/margins": 0.28858757019042974, + "rewards/rejected": -0.5278161048889161, + "step": 1120 + }, + { + "epoch": 0.452, + "grad_norm": 0.5269862413406372, + "kl": 1.1441795825958252, + "learning_rate": 4.301111111111112e-06, + "logits/chosen": 35708649.6, + "logits/rejected": 34836294.4, + "logps/chosen": -183.39061279296874, + "logps/rejected": -150.405078125, + "loss": 0.4849276065826416, + "rewards/chosen": -0.40422697067260743, + "rewards/margins": 0.15558710098266598, + "rewards/rejected": -0.5598140716552734, + "step": 1130 + }, + { + "epoch": 0.456, + "grad_norm": 0.3800269067287445, + "kl": 0.8876265287399292, + "learning_rate": 4.2900000000000004e-06, + "logits/chosen": 22079545.6, + "logits/rejected": 22083444.8, + "logps/chosen": -109.09158935546876, + "logps/rejected": -147.06978759765624, + "loss": 0.4905365467071533, + "rewards/chosen": -0.43719801902770994, + "rewards/margins": 0.08943343162536621, + "rewards/rejected": -0.5266314506530761, + "step": 1140 + }, + { + "epoch": 0.46, + "grad_norm": 0.4693025052547455, + "kl": 0.49091872572898865, + "learning_rate": 4.278888888888889e-06, + "logits/chosen": 42701616.0, + "logits/rejected": 40578803.2, + "logps/chosen": -223.8361328125, + "logps/rejected": -172.5617431640625, + "loss": 0.4969001293182373, + "rewards/chosen": -0.9799749374389648, + "rewards/margins": 0.08639993667602552, + "rewards/rejected": -1.0663748741149903, + "step": 1150 + }, + { + "epoch": 0.464, + "grad_norm": 0.4174056053161621, + "kl": 0.7064284682273865, + "learning_rate": 4.267777777777778e-06, + "logits/chosen": 26652801.6, + "logits/rejected": 24801236.8, + "logps/chosen": -138.50279541015624, + "logps/rejected": -171.82225341796874, + "loss": 0.4671950817108154, + "rewards/chosen": -0.5919324398040772, + "rewards/margins": 0.4853674411773682, + "rewards/rejected": -1.0772998809814454, + "step": 1160 + }, + { + "epoch": 0.468, + "grad_norm": 0.629512369632721, + "kl": 1.179760217666626, + "learning_rate": 4.256666666666668e-06, + "logits/chosen": 28567804.8, + "logits/rejected": 29090739.2, + "logps/chosen": -140.1174560546875, + "logps/rejected": -165.07166748046876, + "loss": 0.49239435195922854, + "rewards/chosen": -0.8533164024353027, + "rewards/margins": -0.06229524612426751, + "rewards/rejected": -0.7910211563110352, + "step": 1170 + }, + { + "epoch": 0.472, + "grad_norm": 0.4868221580982208, + "kl": 0.9739119410514832, + "learning_rate": 4.2455555555555565e-06, + "logits/chosen": 30410720.0, + "logits/rejected": 28420300.8, + "logps/chosen": -140.90198974609376, + "logps/rejected": -170.1091552734375, + "loss": 0.48679437637329104, + "rewards/chosen": -0.551117992401123, + "rewards/margins": 0.2831212997436524, + "rewards/rejected": -0.8342392921447754, + "step": 1180 + }, + { + "epoch": 0.476, + "grad_norm": 0.47686856985092163, + "kl": 0.48265019059181213, + "learning_rate": 4.234444444444445e-06, + "logits/chosen": 29930240.0, + "logits/rejected": 25699152.0, + "logps/chosen": -184.52239990234375, + "logps/rejected": -187.303173828125, + "loss": 0.4651634693145752, + "rewards/chosen": -0.6415619850158691, + "rewards/margins": 0.5583641052246093, + "rewards/rejected": -1.1999260902404785, + "step": 1190 + }, + { + "epoch": 0.48, + "grad_norm": 0.49188584089279175, + "kl": 0.6842840909957886, + "learning_rate": 4.223333333333334e-06, + "logits/chosen": 30387222.4, + "logits/rejected": 28950054.4, + "logps/chosen": -146.3315673828125, + "logps/rejected": -159.7215576171875, + "loss": 0.4879584789276123, + "rewards/chosen": -0.608671236038208, + "rewards/margins": 0.14971170425415037, + "rewards/rejected": -0.7583829402923584, + "step": 1200 + }, + { + "epoch": 0.48, + "eval_kl": 0.6333972215652466, + "eval_logits/chosen": 31495737.344, + "eval_logits/rejected": 31723335.68, + "eval_logps/chosen": -160.925171875, + "eval_logps/rejected": -156.03046875, + "eval_loss": 0.48919567465782166, + "eval_rewards/chosen": -0.659940185546875, + "eval_rewards/margins": 0.07883386230468759, + "eval_rewards/rejected": -0.7387740478515625, + "eval_runtime": 217.7778, + "eval_samples_per_second": 4.592, + "eval_steps_per_second": 2.296, + "step": 1200 + }, + { + "epoch": 0.484, + "grad_norm": 0.5108934640884399, + "kl": 0.7764253616333008, + "learning_rate": 4.212222222222223e-06, + "logits/chosen": 22495624.0, + "logits/rejected": 24596646.4, + "logps/chosen": -138.95447998046876, + "logps/rejected": -143.8316162109375, + "loss": 0.5000998020172119, + "rewards/chosen": -0.806338119506836, + "rewards/margins": 0.04378585815429681, + "rewards/rejected": -0.8501239776611328, + "step": 1210 + }, + { + "epoch": 0.488, + "grad_norm": 0.5415228009223938, + "kl": 1.1236222982406616, + "learning_rate": 4.201111111111112e-06, + "logits/chosen": 31318569.6, + "logits/rejected": 28306940.8, + "logps/chosen": -183.03011474609374, + "logps/rejected": -194.442236328125, + "loss": 0.4961515426635742, + "rewards/chosen": -0.552086067199707, + "rewards/margins": 0.12537789344787598, + "rewards/rejected": -0.677463960647583, + "step": 1220 + }, + { + "epoch": 0.492, + "grad_norm": 0.4574231505393982, + "kl": 1.6093800067901611, + "learning_rate": 4.1900000000000005e-06, + "logits/chosen": 28117484.8, + "logits/rejected": 27773507.2, + "logps/chosen": -148.87581787109374, + "logps/rejected": -157.650537109375, + "loss": 0.48798060417175293, + "rewards/chosen": -0.22223844528198242, + "rewards/margins": 0.0932478666305542, + "rewards/rejected": -0.3154863119125366, + "step": 1230 + }, + { + "epoch": 0.496, + "grad_norm": 0.455790638923645, + "kl": 1.5425220727920532, + "learning_rate": 4.178888888888889e-06, + "logits/chosen": 30485878.4, + "logits/rejected": 30765398.4, + "logps/chosen": -123.1314208984375, + "logps/rejected": -120.400634765625, + "loss": 0.4938685894012451, + "rewards/chosen": -0.15957858562469482, + "rewards/margins": 0.033807253837585455, + "rewards/rejected": -0.19338583946228027, + "step": 1240 + }, + { + "epoch": 0.5, + "grad_norm": 0.5282999873161316, + "kl": 1.4079266786575317, + "learning_rate": 4.167777777777778e-06, + "logits/chosen": 22162673.6, + "logits/rejected": 22915948.8, + "logps/chosen": -113.82423095703125, + "logps/rejected": -131.29176025390626, + "loss": 0.49029102325439455, + "rewards/chosen": -0.14798271656036377, + "rewards/margins": 0.09533922672271727, + "rewards/rejected": -0.24332194328308104, + "step": 1250 + }, + { + "epoch": 0.504, + "grad_norm": 0.5007496476173401, + "kl": 1.7635902166366577, + "learning_rate": 4.156666666666667e-06, + "logits/chosen": 27436982.4, + "logits/rejected": 27643766.4, + "logps/chosen": -147.7771484375, + "logps/rejected": -167.73524169921876, + "loss": 0.46639671325683596, + "rewards/chosen": -0.29362332820892334, + "rewards/margins": 0.3400294542312622, + "rewards/rejected": -0.6336527824401855, + "step": 1260 + }, + { + "epoch": 0.508, + "grad_norm": 0.4727869927883148, + "kl": 1.2690056562423706, + "learning_rate": 4.145555555555556e-06, + "logits/chosen": 29958118.4, + "logits/rejected": 26773496.0, + "logps/chosen": -172.3375732421875, + "logps/rejected": -155.7744140625, + "loss": 0.4935513973236084, + "rewards/chosen": -0.3117243528366089, + "rewards/margins": 0.08564956188201905, + "rewards/rejected": -0.39737391471862793, + "step": 1270 + }, + { + "epoch": 0.512, + "grad_norm": 0.4609099328517914, + "kl": 1.781589150428772, + "learning_rate": 4.1344444444444446e-06, + "logits/chosen": 45966684.8, + "logits/rejected": 46560012.8, + "logps/chosen": -154.45379638671875, + "logps/rejected": -171.34287109375, + "loss": 0.49072775840759275, + "rewards/chosen": -0.026087772846221925, + "rewards/margins": 0.12879917621612547, + "rewards/rejected": -0.1548869490623474, + "step": 1280 + }, + { + "epoch": 0.516, + "grad_norm": 0.5082091093063354, + "kl": 1.657065749168396, + "learning_rate": 4.123333333333333e-06, + "logits/chosen": 27531948.8, + "logits/rejected": 28266195.2, + "logps/chosen": -133.0970947265625, + "logps/rejected": -141.93575439453124, + "loss": 0.4872725486755371, + "rewards/chosen": -0.0539365291595459, + "rewards/margins": 0.13880285024642947, + "rewards/rejected": -0.19273937940597535, + "step": 1290 + }, + { + "epoch": 0.52, + "grad_norm": 0.5041593909263611, + "kl": 2.0696568489074707, + "learning_rate": 4.112222222222222e-06, + "logits/chosen": 37664678.4, + "logits/rejected": 34784227.2, + "logps/chosen": -183.206103515625, + "logps/rejected": -145.39962158203124, + "loss": 0.48463997840881345, + "rewards/chosen": -0.03633859157562256, + "rewards/margins": 0.14437620639801027, + "rewards/rejected": -0.18071479797363282, + "step": 1300 + }, + { + "epoch": 0.524, + "grad_norm": 0.6096036434173584, + "kl": 2.0778517723083496, + "learning_rate": 4.101111111111111e-06, + "logits/chosen": 30281945.6, + "logits/rejected": 30007484.8, + "logps/chosen": -140.6286865234375, + "logps/rejected": -148.30921630859376, + "loss": 0.49010205268859863, + "rewards/chosen": -0.1496596097946167, + "rewards/margins": 0.10425436496734616, + "rewards/rejected": -0.25391397476196287, + "step": 1310 + }, + { + "epoch": 0.528, + "grad_norm": 0.3967672884464264, + "kl": 3.4023184776306152, + "learning_rate": 4.09e-06, + "logits/chosen": 38450940.8, + "logits/rejected": 36835715.2, + "logps/chosen": -149.884423828125, + "logps/rejected": -155.450439453125, + "loss": 0.455477237701416, + "rewards/chosen": 0.22590782642364501, + "rewards/margins": 0.39357452392578124, + "rewards/rejected": -0.16766669750213622, + "step": 1320 + }, + { + "epoch": 0.532, + "grad_norm": 0.39660006761550903, + "kl": 1.7329654693603516, + "learning_rate": 4.0788888888888895e-06, + "logits/chosen": 29744569.6, + "logits/rejected": 30137328.0, + "logps/chosen": -155.14593505859375, + "logps/rejected": -160.4675048828125, + "loss": 0.4830836296081543, + "rewards/chosen": -0.31779026985168457, + "rewards/margins": 0.1599587440490723, + "rewards/rejected": -0.47774901390075686, + "step": 1330 + }, + { + "epoch": 0.536, + "grad_norm": 0.6326448917388916, + "kl": 2.0254123210906982, + "learning_rate": 4.067777777777778e-06, + "logits/chosen": 26790800.0, + "logits/rejected": 28456883.2, + "logps/chosen": -151.97984619140624, + "logps/rejected": -130.97991943359375, + "loss": 0.4777104377746582, + "rewards/chosen": -0.04423903226852417, + "rewards/margins": 0.20697282552719115, + "rewards/rejected": -0.2512118577957153, + "step": 1340 + }, + { + "epoch": 0.54, + "grad_norm": 0.4449482858181, + "kl": 1.457157015800476, + "learning_rate": 4.056666666666667e-06, + "logits/chosen": 29013564.8, + "logits/rejected": 28593001.6, + "logps/chosen": -128.33466796875, + "logps/rejected": -122.49200439453125, + "loss": 0.4760580539703369, + "rewards/chosen": -0.19824122190475463, + "rewards/margins": 0.20185590982437135, + "rewards/rejected": -0.400097131729126, + "step": 1350 + }, + { + "epoch": 0.544, + "grad_norm": 0.45084336400032043, + "kl": 3.831247329711914, + "learning_rate": 4.045555555555556e-06, + "logits/chosen": 31035744.0, + "logits/rejected": 32034198.4, + "logps/chosen": -164.5341064453125, + "logps/rejected": -147.6629638671875, + "loss": 0.48288540840148925, + "rewards/chosen": 0.16254035234451295, + "rewards/margins": 0.1516798198223114, + "rewards/rejected": 0.010860532522201538, + "step": 1360 + }, + { + "epoch": 0.548, + "grad_norm": 0.5451259613037109, + "kl": 3.273149013519287, + "learning_rate": 4.034444444444445e-06, + "logits/chosen": 28394259.2, + "logits/rejected": 25613750.4, + "logps/chosen": -176.49615478515625, + "logps/rejected": -159.41533203125, + "loss": 0.463987922668457, + "rewards/chosen": 0.05027390718460083, + "rewards/margins": 0.3991087079048157, + "rewards/rejected": -0.34883480072021483, + "step": 1370 + }, + { + "epoch": 0.552, + "grad_norm": 0.4214652180671692, + "kl": 2.222465991973877, + "learning_rate": 4.0233333333333335e-06, + "logits/chosen": 34603212.8, + "logits/rejected": 34498118.4, + "logps/chosen": -148.70673828125, + "logps/rejected": -138.68626708984374, + "loss": 0.4929951667785645, + "rewards/chosen": -0.12773821353912354, + "rewards/margins": 0.0252701163291931, + "rewards/rejected": -0.15300832986831664, + "step": 1380 + }, + { + "epoch": 0.556, + "grad_norm": 0.5307957530021667, + "kl": 2.981513500213623, + "learning_rate": 4.012222222222222e-06, + "logits/chosen": 39500022.4, + "logits/rejected": 41076224.0, + "logps/chosen": -156.5267333984375, + "logps/rejected": -168.4097900390625, + "loss": 0.5046597480773926, + "rewards/chosen": 0.03251245319843292, + "rewards/margins": -0.05116569101810456, + "rewards/rejected": 0.08367814421653748, + "step": 1390 + }, + { + "epoch": 0.56, + "grad_norm": 0.5756453275680542, + "kl": 3.648423671722412, + "learning_rate": 4.001111111111111e-06, + "logits/chosen": 36128464.0, + "logits/rejected": 36108208.0, + "logps/chosen": -147.972119140625, + "logps/rejected": -180.87730712890624, + "loss": 0.49908957481384275, + "rewards/chosen": 0.16207314729690553, + "rewards/margins": -0.004686105251312245, + "rewards/rejected": 0.16675925254821777, + "step": 1400 + }, + { + "epoch": 0.56, + "eval_kl": 3.1687636375427246, + "eval_logits/chosen": 33501499.392, + "eval_logits/rejected": 33484677.12, + "eval_logps/chosen": -154.072703125, + "eval_logps/rejected": -149.52703125, + "eval_loss": 0.486517995595932, + "eval_rewards/chosen": 0.025308061599731445, + "eval_rewards/margins": 0.11373865699768065, + "eval_rewards/rejected": -0.08843059539794922, + "eval_runtime": 217.6832, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 2.297, + "step": 1400 + }, + { + "epoch": 0.564, + "grad_norm": 0.5075347423553467, + "kl": 3.5038933753967285, + "learning_rate": 3.990000000000001e-06, + "logits/chosen": 28111721.6, + "logits/rejected": 28974422.4, + "logps/chosen": -153.78463134765624, + "logps/rejected": -143.94952392578125, + "loss": 0.4918965816497803, + "rewards/chosen": -0.004850611090660095, + "rewards/margins": 0.020186284184455873, + "rewards/rejected": -0.025036895275115968, + "step": 1410 + }, + { + "epoch": 0.568, + "grad_norm": 0.5109780430793762, + "kl": 2.3000378608703613, + "learning_rate": 3.9788888888888896e-06, + "logits/chosen": 33186614.4, + "logits/rejected": 33699417.6, + "logps/chosen": -141.6215087890625, + "logps/rejected": -178.1355712890625, + "loss": 0.4941215991973877, + "rewards/chosen": -0.18355293273925782, + "rewards/margins": 0.026853704452514643, + "rewards/rejected": -0.21040663719177247, + "step": 1420 + }, + { + "epoch": 0.572, + "grad_norm": 0.6244523525238037, + "kl": 1.984100580215454, + "learning_rate": 3.967777777777778e-06, + "logits/chosen": 39980752.0, + "logits/rejected": 35690995.2, + "logps/chosen": -194.154638671875, + "logps/rejected": -171.23074951171876, + "loss": 0.46099395751953126, + "rewards/chosen": -0.10000758171081543, + "rewards/margins": 0.5053775310516357, + "rewards/rejected": -0.6053851127624512, + "step": 1430 + }, + { + "epoch": 0.576, + "grad_norm": 0.41846802830696106, + "kl": 2.4030237197875977, + "learning_rate": 3.956666666666667e-06, + "logits/chosen": 25522387.2, + "logits/rejected": 26411580.8, + "logps/chosen": -122.002734375, + "logps/rejected": -139.70345458984374, + "loss": 0.4711480617523193, + "rewards/chosen": -0.17828741073608398, + "rewards/margins": 0.30072832107543945, + "rewards/rejected": -0.47901573181152346, + "step": 1440 + }, + { + "epoch": 0.58, + "grad_norm": 0.3383093774318695, + "kl": 2.3522331714630127, + "learning_rate": 3.945555555555556e-06, + "logits/chosen": 34720166.4, + "logits/rejected": 35382691.2, + "logps/chosen": -134.02327880859374, + "logps/rejected": -143.1811767578125, + "loss": 0.48169522285461425, + "rewards/chosen": -0.12290234565734863, + "rewards/margins": 0.22536482810974118, + "rewards/rejected": -0.3482671737670898, + "step": 1450 + }, + { + "epoch": 0.584, + "grad_norm": 0.47618529200553894, + "kl": 1.4843952655792236, + "learning_rate": 3.934444444444445e-06, + "logits/chosen": 29917385.6, + "logits/rejected": 29642912.0, + "logps/chosen": -157.6127197265625, + "logps/rejected": -164.204248046875, + "loss": 0.48633370399475095, + "rewards/chosen": -0.5775248527526855, + "rewards/margins": 0.010549926757812522, + "rewards/rejected": -0.588074779510498, + "step": 1460 + }, + { + "epoch": 0.588, + "grad_norm": 0.4691362977027893, + "kl": 1.8532390594482422, + "learning_rate": 3.923333333333334e-06, + "logits/chosen": 24143035.2, + "logits/rejected": 26696252.8, + "logps/chosen": -145.52325439453125, + "logps/rejected": -114.97313232421875, + "loss": 0.4996927261352539, + "rewards/chosen": -0.3415048837661743, + "rewards/margins": -0.0695812225341797, + "rewards/rejected": -0.27192366123199463, + "step": 1470 + }, + { + "epoch": 0.592, + "grad_norm": 0.49410581588745117, + "kl": 2.910165309906006, + "learning_rate": 3.912222222222222e-06, + "logits/chosen": 29227424.0, + "logits/rejected": 26583780.8, + "logps/chosen": -153.87852783203124, + "logps/rejected": -167.90714111328126, + "loss": 0.4679962158203125, + "rewards/chosen": -0.16606519222259522, + "rewards/margins": 0.28040225505828853, + "rewards/rejected": -0.4464674472808838, + "step": 1480 + }, + { + "epoch": 0.596, + "grad_norm": 0.6437669992446899, + "kl": 4.011757850646973, + "learning_rate": 3.901111111111111e-06, + "logits/chosen": 40104499.2, + "logits/rejected": 35466915.2, + "logps/chosen": -141.6960693359375, + "logps/rejected": -148.79417724609374, + "loss": 0.45351347923278806, + "rewards/chosen": 0.29918632507324217, + "rewards/margins": 0.7735027313232421, + "rewards/rejected": -0.47431640625, + "step": 1490 + }, + { + "epoch": 0.6, + "grad_norm": 0.598638653755188, + "kl": 2.5277042388916016, + "learning_rate": 3.89e-06, + "logits/chosen": 30581568.0, + "logits/rejected": 29237926.4, + "logps/chosen": -170.3593017578125, + "logps/rejected": -161.714111328125, + "loss": 0.5054315567016602, + "rewards/chosen": -0.5106431007385254, + "rewards/margins": -0.14114959239959712, + "rewards/rejected": -0.36949350833892824, + "step": 1500 + }, + { + "epoch": 0.604, + "grad_norm": 0.5450658202171326, + "kl": 3.1822094917297363, + "learning_rate": 3.87888888888889e-06, + "logits/chosen": 30121491.2, + "logits/rejected": 30883408.0, + "logps/chosen": -177.3155029296875, + "logps/rejected": -172.50675048828126, + "loss": 0.4777498722076416, + "rewards/chosen": -0.09470235109329224, + "rewards/margins": 0.16894682645797732, + "rewards/rejected": -0.26364917755126954, + "step": 1510 + }, + { + "epoch": 0.608, + "grad_norm": 0.32850226759910583, + "kl": 3.073251724243164, + "learning_rate": 3.8677777777777785e-06, + "logits/chosen": 32764054.4, + "logits/rejected": 33643142.4, + "logps/chosen": -167.7408447265625, + "logps/rejected": -171.3683349609375, + "loss": 0.4882831573486328, + "rewards/chosen": -0.5916567325592041, + "rewards/margins": 0.06371226310729972, + "rewards/rejected": -0.6553689956665039, + "step": 1520 + }, + { + "epoch": 0.612, + "grad_norm": 0.776578426361084, + "kl": 2.1626362800598145, + "learning_rate": 3.856666666666667e-06, + "logits/chosen": 20513964.8, + "logits/rejected": 19167148.8, + "logps/chosen": -138.76737060546876, + "logps/rejected": -200.4188232421875, + "loss": 0.47345700263977053, + "rewards/chosen": -0.39080009460449217, + "rewards/margins": 0.3349196434020997, + "rewards/rejected": -0.7257197380065918, + "step": 1530 + }, + { + "epoch": 0.616, + "grad_norm": 0.7884080410003662, + "kl": 2.2347629070281982, + "learning_rate": 3.845555555555556e-06, + "logits/chosen": 21506472.0, + "logits/rejected": 20219934.4, + "logps/chosen": -141.54342041015624, + "logps/rejected": -150.6858154296875, + "loss": 0.46153483390808103, + "rewards/chosen": -0.5252087116241455, + "rewards/margins": 0.5382626056671143, + "rewards/rejected": -1.0634713172912598, + "step": 1540 + }, + { + "epoch": 0.62, + "grad_norm": 0.6161748766899109, + "kl": 1.0965118408203125, + "learning_rate": 3.834444444444445e-06, + "logits/chosen": 24290136.0, + "logits/rejected": 24614228.8, + "logps/chosen": -178.11864013671874, + "logps/rejected": -153.21026611328125, + "loss": 0.4584649085998535, + "rewards/chosen": -0.8390473365783692, + "rewards/margins": 0.8001769065856933, + "rewards/rejected": -1.6392242431640625, + "step": 1550 + }, + { + "epoch": 0.624, + "grad_norm": 0.6851525902748108, + "kl": 0.8686630129814148, + "learning_rate": 3.823333333333334e-06, + "logits/chosen": 17298494.4, + "logits/rejected": 14839955.2, + "logps/chosen": -181.07423095703126, + "logps/rejected": -169.425439453125, + "loss": 0.49303278923034666, + "rewards/chosen": -0.948878288269043, + "rewards/margins": 0.510734748840332, + "rewards/rejected": -1.459613037109375, + "step": 1560 + }, + { + "epoch": 0.628, + "grad_norm": 0.6733571290969849, + "kl": 2.8747103214263916, + "learning_rate": 3.8122222222222225e-06, + "logits/chosen": 29427056.0, + "logits/rejected": 24132025.6, + "logps/chosen": -138.527490234375, + "logps/rejected": -174.9707763671875, + "loss": 0.4342005729675293, + "rewards/chosen": 0.09871820211410523, + "rewards/margins": 0.8701262354850768, + "rewards/rejected": -0.7714080333709716, + "step": 1570 + }, + { + "epoch": 0.632, + "grad_norm": 0.5916578769683838, + "kl": 1.7868465185165405, + "learning_rate": 3.8011111111111113e-06, + "logits/chosen": 26572758.4, + "logits/rejected": 23886825.6, + "logps/chosen": -212.365966796875, + "logps/rejected": -208.8218505859375, + "loss": 0.48333086967468264, + "rewards/chosen": -1.0144821166992188, + "rewards/margins": 0.16185150146484362, + "rewards/rejected": -1.1763336181640625, + "step": 1580 + }, + { + "epoch": 0.636, + "grad_norm": 0.5978784561157227, + "kl": 2.861311435699463, + "learning_rate": 3.79e-06, + "logits/chosen": 27835440.0, + "logits/rejected": 27677552.0, + "logps/chosen": -188.40328369140624, + "logps/rejected": -173.6368896484375, + "loss": 0.4893380641937256, + "rewards/chosen": -0.2336580753326416, + "rewards/margins": 0.4210014343261719, + "rewards/rejected": -0.6546595096588135, + "step": 1590 + }, + { + "epoch": 0.64, + "grad_norm": 0.574272632598877, + "kl": 1.9626449346542358, + "learning_rate": 3.7788888888888894e-06, + "logits/chosen": 20977523.2, + "logits/rejected": 17175705.6, + "logps/chosen": -150.7464111328125, + "logps/rejected": -180.0102294921875, + "loss": 0.45204753875732423, + "rewards/chosen": -0.791524600982666, + "rewards/margins": 1.0010954856872558, + "rewards/rejected": -1.7926200866699218, + "step": 1600 + }, + { + "epoch": 0.64, + "eval_kl": 2.5991017818450928, + "eval_logits/chosen": 24918573.056, + "eval_logits/rejected": 25185402.88, + "eval_logps/chosen": -159.12659375, + "eval_logps/rejected": -154.646546875, + "eval_loss": 0.4858725666999817, + "eval_rewards/chosen": -0.480081787109375, + "eval_rewards/margins": 0.12030004882812506, + "eval_rewards/rejected": -0.6003818359375, + "eval_runtime": 217.5803, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 1600 + }, + { + "epoch": 0.644, + "grad_norm": 0.6671045422554016, + "kl": 2.1377835273742676, + "learning_rate": 3.767777777777778e-06, + "logits/chosen": 35406131.2, + "logits/rejected": 31205331.2, + "logps/chosen": -184.4550537109375, + "logps/rejected": -165.74014892578126, + "loss": 0.4641073703765869, + "rewards/chosen": -0.07752754688262939, + "rewards/margins": 0.44615256786346436, + "rewards/rejected": -0.5236801147460938, + "step": 1610 + }, + { + "epoch": 0.648, + "grad_norm": 0.6126360297203064, + "kl": 2.7373385429382324, + "learning_rate": 3.756666666666667e-06, + "logits/chosen": 29280086.4, + "logits/rejected": 32377139.2, + "logps/chosen": -212.76513671875, + "logps/rejected": -190.2232666015625, + "loss": 0.49649949073791505, + "rewards/chosen": -0.929378604888916, + "rewards/margins": -0.4132873058319092, + "rewards/rejected": -0.5160912990570068, + "step": 1620 + }, + { + "epoch": 0.652, + "grad_norm": 0.5612730979919434, + "kl": 2.997823476791382, + "learning_rate": 3.7455555555555558e-06, + "logits/chosen": 21078843.2, + "logits/rejected": 19578608.0, + "logps/chosen": -149.2550537109375, + "logps/rejected": -154.4807861328125, + "loss": 0.45549612045288085, + "rewards/chosen": -0.679559326171875, + "rewards/margins": 0.24375944137573247, + "rewards/rejected": -0.9233187675476074, + "step": 1630 + }, + { + "epoch": 0.656, + "grad_norm": 0.44815537333488464, + "kl": 2.8388514518737793, + "learning_rate": 3.734444444444445e-06, + "logits/chosen": 22369004.8, + "logits/rejected": 18537880.0, + "logps/chosen": -171.47335205078124, + "logps/rejected": -165.7509521484375, + "loss": 0.4616579532623291, + "rewards/chosen": -0.18010754585266114, + "rewards/margins": 0.7035699367523194, + "rewards/rejected": -0.8836774826049805, + "step": 1640 + }, + { + "epoch": 0.66, + "grad_norm": 0.8153596520423889, + "kl": 3.4207565784454346, + "learning_rate": 3.723333333333334e-06, + "logits/chosen": 24538766.4, + "logits/rejected": 26516555.2, + "logps/chosen": -150.9359619140625, + "logps/rejected": -122.23038330078126, + "loss": 0.48783044815063475, + "rewards/chosen": -0.1703397035598755, + "rewards/margins": 0.0270324468612671, + "rewards/rejected": -0.1973721504211426, + "step": 1650 + }, + { + "epoch": 0.664, + "grad_norm": 0.45013901591300964, + "kl": 3.9859955310821533, + "learning_rate": 3.7122222222222226e-06, + "logits/chosen": 22145710.4, + "logits/rejected": 20591705.6, + "logps/chosen": -124.9506591796875, + "logps/rejected": -146.586279296875, + "loss": 0.4657421112060547, + "rewards/chosen": 0.028095448017120363, + "rewards/margins": 0.4449395298957825, + "rewards/rejected": -0.4168440818786621, + "step": 1660 + }, + { + "epoch": 0.668, + "grad_norm": 0.5674629211425781, + "kl": 3.48918080329895, + "learning_rate": 3.7011111111111114e-06, + "logits/chosen": 28042899.2, + "logits/rejected": 27730080.0, + "logps/chosen": -170.19053955078124, + "logps/rejected": -192.9843017578125, + "loss": 0.4754744052886963, + "rewards/chosen": -0.249656343460083, + "rewards/margins": 0.22593021392822268, + "rewards/rejected": -0.4755865573883057, + "step": 1670 + }, + { + "epoch": 0.672, + "grad_norm": 0.7562563419342041, + "kl": 3.0691466331481934, + "learning_rate": 3.6900000000000002e-06, + "logits/chosen": 26120401.6, + "logits/rejected": 25757414.4, + "logps/chosen": -154.65439453125, + "logps/rejected": -162.4265869140625, + "loss": 0.4740549087524414, + "rewards/chosen": -0.2537501811981201, + "rewards/margins": 0.16815314292907718, + "rewards/rejected": -0.4219033241271973, + "step": 1680 + }, + { + "epoch": 0.676, + "grad_norm": 0.6189448237419128, + "kl": 3.324810028076172, + "learning_rate": 3.678888888888889e-06, + "logits/chosen": 30877590.4, + "logits/rejected": 29492992.0, + "logps/chosen": -162.27489013671874, + "logps/rejected": -156.6115234375, + "loss": 0.46096296310424806, + "rewards/chosen": -0.32442150115966795, + "rewards/margins": 0.5965461730957031, + "rewards/rejected": -0.9209676742553711, + "step": 1690 + }, + { + "epoch": 0.68, + "grad_norm": 0.5689833760261536, + "kl": 3.417942762374878, + "learning_rate": 3.667777777777778e-06, + "logits/chosen": 30880060.8, + "logits/rejected": 30390649.6, + "logps/chosen": -146.108203125, + "logps/rejected": -140.8014892578125, + "loss": 0.48299012184143064, + "rewards/chosen": -0.08942080736160278, + "rewards/margins": 0.09204813241958619, + "rewards/rejected": -0.18146893978118897, + "step": 1700 + }, + { + "epoch": 0.684, + "grad_norm": 0.7074683904647827, + "kl": 2.7843141555786133, + "learning_rate": 3.6566666666666667e-06, + "logits/chosen": 23799224.0, + "logits/rejected": 23789908.8, + "logps/chosen": -132.7684326171875, + "logps/rejected": -171.96357421875, + "loss": 0.4795389652252197, + "rewards/chosen": -0.4700624942779541, + "rewards/margins": 0.3144543647766114, + "rewards/rejected": -0.7845168590545655, + "step": 1710 + }, + { + "epoch": 0.688, + "grad_norm": 0.8114802241325378, + "kl": 2.740182876586914, + "learning_rate": 3.645555555555556e-06, + "logits/chosen": 25693836.8, + "logits/rejected": 25391835.2, + "logps/chosen": -147.47672119140626, + "logps/rejected": -162.006640625, + "loss": 0.47942562103271485, + "rewards/chosen": -0.12961168289184571, + "rewards/margins": 0.2411248922348022, + "rewards/rejected": -0.3707365751266479, + "step": 1720 + }, + { + "epoch": 0.692, + "grad_norm": 0.6404406428337097, + "kl": 5.742056369781494, + "learning_rate": 3.6344444444444447e-06, + "logits/chosen": 23561008.0, + "logits/rejected": 24549129.6, + "logps/chosen": -152.6004150390625, + "logps/rejected": -174.73006591796874, + "loss": 0.4857301712036133, + "rewards/chosen": 0.12083638906478882, + "rewards/margins": 0.12182764708995819, + "rewards/rejected": -0.0009912580251693725, + "step": 1730 + }, + { + "epoch": 0.696, + "grad_norm": 0.8152211308479309, + "kl": 2.7150015830993652, + "learning_rate": 3.6233333333333335e-06, + "logits/chosen": 20060864.0, + "logits/rejected": 21277550.4, + "logps/chosen": -161.858642578125, + "logps/rejected": -154.7493408203125, + "loss": 0.49634590148925783, + "rewards/chosen": -0.6363963603973388, + "rewards/margins": -0.21991643905639646, + "rewards/rejected": -0.41647992134094236, + "step": 1740 + }, + { + "epoch": 0.7, + "grad_norm": 0.5856395959854126, + "kl": 3.9709296226501465, + "learning_rate": 3.6122222222222223e-06, + "logits/chosen": 24168908.8, + "logits/rejected": 26363808.0, + "logps/chosen": -216.2795654296875, + "logps/rejected": -159.5930908203125, + "loss": 0.4658236026763916, + "rewards/chosen": -0.5366491794586181, + "rewards/margins": 0.038147354125976585, + "rewards/rejected": -0.5747965335845947, + "step": 1750 + }, + { + "epoch": 0.704, + "grad_norm": 0.6619251370429993, + "kl": 3.0937228202819824, + "learning_rate": 3.601111111111111e-06, + "logits/chosen": 17747806.4, + "logits/rejected": 22547065.6, + "logps/chosen": -187.3780517578125, + "logps/rejected": -159.7064453125, + "loss": 0.5088288307189941, + "rewards/chosen": -1.2607831954956055, + "rewards/margins": -0.6743541717529297, + "rewards/rejected": -0.5864290237426758, + "step": 1760 + }, + { + "epoch": 0.708, + "grad_norm": 0.5218913555145264, + "kl": 2.901822566986084, + "learning_rate": 3.5900000000000004e-06, + "logits/chosen": 22269254.4, + "logits/rejected": 23662136.0, + "logps/chosen": -136.06712646484374, + "logps/rejected": -121.06827392578126, + "loss": 0.497973108291626, + "rewards/chosen": -0.4131883144378662, + "rewards/margins": -0.10578060150146484, + "rewards/rejected": -0.30740771293640134, + "step": 1770 + }, + { + "epoch": 0.712, + "grad_norm": 0.6656368970870972, + "kl": 3.2692978382110596, + "learning_rate": 3.578888888888889e-06, + "logits/chosen": 25755620.8, + "logits/rejected": 26518835.2, + "logps/chosen": -165.65780029296874, + "logps/rejected": -147.2259033203125, + "loss": 0.506129789352417, + "rewards/chosen": -0.5433285236358643, + "rewards/margins": -0.17034811973571778, + "rewards/rejected": -0.3729804039001465, + "step": 1780 + }, + { + "epoch": 0.716, + "grad_norm": 0.771259069442749, + "kl": 3.0249366760253906, + "learning_rate": 3.5677777777777784e-06, + "logits/chosen": 23546620.8, + "logits/rejected": 25753550.4, + "logps/chosen": -151.70357666015624, + "logps/rejected": -122.8987548828125, + "loss": 0.5249699592590332, + "rewards/chosen": -0.5662184715270996, + "rewards/margins": -0.2557974576950073, + "rewards/rejected": -0.3104210138320923, + "step": 1790 + }, + { + "epoch": 0.72, + "grad_norm": 0.872774064540863, + "kl": 3.2898342609405518, + "learning_rate": 3.556666666666667e-06, + "logits/chosen": 18870168.0, + "logits/rejected": 17117038.4, + "logps/chosen": -150.25985107421874, + "logps/rejected": -161.5357666015625, + "loss": 0.451005744934082, + "rewards/chosen": -0.1520848512649536, + "rewards/margins": 0.6849667310714721, + "rewards/rejected": -0.8370515823364257, + "step": 1800 + }, + { + "epoch": 0.72, + "eval_kl": 3.5156476497650146, + "eval_logits/chosen": 26424913.92, + "eval_logits/rejected": 26601347.072, + "eval_logps/chosen": -156.963453125, + "eval_logps/rejected": -152.583296875, + "eval_loss": 0.48428651690483093, + "eval_rewards/chosen": -0.2637669677734375, + "eval_rewards/margins": 0.13028942871093752, + "eval_rewards/rejected": -0.394056396484375, + "eval_runtime": 217.3905, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 2.3, + "step": 1800 + }, + { + "epoch": 0.724, + "grad_norm": 0.6279663443565369, + "kl": 1.9814598560333252, + "learning_rate": 3.545555555555556e-06, + "logits/chosen": 31417820.8, + "logits/rejected": 29248547.2, + "logps/chosen": -143.02147216796874, + "logps/rejected": -176.2625732421875, + "loss": 0.47081918716430665, + "rewards/chosen": -0.7984821319580078, + "rewards/margins": 0.2239703178405763, + "rewards/rejected": -1.022452449798584, + "step": 1810 + }, + { + "epoch": 0.728, + "grad_norm": 0.6332824230194092, + "kl": 3.8811469078063965, + "learning_rate": 3.534444444444445e-06, + "logits/chosen": 25455878.4, + "logits/rejected": 24060284.8, + "logps/chosen": -148.748779296875, + "logps/rejected": -152.15390625, + "loss": 0.48478074073791505, + "rewards/chosen": -0.37164936065673826, + "rewards/margins": 0.1325855255126953, + "rewards/rejected": -0.5042348861694336, + "step": 1820 + }, + { + "epoch": 0.732, + "grad_norm": 0.570693850517273, + "kl": 3.4417755603790283, + "learning_rate": 3.5233333333333336e-06, + "logits/chosen": 21330112.0, + "logits/rejected": 23791145.6, + "logps/chosen": -175.6027099609375, + "logps/rejected": -139.43577880859374, + "loss": 0.4806610107421875, + "rewards/chosen": -0.7079993724822998, + "rewards/margins": -0.224769401550293, + "rewards/rejected": -0.4832299709320068, + "step": 1830 + }, + { + "epoch": 0.736, + "grad_norm": 0.6215969920158386, + "kl": 2.516907215118408, + "learning_rate": 3.5122222222222224e-06, + "logits/chosen": 19252992.0, + "logits/rejected": 17279195.2, + "logps/chosen": -138.5216552734375, + "logps/rejected": -175.40498046875, + "loss": 0.4477705955505371, + "rewards/chosen": -0.37056674957275393, + "rewards/margins": 0.6211806297302246, + "rewards/rejected": -0.9917473793029785, + "step": 1840 + }, + { + "epoch": 0.74, + "grad_norm": 0.477038711309433, + "kl": 2.8053412437438965, + "learning_rate": 3.5011111111111112e-06, + "logits/chosen": 21869585.6, + "logits/rejected": 25226084.8, + "logps/chosen": -169.3609130859375, + "logps/rejected": -135.9068603515625, + "loss": 0.49389004707336426, + "rewards/chosen": -0.8072388648986817, + "rewards/margins": -0.23602757453918466, + "rewards/rejected": -0.571211290359497, + "step": 1850 + }, + { + "epoch": 0.744, + "grad_norm": 0.4190019369125366, + "kl": 4.664608955383301, + "learning_rate": 3.49e-06, + "logits/chosen": 18998553.6, + "logits/rejected": 18716126.4, + "logps/chosen": -154.63216552734374, + "logps/rejected": -157.9931884765625, + "loss": 0.45591115951538086, + "rewards/chosen": -0.11906745433807372, + "rewards/margins": 0.5571311235427856, + "rewards/rejected": -0.6761985778808594, + "step": 1860 + }, + { + "epoch": 0.748, + "grad_norm": 0.5092635154724121, + "kl": 5.426673412322998, + "learning_rate": 3.4788888888888893e-06, + "logits/chosen": 24268691.2, + "logits/rejected": 23287683.2, + "logps/chosen": -150.13511962890624, + "logps/rejected": -137.98375244140624, + "loss": 0.4658195018768311, + "rewards/chosen": 0.17463077306747438, + "rewards/margins": 0.33842480182647705, + "rewards/rejected": -0.16379402875900267, + "step": 1870 + }, + { + "epoch": 0.752, + "grad_norm": 0.5116318464279175, + "kl": 3.4443411827087402, + "learning_rate": 3.467777777777778e-06, + "logits/chosen": 25825232.0, + "logits/rejected": 28601868.8, + "logps/chosen": -131.28408203125, + "logps/rejected": -165.76693115234374, + "loss": 0.4923543453216553, + "rewards/chosen": -0.23933188915252684, + "rewards/margins": 0.14726905822753905, + "rewards/rejected": -0.3866009473800659, + "step": 1880 + }, + { + "epoch": 0.756, + "grad_norm": 0.6366556286811829, + "kl": 3.051987409591675, + "learning_rate": 3.456666666666667e-06, + "logits/chosen": 24197241.6, + "logits/rejected": 24256118.4, + "logps/chosen": -149.84852294921876, + "logps/rejected": -140.0214599609375, + "loss": 0.4848769664764404, + "rewards/chosen": -0.3190887212753296, + "rewards/margins": 0.17881777286529538, + "rewards/rejected": -0.497906494140625, + "step": 1890 + }, + { + "epoch": 0.76, + "grad_norm": 0.4979274570941925, + "kl": 1.4698994159698486, + "learning_rate": 3.4455555555555557e-06, + "logits/chosen": 23184480.0, + "logits/rejected": 25139280.0, + "logps/chosen": -134.97877197265626, + "logps/rejected": -144.17740478515626, + "loss": 0.4794943809509277, + "rewards/chosen": -0.6308645248413086, + "rewards/margins": 0.13470888137817383, + "rewards/rejected": -0.7655734062194824, + "step": 1900 + }, + { + "epoch": 0.764, + "grad_norm": 0.6274532079696655, + "kl": 3.6468818187713623, + "learning_rate": 3.4344444444444445e-06, + "logits/chosen": 24657672.0, + "logits/rejected": 21508489.6, + "logps/chosen": -160.151904296875, + "logps/rejected": -166.94481201171874, + "loss": 0.4795567512512207, + "rewards/chosen": -0.22244927883148194, + "rewards/margins": 0.15467300415039062, + "rewards/rejected": -0.37712228298187256, + "step": 1910 + }, + { + "epoch": 0.768, + "grad_norm": 0.7713479399681091, + "kl": 4.167417049407959, + "learning_rate": 3.4233333333333333e-06, + "logits/chosen": 23603747.2, + "logits/rejected": 21209184.0, + "logps/chosen": -134.7058837890625, + "logps/rejected": -163.1764404296875, + "loss": 0.4544349193572998, + "rewards/chosen": 0.17299318313598633, + "rewards/margins": 0.5065126180648803, + "rewards/rejected": -0.333519434928894, + "step": 1920 + }, + { + "epoch": 0.772, + "grad_norm": 0.5262131690979004, + "kl": 2.8361663818359375, + "learning_rate": 3.412222222222222e-06, + "logits/chosen": 29174873.6, + "logits/rejected": 33003203.2, + "logps/chosen": -158.76817626953124, + "logps/rejected": -142.29862060546876, + "loss": 0.47826762199401857, + "rewards/chosen": -0.2532700300216675, + "rewards/margins": 0.13431007862091066, + "rewards/rejected": -0.38758010864257814, + "step": 1930 + }, + { + "epoch": 0.776, + "grad_norm": 0.610528826713562, + "kl": 1.9879090785980225, + "learning_rate": 3.4011111111111113e-06, + "logits/chosen": 14738179.2, + "logits/rejected": 17543468.8, + "logps/chosen": -144.6372314453125, + "logps/rejected": -121.0155517578125, + "loss": 0.5197708129882812, + "rewards/chosen": -0.7448621273040772, + "rewards/margins": -0.2757446765899659, + "rewards/rejected": -0.4691174507141113, + "step": 1940 + }, + { + "epoch": 0.78, + "grad_norm": 0.4867253601551056, + "kl": 2.61750864982605, + "learning_rate": 3.3900000000000006e-06, + "logits/chosen": 29278848.0, + "logits/rejected": 27723072.0, + "logps/chosen": -167.261474609375, + "logps/rejected": -166.44840087890626, + "loss": 0.48351154327392576, + "rewards/chosen": -0.45982890129089354, + "rewards/margins": 0.17165498733520507, + "rewards/rejected": -0.6314838886260986, + "step": 1950 + }, + { + "epoch": 0.784, + "grad_norm": 0.507047176361084, + "kl": 1.4705009460449219, + "learning_rate": 3.3788888888888894e-06, + "logits/chosen": 21861384.0, + "logits/rejected": 22609652.8, + "logps/chosen": -154.51759033203126, + "logps/rejected": -141.37017822265625, + "loss": 0.4911977291107178, + "rewards/chosen": -0.9377481460571289, + "rewards/margins": 0.045456314086914085, + "rewards/rejected": -0.983204460144043, + "step": 1960 + }, + { + "epoch": 0.788, + "grad_norm": 0.5638304352760315, + "kl": 2.8000810146331787, + "learning_rate": 3.367777777777778e-06, + "logits/chosen": 29543376.0, + "logits/rejected": 30959481.6, + "logps/chosen": -154.2559814453125, + "logps/rejected": -140.09403076171876, + "loss": 0.4743481636047363, + "rewards/chosen": -0.27635998725891114, + "rewards/margins": 0.31996994018554686, + "rewards/rejected": -0.596329927444458, + "step": 1970 + }, + { + "epoch": 0.792, + "grad_norm": 0.622689962387085, + "kl": 1.304429292678833, + "learning_rate": 3.356666666666667e-06, + "logits/chosen": 16238214.4, + "logits/rejected": 15864859.2, + "logps/chosen": -148.69432373046874, + "logps/rejected": -155.14200439453126, + "loss": 0.4647815227508545, + "rewards/chosen": -0.6397994041442872, + "rewards/margins": 0.38712730407714835, + "rewards/rejected": -1.0269267082214355, + "step": 1980 + }, + { + "epoch": 0.796, + "grad_norm": 0.5903355479240417, + "kl": 3.8611984252929688, + "learning_rate": 3.345555555555556e-06, + "logits/chosen": 26873817.6, + "logits/rejected": 25962048.0, + "logps/chosen": -168.3064208984375, + "logps/rejected": -165.02401123046874, + "loss": 0.44381189346313477, + "rewards/chosen": -0.056187999248504636, + "rewards/margins": 0.48386293649673456, + "rewards/rejected": -0.5400509357452392, + "step": 1990 + }, + { + "epoch": 0.8, + "grad_norm": 0.6087274551391602, + "kl": 2.4798474311828613, + "learning_rate": 3.3344444444444446e-06, + "logits/chosen": 28899868.8, + "logits/rejected": 28327043.2, + "logps/chosen": -131.373046875, + "logps/rejected": -144.835546875, + "loss": 0.4636848449707031, + "rewards/chosen": -0.4337655544281006, + "rewards/margins": 0.1502884864807129, + "rewards/rejected": -0.5840540409088135, + "step": 2000 + }, + { + "epoch": 0.8, + "eval_kl": 2.3182120323181152, + "eval_logits/chosen": 23415599.104, + "eval_logits/rejected": 23816060.928, + "eval_logps/chosen": -161.6585625, + "eval_logps/rejected": -157.55559375, + "eval_loss": 0.48174571990966797, + "eval_rewards/chosen": -0.7332791137695313, + "eval_rewards/margins": 0.1580091552734375, + "eval_rewards/rejected": -0.8912882690429688, + "eval_runtime": 216.8959, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 2.305, + "step": 2000 + }, + { + "epoch": 0.804, + "grad_norm": 0.6840182542800903, + "kl": 3.759185791015625, + "learning_rate": 3.3233333333333334e-06, + "logits/chosen": 29734800.0, + "logits/rejected": 27820688.0, + "logps/chosen": -171.00633544921874, + "logps/rejected": -149.1771240234375, + "loss": 0.4692417621612549, + "rewards/chosen": -0.2439584493637085, + "rewards/margins": 0.4319137811660767, + "rewards/rejected": -0.6758722305297852, + "step": 2010 + }, + { + "epoch": 0.808, + "grad_norm": 0.4128756523132324, + "kl": 2.642878770828247, + "learning_rate": 3.3122222222222222e-06, + "logits/chosen": 19955732.8, + "logits/rejected": 18758494.4, + "logps/chosen": -189.38092041015625, + "logps/rejected": -170.4149658203125, + "loss": 0.46123080253601073, + "rewards/chosen": -0.5583849906921386, + "rewards/margins": 0.32184505462646484, + "rewards/rejected": -0.8802300453186035, + "step": 2020 + }, + { + "epoch": 0.812, + "grad_norm": 0.5455370545387268, + "kl": 1.1196393966674805, + "learning_rate": 3.3011111111111115e-06, + "logits/chosen": 22045115.2, + "logits/rejected": 18838947.2, + "logps/chosen": -148.83717041015626, + "logps/rejected": -179.09134521484376, + "loss": 0.4277163505554199, + "rewards/chosen": -0.9872810363769531, + "rewards/margins": 0.9499824523925782, + "rewards/rejected": -1.9372634887695312, + "step": 2030 + }, + { + "epoch": 0.816, + "grad_norm": 0.5655795335769653, + "kl": 2.0870370864868164, + "learning_rate": 3.2900000000000003e-06, + "logits/chosen": 30604160.0, + "logits/rejected": 25881659.2, + "logps/chosen": -170.49140625, + "logps/rejected": -214.0181640625, + "loss": 0.4611818790435791, + "rewards/chosen": -0.9137911796569824, + "rewards/margins": 0.8909661293029785, + "rewards/rejected": -1.804757308959961, + "step": 2040 + }, + { + "epoch": 0.82, + "grad_norm": 0.48172426223754883, + "kl": 2.043773651123047, + "learning_rate": 3.278888888888889e-06, + "logits/chosen": 16779667.2, + "logits/rejected": 17778121.6, + "logps/chosen": -148.56708984375, + "logps/rejected": -125.018115234375, + "loss": 0.49151906967163084, + "rewards/chosen": -1.0805482864379883, + "rewards/margins": -0.045468139648437544, + "rewards/rejected": -1.0350801467895507, + "step": 2050 + }, + { + "epoch": 0.824, + "grad_norm": 0.5591869950294495, + "kl": 1.8221423625946045, + "learning_rate": 3.267777777777778e-06, + "logits/chosen": 18840448.0, + "logits/rejected": 14656315.2, + "logps/chosen": -178.58017578125, + "logps/rejected": -194.3201171875, + "loss": 0.5037118434906006, + "rewards/chosen": -1.5193581581115723, + "rewards/margins": 0.3175524711608886, + "rewards/rejected": -1.8369106292724608, + "step": 2060 + }, + { + "epoch": 0.828, + "grad_norm": 0.6082685589790344, + "kl": 2.7332985401153564, + "learning_rate": 3.2566666666666667e-06, + "logits/chosen": 19653870.4, + "logits/rejected": 18607360.0, + "logps/chosen": -191.222802734375, + "logps/rejected": -195.412109375, + "loss": 0.43700380325317384, + "rewards/chosen": -1.0758570671081542, + "rewards/margins": 1.051231098175049, + "rewards/rejected": -2.127088165283203, + "step": 2070 + }, + { + "epoch": 0.832, + "grad_norm": 0.8018869161605835, + "kl": 1.3849284648895264, + "learning_rate": 3.2455555555555555e-06, + "logits/chosen": 18399478.4, + "logits/rejected": 19887457.6, + "logps/chosen": -193.88709716796876, + "logps/rejected": -149.70872802734374, + "loss": 0.5033087730407715, + "rewards/chosen": -1.796027946472168, + "rewards/margins": -0.644907569885254, + "rewards/rejected": -1.151120376586914, + "step": 2080 + }, + { + "epoch": 0.836, + "grad_norm": 0.6100642681121826, + "kl": 1.6638615131378174, + "learning_rate": 3.2344444444444443e-06, + "logits/chosen": 13364839.2, + "logits/rejected": 12106027.2, + "logps/chosen": -150.7623291015625, + "logps/rejected": -176.14700927734376, + "loss": 0.45767946243286134, + "rewards/chosen": -0.9122394561767578, + "rewards/margins": 1.188156890869141, + "rewards/rejected": -2.1003963470458986, + "step": 2090 + }, + { + "epoch": 0.84, + "grad_norm": 0.4774913191795349, + "kl": 2.0765693187713623, + "learning_rate": 3.223333333333334e-06, + "logits/chosen": 14278204.8, + "logits/rejected": 16952772.8, + "logps/chosen": -180.37510986328124, + "logps/rejected": -164.04486083984375, + "loss": 0.5365061283111572, + "rewards/chosen": -1.5350143432617187, + "rewards/margins": -0.5786049842834472, + "rewards/rejected": -0.9564093589782715, + "step": 2100 + }, + { + "epoch": 0.844, + "grad_norm": 0.424125999212265, + "kl": 1.1270596981048584, + "learning_rate": 3.2122222222222228e-06, + "logits/chosen": 11056914.4, + "logits/rejected": 10464643.2, + "logps/chosen": -159.667138671875, + "logps/rejected": -179.2357177734375, + "loss": 0.4685384750366211, + "rewards/chosen": -0.7500426292419433, + "rewards/margins": 0.7069652557373047, + "rewards/rejected": -1.457007884979248, + "step": 2110 + }, + { + "epoch": 0.848, + "grad_norm": 0.6812456846237183, + "kl": 3.2760558128356934, + "learning_rate": 3.2011111111111116e-06, + "logits/chosen": 12226829.6, + "logits/rejected": 9145164.0, + "logps/chosen": -152.55595703125, + "logps/rejected": -163.9876953125, + "loss": 0.4829984664916992, + "rewards/chosen": -1.0188889503479004, + "rewards/margins": 0.4964068412780762, + "rewards/rejected": -1.5152957916259766, + "step": 2120 + }, + { + "epoch": 0.852, + "grad_norm": 0.521295964717865, + "kl": 1.6184799671173096, + "learning_rate": 3.1900000000000004e-06, + "logits/chosen": 19296177.6, + "logits/rejected": 18037955.2, + "logps/chosen": -190.2342041015625, + "logps/rejected": -179.3730224609375, + "loss": 0.4534940719604492, + "rewards/chosen": -1.4859835624694824, + "rewards/margins": 0.12996721267700195, + "rewards/rejected": -1.6159507751464843, + "step": 2130 + }, + { + "epoch": 0.856, + "grad_norm": 0.3901250660419464, + "kl": 2.1943907737731934, + "learning_rate": 3.178888888888889e-06, + "logits/chosen": 14292169.6, + "logits/rejected": 16561420.8, + "logps/chosen": -172.7265869140625, + "logps/rejected": -174.55390625, + "loss": 0.46813135147094725, + "rewards/chosen": -0.9191327095031738, + "rewards/margins": 0.9562966346740722, + "rewards/rejected": -1.875429344177246, + "step": 2140 + }, + { + "epoch": 0.86, + "grad_norm": 1.2093825340270996, + "kl": 2.53037691116333, + "learning_rate": 3.167777777777778e-06, + "logits/chosen": 15527200.0, + "logits/rejected": 14247240.0, + "logps/chosen": -175.73638916015625, + "logps/rejected": -177.11685791015626, + "loss": 0.4883676052093506, + "rewards/chosen": -1.1620004653930665, + "rewards/margins": 0.6007183074951172, + "rewards/rejected": -1.7627187728881837, + "step": 2150 + }, + { + "epoch": 0.864, + "grad_norm": 0.7262481451034546, + "kl": 2.6998825073242188, + "learning_rate": 3.156666666666667e-06, + "logits/chosen": 25509097.6, + "logits/rejected": 26202662.4, + "logps/chosen": -159.83707275390626, + "logps/rejected": -170.9850341796875, + "loss": 0.44543633460998533, + "rewards/chosen": -0.19056529998779298, + "rewards/margins": 0.7149291038513184, + "rewards/rejected": -0.9054944038391113, + "step": 2160 + }, + { + "epoch": 0.868, + "grad_norm": 0.4244597554206848, + "kl": 1.9011032581329346, + "learning_rate": 3.1455555555555556e-06, + "logits/chosen": 13862787.2, + "logits/rejected": 13148918.4, + "logps/chosen": -144.288720703125, + "logps/rejected": -159.27891845703124, + "loss": 0.46286282539367674, + "rewards/chosen": -1.0397714614868163, + "rewards/margins": 0.25113573074340834, + "rewards/rejected": -1.2909071922302247, + "step": 2170 + }, + { + "epoch": 0.872, + "grad_norm": 0.7587819695472717, + "kl": 4.540980339050293, + "learning_rate": 3.134444444444445e-06, + "logits/chosen": 29813209.6, + "logits/rejected": 31323004.8, + "logps/chosen": -183.304345703125, + "logps/rejected": -198.9873779296875, + "loss": 0.4856124401092529, + "rewards/chosen": -0.8742061614990234, + "rewards/margins": -0.17829103469848628, + "rewards/rejected": -0.6959151268005371, + "step": 2180 + }, + { + "epoch": 0.876, + "grad_norm": 0.36061376333236694, + "kl": 2.879594326019287, + "learning_rate": 3.1233333333333336e-06, + "logits/chosen": 29873868.8, + "logits/rejected": 30440390.4, + "logps/chosen": -143.857861328125, + "logps/rejected": -136.5346435546875, + "loss": 0.5109179496765137, + "rewards/chosen": -0.8628176689147949, + "rewards/margins": -0.39257164001464845, + "rewards/rejected": -0.4702460289001465, + "step": 2190 + }, + { + "epoch": 0.88, + "grad_norm": 0.39630356431007385, + "kl": 2.539196729660034, + "learning_rate": 3.1122222222222224e-06, + "logits/chosen": 20060600.0, + "logits/rejected": 18553404.8, + "logps/chosen": -159.40810546875, + "logps/rejected": -152.76990966796876, + "loss": 0.47215023040771487, + "rewards/chosen": -0.8129859924316406, + "rewards/margins": 0.28859338760375974, + "rewards/rejected": -1.1015793800354003, + "step": 2200 + }, + { + "epoch": 0.88, + "eval_kl": 2.5812811851501465, + "eval_logits/chosen": 21032757.248, + "eval_logits/rejected": 21261236.224, + "eval_logps/chosen": -162.41909375, + "eval_logps/rejected": -158.462875, + "eval_loss": 0.48150432109832764, + "eval_rewards/chosen": -0.8093319091796874, + "eval_rewards/margins": 0.17268438720703128, + "eval_rewards/rejected": -0.9820162963867187, + "eval_runtime": 216.8473, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 2.306, + "step": 2200 + }, + { + "epoch": 0.884, + "grad_norm": 0.6626996994018555, + "kl": 2.7219512462615967, + "learning_rate": 3.1011111111111113e-06, + "logits/chosen": 20549030.4, + "logits/rejected": 23361507.2, + "logps/chosen": -180.177783203125, + "logps/rejected": -172.0116943359375, + "loss": 0.5009243011474609, + "rewards/chosen": -1.0695799827575683, + "rewards/margins": 0.0988718032836915, + "rewards/rejected": -1.1684517860412598, + "step": 2210 + }, + { + "epoch": 0.888, + "grad_norm": 0.40188467502593994, + "kl": 4.388433933258057, + "learning_rate": 3.09e-06, + "logits/chosen": 31197132.8, + "logits/rejected": 29223571.2, + "logps/chosen": -158.33319091796875, + "logps/rejected": -156.518505859375, + "loss": 0.49659576416015627, + "rewards/chosen": -0.5678246021270752, + "rewards/margins": 0.040923357009887695, + "rewards/rejected": -0.6087479591369629, + "step": 2220 + }, + { + "epoch": 0.892, + "grad_norm": 0.7662191390991211, + "kl": 3.7732715606689453, + "learning_rate": 3.078888888888889e-06, + "logits/chosen": 23728937.6, + "logits/rejected": 24493553.6, + "logps/chosen": -122.2478759765625, + "logps/rejected": -116.500439453125, + "loss": 0.4903532028198242, + "rewards/chosen": -0.12796418666839598, + "rewards/margins": 0.14529306888580323, + "rewards/rejected": -0.2732572555541992, + "step": 2230 + }, + { + "epoch": 0.896, + "grad_norm": 0.5434762835502625, + "kl": 5.346643924713135, + "learning_rate": 3.0677777777777777e-06, + "logits/chosen": 23763382.4, + "logits/rejected": 20131742.4, + "logps/chosen": -148.9446044921875, + "logps/rejected": -145.7754638671875, + "loss": 0.4672962188720703, + "rewards/chosen": 0.033642816543579104, + "rewards/margins": 0.5607096195220947, + "rewards/rejected": -0.5270668029785156, + "step": 2240 + }, + { + "epoch": 0.9, + "grad_norm": 0.5850833058357239, + "kl": 4.739095211029053, + "learning_rate": 3.0566666666666665e-06, + "logits/chosen": 20819936.0, + "logits/rejected": 24134200.0, + "logps/chosen": -142.128466796875, + "logps/rejected": -151.9432861328125, + "loss": 0.4636435031890869, + "rewards/chosen": 0.028873807191848753, + "rewards/margins": 0.2783109962940216, + "rewards/rejected": -0.24943718910217286, + "step": 2250 + }, + { + "epoch": 0.904, + "grad_norm": 0.6144809126853943, + "kl": 4.373375415802002, + "learning_rate": 3.045555555555556e-06, + "logits/chosen": 33130332.8, + "logits/rejected": 34606784.0, + "logps/chosen": -155.8130126953125, + "logps/rejected": -168.7025390625, + "loss": 0.47957863807678225, + "rewards/chosen": -0.34006266593933104, + "rewards/margins": 0.1747920036315918, + "rewards/rejected": -0.5148546695709229, + "step": 2260 + }, + { + "epoch": 0.908, + "grad_norm": 0.48172423243522644, + "kl": 3.806690216064453, + "learning_rate": 3.034444444444445e-06, + "logits/chosen": 29197280.0, + "logits/rejected": 26733576.0, + "logps/chosen": -156.34307861328125, + "logps/rejected": -124.57420654296875, + "loss": 0.486788272857666, + "rewards/chosen": -0.15740108489990234, + "rewards/margins": 0.07049424648284913, + "rewards/rejected": -0.22789533138275148, + "step": 2270 + }, + { + "epoch": 0.912, + "grad_norm": 0.5201888680458069, + "kl": 2.4590580463409424, + "learning_rate": 3.0233333333333338e-06, + "logits/chosen": 13540443.2, + "logits/rejected": 11543592.0, + "logps/chosen": -128.15758056640624, + "logps/rejected": -144.884130859375, + "loss": 0.5047108173370362, + "rewards/chosen": -0.8088220596313477, + "rewards/margins": 0.07349948883056634, + "rewards/rejected": -0.882321548461914, + "step": 2280 + }, + { + "epoch": 0.916, + "grad_norm": 0.5650275945663452, + "kl": 4.004490852355957, + "learning_rate": 3.0122222222222226e-06, + "logits/chosen": 30858310.4, + "logits/rejected": 30752073.6, + "logps/chosen": -173.15472412109375, + "logps/rejected": -179.914208984375, + "loss": 0.45772509574890136, + "rewards/chosen": -0.07655960321426392, + "rewards/margins": 0.580165708065033, + "rewards/rejected": -0.6567253112792969, + "step": 2290 + }, + { + "epoch": 0.92, + "grad_norm": 0.6002667546272278, + "kl": 2.4904167652130127, + "learning_rate": 3.0011111111111114e-06, + "logits/chosen": 27612214.4, + "logits/rejected": 29905420.8, + "logps/chosen": -170.78812255859376, + "logps/rejected": -171.15484619140625, + "loss": 0.48928098678588866, + "rewards/chosen": -0.48044404983520506, + "rewards/margins": 0.08476023674011235, + "rewards/rejected": -0.5652042865753174, + "step": 2300 + }, + { + "epoch": 0.924, + "grad_norm": 0.7137225866317749, + "kl": 2.9995059967041016, + "learning_rate": 2.99e-06, + "logits/chosen": 33246598.4, + "logits/rejected": 31494838.4, + "logps/chosen": -124.85633544921875, + "logps/rejected": -151.764404296875, + "loss": 0.46476993560791013, + "rewards/chosen": -0.3835261344909668, + "rewards/margins": 0.35444231033325196, + "rewards/rejected": -0.7379684448242188, + "step": 2310 + }, + { + "epoch": 0.928, + "grad_norm": 0.48665422201156616, + "kl": 4.963588714599609, + "learning_rate": 2.978888888888889e-06, + "logits/chosen": 26167496.0, + "logits/rejected": 26003188.8, + "logps/chosen": -154.3181640625, + "logps/rejected": -193.70732421875, + "loss": 0.4603987216949463, + "rewards/chosen": -0.019819003343582154, + "rewards/margins": 0.6258892238140107, + "rewards/rejected": -0.6457082271575928, + "step": 2320 + }, + { + "epoch": 0.932, + "grad_norm": 0.6779302954673767, + "kl": 3.996805191040039, + "learning_rate": 2.9677777777777778e-06, + "logits/chosen": 26639760.0, + "logits/rejected": 24185547.2, + "logps/chosen": -145.71864013671876, + "logps/rejected": -165.16456298828126, + "loss": 0.41465444564819337, + "rewards/chosen": 0.16978931427001953, + "rewards/margins": 0.8076234340667725, + "rewards/rejected": -0.637834119796753, + "step": 2330 + }, + { + "epoch": 0.936, + "grad_norm": 0.8533156514167786, + "kl": 3.2005672454833984, + "learning_rate": 2.956666666666667e-06, + "logits/chosen": 17311833.6, + "logits/rejected": 18152035.2, + "logps/chosen": -139.579248046875, + "logps/rejected": -141.84622802734376, + "loss": 0.4770832538604736, + "rewards/chosen": -0.5391797542572021, + "rewards/margins": 0.2623293399810791, + "rewards/rejected": -0.8015090942382812, + "step": 2340 + }, + { + "epoch": 0.94, + "grad_norm": 0.7501420974731445, + "kl": 5.2280778884887695, + "learning_rate": 2.945555555555556e-06, + "logits/chosen": 19459948.8, + "logits/rejected": 19488014.4, + "logps/chosen": -181.2943603515625, + "logps/rejected": -141.72918701171875, + "loss": 0.4741385459899902, + "rewards/chosen": -0.3969358682632446, + "rewards/margins": 0.048303866386413596, + "rewards/rejected": -0.4452397346496582, + "step": 2350 + }, + { + "epoch": 0.944, + "grad_norm": 0.47924181818962097, + "kl": 5.855168342590332, + "learning_rate": 2.9344444444444446e-06, + "logits/chosen": 25643113.6, + "logits/rejected": 21872040.0, + "logps/chosen": -146.56474609375, + "logps/rejected": -149.1546630859375, + "loss": 0.4523441314697266, + "rewards/chosen": 0.34644312858581544, + "rewards/margins": 0.6119464874267578, + "rewards/rejected": -0.2655033588409424, + "step": 2360 + }, + { + "epoch": 0.948, + "grad_norm": 0.6821103692054749, + "kl": 7.001960754394531, + "learning_rate": 2.9233333333333334e-06, + "logits/chosen": 26589932.8, + "logits/rejected": 24771849.6, + "logps/chosen": -132.04959716796876, + "logps/rejected": -197.91544189453126, + "loss": 0.5003488063812256, + "rewards/chosen": -0.008898758888244629, + "rewards/margins": 0.02103534936904907, + "rewards/rejected": -0.0299341082572937, + "step": 2370 + }, + { + "epoch": 0.952, + "grad_norm": 0.6097027063369751, + "kl": 6.377338409423828, + "learning_rate": 2.9122222222222222e-06, + "logits/chosen": 40187350.4, + "logits/rejected": 39877142.4, + "logps/chosen": -171.79857177734374, + "logps/rejected": -151.0936767578125, + "loss": 0.46953182220458983, + "rewards/chosen": 0.39073307514190675, + "rewards/margins": 0.4624105989933014, + "rewards/rejected": -0.07167752385139466, + "step": 2380 + }, + { + "epoch": 0.956, + "grad_norm": 0.8344343900680542, + "kl": 5.0947136878967285, + "learning_rate": 2.901111111111111e-06, + "logits/chosen": 27525568.0, + "logits/rejected": 27525084.8, + "logps/chosen": -174.18642578125, + "logps/rejected": -169.3951171875, + "loss": 0.4775404453277588, + "rewards/chosen": -0.11374995708465577, + "rewards/margins": 0.36258018016815186, + "rewards/rejected": -0.4763301372528076, + "step": 2390 + }, + { + "epoch": 0.96, + "grad_norm": 0.5999415516853333, + "kl": 5.195433616638184, + "learning_rate": 2.89e-06, + "logits/chosen": 32106156.8, + "logits/rejected": 31147836.8, + "logps/chosen": -166.06192626953126, + "logps/rejected": -175.74422607421874, + "loss": 0.4666886329650879, + "rewards/chosen": 0.09231564402580261, + "rewards/margins": 0.445311564207077, + "rewards/rejected": -0.3529959201812744, + "step": 2400 + }, + { + "epoch": 0.96, + "eval_kl": 5.085776329040527, + "eval_logits/chosen": 27241426.944, + "eval_logits/rejected": 27194333.184, + "eval_logps/chosen": -154.09196875, + "eval_logps/rejected": -150.0654375, + "eval_loss": 0.4826502501964569, + "eval_rewards/chosen": 0.023380521774291993, + "eval_rewards/margins": 0.16565044975280763, + "eval_rewards/rejected": -0.14226992797851562, + "eval_runtime": 216.6502, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 2400 + }, + { + "epoch": 0.964, + "grad_norm": 0.6591479182243347, + "kl": 4.856285095214844, + "learning_rate": 2.8788888888888895e-06, + "logits/chosen": 33843148.8, + "logits/rejected": 33023673.6, + "logps/chosen": -162.89716796875, + "logps/rejected": -152.8024169921875, + "loss": 0.4535430908203125, + "rewards/chosen": 0.35766189098358153, + "rewards/margins": 0.5051510214805603, + "rewards/rejected": -0.14748913049697876, + "step": 2410 + }, + { + "epoch": 0.968, + "grad_norm": 0.649363100528717, + "kl": 5.67615270614624, + "learning_rate": 2.8677777777777783e-06, + "logits/chosen": 28120470.4, + "logits/rejected": 28187414.4, + "logps/chosen": -148.36553955078125, + "logps/rejected": -164.37691650390624, + "loss": 0.49935593605041506, + "rewards/chosen": -0.13897392749786378, + "rewards/margins": 0.09212601184844971, + "rewards/rejected": -0.23109993934631348, + "step": 2420 + }, + { + "epoch": 0.972, + "grad_norm": 0.7029784321784973, + "kl": 5.981629848480225, + "learning_rate": 2.856666666666667e-06, + "logits/chosen": 33376736.0, + "logits/rejected": 35385472.0, + "logps/chosen": -160.655419921875, + "logps/rejected": -113.14091796875, + "loss": 0.5014323711395263, + "rewards/chosen": -0.06511507034301758, + "rewards/margins": -0.10962846279144288, + "rewards/rejected": 0.044513392448425296, + "step": 2430 + }, + { + "epoch": 0.976, + "grad_norm": 0.5741814970970154, + "kl": 7.015416145324707, + "learning_rate": 2.845555555555556e-06, + "logits/chosen": 24375812.8, + "logits/rejected": 23925715.2, + "logps/chosen": -142.3306884765625, + "logps/rejected": -145.99935302734374, + "loss": 0.47071352005004885, + "rewards/chosen": 0.3844744205474854, + "rewards/margins": 0.33515343666076663, + "rewards/rejected": 0.04932098388671875, + "step": 2440 + }, + { + "epoch": 0.98, + "grad_norm": 0.708365261554718, + "kl": 7.640904426574707, + "learning_rate": 2.8344444444444447e-06, + "logits/chosen": 36292083.2, + "logits/rejected": 33427609.6, + "logps/chosen": -175.033447265625, + "logps/rejected": -175.34300537109374, + "loss": 0.46329379081726074, + "rewards/chosen": 0.707914161682129, + "rewards/margins": 0.43219349384307865, + "rewards/rejected": 0.2757206678390503, + "step": 2450 + }, + { + "epoch": 0.984, + "grad_norm": 0.8229350447654724, + "kl": 6.793179512023926, + "learning_rate": 2.8233333333333335e-06, + "logits/chosen": 34248473.6, + "logits/rejected": 34939712.0, + "logps/chosen": -144.50880126953126, + "logps/rejected": -149.553759765625, + "loss": 0.49341444969177245, + "rewards/chosen": 0.4542993545532227, + "rewards/margins": 0.14487073421478275, + "rewards/rejected": 0.30942862033843993, + "step": 2460 + }, + { + "epoch": 0.988, + "grad_norm": 0.8729678392410278, + "kl": 6.059536933898926, + "learning_rate": 2.8122222222222224e-06, + "logits/chosen": 39128422.4, + "logits/rejected": 35834524.8, + "logps/chosen": -160.21749267578124, + "logps/rejected": -118.927099609375, + "loss": 0.4856616973876953, + "rewards/chosen": 0.2684544324874878, + "rewards/margins": 0.2460126757621765, + "rewards/rejected": 0.02244175672531128, + "step": 2470 + }, + { + "epoch": 0.992, + "grad_norm": 0.7808408737182617, + "kl": 4.119040489196777, + "learning_rate": 2.801111111111111e-06, + "logits/chosen": 14563339.2, + "logits/rejected": 14386867.2, + "logps/chosen": -131.0562255859375, + "logps/rejected": -109.05662841796875, + "loss": 0.50515718460083, + "rewards/chosen": -0.17694272994995117, + "rewards/margins": -0.02005159854888916, + "rewards/rejected": -0.156891131401062, + "step": 2480 + }, + { + "epoch": 0.996, + "grad_norm": 0.7683461904525757, + "kl": 5.681182861328125, + "learning_rate": 2.7900000000000004e-06, + "logits/chosen": 34791257.6, + "logits/rejected": 35001116.8, + "logps/chosen": -116.57052001953124, + "logps/rejected": -133.0627197265625, + "loss": 0.46341490745544434, + "rewards/chosen": 0.3942615032196045, + "rewards/margins": 0.4242114990949631, + "rewards/rejected": -0.02994999587535858, + "step": 2490 + }, + { + "epoch": 1.0, + "grad_norm": 0.7146331667900085, + "kl": 7.186850547790527, + "learning_rate": 2.778888888888889e-06, + "logits/chosen": 27759424.0, + "logits/rejected": 28190390.4, + "logps/chosen": -158.3378173828125, + "logps/rejected": -122.17666015625, + "loss": 0.4907883644104004, + "rewards/chosen": 0.33272812366485593, + "rewards/margins": 3.4856796264637335e-05, + "rewards/rejected": 0.3326932668685913, + "step": 2500 + }, + { + "epoch": 1.004, + "grad_norm": 0.7267434597015381, + "kl": 7.022622108459473, + "learning_rate": 2.767777777777778e-06, + "logits/chosen": 23414056.0, + "logits/rejected": 23530460.8, + "logps/chosen": -142.120947265625, + "logps/rejected": -126.23233642578126, + "loss": 0.450551700592041, + "rewards/chosen": 0.6840017318725586, + "rewards/margins": 0.4234133481979371, + "rewards/rejected": 0.26058838367462156, + "step": 2510 + }, + { + "epoch": 1.008, + "grad_norm": 0.613120436668396, + "kl": 7.363889217376709, + "learning_rate": 2.756666666666667e-06, + "logits/chosen": 42853379.2, + "logits/rejected": 42718368.0, + "logps/chosen": -133.13275146484375, + "logps/rejected": -147.75242919921874, + "loss": 0.4773738384246826, + "rewards/chosen": 0.6898352622985839, + "rewards/margins": 0.2725923061370849, + "rewards/rejected": 0.41724295616149903, + "step": 2520 + }, + { + "epoch": 1.012, + "grad_norm": 0.4656667113304138, + "kl": 6.543205261230469, + "learning_rate": 2.7455555555555556e-06, + "logits/chosen": 24894561.6, + "logits/rejected": 23945945.6, + "logps/chosen": -130.03875732421875, + "logps/rejected": -139.270556640625, + "loss": 0.48987507820129395, + "rewards/chosen": 0.16998794078826904, + "rewards/margins": 0.060191738605499256, + "rewards/rejected": 0.10979620218276978, + "step": 2530 + }, + { + "epoch": 1.016, + "grad_norm": 0.6344980597496033, + "kl": 8.745767593383789, + "learning_rate": 2.7344444444444444e-06, + "logits/chosen": 33636630.4, + "logits/rejected": 33898816.0, + "logps/chosen": -137.0499755859375, + "logps/rejected": -142.44915771484375, + "loss": 0.46440706253051756, + "rewards/chosen": 0.7247509479522705, + "rewards/margins": 0.3660990238189697, + "rewards/rejected": 0.35865192413330077, + "step": 2540 + }, + { + "epoch": 1.02, + "grad_norm": 0.5636667013168335, + "kl": 5.15373420715332, + "learning_rate": 2.7233333333333332e-06, + "logits/chosen": 32778352.0, + "logits/rejected": 34006931.2, + "logps/chosen": -138.170361328125, + "logps/rejected": -156.81767578125, + "loss": 0.4536026954650879, + "rewards/chosen": 0.28399336338043213, + "rewards/margins": 0.5328751564025879, + "rewards/rejected": -0.24888179302215577, + "step": 2550 + }, + { + "epoch": 1.024, + "grad_norm": 0.5508406758308411, + "kl": 4.445399284362793, + "learning_rate": 2.712222222222222e-06, + "logits/chosen": 24235310.4, + "logits/rejected": 20467011.2, + "logps/chosen": -102.88858642578126, + "logps/rejected": -117.8940673828125, + "loss": 0.4521032333374023, + "rewards/chosen": 0.2849747180938721, + "rewards/margins": 0.6732351303100585, + "rewards/rejected": -0.3882604122161865, + "step": 2560 + }, + { + "epoch": 1.028, + "grad_norm": 0.6794329881668091, + "kl": 7.5771074295043945, + "learning_rate": 2.7011111111111117e-06, + "logits/chosen": 39230246.4, + "logits/rejected": 36269590.4, + "logps/chosen": -160.77816162109374, + "logps/rejected": -175.435302734375, + "loss": 0.4510225296020508, + "rewards/chosen": 0.48538646697998045, + "rewards/margins": 0.5213055074214935, + "rewards/rejected": -0.03591904044151306, + "step": 2570 + }, + { + "epoch": 1.032, + "grad_norm": 0.873762845993042, + "kl": 7.515707969665527, + "learning_rate": 2.6900000000000005e-06, + "logits/chosen": 34034691.2, + "logits/rejected": 32782438.4, + "logps/chosen": -146.67379150390624, + "logps/rejected": -160.89415283203124, + "loss": 0.4409791946411133, + "rewards/chosen": 0.8245258331298828, + "rewards/margins": 0.5631396770477295, + "rewards/rejected": 0.2613861560821533, + "step": 2580 + }, + { + "epoch": 1.036, + "grad_norm": 0.8786899447441101, + "kl": 7.477902889251709, + "learning_rate": 2.6788888888888893e-06, + "logits/chosen": 31405507.2, + "logits/rejected": 30546566.4, + "logps/chosen": -162.37747802734376, + "logps/rejected": -158.5367919921875, + "loss": 0.429317569732666, + "rewards/chosen": 0.7475490093231201, + "rewards/margins": 0.6858846783638, + "rewards/rejected": 0.06166433095932007, + "step": 2590 + }, + { + "epoch": 1.04, + "grad_norm": 0.9408835768699646, + "kl": 6.193826198577881, + "learning_rate": 2.667777777777778e-06, + "logits/chosen": 18314294.4, + "logits/rejected": 17261046.4, + "logps/chosen": -133.769482421875, + "logps/rejected": -166.3760498046875, + "loss": 0.4750513553619385, + "rewards/chosen": 0.05294798612594605, + "rewards/margins": 0.23716256618499754, + "rewards/rejected": -0.1842145800590515, + "step": 2600 + }, + { + "epoch": 1.04, + "eval_kl": 6.557363986968994, + "eval_logits/chosen": 28159451.136, + "eval_logits/rejected": 27912509.44, + "eval_logps/chosen": -150.601921875, + "eval_logps/rejected": -146.599171875, + "eval_loss": 0.4829034209251404, + "eval_rewards/chosen": 0.3723853454589844, + "eval_rewards/margins": 0.16802960205078127, + "eval_rewards/rejected": 0.20435574340820312, + "eval_runtime": 217.1791, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 2.302, + "step": 2600 + }, + { + "epoch": 1.044, + "grad_norm": 0.6559078693389893, + "kl": 6.465074062347412, + "learning_rate": 2.656666666666667e-06, + "logits/chosen": 29268515.2, + "logits/rejected": 27382860.8, + "logps/chosen": -124.25496826171874, + "logps/rejected": -132.88599853515626, + "loss": 0.46654496192932127, + "rewards/chosen": 0.6311461448669433, + "rewards/margins": 0.35537390708923333, + "rewards/rejected": 0.27577223777771, + "step": 2610 + }, + { + "epoch": 1.048, + "grad_norm": 0.8091041445732117, + "kl": 10.043633460998535, + "learning_rate": 2.6455555555555557e-06, + "logits/chosen": 36276444.8, + "logits/rejected": 36037961.6, + "logps/chosen": -156.52952880859374, + "logps/rejected": -128.44246826171874, + "loss": 0.49141683578491213, + "rewards/chosen": 0.5628880023956299, + "rewards/margins": 0.02750706672668457, + "rewards/rejected": 0.5353809356689453, + "step": 2620 + }, + { + "epoch": 1.052, + "grad_norm": 0.6571462154388428, + "kl": 9.852654457092285, + "learning_rate": 2.6344444444444445e-06, + "logits/chosen": 31615228.8, + "logits/rejected": 32248979.2, + "logps/chosen": -154.07755126953126, + "logps/rejected": -132.39072265625, + "loss": 0.49523077011108396, + "rewards/chosen": 0.8096317291259766, + "rewards/margins": 0.018443870544433638, + "rewards/rejected": 0.791187858581543, + "step": 2630 + }, + { + "epoch": 1.056, + "grad_norm": 0.7390360832214355, + "kl": 6.219546318054199, + "learning_rate": 2.6233333333333333e-06, + "logits/chosen": 34434326.4, + "logits/rejected": 35229926.4, + "logps/chosen": -137.31668701171876, + "logps/rejected": -155.1895751953125, + "loss": 0.4699239730834961, + "rewards/chosen": 0.5334546089172363, + "rewards/margins": 0.24749846458435054, + "rewards/rejected": 0.28595614433288574, + "step": 2640 + }, + { + "epoch": 1.06, + "grad_norm": 0.4814999997615814, + "kl": 7.876091003417969, + "learning_rate": 2.6122222222222226e-06, + "logits/chosen": 31323043.2, + "logits/rejected": 32345257.6, + "logps/chosen": -123.53333740234375, + "logps/rejected": -115.850341796875, + "loss": 0.43686504364013673, + "rewards/chosen": 0.7052061557769775, + "rewards/margins": 0.533865237236023, + "rewards/rejected": 0.1713409185409546, + "step": 2650 + }, + { + "epoch": 1.064, + "grad_norm": 0.690242350101471, + "kl": 6.12372350692749, + "learning_rate": 2.6011111111111114e-06, + "logits/chosen": 25323872.0, + "logits/rejected": 25615576.0, + "logps/chosen": -141.92086181640624, + "logps/rejected": -147.47308349609375, + "loss": 0.4814923763275146, + "rewards/chosen": 0.1965832829475403, + "rewards/margins": 0.2016111582517624, + "rewards/rejected": -0.005027875304222107, + "step": 2660 + }, + { + "epoch": 1.068, + "grad_norm": 0.5838690400123596, + "kl": 5.5848894119262695, + "learning_rate": 2.59e-06, + "logits/chosen": 26498441.6, + "logits/rejected": 25519251.2, + "logps/chosen": -134.1501220703125, + "logps/rejected": -152.97462158203126, + "loss": 0.44647746086120604, + "rewards/chosen": 0.45215396881103515, + "rewards/margins": 0.7281685590744018, + "rewards/rejected": -0.2760145902633667, + "step": 2670 + }, + { + "epoch": 1.072, + "grad_norm": 0.6714196801185608, + "kl": 6.437767028808594, + "learning_rate": 2.578888888888889e-06, + "logits/chosen": 33809168.0, + "logits/rejected": 33002240.0, + "logps/chosen": -140.62269287109376, + "logps/rejected": -169.9681640625, + "loss": 0.46931910514831543, + "rewards/chosen": 0.5085949420928955, + "rewards/margins": 0.36951395273208615, + "rewards/rejected": 0.13908098936080932, + "step": 2680 + }, + { + "epoch": 1.076, + "grad_norm": 0.6106992959976196, + "kl": 4.450573921203613, + "learning_rate": 2.567777777777778e-06, + "logits/chosen": 30799760.0, + "logits/rejected": 30721590.4, + "logps/chosen": -122.03265380859375, + "logps/rejected": -137.297802734375, + "loss": 0.4664300441741943, + "rewards/chosen": 0.25887534618377683, + "rewards/margins": 0.22206425368785856, + "rewards/rejected": 0.036811092495918275, + "step": 2690 + }, + { + "epoch": 1.08, + "grad_norm": 0.9053173661231995, + "kl": 6.650594234466553, + "learning_rate": 2.5566666666666666e-06, + "logits/chosen": 38100038.4, + "logits/rejected": 34380816.0, + "logps/chosen": -145.046533203125, + "logps/rejected": -173.3657958984375, + "loss": 0.43851666450500487, + "rewards/chosen": 0.6501460552215577, + "rewards/margins": 0.6605178594589234, + "rewards/rejected": -0.010371804237365723, + "step": 2700 + }, + { + "epoch": 1.084, + "grad_norm": 0.676274836063385, + "kl": 3.7379002571105957, + "learning_rate": 2.5455555555555554e-06, + "logits/chosen": 19580768.0, + "logits/rejected": 20510683.2, + "logps/chosen": -145.21326904296876, + "logps/rejected": -136.80472412109376, + "loss": 0.5046684741973877, + "rewards/chosen": -0.46685400009155276, + "rewards/margins": -0.04879570007324219, + "rewards/rejected": -0.41805830001831057, + "step": 2710 + }, + { + "epoch": 1.088, + "grad_norm": 0.7222055792808533, + "kl": 5.816348075866699, + "learning_rate": 2.534444444444445e-06, + "logits/chosen": 22407427.2, + "logits/rejected": 21474601.6, + "logps/chosen": -100.11134643554688, + "logps/rejected": -127.4869384765625, + "loss": 0.4877651214599609, + "rewards/chosen": 0.17378766536712648, + "rewards/margins": 0.2759766340255737, + "rewards/rejected": -0.10218896865844726, + "step": 2720 + }, + { + "epoch": 1.092, + "grad_norm": 0.6274256110191345, + "kl": 5.171383857727051, + "learning_rate": 2.523333333333334e-06, + "logits/chosen": 35835520.0, + "logits/rejected": 35165875.2, + "logps/chosen": -186.44775390625, + "logps/rejected": -167.0265869140625, + "loss": 0.466900634765625, + "rewards/chosen": -0.10870237350463867, + "rewards/margins": 0.21330931186676022, + "rewards/rejected": -0.3220116853713989, + "step": 2730 + }, + { + "epoch": 1.096, + "grad_norm": 0.6451675295829773, + "kl": 5.268320083618164, + "learning_rate": 2.5122222222222227e-06, + "logits/chosen": 32315388.8, + "logits/rejected": 34424963.2, + "logps/chosen": -177.06943359375, + "logps/rejected": -168.0710205078125, + "loss": 0.47601852416992185, + "rewards/chosen": -0.004057984054088593, + "rewards/margins": 0.10740263015031815, + "rewards/rejected": -0.11146061420440674, + "step": 2740 + }, + { + "epoch": 1.1, + "grad_norm": 0.9512624740600586, + "kl": 5.623786926269531, + "learning_rate": 2.5011111111111115e-06, + "logits/chosen": 24552736.0, + "logits/rejected": 23781004.8, + "logps/chosen": -174.98658447265626, + "logps/rejected": -152.33095703125, + "loss": 0.42731657028198244, + "rewards/chosen": 0.17493221759796143, + "rewards/margins": 0.8694432020187378, + "rewards/rejected": -0.6945109844207764, + "step": 2750 + }, + { + "epoch": 1.104, + "grad_norm": 0.8605503439903259, + "kl": 3.258263111114502, + "learning_rate": 2.4900000000000003e-06, + "logits/chosen": 17540195.2, + "logits/rejected": 14481008.0, + "logps/chosen": -130.9520263671875, + "logps/rejected": -140.87447509765624, + "loss": 0.4474031925201416, + "rewards/chosen": -0.1570604920387268, + "rewards/margins": 0.7482632040977477, + "rewards/rejected": -0.9053236961364746, + "step": 2760 + }, + { + "epoch": 1.108, + "grad_norm": 0.8902762532234192, + "kl": 6.445823669433594, + "learning_rate": 2.478888888888889e-06, + "logits/chosen": 25843875.2, + "logits/rejected": 25419281.6, + "logps/chosen": -155.84178466796874, + "logps/rejected": -156.02589111328126, + "loss": 0.464084529876709, + "rewards/chosen": 0.1782880425453186, + "rewards/margins": 0.5563196301460266, + "rewards/rejected": -0.378031587600708, + "step": 2770 + }, + { + "epoch": 1.112, + "grad_norm": 0.7916592359542847, + "kl": 5.980124473571777, + "learning_rate": 2.467777777777778e-06, + "logits/chosen": 23357057.6, + "logits/rejected": 19639329.6, + "logps/chosen": -165.2984619140625, + "logps/rejected": -144.48494873046874, + "loss": 0.44487595558166504, + "rewards/chosen": 0.28723764419555664, + "rewards/margins": 0.6892455577850342, + "rewards/rejected": -0.4020079135894775, + "step": 2780 + }, + { + "epoch": 1.116, + "grad_norm": 0.5942727327346802, + "kl": 7.625303745269775, + "learning_rate": 2.4566666666666667e-06, + "logits/chosen": 22922739.2, + "logits/rejected": 22430774.4, + "logps/chosen": -139.4244873046875, + "logps/rejected": -147.5355224609375, + "loss": 0.43770174980163573, + "rewards/chosen": 0.7275904655456543, + "rewards/margins": 0.641109848022461, + "rewards/rejected": 0.08648061752319336, + "step": 2790 + }, + { + "epoch": 1.12, + "grad_norm": 0.6265588402748108, + "kl": 5.79810094833374, + "learning_rate": 2.4455555555555555e-06, + "logits/chosen": 33326956.8, + "logits/rejected": 33234153.6, + "logps/chosen": -165.5775634765625, + "logps/rejected": -181.19012451171875, + "loss": 0.46999220848083495, + "rewards/chosen": 0.3356909275054932, + "rewards/margins": 0.41499342918396, + "rewards/rejected": -0.0793025016784668, + "step": 2800 + }, + { + "epoch": 1.12, + "eval_kl": 6.51247501373291, + "eval_logits/chosen": 31125557.248, + "eval_logits/rejected": 30991392.768, + "eval_logps/chosen": -150.6563125, + "eval_logps/rejected": -146.877875, + "eval_loss": 0.480685293674469, + "eval_rewards/chosen": 0.36694525146484375, + "eval_rewards/margins": 0.19045977783203125, + "eval_rewards/rejected": 0.1764854736328125, + "eval_runtime": 217.0415, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 2.304, + "step": 2800 + }, + { + "epoch": 1.124, + "grad_norm": 0.6718530654907227, + "kl": 7.274069309234619, + "learning_rate": 2.4344444444444448e-06, + "logits/chosen": 28781395.2, + "logits/rejected": 28910336.0, + "logps/chosen": -142.17664794921876, + "logps/rejected": -153.1429443359375, + "loss": 0.48170881271362304, + "rewards/chosen": 0.4687415599822998, + "rewards/margins": 0.17820630073547367, + "rewards/rejected": 0.29053525924682616, + "step": 2810 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.8993642330169678, + "kl": 5.114150047302246, + "learning_rate": 2.4233333333333336e-06, + "logits/chosen": 31094944.0, + "logits/rejected": 29708396.8, + "logps/chosen": -158.43695068359375, + "logps/rejected": -134.23763427734374, + "loss": 0.4788343906402588, + "rewards/chosen": 0.18828521966934203, + "rewards/margins": 0.20167077183723447, + "rewards/rejected": -0.013385552167892455, + "step": 2820 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 0.5459766387939453, + "kl": 6.137971878051758, + "learning_rate": 2.4122222222222224e-06, + "logits/chosen": 41590592.0, + "logits/rejected": 39072947.2, + "logps/chosen": -145.41527099609374, + "logps/rejected": -144.8617919921875, + "loss": 0.40869617462158203, + "rewards/chosen": 0.8426953315734863, + "rewards/margins": 0.9073116958141326, + "rewards/rejected": -0.06461636424064636, + "step": 2830 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.6354557275772095, + "kl": 5.928516387939453, + "learning_rate": 2.401111111111111e-06, + "logits/chosen": 34861424.0, + "logits/rejected": 34759747.2, + "logps/chosen": -129.4930908203125, + "logps/rejected": -143.89154052734375, + "loss": 0.44280567169189455, + "rewards/chosen": 0.5408330440521241, + "rewards/margins": 0.5000333577394486, + "rewards/rejected": 0.04079968631267548, + "step": 2840 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.5932331085205078, + "kl": 8.782042503356934, + "learning_rate": 2.39e-06, + "logits/chosen": 32643923.2, + "logits/rejected": 30833843.2, + "logps/chosen": -127.0966064453125, + "logps/rejected": -164.51356201171876, + "loss": 0.4796291351318359, + "rewards/chosen": 0.8198535919189454, + "rewards/margins": 0.17787570953369147, + "rewards/rejected": 0.6419778823852539, + "step": 2850 + }, + { + "epoch": 1.144, + "grad_norm": 0.4119536578655243, + "kl": 6.023087024688721, + "learning_rate": 2.3788888888888892e-06, + "logits/chosen": 34157424.0, + "logits/rejected": 33897881.6, + "logps/chosen": -142.58350830078126, + "logps/rejected": -112.63641357421875, + "loss": 0.44989490509033203, + "rewards/chosen": 0.5664341926574707, + "rewards/margins": 0.5420472577214241, + "rewards/rejected": 0.0243869349360466, + "step": 2860 + }, + { + "epoch": 1.148, + "grad_norm": 0.6733080744743347, + "kl": 7.073210716247559, + "learning_rate": 2.367777777777778e-06, + "logits/chosen": 32286940.8, + "logits/rejected": 32065385.6, + "logps/chosen": -144.9827392578125, + "logps/rejected": -150.515478515625, + "loss": 0.4916172981262207, + "rewards/chosen": 0.20184409618377686, + "rewards/margins": 0.11459586024284363, + "rewards/rejected": 0.08724823594093323, + "step": 2870 + }, + { + "epoch": 1.152, + "grad_norm": 1.03435480594635, + "kl": 6.8236589431762695, + "learning_rate": 2.356666666666667e-06, + "logits/chosen": 29720467.2, + "logits/rejected": 29326092.8, + "logps/chosen": -151.82506103515624, + "logps/rejected": -165.98399658203124, + "loss": 0.45775256156921384, + "rewards/chosen": 0.4520999908447266, + "rewards/margins": 0.2227289915084839, + "rewards/rejected": 0.22937099933624266, + "step": 2880 + }, + { + "epoch": 1.156, + "grad_norm": 0.5883368849754333, + "kl": 6.491732120513916, + "learning_rate": 2.3455555555555556e-06, + "logits/chosen": 23835811.2, + "logits/rejected": 23859849.6, + "logps/chosen": -170.26668701171874, + "logps/rejected": -150.8224609375, + "loss": 0.431504487991333, + "rewards/chosen": 0.614830207824707, + "rewards/margins": 0.6636435002088547, + "rewards/rejected": -0.048813292384147645, + "step": 2890 + }, + { + "epoch": 1.16, + "grad_norm": 0.7225193977355957, + "kl": 5.883708477020264, + "learning_rate": 2.334444444444445e-06, + "logits/chosen": 36251644.8, + "logits/rejected": 33482300.8, + "logps/chosen": -145.1569091796875, + "logps/rejected": -140.39755859375, + "loss": 0.4224736213684082, + "rewards/chosen": 0.4660323619842529, + "rewards/margins": 0.918277359008789, + "rewards/rejected": -0.45224499702453613, + "step": 2900 + }, + { + "epoch": 1.164, + "grad_norm": 0.550423800945282, + "kl": 6.711850643157959, + "learning_rate": 2.3233333333333337e-06, + "logits/chosen": 37449692.8, + "logits/rejected": 35041868.8, + "logps/chosen": -142.85999755859376, + "logps/rejected": -148.87371826171875, + "loss": 0.45527114868164065, + "rewards/chosen": 0.5728636741638183, + "rewards/margins": 0.5280719608068466, + "rewards/rejected": 0.04479171335697174, + "step": 2910 + }, + { + "epoch": 1.168, + "grad_norm": 0.8112925887107849, + "kl": 3.537787675857544, + "learning_rate": 2.3122222222222225e-06, + "logits/chosen": 24047856.0, + "logits/rejected": 23857336.0, + "logps/chosen": -164.10267333984376, + "logps/rejected": -137.96185302734375, + "loss": 0.44495596885681155, + "rewards/chosen": -0.09260135293006896, + "rewards/margins": 0.5166131436824798, + "rewards/rejected": -0.6092144966125488, + "step": 2920 + }, + { + "epoch": 1.172, + "grad_norm": 0.5182350277900696, + "kl": 5.1398210525512695, + "learning_rate": 2.3011111111111113e-06, + "logits/chosen": 24040510.4, + "logits/rejected": 25510265.6, + "logps/chosen": -170.7597412109375, + "logps/rejected": -124.09306640625, + "loss": 0.45585017204284667, + "rewards/chosen": 0.017396342754364014, + "rewards/margins": 0.4217635035514832, + "rewards/rejected": -0.40436716079711915, + "step": 2930 + }, + { + "epoch": 1.176, + "grad_norm": 0.7129570841789246, + "kl": 5.55086612701416, + "learning_rate": 2.29e-06, + "logits/chosen": 36301065.6, + "logits/rejected": 36730444.8, + "logps/chosen": -144.58111572265625, + "logps/rejected": -170.131298828125, + "loss": 0.45364060401916506, + "rewards/chosen": 0.47618856430053713, + "rewards/margins": 0.4523512840270996, + "rewards/rejected": 0.0238372802734375, + "step": 2940 + }, + { + "epoch": 1.18, + "grad_norm": 0.689380943775177, + "kl": 5.013358116149902, + "learning_rate": 2.278888888888889e-06, + "logits/chosen": 26987398.4, + "logits/rejected": 27705705.6, + "logps/chosen": -97.45775146484375, + "logps/rejected": -156.65115966796876, + "loss": 0.4797633171081543, + "rewards/chosen": 0.1765173554420471, + "rewards/margins": 0.27975412607192995, + "rewards/rejected": -0.10323677062988282, + "step": 2950 + }, + { + "epoch": 1.184, + "grad_norm": 0.5471240282058716, + "kl": 8.275094985961914, + "learning_rate": 2.2677777777777777e-06, + "logits/chosen": 40253052.8, + "logits/rejected": 37156057.6, + "logps/chosen": -157.5787109375, + "logps/rejected": -178.797021484375, + "loss": 0.46055126190185547, + "rewards/chosen": 0.7023271083831787, + "rewards/margins": 0.4890592336654663, + "rewards/rejected": 0.2132678747177124, + "step": 2960 + }, + { + "epoch": 1.188, + "grad_norm": 0.5812104344367981, + "kl": 6.715832710266113, + "learning_rate": 2.2566666666666665e-06, + "logits/chosen": 43975625.6, + "logits/rejected": 41445318.4, + "logps/chosen": -171.347216796875, + "logps/rejected": -180.57864990234376, + "loss": 0.4628589630126953, + "rewards/chosen": 0.22713685035705566, + "rewards/margins": 0.5415925025939942, + "rewards/rejected": -0.3144556522369385, + "step": 2970 + }, + { + "epoch": 1.192, + "grad_norm": 0.8072389364242554, + "kl": 5.419320583343506, + "learning_rate": 2.2455555555555557e-06, + "logits/chosen": 37647660.8, + "logits/rejected": 36517382.4, + "logps/chosen": -149.52344970703126, + "logps/rejected": -164.54786376953126, + "loss": 0.44607295989990237, + "rewards/chosen": 0.300301718711853, + "rewards/margins": 0.5973361253738403, + "rewards/rejected": -0.2970344066619873, + "step": 2980 + }, + { + "epoch": 1.196, + "grad_norm": 0.44364601373672485, + "kl": 6.054505825042725, + "learning_rate": 2.2344444444444446e-06, + "logits/chosen": 22235811.2, + "logits/rejected": 19985859.2, + "logps/chosen": -143.41585693359374, + "logps/rejected": -150.84788818359374, + "loss": 0.41753711700439455, + "rewards/chosen": 0.5826003074645996, + "rewards/margins": 0.9533658504486083, + "rewards/rejected": -0.37076554298400877, + "step": 2990 + }, + { + "epoch": 1.2, + "grad_norm": 0.541634202003479, + "kl": 8.647984504699707, + "learning_rate": 2.2233333333333334e-06, + "logits/chosen": 38845638.4, + "logits/rejected": 38343161.6, + "logps/chosen": -171.0084228515625, + "logps/rejected": -129.40447998046875, + "loss": 0.4108599662780762, + "rewards/chosen": 1.1798659324645997, + "rewards/margins": 0.8690209865570069, + "rewards/rejected": 0.31084494590759276, + "step": 3000 + }, + { + "epoch": 1.2, + "eval_kl": 5.96298360824585, + "eval_logits/chosen": 32690919.424, + "eval_logits/rejected": 32526981.12, + "eval_logps/chosen": -150.91603125, + "eval_logps/rejected": -147.333453125, + "eval_loss": 0.4783514738082886, + "eval_rewards/chosen": 0.3409757690429687, + "eval_rewards/margins": 0.21004846191406248, + "eval_rewards/rejected": 0.13092730712890624, + "eval_runtime": 216.7869, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 3000 + }, + { + "epoch": 1.204, + "grad_norm": 0.735106348991394, + "kl": 4.565227508544922, + "learning_rate": 2.212222222222222e-06, + "logits/chosen": 42755068.8, + "logits/rejected": 41808000.0, + "logps/chosen": -144.0309814453125, + "logps/rejected": -162.88955078125, + "loss": 0.4483033180236816, + "rewards/chosen": 0.3165964841842651, + "rewards/margins": 0.5639133214950561, + "rewards/rejected": -0.24731683731079102, + "step": 3010 + }, + { + "epoch": 1.208, + "grad_norm": 0.4859018623828888, + "kl": 6.566149711608887, + "learning_rate": 2.2011111111111114e-06, + "logits/chosen": 40423686.4, + "logits/rejected": 38088403.2, + "logps/chosen": -152.13575439453126, + "logps/rejected": -164.60054931640624, + "loss": 0.43638858795166013, + "rewards/chosen": 0.7344797134399415, + "rewards/margins": 0.7190215766429902, + "rewards/rejected": 0.015458136796951294, + "step": 3020 + }, + { + "epoch": 1.212, + "grad_norm": 0.5802381634712219, + "kl": 5.501416206359863, + "learning_rate": 2.19e-06, + "logits/chosen": 36018793.6, + "logits/rejected": 34128659.2, + "logps/chosen": -139.50609130859374, + "logps/rejected": -176.41171875, + "loss": 0.48515634536743163, + "rewards/chosen": 0.2475515365600586, + "rewards/margins": 0.1769363284111023, + "rewards/rejected": 0.0706152081489563, + "step": 3030 + }, + { + "epoch": 1.216, + "grad_norm": 0.9714307188987732, + "kl": 7.294144630432129, + "learning_rate": 2.178888888888889e-06, + "logits/chosen": 29871635.2, + "logits/rejected": 28148294.4, + "logps/chosen": -144.24107666015624, + "logps/rejected": -157.43570556640626, + "loss": 0.4299461364746094, + "rewards/chosen": 0.7857762336730957, + "rewards/margins": 0.6812341213226318, + "rewards/rejected": 0.10454211235046387, + "step": 3040 + }, + { + "epoch": 1.22, + "grad_norm": 0.6073058247566223, + "kl": 5.890301704406738, + "learning_rate": 2.1677777777777782e-06, + "logits/chosen": 36303308.8, + "logits/rejected": 36356560.0, + "logps/chosen": -151.838623046875, + "logps/rejected": -171.02061767578124, + "loss": 0.4609498977661133, + "rewards/chosen": 0.38731932640075684, + "rewards/margins": 0.6510258436203002, + "rewards/rejected": -0.26370651721954347, + "step": 3050 + }, + { + "epoch": 1.224, + "grad_norm": 0.5871905088424683, + "kl": 5.802463531494141, + "learning_rate": 2.156666666666667e-06, + "logits/chosen": 28059456.0, + "logits/rejected": 26678612.8, + "logps/chosen": -159.48240966796874, + "logps/rejected": -153.2945068359375, + "loss": 0.43854827880859376, + "rewards/chosen": 0.4585693359375, + "rewards/margins": 0.676022219657898, + "rewards/rejected": -0.21745288372039795, + "step": 3060 + }, + { + "epoch": 1.228, + "grad_norm": 0.4357960522174835, + "kl": 4.12323522567749, + "learning_rate": 2.145555555555556e-06, + "logits/chosen": 34136323.2, + "logits/rejected": 33183904.0, + "logps/chosen": -146.4657958984375, + "logps/rejected": -139.709375, + "loss": 0.43050317764282225, + "rewards/chosen": 0.3769852876663208, + "rewards/margins": 0.9553431749343871, + "rewards/rejected": -0.5783578872680664, + "step": 3070 + }, + { + "epoch": 1.232, + "grad_norm": 0.47064968943595886, + "kl": 7.40407657623291, + "learning_rate": 2.1344444444444447e-06, + "logits/chosen": 42982582.4, + "logits/rejected": 40440524.8, + "logps/chosen": -142.09869384765625, + "logps/rejected": -158.719091796875, + "loss": 0.42528462409973145, + "rewards/chosen": 0.7800788879394531, + "rewards/margins": 0.7373038113117218, + "rewards/rejected": 0.04277507662773132, + "step": 3080 + }, + { + "epoch": 1.236, + "grad_norm": 0.7308046221733093, + "kl": 7.210939884185791, + "learning_rate": 2.1233333333333335e-06, + "logits/chosen": 28548304.0, + "logits/rejected": 28032595.2, + "logps/chosen": -122.3899658203125, + "logps/rejected": -133.5687744140625, + "loss": 0.4277194976806641, + "rewards/chosen": 0.8977908134460449, + "rewards/margins": 0.6939298629760742, + "rewards/rejected": 0.20386095046997071, + "step": 3090 + }, + { + "epoch": 1.24, + "grad_norm": 0.8042078614234924, + "kl": 6.6927170753479, + "learning_rate": 2.1122222222222223e-06, + "logits/chosen": 27459798.4, + "logits/rejected": 29134211.2, + "logps/chosen": -143.5735107421875, + "logps/rejected": -149.86500244140626, + "loss": 0.46234517097473143, + "rewards/chosen": 0.5464875221252441, + "rewards/margins": 0.39129424095153803, + "rewards/rejected": 0.15519328117370607, + "step": 3100 + }, + { + "epoch": 1.244, + "grad_norm": 0.5080249309539795, + "kl": 7.83388614654541, + "learning_rate": 2.101111111111111e-06, + "logits/chosen": 44153635.2, + "logits/rejected": 41055212.8, + "logps/chosen": -187.93551025390624, + "logps/rejected": -171.871630859375, + "loss": 0.4076192378997803, + "rewards/chosen": 0.8624818801879883, + "rewards/margins": 1.0380724906921388, + "rewards/rejected": -0.1755906105041504, + "step": 3110 + }, + { + "epoch": 1.248, + "grad_norm": 0.7141006588935852, + "kl": 6.444394111633301, + "learning_rate": 2.09e-06, + "logits/chosen": 22851744.0, + "logits/rejected": 21574438.4, + "logps/chosen": -135.523486328125, + "logps/rejected": -137.11583251953124, + "loss": 0.4397727966308594, + "rewards/chosen": 0.6564054965972901, + "rewards/margins": 0.65584604293108, + "rewards/rejected": 0.0005594536662101746, + "step": 3120 + }, + { + "epoch": 1.252, + "grad_norm": 0.6408936977386475, + "kl": 8.526594161987305, + "learning_rate": 2.078888888888889e-06, + "logits/chosen": 37600892.8, + "logits/rejected": 38453331.2, + "logps/chosen": -134.79913330078125, + "logps/rejected": -169.9215087890625, + "loss": 0.47039794921875, + "rewards/chosen": 0.816160011291504, + "rewards/margins": 0.25816926956176767, + "rewards/rejected": 0.5579907417297363, + "step": 3130 + }, + { + "epoch": 1.256, + "grad_norm": 0.7661492228507996, + "kl": 7.337412357330322, + "learning_rate": 2.067777777777778e-06, + "logits/chosen": 37269350.4, + "logits/rejected": 36887142.4, + "logps/chosen": -185.44752197265626, + "logps/rejected": -150.3639404296875, + "loss": 0.4498098850250244, + "rewards/chosen": 0.6579087257385254, + "rewards/margins": 0.5174487590789796, + "rewards/rejected": 0.1404599666595459, + "step": 3140 + }, + { + "epoch": 1.26, + "grad_norm": 0.5819596648216248, + "kl": 9.288617134094238, + "learning_rate": 2.0566666666666667e-06, + "logits/chosen": 41466233.6, + "logits/rejected": 39139404.8, + "logps/chosen": -120.82750244140625, + "logps/rejected": -131.20218505859376, + "loss": 0.46291580200195315, + "rewards/chosen": 0.883179759979248, + "rewards/margins": 0.3774709701538086, + "rewards/rejected": 0.5057087898254394, + "step": 3150 + }, + { + "epoch": 1.264, + "grad_norm": 0.46087002754211426, + "kl": 8.114151000976562, + "learning_rate": 2.0455555555555555e-06, + "logits/chosen": 54930982.4, + "logits/rejected": 55207168.0, + "logps/chosen": -161.81624755859374, + "logps/rejected": -131.8754638671875, + "loss": 0.4237666130065918, + "rewards/chosen": 1.0106356620788575, + "rewards/margins": 0.6930557727813721, + "rewards/rejected": 0.3175798892974854, + "step": 3160 + }, + { + "epoch": 1.268, + "grad_norm": 0.6536068320274353, + "kl": 9.215258598327637, + "learning_rate": 2.0344444444444448e-06, + "logits/chosen": 40884854.4, + "logits/rejected": 41148396.8, + "logps/chosen": -174.63795166015626, + "logps/rejected": -183.98502197265626, + "loss": 0.410884428024292, + "rewards/chosen": 1.0293194770812988, + "rewards/margins": 0.9003942966461181, + "rewards/rejected": 0.12892518043518067, + "step": 3170 + }, + { + "epoch": 1.272, + "grad_norm": 0.6242780089378357, + "kl": 5.226318359375, + "learning_rate": 2.0233333333333336e-06, + "logits/chosen": 39783075.2, + "logits/rejected": 38742624.0, + "logps/chosen": -140.7747314453125, + "logps/rejected": -128.35888671875, + "loss": 0.4299330234527588, + "rewards/chosen": 0.5863895893096924, + "rewards/margins": 0.637285441160202, + "rewards/rejected": -0.05089585185050964, + "step": 3180 + }, + { + "epoch": 1.276, + "grad_norm": 0.46700039505958557, + "kl": 7.267230033874512, + "learning_rate": 2.0122222222222224e-06, + "logits/chosen": 41375708.8, + "logits/rejected": 38081993.6, + "logps/chosen": -153.80274658203126, + "logps/rejected": -176.299755859375, + "loss": 0.460453462600708, + "rewards/chosen": 0.6854756832122803, + "rewards/margins": 0.3931422710418701, + "rewards/rejected": 0.29233341217041015, + "step": 3190 + }, + { + "epoch": 1.28, + "grad_norm": 0.6310598850250244, + "kl": 5.743724346160889, + "learning_rate": 2.001111111111111e-06, + "logits/chosen": 34063660.8, + "logits/rejected": 31735603.2, + "logps/chosen": -153.7698974609375, + "logps/rejected": -144.71922607421874, + "loss": 0.46727771759033204, + "rewards/chosen": 0.4247100830078125, + "rewards/margins": 0.3391889691352844, + "rewards/rejected": 0.08552111387252807, + "step": 3200 + }, + { + "epoch": 1.28, + "eval_kl": 6.358611106872559, + "eval_logits/chosen": 35785789.44, + "eval_logits/rejected": 35595558.912, + "eval_logps/chosen": -149.448609375, + "eval_logps/rejected": -146.011734375, + "eval_loss": 0.47684431076049805, + "eval_rewards/chosen": 0.48771685791015623, + "eval_rewards/margins": 0.22461712646484372, + "eval_rewards/rejected": 0.2630997314453125, + "eval_runtime": 216.7977, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 3200 + }, + { + "epoch": 1.284, + "grad_norm": 0.5444361567497253, + "kl": 8.359007835388184, + "learning_rate": 1.9900000000000004e-06, + "logits/chosen": 38480896.0, + "logits/rejected": 37109968.0, + "logps/chosen": -166.1022705078125, + "logps/rejected": -135.0826416015625, + "loss": 0.4136789321899414, + "rewards/chosen": 1.1088358879089355, + "rewards/margins": 0.8896709442138672, + "rewards/rejected": 0.21916494369506836, + "step": 3210 + }, + { + "epoch": 1.288, + "grad_norm": 0.47785794734954834, + "kl": 5.674698829650879, + "learning_rate": 1.9788888888888892e-06, + "logits/chosen": 30067932.8, + "logits/rejected": 30675318.4, + "logps/chosen": -119.010009765625, + "logps/rejected": -125.2324462890625, + "loss": 0.4624322891235352, + "rewards/chosen": 0.3068151712417603, + "rewards/margins": 0.3403205394744873, + "rewards/rejected": -0.033505368232727054, + "step": 3220 + }, + { + "epoch": 1.292, + "grad_norm": 0.5500114560127258, + "kl": 6.791600704193115, + "learning_rate": 1.967777777777778e-06, + "logits/chosen": 34258720.0, + "logits/rejected": 31316112.0, + "logps/chosen": -148.19810791015624, + "logps/rejected": -154.41190185546876, + "loss": 0.449018669128418, + "rewards/chosen": 0.7993580818176269, + "rewards/margins": 0.4857671737670898, + "rewards/rejected": 0.31359090805053713, + "step": 3230 + }, + { + "epoch": 1.296, + "grad_norm": 0.5430236458778381, + "kl": 8.808231353759766, + "learning_rate": 1.956666666666667e-06, + "logits/chosen": 34977206.4, + "logits/rejected": 31934518.4, + "logps/chosen": -150.738330078125, + "logps/rejected": -182.1927490234375, + "loss": 0.45286874771118163, + "rewards/chosen": 0.8764358520507812, + "rewards/margins": 0.5628475189208983, + "rewards/rejected": 0.3135883331298828, + "step": 3240 + }, + { + "epoch": 1.3, + "grad_norm": 0.350079745054245, + "kl": 8.156396865844727, + "learning_rate": 1.9455555555555557e-06, + "logits/chosen": 38458761.6, + "logits/rejected": 36443888.0, + "logps/chosen": -153.3631103515625, + "logps/rejected": -182.10977783203126, + "loss": 0.4610450267791748, + "rewards/chosen": 0.7133102416992188, + "rewards/margins": 0.43579509258270266, + "rewards/rejected": 0.2775151491165161, + "step": 3250 + }, + { + "epoch": 1.304, + "grad_norm": 0.5987509489059448, + "kl": 4.856646537780762, + "learning_rate": 1.9344444444444445e-06, + "logits/chosen": 34302396.8, + "logits/rejected": 35170553.6, + "logps/chosen": -113.03687744140625, + "logps/rejected": -124.1705810546875, + "loss": 0.4647495269775391, + "rewards/chosen": 0.39636247158050536, + "rewards/margins": 0.2917425155639648, + "rewards/rejected": 0.10461995601654053, + "step": 3260 + }, + { + "epoch": 1.308, + "grad_norm": 0.6457140445709229, + "kl": 4.818373680114746, + "learning_rate": 1.9233333333333333e-06, + "logits/chosen": 43591721.6, + "logits/rejected": 44343180.8, + "logps/chosen": -137.19747314453124, + "logps/rejected": -153.1467529296875, + "loss": 0.42650656700134276, + "rewards/chosen": 0.6118185997009278, + "rewards/margins": 0.7147171497344971, + "rewards/rejected": -0.10289855003356933, + "step": 3270 + }, + { + "epoch": 1.312, + "grad_norm": 0.6024192571640015, + "kl": 4.331322193145752, + "learning_rate": 1.912222222222222e-06, + "logits/chosen": 40203308.8, + "logits/rejected": 41383084.8, + "logps/chosen": -156.63048095703124, + "logps/rejected": -144.2884033203125, + "loss": 0.4273221969604492, + "rewards/chosen": 0.6165127277374267, + "rewards/margins": 0.7079921424388885, + "rewards/rejected": -0.09147941470146179, + "step": 3280 + }, + { + "epoch": 1.316, + "grad_norm": 0.6954644918441772, + "kl": 4.1213908195495605, + "learning_rate": 1.9011111111111113e-06, + "logits/chosen": 28768950.4, + "logits/rejected": 24494809.6, + "logps/chosen": -143.09805908203126, + "logps/rejected": -139.8798095703125, + "loss": 0.4376859664916992, + "rewards/chosen": 0.23681788444519042, + "rewards/margins": 0.6746566295623779, + "rewards/rejected": -0.4378387451171875, + "step": 3290 + }, + { + "epoch": 1.32, + "grad_norm": 0.6745891571044922, + "kl": 4.750722408294678, + "learning_rate": 1.8900000000000001e-06, + "logits/chosen": 26675907.2, + "logits/rejected": 26277996.8, + "logps/chosen": -143.6562255859375, + "logps/rejected": -134.79100341796874, + "loss": 0.4433170795440674, + "rewards/chosen": 0.3683876276016235, + "rewards/margins": 0.5583112239837646, + "rewards/rejected": -0.18992359638214112, + "step": 3300 + }, + { + "epoch": 1.324, + "grad_norm": 0.5548813939094543, + "kl": 6.803833961486816, + "learning_rate": 1.878888888888889e-06, + "logits/chosen": 36996547.2, + "logits/rejected": 37710758.4, + "logps/chosen": -151.74556884765624, + "logps/rejected": -154.92069091796876, + "loss": 0.43219637870788574, + "rewards/chosen": 0.8092526435852051, + "rewards/margins": 0.6131466150283813, + "rewards/rejected": 0.19610602855682374, + "step": 3310 + }, + { + "epoch": 1.328, + "grad_norm": 0.5542740821838379, + "kl": 3.4922118186950684, + "learning_rate": 1.8677777777777777e-06, + "logits/chosen": 21493171.2, + "logits/rejected": 22701468.8, + "logps/chosen": -153.733984375, + "logps/rejected": -130.3265380859375, + "loss": 0.45499577522277834, + "rewards/chosen": 0.06440688967704773, + "rewards/margins": 0.4255514085292816, + "rewards/rejected": -0.36114451885223386, + "step": 3320 + }, + { + "epoch": 1.332, + "grad_norm": 0.5660212635993958, + "kl": 5.5110368728637695, + "learning_rate": 1.856666666666667e-06, + "logits/chosen": 25962280.0, + "logits/rejected": 25115568.0, + "logps/chosen": -157.3265625, + "logps/rejected": -127.7706298828125, + "loss": 0.45712642669677733, + "rewards/chosen": 0.25737948417663575, + "rewards/margins": 0.38217872381210327, + "rewards/rejected": -0.12479923963546753, + "step": 3330 + }, + { + "epoch": 1.336, + "grad_norm": 0.7863187193870544, + "kl": 4.013291835784912, + "learning_rate": 1.8455555555555558e-06, + "logits/chosen": 35347702.4, + "logits/rejected": 34501942.4, + "logps/chosen": -148.804345703125, + "logps/rejected": -153.26226806640625, + "loss": 0.4377838134765625, + "rewards/chosen": 0.25958924293518065, + "rewards/margins": 0.4975349426269531, + "rewards/rejected": -0.23794569969177246, + "step": 3340 + }, + { + "epoch": 1.34, + "grad_norm": 0.8637145757675171, + "kl": 6.854001522064209, + "learning_rate": 1.8344444444444446e-06, + "logits/chosen": 37165539.2, + "logits/rejected": 35276435.2, + "logps/chosen": -165.85059814453126, + "logps/rejected": -189.76441650390626, + "loss": 0.46552677154541017, + "rewards/chosen": 0.28020381927490234, + "rewards/margins": 0.23775566816329957, + "rewards/rejected": 0.042448151111602786, + "step": 3350 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.7173047661781311, + "kl": 6.264387130737305, + "learning_rate": 1.8233333333333334e-06, + "logits/chosen": 29046736.0, + "logits/rejected": 27349148.8, + "logps/chosen": -160.81956787109374, + "logps/rejected": -144.21968994140624, + "loss": 0.4449786186218262, + "rewards/chosen": 0.4654555320739746, + "rewards/margins": 0.5440494477748871, + "rewards/rejected": -0.07859391570091248, + "step": 3360 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 0.6456140279769897, + "kl": 3.3577468395233154, + "learning_rate": 1.8122222222222224e-06, + "logits/chosen": 32191424.0, + "logits/rejected": 29473795.2, + "logps/chosen": -138.01751708984375, + "logps/rejected": -148.85108642578126, + "loss": 0.4563105583190918, + "rewards/chosen": -0.22999329566955568, + "rewards/margins": 0.3565816402435303, + "rewards/rejected": -0.586574935913086, + "step": 3370 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.5196599960327148, + "kl": 4.913334369659424, + "learning_rate": 1.8011111111111112e-06, + "logits/chosen": 29135721.6, + "logits/rejected": 27177129.6, + "logps/chosen": -161.2333984375, + "logps/rejected": -136.623095703125, + "loss": 0.42957291603088377, + "rewards/chosen": 0.3112953186035156, + "rewards/margins": 0.6790343999862671, + "rewards/rejected": -0.36773908138275146, + "step": 3380 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 0.5491528511047363, + "kl": 2.574375867843628, + "learning_rate": 1.79e-06, + "logits/chosen": 39787881.6, + "logits/rejected": 37482313.6, + "logps/chosen": -132.85360107421874, + "logps/rejected": -138.88128662109375, + "loss": 0.4316267490386963, + "rewards/chosen": -0.03256496787071228, + "rewards/margins": 0.6733869135379791, + "rewards/rejected": -0.7059518814086914, + "step": 3390 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.5999844670295715, + "kl": 5.321146011352539, + "learning_rate": 1.7788888888888892e-06, + "logits/chosen": 34423936.0, + "logits/rejected": 35799561.6, + "logps/chosen": -169.89874267578125, + "logps/rejected": -162.8022705078125, + "loss": 0.4550165176391602, + "rewards/chosen": 0.13871285915374756, + "rewards/margins": 0.4647700548171997, + "rewards/rejected": -0.32605719566345215, + "step": 3400 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 3.4790494441986084, + "eval_logits/chosen": 29217603.584, + "eval_logits/rejected": 29389656.064, + "eval_logps/chosen": -156.61384375, + "eval_logps/rejected": -153.2220625, + "eval_loss": 0.47793951630592346, + "eval_rewards/chosen": -0.22880656433105467, + "eval_rewards/margins": 0.2291276092529297, + "eval_rewards/rejected": -0.45793417358398436, + "eval_runtime": 216.674, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 2.308, + "step": 3400 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 0.5353158712387085, + "kl": 3.89123272895813, + "learning_rate": 1.767777777777778e-06, + "logits/chosen": 30302416.0, + "logits/rejected": 30243564.8, + "logps/chosen": -163.517431640625, + "logps/rejected": -174.63206787109374, + "loss": 0.4478933334350586, + "rewards/chosen": 0.02733871340751648, + "rewards/margins": 0.6695918262004852, + "rewards/rejected": -0.6422531127929687, + "step": 3410 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.5308664441108704, + "kl": 3.7901718616485596, + "learning_rate": 1.7566666666666669e-06, + "logits/chosen": 30754256.0, + "logits/rejected": 27248163.2, + "logps/chosen": -130.69642333984376, + "logps/rejected": -149.67838134765626, + "loss": 0.454122257232666, + "rewards/chosen": 0.1454862356185913, + "rewards/margins": 0.4878753900527954, + "rewards/rejected": -0.3423891544342041, + "step": 3420 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 0.7273574471473694, + "kl": 4.090758323669434, + "learning_rate": 1.7455555555555557e-06, + "logits/chosen": 28009366.4, + "logits/rejected": 28281456.0, + "logps/chosen": -150.5, + "logps/rejected": -151.46649169921875, + "loss": 0.43383193016052246, + "rewards/chosen": 0.24967949390411376, + "rewards/margins": 0.7830450773239135, + "rewards/rejected": -0.5333655834197998, + "step": 3430 + }, + { + "epoch": 1.376, + "grad_norm": 0.3887825906276703, + "kl": 5.064081192016602, + "learning_rate": 1.7344444444444447e-06, + "logits/chosen": 27018854.4, + "logits/rejected": 24356123.2, + "logps/chosen": -130.98511962890626, + "logps/rejected": -151.60482177734374, + "loss": 0.43924894332885744, + "rewards/chosen": 0.3166049957275391, + "rewards/margins": 0.6815865993499757, + "rewards/rejected": -0.36498160362243653, + "step": 3440 + }, + { + "epoch": 1.38, + "grad_norm": 0.4775777757167816, + "kl": 7.2533087730407715, + "learning_rate": 1.7233333333333335e-06, + "logits/chosen": 38613673.6, + "logits/rejected": 41955670.4, + "logps/chosen": -176.4742919921875, + "logps/rejected": -157.8961181640625, + "loss": 0.4113172054290771, + "rewards/chosen": 0.7336381912231446, + "rewards/margins": 0.7889788269996644, + "rewards/rejected": -0.05534063577651978, + "step": 3450 + }, + { + "epoch": 1.384, + "grad_norm": 0.7163631916046143, + "kl": 5.157084941864014, + "learning_rate": 1.7122222222222223e-06, + "logits/chosen": 35435958.4, + "logits/rejected": 36708380.8, + "logps/chosen": -135.27044677734375, + "logps/rejected": -149.26287841796875, + "loss": 0.49365973472595215, + "rewards/chosen": 0.14373122453689574, + "rewards/margins": 0.07738589048385619, + "rewards/rejected": 0.06634533405303955, + "step": 3460 + }, + { + "epoch": 1.388, + "grad_norm": 0.6470320224761963, + "kl": 4.387451648712158, + "learning_rate": 1.7011111111111111e-06, + "logits/chosen": 29047792.0, + "logits/rejected": 28239174.4, + "logps/chosen": -131.78316650390624, + "logps/rejected": -120.0147216796875, + "loss": 0.4465984344482422, + "rewards/chosen": 0.3296776294708252, + "rewards/margins": 0.6029248237609863, + "rewards/rejected": -0.27324719429016114, + "step": 3470 + }, + { + "epoch": 1.392, + "grad_norm": 0.6309516429901123, + "kl": 7.3288164138793945, + "learning_rate": 1.6900000000000003e-06, + "logits/chosen": 36852099.2, + "logits/rejected": 35572294.4, + "logps/chosen": -133.08834228515624, + "logps/rejected": -156.20654296875, + "loss": 0.44939703941345216, + "rewards/chosen": 0.7472519397735595, + "rewards/margins": 0.5482589960098266, + "rewards/rejected": 0.19899294376373292, + "step": 3480 + }, + { + "epoch": 1.396, + "grad_norm": 0.7260765433311462, + "kl": 3.237916946411133, + "learning_rate": 1.6788888888888891e-06, + "logits/chosen": 30554035.2, + "logits/rejected": 28196780.8, + "logps/chosen": -123.18148193359374, + "logps/rejected": -137.39403076171874, + "loss": 0.43561625480651855, + "rewards/chosen": 0.26550750732421874, + "rewards/margins": 0.6841002464294433, + "rewards/rejected": -0.4185927391052246, + "step": 3490 + }, + { + "epoch": 1.4, + "grad_norm": 0.4278745949268341, + "kl": 5.290976047515869, + "learning_rate": 1.667777777777778e-06, + "logits/chosen": 29699056.0, + "logits/rejected": 30936454.4, + "logps/chosen": -153.85557861328124, + "logps/rejected": -122.64144287109374, + "loss": 0.4286449909210205, + "rewards/chosen": 0.49454541206359864, + "rewards/margins": 0.6557791233062744, + "rewards/rejected": -0.16123371124267577, + "step": 3500 + }, + { + "epoch": 1.404, + "grad_norm": 0.7629099488258362, + "kl": 6.994016170501709, + "learning_rate": 1.6566666666666668e-06, + "logits/chosen": 26804969.6, + "logits/rejected": 27539884.8, + "logps/chosen": -131.692041015625, + "logps/rejected": -185.0650146484375, + "loss": 0.4522398948669434, + "rewards/chosen": 0.5013983726501465, + "rewards/margins": 0.40837590694427495, + "rewards/rejected": 0.09302246570587158, + "step": 3510 + }, + { + "epoch": 1.408, + "grad_norm": 0.67551189661026, + "kl": 4.912568092346191, + "learning_rate": 1.6455555555555558e-06, + "logits/chosen": 27143923.2, + "logits/rejected": 26963395.2, + "logps/chosen": -149.304931640625, + "logps/rejected": -139.75704345703124, + "loss": 0.44557414054870603, + "rewards/chosen": 0.3085124731063843, + "rewards/margins": 0.6712681531906128, + "rewards/rejected": -0.36275568008422854, + "step": 3520 + }, + { + "epoch": 1.412, + "grad_norm": 0.7234175801277161, + "kl": 5.856083869934082, + "learning_rate": 1.6344444444444446e-06, + "logits/chosen": 33198009.6, + "logits/rejected": 32032864.0, + "logps/chosen": -150.87943115234376, + "logps/rejected": -153.37586669921876, + "loss": 0.4352092742919922, + "rewards/chosen": 0.3994121074676514, + "rewards/margins": 0.5765307188034058, + "rewards/rejected": -0.1771186113357544, + "step": 3530 + }, + { + "epoch": 1.416, + "grad_norm": 0.7202039361000061, + "kl": 5.610236167907715, + "learning_rate": 1.6233333333333334e-06, + "logits/chosen": 25017616.0, + "logits/rejected": 26410630.4, + "logps/chosen": -187.0184814453125, + "logps/rejected": -115.09561767578126, + "loss": 0.4591636657714844, + "rewards/chosen": 0.3089368104934692, + "rewards/margins": 0.34654129743576045, + "rewards/rejected": -0.03760448694229126, + "step": 3540 + }, + { + "epoch": 1.42, + "grad_norm": 0.7653972506523132, + "kl": 4.185455322265625, + "learning_rate": 1.6122222222222222e-06, + "logits/chosen": 27766281.6, + "logits/rejected": 24358944.0, + "logps/chosen": -125.4722412109375, + "logps/rejected": -162.280615234375, + "loss": 0.4453754901885986, + "rewards/chosen": 0.24888882637023926, + "rewards/margins": 0.640946888923645, + "rewards/rejected": -0.39205806255340575, + "step": 3550 + }, + { + "epoch": 1.424, + "grad_norm": 0.5244606137275696, + "kl": 5.084301948547363, + "learning_rate": 1.6011111111111114e-06, + "logits/chosen": 38255152.0, + "logits/rejected": 35147708.8, + "logps/chosen": -169.7523681640625, + "logps/rejected": -189.486328125, + "loss": 0.4723203659057617, + "rewards/chosen": 0.08335857987403869, + "rewards/margins": 0.32414146065711974, + "rewards/rejected": -0.24078288078308105, + "step": 3560 + }, + { + "epoch": 1.428, + "grad_norm": 0.7249192595481873, + "kl": 6.4514336585998535, + "learning_rate": 1.5900000000000002e-06, + "logits/chosen": 31364140.8, + "logits/rejected": 32550483.2, + "logps/chosen": -136.05145263671875, + "logps/rejected": -165.39798583984376, + "loss": 0.43005828857421874, + "rewards/chosen": 0.6245347499847412, + "rewards/margins": 0.6684352219104767, + "rewards/rejected": -0.04390047192573547, + "step": 3570 + }, + { + "epoch": 1.432, + "grad_norm": 0.5767175555229187, + "kl": 3.358916759490967, + "learning_rate": 1.578888888888889e-06, + "logits/chosen": 34861779.2, + "logits/rejected": 36117113.6, + "logps/chosen": -153.4178466796875, + "logps/rejected": -138.995556640625, + "loss": 0.46177167892456056, + "rewards/chosen": 0.06014393568038941, + "rewards/margins": 0.33000377416610716, + "rewards/rejected": -0.26985983848571776, + "step": 3580 + }, + { + "epoch": 1.436, + "grad_norm": 0.8270474076271057, + "kl": 3.5596280097961426, + "learning_rate": 1.5677777777777778e-06, + "logits/chosen": 24936995.2, + "logits/rejected": 25041536.0, + "logps/chosen": -142.94605712890626, + "logps/rejected": -115.97581787109375, + "loss": 0.46255645751953123, + "rewards/chosen": 0.09056978225708008, + "rewards/margins": 0.3586315393447876, + "rewards/rejected": -0.2680617570877075, + "step": 3590 + }, + { + "epoch": 1.44, + "grad_norm": 0.712232768535614, + "kl": 3.8768982887268066, + "learning_rate": 1.5566666666666669e-06, + "logits/chosen": 31608688.0, + "logits/rejected": 29185264.0, + "logps/chosen": -135.2075439453125, + "logps/rejected": -174.82825927734376, + "loss": 0.45167975425720214, + "rewards/chosen": 0.0786507248878479, + "rewards/margins": 0.565507709980011, + "rewards/rejected": -0.4868569850921631, + "step": 3600 + }, + { + "epoch": 1.44, + "eval_kl": 4.0966901779174805, + "eval_logits/chosen": 30547773.44, + "eval_logits/rejected": 30678024.192, + "eval_logps/chosen": -154.22709375, + "eval_logps/rejected": -150.8435625, + "eval_loss": 0.47759392857551575, + "eval_rewards/chosen": 0.009867694854736328, + "eval_rewards/margins": 0.22995056533813477, + "eval_rewards/rejected": -0.22008287048339845, + "eval_runtime": 216.6033, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 2.308, + "step": 3600 + }, + { + "epoch": 1.444, + "grad_norm": 0.6500243544578552, + "kl": 4.5812273025512695, + "learning_rate": 1.5455555555555557e-06, + "logits/chosen": 35394022.4, + "logits/rejected": 35018995.2, + "logps/chosen": -116.204833984375, + "logps/rejected": -184.157275390625, + "loss": 0.4943058490753174, + "rewards/chosen": -0.007693278789520264, + "rewards/margins": 0.12241411209106444, + "rewards/rejected": -0.1301073908805847, + "step": 3610 + }, + { + "epoch": 1.448, + "grad_norm": 0.6954056024551392, + "kl": 5.1914801597595215, + "learning_rate": 1.5344444444444445e-06, + "logits/chosen": 44866396.8, + "logits/rejected": 43301910.4, + "logps/chosen": -144.904541015625, + "logps/rejected": -164.3775390625, + "loss": 0.41394357681274413, + "rewards/chosen": 0.45650997161865237, + "rewards/margins": 0.8667933464050293, + "rewards/rejected": -0.41028337478637694, + "step": 3620 + }, + { + "epoch": 1.452, + "grad_norm": 0.6982813477516174, + "kl": 4.627970218658447, + "learning_rate": 1.5233333333333333e-06, + "logits/chosen": 33511337.6, + "logits/rejected": 33426937.6, + "logps/chosen": -176.5933837890625, + "logps/rejected": -148.85958251953124, + "loss": 0.43149843215942385, + "rewards/chosen": 0.27549741268157957, + "rewards/margins": 0.6807630777359008, + "rewards/rejected": -0.40526566505432127, + "step": 3630 + }, + { + "epoch": 1.456, + "grad_norm": 0.472672700881958, + "kl": 4.886686325073242, + "learning_rate": 1.5122222222222225e-06, + "logits/chosen": 21423132.8, + "logits/rejected": 20745464.0, + "logps/chosen": -100.18471069335938, + "logps/rejected": -142.314697265625, + "loss": 0.45476489067077636, + "rewards/chosen": 0.453489875793457, + "rewards/margins": 0.5046129763126374, + "rewards/rejected": -0.0511231005191803, + "step": 3640 + }, + { + "epoch": 1.46, + "grad_norm": 0.6913832426071167, + "kl": 4.06036901473999, + "learning_rate": 1.5011111111111113e-06, + "logits/chosen": 41536867.2, + "logits/rejected": 40642899.2, + "logps/chosen": -215.0681396484375, + "logps/rejected": -168.06806640625, + "loss": 0.4454173564910889, + "rewards/chosen": -0.10317556858062744, + "rewards/margins": 0.5138320684432983, + "rewards/rejected": -0.6170076370239258, + "step": 3650 + }, + { + "epoch": 1.464, + "grad_norm": 0.5362917184829712, + "kl": 3.2193520069122314, + "learning_rate": 1.4900000000000001e-06, + "logits/chosen": 25776776.0, + "logits/rejected": 22911784.0, + "logps/chosen": -130.6083984375, + "logps/rejected": -168.973876953125, + "loss": 0.41051359176635743, + "rewards/chosen": 0.19750649929046632, + "rewards/margins": 0.9899675607681275, + "rewards/rejected": -0.7924610614776612, + "step": 3660 + }, + { + "epoch": 1.468, + "grad_norm": 0.9869509935379028, + "kl": 4.485353946685791, + "learning_rate": 1.478888888888889e-06, + "logits/chosen": 27324668.8, + "logits/rejected": 27301977.6, + "logps/chosen": -132.47244873046876, + "logps/rejected": -162.38021240234374, + "loss": 0.45110092163085935, + "rewards/chosen": -0.08881351947784424, + "rewards/margins": 0.4330620527267456, + "rewards/rejected": -0.5218755722045898, + "step": 3670 + }, + { + "epoch": 1.472, + "grad_norm": 0.8374236822128296, + "kl": 5.5675249099731445, + "learning_rate": 1.467777777777778e-06, + "logits/chosen": 30133260.8, + "logits/rejected": 27502342.4, + "logps/chosen": -132.419970703125, + "logps/rejected": -164.09249267578124, + "loss": 0.4621445655822754, + "rewards/chosen": 0.29708335399627683, + "rewards/margins": 0.5296570777893066, + "rewards/rejected": -0.2325737237930298, + "step": 3680 + }, + { + "epoch": 1.476, + "grad_norm": 0.5651530623435974, + "kl": 3.6750998497009277, + "learning_rate": 1.4566666666666668e-06, + "logits/chosen": 28068777.6, + "logits/rejected": 24100668.8, + "logps/chosen": -176.51123046875, + "logps/rejected": -184.35858154296875, + "loss": 0.4131460666656494, + "rewards/chosen": 0.15955194234848022, + "rewards/margins": 1.0650185942649841, + "rewards/rejected": -0.9054666519165039, + "step": 3690 + }, + { + "epoch": 1.48, + "grad_norm": 0.6101991534233093, + "kl": 3.6094698905944824, + "learning_rate": 1.4455555555555556e-06, + "logits/chosen": 28100012.8, + "logits/rejected": 26111475.2, + "logps/chosen": -138.737109375, + "logps/rejected": -156.04112548828124, + "loss": 0.4490304470062256, + "rewards/chosen": 0.15077462196350097, + "rewards/margins": 0.5411154270172119, + "rewards/rejected": -0.39034080505371094, + "step": 3700 + }, + { + "epoch": 1.484, + "grad_norm": 0.8218708038330078, + "kl": 3.022378444671631, + "learning_rate": 1.4344444444444446e-06, + "logits/chosen": 18033281.6, + "logits/rejected": 19812489.6, + "logps/chosen": -134.19481201171874, + "logps/rejected": -143.6683837890625, + "loss": 0.4570739269256592, + "rewards/chosen": -0.3303727626800537, + "rewards/margins": 0.5034278392791749, + "rewards/rejected": -0.8338006019592286, + "step": 3710 + }, + { + "epoch": 1.488, + "grad_norm": 0.6318350434303284, + "kl": 3.771150588989258, + "learning_rate": 1.4233333333333336e-06, + "logits/chosen": 26958435.2, + "logits/rejected": 23352366.4, + "logps/chosen": -178.9782470703125, + "logps/rejected": -194.87041015625, + "loss": 0.4509871482849121, + "rewards/chosen": -0.14689927101135253, + "rewards/margins": 0.5733826160430908, + "rewards/rejected": -0.7202818870544434, + "step": 3720 + }, + { + "epoch": 1.492, + "grad_norm": 0.6234843730926514, + "kl": 5.08270263671875, + "learning_rate": 1.4122222222222224e-06, + "logits/chosen": 24577870.4, + "logits/rejected": 24378513.6, + "logps/chosen": -144.27496337890625, + "logps/rejected": -157.0753662109375, + "loss": 0.439809513092041, + "rewards/chosen": 0.2378466844558716, + "rewards/margins": 0.495815110206604, + "rewards/rejected": -0.2579684257507324, + "step": 3730 + }, + { + "epoch": 1.496, + "grad_norm": 0.6093852519989014, + "kl": 3.4414265155792236, + "learning_rate": 1.4011111111111112e-06, + "logits/chosen": 25166454.4, + "logits/rejected": 25530366.4, + "logps/chosen": -121.027001953125, + "logps/rejected": -122.510009765625, + "loss": 0.45766735076904297, + "rewards/chosen": 0.05086352825164795, + "rewards/margins": 0.45518562793731693, + "rewards/rejected": -0.40432209968566896, + "step": 3740 + }, + { + "epoch": 1.5, + "grad_norm": 0.6537352204322815, + "kl": 3.7078990936279297, + "learning_rate": 1.3900000000000002e-06, + "logits/chosen": 19157790.4, + "logits/rejected": 19428764.8, + "logps/chosen": -108.5373046875, + "logps/rejected": -131.371728515625, + "loss": 0.4350168228149414, + "rewards/chosen": 0.38070919513702395, + "rewards/margins": 0.6320278406143189, + "rewards/rejected": -0.25131864547729493, + "step": 3750 + }, + { + "epoch": 1.504, + "grad_norm": 0.7157226204872131, + "kl": 3.980473041534424, + "learning_rate": 1.378888888888889e-06, + "logits/chosen": 24798140.8, + "logits/rejected": 23861692.8, + "logps/chosen": -141.9797607421875, + "logps/rejected": -169.5621337890625, + "loss": 0.39097282886505125, + "rewards/chosen": 0.2861147403717041, + "rewards/margins": 1.1024574756622314, + "rewards/rejected": -0.8163427352905274, + "step": 3760 + }, + { + "epoch": 1.508, + "grad_norm": 0.5401111245155334, + "kl": 3.915759563446045, + "learning_rate": 1.3677777777777779e-06, + "logits/chosen": 27341155.2, + "logits/rejected": 23659843.2, + "logps/chosen": -167.150537109375, + "logps/rejected": -156.52796630859376, + "loss": 0.4424854278564453, + "rewards/chosen": 0.20697882175445556, + "rewards/margins": 0.6797100305557251, + "rewards/rejected": -0.4727312088012695, + "step": 3770 + }, + { + "epoch": 1.512, + "grad_norm": 0.5865006446838379, + "kl": 3.9945666790008545, + "learning_rate": 1.3566666666666667e-06, + "logits/chosen": 43287993.6, + "logits/rejected": 42994304.0, + "logps/chosen": -151.379150390625, + "logps/rejected": -170.74891357421876, + "loss": 0.46252665519714353, + "rewards/chosen": 0.28137707710266113, + "rewards/margins": 0.3768645763397217, + "rewards/rejected": -0.09548749923706054, + "step": 3780 + }, + { + "epoch": 1.516, + "grad_norm": 0.7108325362205505, + "kl": 5.5239386558532715, + "learning_rate": 1.3455555555555557e-06, + "logits/chosen": 25728556.8, + "logits/rejected": 25374808.0, + "logps/chosen": -126.28800048828126, + "logps/rejected": -141.21763916015624, + "loss": 0.4297455310821533, + "rewards/chosen": 0.6269711494445801, + "rewards/margins": 0.7478980660438538, + "rewards/rejected": -0.12092691659927368, + "step": 3790 + }, + { + "epoch": 1.52, + "grad_norm": 0.5811319947242737, + "kl": 5.248955726623535, + "learning_rate": 1.3344444444444447e-06, + "logits/chosen": 33503753.6, + "logits/rejected": 31553878.4, + "logps/chosen": -178.1581298828125, + "logps/rejected": -143.417431640625, + "loss": 0.45870108604431153, + "rewards/chosen": 0.4684587001800537, + "rewards/margins": 0.4509533554315567, + "rewards/rejected": 0.01750534474849701, + "step": 3800 + }, + { + "epoch": 1.52, + "eval_kl": 4.183420181274414, + "eval_logits/chosen": 30411423.744, + "eval_logits/rejected": 30603616.256, + "eval_logps/chosen": -154.112203125, + "eval_logps/rejected": -150.76103125, + "eval_loss": 0.4779178202152252, + "eval_rewards/chosen": 0.021359254837036133, + "eval_rewards/margins": 0.23318927192687988, + "eval_rewards/rejected": -0.21183001708984375, + "eval_runtime": 217.1598, + "eval_samples_per_second": 4.605, + "eval_steps_per_second": 2.302, + "step": 3800 + }, + { + "epoch": 1.524, + "grad_norm": 0.7062050700187683, + "kl": 4.107216835021973, + "learning_rate": 1.3233333333333335e-06, + "logits/chosen": 26509099.2, + "logits/rejected": 24838300.8, + "logps/chosen": -137.1124267578125, + "logps/rejected": -149.33499755859376, + "loss": 0.44512219429016114, + "rewards/chosen": 0.20196728706359862, + "rewards/margins": 0.5584580659866333, + "rewards/rejected": -0.35649077892303466, + "step": 3810 + }, + { + "epoch": 1.528, + "grad_norm": 0.43405744433403015, + "kl": 6.5067572593688965, + "learning_rate": 1.3122222222222223e-06, + "logits/chosen": 34846220.8, + "logits/rejected": 33152172.8, + "logps/chosen": -144.03416748046874, + "logps/rejected": -156.00250244140625, + "loss": 0.3955928564071655, + "rewards/chosen": 0.8109316825866699, + "rewards/margins": 1.0338047742843628, + "rewards/rejected": -0.22287309169769287, + "step": 3820 + }, + { + "epoch": 1.532, + "grad_norm": 0.48609739542007446, + "kl": 4.6954779624938965, + "learning_rate": 1.3011111111111113e-06, + "logits/chosen": 24615228.8, + "logits/rejected": 25253913.6, + "logps/chosen": -152.94261474609374, + "logps/rejected": -162.1392578125, + "loss": 0.44533653259277345, + "rewards/chosen": -0.09745782017707824, + "rewards/margins": 0.5474663436412811, + "rewards/rejected": -0.6449241638183594, + "step": 3830 + }, + { + "epoch": 1.536, + "grad_norm": 0.8033897280693054, + "kl": 4.541080951690674, + "learning_rate": 1.2900000000000001e-06, + "logits/chosen": 23824740.8, + "logits/rejected": 25749264.0, + "logps/chosen": -148.12535400390624, + "logps/rejected": -132.90029296875, + "loss": 0.4263105869293213, + "rewards/chosen": 0.34121017456054686, + "rewards/margins": 0.7844597339630126, + "rewards/rejected": -0.4432495594024658, + "step": 3840 + }, + { + "epoch": 1.54, + "grad_norm": 0.5979334115982056, + "kl": 3.1920642852783203, + "learning_rate": 1.278888888888889e-06, + "logits/chosen": 25940036.8, + "logits/rejected": 25332123.2, + "logps/chosen": -125.19688720703125, + "logps/rejected": -124.583349609375, + "loss": 0.42458858489990237, + "rewards/chosen": 0.11553690433502198, + "rewards/margins": 0.7247669935226441, + "rewards/rejected": -0.6092300891876221, + "step": 3850 + }, + { + "epoch": 1.544, + "grad_norm": 0.6010407209396362, + "kl": 7.0015411376953125, + "learning_rate": 1.2677777777777778e-06, + "logits/chosen": 28349667.2, + "logits/rejected": 29532544.0, + "logps/chosen": -160.0486083984375, + "logps/rejected": -148.159619140625, + "loss": 0.433948278427124, + "rewards/chosen": 0.6110920429229736, + "rewards/margins": 0.6498981416225433, + "rewards/rejected": -0.038806098699569705, + "step": 3860 + }, + { + "epoch": 1.548, + "grad_norm": 0.5501318573951721, + "kl": 5.648660659790039, + "learning_rate": 1.2566666666666668e-06, + "logits/chosen": 26864864.0, + "logits/rejected": 25404660.8, + "logps/chosen": -173.2337890625, + "logps/rejected": -160.03525390625, + "loss": 0.43099102973937986, + "rewards/chosen": 0.3765087604522705, + "rewards/margins": 0.7873351097106933, + "rewards/rejected": -0.4108263492584229, + "step": 3870 + }, + { + "epoch": 1.552, + "grad_norm": 0.5478479862213135, + "kl": 3.376481294631958, + "learning_rate": 1.2455555555555556e-06, + "logits/chosen": 32903283.2, + "logits/rejected": 32423433.6, + "logps/chosen": -146.17325439453126, + "logps/rejected": -139.81845703125, + "loss": 0.4483139991760254, + "rewards/chosen": 0.12561094760894775, + "rewards/margins": 0.3918390512466431, + "rewards/rejected": -0.26622810363769533, + "step": 3880 + }, + { + "epoch": 1.556, + "grad_norm": 0.6676125526428223, + "kl": 4.547513484954834, + "learning_rate": 1.2344444444444446e-06, + "logits/chosen": 37385532.8, + "logits/rejected": 38763891.2, + "logps/chosen": -155.23748779296875, + "logps/rejected": -168.7784912109375, + "loss": 0.4901449203491211, + "rewards/chosen": 0.16143620014190674, + "rewards/margins": 0.11462950706481934, + "rewards/rejected": 0.0468066930770874, + "step": 3890 + }, + { + "epoch": 1.56, + "grad_norm": 0.7040978670120239, + "kl": 5.753296852111816, + "learning_rate": 1.2233333333333334e-06, + "logits/chosen": 34341609.6, + "logits/rejected": 33446153.6, + "logps/chosen": -145.878369140625, + "logps/rejected": -180.95491943359374, + "loss": 0.4818913459777832, + "rewards/chosen": 0.37144837379455564, + "rewards/margins": 0.2124497532844543, + "rewards/rejected": 0.15899862051010133, + "step": 3900 + }, + { + "epoch": 1.564, + "grad_norm": 0.48866015672683716, + "kl": 5.357041358947754, + "learning_rate": 1.2122222222222222e-06, + "logits/chosen": 27682144.0, + "logits/rejected": 28275977.6, + "logps/chosen": -150.2932861328125, + "logps/rejected": -144.590478515625, + "loss": 0.4552725315093994, + "rewards/chosen": 0.34428427219390867, + "rewards/margins": 0.4334153890609741, + "rewards/rejected": -0.08913111686706543, + "step": 3910 + }, + { + "epoch": 1.568, + "grad_norm": 0.536604642868042, + "kl": 4.181241512298584, + "learning_rate": 1.2011111111111112e-06, + "logits/chosen": 33525574.4, + "logits/rejected": 32405542.4, + "logps/chosen": -137.73907470703125, + "logps/rejected": -178.59801025390624, + "loss": 0.4515504837036133, + "rewards/chosen": 0.20468955039978026, + "rewards/margins": 0.46134023666381835, + "rewards/rejected": -0.2566506862640381, + "step": 3920 + }, + { + "epoch": 1.572, + "grad_norm": 0.7674170136451721, + "kl": 4.251042366027832, + "learning_rate": 1.19e-06, + "logits/chosen": 40275475.2, + "logits/rejected": 36563088.0, + "logps/chosen": -187.68133544921875, + "logps/rejected": -170.8183349609375, + "loss": 0.406461238861084, + "rewards/chosen": 0.5473237037658691, + "rewards/margins": 1.11146821975708, + "rewards/rejected": -0.564144515991211, + "step": 3930 + }, + { + "epoch": 1.576, + "grad_norm": 0.483766108751297, + "kl": 4.224446773529053, + "learning_rate": 1.178888888888889e-06, + "logits/chosen": 25433030.4, + "logits/rejected": 25341385.6, + "logps/chosen": -117.7307861328125, + "logps/rejected": -139.6504150390625, + "loss": 0.42197356224060056, + "rewards/chosen": 0.24890828132629395, + "rewards/margins": 0.7226192951202393, + "rewards/rejected": -0.4737110137939453, + "step": 3940 + }, + { + "epoch": 1.58, + "grad_norm": 0.5180490016937256, + "kl": 3.8868117332458496, + "learning_rate": 1.1677777777777779e-06, + "logits/chosen": 33638163.2, + "logits/rejected": 34016854.4, + "logps/chosen": -129.69632568359376, + "logps/rejected": -143.5869384765625, + "loss": 0.43038105964660645, + "rewards/chosen": 0.3097927808761597, + "rewards/margins": 0.6986382722854614, + "rewards/rejected": -0.38884549140930175, + "step": 3950 + }, + { + "epoch": 1.584, + "grad_norm": 0.5631889700889587, + "kl": 3.5186545848846436, + "learning_rate": 1.1566666666666667e-06, + "logits/chosen": 30367190.4, + "logits/rejected": 29603392.0, + "logps/chosen": -153.10296630859375, + "logps/rejected": -163.72001953125, + "loss": 0.4525346279144287, + "rewards/chosen": -0.12654991149902345, + "rewards/margins": 0.4131012439727783, + "rewards/rejected": -0.5396511554718018, + "step": 3960 + }, + { + "epoch": 1.588, + "grad_norm": 0.5241718292236328, + "kl": 4.312124729156494, + "learning_rate": 1.1455555555555557e-06, + "logits/chosen": 24180620.8, + "logits/rejected": 26728572.8, + "logps/chosen": -140.080517578125, + "logps/rejected": -115.592919921875, + "loss": 0.4370166301727295, + "rewards/chosen": 0.2027698278427124, + "rewards/margins": 0.536671781539917, + "rewards/rejected": -0.3339019536972046, + "step": 3970 + }, + { + "epoch": 1.592, + "grad_norm": 0.5848884582519531, + "kl": 4.956355571746826, + "learning_rate": 1.1344444444444445e-06, + "logits/chosen": 29427676.8, + "logits/rejected": 25774260.8, + "logps/chosen": -147.395361328125, + "logps/rejected": -165.61865234375, + "loss": 0.4145470142364502, + "rewards/chosen": 0.4822521686553955, + "rewards/margins": 0.6998722791671753, + "rewards/rejected": -0.2176201105117798, + "step": 3980 + }, + { + "epoch": 1.596, + "grad_norm": 0.7140029668807983, + "kl": 5.854944705963135, + "learning_rate": 1.1233333333333333e-06, + "logits/chosen": 39306499.2, + "logits/rejected": 34811254.4, + "logps/chosen": -137.74361572265624, + "logps/rejected": -148.427978515625, + "loss": 0.41861691474914553, + "rewards/chosen": 0.6944310188293457, + "rewards/margins": 1.1321285247802733, + "rewards/rejected": -0.4376975059509277, + "step": 3990 + }, + { + "epoch": 1.6, + "grad_norm": 0.7309412360191345, + "kl": 3.7196297645568848, + "learning_rate": 1.1122222222222223e-06, + "logits/chosen": 29282588.8, + "logits/rejected": 28362857.6, + "logps/chosen": -168.0974365234375, + "logps/rejected": -160.11337890625, + "loss": 0.4827260971069336, + "rewards/chosen": -0.2844557285308838, + "rewards/margins": -0.075036096572876, + "rewards/rejected": -0.2094196319580078, + "step": 4000 + }, + { + "epoch": 1.6, + "eval_kl": 4.738241195678711, + "eval_logits/chosen": 29865963.52, + "eval_logits/rejected": 30057035.776, + "eval_logps/chosen": -153.191, + "eval_logps/rejected": -149.831890625, + "eval_loss": 0.4781652092933655, + "eval_rewards/chosen": 0.1134777603149414, + "eval_rewards/margins": 0.23239292907714842, + "eval_rewards/rejected": -0.11891516876220704, + "eval_runtime": 216.5956, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 2.308, + "step": 4000 + }, + { + "epoch": 1.604, + "grad_norm": 0.6143787503242493, + "kl": 5.21004581451416, + "learning_rate": 1.1011111111111113e-06, + "logits/chosen": 31273155.2, + "logits/rejected": 31765638.4, + "logps/chosen": -172.1530029296875, + "logps/rejected": -172.15975341796874, + "loss": 0.4306319713592529, + "rewards/chosen": 0.42154908180236816, + "rewards/margins": 0.6504992485046387, + "rewards/rejected": -0.2289501667022705, + "step": 4010 + }, + { + "epoch": 1.608, + "grad_norm": 0.3771494925022125, + "kl": 4.662692070007324, + "learning_rate": 1.0900000000000002e-06, + "logits/chosen": 33774937.6, + "logits/rejected": 34804582.4, + "logps/chosen": -162.08702392578124, + "logps/rejected": -170.07498779296876, + "loss": 0.4483354568481445, + "rewards/chosen": -0.026274442672729492, + "rewards/margins": 0.4997582912445069, + "rewards/rejected": -0.5260327339172364, + "step": 4020 + }, + { + "epoch": 1.612, + "grad_norm": 0.8712416291236877, + "kl": 4.398558616638184, + "learning_rate": 1.078888888888889e-06, + "logits/chosen": 23578403.2, + "logits/rejected": 21598651.2, + "logps/chosen": -131.9225830078125, + "logps/rejected": -196.6409423828125, + "loss": 0.441709041595459, + "rewards/chosen": 0.29367847442626954, + "rewards/margins": 0.6416111469268799, + "rewards/rejected": -0.34793267250061033, + "step": 4030 + }, + { + "epoch": 1.616, + "grad_norm": 0.7502478957176208, + "kl": 4.180516242980957, + "learning_rate": 1.0677777777777778e-06, + "logits/chosen": 24851664.0, + "logits/rejected": 24331059.2, + "logps/chosen": -134.76131591796874, + "logps/rejected": -146.1375, + "loss": 0.42934479713439944, + "rewards/chosen": 0.15300320386886596, + "rewards/margins": 0.7616443037986755, + "rewards/rejected": -0.6086410999298095, + "step": 4040 + }, + { + "epoch": 1.62, + "grad_norm": 0.6729795932769775, + "kl": 3.5344510078430176, + "learning_rate": 1.0566666666666668e-06, + "logits/chosen": 30835174.4, + "logits/rejected": 30916979.2, + "logps/chosen": -167.9162841796875, + "logps/rejected": -148.01121826171874, + "loss": 0.40194120407104494, + "rewards/chosen": 0.18118813037872314, + "rewards/margins": 1.300507092475891, + "rewards/rejected": -1.119318962097168, + "step": 4050 + }, + { + "epoch": 1.624, + "grad_norm": 0.8483315110206604, + "kl": 4.144981384277344, + "learning_rate": 1.0455555555555556e-06, + "logits/chosen": 23790352.0, + "logits/rejected": 21096166.4, + "logps/chosen": -167.572314453125, + "logps/rejected": -160.46065673828124, + "loss": 0.4239004135131836, + "rewards/chosen": 0.40131430625915526, + "rewards/margins": 0.9644507408142089, + "rewards/rejected": -0.5631364345550537, + "step": 4060 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 0.7283557653427124, + "kl": 5.704648971557617, + "learning_rate": 1.0344444444444446e-06, + "logits/chosen": 34577296.0, + "logits/rejected": 28464720.0, + "logps/chosen": -131.04425048828125, + "logps/rejected": -170.54326171875, + "loss": 0.3883501052856445, + "rewards/chosen": 0.8470425605773926, + "rewards/margins": 1.1756995677948, + "rewards/rejected": -0.3286570072174072, + "step": 4070 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.7542695999145508, + "kl": 5.014037132263184, + "learning_rate": 1.0233333333333334e-06, + "logits/chosen": 33581536.0, + "logits/rejected": 31188915.2, + "logps/chosen": -200.83006591796874, + "logps/rejected": -201.557470703125, + "loss": 0.4472477912902832, + "rewards/chosen": 0.1391082763671875, + "rewards/margins": 0.5890022277832031, + "rewards/rejected": -0.4498939514160156, + "step": 4080 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 0.699129045009613, + "kl": 6.782160758972168, + "learning_rate": 1.0122222222222224e-06, + "logits/chosen": 33043475.2, + "logits/rejected": 32764982.4, + "logps/chosen": -178.3090087890625, + "logps/rejected": -168.98370361328125, + "loss": 0.42490806579589846, + "rewards/chosen": 0.7757717132568359, + "rewards/margins": 0.9651144385337829, + "rewards/rejected": -0.189342725276947, + "step": 4090 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.7144243717193604, + "kl": 3.5951950550079346, + "learning_rate": 1.0011111111111112e-06, + "logits/chosen": 24177262.4, + "logits/rejected": 20001184.0, + "logps/chosen": -142.5383056640625, + "logps/rejected": -174.16627197265626, + "loss": 0.4106290817260742, + "rewards/chosen": 0.029287612438201903, + "rewards/margins": 1.2375101923942566, + "rewards/rejected": -1.2082225799560546, + "step": 4100 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 0.7295018434524536, + "kl": 3.5175278186798096, + "learning_rate": 9.9e-07, + "logits/chosen": 37862601.6, + "logits/rejected": 34241705.6, + "logps/chosen": -180.07008056640626, + "logps/rejected": -165.27158203125, + "loss": 0.41535110473632814, + "rewards/chosen": 0.3609702348709106, + "rewards/margins": 0.8377941370010376, + "rewards/rejected": -0.47682390213012693, + "step": 4110 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.687157154083252, + "kl": 4.200056076049805, + "learning_rate": 9.788888888888889e-07, + "logits/chosen": 30389494.4, + "logits/rejected": 33020348.8, + "logps/chosen": -205.723876953125, + "logps/rejected": -189.8884765625, + "loss": 0.4510765075683594, + "rewards/chosen": -0.2252514600753784, + "rewards/margins": 0.257360577583313, + "rewards/rejected": -0.48261203765869143, + "step": 4120 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 0.6598585844039917, + "kl": 5.19122838973999, + "learning_rate": 9.677777777777779e-07, + "logits/chosen": 25100822.4, + "logits/rejected": 22434776.0, + "logps/chosen": -139.93997802734376, + "logps/rejected": -148.89000244140624, + "loss": 0.42380781173706056, + "rewards/chosen": 0.25194945335388186, + "rewards/margins": 0.616188907623291, + "rewards/rejected": -0.3642394542694092, + "step": 4130 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.596368134021759, + "kl": 4.2246832847595215, + "learning_rate": 9.566666666666667e-07, + "logits/chosen": 23562168.0, + "logits/rejected": 19095112.0, + "logps/chosen": -167.05655517578126, + "logps/rejected": -163.794482421875, + "loss": 0.42250747680664064, + "rewards/chosen": 0.26157207489013673, + "rewards/margins": 0.9496047973632813, + "rewards/rejected": -0.6880327224731445, + "step": 4140 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.9235984683036804, + "kl": 4.401445388793945, + "learning_rate": 9.455555555555557e-07, + "logits/chosen": 23643918.4, + "logits/rejected": 26274195.2, + "logps/chosen": -147.10347900390624, + "logps/rejected": -122.74593505859374, + "loss": 0.4525291919708252, + "rewards/chosen": 0.2129079818725586, + "rewards/margins": 0.4618348360061646, + "rewards/rejected": -0.24892685413360596, + "step": 4150 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.5369182825088501, + "kl": 4.616759777069092, + "learning_rate": 9.344444444444445e-07, + "logits/chosen": 22211737.6, + "logits/rejected": 20181283.2, + "logps/chosen": -121.6099365234375, + "logps/rejected": -147.121728515625, + "loss": 0.4210421085357666, + "rewards/chosen": 0.3621690273284912, + "rewards/margins": 0.832556676864624, + "rewards/rejected": -0.4703876495361328, + "step": 4160 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 0.7842811942100525, + "kl": 3.4067413806915283, + "learning_rate": 9.233333333333334e-07, + "logits/chosen": 27008988.8, + "logits/rejected": 25419982.4, + "logps/chosen": -170.2637939453125, + "logps/rejected": -197.46827392578126, + "loss": 0.4370081424713135, + "rewards/chosen": -0.2569821834564209, + "rewards/margins": 0.6670010089874268, + "rewards/rejected": -0.9239831924438476, + "step": 4170 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.9903603196144104, + "kl": 2.7231457233428955, + "learning_rate": 9.122222222222222e-07, + "logits/chosen": 24366032.0, + "logits/rejected": 23620707.2, + "logps/chosen": -151.88421630859375, + "logps/rejected": -164.09239501953124, + "loss": 0.42814011573791505, + "rewards/chosen": 0.023269623517990112, + "rewards/margins": 0.6117547690868378, + "rewards/rejected": -0.5884851455688477, + "step": 4180 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 0.8565585613250732, + "kl": 3.8181614875793457, + "learning_rate": 9.011111111111112e-07, + "logits/chosen": 30655971.2, + "logits/rejected": 30687513.6, + "logps/chosen": -158.52818603515624, + "logps/rejected": -156.8181884765625, + "loss": 0.4161073684692383, + "rewards/chosen": 0.05024971961975098, + "rewards/margins": 0.9918828487396241, + "rewards/rejected": -0.9416331291198731, + "step": 4190 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.6923061609268188, + "kl": 3.7691681385040283, + "learning_rate": 8.900000000000001e-07, + "logits/chosen": 29314112.0, + "logits/rejected": 29080848.0, + "logps/chosen": -143.7644775390625, + "logps/rejected": -143.195703125, + "loss": 0.42991132736206056, + "rewards/chosen": 0.14495362043380738, + "rewards/margins": 0.5658432364463806, + "rewards/rejected": -0.42088961601257324, + "step": 4200 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 4.478702545166016, + "eval_logits/chosen": 27134529.536, + "eval_logits/rejected": 27462119.424, + "eval_logps/chosen": -154.735171875, + "eval_logps/rejected": -151.346375, + "eval_loss": 0.4791446030139923, + "eval_rewards/chosen": -0.040938720703125, + "eval_rewards/margins": 0.22942645263671874, + "eval_rewards/rejected": -0.2703651733398437, + "eval_runtime": 216.6724, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 2.308, + "step": 4200 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 0.8431246876716614, + "kl": 3.3740649223327637, + "learning_rate": 8.78888888888889e-07, + "logits/chosen": 24580116.8, + "logits/rejected": 23560889.6, + "logps/chosen": -130.23404541015626, + "logps/rejected": -173.274755859375, + "loss": 0.45294036865234377, + "rewards/chosen": -0.21662135124206544, + "rewards/margins": 0.6990167140960692, + "rewards/rejected": -0.9156380653381347, + "step": 4210 + }, + { + "epoch": 1.688, + "grad_norm": 1.0964020490646362, + "kl": 3.4218056201934814, + "learning_rate": 8.677777777777778e-07, + "logits/chosen": 25205835.2, + "logits/rejected": 24163204.8, + "logps/chosen": -144.6944580078125, + "logps/rejected": -165.13668212890624, + "loss": 0.41784143447875977, + "rewards/chosen": 0.14861660003662108, + "rewards/margins": 0.8323590278625488, + "rewards/rejected": -0.6837424278259278, + "step": 4220 + }, + { + "epoch": 1.692, + "grad_norm": 0.8997116088867188, + "kl": 5.5824151039123535, + "learning_rate": 8.566666666666668e-07, + "logits/chosen": 23316640.0, + "logits/rejected": 23185657.6, + "logps/chosen": -150.42138671875, + "logps/rejected": -176.9662109375, + "loss": 0.4408450126647949, + "rewards/chosen": 0.33873915672302246, + "rewards/margins": 0.5633441925048828, + "rewards/rejected": -0.22460503578186036, + "step": 4230 + }, + { + "epoch": 1.696, + "grad_norm": 0.9084079265594482, + "kl": 3.3602194786071777, + "learning_rate": 8.455555555555556e-07, + "logits/chosen": 20662596.8, + "logits/rejected": 21372936.0, + "logps/chosen": -158.28271484375, + "logps/rejected": -157.30823974609376, + "loss": 0.4337437152862549, + "rewards/chosen": -0.27880520820617677, + "rewards/margins": 0.39356427192687987, + "rewards/rejected": -0.6723694801330566, + "step": 4240 + }, + { + "epoch": 1.7, + "grad_norm": 0.822826623916626, + "kl": 6.255753993988037, + "learning_rate": 8.344444444444445e-07, + "logits/chosen": 28279952.0, + "logits/rejected": 30563142.4, + "logps/chosen": -207.4703369140625, + "logps/rejected": -156.87734375, + "loss": 0.401468563079834, + "rewards/chosen": 0.3442718505859375, + "rewards/margins": 0.6474948883056642, + "rewards/rejected": -0.3032230377197266, + "step": 4250 + }, + { + "epoch": 1.704, + "grad_norm": 0.7218330502510071, + "kl": 3.7593655586242676, + "learning_rate": 8.233333333333333e-07, + "logits/chosen": 21211801.6, + "logits/rejected": 25582251.2, + "logps/chosen": -181.3871337890625, + "logps/rejected": -158.4944580078125, + "loss": 0.4855056285858154, + "rewards/chosen": -0.661691427230835, + "rewards/margins": -0.19646124839782714, + "rewards/rejected": -0.4652301788330078, + "step": 4260 + }, + { + "epoch": 1.708, + "grad_norm": 0.5836480259895325, + "kl": 4.7980637550354, + "learning_rate": 8.122222222222223e-07, + "logits/chosen": 25395673.6, + "logits/rejected": 26470857.6, + "logps/chosen": -129.83922119140624, + "logps/rejected": -120.219775390625, + "loss": 0.4517657279968262, + "rewards/chosen": 0.2096014976501465, + "rewards/margins": 0.4321582317352295, + "rewards/rejected": -0.222556734085083, + "step": 4270 + }, + { + "epoch": 1.712, + "grad_norm": 0.6632907390594482, + "kl": 4.515078544616699, + "learning_rate": 8.011111111111111e-07, + "logits/chosen": 28016816.0, + "logits/rejected": 29217836.8, + "logps/chosen": -161.2439697265625, + "logps/rejected": -146.48646240234376, + "loss": 0.4749518871307373, + "rewards/chosen": -0.10194592475891114, + "rewards/margins": 0.19709014892578125, + "rewards/rejected": -0.29903607368469237, + "step": 4280 + }, + { + "epoch": 1.716, + "grad_norm": 0.9374505877494812, + "kl": 4.215886116027832, + "learning_rate": 7.900000000000001e-07, + "logits/chosen": 25599212.8, + "logits/rejected": 27952102.4, + "logps/chosen": -146.201220703125, + "logps/rejected": -121.0344482421875, + "loss": 0.4816310882568359, + "rewards/chosen": -0.015983200073242186, + "rewards/margins": 0.10800590515136718, + "rewards/rejected": -0.12398910522460938, + "step": 4290 + }, + { + "epoch": 1.72, + "grad_norm": 0.9015432596206665, + "kl": 3.52048921585083, + "learning_rate": 7.788888888888889e-07, + "logits/chosen": 19586571.2, + "logits/rejected": 17563038.4, + "logps/chosen": -147.8046142578125, + "logps/rejected": -163.8701416015625, + "loss": 0.4111928939819336, + "rewards/chosen": 0.09343934059143066, + "rewards/margins": 1.163926935195923, + "rewards/rejected": -1.0704875946044923, + "step": 4300 + }, + { + "epoch": 1.724, + "grad_norm": 0.6239180564880371, + "kl": 1.9489761590957642, + "learning_rate": 7.677777777777779e-07, + "logits/chosen": 29297952.0, + "logits/rejected": 25391588.8, + "logps/chosen": -142.50740966796874, + "logps/rejected": -180.68870849609374, + "loss": 0.43747854232788086, + "rewards/chosen": -0.7470763683319092, + "rewards/margins": 0.7179863452911378, + "rewards/rejected": -1.465062713623047, + "step": 4310 + }, + { + "epoch": 1.728, + "grad_norm": 0.6577679514884949, + "kl": 4.765759468078613, + "learning_rate": 7.566666666666667e-07, + "logits/chosen": 24857828.8, + "logits/rejected": 23441436.8, + "logps/chosen": -146.1468017578125, + "logps/rejected": -153.498046875, + "loss": 0.44129347801208496, + "rewards/chosen": -0.11145193576812744, + "rewards/margins": 0.5271980524063111, + "rewards/rejected": -0.6386499881744385, + "step": 4320 + }, + { + "epoch": 1.732, + "grad_norm": 0.6183480024337769, + "kl": 4.645040988922119, + "learning_rate": 7.455555555555556e-07, + "logits/chosen": 22485094.4, + "logits/rejected": 24854936.0, + "logps/chosen": -170.60594482421874, + "logps/rejected": -139.26231689453124, + "loss": 0.4434357166290283, + "rewards/chosen": -0.2083209276199341, + "rewards/margins": 0.2575597524642944, + "rewards/rejected": -0.4658806800842285, + "step": 4330 + }, + { + "epoch": 1.736, + "grad_norm": 0.727397084236145, + "kl": 3.468677520751953, + "learning_rate": 7.344444444444445e-07, + "logits/chosen": 19499200.0, + "logits/rejected": 15777052.8, + "logps/chosen": -133.76549072265624, + "logps/rejected": -177.62255859375, + "loss": 0.39086987972259524, + "rewards/chosen": 0.10505068302154541, + "rewards/margins": 1.318554902076721, + "rewards/rejected": -1.2135042190551757, + "step": 4340 + }, + { + "epoch": 1.74, + "grad_norm": 0.5569754242897034, + "kl": 3.695270538330078, + "learning_rate": 7.233333333333334e-07, + "logits/chosen": 21616838.4, + "logits/rejected": 25698086.4, + "logps/chosen": -166.606103515625, + "logps/rejected": -136.80845947265624, + "loss": 0.4519169807434082, + "rewards/chosen": -0.5317587852478027, + "rewards/margins": 0.12961096763610835, + "rewards/rejected": -0.6613697528839111, + "step": 4350 + }, + { + "epoch": 1.744, + "grad_norm": 0.5092763900756836, + "kl": 5.227725028991699, + "learning_rate": 7.122222222222223e-07, + "logits/chosen": 18400382.4, + "logits/rejected": 17152075.2, + "logps/chosen": -152.55262451171876, + "logps/rejected": -161.2753173828125, + "loss": 0.42585110664367676, + "rewards/chosen": 0.08888615369796753, + "rewards/margins": 1.0932955622673035, + "rewards/rejected": -1.004409408569336, + "step": 4360 + }, + { + "epoch": 1.748, + "grad_norm": 0.6089858412742615, + "kl": 6.389164924621582, + "learning_rate": 7.011111111111112e-07, + "logits/chosen": 23212915.2, + "logits/rejected": 23174060.8, + "logps/chosen": -148.427197265625, + "logps/rejected": -138.902783203125, + "loss": 0.43767833709716797, + "rewards/chosen": 0.34542050361633303, + "rewards/margins": 0.6011173248291015, + "rewards/rejected": -0.25569682121276854, + "step": 4370 + }, + { + "epoch": 1.752, + "grad_norm": 0.7319818735122681, + "kl": 3.631608486175537, + "learning_rate": 6.900000000000001e-07, + "logits/chosen": 24275633.6, + "logits/rejected": 25451712.0, + "logps/chosen": -130.7281982421875, + "logps/rejected": -169.6556884765625, + "loss": 0.4526735782623291, + "rewards/chosen": -0.1837414264678955, + "rewards/margins": 0.5917365074157716, + "rewards/rejected": -0.775477933883667, + "step": 4380 + }, + { + "epoch": 1.756, + "grad_norm": 0.7777149081230164, + "kl": 3.294254779815674, + "learning_rate": 6.78888888888889e-07, + "logits/chosen": 21825590.4, + "logits/rejected": 21820012.8, + "logps/chosen": -147.787158203125, + "logps/rejected": -144.9162841796875, + "loss": 0.43806142807006837, + "rewards/chosen": -0.11295137405395508, + "rewards/margins": 0.8744370460510255, + "rewards/rejected": -0.9873884201049805, + "step": 4390 + }, + { + "epoch": 1.76, + "grad_norm": 0.6188346147537231, + "kl": 1.1032154560089111, + "learning_rate": 6.677777777777779e-07, + "logits/chosen": 21570892.8, + "logits/rejected": 23144080.0, + "logps/chosen": -135.95819091796875, + "logps/rejected": -149.30087890625, + "loss": 0.44826564788818357, + "rewards/chosen": -0.7288064002990723, + "rewards/margins": 0.5491142272949218, + "rewards/rejected": -1.277920627593994, + "step": 4400 + }, + { + "epoch": 1.76, + "eval_kl": 3.972273826599121, + "eval_logits/chosen": 25631897.6, + "eval_logits/rejected": 26045097.984, + "eval_logps/chosen": -156.330625, + "eval_logps/rejected": -152.941625, + "eval_loss": 0.4793069362640381, + "eval_rewards/chosen": -0.2004847412109375, + "eval_rewards/margins": 0.22940472412109375, + "eval_rewards/rejected": -0.42988946533203126, + "eval_runtime": 216.7456, + "eval_samples_per_second": 4.614, + "eval_steps_per_second": 2.307, + "step": 4400 + }, + { + "epoch": 1.764, + "grad_norm": 0.8933963179588318, + "kl": 3.5940029621124268, + "learning_rate": 6.566666666666667e-07, + "logits/chosen": 23419489.6, + "logits/rejected": 19540764.8, + "logps/chosen": -159.04957275390626, + "logps/rejected": -170.299072265625, + "loss": 0.4445340633392334, + "rewards/chosen": -0.11221444606781006, + "rewards/margins": 0.6003352880477906, + "rewards/rejected": -0.7125497341156006, + "step": 4410 + }, + { + "epoch": 1.768, + "grad_norm": 1.043148159980774, + "kl": 4.92350959777832, + "learning_rate": 6.455555555555556e-07, + "logits/chosen": 23423225.6, + "logits/rejected": 20264896.0, + "logps/chosen": -134.481201171875, + "logps/rejected": -165.0137451171875, + "loss": 0.43820796012878416, + "rewards/chosen": 0.19546182155609132, + "rewards/margins": 0.7127113103866578, + "rewards/rejected": -0.5172494888305664, + "step": 4420 + }, + { + "epoch": 1.772, + "grad_norm": 0.6755536198616028, + "kl": 3.846719741821289, + "learning_rate": 6.344444444444445e-07, + "logits/chosen": 30165392.0, + "logits/rejected": 34583219.2, + "logps/chosen": -156.612158203125, + "logps/rejected": -142.9615234375, + "loss": 0.4490334510803223, + "rewards/chosen": -0.037669995427131654, + "rewards/margins": 0.416199442744255, + "rewards/rejected": -0.4538694381713867, + "step": 4430 + }, + { + "epoch": 1.776, + "grad_norm": 0.6009793877601624, + "kl": 3.6022000312805176, + "learning_rate": 6.233333333333333e-07, + "logits/chosen": 16789396.8, + "logits/rejected": 20022726.4, + "logps/chosen": -139.00106201171874, + "logps/rejected": -119.23135986328126, + "loss": 0.48907132148742677, + "rewards/chosen": -0.18124552965164184, + "rewards/margins": 0.10945202112197874, + "rewards/rejected": -0.2906975507736206, + "step": 4440 + }, + { + "epoch": 1.78, + "grad_norm": 0.5468002557754517, + "kl": 4.557890892028809, + "learning_rate": 6.122222222222222e-07, + "logits/chosen": 30898377.6, + "logits/rejected": 29239990.4, + "logps/chosen": -161.43404541015624, + "logps/rejected": -163.96580810546874, + "loss": 0.4502861499786377, + "rewards/chosen": 0.12291504144668579, + "rewards/margins": 0.5061401724815369, + "rewards/rejected": -0.38322513103485106, + "step": 4450 + }, + { + "epoch": 1.784, + "grad_norm": 0.5758063793182373, + "kl": 2.764960527420044, + "learning_rate": 6.011111111111112e-07, + "logits/chosen": 23224883.2, + "logits/rejected": 23461840.0, + "logps/chosen": -147.754541015625, + "logps/rejected": -139.2906494140625, + "loss": 0.44967427253723147, + "rewards/chosen": -0.2614432334899902, + "rewards/margins": 0.513807487487793, + "rewards/rejected": -0.7752507209777832, + "step": 4460 + }, + { + "epoch": 1.788, + "grad_norm": 0.7826879620552063, + "kl": 4.313460350036621, + "learning_rate": 5.900000000000001e-07, + "logits/chosen": 28475094.4, + "logits/rejected": 29499904.0, + "logps/chosen": -149.60347900390624, + "logps/rejected": -139.57032470703126, + "loss": 0.4358978748321533, + "rewards/chosen": 0.18888943195343016, + "rewards/margins": 0.7328470468521118, + "rewards/rejected": -0.5439576148986817, + "step": 4470 + }, + { + "epoch": 1.792, + "grad_norm": 0.7026771306991577, + "kl": 1.8280513286590576, + "learning_rate": 5.788888888888889e-07, + "logits/chosen": 15314908.8, + "logits/rejected": 14658353.6, + "logps/chosen": -146.75345458984376, + "logps/rejected": -157.1791259765625, + "loss": 0.4324824810028076, + "rewards/chosen": -0.4457141399383545, + "rewards/margins": 0.7849259853363036, + "rewards/rejected": -1.2306401252746582, + "step": 4480 + }, + { + "epoch": 1.796, + "grad_norm": 0.7853025197982788, + "kl": 5.189270496368408, + "learning_rate": 5.677777777777779e-07, + "logits/chosen": 26974720.0, + "logits/rejected": 25859792.0, + "logps/chosen": -164.4057861328125, + "logps/rejected": -164.3528076171875, + "loss": 0.41428799629211427, + "rewards/chosen": 0.3338757514953613, + "rewards/margins": 0.8068063259124756, + "rewards/rejected": -0.4729305744171143, + "step": 4490 + }, + { + "epoch": 1.8, + "grad_norm": 0.7289919853210449, + "kl": 3.4229626655578613, + "learning_rate": 5.566666666666667e-07, + "logits/chosen": 28305580.8, + "logits/rejected": 27415660.8, + "logps/chosen": -127.7979248046875, + "logps/rejected": -144.92559814453125, + "loss": 0.4419555187225342, + "rewards/chosen": -0.07625447511672974, + "rewards/margins": 0.5168057322502135, + "rewards/rejected": -0.5930602073669433, + "step": 4500 + }, + { + "epoch": 1.804, + "grad_norm": 0.7820873856544495, + "kl": 5.29005241394043, + "learning_rate": 5.455555555555556e-07, + "logits/chosen": 30357193.6, + "logits/rejected": 28641846.4, + "logps/chosen": -164.97662353515625, + "logps/rejected": -146.59793701171876, + "loss": 0.43038201332092285, + "rewards/chosen": 0.35901241302490233, + "rewards/margins": 0.7769660949707031, + "rewards/rejected": -0.4179536819458008, + "step": 4510 + }, + { + "epoch": 1.808, + "grad_norm": 0.8984478116035461, + "kl": 3.696812868118286, + "learning_rate": 5.344444444444445e-07, + "logits/chosen": 21213681.6, + "logits/rejected": 20043033.6, + "logps/chosen": -183.95279541015626, + "logps/rejected": -168.1439208984375, + "loss": 0.4274559020996094, + "rewards/chosen": -0.015572810173034668, + "rewards/margins": 0.6375526189804077, + "rewards/rejected": -0.6531254291534424, + "step": 4520 + }, + { + "epoch": 1.812, + "grad_norm": 0.5371900200843811, + "kl": 2.156186819076538, + "learning_rate": 5.233333333333334e-07, + "logits/chosen": 24134673.6, + "logits/rejected": 21031646.4, + "logps/chosen": -141.1941162109375, + "logps/rejected": -176.01248779296876, + "loss": 0.3860702276229858, + "rewards/chosen": -0.2229753017425537, + "rewards/margins": 1.4064032077789306, + "rewards/rejected": -1.6293785095214843, + "step": 4530 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.8296744227409363, + "kl": 3.7835755348205566, + "learning_rate": 5.122222222222222e-07, + "logits/chosen": 36388054.4, + "logits/rejected": 31688457.6, + "logps/chosen": -160.08714599609374, + "logps/rejected": -207.0276123046875, + "loss": 0.40818300247192385, + "rewards/chosen": 0.12663592100143434, + "rewards/margins": 1.2323408007621766, + "rewards/rejected": -1.1057048797607423, + "step": 4540 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.5752077102661133, + "kl": 3.985583543777466, + "learning_rate": 5.011111111111112e-07, + "logits/chosen": 21616614.4, + "logits/rejected": 22038934.4, + "logps/chosen": -139.400634765625, + "logps/rejected": -120.08944091796874, + "loss": 0.45534987449645997, + "rewards/chosen": -0.1639024496078491, + "rewards/margins": 0.37830965518951415, + "rewards/rejected": -0.5422121047973633, + "step": 4550 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.7708961367607117, + "kl": 3.2169101238250732, + "learning_rate": 4.900000000000001e-07, + "logits/chosen": 23925704.0, + "logits/rejected": 19160164.8, + "logps/chosen": -168.2747802734375, + "logps/rejected": -186.7814697265625, + "loss": 0.4789942741394043, + "rewards/chosen": -0.4888188362121582, + "rewards/margins": 0.5942277908325195, + "rewards/rejected": -1.0830466270446777, + "step": 4560 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 0.6243308186531067, + "kl": 4.417626857757568, + "learning_rate": 4.78888888888889e-07, + "logits/chosen": 27919126.4, + "logits/rejected": 26266936.0, + "logps/chosen": -180.739013671875, + "logps/rejected": -187.88717041015624, + "loss": 0.38467090129852294, + "rewards/chosen": -0.027477288246154787, + "rewards/margins": 1.3471161603927613, + "rewards/rejected": -1.374593448638916, + "step": 4570 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6753647327423096, + "kl": 2.572279930114746, + "learning_rate": 4.6777777777777785e-07, + "logits/chosen": 24245668.8, + "logits/rejected": 25570259.2, + "logps/chosen": -183.2251953125, + "logps/rejected": -144.81016845703124, + "loss": 0.47222309112548827, + "rewards/chosen": -0.7298378944396973, + "rewards/margins": -0.06857419013977051, + "rewards/rejected": -0.6612637042999268, + "step": 4580 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 0.6756667494773865, + "kl": 2.9830145835876465, + "learning_rate": 4.566666666666667e-07, + "logits/chosen": 19786300.8, + "logits/rejected": 18077294.4, + "logps/chosen": -142.7079833984375, + "logps/rejected": -167.11500244140626, + "loss": 0.42957119941711425, + "rewards/chosen": -0.10680264234542847, + "rewards/margins": 1.0903936505317688, + "rewards/rejected": -1.1971962928771973, + "step": 4590 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.5496138334274292, + "kl": 2.980377197265625, + "learning_rate": 4.455555555555556e-07, + "logits/chosen": 19038470.4, + "logits/rejected": 22191712.0, + "logps/chosen": -171.56102294921874, + "logps/rejected": -158.53353271484374, + "loss": 0.5080355644226074, + "rewards/chosen": -0.6536062240600586, + "rewards/margins": -0.24833087921142583, + "rewards/rejected": -0.4052753448486328, + "step": 4600 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.5254554748535156, + "eval_logits/chosen": 23809890.304, + "eval_logits/rejected": 24273334.272, + "eval_logps/chosen": -158.050265625, + "eval_logps/rejected": -154.70503125, + "eval_loss": 0.47947752475738525, + "eval_rewards/chosen": -0.37244818115234374, + "eval_rewards/margins": 0.2337820434570313, + "eval_rewards/rejected": -0.606230224609375, + "eval_runtime": 216.9986, + "eval_samples_per_second": 4.608, + "eval_steps_per_second": 2.304, + "step": 4600 + } + ], + "logging_steps": 10, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4600/training_args.bin b/v5/KTO/KTO_20k/lora/checkpoint-4600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b3df9314987039f6eb4aae71c1789a27c508f03 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b224910eb4f0913af2c07ef9b4ff545409726d7169b35fc1b136bed8f918d2c +size 5521 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/README.md b/v5/KTO/KTO_20k/lora/checkpoint-4800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/adapter_config.json b/v5/KTO/KTO_20k/lora/checkpoint-4800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5726eb3a65b963dd94788413b8a63d4accbb95c3 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "k_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/adapter_model.safetensors b/v5/KTO/KTO_20k/lora/checkpoint-4800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..60804bebbbc54787fd6485a21aacee4f492a65c9 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a6e4c09df248761d3b1ded124489e1747953fad011ddd6078f19bdeaa12bf1d +size 180385008 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/chat_template.jinja b/v5/KTO/KTO_20k/lora/checkpoint-4800/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/optimizer.pt b/v5/KTO/KTO_20k/lora/checkpoint-4800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b23f9ed59f226063b66f9826e5641df288490b80 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdd1bf5fb92317b349752f79e05de1db24d5081434bf494ad0fb1a8561762cf3 +size 360902475 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/rng_state.pth b/v5/KTO/KTO_20k/lora/checkpoint-4800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1062af8f13c012194172bd08b0cf5acef1661de7 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe17a5fe328cdb084ec10afd49d2cb7bb9035b6736206385c2c19bf9f66c41 +size 14645 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/scaler.pt b/v5/KTO/KTO_20k/lora/checkpoint-4800/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0eaf243fc7e76b003e1abfc4171a19fca50b7499 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b429070a564985551cfca2e541b4c4fca20d998c67cc7cb6e2b59f638df425a3 +size 1383 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/scheduler.pt b/v5/KTO/KTO_20k/lora/checkpoint-4800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4df176aca94913189a631ac548bb71d89ec4f545 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e3f78ac339f7701d1852cb3eb842434fc306be01a8f6273fdb6fc865b8b4fbe +size 1465 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/tokenizer.json b/v5/KTO/KTO_20k/lora/checkpoint-4800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/tokenizer_config.json b/v5/KTO/KTO_20k/lora/checkpoint-4800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/trainer_state.json b/v5/KTO/KTO_20k/lora/checkpoint-4800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..dfe67d09c1ccdedc25f2023d6715ba9fdf06ea43 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/trainer_state.json @@ -0,0 +1,7618 @@ +{ + "best_global_step": 4600, + "best_metric": 0.2337820434570313, + "best_model_checkpoint": "output/lora/checkpoint-4600", + "epoch": 1.92, + "eval_steps": 200, + "global_step": 4800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004, + "grad_norm": 0.5129354000091553, + "kl": 0.01111381035298109, + "learning_rate": 9e-08, + "logits/chosen": 29682550.4, + "logits/rejected": 31339625.6, + "logps/chosen": -148.96693115234376, + "logps/rejected": -128.8356201171875, + "loss": 0.5001067161560059, + "rewards/chosen": -0.0005946397315710783, + "rewards/margins": -0.0008538532070815563, + "rewards/rejected": 0.000259213475510478, + "step": 10 + }, + { + "epoch": 0.008, + "grad_norm": 0.4323230981826782, + "kl": 0.015593004412949085, + "learning_rate": 1.9e-07, + "logits/chosen": 53384144.0, + "logits/rejected": 52884704.0, + "logps/chosen": -140.024853515625, + "logps/rejected": -151.92880859375, + "loss": 0.49987268447875977, + "rewards/chosen": 0.0006237029097974301, + "rewards/margins": 0.0010180996730923652, + "rewards/rejected": -0.0003943967632949352, + "step": 20 + }, + { + "epoch": 0.012, + "grad_norm": 0.4257548451423645, + "kl": 0.014815926551818848, + "learning_rate": 2.9000000000000003e-07, + "logits/chosen": 34151433.6, + "logits/rejected": 34198240.0, + "logps/chosen": -131.73375244140624, + "logps/rejected": -140.37911376953124, + "loss": 0.4998063087463379, + "rewards/chosen": 0.0004901790525764227, + "rewards/margins": 0.0015492869075387715, + "rewards/rejected": -0.0010591078549623489, + "step": 30 + }, + { + "epoch": 0.016, + "grad_norm": 0.36496493220329285, + "kl": 0.02263352833688259, + "learning_rate": 3.9e-07, + "logits/chosen": 43278188.8, + "logits/rejected": 43919286.4, + "logps/chosen": -144.2862060546875, + "logps/rejected": -146.0272705078125, + "loss": 0.4999645233154297, + "rewards/chosen": 0.0011271238327026367, + "rewards/margins": 0.00028378488495945926, + "rewards/rejected": 0.0008433389477431775, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.3303475081920624, + "kl": 0.018513035029172897, + "learning_rate": 4.900000000000001e-07, + "logits/chosen": 43083849.6, + "logits/rejected": 44890182.4, + "logps/chosen": -140.999267578125, + "logps/rejected": -154.3203369140625, + "loss": 0.4999688625335693, + "rewards/chosen": 0.0011019515804946423, + "rewards/margins": 0.0002493190579116345, + "rewards/rejected": 0.0008526325225830078, + "step": 50 + }, + { + "epoch": 0.024, + "grad_norm": 0.2820725739002228, + "kl": 0.01858975924551487, + "learning_rate": 5.900000000000001e-07, + "logits/chosen": 36625328.0, + "logits/rejected": 34144819.2, + "logps/chosen": -105.73199462890625, + "logps/rejected": -114.0021728515625, + "loss": 0.5000367164611816, + "rewards/chosen": 0.0006336641497910022, + "rewards/margins": -0.0002944803796708584, + "rewards/rejected": 0.0009281445294618606, + "step": 60 + }, + { + "epoch": 0.028, + "grad_norm": 0.3881119191646576, + "kl": 0.00938491802662611, + "learning_rate": 6.900000000000001e-07, + "logits/chosen": 47839347.2, + "logits/rejected": 46951110.4, + "logps/chosen": -165.68013916015624, + "logps/rejected": -175.11986083984374, + "loss": 0.5000545501708984, + "rewards/chosen": -0.004812383651733398, + "rewards/margins": -0.0004361916333436959, + "rewards/rejected": -0.004376192018389702, + "step": 70 + }, + { + "epoch": 0.032, + "grad_norm": 0.4655516743659973, + "kl": 0.011602235026657581, + "learning_rate": 7.900000000000001e-07, + "logits/chosen": 40787196.8, + "logits/rejected": 40853612.8, + "logps/chosen": -154.95506591796874, + "logps/rejected": -163.55113525390624, + "loss": 0.4999113082885742, + "rewards/chosen": -0.003601384162902832, + "rewards/margins": 0.0007092095911502838, + "rewards/rejected": -0.004310593754053116, + "step": 80 + }, + { + "epoch": 0.036, + "grad_norm": 0.3819780647754669, + "kl": 0.02207348309457302, + "learning_rate": 8.900000000000001e-07, + "logits/chosen": 44163190.4, + "logits/rejected": 44268889.6, + "logps/chosen": -169.8670166015625, + "logps/rejected": -159.19212646484374, + "loss": 0.4996920108795166, + "rewards/chosen": -0.0014049055054783822, + "rewards/margins": 0.002464146353304386, + "rewards/rejected": -0.003869051858782768, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 0.4587983191013336, + "kl": 0.056853484362363815, + "learning_rate": 9.9e-07, + "logits/chosen": 27709289.6, + "logits/rejected": 27346092.8, + "logps/chosen": -134.2815185546875, + "logps/rejected": -164.53704833984375, + "loss": 0.4997425556182861, + "rewards/chosen": 0.0017462443560361863, + "rewards/margins": 0.0020607755985111, + "rewards/rejected": -0.0003145312424749136, + "step": 100 + }, + { + "epoch": 0.044, + "grad_norm": 0.3832976818084717, + "kl": 0.052884578704833984, + "learning_rate": 1.0900000000000002e-06, + "logits/chosen": 38150864.0, + "logits/rejected": 37954022.4, + "logps/chosen": -130.54158935546874, + "logps/rejected": -135.6479248046875, + "loss": 0.49963693618774413, + "rewards/chosen": 0.002483482100069523, + "rewards/margins": 0.0029049014206975698, + "rewards/rejected": -0.000421419320628047, + "step": 110 + }, + { + "epoch": 0.048, + "grad_norm": 0.3761675953865051, + "kl": 0.06726250797510147, + "learning_rate": 1.19e-06, + "logits/chosen": 47769347.2, + "logits/rejected": 47376777.6, + "logps/chosen": -162.1564208984375, + "logps/rejected": -133.792041015625, + "loss": 0.5000278949737549, + "rewards/chosen": 0.00019950373098254204, + "rewards/margins": -0.00022385641932487488, + "rewards/rejected": 0.0004233601503074169, + "step": 120 + }, + { + "epoch": 0.052, + "grad_norm": 0.3125726580619812, + "kl": 0.15044990181922913, + "learning_rate": 1.2900000000000001e-06, + "logits/chosen": 40041744.0, + "logits/rejected": 39132192.0, + "logps/chosen": -162.06031494140626, + "logps/rejected": -140.18397216796876, + "loss": 0.5000635147094726, + "rewards/chosen": 0.011354871094226837, + "rewards/margins": -0.0005075931549072266, + "rewards/rejected": 0.011862464249134064, + "step": 130 + }, + { + "epoch": 0.056, + "grad_norm": 0.35332098603248596, + "kl": 0.2332003116607666, + "learning_rate": 1.3900000000000002e-06, + "logits/chosen": 40936960.0, + "logits/rejected": 42938476.8, + "logps/chosen": -142.440185546875, + "logps/rejected": -157.8077880859375, + "loss": 0.5003787040710449, + "rewards/chosen": 0.021105077862739564, + "rewards/margins": -0.003029544651508332, + "rewards/rejected": 0.024134622514247896, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 0.37256330251693726, + "kl": 0.25889211893081665, + "learning_rate": 1.4900000000000001e-06, + "logits/chosen": 41140886.4, + "logits/rejected": 42385772.8, + "logps/chosen": -130.3114013671875, + "logps/rejected": -117.353369140625, + "loss": 0.49920454025268557, + "rewards/chosen": 0.02739974558353424, + "rewards/margins": 0.006363460421562196, + "rewards/rejected": 0.021036285161972045, + "step": 150 + }, + { + "epoch": 0.064, + "grad_norm": 0.38222262263298035, + "kl": 0.3707125782966614, + "learning_rate": 1.5900000000000002e-06, + "logits/chosen": 34721360.0, + "logits/rejected": 35081372.8, + "logps/chosen": -143.524462890625, + "logps/rejected": -147.06605224609376, + "loss": 0.49993181228637695, + "rewards/chosen": 0.03622217178344726, + "rewards/margins": 0.0005454152822494465, + "rewards/rejected": 0.035676756501197816, + "step": 160 + }, + { + "epoch": 0.068, + "grad_norm": 0.3260433077812195, + "kl": 0.4656868577003479, + "learning_rate": 1.6900000000000003e-06, + "logits/chosen": 37774179.2, + "logits/rejected": 39969705.6, + "logps/chosen": -138.2024658203125, + "logps/rejected": -149.75228271484374, + "loss": 0.4999128818511963, + "rewards/chosen": 0.04691722691059112, + "rewards/margins": 0.0006973743438720703, + "rewards/rejected": 0.04621985256671905, + "step": 170 + }, + { + "epoch": 0.072, + "grad_norm": 0.47040465474128723, + "kl": 0.6031174659729004, + "learning_rate": 1.79e-06, + "logits/chosen": 44058707.2, + "logits/rejected": 45027283.2, + "logps/chosen": -145.10814208984374, + "logps/rejected": -170.75323486328125, + "loss": 0.5000656604766845, + "rewards/chosen": 0.060049277544021604, + "rewards/margins": -0.0005249440670013483, + "rewards/rejected": 0.06057422161102295, + "step": 180 + }, + { + "epoch": 0.076, + "grad_norm": 0.3483351767063141, + "kl": 0.5900982022285461, + "learning_rate": 1.8900000000000001e-06, + "logits/chosen": 41571993.6, + "logits/rejected": 43139596.8, + "logps/chosen": -124.036376953125, + "logps/rejected": -137.070751953125, + "loss": 0.5001267910003662, + "rewards/chosen": 0.05850306153297424, + "rewards/margins": -0.0010134875774383545, + "rewards/rejected": 0.0595165491104126, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 0.47156763076782227, + "kl": 0.654812753200531, + "learning_rate": 1.9900000000000004e-06, + "logits/chosen": 46899654.4, + "logits/rejected": 48147660.8, + "logps/chosen": -150.88424072265624, + "logps/rejected": -172.6162109375, + "loss": 0.4997762680053711, + "rewards/chosen": 0.06637628674507141, + "rewards/margins": 0.001789605617523199, + "rewards/rejected": 0.06458668112754821, + "step": 200 + }, + { + "epoch": 0.08, + "eval_kl": 0.4785654842853546, + "eval_logits/chosen": 39006478.336, + "eval_logits/rejected": 38887682.048, + "eval_logps/chosen": -153.8359375, + "eval_logps/rejected": -148.1899375, + "eval_loss": 0.49953681230545044, + "eval_rewards/chosen": 0.04898439407348633, + "eval_rewards/margins": 0.00370624542236328, + "eval_rewards/rejected": 0.04527814865112305, + "eval_runtime": 217.7826, + "eval_samples_per_second": 4.592, + "eval_steps_per_second": 2.296, + "step": 200 + }, + { + "epoch": 0.084, + "grad_norm": 0.3955392837524414, + "kl": 0.3641941249370575, + "learning_rate": 2.09e-06, + "logits/chosen": 34072531.2, + "logits/rejected": 34148444.8, + "logps/chosen": -140.24954833984376, + "logps/rejected": -132.308837890625, + "loss": 0.5002529621124268, + "rewards/chosen": 0.02951604127883911, + "rewards/margins": -0.002015212178230287, + "rewards/rejected": 0.0315312534570694, + "step": 210 + }, + { + "epoch": 0.088, + "grad_norm": 0.3798522651195526, + "kl": 0.2939055263996124, + "learning_rate": 2.19e-06, + "logits/chosen": 35659238.4, + "logits/rejected": 36517523.2, + "logps/chosen": -101.61099853515626, + "logps/rejected": -126.22640380859374, + "loss": 0.500004768371582, + "rewards/chosen": 0.023823246359825134, + "rewards/margins": -4.143416881561418e-05, + "rewards/rejected": 0.02386468052864075, + "step": 220 + }, + { + "epoch": 0.092, + "grad_norm": 0.4183703660964966, + "kl": 0.16193707287311554, + "learning_rate": 2.29e-06, + "logits/chosen": 47875980.8, + "logits/rejected": 46433056.0, + "logps/chosen": -185.33223876953124, + "logps/rejected": -163.7679443359375, + "loss": 0.5001229286193848, + "rewards/chosen": 0.002850056067109108, + "rewards/margins": -0.0010004475712776183, + "rewards/rejected": 0.003850503638386726, + "step": 230 + }, + { + "epoch": 0.096, + "grad_norm": 0.41121652722358704, + "kl": 0.2479170858860016, + "learning_rate": 2.39e-06, + "logits/chosen": 48178169.6, + "logits/rejected": 48277104.0, + "logps/chosen": -176.87537841796876, + "logps/rejected": -166.7927978515625, + "loss": 0.5001242637634278, + "rewards/chosen": 0.015346670150756836, + "rewards/margins": -0.0010146483778953556, + "rewards/rejected": 0.01636131852865219, + "step": 240 + }, + { + "epoch": 0.1, + "grad_norm": 0.49889811873435974, + "kl": 0.2833125591278076, + "learning_rate": 2.4900000000000003e-06, + "logits/chosen": 37073097.6, + "logits/rejected": 35158246.4, + "logps/chosen": -176.40118408203125, + "logps/rejected": -145.24967041015626, + "loss": 0.49751877784729004, + "rewards/chosen": 0.03347367346286774, + "rewards/margins": 0.01985820829868317, + "rewards/rejected": 0.01361546516418457, + "step": 250 + }, + { + "epoch": 0.104, + "grad_norm": 0.30958813428878784, + "kl": 0.262741357088089, + "learning_rate": 2.59e-06, + "logits/chosen": 30595280.0, + "logits/rejected": 29650486.4, + "logps/chosen": -129.1654296875, + "logps/rejected": -131.6908935546875, + "loss": 0.4989294528961182, + "rewards/chosen": 0.02159818708896637, + "rewards/margins": 0.008565258979797364, + "rewards/rejected": 0.013032928109169006, + "step": 260 + }, + { + "epoch": 0.108, + "grad_norm": 0.5211781859397888, + "kl": 0.45164403319358826, + "learning_rate": 2.6900000000000005e-06, + "logits/chosen": 39772844.8, + "logits/rejected": 40001942.4, + "logps/chosen": -157.1872314453125, + "logps/rejected": -151.93499755859375, + "loss": 0.498414421081543, + "rewards/chosen": 0.04374273419380188, + "rewards/margins": 0.012685334682464602, + "rewards/rejected": 0.03105739951133728, + "step": 270 + }, + { + "epoch": 0.112, + "grad_norm": 0.4277898073196411, + "kl": 0.4805964529514313, + "learning_rate": 2.7900000000000004e-06, + "logits/chosen": 33355596.8, + "logits/rejected": 27445552.0, + "logps/chosen": -167.81568603515626, + "logps/rejected": -140.00860595703125, + "loss": 0.5012622356414795, + "rewards/chosen": 0.035515934228897095, + "rewards/margins": -0.010110187530517581, + "rewards/rejected": 0.045626121759414676, + "step": 280 + }, + { + "epoch": 0.116, + "grad_norm": 0.34256765246391296, + "kl": 0.7766927480697632, + "learning_rate": 2.89e-06, + "logits/chosen": 31572070.4, + "logits/rejected": 30722460.8, + "logps/chosen": -145.83292236328126, + "logps/rejected": -147.77294921875, + "loss": 0.4969996452331543, + "rewards/chosen": 0.08674753904342651, + "rewards/margins": 0.024010515213012687, + "rewards/rejected": 0.06273702383041382, + "step": 290 + }, + { + "epoch": 0.12, + "grad_norm": 0.5291184782981873, + "kl": 0.9063084721565247, + "learning_rate": 2.99e-06, + "logits/chosen": 42822204.8, + "logits/rejected": 43076217.6, + "logps/chosen": -168.21903076171876, + "logps/rejected": -179.7796630859375, + "loss": 0.49880061149597166, + "rewards/chosen": 0.07154507040977479, + "rewards/margins": 0.009801769256591805, + "rewards/rejected": 0.06174330115318298, + "step": 300 + }, + { + "epoch": 0.124, + "grad_norm": 0.37273386120796204, + "kl": 0.7940840125083923, + "learning_rate": 3.09e-06, + "logits/chosen": 34380668.8, + "logits/rejected": 35391296.0, + "logps/chosen": -146.24031982421874, + "logps/rejected": -155.27100830078126, + "loss": 0.501917552947998, + "rewards/chosen": 0.06237313747406006, + "rewards/margins": -0.015355908870697023, + "rewards/rejected": 0.07772904634475708, + "step": 310 + }, + { + "epoch": 0.128, + "grad_norm": 0.606876015663147, + "kl": 0.3188292682170868, + "learning_rate": 3.1900000000000004e-06, + "logits/chosen": 38069395.2, + "logits/rejected": 35962320.0, + "logps/chosen": -160.331884765625, + "logps/rejected": -134.08292236328126, + "loss": 0.5004048347473145, + "rewards/chosen": -0.0012070264667272568, + "rewards/margins": -0.0032936643809080126, + "rewards/rejected": 0.0020866379141807555, + "step": 320 + }, + { + "epoch": 0.132, + "grad_norm": 0.37304550409317017, + "kl": 0.49732810258865356, + "learning_rate": 3.2900000000000003e-06, + "logits/chosen": 47658496.0, + "logits/rejected": 44939072.0, + "logps/chosen": -153.3678955078125, + "logps/rejected": -144.11009521484374, + "loss": 0.49539766311645506, + "rewards/chosen": 0.047433477640151975, + "rewards/margins": 0.036879205703735346, + "rewards/rejected": 0.010554271936416625, + "step": 330 + }, + { + "epoch": 0.136, + "grad_norm": 0.39304211735725403, + "kl": 0.6668508052825928, + "learning_rate": 3.3900000000000006e-06, + "logits/chosen": 40542966.4, + "logits/rejected": 41501622.4, + "logps/chosen": -134.35115966796874, + "logps/rejected": -143.7405029296875, + "loss": 0.500092887878418, + "rewards/chosen": 0.05502796769142151, + "rewards/margins": -0.0008756637573242229, + "rewards/rejected": 0.05590363144874573, + "step": 340 + }, + { + "epoch": 0.14, + "grad_norm": 0.40085893869400024, + "kl": 1.0664705038070679, + "learning_rate": 3.49e-06, + "logits/chosen": 38259638.4, + "logits/rejected": 37112304.0, + "logps/chosen": -134.30029296875, + "logps/rejected": -169.855859375, + "loss": 0.501033353805542, + "rewards/chosen": 0.09948489665985108, + "rewards/margins": -0.008264517784118644, + "rewards/rejected": 0.10774941444396972, + "step": 350 + }, + { + "epoch": 0.144, + "grad_norm": 0.3823564350605011, + "kl": 0.684799075126648, + "learning_rate": 3.5900000000000004e-06, + "logits/chosen": 41781318.4, + "logits/rejected": 39330675.2, + "logps/chosen": -147.53521728515625, + "logps/rejected": -112.3668701171875, + "loss": 0.4975080966949463, + "rewards/chosen": 0.07126325964927674, + "rewards/margins": 0.019923430681228642, + "rewards/rejected": 0.0513398289680481, + "step": 360 + }, + { + "epoch": 0.148, + "grad_norm": 0.3882247507572174, + "kl": 0.9012428522109985, + "learning_rate": 3.6900000000000002e-06, + "logits/chosen": 38998243.2, + "logits/rejected": 39052438.4, + "logps/chosen": -146.253564453125, + "logps/rejected": -150.4812744140625, + "loss": 0.5019874572753906, + "rewards/chosen": 0.07476127743721009, + "rewards/margins": -0.01590984463691711, + "rewards/rejected": 0.0906711220741272, + "step": 370 + }, + { + "epoch": 0.152, + "grad_norm": 0.6131926774978638, + "kl": 1.3055822849273682, + "learning_rate": 3.79e-06, + "logits/chosen": 34907878.4, + "logits/rejected": 35887766.4, + "logps/chosen": -155.17113037109374, + "logps/rejected": -166.9649658203125, + "loss": 0.501701831817627, + "rewards/chosen": 0.11749210357666015, + "rewards/margins": -0.013779759407043457, + "rewards/rejected": 0.1312718629837036, + "step": 380 + }, + { + "epoch": 0.156, + "grad_norm": 0.40174201130867004, + "kl": 1.3810280561447144, + "learning_rate": 3.89e-06, + "logits/chosen": 32044432.0, + "logits/rejected": 31293644.8, + "logps/chosen": -174.93670654296875, + "logps/rejected": -149.1033935546875, + "loss": 0.49690823554992675, + "rewards/chosen": 0.14782886505126952, + "rewards/margins": 0.024736273288726796, + "rewards/rejected": 0.12309259176254272, + "step": 390 + }, + { + "epoch": 0.16, + "grad_norm": 0.38557207584381104, + "kl": 1.346494197845459, + "learning_rate": 3.990000000000001e-06, + "logits/chosen": 43779216.0, + "logits/rejected": 41962329.6, + "logps/chosen": -148.337939453125, + "logps/rejected": -134.69471435546876, + "loss": 0.49627056121826174, + "rewards/chosen": 0.14792776107788086, + "rewards/margins": 0.029889833927154538, + "rewards/rejected": 0.11803792715072632, + "step": 400 + }, + { + "epoch": 0.16, + "eval_kl": 1.5156359672546387, + "eval_logits/chosen": 39535529.984, + "eval_logits/rejected": 39357890.56, + "eval_logps/chosen": -152.728359375, + "eval_logps/rejected": -147.235078125, + "eval_loss": 0.4976339638233185, + "eval_rewards/chosen": 0.15974119567871095, + "eval_rewards/margins": 0.01897695922851564, + "eval_rewards/rejected": 0.1407642364501953, + "eval_runtime": 217.4122, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 2.3, + "step": 400 + }, + { + "epoch": 0.164, + "grad_norm": 0.3281092047691345, + "kl": 1.7389549016952515, + "learning_rate": 4.09e-06, + "logits/chosen": 44133203.2, + "logits/rejected": 42124723.2, + "logps/chosen": -146.7071044921875, + "logps/rejected": -147.72525634765626, + "loss": 0.4964505672454834, + "rewards/chosen": 0.1881537079811096, + "rewards/margins": 0.028516340255737294, + "rewards/rejected": 0.15963736772537232, + "step": 410 + }, + { + "epoch": 0.168, + "grad_norm": 0.5361565947532654, + "kl": 1.5194333791732788, + "learning_rate": 4.1900000000000005e-06, + "logits/chosen": 35550390.4, + "logits/rejected": 33523052.8, + "logps/chosen": -161.477978515625, + "logps/rejected": -130.558740234375, + "loss": 0.4951611518859863, + "rewards/chosen": 0.16986674070358276, + "rewards/margins": 0.0387694001197815, + "rewards/rejected": 0.13109734058380126, + "step": 420 + }, + { + "epoch": 0.172, + "grad_norm": 0.3820321559906006, + "kl": 2.423292875289917, + "learning_rate": 4.2900000000000004e-06, + "logits/chosen": 38340924.8, + "logits/rejected": 34324185.6, + "logps/chosen": -168.2252685546875, + "logps/rejected": -117.912451171875, + "loss": 0.4928645133972168, + "rewards/chosen": 0.2708438873291016, + "rewards/margins": 0.05714957714080812, + "rewards/rejected": 0.21369431018829346, + "step": 430 + }, + { + "epoch": 0.176, + "grad_norm": 0.5985101461410522, + "kl": 3.3684749603271484, + "learning_rate": 4.39e-06, + "logits/chosen": 43800515.2, + "logits/rejected": 45902227.2, + "logps/chosen": -145.99703369140624, + "logps/rejected": -166.97867431640626, + "loss": 0.5005404472351074, + "rewards/chosen": 0.3345966339111328, + "rewards/margins": -0.004501628875732411, + "rewards/rejected": 0.3390982627868652, + "step": 440 + }, + { + "epoch": 0.18, + "grad_norm": 0.38066738843917847, + "kl": 3.7418315410614014, + "learning_rate": 4.49e-06, + "logits/chosen": 35791923.2, + "logits/rejected": 39352179.2, + "logps/chosen": -95.49920654296875, + "logps/rejected": -151.85885009765624, + "loss": 0.5004417419433593, + "rewards/chosen": 0.3723719596862793, + "rewards/margins": -0.00362257957458495, + "rewards/rejected": 0.37599453926086424, + "step": 450 + }, + { + "epoch": 0.184, + "grad_norm": 0.44641122221946716, + "kl": 4.462111473083496, + "learning_rate": 4.590000000000001e-06, + "logits/chosen": 47153696.0, + "logits/rejected": 45399382.4, + "logps/chosen": -160.3977294921875, + "logps/rejected": -176.209765625, + "loss": 0.5064189434051514, + "rewards/chosen": 0.420426607131958, + "rewards/margins": -0.05156884193420408, + "rewards/rejected": 0.4719954490661621, + "step": 460 + }, + { + "epoch": 0.188, + "grad_norm": 0.8194193840026855, + "kl": 3.437168836593628, + "learning_rate": 4.69e-06, + "logits/chosen": 56483507.2, + "logits/rejected": 53677011.2, + "logps/chosen": -170.1270751953125, + "logps/rejected": -174.05128173828126, + "loss": 0.4986457824707031, + "rewards/chosen": 0.3491526126861572, + "rewards/margins": 0.010871815681457508, + "rewards/rejected": 0.3382807970046997, + "step": 470 + }, + { + "epoch": 0.192, + "grad_norm": 0.4502066373825073, + "kl": 2.8006443977355957, + "learning_rate": 4.79e-06, + "logits/chosen": 44358038.4, + "logits/rejected": 43814537.6, + "logps/chosen": -149.608935546875, + "logps/rejected": -158.89376220703124, + "loss": 0.49710774421691895, + "rewards/chosen": 0.29175291061401365, + "rewards/margins": 0.02337703704833982, + "rewards/rejected": 0.26837587356567383, + "step": 480 + }, + { + "epoch": 0.196, + "grad_norm": 0.3172740638256073, + "kl": 2.634169816970825, + "learning_rate": 4.890000000000001e-06, + "logits/chosen": 30142633.6, + "logits/rejected": 28152640.0, + "logps/chosen": -146.2323974609375, + "logps/rejected": -144.8813720703125, + "loss": 0.49065570831298827, + "rewards/chosen": 0.30094659328460693, + "rewards/margins": 0.07505896091461181, + "rewards/rejected": 0.22588763236999512, + "step": 490 + }, + { + "epoch": 0.2, + "grad_norm": 0.5071095824241638, + "kl": 4.3001179695129395, + "learning_rate": 4.9900000000000005e-06, + "logits/chosen": 45352835.2, + "logits/rejected": 41344652.8, + "logps/chosen": -177.9995361328125, + "logps/rejected": -128.72022705078126, + "loss": 0.48739986419677733, + "rewards/chosen": 0.48075294494628906, + "rewards/margins": 0.10148224830627439, + "rewards/rejected": 0.37927069664001467, + "step": 500 + }, + { + "epoch": 0.204, + "grad_norm": 0.393530011177063, + "kl": 3.5296618938446045, + "learning_rate": 4.9900000000000005e-06, + "logits/chosen": 47881750.4, + "logits/rejected": 49526156.8, + "logps/chosen": -143.6640625, + "logps/rejected": -156.88994140625, + "loss": 0.49992995262145995, + "rewards/chosen": 0.3532871723175049, + "rewards/margins": 0.0006417512893676647, + "rewards/rejected": 0.35264542102813723, + "step": 510 + }, + { + "epoch": 0.208, + "grad_norm": 0.3692869544029236, + "kl": 4.48037576675415, + "learning_rate": 4.978888888888889e-06, + "logits/chosen": 46837849.6, + "logits/rejected": 45857177.6, + "logps/chosen": -154.83260498046874, + "logps/rejected": -160.442333984375, + "loss": 0.495820426940918, + "rewards/chosen": 0.464794921875, + "rewards/margins": 0.03351507186889646, + "rewards/rejected": 0.4312798500061035, + "step": 520 + }, + { + "epoch": 0.212, + "grad_norm": 0.44535931944847107, + "kl": 4.121534824371338, + "learning_rate": 4.967777777777778e-06, + "logits/chosen": 42945225.6, + "logits/rejected": 43357875.2, + "logps/chosen": -138.18310546875, + "logps/rejected": -172.6733154296875, + "loss": 0.5080226421356201, + "rewards/chosen": 0.3798489570617676, + "rewards/margins": -0.06460924148559571, + "rewards/rejected": 0.4444581985473633, + "step": 530 + }, + { + "epoch": 0.216, + "grad_norm": 0.5619053840637207, + "kl": 4.352797031402588, + "learning_rate": 4.956666666666667e-06, + "logits/chosen": 34937552.0, + "logits/rejected": 34883318.4, + "logps/chosen": -147.39837646484375, + "logps/rejected": -154.47596435546876, + "loss": 0.49129457473754884, + "rewards/chosen": 0.4700439929962158, + "rewards/margins": 0.0695285320281982, + "rewards/rejected": 0.4005154609680176, + "step": 540 + }, + { + "epoch": 0.22, + "grad_norm": 0.4256366193294525, + "kl": 3.3400237560272217, + "learning_rate": 4.945555555555557e-06, + "logits/chosen": 41670598.4, + "logits/rejected": 43236768.0, + "logps/chosen": -152.20511474609376, + "logps/rejected": -165.210205078125, + "loss": 0.4960598945617676, + "rewards/chosen": 0.3506686449050903, + "rewards/margins": 0.03333282470703125, + "rewards/rejected": 0.31733582019805906, + "step": 550 + }, + { + "epoch": 0.224, + "grad_norm": 0.42866551876068115, + "kl": 3.0413570404052734, + "learning_rate": 4.934444444444445e-06, + "logits/chosen": 36545302.4, + "logits/rejected": 34813177.6, + "logps/chosen": -161.16314697265625, + "logps/rejected": -148.3569091796875, + "loss": 0.4982303619384766, + "rewards/chosen": 0.2904952049255371, + "rewards/margins": 0.014188337326049794, + "rewards/rejected": 0.2763068675994873, + "step": 560 + }, + { + "epoch": 0.228, + "grad_norm": 0.3665854334831238, + "kl": 2.66752290725708, + "learning_rate": 4.923333333333334e-06, + "logits/chosen": 41975648.0, + "logits/rejected": 40743257.6, + "logps/chosen": -147.2247802734375, + "logps/rejected": -131.81417236328124, + "loss": 0.4888582706451416, + "rewards/chosen": 0.3010892391204834, + "rewards/margins": 0.08992741107940674, + "rewards/rejected": 0.21116182804107667, + "step": 570 + }, + { + "epoch": 0.232, + "grad_norm": 0.42764145135879517, + "kl": 2.8396944999694824, + "learning_rate": 4.912222222222223e-06, + "logits/chosen": 47665238.4, + "logits/rejected": 46761827.2, + "logps/chosen": -147.21837158203124, + "logps/rejected": -156.8475830078125, + "loss": 0.4951943874359131, + "rewards/chosen": 0.2681096315383911, + "rewards/margins": 0.03818519115447999, + "rewards/rejected": 0.22992444038391113, + "step": 580 + }, + { + "epoch": 0.236, + "grad_norm": 0.45218735933303833, + "kl": 2.9479668140411377, + "learning_rate": 4.901111111111112e-06, + "logits/chosen": 30179158.4, + "logits/rejected": 30914195.2, + "logps/chosen": -128.025927734375, + "logps/rejected": -133.37138671875, + "loss": 0.4864190101623535, + "rewards/chosen": 0.3341956615447998, + "rewards/margins": 0.1105940818786621, + "rewards/rejected": 0.2236015796661377, + "step": 590 + }, + { + "epoch": 0.24, + "grad_norm": 0.5611497759819031, + "kl": 2.7078356742858887, + "learning_rate": 4.890000000000001e-06, + "logits/chosen": 29134601.6, + "logits/rejected": 31641536.0, + "logps/chosen": -147.10302734375, + "logps/rejected": -148.70350341796876, + "loss": 0.5095005035400391, + "rewards/chosen": 0.1935347557067871, + "rewards/margins": -0.07780742645263675, + "rewards/rejected": 0.27134218215942385, + "step": 600 + }, + { + "epoch": 0.24, + "eval_kl": 2.6268301010131836, + "eval_logits/chosen": 38577520.64, + "eval_logits/rejected": 38429237.248, + "eval_logps/chosen": -151.8366875, + "eval_logps/rejected": -146.522453125, + "eval_loss": 0.49543091654777527, + "eval_rewards/chosen": 0.24890777587890625, + "eval_rewards/margins": 0.036881546020507805, + "eval_rewards/rejected": 0.21202622985839845, + "eval_runtime": 216.8269, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 2.306, + "step": 600 + }, + { + "epoch": 0.244, + "grad_norm": 0.43945616483688354, + "kl": 3.3616116046905518, + "learning_rate": 4.878888888888889e-06, + "logits/chosen": 45687324.8, + "logits/rejected": 41788883.2, + "logps/chosen": -193.040185546875, + "logps/rejected": -168.3222900390625, + "loss": 0.48148174285888673, + "rewards/chosen": 0.35201478004455566, + "rewards/margins": 0.17267082929611205, + "rewards/rejected": 0.17934395074844361, + "step": 610 + }, + { + "epoch": 0.248, + "grad_norm": 0.47497129440307617, + "kl": 3.055345058441162, + "learning_rate": 4.867777777777778e-06, + "logits/chosen": 27710918.4, + "logits/rejected": 26318662.4, + "logps/chosen": -138.644775390625, + "logps/rejected": -134.76346435546876, + "loss": 0.4866151809692383, + "rewards/chosen": 0.3442774772644043, + "rewards/margins": 0.10848057270050046, + "rewards/rejected": 0.23579690456390381, + "step": 620 + }, + { + "epoch": 0.252, + "grad_norm": 0.4793793559074402, + "kl": 3.6866455078125, + "learning_rate": 4.856666666666667e-06, + "logits/chosen": 39017129.6, + "logits/rejected": 41635366.4, + "logps/chosen": -139.60113525390625, + "logps/rejected": -171.87177734375, + "loss": 0.5033475399017334, + "rewards/chosen": 0.33596067428588866, + "rewards/margins": -0.02700204849243165, + "rewards/rejected": 0.3629627227783203, + "step": 630 + }, + { + "epoch": 0.256, + "grad_norm": 0.5821816921234131, + "kl": 3.2494399547576904, + "learning_rate": 4.845555555555556e-06, + "logits/chosen": 41812588.8, + "logits/rejected": 40030470.4, + "logps/chosen": -188.748583984375, + "logps/rejected": -149.2949951171875, + "loss": 0.4903052806854248, + "rewards/chosen": 0.3278029918670654, + "rewards/margins": 0.08044664859771727, + "rewards/rejected": 0.24735634326934813, + "step": 640 + }, + { + "epoch": 0.26, + "grad_norm": 0.4971711337566376, + "kl": 3.889043092727661, + "learning_rate": 4.834444444444445e-06, + "logits/chosen": 43703507.2, + "logits/rejected": 42196211.2, + "logps/chosen": -125.8577392578125, + "logps/rejected": -132.8236328125, + "loss": 0.4952712535858154, + "rewards/chosen": 0.3801560878753662, + "rewards/margins": 0.03659126758575437, + "rewards/rejected": 0.3435648202896118, + "step": 650 + }, + { + "epoch": 0.264, + "grad_norm": 0.37012672424316406, + "kl": 2.9793968200683594, + "learning_rate": 4.8233333333333335e-06, + "logits/chosen": 57068806.4, + "logits/rejected": 55593145.6, + "logps/chosen": -168.82774658203124, + "logps/rejected": -132.77369384765626, + "loss": 0.4897792339324951, + "rewards/chosen": 0.3094865083694458, + "rewards/margins": 0.08172969818115233, + "rewards/rejected": 0.22775681018829347, + "step": 660 + }, + { + "epoch": 0.268, + "grad_norm": 0.5025138258934021, + "kl": 3.803910493850708, + "learning_rate": 4.812222222222222e-06, + "logits/chosen": 42714249.6, + "logits/rejected": 43013395.2, + "logps/chosen": -180.80955810546874, + "logps/rejected": -182.71279296875, + "loss": 0.48125367164611815, + "rewards/chosen": 0.4121575832366943, + "rewards/margins": 0.15600955486297607, + "rewards/rejected": 0.25614802837371825, + "step": 670 + }, + { + "epoch": 0.272, + "grad_norm": 0.47364944219589233, + "kl": 2.744506597518921, + "learning_rate": 4.801111111111111e-06, + "logits/chosen": 41662313.6, + "logits/rejected": 40533548.8, + "logps/chosen": -143.998193359375, + "logps/rejected": -125.735693359375, + "loss": 0.493405818939209, + "rewards/chosen": 0.2640446662902832, + "rewards/margins": 0.05262007713317873, + "rewards/rejected": 0.2114245891571045, + "step": 680 + }, + { + "epoch": 0.276, + "grad_norm": 0.378603994846344, + "kl": 4.192839622497559, + "learning_rate": 4.79e-06, + "logits/chosen": 42741641.6, + "logits/rejected": 40729804.8, + "logps/chosen": -156.440087890625, + "logps/rejected": -175.48450927734376, + "loss": 0.4940618991851807, + "rewards/chosen": 0.42174320220947265, + "rewards/margins": 0.04788670539855955, + "rewards/rejected": 0.3738564968109131, + "step": 690 + }, + { + "epoch": 0.28, + "grad_norm": 0.4181530773639679, + "kl": 3.237612247467041, + "learning_rate": 4.778888888888889e-06, + "logits/chosen": 38295142.4, + "logits/rejected": 35257382.4, + "logps/chosen": -155.112744140625, + "logps/rejected": -143.0890869140625, + "loss": 0.49517078399658204, + "rewards/chosen": 0.2904268026351929, + "rewards/margins": 0.041890859603881836, + "rewards/rejected": 0.24853594303131105, + "step": 700 + }, + { + "epoch": 0.284, + "grad_norm": 0.41070079803466797, + "kl": 4.423883438110352, + "learning_rate": 4.767777777777778e-06, + "logits/chosen": 40968300.8, + "logits/rejected": 39222742.4, + "logps/chosen": -172.097021484375, + "logps/rejected": -133.7148193359375, + "loss": 0.4814108371734619, + "rewards/chosen": 0.5093639373779297, + "rewards/margins": 0.1534171581268311, + "rewards/rejected": 0.35594677925109863, + "step": 710 + }, + { + "epoch": 0.288, + "grad_norm": 0.374776691198349, + "kl": 3.082357406616211, + "learning_rate": 4.756666666666667e-06, + "logits/chosen": 34210566.4, + "logits/rejected": 35579948.8, + "logps/chosen": -120.33433837890625, + "logps/rejected": -122.443017578125, + "loss": 0.5063377857208252, + "rewards/chosen": 0.1743820548057556, + "rewards/margins": -0.07105478048324584, + "rewards/rejected": 0.24543683528900145, + "step": 720 + }, + { + "epoch": 0.292, + "grad_norm": 0.3790406882762909, + "kl": 3.781698226928711, + "learning_rate": 4.745555555555556e-06, + "logits/chosen": 37423852.8, + "logits/rejected": 34581126.4, + "logps/chosen": -152.25120849609374, + "logps/rejected": -154.15382080078126, + "loss": 0.493280029296875, + "rewards/chosen": 0.39404921531677245, + "rewards/margins": 0.05465142726898192, + "rewards/rejected": 0.3393977880477905, + "step": 730 + }, + { + "epoch": 0.296, + "grad_norm": 0.47025066614151, + "kl": 4.019077301025391, + "learning_rate": 4.734444444444445e-06, + "logits/chosen": 34749833.6, + "logits/rejected": 33696905.6, + "logps/chosen": -156.05477294921874, + "logps/rejected": -182.29608154296875, + "loss": 0.49593114852905273, + "rewards/chosen": 0.34479031562805174, + "rewards/margins": 0.04153461456298824, + "rewards/rejected": 0.3032557010650635, + "step": 740 + }, + { + "epoch": 0.3, + "grad_norm": 0.2891245484352112, + "kl": 2.7223880290985107, + "learning_rate": 4.7233333333333336e-06, + "logits/chosen": 39233993.6, + "logits/rejected": 38955433.6, + "logps/chosen": -159.49801025390624, + "logps/rejected": -183.92698974609374, + "loss": 0.5023125648498535, + "rewards/chosen": 0.09982055425643921, + "rewards/margins": 0.004025018215179449, + "rewards/rejected": 0.09579553604125976, + "step": 750 + }, + { + "epoch": 0.304, + "grad_norm": 0.4833555817604065, + "kl": 1.6150490045547485, + "learning_rate": 4.712222222222222e-06, + "logits/chosen": 36941795.2, + "logits/rejected": 37837513.6, + "logps/chosen": -115.98502197265626, + "logps/rejected": -125.23421630859374, + "loss": 0.487321662902832, + "rewards/chosen": 0.10154855251312256, + "rewards/margins": 0.10329384654760361, + "rewards/rejected": -0.0017452940344810485, + "step": 760 + }, + { + "epoch": 0.308, + "grad_norm": 0.48121944069862366, + "kl": 1.2522486448287964, + "learning_rate": 4.701111111111111e-06, + "logits/chosen": 44757888.0, + "logits/rejected": 45581593.6, + "logps/chosen": -143.2324951171875, + "logps/rejected": -153.2897216796875, + "loss": 0.4856001377105713, + "rewards/chosen": 0.008315862715244293, + "rewards/margins": 0.1255118027329445, + "rewards/rejected": -0.1171959400177002, + "step": 770 + }, + { + "epoch": 0.312, + "grad_norm": 0.4913221001625061, + "kl": 0.6294690370559692, + "learning_rate": 4.69e-06, + "logits/chosen": 41557488.0, + "logits/rejected": 42490796.8, + "logps/chosen": -163.6885498046875, + "logps/rejected": -145.68963623046875, + "loss": 0.48305044174194334, + "rewards/chosen": -0.08929510116577148, + "rewards/margins": 0.14230823516845703, + "rewards/rejected": -0.23160333633422853, + "step": 780 + }, + { + "epoch": 0.316, + "grad_norm": 0.4478524327278137, + "kl": 1.0363415479660034, + "learning_rate": 4.67888888888889e-06, + "logits/chosen": 31436777.6, + "logits/rejected": 27623001.6, + "logps/chosen": -147.31231689453125, + "logps/rejected": -139.38282470703126, + "loss": 0.478118371963501, + "rewards/chosen": -0.18460922241210936, + "rewards/margins": 0.20353126525878906, + "rewards/rejected": -0.3881404876708984, + "step": 790 + }, + { + "epoch": 0.32, + "grad_norm": 0.5910397171974182, + "kl": 0.9226576089859009, + "learning_rate": 4.6677777777777785e-06, + "logits/chosen": 28333865.6, + "logits/rejected": 28280502.4, + "logps/chosen": -150.2372802734375, + "logps/rejected": -135.2026123046875, + "loss": 0.5036224842071533, + "rewards/chosen": -0.28971683979034424, + "rewards/margins": -0.058632898330688465, + "rewards/rejected": -0.23108394145965577, + "step": 800 + }, + { + "epoch": 0.32, + "eval_kl": 1.187766432762146, + "eval_logits/chosen": 34635034.624, + "eval_logits/rejected": 34656247.808, + "eval_logps/chosen": -156.55328125, + "eval_logps/rejected": -151.3069375, + "eval_loss": 0.4932977855205536, + "eval_rewards/chosen": -0.2227491455078125, + "eval_rewards/margins": 0.04367276000976561, + "eval_rewards/rejected": -0.2664219055175781, + "eval_runtime": 216.6151, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 800 + }, + { + "epoch": 0.324, + "grad_norm": 0.5326458811759949, + "kl": 1.5910537242889404, + "learning_rate": 4.656666666666667e-06, + "logits/chosen": 38717907.2, + "logits/rejected": 39118412.8, + "logps/chosen": -159.7564453125, + "logps/rejected": -157.7111572265625, + "loss": 0.49022369384765624, + "rewards/chosen": 0.008166373521089555, + "rewards/margins": 0.09110548570752143, + "rewards/rejected": -0.08293911218643188, + "step": 810 + }, + { + "epoch": 0.328, + "grad_norm": 0.4700395464897156, + "kl": 1.0638387203216553, + "learning_rate": 4.645555555555556e-06, + "logits/chosen": 24480648.0, + "logits/rejected": 24138777.6, + "logps/chosen": -157.56463623046875, + "logps/rejected": -129.4783935546875, + "loss": 0.5027867794036865, + "rewards/chosen": -0.3186595916748047, + "rewards/margins": -0.04232857227325437, + "rewards/rejected": -0.2763310194015503, + "step": 820 + }, + { + "epoch": 0.332, + "grad_norm": 0.5291322469711304, + "kl": 1.9198650121688843, + "learning_rate": 4.634444444444445e-06, + "logits/chosen": 31482019.2, + "logits/rejected": 30204611.2, + "logps/chosen": -160.241064453125, + "logps/rejected": -127.0677490234375, + "loss": 0.49699864387512205, + "rewards/chosen": -0.03406925797462464, + "rewards/margins": 0.020440274477005, + "rewards/rejected": -0.054509532451629636, + "step": 830 + }, + { + "epoch": 0.336, + "grad_norm": 0.5016227960586548, + "kl": 1.5567735433578491, + "learning_rate": 4.623333333333334e-06, + "logits/chosen": 39855260.8, + "logits/rejected": 39533484.8, + "logps/chosen": -151.4185302734375, + "logps/rejected": -151.45257568359375, + "loss": 0.4913910388946533, + "rewards/chosen": -0.0018289029598236085, + "rewards/margins": 0.05514721870422363, + "rewards/rejected": -0.05697612166404724, + "step": 840 + }, + { + "epoch": 0.34, + "grad_norm": 0.5913689732551575, + "kl": 2.8981661796569824, + "learning_rate": 4.6122222222222225e-06, + "logits/chosen": 39872361.6, + "logits/rejected": 38837766.4, + "logps/chosen": -168.816357421875, + "logps/rejected": -189.29534912109375, + "loss": 0.5085726261138916, + "rewards/chosen": -0.016373127698898315, + "rewards/margins": -0.10572689771652222, + "rewards/rejected": 0.0893537700176239, + "step": 850 + }, + { + "epoch": 0.344, + "grad_norm": 0.4795176088809967, + "kl": 1.9737087488174438, + "learning_rate": 4.601111111111112e-06, + "logits/chosen": 31763212.8, + "logits/rejected": 30045996.8, + "logps/chosen": -165.5231689453125, + "logps/rejected": -144.56939697265625, + "loss": 0.4861030101776123, + "rewards/chosen": -0.004905380308628082, + "rewards/margins": 0.10866030305624008, + "rewards/rejected": -0.11356568336486816, + "step": 860 + }, + { + "epoch": 0.348, + "grad_norm": 0.5342633128166199, + "kl": 0.8907498121261597, + "learning_rate": 4.590000000000001e-06, + "logits/chosen": 36357731.2, + "logits/rejected": 34329561.6, + "logps/chosen": -139.342138671875, + "logps/rejected": -147.3021728515625, + "loss": 0.4868171691894531, + "rewards/chosen": -0.3624546766281128, + "rewards/margins": 0.06923034191131588, + "rewards/rejected": -0.4316850185394287, + "step": 870 + }, + { + "epoch": 0.352, + "grad_norm": 0.4725877046585083, + "kl": 0.8254868388175964, + "learning_rate": 4.57888888888889e-06, + "logits/chosen": 31605561.6, + "logits/rejected": 30546553.6, + "logps/chosen": -167.25799560546875, + "logps/rejected": -137.0789794921875, + "loss": 0.4910862922668457, + "rewards/chosen": -0.2911639451980591, + "rewards/margins": 0.12216508388519287, + "rewards/rejected": -0.413329029083252, + "step": 880 + }, + { + "epoch": 0.356, + "grad_norm": 0.40740740299224854, + "kl": 0.5278605222702026, + "learning_rate": 4.5677777777777786e-06, + "logits/chosen": 42018956.8, + "logits/rejected": 41023542.4, + "logps/chosen": -135.9039306640625, + "logps/rejected": -137.1427001953125, + "loss": 0.48065881729125975, + "rewards/chosen": -0.33759872913360595, + "rewards/margins": 0.1944932222366333, + "rewards/rejected": -0.5320919513702392, + "step": 890 + }, + { + "epoch": 0.36, + "grad_norm": 0.506528377532959, + "kl": 0.6414504647254944, + "learning_rate": 4.556666666666667e-06, + "logits/chosen": 37310515.2, + "logits/rejected": 38113939.2, + "logps/chosen": -177.546728515625, + "logps/rejected": -165.55384521484376, + "loss": 0.4943400382995605, + "rewards/chosen": -0.6260869026184082, + "rewards/margins": -0.024873304367065363, + "rewards/rejected": -0.6012135982513428, + "step": 900 + }, + { + "epoch": 0.364, + "grad_norm": 0.4898208975791931, + "kl": 0.6840685606002808, + "learning_rate": 4.545555555555556e-06, + "logits/chosen": 31999705.6, + "logits/rejected": 32116691.2, + "logps/chosen": -168.6631591796875, + "logps/rejected": -174.8232421875, + "loss": 0.49452638626098633, + "rewards/chosen": -0.4872349739074707, + "rewards/margins": 0.17413578033447263, + "rewards/rejected": -0.6613707542419434, + "step": 910 + }, + { + "epoch": 0.368, + "grad_norm": 0.42078927159309387, + "kl": 0.8319946527481079, + "learning_rate": 4.534444444444445e-06, + "logits/chosen": 33578412.8, + "logits/rejected": 30967862.4, + "logps/chosen": -136.42371826171876, + "logps/rejected": -151.80224609375, + "loss": 0.4958657741546631, + "rewards/chosen": -0.4272448539733887, + "rewards/margins": 0.1275300025939941, + "rewards/rejected": -0.5547748565673828, + "step": 920 + }, + { + "epoch": 0.372, + "grad_norm": 0.4784943163394928, + "kl": 1.0423786640167236, + "learning_rate": 4.523333333333334e-06, + "logits/chosen": 32027404.8, + "logits/rejected": 32098508.8, + "logps/chosen": -155.27005615234376, + "logps/rejected": -149.925537109375, + "loss": 0.48459978103637696, + "rewards/chosen": -0.22732582092285156, + "rewards/margins": 0.15194621086120605, + "rewards/rejected": -0.3792720317840576, + "step": 930 + }, + { + "epoch": 0.376, + "grad_norm": 0.3698587119579315, + "kl": 1.5603997707366943, + "learning_rate": 4.512222222222223e-06, + "logits/chosen": 27671084.8, + "logits/rejected": 25976814.4, + "logps/chosen": -136.42069091796876, + "logps/rejected": -152.234033203125, + "loss": 0.4825894355773926, + "rewards/chosen": -0.22695178985595704, + "rewards/margins": 0.20094985961914064, + "rewards/rejected": -0.4279016494750977, + "step": 940 + }, + { + "epoch": 0.38, + "grad_norm": 0.40601083636283875, + "kl": 2.8003079891204834, + "learning_rate": 4.501111111111111e-06, + "logits/chosen": 40676198.4, + "logits/rejected": 43776691.2, + "logps/chosen": -183.81981201171874, + "logps/rejected": -158.6890380859375, + "loss": 0.4809588432312012, + "rewards/chosen": -0.0009134054183959961, + "rewards/margins": 0.13371984958648683, + "rewards/rejected": -0.13463325500488282, + "step": 950 + }, + { + "epoch": 0.384, + "grad_norm": 0.5407139658927917, + "kl": 2.086998224258423, + "learning_rate": 4.49e-06, + "logits/chosen": 37516128.0, + "logits/rejected": 39039443.2, + "logps/chosen": -137.39747314453126, + "logps/rejected": -150.29815673828125, + "loss": 0.5065193176269531, + "rewards/chosen": -0.06897132396697998, + "rewards/margins": -0.031788992881774905, + "rewards/rejected": -0.03718233108520508, + "step": 960 + }, + { + "epoch": 0.388, + "grad_norm": 0.4142342209815979, + "kl": 1.7635319232940674, + "learning_rate": 4.478888888888889e-06, + "logits/chosen": 32190848.0, + "logits/rejected": 30587993.6, + "logps/chosen": -135.15673828125, + "logps/rejected": -118.86663818359375, + "loss": 0.48531031608581543, + "rewards/chosen": -0.007679381966590881, + "rewards/margins": 0.15075800120830538, + "rewards/rejected": -0.15843738317489625, + "step": 970 + }, + { + "epoch": 0.392, + "grad_norm": 0.6221582889556885, + "kl": 3.294914722442627, + "learning_rate": 4.467777777777778e-06, + "logits/chosen": 39407718.4, + "logits/rejected": 39707820.8, + "logps/chosen": -138.0949462890625, + "logps/rejected": -157.43453369140624, + "loss": 0.4826664447784424, + "rewards/chosen": 0.24659197330474852, + "rewards/margins": 0.17039816975593566, + "rewards/rejected": 0.07619380354881286, + "step": 980 + }, + { + "epoch": 0.396, + "grad_norm": 0.5434563755989075, + "kl": 1.082279920578003, + "learning_rate": 4.456666666666667e-06, + "logits/chosen": 31821408.0, + "logits/rejected": 31118764.8, + "logps/chosen": -127.49896240234375, + "logps/rejected": -136.4136962890625, + "loss": 0.48195528984069824, + "rewards/chosen": -0.16623904705047607, + "rewards/margins": 0.15431931018829345, + "rewards/rejected": -0.3205583572387695, + "step": 990 + }, + { + "epoch": 0.4, + "grad_norm": 0.37851211428642273, + "kl": 1.7661035060882568, + "learning_rate": 4.4455555555555554e-06, + "logits/chosen": 31584358.4, + "logits/rejected": 32753641.6, + "logps/chosen": -159.625048828125, + "logps/rejected": -122.3124267578125, + "loss": 0.49321880340576174, + "rewards/chosen": -0.08240060806274414, + "rewards/margins": 0.045932340621948245, + "rewards/rejected": -0.1283329486846924, + "step": 1000 + }, + { + "epoch": 0.4, + "eval_kl": 1.6750891208648682, + "eval_logits/chosen": 33939980.288, + "eval_logits/rejected": 34026332.16, + "eval_logps/chosen": -156.401671875, + "eval_logps/rejected": -151.40996875, + "eval_loss": 0.4904634356498718, + "eval_rewards/chosen": -0.20758909606933593, + "eval_rewards/margins": 0.06913508605957033, + "eval_rewards/rejected": -0.27672418212890626, + "eval_runtime": 216.882, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 2.305, + "step": 1000 + }, + { + "epoch": 0.404, + "grad_norm": 0.6070407032966614, + "kl": 2.5559988021850586, + "learning_rate": 4.434444444444444e-06, + "logits/chosen": 27506348.8, + "logits/rejected": 30075392.0, + "logps/chosen": -138.08692626953126, + "logps/rejected": -187.8861083984375, + "loss": 0.4917243480682373, + "rewards/chosen": -0.13809033632278442, + "rewards/margins": 0.05099650621414184, + "rewards/rejected": -0.18908684253692626, + "step": 1010 + }, + { + "epoch": 0.408, + "grad_norm": 0.5809450149536133, + "kl": 2.151444911956787, + "learning_rate": 4.423333333333334e-06, + "logits/chosen": 28877452.8, + "logits/rejected": 28118339.2, + "logps/chosen": -154.03690185546876, + "logps/rejected": -139.09736328125, + "loss": 0.4888266086578369, + "rewards/chosen": -0.16468460559844972, + "rewards/margins": 0.13210070133209229, + "rewards/rejected": -0.296785306930542, + "step": 1020 + }, + { + "epoch": 0.412, + "grad_norm": 0.4774882197380066, + "kl": 2.590153217315674, + "learning_rate": 4.412222222222223e-06, + "logits/chosen": 33996892.8, + "logits/rejected": 33160086.4, + "logps/chosen": -154.9474609375, + "logps/rejected": -153.31983642578126, + "loss": 0.47945427894592285, + "rewards/chosen": -0.007390469312667847, + "rewards/margins": 0.16412567496299743, + "rewards/rejected": -0.17151614427566528, + "step": 1030 + }, + { + "epoch": 0.416, + "grad_norm": 0.5751529335975647, + "kl": 2.692246913909912, + "learning_rate": 4.4011111111111115e-06, + "logits/chosen": 27060502.4, + "logits/rejected": 26739710.4, + "logps/chosen": -191.5754150390625, + "logps/rejected": -115.40594482421875, + "loss": 0.504734468460083, + "rewards/chosen": -0.14675636291503907, + "rewards/margins": -0.07811862826347352, + "rewards/rejected": -0.06863773465156556, + "step": 1040 + }, + { + "epoch": 0.42, + "grad_norm": 0.5552707314491272, + "kl": 2.183612108230591, + "learning_rate": 4.39e-06, + "logits/chosen": 30119641.6, + "logits/rejected": 27840016.0, + "logps/chosen": -129.14801025390625, + "logps/rejected": -159.921337890625, + "loss": 0.4975595951080322, + "rewards/chosen": -0.11868793964385986, + "rewards/margins": 0.03744263648986816, + "rewards/rejected": -0.15613057613372802, + "step": 1050 + }, + { + "epoch": 0.424, + "grad_norm": 0.4398513436317444, + "kl": 2.019963026046753, + "learning_rate": 4.378888888888889e-06, + "logits/chosen": 39605126.4, + "logits/rejected": 37338668.8, + "logps/chosen": -173.10638427734375, + "logps/rejected": -188.400146484375, + "loss": 0.5177321434020996, + "rewards/chosen": -0.252044153213501, + "rewards/margins": -0.1198780655860901, + "rewards/rejected": -0.1321660876274109, + "step": 1060 + }, + { + "epoch": 0.428, + "grad_norm": 0.6157165765762329, + "kl": 1.4567959308624268, + "learning_rate": 4.367777777777778e-06, + "logits/chosen": 31087238.4, + "logits/rejected": 32085881.6, + "logps/chosen": -145.3509521484375, + "logps/rejected": -170.3815185546875, + "loss": 0.4887071132659912, + "rewards/chosen": -0.3054164171218872, + "rewards/margins": 0.236836838722229, + "rewards/rejected": -0.5422532558441162, + "step": 1070 + }, + { + "epoch": 0.432, + "grad_norm": 0.3502284288406372, + "kl": 0.8787339925765991, + "learning_rate": 4.356666666666667e-06, + "logits/chosen": 34486451.2, + "logits/rejected": 36169574.4, + "logps/chosen": -158.5971923828125, + "logps/rejected": -139.94036865234375, + "loss": 0.5054500579833985, + "rewards/chosen": -0.45778846740722656, + "rewards/margins": -0.09344666004180907, + "rewards/rejected": -0.3643418073654175, + "step": 1080 + }, + { + "epoch": 0.436, + "grad_norm": 0.624359667301178, + "kl": 0.525427520275116, + "learning_rate": 4.3455555555555555e-06, + "logits/chosen": 26498083.2, + "logits/rejected": 25839392.0, + "logps/chosen": -149.42689208984376, + "logps/rejected": -118.26563720703125, + "loss": 0.5076635360717774, + "rewards/chosen": -0.557512617111206, + "rewards/margins": -0.06046972274780271, + "rewards/rejected": -0.4970428943634033, + "step": 1090 + }, + { + "epoch": 0.44, + "grad_norm": 0.549114465713501, + "kl": 0.8173803091049194, + "learning_rate": 4.334444444444445e-06, + "logits/chosen": 34397792.0, + "logits/rejected": 33410729.6, + "logps/chosen": -140.214111328125, + "logps/rejected": -176.107958984375, + "loss": 0.48507490158081057, + "rewards/chosen": -0.4220071792602539, + "rewards/margins": 0.19281878471374514, + "rewards/rejected": -0.614825963973999, + "step": 1100 + }, + { + "epoch": 0.444, + "grad_norm": 0.5036312937736511, + "kl": 0.7975673675537109, + "learning_rate": 4.323333333333334e-06, + "logits/chosen": 36466489.6, + "logits/rejected": 38251465.6, + "logps/chosen": -120.687255859375, + "logps/rejected": -187.36099853515626, + "loss": 0.5016714572906494, + "rewards/chosen": -0.4559361457824707, + "rewards/margins": -0.0054581642150878795, + "rewards/rejected": -0.45047798156738283, + "step": 1110 + }, + { + "epoch": 0.448, + "grad_norm": 0.5358121395111084, + "kl": 1.3897031545639038, + "learning_rate": 4.312222222222223e-06, + "logits/chosen": 46269334.4, + "logits/rejected": 45639856.0, + "logps/chosen": -151.86192626953124, + "logps/rejected": -165.55286865234376, + "loss": 0.4728604793548584, + "rewards/chosen": -0.23922853469848632, + "rewards/margins": 0.28858757019042974, + "rewards/rejected": -0.5278161048889161, + "step": 1120 + }, + { + "epoch": 0.452, + "grad_norm": 0.5269862413406372, + "kl": 1.1441795825958252, + "learning_rate": 4.301111111111112e-06, + "logits/chosen": 35708649.6, + "logits/rejected": 34836294.4, + "logps/chosen": -183.39061279296874, + "logps/rejected": -150.405078125, + "loss": 0.4849276065826416, + "rewards/chosen": -0.40422697067260743, + "rewards/margins": 0.15558710098266598, + "rewards/rejected": -0.5598140716552734, + "step": 1130 + }, + { + "epoch": 0.456, + "grad_norm": 0.3800269067287445, + "kl": 0.8876265287399292, + "learning_rate": 4.2900000000000004e-06, + "logits/chosen": 22079545.6, + "logits/rejected": 22083444.8, + "logps/chosen": -109.09158935546876, + "logps/rejected": -147.06978759765624, + "loss": 0.4905365467071533, + "rewards/chosen": -0.43719801902770994, + "rewards/margins": 0.08943343162536621, + "rewards/rejected": -0.5266314506530761, + "step": 1140 + }, + { + "epoch": 0.46, + "grad_norm": 0.4693025052547455, + "kl": 0.49091872572898865, + "learning_rate": 4.278888888888889e-06, + "logits/chosen": 42701616.0, + "logits/rejected": 40578803.2, + "logps/chosen": -223.8361328125, + "logps/rejected": -172.5617431640625, + "loss": 0.4969001293182373, + "rewards/chosen": -0.9799749374389648, + "rewards/margins": 0.08639993667602552, + "rewards/rejected": -1.0663748741149903, + "step": 1150 + }, + { + "epoch": 0.464, + "grad_norm": 0.4174056053161621, + "kl": 0.7064284682273865, + "learning_rate": 4.267777777777778e-06, + "logits/chosen": 26652801.6, + "logits/rejected": 24801236.8, + "logps/chosen": -138.50279541015624, + "logps/rejected": -171.82225341796874, + "loss": 0.4671950817108154, + "rewards/chosen": -0.5919324398040772, + "rewards/margins": 0.4853674411773682, + "rewards/rejected": -1.0772998809814454, + "step": 1160 + }, + { + "epoch": 0.468, + "grad_norm": 0.629512369632721, + "kl": 1.179760217666626, + "learning_rate": 4.256666666666668e-06, + "logits/chosen": 28567804.8, + "logits/rejected": 29090739.2, + "logps/chosen": -140.1174560546875, + "logps/rejected": -165.07166748046876, + "loss": 0.49239435195922854, + "rewards/chosen": -0.8533164024353027, + "rewards/margins": -0.06229524612426751, + "rewards/rejected": -0.7910211563110352, + "step": 1170 + }, + { + "epoch": 0.472, + "grad_norm": 0.4868221580982208, + "kl": 0.9739119410514832, + "learning_rate": 4.2455555555555565e-06, + "logits/chosen": 30410720.0, + "logits/rejected": 28420300.8, + "logps/chosen": -140.90198974609376, + "logps/rejected": -170.1091552734375, + "loss": 0.48679437637329104, + "rewards/chosen": -0.551117992401123, + "rewards/margins": 0.2831212997436524, + "rewards/rejected": -0.8342392921447754, + "step": 1180 + }, + { + "epoch": 0.476, + "grad_norm": 0.47686856985092163, + "kl": 0.48265019059181213, + "learning_rate": 4.234444444444445e-06, + "logits/chosen": 29930240.0, + "logits/rejected": 25699152.0, + "logps/chosen": -184.52239990234375, + "logps/rejected": -187.303173828125, + "loss": 0.4651634693145752, + "rewards/chosen": -0.6415619850158691, + "rewards/margins": 0.5583641052246093, + "rewards/rejected": -1.1999260902404785, + "step": 1190 + }, + { + "epoch": 0.48, + "grad_norm": 0.49188584089279175, + "kl": 0.6842840909957886, + "learning_rate": 4.223333333333334e-06, + "logits/chosen": 30387222.4, + "logits/rejected": 28950054.4, + "logps/chosen": -146.3315673828125, + "logps/rejected": -159.7215576171875, + "loss": 0.4879584789276123, + "rewards/chosen": -0.608671236038208, + "rewards/margins": 0.14971170425415037, + "rewards/rejected": -0.7583829402923584, + "step": 1200 + }, + { + "epoch": 0.48, + "eval_kl": 0.6333972215652466, + "eval_logits/chosen": 31495737.344, + "eval_logits/rejected": 31723335.68, + "eval_logps/chosen": -160.925171875, + "eval_logps/rejected": -156.03046875, + "eval_loss": 0.48919567465782166, + "eval_rewards/chosen": -0.659940185546875, + "eval_rewards/margins": 0.07883386230468759, + "eval_rewards/rejected": -0.7387740478515625, + "eval_runtime": 217.7778, + "eval_samples_per_second": 4.592, + "eval_steps_per_second": 2.296, + "step": 1200 + }, + { + "epoch": 0.484, + "grad_norm": 0.5108934640884399, + "kl": 0.7764253616333008, + "learning_rate": 4.212222222222223e-06, + "logits/chosen": 22495624.0, + "logits/rejected": 24596646.4, + "logps/chosen": -138.95447998046876, + "logps/rejected": -143.8316162109375, + "loss": 0.5000998020172119, + "rewards/chosen": -0.806338119506836, + "rewards/margins": 0.04378585815429681, + "rewards/rejected": -0.8501239776611328, + "step": 1210 + }, + { + "epoch": 0.488, + "grad_norm": 0.5415228009223938, + "kl": 1.1236222982406616, + "learning_rate": 4.201111111111112e-06, + "logits/chosen": 31318569.6, + "logits/rejected": 28306940.8, + "logps/chosen": -183.03011474609374, + "logps/rejected": -194.442236328125, + "loss": 0.4961515426635742, + "rewards/chosen": -0.552086067199707, + "rewards/margins": 0.12537789344787598, + "rewards/rejected": -0.677463960647583, + "step": 1220 + }, + { + "epoch": 0.492, + "grad_norm": 0.4574231505393982, + "kl": 1.6093800067901611, + "learning_rate": 4.1900000000000005e-06, + "logits/chosen": 28117484.8, + "logits/rejected": 27773507.2, + "logps/chosen": -148.87581787109374, + "logps/rejected": -157.650537109375, + "loss": 0.48798060417175293, + "rewards/chosen": -0.22223844528198242, + "rewards/margins": 0.0932478666305542, + "rewards/rejected": -0.3154863119125366, + "step": 1230 + }, + { + "epoch": 0.496, + "grad_norm": 0.455790638923645, + "kl": 1.5425220727920532, + "learning_rate": 4.178888888888889e-06, + "logits/chosen": 30485878.4, + "logits/rejected": 30765398.4, + "logps/chosen": -123.1314208984375, + "logps/rejected": -120.400634765625, + "loss": 0.4938685894012451, + "rewards/chosen": -0.15957858562469482, + "rewards/margins": 0.033807253837585455, + "rewards/rejected": -0.19338583946228027, + "step": 1240 + }, + { + "epoch": 0.5, + "grad_norm": 0.5282999873161316, + "kl": 1.4079266786575317, + "learning_rate": 4.167777777777778e-06, + "logits/chosen": 22162673.6, + "logits/rejected": 22915948.8, + "logps/chosen": -113.82423095703125, + "logps/rejected": -131.29176025390626, + "loss": 0.49029102325439455, + "rewards/chosen": -0.14798271656036377, + "rewards/margins": 0.09533922672271727, + "rewards/rejected": -0.24332194328308104, + "step": 1250 + }, + { + "epoch": 0.504, + "grad_norm": 0.5007496476173401, + "kl": 1.7635902166366577, + "learning_rate": 4.156666666666667e-06, + "logits/chosen": 27436982.4, + "logits/rejected": 27643766.4, + "logps/chosen": -147.7771484375, + "logps/rejected": -167.73524169921876, + "loss": 0.46639671325683596, + "rewards/chosen": -0.29362332820892334, + "rewards/margins": 0.3400294542312622, + "rewards/rejected": -0.6336527824401855, + "step": 1260 + }, + { + "epoch": 0.508, + "grad_norm": 0.4727869927883148, + "kl": 1.2690056562423706, + "learning_rate": 4.145555555555556e-06, + "logits/chosen": 29958118.4, + "logits/rejected": 26773496.0, + "logps/chosen": -172.3375732421875, + "logps/rejected": -155.7744140625, + "loss": 0.4935513973236084, + "rewards/chosen": -0.3117243528366089, + "rewards/margins": 0.08564956188201905, + "rewards/rejected": -0.39737391471862793, + "step": 1270 + }, + { + "epoch": 0.512, + "grad_norm": 0.4609099328517914, + "kl": 1.781589150428772, + "learning_rate": 4.1344444444444446e-06, + "logits/chosen": 45966684.8, + "logits/rejected": 46560012.8, + "logps/chosen": -154.45379638671875, + "logps/rejected": -171.34287109375, + "loss": 0.49072775840759275, + "rewards/chosen": -0.026087772846221925, + "rewards/margins": 0.12879917621612547, + "rewards/rejected": -0.1548869490623474, + "step": 1280 + }, + { + "epoch": 0.516, + "grad_norm": 0.5082091093063354, + "kl": 1.657065749168396, + "learning_rate": 4.123333333333333e-06, + "logits/chosen": 27531948.8, + "logits/rejected": 28266195.2, + "logps/chosen": -133.0970947265625, + "logps/rejected": -141.93575439453124, + "loss": 0.4872725486755371, + "rewards/chosen": -0.0539365291595459, + "rewards/margins": 0.13880285024642947, + "rewards/rejected": -0.19273937940597535, + "step": 1290 + }, + { + "epoch": 0.52, + "grad_norm": 0.5041593909263611, + "kl": 2.0696568489074707, + "learning_rate": 4.112222222222222e-06, + "logits/chosen": 37664678.4, + "logits/rejected": 34784227.2, + "logps/chosen": -183.206103515625, + "logps/rejected": -145.39962158203124, + "loss": 0.48463997840881345, + "rewards/chosen": -0.03633859157562256, + "rewards/margins": 0.14437620639801027, + "rewards/rejected": -0.18071479797363282, + "step": 1300 + }, + { + "epoch": 0.524, + "grad_norm": 0.6096036434173584, + "kl": 2.0778517723083496, + "learning_rate": 4.101111111111111e-06, + "logits/chosen": 30281945.6, + "logits/rejected": 30007484.8, + "logps/chosen": -140.6286865234375, + "logps/rejected": -148.30921630859376, + "loss": 0.49010205268859863, + "rewards/chosen": -0.1496596097946167, + "rewards/margins": 0.10425436496734616, + "rewards/rejected": -0.25391397476196287, + "step": 1310 + }, + { + "epoch": 0.528, + "grad_norm": 0.3967672884464264, + "kl": 3.4023184776306152, + "learning_rate": 4.09e-06, + "logits/chosen": 38450940.8, + "logits/rejected": 36835715.2, + "logps/chosen": -149.884423828125, + "logps/rejected": -155.450439453125, + "loss": 0.455477237701416, + "rewards/chosen": 0.22590782642364501, + "rewards/margins": 0.39357452392578124, + "rewards/rejected": -0.16766669750213622, + "step": 1320 + }, + { + "epoch": 0.532, + "grad_norm": 0.39660006761550903, + "kl": 1.7329654693603516, + "learning_rate": 4.0788888888888895e-06, + "logits/chosen": 29744569.6, + "logits/rejected": 30137328.0, + "logps/chosen": -155.14593505859375, + "logps/rejected": -160.4675048828125, + "loss": 0.4830836296081543, + "rewards/chosen": -0.31779026985168457, + "rewards/margins": 0.1599587440490723, + "rewards/rejected": -0.47774901390075686, + "step": 1330 + }, + { + "epoch": 0.536, + "grad_norm": 0.6326448917388916, + "kl": 2.0254123210906982, + "learning_rate": 4.067777777777778e-06, + "logits/chosen": 26790800.0, + "logits/rejected": 28456883.2, + "logps/chosen": -151.97984619140624, + "logps/rejected": -130.97991943359375, + "loss": 0.4777104377746582, + "rewards/chosen": -0.04423903226852417, + "rewards/margins": 0.20697282552719115, + "rewards/rejected": -0.2512118577957153, + "step": 1340 + }, + { + "epoch": 0.54, + "grad_norm": 0.4449482858181, + "kl": 1.457157015800476, + "learning_rate": 4.056666666666667e-06, + "logits/chosen": 29013564.8, + "logits/rejected": 28593001.6, + "logps/chosen": -128.33466796875, + "logps/rejected": -122.49200439453125, + "loss": 0.4760580539703369, + "rewards/chosen": -0.19824122190475463, + "rewards/margins": 0.20185590982437135, + "rewards/rejected": -0.400097131729126, + "step": 1350 + }, + { + "epoch": 0.544, + "grad_norm": 0.45084336400032043, + "kl": 3.831247329711914, + "learning_rate": 4.045555555555556e-06, + "logits/chosen": 31035744.0, + "logits/rejected": 32034198.4, + "logps/chosen": -164.5341064453125, + "logps/rejected": -147.6629638671875, + "loss": 0.48288540840148925, + "rewards/chosen": 0.16254035234451295, + "rewards/margins": 0.1516798198223114, + "rewards/rejected": 0.010860532522201538, + "step": 1360 + }, + { + "epoch": 0.548, + "grad_norm": 0.5451259613037109, + "kl": 3.273149013519287, + "learning_rate": 4.034444444444445e-06, + "logits/chosen": 28394259.2, + "logits/rejected": 25613750.4, + "logps/chosen": -176.49615478515625, + "logps/rejected": -159.41533203125, + "loss": 0.463987922668457, + "rewards/chosen": 0.05027390718460083, + "rewards/margins": 0.3991087079048157, + "rewards/rejected": -0.34883480072021483, + "step": 1370 + }, + { + "epoch": 0.552, + "grad_norm": 0.4214652180671692, + "kl": 2.222465991973877, + "learning_rate": 4.0233333333333335e-06, + "logits/chosen": 34603212.8, + "logits/rejected": 34498118.4, + "logps/chosen": -148.70673828125, + "logps/rejected": -138.68626708984374, + "loss": 0.4929951667785645, + "rewards/chosen": -0.12773821353912354, + "rewards/margins": 0.0252701163291931, + "rewards/rejected": -0.15300832986831664, + "step": 1380 + }, + { + "epoch": 0.556, + "grad_norm": 0.5307957530021667, + "kl": 2.981513500213623, + "learning_rate": 4.012222222222222e-06, + "logits/chosen": 39500022.4, + "logits/rejected": 41076224.0, + "logps/chosen": -156.5267333984375, + "logps/rejected": -168.4097900390625, + "loss": 0.5046597480773926, + "rewards/chosen": 0.03251245319843292, + "rewards/margins": -0.05116569101810456, + "rewards/rejected": 0.08367814421653748, + "step": 1390 + }, + { + "epoch": 0.56, + "grad_norm": 0.5756453275680542, + "kl": 3.648423671722412, + "learning_rate": 4.001111111111111e-06, + "logits/chosen": 36128464.0, + "logits/rejected": 36108208.0, + "logps/chosen": -147.972119140625, + "logps/rejected": -180.87730712890624, + "loss": 0.49908957481384275, + "rewards/chosen": 0.16207314729690553, + "rewards/margins": -0.004686105251312245, + "rewards/rejected": 0.16675925254821777, + "step": 1400 + }, + { + "epoch": 0.56, + "eval_kl": 3.1687636375427246, + "eval_logits/chosen": 33501499.392, + "eval_logits/rejected": 33484677.12, + "eval_logps/chosen": -154.072703125, + "eval_logps/rejected": -149.52703125, + "eval_loss": 0.486517995595932, + "eval_rewards/chosen": 0.025308061599731445, + "eval_rewards/margins": 0.11373865699768065, + "eval_rewards/rejected": -0.08843059539794922, + "eval_runtime": 217.6832, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 2.297, + "step": 1400 + }, + { + "epoch": 0.564, + "grad_norm": 0.5075347423553467, + "kl": 3.5038933753967285, + "learning_rate": 3.990000000000001e-06, + "logits/chosen": 28111721.6, + "logits/rejected": 28974422.4, + "logps/chosen": -153.78463134765624, + "logps/rejected": -143.94952392578125, + "loss": 0.4918965816497803, + "rewards/chosen": -0.004850611090660095, + "rewards/margins": 0.020186284184455873, + "rewards/rejected": -0.025036895275115968, + "step": 1410 + }, + { + "epoch": 0.568, + "grad_norm": 0.5109780430793762, + "kl": 2.3000378608703613, + "learning_rate": 3.9788888888888896e-06, + "logits/chosen": 33186614.4, + "logits/rejected": 33699417.6, + "logps/chosen": -141.6215087890625, + "logps/rejected": -178.1355712890625, + "loss": 0.4941215991973877, + "rewards/chosen": -0.18355293273925782, + "rewards/margins": 0.026853704452514643, + "rewards/rejected": -0.21040663719177247, + "step": 1420 + }, + { + "epoch": 0.572, + "grad_norm": 0.6244523525238037, + "kl": 1.984100580215454, + "learning_rate": 3.967777777777778e-06, + "logits/chosen": 39980752.0, + "logits/rejected": 35690995.2, + "logps/chosen": -194.154638671875, + "logps/rejected": -171.23074951171876, + "loss": 0.46099395751953126, + "rewards/chosen": -0.10000758171081543, + "rewards/margins": 0.5053775310516357, + "rewards/rejected": -0.6053851127624512, + "step": 1430 + }, + { + "epoch": 0.576, + "grad_norm": 0.41846802830696106, + "kl": 2.4030237197875977, + "learning_rate": 3.956666666666667e-06, + "logits/chosen": 25522387.2, + "logits/rejected": 26411580.8, + "logps/chosen": -122.002734375, + "logps/rejected": -139.70345458984374, + "loss": 0.4711480617523193, + "rewards/chosen": -0.17828741073608398, + "rewards/margins": 0.30072832107543945, + "rewards/rejected": -0.47901573181152346, + "step": 1440 + }, + { + "epoch": 0.58, + "grad_norm": 0.3383093774318695, + "kl": 2.3522331714630127, + "learning_rate": 3.945555555555556e-06, + "logits/chosen": 34720166.4, + "logits/rejected": 35382691.2, + "logps/chosen": -134.02327880859374, + "logps/rejected": -143.1811767578125, + "loss": 0.48169522285461425, + "rewards/chosen": -0.12290234565734863, + "rewards/margins": 0.22536482810974118, + "rewards/rejected": -0.3482671737670898, + "step": 1450 + }, + { + "epoch": 0.584, + "grad_norm": 0.47618529200553894, + "kl": 1.4843952655792236, + "learning_rate": 3.934444444444445e-06, + "logits/chosen": 29917385.6, + "logits/rejected": 29642912.0, + "logps/chosen": -157.6127197265625, + "logps/rejected": -164.204248046875, + "loss": 0.48633370399475095, + "rewards/chosen": -0.5775248527526855, + "rewards/margins": 0.010549926757812522, + "rewards/rejected": -0.588074779510498, + "step": 1460 + }, + { + "epoch": 0.588, + "grad_norm": 0.4691362977027893, + "kl": 1.8532390594482422, + "learning_rate": 3.923333333333334e-06, + "logits/chosen": 24143035.2, + "logits/rejected": 26696252.8, + "logps/chosen": -145.52325439453125, + "logps/rejected": -114.97313232421875, + "loss": 0.4996927261352539, + "rewards/chosen": -0.3415048837661743, + "rewards/margins": -0.0695812225341797, + "rewards/rejected": -0.27192366123199463, + "step": 1470 + }, + { + "epoch": 0.592, + "grad_norm": 0.49410581588745117, + "kl": 2.910165309906006, + "learning_rate": 3.912222222222222e-06, + "logits/chosen": 29227424.0, + "logits/rejected": 26583780.8, + "logps/chosen": -153.87852783203124, + "logps/rejected": -167.90714111328126, + "loss": 0.4679962158203125, + "rewards/chosen": -0.16606519222259522, + "rewards/margins": 0.28040225505828853, + "rewards/rejected": -0.4464674472808838, + "step": 1480 + }, + { + "epoch": 0.596, + "grad_norm": 0.6437669992446899, + "kl": 4.011757850646973, + "learning_rate": 3.901111111111111e-06, + "logits/chosen": 40104499.2, + "logits/rejected": 35466915.2, + "logps/chosen": -141.6960693359375, + "logps/rejected": -148.79417724609374, + "loss": 0.45351347923278806, + "rewards/chosen": 0.29918632507324217, + "rewards/margins": 0.7735027313232421, + "rewards/rejected": -0.47431640625, + "step": 1490 + }, + { + "epoch": 0.6, + "grad_norm": 0.598638653755188, + "kl": 2.5277042388916016, + "learning_rate": 3.89e-06, + "logits/chosen": 30581568.0, + "logits/rejected": 29237926.4, + "logps/chosen": -170.3593017578125, + "logps/rejected": -161.714111328125, + "loss": 0.5054315567016602, + "rewards/chosen": -0.5106431007385254, + "rewards/margins": -0.14114959239959712, + "rewards/rejected": -0.36949350833892824, + "step": 1500 + }, + { + "epoch": 0.604, + "grad_norm": 0.5450658202171326, + "kl": 3.1822094917297363, + "learning_rate": 3.87888888888889e-06, + "logits/chosen": 30121491.2, + "logits/rejected": 30883408.0, + "logps/chosen": -177.3155029296875, + "logps/rejected": -172.50675048828126, + "loss": 0.4777498722076416, + "rewards/chosen": -0.09470235109329224, + "rewards/margins": 0.16894682645797732, + "rewards/rejected": -0.26364917755126954, + "step": 1510 + }, + { + "epoch": 0.608, + "grad_norm": 0.32850226759910583, + "kl": 3.073251724243164, + "learning_rate": 3.8677777777777785e-06, + "logits/chosen": 32764054.4, + "logits/rejected": 33643142.4, + "logps/chosen": -167.7408447265625, + "logps/rejected": -171.3683349609375, + "loss": 0.4882831573486328, + "rewards/chosen": -0.5916567325592041, + "rewards/margins": 0.06371226310729972, + "rewards/rejected": -0.6553689956665039, + "step": 1520 + }, + { + "epoch": 0.612, + "grad_norm": 0.776578426361084, + "kl": 2.1626362800598145, + "learning_rate": 3.856666666666667e-06, + "logits/chosen": 20513964.8, + "logits/rejected": 19167148.8, + "logps/chosen": -138.76737060546876, + "logps/rejected": -200.4188232421875, + "loss": 0.47345700263977053, + "rewards/chosen": -0.39080009460449217, + "rewards/margins": 0.3349196434020997, + "rewards/rejected": -0.7257197380065918, + "step": 1530 + }, + { + "epoch": 0.616, + "grad_norm": 0.7884080410003662, + "kl": 2.2347629070281982, + "learning_rate": 3.845555555555556e-06, + "logits/chosen": 21506472.0, + "logits/rejected": 20219934.4, + "logps/chosen": -141.54342041015624, + "logps/rejected": -150.6858154296875, + "loss": 0.46153483390808103, + "rewards/chosen": -0.5252087116241455, + "rewards/margins": 0.5382626056671143, + "rewards/rejected": -1.0634713172912598, + "step": 1540 + }, + { + "epoch": 0.62, + "grad_norm": 0.6161748766899109, + "kl": 1.0965118408203125, + "learning_rate": 3.834444444444445e-06, + "logits/chosen": 24290136.0, + "logits/rejected": 24614228.8, + "logps/chosen": -178.11864013671874, + "logps/rejected": -153.21026611328125, + "loss": 0.4584649085998535, + "rewards/chosen": -0.8390473365783692, + "rewards/margins": 0.8001769065856933, + "rewards/rejected": -1.6392242431640625, + "step": 1550 + }, + { + "epoch": 0.624, + "grad_norm": 0.6851525902748108, + "kl": 0.8686630129814148, + "learning_rate": 3.823333333333334e-06, + "logits/chosen": 17298494.4, + "logits/rejected": 14839955.2, + "logps/chosen": -181.07423095703126, + "logps/rejected": -169.425439453125, + "loss": 0.49303278923034666, + "rewards/chosen": -0.948878288269043, + "rewards/margins": 0.510734748840332, + "rewards/rejected": -1.459613037109375, + "step": 1560 + }, + { + "epoch": 0.628, + "grad_norm": 0.6733571290969849, + "kl": 2.8747103214263916, + "learning_rate": 3.8122222222222225e-06, + "logits/chosen": 29427056.0, + "logits/rejected": 24132025.6, + "logps/chosen": -138.527490234375, + "logps/rejected": -174.9707763671875, + "loss": 0.4342005729675293, + "rewards/chosen": 0.09871820211410523, + "rewards/margins": 0.8701262354850768, + "rewards/rejected": -0.7714080333709716, + "step": 1570 + }, + { + "epoch": 0.632, + "grad_norm": 0.5916578769683838, + "kl": 1.7868465185165405, + "learning_rate": 3.8011111111111113e-06, + "logits/chosen": 26572758.4, + "logits/rejected": 23886825.6, + "logps/chosen": -212.365966796875, + "logps/rejected": -208.8218505859375, + "loss": 0.48333086967468264, + "rewards/chosen": -1.0144821166992188, + "rewards/margins": 0.16185150146484362, + "rewards/rejected": -1.1763336181640625, + "step": 1580 + }, + { + "epoch": 0.636, + "grad_norm": 0.5978784561157227, + "kl": 2.861311435699463, + "learning_rate": 3.79e-06, + "logits/chosen": 27835440.0, + "logits/rejected": 27677552.0, + "logps/chosen": -188.40328369140624, + "logps/rejected": -173.6368896484375, + "loss": 0.4893380641937256, + "rewards/chosen": -0.2336580753326416, + "rewards/margins": 0.4210014343261719, + "rewards/rejected": -0.6546595096588135, + "step": 1590 + }, + { + "epoch": 0.64, + "grad_norm": 0.574272632598877, + "kl": 1.9626449346542358, + "learning_rate": 3.7788888888888894e-06, + "logits/chosen": 20977523.2, + "logits/rejected": 17175705.6, + "logps/chosen": -150.7464111328125, + "logps/rejected": -180.0102294921875, + "loss": 0.45204753875732423, + "rewards/chosen": -0.791524600982666, + "rewards/margins": 1.0010954856872558, + "rewards/rejected": -1.7926200866699218, + "step": 1600 + }, + { + "epoch": 0.64, + "eval_kl": 2.5991017818450928, + "eval_logits/chosen": 24918573.056, + "eval_logits/rejected": 25185402.88, + "eval_logps/chosen": -159.12659375, + "eval_logps/rejected": -154.646546875, + "eval_loss": 0.4858725666999817, + "eval_rewards/chosen": -0.480081787109375, + "eval_rewards/margins": 0.12030004882812506, + "eval_rewards/rejected": -0.6003818359375, + "eval_runtime": 217.5803, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 1600 + }, + { + "epoch": 0.644, + "grad_norm": 0.6671045422554016, + "kl": 2.1377835273742676, + "learning_rate": 3.767777777777778e-06, + "logits/chosen": 35406131.2, + "logits/rejected": 31205331.2, + "logps/chosen": -184.4550537109375, + "logps/rejected": -165.74014892578126, + "loss": 0.4641073703765869, + "rewards/chosen": -0.07752754688262939, + "rewards/margins": 0.44615256786346436, + "rewards/rejected": -0.5236801147460938, + "step": 1610 + }, + { + "epoch": 0.648, + "grad_norm": 0.6126360297203064, + "kl": 2.7373385429382324, + "learning_rate": 3.756666666666667e-06, + "logits/chosen": 29280086.4, + "logits/rejected": 32377139.2, + "logps/chosen": -212.76513671875, + "logps/rejected": -190.2232666015625, + "loss": 0.49649949073791505, + "rewards/chosen": -0.929378604888916, + "rewards/margins": -0.4132873058319092, + "rewards/rejected": -0.5160912990570068, + "step": 1620 + }, + { + "epoch": 0.652, + "grad_norm": 0.5612730979919434, + "kl": 2.997823476791382, + "learning_rate": 3.7455555555555558e-06, + "logits/chosen": 21078843.2, + "logits/rejected": 19578608.0, + "logps/chosen": -149.2550537109375, + "logps/rejected": -154.4807861328125, + "loss": 0.45549612045288085, + "rewards/chosen": -0.679559326171875, + "rewards/margins": 0.24375944137573247, + "rewards/rejected": -0.9233187675476074, + "step": 1630 + }, + { + "epoch": 0.656, + "grad_norm": 0.44815537333488464, + "kl": 2.8388514518737793, + "learning_rate": 3.734444444444445e-06, + "logits/chosen": 22369004.8, + "logits/rejected": 18537880.0, + "logps/chosen": -171.47335205078124, + "logps/rejected": -165.7509521484375, + "loss": 0.4616579532623291, + "rewards/chosen": -0.18010754585266114, + "rewards/margins": 0.7035699367523194, + "rewards/rejected": -0.8836774826049805, + "step": 1640 + }, + { + "epoch": 0.66, + "grad_norm": 0.8153596520423889, + "kl": 3.4207565784454346, + "learning_rate": 3.723333333333334e-06, + "logits/chosen": 24538766.4, + "logits/rejected": 26516555.2, + "logps/chosen": -150.9359619140625, + "logps/rejected": -122.23038330078126, + "loss": 0.48783044815063475, + "rewards/chosen": -0.1703397035598755, + "rewards/margins": 0.0270324468612671, + "rewards/rejected": -0.1973721504211426, + "step": 1650 + }, + { + "epoch": 0.664, + "grad_norm": 0.45013901591300964, + "kl": 3.9859955310821533, + "learning_rate": 3.7122222222222226e-06, + "logits/chosen": 22145710.4, + "logits/rejected": 20591705.6, + "logps/chosen": -124.9506591796875, + "logps/rejected": -146.586279296875, + "loss": 0.4657421112060547, + "rewards/chosen": 0.028095448017120363, + "rewards/margins": 0.4449395298957825, + "rewards/rejected": -0.4168440818786621, + "step": 1660 + }, + { + "epoch": 0.668, + "grad_norm": 0.5674629211425781, + "kl": 3.48918080329895, + "learning_rate": 3.7011111111111114e-06, + "logits/chosen": 28042899.2, + "logits/rejected": 27730080.0, + "logps/chosen": -170.19053955078124, + "logps/rejected": -192.9843017578125, + "loss": 0.4754744052886963, + "rewards/chosen": -0.249656343460083, + "rewards/margins": 0.22593021392822268, + "rewards/rejected": -0.4755865573883057, + "step": 1670 + }, + { + "epoch": 0.672, + "grad_norm": 0.7562563419342041, + "kl": 3.0691466331481934, + "learning_rate": 3.6900000000000002e-06, + "logits/chosen": 26120401.6, + "logits/rejected": 25757414.4, + "logps/chosen": -154.65439453125, + "logps/rejected": -162.4265869140625, + "loss": 0.4740549087524414, + "rewards/chosen": -0.2537501811981201, + "rewards/margins": 0.16815314292907718, + "rewards/rejected": -0.4219033241271973, + "step": 1680 + }, + { + "epoch": 0.676, + "grad_norm": 0.6189448237419128, + "kl": 3.324810028076172, + "learning_rate": 3.678888888888889e-06, + "logits/chosen": 30877590.4, + "logits/rejected": 29492992.0, + "logps/chosen": -162.27489013671874, + "logps/rejected": -156.6115234375, + "loss": 0.46096296310424806, + "rewards/chosen": -0.32442150115966795, + "rewards/margins": 0.5965461730957031, + "rewards/rejected": -0.9209676742553711, + "step": 1690 + }, + { + "epoch": 0.68, + "grad_norm": 0.5689833760261536, + "kl": 3.417942762374878, + "learning_rate": 3.667777777777778e-06, + "logits/chosen": 30880060.8, + "logits/rejected": 30390649.6, + "logps/chosen": -146.108203125, + "logps/rejected": -140.8014892578125, + "loss": 0.48299012184143064, + "rewards/chosen": -0.08942080736160278, + "rewards/margins": 0.09204813241958619, + "rewards/rejected": -0.18146893978118897, + "step": 1700 + }, + { + "epoch": 0.684, + "grad_norm": 0.7074683904647827, + "kl": 2.7843141555786133, + "learning_rate": 3.6566666666666667e-06, + "logits/chosen": 23799224.0, + "logits/rejected": 23789908.8, + "logps/chosen": -132.7684326171875, + "logps/rejected": -171.96357421875, + "loss": 0.4795389652252197, + "rewards/chosen": -0.4700624942779541, + "rewards/margins": 0.3144543647766114, + "rewards/rejected": -0.7845168590545655, + "step": 1710 + }, + { + "epoch": 0.688, + "grad_norm": 0.8114802241325378, + "kl": 2.740182876586914, + "learning_rate": 3.645555555555556e-06, + "logits/chosen": 25693836.8, + "logits/rejected": 25391835.2, + "logps/chosen": -147.47672119140626, + "logps/rejected": -162.006640625, + "loss": 0.47942562103271485, + "rewards/chosen": -0.12961168289184571, + "rewards/margins": 0.2411248922348022, + "rewards/rejected": -0.3707365751266479, + "step": 1720 + }, + { + "epoch": 0.692, + "grad_norm": 0.6404406428337097, + "kl": 5.742056369781494, + "learning_rate": 3.6344444444444447e-06, + "logits/chosen": 23561008.0, + "logits/rejected": 24549129.6, + "logps/chosen": -152.6004150390625, + "logps/rejected": -174.73006591796874, + "loss": 0.4857301712036133, + "rewards/chosen": 0.12083638906478882, + "rewards/margins": 0.12182764708995819, + "rewards/rejected": -0.0009912580251693725, + "step": 1730 + }, + { + "epoch": 0.696, + "grad_norm": 0.8152211308479309, + "kl": 2.7150015830993652, + "learning_rate": 3.6233333333333335e-06, + "logits/chosen": 20060864.0, + "logits/rejected": 21277550.4, + "logps/chosen": -161.858642578125, + "logps/rejected": -154.7493408203125, + "loss": 0.49634590148925783, + "rewards/chosen": -0.6363963603973388, + "rewards/margins": -0.21991643905639646, + "rewards/rejected": -0.41647992134094236, + "step": 1740 + }, + { + "epoch": 0.7, + "grad_norm": 0.5856395959854126, + "kl": 3.9709296226501465, + "learning_rate": 3.6122222222222223e-06, + "logits/chosen": 24168908.8, + "logits/rejected": 26363808.0, + "logps/chosen": -216.2795654296875, + "logps/rejected": -159.5930908203125, + "loss": 0.4658236026763916, + "rewards/chosen": -0.5366491794586181, + "rewards/margins": 0.038147354125976585, + "rewards/rejected": -0.5747965335845947, + "step": 1750 + }, + { + "epoch": 0.704, + "grad_norm": 0.6619251370429993, + "kl": 3.0937228202819824, + "learning_rate": 3.601111111111111e-06, + "logits/chosen": 17747806.4, + "logits/rejected": 22547065.6, + "logps/chosen": -187.3780517578125, + "logps/rejected": -159.7064453125, + "loss": 0.5088288307189941, + "rewards/chosen": -1.2607831954956055, + "rewards/margins": -0.6743541717529297, + "rewards/rejected": -0.5864290237426758, + "step": 1760 + }, + { + "epoch": 0.708, + "grad_norm": 0.5218913555145264, + "kl": 2.901822566986084, + "learning_rate": 3.5900000000000004e-06, + "logits/chosen": 22269254.4, + "logits/rejected": 23662136.0, + "logps/chosen": -136.06712646484374, + "logps/rejected": -121.06827392578126, + "loss": 0.497973108291626, + "rewards/chosen": -0.4131883144378662, + "rewards/margins": -0.10578060150146484, + "rewards/rejected": -0.30740771293640134, + "step": 1770 + }, + { + "epoch": 0.712, + "grad_norm": 0.6656368970870972, + "kl": 3.2692978382110596, + "learning_rate": 3.578888888888889e-06, + "logits/chosen": 25755620.8, + "logits/rejected": 26518835.2, + "logps/chosen": -165.65780029296874, + "logps/rejected": -147.2259033203125, + "loss": 0.506129789352417, + "rewards/chosen": -0.5433285236358643, + "rewards/margins": -0.17034811973571778, + "rewards/rejected": -0.3729804039001465, + "step": 1780 + }, + { + "epoch": 0.716, + "grad_norm": 0.771259069442749, + "kl": 3.0249366760253906, + "learning_rate": 3.5677777777777784e-06, + "logits/chosen": 23546620.8, + "logits/rejected": 25753550.4, + "logps/chosen": -151.70357666015624, + "logps/rejected": -122.8987548828125, + "loss": 0.5249699592590332, + "rewards/chosen": -0.5662184715270996, + "rewards/margins": -0.2557974576950073, + "rewards/rejected": -0.3104210138320923, + "step": 1790 + }, + { + "epoch": 0.72, + "grad_norm": 0.872774064540863, + "kl": 3.2898342609405518, + "learning_rate": 3.556666666666667e-06, + "logits/chosen": 18870168.0, + "logits/rejected": 17117038.4, + "logps/chosen": -150.25985107421874, + "logps/rejected": -161.5357666015625, + "loss": 0.451005744934082, + "rewards/chosen": -0.1520848512649536, + "rewards/margins": 0.6849667310714721, + "rewards/rejected": -0.8370515823364257, + "step": 1800 + }, + { + "epoch": 0.72, + "eval_kl": 3.5156476497650146, + "eval_logits/chosen": 26424913.92, + "eval_logits/rejected": 26601347.072, + "eval_logps/chosen": -156.963453125, + "eval_logps/rejected": -152.583296875, + "eval_loss": 0.48428651690483093, + "eval_rewards/chosen": -0.2637669677734375, + "eval_rewards/margins": 0.13028942871093752, + "eval_rewards/rejected": -0.394056396484375, + "eval_runtime": 217.3905, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 2.3, + "step": 1800 + }, + { + "epoch": 0.724, + "grad_norm": 0.6279663443565369, + "kl": 1.9814598560333252, + "learning_rate": 3.545555555555556e-06, + "logits/chosen": 31417820.8, + "logits/rejected": 29248547.2, + "logps/chosen": -143.02147216796874, + "logps/rejected": -176.2625732421875, + "loss": 0.47081918716430665, + "rewards/chosen": -0.7984821319580078, + "rewards/margins": 0.2239703178405763, + "rewards/rejected": -1.022452449798584, + "step": 1810 + }, + { + "epoch": 0.728, + "grad_norm": 0.6332824230194092, + "kl": 3.8811469078063965, + "learning_rate": 3.534444444444445e-06, + "logits/chosen": 25455878.4, + "logits/rejected": 24060284.8, + "logps/chosen": -148.748779296875, + "logps/rejected": -152.15390625, + "loss": 0.48478074073791505, + "rewards/chosen": -0.37164936065673826, + "rewards/margins": 0.1325855255126953, + "rewards/rejected": -0.5042348861694336, + "step": 1820 + }, + { + "epoch": 0.732, + "grad_norm": 0.570693850517273, + "kl": 3.4417755603790283, + "learning_rate": 3.5233333333333336e-06, + "logits/chosen": 21330112.0, + "logits/rejected": 23791145.6, + "logps/chosen": -175.6027099609375, + "logps/rejected": -139.43577880859374, + "loss": 0.4806610107421875, + "rewards/chosen": -0.7079993724822998, + "rewards/margins": -0.224769401550293, + "rewards/rejected": -0.4832299709320068, + "step": 1830 + }, + { + "epoch": 0.736, + "grad_norm": 0.6215969920158386, + "kl": 2.516907215118408, + "learning_rate": 3.5122222222222224e-06, + "logits/chosen": 19252992.0, + "logits/rejected": 17279195.2, + "logps/chosen": -138.5216552734375, + "logps/rejected": -175.40498046875, + "loss": 0.4477705955505371, + "rewards/chosen": -0.37056674957275393, + "rewards/margins": 0.6211806297302246, + "rewards/rejected": -0.9917473793029785, + "step": 1840 + }, + { + "epoch": 0.74, + "grad_norm": 0.477038711309433, + "kl": 2.8053412437438965, + "learning_rate": 3.5011111111111112e-06, + "logits/chosen": 21869585.6, + "logits/rejected": 25226084.8, + "logps/chosen": -169.3609130859375, + "logps/rejected": -135.9068603515625, + "loss": 0.49389004707336426, + "rewards/chosen": -0.8072388648986817, + "rewards/margins": -0.23602757453918466, + "rewards/rejected": -0.571211290359497, + "step": 1850 + }, + { + "epoch": 0.744, + "grad_norm": 0.4190019369125366, + "kl": 4.664608955383301, + "learning_rate": 3.49e-06, + "logits/chosen": 18998553.6, + "logits/rejected": 18716126.4, + "logps/chosen": -154.63216552734374, + "logps/rejected": -157.9931884765625, + "loss": 0.45591115951538086, + "rewards/chosen": -0.11906745433807372, + "rewards/margins": 0.5571311235427856, + "rewards/rejected": -0.6761985778808594, + "step": 1860 + }, + { + "epoch": 0.748, + "grad_norm": 0.5092635154724121, + "kl": 5.426673412322998, + "learning_rate": 3.4788888888888893e-06, + "logits/chosen": 24268691.2, + "logits/rejected": 23287683.2, + "logps/chosen": -150.13511962890624, + "logps/rejected": -137.98375244140624, + "loss": 0.4658195018768311, + "rewards/chosen": 0.17463077306747438, + "rewards/margins": 0.33842480182647705, + "rewards/rejected": -0.16379402875900267, + "step": 1870 + }, + { + "epoch": 0.752, + "grad_norm": 0.5116318464279175, + "kl": 3.4443411827087402, + "learning_rate": 3.467777777777778e-06, + "logits/chosen": 25825232.0, + "logits/rejected": 28601868.8, + "logps/chosen": -131.28408203125, + "logps/rejected": -165.76693115234374, + "loss": 0.4923543453216553, + "rewards/chosen": -0.23933188915252684, + "rewards/margins": 0.14726905822753905, + "rewards/rejected": -0.3866009473800659, + "step": 1880 + }, + { + "epoch": 0.756, + "grad_norm": 0.6366556286811829, + "kl": 3.051987409591675, + "learning_rate": 3.456666666666667e-06, + "logits/chosen": 24197241.6, + "logits/rejected": 24256118.4, + "logps/chosen": -149.84852294921876, + "logps/rejected": -140.0214599609375, + "loss": 0.4848769664764404, + "rewards/chosen": -0.3190887212753296, + "rewards/margins": 0.17881777286529538, + "rewards/rejected": -0.497906494140625, + "step": 1890 + }, + { + "epoch": 0.76, + "grad_norm": 0.4979274570941925, + "kl": 1.4698994159698486, + "learning_rate": 3.4455555555555557e-06, + "logits/chosen": 23184480.0, + "logits/rejected": 25139280.0, + "logps/chosen": -134.97877197265626, + "logps/rejected": -144.17740478515626, + "loss": 0.4794943809509277, + "rewards/chosen": -0.6308645248413086, + "rewards/margins": 0.13470888137817383, + "rewards/rejected": -0.7655734062194824, + "step": 1900 + }, + { + "epoch": 0.764, + "grad_norm": 0.6274532079696655, + "kl": 3.6468818187713623, + "learning_rate": 3.4344444444444445e-06, + "logits/chosen": 24657672.0, + "logits/rejected": 21508489.6, + "logps/chosen": -160.151904296875, + "logps/rejected": -166.94481201171874, + "loss": 0.4795567512512207, + "rewards/chosen": -0.22244927883148194, + "rewards/margins": 0.15467300415039062, + "rewards/rejected": -0.37712228298187256, + "step": 1910 + }, + { + "epoch": 0.768, + "grad_norm": 0.7713479399681091, + "kl": 4.167417049407959, + "learning_rate": 3.4233333333333333e-06, + "logits/chosen": 23603747.2, + "logits/rejected": 21209184.0, + "logps/chosen": -134.7058837890625, + "logps/rejected": -163.1764404296875, + "loss": 0.4544349193572998, + "rewards/chosen": 0.17299318313598633, + "rewards/margins": 0.5065126180648803, + "rewards/rejected": -0.333519434928894, + "step": 1920 + }, + { + "epoch": 0.772, + "grad_norm": 0.5262131690979004, + "kl": 2.8361663818359375, + "learning_rate": 3.412222222222222e-06, + "logits/chosen": 29174873.6, + "logits/rejected": 33003203.2, + "logps/chosen": -158.76817626953124, + "logps/rejected": -142.29862060546876, + "loss": 0.47826762199401857, + "rewards/chosen": -0.2532700300216675, + "rewards/margins": 0.13431007862091066, + "rewards/rejected": -0.38758010864257814, + "step": 1930 + }, + { + "epoch": 0.776, + "grad_norm": 0.610528826713562, + "kl": 1.9879090785980225, + "learning_rate": 3.4011111111111113e-06, + "logits/chosen": 14738179.2, + "logits/rejected": 17543468.8, + "logps/chosen": -144.6372314453125, + "logps/rejected": -121.0155517578125, + "loss": 0.5197708129882812, + "rewards/chosen": -0.7448621273040772, + "rewards/margins": -0.2757446765899659, + "rewards/rejected": -0.4691174507141113, + "step": 1940 + }, + { + "epoch": 0.78, + "grad_norm": 0.4867253601551056, + "kl": 2.61750864982605, + "learning_rate": 3.3900000000000006e-06, + "logits/chosen": 29278848.0, + "logits/rejected": 27723072.0, + "logps/chosen": -167.261474609375, + "logps/rejected": -166.44840087890626, + "loss": 0.48351154327392576, + "rewards/chosen": -0.45982890129089354, + "rewards/margins": 0.17165498733520507, + "rewards/rejected": -0.6314838886260986, + "step": 1950 + }, + { + "epoch": 0.784, + "grad_norm": 0.507047176361084, + "kl": 1.4705009460449219, + "learning_rate": 3.3788888888888894e-06, + "logits/chosen": 21861384.0, + "logits/rejected": 22609652.8, + "logps/chosen": -154.51759033203126, + "logps/rejected": -141.37017822265625, + "loss": 0.4911977291107178, + "rewards/chosen": -0.9377481460571289, + "rewards/margins": 0.045456314086914085, + "rewards/rejected": -0.983204460144043, + "step": 1960 + }, + { + "epoch": 0.788, + "grad_norm": 0.5638304352760315, + "kl": 2.8000810146331787, + "learning_rate": 3.367777777777778e-06, + "logits/chosen": 29543376.0, + "logits/rejected": 30959481.6, + "logps/chosen": -154.2559814453125, + "logps/rejected": -140.09403076171876, + "loss": 0.4743481636047363, + "rewards/chosen": -0.27635998725891114, + "rewards/margins": 0.31996994018554686, + "rewards/rejected": -0.596329927444458, + "step": 1970 + }, + { + "epoch": 0.792, + "grad_norm": 0.622689962387085, + "kl": 1.304429292678833, + "learning_rate": 3.356666666666667e-06, + "logits/chosen": 16238214.4, + "logits/rejected": 15864859.2, + "logps/chosen": -148.69432373046874, + "logps/rejected": -155.14200439453126, + "loss": 0.4647815227508545, + "rewards/chosen": -0.6397994041442872, + "rewards/margins": 0.38712730407714835, + "rewards/rejected": -1.0269267082214355, + "step": 1980 + }, + { + "epoch": 0.796, + "grad_norm": 0.5903355479240417, + "kl": 3.8611984252929688, + "learning_rate": 3.345555555555556e-06, + "logits/chosen": 26873817.6, + "logits/rejected": 25962048.0, + "logps/chosen": -168.3064208984375, + "logps/rejected": -165.02401123046874, + "loss": 0.44381189346313477, + "rewards/chosen": -0.056187999248504636, + "rewards/margins": 0.48386293649673456, + "rewards/rejected": -0.5400509357452392, + "step": 1990 + }, + { + "epoch": 0.8, + "grad_norm": 0.6087274551391602, + "kl": 2.4798474311828613, + "learning_rate": 3.3344444444444446e-06, + "logits/chosen": 28899868.8, + "logits/rejected": 28327043.2, + "logps/chosen": -131.373046875, + "logps/rejected": -144.835546875, + "loss": 0.4636848449707031, + "rewards/chosen": -0.4337655544281006, + "rewards/margins": 0.1502884864807129, + "rewards/rejected": -0.5840540409088135, + "step": 2000 + }, + { + "epoch": 0.8, + "eval_kl": 2.3182120323181152, + "eval_logits/chosen": 23415599.104, + "eval_logits/rejected": 23816060.928, + "eval_logps/chosen": -161.6585625, + "eval_logps/rejected": -157.55559375, + "eval_loss": 0.48174571990966797, + "eval_rewards/chosen": -0.7332791137695313, + "eval_rewards/margins": 0.1580091552734375, + "eval_rewards/rejected": -0.8912882690429688, + "eval_runtime": 216.8959, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 2.305, + "step": 2000 + }, + { + "epoch": 0.804, + "grad_norm": 0.6840182542800903, + "kl": 3.759185791015625, + "learning_rate": 3.3233333333333334e-06, + "logits/chosen": 29734800.0, + "logits/rejected": 27820688.0, + "logps/chosen": -171.00633544921874, + "logps/rejected": -149.1771240234375, + "loss": 0.4692417621612549, + "rewards/chosen": -0.2439584493637085, + "rewards/margins": 0.4319137811660767, + "rewards/rejected": -0.6758722305297852, + "step": 2010 + }, + { + "epoch": 0.808, + "grad_norm": 0.4128756523132324, + "kl": 2.642878770828247, + "learning_rate": 3.3122222222222222e-06, + "logits/chosen": 19955732.8, + "logits/rejected": 18758494.4, + "logps/chosen": -189.38092041015625, + "logps/rejected": -170.4149658203125, + "loss": 0.46123080253601073, + "rewards/chosen": -0.5583849906921386, + "rewards/margins": 0.32184505462646484, + "rewards/rejected": -0.8802300453186035, + "step": 2020 + }, + { + "epoch": 0.812, + "grad_norm": 0.5455370545387268, + "kl": 1.1196393966674805, + "learning_rate": 3.3011111111111115e-06, + "logits/chosen": 22045115.2, + "logits/rejected": 18838947.2, + "logps/chosen": -148.83717041015626, + "logps/rejected": -179.09134521484376, + "loss": 0.4277163505554199, + "rewards/chosen": -0.9872810363769531, + "rewards/margins": 0.9499824523925782, + "rewards/rejected": -1.9372634887695312, + "step": 2030 + }, + { + "epoch": 0.816, + "grad_norm": 0.5655795335769653, + "kl": 2.0870370864868164, + "learning_rate": 3.2900000000000003e-06, + "logits/chosen": 30604160.0, + "logits/rejected": 25881659.2, + "logps/chosen": -170.49140625, + "logps/rejected": -214.0181640625, + "loss": 0.4611818790435791, + "rewards/chosen": -0.9137911796569824, + "rewards/margins": 0.8909661293029785, + "rewards/rejected": -1.804757308959961, + "step": 2040 + }, + { + "epoch": 0.82, + "grad_norm": 0.48172426223754883, + "kl": 2.043773651123047, + "learning_rate": 3.278888888888889e-06, + "logits/chosen": 16779667.2, + "logits/rejected": 17778121.6, + "logps/chosen": -148.56708984375, + "logps/rejected": -125.018115234375, + "loss": 0.49151906967163084, + "rewards/chosen": -1.0805482864379883, + "rewards/margins": -0.045468139648437544, + "rewards/rejected": -1.0350801467895507, + "step": 2050 + }, + { + "epoch": 0.824, + "grad_norm": 0.5591869950294495, + "kl": 1.8221423625946045, + "learning_rate": 3.267777777777778e-06, + "logits/chosen": 18840448.0, + "logits/rejected": 14656315.2, + "logps/chosen": -178.58017578125, + "logps/rejected": -194.3201171875, + "loss": 0.5037118434906006, + "rewards/chosen": -1.5193581581115723, + "rewards/margins": 0.3175524711608886, + "rewards/rejected": -1.8369106292724608, + "step": 2060 + }, + { + "epoch": 0.828, + "grad_norm": 0.6082685589790344, + "kl": 2.7332985401153564, + "learning_rate": 3.2566666666666667e-06, + "logits/chosen": 19653870.4, + "logits/rejected": 18607360.0, + "logps/chosen": -191.222802734375, + "logps/rejected": -195.412109375, + "loss": 0.43700380325317384, + "rewards/chosen": -1.0758570671081542, + "rewards/margins": 1.051231098175049, + "rewards/rejected": -2.127088165283203, + "step": 2070 + }, + { + "epoch": 0.832, + "grad_norm": 0.8018869161605835, + "kl": 1.3849284648895264, + "learning_rate": 3.2455555555555555e-06, + "logits/chosen": 18399478.4, + "logits/rejected": 19887457.6, + "logps/chosen": -193.88709716796876, + "logps/rejected": -149.70872802734374, + "loss": 0.5033087730407715, + "rewards/chosen": -1.796027946472168, + "rewards/margins": -0.644907569885254, + "rewards/rejected": -1.151120376586914, + "step": 2080 + }, + { + "epoch": 0.836, + "grad_norm": 0.6100642681121826, + "kl": 1.6638615131378174, + "learning_rate": 3.2344444444444443e-06, + "logits/chosen": 13364839.2, + "logits/rejected": 12106027.2, + "logps/chosen": -150.7623291015625, + "logps/rejected": -176.14700927734376, + "loss": 0.45767946243286134, + "rewards/chosen": -0.9122394561767578, + "rewards/margins": 1.188156890869141, + "rewards/rejected": -2.1003963470458986, + "step": 2090 + }, + { + "epoch": 0.84, + "grad_norm": 0.4774913191795349, + "kl": 2.0765693187713623, + "learning_rate": 3.223333333333334e-06, + "logits/chosen": 14278204.8, + "logits/rejected": 16952772.8, + "logps/chosen": -180.37510986328124, + "logps/rejected": -164.04486083984375, + "loss": 0.5365061283111572, + "rewards/chosen": -1.5350143432617187, + "rewards/margins": -0.5786049842834472, + "rewards/rejected": -0.9564093589782715, + "step": 2100 + }, + { + "epoch": 0.844, + "grad_norm": 0.424125999212265, + "kl": 1.1270596981048584, + "learning_rate": 3.2122222222222228e-06, + "logits/chosen": 11056914.4, + "logits/rejected": 10464643.2, + "logps/chosen": -159.667138671875, + "logps/rejected": -179.2357177734375, + "loss": 0.4685384750366211, + "rewards/chosen": -0.7500426292419433, + "rewards/margins": 0.7069652557373047, + "rewards/rejected": -1.457007884979248, + "step": 2110 + }, + { + "epoch": 0.848, + "grad_norm": 0.6812456846237183, + "kl": 3.2760558128356934, + "learning_rate": 3.2011111111111116e-06, + "logits/chosen": 12226829.6, + "logits/rejected": 9145164.0, + "logps/chosen": -152.55595703125, + "logps/rejected": -163.9876953125, + "loss": 0.4829984664916992, + "rewards/chosen": -1.0188889503479004, + "rewards/margins": 0.4964068412780762, + "rewards/rejected": -1.5152957916259766, + "step": 2120 + }, + { + "epoch": 0.852, + "grad_norm": 0.521295964717865, + "kl": 1.6184799671173096, + "learning_rate": 3.1900000000000004e-06, + "logits/chosen": 19296177.6, + "logits/rejected": 18037955.2, + "logps/chosen": -190.2342041015625, + "logps/rejected": -179.3730224609375, + "loss": 0.4534940719604492, + "rewards/chosen": -1.4859835624694824, + "rewards/margins": 0.12996721267700195, + "rewards/rejected": -1.6159507751464843, + "step": 2130 + }, + { + "epoch": 0.856, + "grad_norm": 0.3901250660419464, + "kl": 2.1943907737731934, + "learning_rate": 3.178888888888889e-06, + "logits/chosen": 14292169.6, + "logits/rejected": 16561420.8, + "logps/chosen": -172.7265869140625, + "logps/rejected": -174.55390625, + "loss": 0.46813135147094725, + "rewards/chosen": -0.9191327095031738, + "rewards/margins": 0.9562966346740722, + "rewards/rejected": -1.875429344177246, + "step": 2140 + }, + { + "epoch": 0.86, + "grad_norm": 1.2093825340270996, + "kl": 2.53037691116333, + "learning_rate": 3.167777777777778e-06, + "logits/chosen": 15527200.0, + "logits/rejected": 14247240.0, + "logps/chosen": -175.73638916015625, + "logps/rejected": -177.11685791015626, + "loss": 0.4883676052093506, + "rewards/chosen": -1.1620004653930665, + "rewards/margins": 0.6007183074951172, + "rewards/rejected": -1.7627187728881837, + "step": 2150 + }, + { + "epoch": 0.864, + "grad_norm": 0.7262481451034546, + "kl": 2.6998825073242188, + "learning_rate": 3.156666666666667e-06, + "logits/chosen": 25509097.6, + "logits/rejected": 26202662.4, + "logps/chosen": -159.83707275390626, + "logps/rejected": -170.9850341796875, + "loss": 0.44543633460998533, + "rewards/chosen": -0.19056529998779298, + "rewards/margins": 0.7149291038513184, + "rewards/rejected": -0.9054944038391113, + "step": 2160 + }, + { + "epoch": 0.868, + "grad_norm": 0.4244597554206848, + "kl": 1.9011032581329346, + "learning_rate": 3.1455555555555556e-06, + "logits/chosen": 13862787.2, + "logits/rejected": 13148918.4, + "logps/chosen": -144.288720703125, + "logps/rejected": -159.27891845703124, + "loss": 0.46286282539367674, + "rewards/chosen": -1.0397714614868163, + "rewards/margins": 0.25113573074340834, + "rewards/rejected": -1.2909071922302247, + "step": 2170 + }, + { + "epoch": 0.872, + "grad_norm": 0.7587819695472717, + "kl": 4.540980339050293, + "learning_rate": 3.134444444444445e-06, + "logits/chosen": 29813209.6, + "logits/rejected": 31323004.8, + "logps/chosen": -183.304345703125, + "logps/rejected": -198.9873779296875, + "loss": 0.4856124401092529, + "rewards/chosen": -0.8742061614990234, + "rewards/margins": -0.17829103469848628, + "rewards/rejected": -0.6959151268005371, + "step": 2180 + }, + { + "epoch": 0.876, + "grad_norm": 0.36061376333236694, + "kl": 2.879594326019287, + "learning_rate": 3.1233333333333336e-06, + "logits/chosen": 29873868.8, + "logits/rejected": 30440390.4, + "logps/chosen": -143.857861328125, + "logps/rejected": -136.5346435546875, + "loss": 0.5109179496765137, + "rewards/chosen": -0.8628176689147949, + "rewards/margins": -0.39257164001464845, + "rewards/rejected": -0.4702460289001465, + "step": 2190 + }, + { + "epoch": 0.88, + "grad_norm": 0.39630356431007385, + "kl": 2.539196729660034, + "learning_rate": 3.1122222222222224e-06, + "logits/chosen": 20060600.0, + "logits/rejected": 18553404.8, + "logps/chosen": -159.40810546875, + "logps/rejected": -152.76990966796876, + "loss": 0.47215023040771487, + "rewards/chosen": -0.8129859924316406, + "rewards/margins": 0.28859338760375974, + "rewards/rejected": -1.1015793800354003, + "step": 2200 + }, + { + "epoch": 0.88, + "eval_kl": 2.5812811851501465, + "eval_logits/chosen": 21032757.248, + "eval_logits/rejected": 21261236.224, + "eval_logps/chosen": -162.41909375, + "eval_logps/rejected": -158.462875, + "eval_loss": 0.48150432109832764, + "eval_rewards/chosen": -0.8093319091796874, + "eval_rewards/margins": 0.17268438720703128, + "eval_rewards/rejected": -0.9820162963867187, + "eval_runtime": 216.8473, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 2.306, + "step": 2200 + }, + { + "epoch": 0.884, + "grad_norm": 0.6626996994018555, + "kl": 2.7219512462615967, + "learning_rate": 3.1011111111111113e-06, + "logits/chosen": 20549030.4, + "logits/rejected": 23361507.2, + "logps/chosen": -180.177783203125, + "logps/rejected": -172.0116943359375, + "loss": 0.5009243011474609, + "rewards/chosen": -1.0695799827575683, + "rewards/margins": 0.0988718032836915, + "rewards/rejected": -1.1684517860412598, + "step": 2210 + }, + { + "epoch": 0.888, + "grad_norm": 0.40188467502593994, + "kl": 4.388433933258057, + "learning_rate": 3.09e-06, + "logits/chosen": 31197132.8, + "logits/rejected": 29223571.2, + "logps/chosen": -158.33319091796875, + "logps/rejected": -156.518505859375, + "loss": 0.49659576416015627, + "rewards/chosen": -0.5678246021270752, + "rewards/margins": 0.040923357009887695, + "rewards/rejected": -0.6087479591369629, + "step": 2220 + }, + { + "epoch": 0.892, + "grad_norm": 0.7662191390991211, + "kl": 3.7732715606689453, + "learning_rate": 3.078888888888889e-06, + "logits/chosen": 23728937.6, + "logits/rejected": 24493553.6, + "logps/chosen": -122.2478759765625, + "logps/rejected": -116.500439453125, + "loss": 0.4903532028198242, + "rewards/chosen": -0.12796418666839598, + "rewards/margins": 0.14529306888580323, + "rewards/rejected": -0.2732572555541992, + "step": 2230 + }, + { + "epoch": 0.896, + "grad_norm": 0.5434762835502625, + "kl": 5.346643924713135, + "learning_rate": 3.0677777777777777e-06, + "logits/chosen": 23763382.4, + "logits/rejected": 20131742.4, + "logps/chosen": -148.9446044921875, + "logps/rejected": -145.7754638671875, + "loss": 0.4672962188720703, + "rewards/chosen": 0.033642816543579104, + "rewards/margins": 0.5607096195220947, + "rewards/rejected": -0.5270668029785156, + "step": 2240 + }, + { + "epoch": 0.9, + "grad_norm": 0.5850833058357239, + "kl": 4.739095211029053, + "learning_rate": 3.0566666666666665e-06, + "logits/chosen": 20819936.0, + "logits/rejected": 24134200.0, + "logps/chosen": -142.128466796875, + "logps/rejected": -151.9432861328125, + "loss": 0.4636435031890869, + "rewards/chosen": 0.028873807191848753, + "rewards/margins": 0.2783109962940216, + "rewards/rejected": -0.24943718910217286, + "step": 2250 + }, + { + "epoch": 0.904, + "grad_norm": 0.6144809126853943, + "kl": 4.373375415802002, + "learning_rate": 3.045555555555556e-06, + "logits/chosen": 33130332.8, + "logits/rejected": 34606784.0, + "logps/chosen": -155.8130126953125, + "logps/rejected": -168.7025390625, + "loss": 0.47957863807678225, + "rewards/chosen": -0.34006266593933104, + "rewards/margins": 0.1747920036315918, + "rewards/rejected": -0.5148546695709229, + "step": 2260 + }, + { + "epoch": 0.908, + "grad_norm": 0.48172423243522644, + "kl": 3.806690216064453, + "learning_rate": 3.034444444444445e-06, + "logits/chosen": 29197280.0, + "logits/rejected": 26733576.0, + "logps/chosen": -156.34307861328125, + "logps/rejected": -124.57420654296875, + "loss": 0.486788272857666, + "rewards/chosen": -0.15740108489990234, + "rewards/margins": 0.07049424648284913, + "rewards/rejected": -0.22789533138275148, + "step": 2270 + }, + { + "epoch": 0.912, + "grad_norm": 0.5201888680458069, + "kl": 2.4590580463409424, + "learning_rate": 3.0233333333333338e-06, + "logits/chosen": 13540443.2, + "logits/rejected": 11543592.0, + "logps/chosen": -128.15758056640624, + "logps/rejected": -144.884130859375, + "loss": 0.5047108173370362, + "rewards/chosen": -0.8088220596313477, + "rewards/margins": 0.07349948883056634, + "rewards/rejected": -0.882321548461914, + "step": 2280 + }, + { + "epoch": 0.916, + "grad_norm": 0.5650275945663452, + "kl": 4.004490852355957, + "learning_rate": 3.0122222222222226e-06, + "logits/chosen": 30858310.4, + "logits/rejected": 30752073.6, + "logps/chosen": -173.15472412109375, + "logps/rejected": -179.914208984375, + "loss": 0.45772509574890136, + "rewards/chosen": -0.07655960321426392, + "rewards/margins": 0.580165708065033, + "rewards/rejected": -0.6567253112792969, + "step": 2290 + }, + { + "epoch": 0.92, + "grad_norm": 0.6002667546272278, + "kl": 2.4904167652130127, + "learning_rate": 3.0011111111111114e-06, + "logits/chosen": 27612214.4, + "logits/rejected": 29905420.8, + "logps/chosen": -170.78812255859376, + "logps/rejected": -171.15484619140625, + "loss": 0.48928098678588866, + "rewards/chosen": -0.48044404983520506, + "rewards/margins": 0.08476023674011235, + "rewards/rejected": -0.5652042865753174, + "step": 2300 + }, + { + "epoch": 0.924, + "grad_norm": 0.7137225866317749, + "kl": 2.9995059967041016, + "learning_rate": 2.99e-06, + "logits/chosen": 33246598.4, + "logits/rejected": 31494838.4, + "logps/chosen": -124.85633544921875, + "logps/rejected": -151.764404296875, + "loss": 0.46476993560791013, + "rewards/chosen": -0.3835261344909668, + "rewards/margins": 0.35444231033325196, + "rewards/rejected": -0.7379684448242188, + "step": 2310 + }, + { + "epoch": 0.928, + "grad_norm": 0.48665422201156616, + "kl": 4.963588714599609, + "learning_rate": 2.978888888888889e-06, + "logits/chosen": 26167496.0, + "logits/rejected": 26003188.8, + "logps/chosen": -154.3181640625, + "logps/rejected": -193.70732421875, + "loss": 0.4603987216949463, + "rewards/chosen": -0.019819003343582154, + "rewards/margins": 0.6258892238140107, + "rewards/rejected": -0.6457082271575928, + "step": 2320 + }, + { + "epoch": 0.932, + "grad_norm": 0.6779302954673767, + "kl": 3.996805191040039, + "learning_rate": 2.9677777777777778e-06, + "logits/chosen": 26639760.0, + "logits/rejected": 24185547.2, + "logps/chosen": -145.71864013671876, + "logps/rejected": -165.16456298828126, + "loss": 0.41465444564819337, + "rewards/chosen": 0.16978931427001953, + "rewards/margins": 0.8076234340667725, + "rewards/rejected": -0.637834119796753, + "step": 2330 + }, + { + "epoch": 0.936, + "grad_norm": 0.8533156514167786, + "kl": 3.2005672454833984, + "learning_rate": 2.956666666666667e-06, + "logits/chosen": 17311833.6, + "logits/rejected": 18152035.2, + "logps/chosen": -139.579248046875, + "logps/rejected": -141.84622802734376, + "loss": 0.4770832538604736, + "rewards/chosen": -0.5391797542572021, + "rewards/margins": 0.2623293399810791, + "rewards/rejected": -0.8015090942382812, + "step": 2340 + }, + { + "epoch": 0.94, + "grad_norm": 0.7501420974731445, + "kl": 5.2280778884887695, + "learning_rate": 2.945555555555556e-06, + "logits/chosen": 19459948.8, + "logits/rejected": 19488014.4, + "logps/chosen": -181.2943603515625, + "logps/rejected": -141.72918701171875, + "loss": 0.4741385459899902, + "rewards/chosen": -0.3969358682632446, + "rewards/margins": 0.048303866386413596, + "rewards/rejected": -0.4452397346496582, + "step": 2350 + }, + { + "epoch": 0.944, + "grad_norm": 0.47924181818962097, + "kl": 5.855168342590332, + "learning_rate": 2.9344444444444446e-06, + "logits/chosen": 25643113.6, + "logits/rejected": 21872040.0, + "logps/chosen": -146.56474609375, + "logps/rejected": -149.1546630859375, + "loss": 0.4523441314697266, + "rewards/chosen": 0.34644312858581544, + "rewards/margins": 0.6119464874267578, + "rewards/rejected": -0.2655033588409424, + "step": 2360 + }, + { + "epoch": 0.948, + "grad_norm": 0.6821103692054749, + "kl": 7.001960754394531, + "learning_rate": 2.9233333333333334e-06, + "logits/chosen": 26589932.8, + "logits/rejected": 24771849.6, + "logps/chosen": -132.04959716796876, + "logps/rejected": -197.91544189453126, + "loss": 0.5003488063812256, + "rewards/chosen": -0.008898758888244629, + "rewards/margins": 0.02103534936904907, + "rewards/rejected": -0.0299341082572937, + "step": 2370 + }, + { + "epoch": 0.952, + "grad_norm": 0.6097027063369751, + "kl": 6.377338409423828, + "learning_rate": 2.9122222222222222e-06, + "logits/chosen": 40187350.4, + "logits/rejected": 39877142.4, + "logps/chosen": -171.79857177734374, + "logps/rejected": -151.0936767578125, + "loss": 0.46953182220458983, + "rewards/chosen": 0.39073307514190675, + "rewards/margins": 0.4624105989933014, + "rewards/rejected": -0.07167752385139466, + "step": 2380 + }, + { + "epoch": 0.956, + "grad_norm": 0.8344343900680542, + "kl": 5.0947136878967285, + "learning_rate": 2.901111111111111e-06, + "logits/chosen": 27525568.0, + "logits/rejected": 27525084.8, + "logps/chosen": -174.18642578125, + "logps/rejected": -169.3951171875, + "loss": 0.4775404453277588, + "rewards/chosen": -0.11374995708465577, + "rewards/margins": 0.36258018016815186, + "rewards/rejected": -0.4763301372528076, + "step": 2390 + }, + { + "epoch": 0.96, + "grad_norm": 0.5999415516853333, + "kl": 5.195433616638184, + "learning_rate": 2.89e-06, + "logits/chosen": 32106156.8, + "logits/rejected": 31147836.8, + "logps/chosen": -166.06192626953126, + "logps/rejected": -175.74422607421874, + "loss": 0.4666886329650879, + "rewards/chosen": 0.09231564402580261, + "rewards/margins": 0.445311564207077, + "rewards/rejected": -0.3529959201812744, + "step": 2400 + }, + { + "epoch": 0.96, + "eval_kl": 5.085776329040527, + "eval_logits/chosen": 27241426.944, + "eval_logits/rejected": 27194333.184, + "eval_logps/chosen": -154.09196875, + "eval_logps/rejected": -150.0654375, + "eval_loss": 0.4826502501964569, + "eval_rewards/chosen": 0.023380521774291993, + "eval_rewards/margins": 0.16565044975280763, + "eval_rewards/rejected": -0.14226992797851562, + "eval_runtime": 216.6502, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 2400 + }, + { + "epoch": 0.964, + "grad_norm": 0.6591479182243347, + "kl": 4.856285095214844, + "learning_rate": 2.8788888888888895e-06, + "logits/chosen": 33843148.8, + "logits/rejected": 33023673.6, + "logps/chosen": -162.89716796875, + "logps/rejected": -152.8024169921875, + "loss": 0.4535430908203125, + "rewards/chosen": 0.35766189098358153, + "rewards/margins": 0.5051510214805603, + "rewards/rejected": -0.14748913049697876, + "step": 2410 + }, + { + "epoch": 0.968, + "grad_norm": 0.649363100528717, + "kl": 5.67615270614624, + "learning_rate": 2.8677777777777783e-06, + "logits/chosen": 28120470.4, + "logits/rejected": 28187414.4, + "logps/chosen": -148.36553955078125, + "logps/rejected": -164.37691650390624, + "loss": 0.49935593605041506, + "rewards/chosen": -0.13897392749786378, + "rewards/margins": 0.09212601184844971, + "rewards/rejected": -0.23109993934631348, + "step": 2420 + }, + { + "epoch": 0.972, + "grad_norm": 0.7029784321784973, + "kl": 5.981629848480225, + "learning_rate": 2.856666666666667e-06, + "logits/chosen": 33376736.0, + "logits/rejected": 35385472.0, + "logps/chosen": -160.655419921875, + "logps/rejected": -113.14091796875, + "loss": 0.5014323711395263, + "rewards/chosen": -0.06511507034301758, + "rewards/margins": -0.10962846279144288, + "rewards/rejected": 0.044513392448425296, + "step": 2430 + }, + { + "epoch": 0.976, + "grad_norm": 0.5741814970970154, + "kl": 7.015416145324707, + "learning_rate": 2.845555555555556e-06, + "logits/chosen": 24375812.8, + "logits/rejected": 23925715.2, + "logps/chosen": -142.3306884765625, + "logps/rejected": -145.99935302734374, + "loss": 0.47071352005004885, + "rewards/chosen": 0.3844744205474854, + "rewards/margins": 0.33515343666076663, + "rewards/rejected": 0.04932098388671875, + "step": 2440 + }, + { + "epoch": 0.98, + "grad_norm": 0.708365261554718, + "kl": 7.640904426574707, + "learning_rate": 2.8344444444444447e-06, + "logits/chosen": 36292083.2, + "logits/rejected": 33427609.6, + "logps/chosen": -175.033447265625, + "logps/rejected": -175.34300537109374, + "loss": 0.46329379081726074, + "rewards/chosen": 0.707914161682129, + "rewards/margins": 0.43219349384307865, + "rewards/rejected": 0.2757206678390503, + "step": 2450 + }, + { + "epoch": 0.984, + "grad_norm": 0.8229350447654724, + "kl": 6.793179512023926, + "learning_rate": 2.8233333333333335e-06, + "logits/chosen": 34248473.6, + "logits/rejected": 34939712.0, + "logps/chosen": -144.50880126953126, + "logps/rejected": -149.553759765625, + "loss": 0.49341444969177245, + "rewards/chosen": 0.4542993545532227, + "rewards/margins": 0.14487073421478275, + "rewards/rejected": 0.30942862033843993, + "step": 2460 + }, + { + "epoch": 0.988, + "grad_norm": 0.8729678392410278, + "kl": 6.059536933898926, + "learning_rate": 2.8122222222222224e-06, + "logits/chosen": 39128422.4, + "logits/rejected": 35834524.8, + "logps/chosen": -160.21749267578124, + "logps/rejected": -118.927099609375, + "loss": 0.4856616973876953, + "rewards/chosen": 0.2684544324874878, + "rewards/margins": 0.2460126757621765, + "rewards/rejected": 0.02244175672531128, + "step": 2470 + }, + { + "epoch": 0.992, + "grad_norm": 0.7808408737182617, + "kl": 4.119040489196777, + "learning_rate": 2.801111111111111e-06, + "logits/chosen": 14563339.2, + "logits/rejected": 14386867.2, + "logps/chosen": -131.0562255859375, + "logps/rejected": -109.05662841796875, + "loss": 0.50515718460083, + "rewards/chosen": -0.17694272994995117, + "rewards/margins": -0.02005159854888916, + "rewards/rejected": -0.156891131401062, + "step": 2480 + }, + { + "epoch": 0.996, + "grad_norm": 0.7683461904525757, + "kl": 5.681182861328125, + "learning_rate": 2.7900000000000004e-06, + "logits/chosen": 34791257.6, + "logits/rejected": 35001116.8, + "logps/chosen": -116.57052001953124, + "logps/rejected": -133.0627197265625, + "loss": 0.46341490745544434, + "rewards/chosen": 0.3942615032196045, + "rewards/margins": 0.4242114990949631, + "rewards/rejected": -0.02994999587535858, + "step": 2490 + }, + { + "epoch": 1.0, + "grad_norm": 0.7146331667900085, + "kl": 7.186850547790527, + "learning_rate": 2.778888888888889e-06, + "logits/chosen": 27759424.0, + "logits/rejected": 28190390.4, + "logps/chosen": -158.3378173828125, + "logps/rejected": -122.17666015625, + "loss": 0.4907883644104004, + "rewards/chosen": 0.33272812366485593, + "rewards/margins": 3.4856796264637335e-05, + "rewards/rejected": 0.3326932668685913, + "step": 2500 + }, + { + "epoch": 1.004, + "grad_norm": 0.7267434597015381, + "kl": 7.022622108459473, + "learning_rate": 2.767777777777778e-06, + "logits/chosen": 23414056.0, + "logits/rejected": 23530460.8, + "logps/chosen": -142.120947265625, + "logps/rejected": -126.23233642578126, + "loss": 0.450551700592041, + "rewards/chosen": 0.6840017318725586, + "rewards/margins": 0.4234133481979371, + "rewards/rejected": 0.26058838367462156, + "step": 2510 + }, + { + "epoch": 1.008, + "grad_norm": 0.613120436668396, + "kl": 7.363889217376709, + "learning_rate": 2.756666666666667e-06, + "logits/chosen": 42853379.2, + "logits/rejected": 42718368.0, + "logps/chosen": -133.13275146484375, + "logps/rejected": -147.75242919921874, + "loss": 0.4773738384246826, + "rewards/chosen": 0.6898352622985839, + "rewards/margins": 0.2725923061370849, + "rewards/rejected": 0.41724295616149903, + "step": 2520 + }, + { + "epoch": 1.012, + "grad_norm": 0.4656667113304138, + "kl": 6.543205261230469, + "learning_rate": 2.7455555555555556e-06, + "logits/chosen": 24894561.6, + "logits/rejected": 23945945.6, + "logps/chosen": -130.03875732421875, + "logps/rejected": -139.270556640625, + "loss": 0.48987507820129395, + "rewards/chosen": 0.16998794078826904, + "rewards/margins": 0.060191738605499256, + "rewards/rejected": 0.10979620218276978, + "step": 2530 + }, + { + "epoch": 1.016, + "grad_norm": 0.6344980597496033, + "kl": 8.745767593383789, + "learning_rate": 2.7344444444444444e-06, + "logits/chosen": 33636630.4, + "logits/rejected": 33898816.0, + "logps/chosen": -137.0499755859375, + "logps/rejected": -142.44915771484375, + "loss": 0.46440706253051756, + "rewards/chosen": 0.7247509479522705, + "rewards/margins": 0.3660990238189697, + "rewards/rejected": 0.35865192413330077, + "step": 2540 + }, + { + "epoch": 1.02, + "grad_norm": 0.5636667013168335, + "kl": 5.15373420715332, + "learning_rate": 2.7233333333333332e-06, + "logits/chosen": 32778352.0, + "logits/rejected": 34006931.2, + "logps/chosen": -138.170361328125, + "logps/rejected": -156.81767578125, + "loss": 0.4536026954650879, + "rewards/chosen": 0.28399336338043213, + "rewards/margins": 0.5328751564025879, + "rewards/rejected": -0.24888179302215577, + "step": 2550 + }, + { + "epoch": 1.024, + "grad_norm": 0.5508406758308411, + "kl": 4.445399284362793, + "learning_rate": 2.712222222222222e-06, + "logits/chosen": 24235310.4, + "logits/rejected": 20467011.2, + "logps/chosen": -102.88858642578126, + "logps/rejected": -117.8940673828125, + "loss": 0.4521032333374023, + "rewards/chosen": 0.2849747180938721, + "rewards/margins": 0.6732351303100585, + "rewards/rejected": -0.3882604122161865, + "step": 2560 + }, + { + "epoch": 1.028, + "grad_norm": 0.6794329881668091, + "kl": 7.5771074295043945, + "learning_rate": 2.7011111111111117e-06, + "logits/chosen": 39230246.4, + "logits/rejected": 36269590.4, + "logps/chosen": -160.77816162109374, + "logps/rejected": -175.435302734375, + "loss": 0.4510225296020508, + "rewards/chosen": 0.48538646697998045, + "rewards/margins": 0.5213055074214935, + "rewards/rejected": -0.03591904044151306, + "step": 2570 + }, + { + "epoch": 1.032, + "grad_norm": 0.873762845993042, + "kl": 7.515707969665527, + "learning_rate": 2.6900000000000005e-06, + "logits/chosen": 34034691.2, + "logits/rejected": 32782438.4, + "logps/chosen": -146.67379150390624, + "logps/rejected": -160.89415283203124, + "loss": 0.4409791946411133, + "rewards/chosen": 0.8245258331298828, + "rewards/margins": 0.5631396770477295, + "rewards/rejected": 0.2613861560821533, + "step": 2580 + }, + { + "epoch": 1.036, + "grad_norm": 0.8786899447441101, + "kl": 7.477902889251709, + "learning_rate": 2.6788888888888893e-06, + "logits/chosen": 31405507.2, + "logits/rejected": 30546566.4, + "logps/chosen": -162.37747802734376, + "logps/rejected": -158.5367919921875, + "loss": 0.429317569732666, + "rewards/chosen": 0.7475490093231201, + "rewards/margins": 0.6858846783638, + "rewards/rejected": 0.06166433095932007, + "step": 2590 + }, + { + "epoch": 1.04, + "grad_norm": 0.9408835768699646, + "kl": 6.193826198577881, + "learning_rate": 2.667777777777778e-06, + "logits/chosen": 18314294.4, + "logits/rejected": 17261046.4, + "logps/chosen": -133.769482421875, + "logps/rejected": -166.3760498046875, + "loss": 0.4750513553619385, + "rewards/chosen": 0.05294798612594605, + "rewards/margins": 0.23716256618499754, + "rewards/rejected": -0.1842145800590515, + "step": 2600 + }, + { + "epoch": 1.04, + "eval_kl": 6.557363986968994, + "eval_logits/chosen": 28159451.136, + "eval_logits/rejected": 27912509.44, + "eval_logps/chosen": -150.601921875, + "eval_logps/rejected": -146.599171875, + "eval_loss": 0.4829034209251404, + "eval_rewards/chosen": 0.3723853454589844, + "eval_rewards/margins": 0.16802960205078127, + "eval_rewards/rejected": 0.20435574340820312, + "eval_runtime": 217.1791, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 2.302, + "step": 2600 + }, + { + "epoch": 1.044, + "grad_norm": 0.6559078693389893, + "kl": 6.465074062347412, + "learning_rate": 2.656666666666667e-06, + "logits/chosen": 29268515.2, + "logits/rejected": 27382860.8, + "logps/chosen": -124.25496826171874, + "logps/rejected": -132.88599853515626, + "loss": 0.46654496192932127, + "rewards/chosen": 0.6311461448669433, + "rewards/margins": 0.35537390708923333, + "rewards/rejected": 0.27577223777771, + "step": 2610 + }, + { + "epoch": 1.048, + "grad_norm": 0.8091041445732117, + "kl": 10.043633460998535, + "learning_rate": 2.6455555555555557e-06, + "logits/chosen": 36276444.8, + "logits/rejected": 36037961.6, + "logps/chosen": -156.52952880859374, + "logps/rejected": -128.44246826171874, + "loss": 0.49141683578491213, + "rewards/chosen": 0.5628880023956299, + "rewards/margins": 0.02750706672668457, + "rewards/rejected": 0.5353809356689453, + "step": 2620 + }, + { + "epoch": 1.052, + "grad_norm": 0.6571462154388428, + "kl": 9.852654457092285, + "learning_rate": 2.6344444444444445e-06, + "logits/chosen": 31615228.8, + "logits/rejected": 32248979.2, + "logps/chosen": -154.07755126953126, + "logps/rejected": -132.39072265625, + "loss": 0.49523077011108396, + "rewards/chosen": 0.8096317291259766, + "rewards/margins": 0.018443870544433638, + "rewards/rejected": 0.791187858581543, + "step": 2630 + }, + { + "epoch": 1.056, + "grad_norm": 0.7390360832214355, + "kl": 6.219546318054199, + "learning_rate": 2.6233333333333333e-06, + "logits/chosen": 34434326.4, + "logits/rejected": 35229926.4, + "logps/chosen": -137.31668701171876, + "logps/rejected": -155.1895751953125, + "loss": 0.4699239730834961, + "rewards/chosen": 0.5334546089172363, + "rewards/margins": 0.24749846458435054, + "rewards/rejected": 0.28595614433288574, + "step": 2640 + }, + { + "epoch": 1.06, + "grad_norm": 0.4814999997615814, + "kl": 7.876091003417969, + "learning_rate": 2.6122222222222226e-06, + "logits/chosen": 31323043.2, + "logits/rejected": 32345257.6, + "logps/chosen": -123.53333740234375, + "logps/rejected": -115.850341796875, + "loss": 0.43686504364013673, + "rewards/chosen": 0.7052061557769775, + "rewards/margins": 0.533865237236023, + "rewards/rejected": 0.1713409185409546, + "step": 2650 + }, + { + "epoch": 1.064, + "grad_norm": 0.690242350101471, + "kl": 6.12372350692749, + "learning_rate": 2.6011111111111114e-06, + "logits/chosen": 25323872.0, + "logits/rejected": 25615576.0, + "logps/chosen": -141.92086181640624, + "logps/rejected": -147.47308349609375, + "loss": 0.4814923763275146, + "rewards/chosen": 0.1965832829475403, + "rewards/margins": 0.2016111582517624, + "rewards/rejected": -0.005027875304222107, + "step": 2660 + }, + { + "epoch": 1.068, + "grad_norm": 0.5838690400123596, + "kl": 5.5848894119262695, + "learning_rate": 2.59e-06, + "logits/chosen": 26498441.6, + "logits/rejected": 25519251.2, + "logps/chosen": -134.1501220703125, + "logps/rejected": -152.97462158203126, + "loss": 0.44647746086120604, + "rewards/chosen": 0.45215396881103515, + "rewards/margins": 0.7281685590744018, + "rewards/rejected": -0.2760145902633667, + "step": 2670 + }, + { + "epoch": 1.072, + "grad_norm": 0.6714196801185608, + "kl": 6.437767028808594, + "learning_rate": 2.578888888888889e-06, + "logits/chosen": 33809168.0, + "logits/rejected": 33002240.0, + "logps/chosen": -140.62269287109376, + "logps/rejected": -169.9681640625, + "loss": 0.46931910514831543, + "rewards/chosen": 0.5085949420928955, + "rewards/margins": 0.36951395273208615, + "rewards/rejected": 0.13908098936080932, + "step": 2680 + }, + { + "epoch": 1.076, + "grad_norm": 0.6106992959976196, + "kl": 4.450573921203613, + "learning_rate": 2.567777777777778e-06, + "logits/chosen": 30799760.0, + "logits/rejected": 30721590.4, + "logps/chosen": -122.03265380859375, + "logps/rejected": -137.297802734375, + "loss": 0.4664300441741943, + "rewards/chosen": 0.25887534618377683, + "rewards/margins": 0.22206425368785856, + "rewards/rejected": 0.036811092495918275, + "step": 2690 + }, + { + "epoch": 1.08, + "grad_norm": 0.9053173661231995, + "kl": 6.650594234466553, + "learning_rate": 2.5566666666666666e-06, + "logits/chosen": 38100038.4, + "logits/rejected": 34380816.0, + "logps/chosen": -145.046533203125, + "logps/rejected": -173.3657958984375, + "loss": 0.43851666450500487, + "rewards/chosen": 0.6501460552215577, + "rewards/margins": 0.6605178594589234, + "rewards/rejected": -0.010371804237365723, + "step": 2700 + }, + { + "epoch": 1.084, + "grad_norm": 0.676274836063385, + "kl": 3.7379002571105957, + "learning_rate": 2.5455555555555554e-06, + "logits/chosen": 19580768.0, + "logits/rejected": 20510683.2, + "logps/chosen": -145.21326904296876, + "logps/rejected": -136.80472412109376, + "loss": 0.5046684741973877, + "rewards/chosen": -0.46685400009155276, + "rewards/margins": -0.04879570007324219, + "rewards/rejected": -0.41805830001831057, + "step": 2710 + }, + { + "epoch": 1.088, + "grad_norm": 0.7222055792808533, + "kl": 5.816348075866699, + "learning_rate": 2.534444444444445e-06, + "logits/chosen": 22407427.2, + "logits/rejected": 21474601.6, + "logps/chosen": -100.11134643554688, + "logps/rejected": -127.4869384765625, + "loss": 0.4877651214599609, + "rewards/chosen": 0.17378766536712648, + "rewards/margins": 0.2759766340255737, + "rewards/rejected": -0.10218896865844726, + "step": 2720 + }, + { + "epoch": 1.092, + "grad_norm": 0.6274256110191345, + "kl": 5.171383857727051, + "learning_rate": 2.523333333333334e-06, + "logits/chosen": 35835520.0, + "logits/rejected": 35165875.2, + "logps/chosen": -186.44775390625, + "logps/rejected": -167.0265869140625, + "loss": 0.466900634765625, + "rewards/chosen": -0.10870237350463867, + "rewards/margins": 0.21330931186676022, + "rewards/rejected": -0.3220116853713989, + "step": 2730 + }, + { + "epoch": 1.096, + "grad_norm": 0.6451675295829773, + "kl": 5.268320083618164, + "learning_rate": 2.5122222222222227e-06, + "logits/chosen": 32315388.8, + "logits/rejected": 34424963.2, + "logps/chosen": -177.06943359375, + "logps/rejected": -168.0710205078125, + "loss": 0.47601852416992185, + "rewards/chosen": -0.004057984054088593, + "rewards/margins": 0.10740263015031815, + "rewards/rejected": -0.11146061420440674, + "step": 2740 + }, + { + "epoch": 1.1, + "grad_norm": 0.9512624740600586, + "kl": 5.623786926269531, + "learning_rate": 2.5011111111111115e-06, + "logits/chosen": 24552736.0, + "logits/rejected": 23781004.8, + "logps/chosen": -174.98658447265626, + "logps/rejected": -152.33095703125, + "loss": 0.42731657028198244, + "rewards/chosen": 0.17493221759796143, + "rewards/margins": 0.8694432020187378, + "rewards/rejected": -0.6945109844207764, + "step": 2750 + }, + { + "epoch": 1.104, + "grad_norm": 0.8605503439903259, + "kl": 3.258263111114502, + "learning_rate": 2.4900000000000003e-06, + "logits/chosen": 17540195.2, + "logits/rejected": 14481008.0, + "logps/chosen": -130.9520263671875, + "logps/rejected": -140.87447509765624, + "loss": 0.4474031925201416, + "rewards/chosen": -0.1570604920387268, + "rewards/margins": 0.7482632040977477, + "rewards/rejected": -0.9053236961364746, + "step": 2760 + }, + { + "epoch": 1.108, + "grad_norm": 0.8902762532234192, + "kl": 6.445823669433594, + "learning_rate": 2.478888888888889e-06, + "logits/chosen": 25843875.2, + "logits/rejected": 25419281.6, + "logps/chosen": -155.84178466796874, + "logps/rejected": -156.02589111328126, + "loss": 0.464084529876709, + "rewards/chosen": 0.1782880425453186, + "rewards/margins": 0.5563196301460266, + "rewards/rejected": -0.378031587600708, + "step": 2770 + }, + { + "epoch": 1.112, + "grad_norm": 0.7916592359542847, + "kl": 5.980124473571777, + "learning_rate": 2.467777777777778e-06, + "logits/chosen": 23357057.6, + "logits/rejected": 19639329.6, + "logps/chosen": -165.2984619140625, + "logps/rejected": -144.48494873046874, + "loss": 0.44487595558166504, + "rewards/chosen": 0.28723764419555664, + "rewards/margins": 0.6892455577850342, + "rewards/rejected": -0.4020079135894775, + "step": 2780 + }, + { + "epoch": 1.116, + "grad_norm": 0.5942727327346802, + "kl": 7.625303745269775, + "learning_rate": 2.4566666666666667e-06, + "logits/chosen": 22922739.2, + "logits/rejected": 22430774.4, + "logps/chosen": -139.4244873046875, + "logps/rejected": -147.5355224609375, + "loss": 0.43770174980163573, + "rewards/chosen": 0.7275904655456543, + "rewards/margins": 0.641109848022461, + "rewards/rejected": 0.08648061752319336, + "step": 2790 + }, + { + "epoch": 1.12, + "grad_norm": 0.6265588402748108, + "kl": 5.79810094833374, + "learning_rate": 2.4455555555555555e-06, + "logits/chosen": 33326956.8, + "logits/rejected": 33234153.6, + "logps/chosen": -165.5775634765625, + "logps/rejected": -181.19012451171875, + "loss": 0.46999220848083495, + "rewards/chosen": 0.3356909275054932, + "rewards/margins": 0.41499342918396, + "rewards/rejected": -0.0793025016784668, + "step": 2800 + }, + { + "epoch": 1.12, + "eval_kl": 6.51247501373291, + "eval_logits/chosen": 31125557.248, + "eval_logits/rejected": 30991392.768, + "eval_logps/chosen": -150.6563125, + "eval_logps/rejected": -146.877875, + "eval_loss": 0.480685293674469, + "eval_rewards/chosen": 0.36694525146484375, + "eval_rewards/margins": 0.19045977783203125, + "eval_rewards/rejected": 0.1764854736328125, + "eval_runtime": 217.0415, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 2.304, + "step": 2800 + }, + { + "epoch": 1.124, + "grad_norm": 0.6718530654907227, + "kl": 7.274069309234619, + "learning_rate": 2.4344444444444448e-06, + "logits/chosen": 28781395.2, + "logits/rejected": 28910336.0, + "logps/chosen": -142.17664794921876, + "logps/rejected": -153.1429443359375, + "loss": 0.48170881271362304, + "rewards/chosen": 0.4687415599822998, + "rewards/margins": 0.17820630073547367, + "rewards/rejected": 0.29053525924682616, + "step": 2810 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.8993642330169678, + "kl": 5.114150047302246, + "learning_rate": 2.4233333333333336e-06, + "logits/chosen": 31094944.0, + "logits/rejected": 29708396.8, + "logps/chosen": -158.43695068359375, + "logps/rejected": -134.23763427734374, + "loss": 0.4788343906402588, + "rewards/chosen": 0.18828521966934203, + "rewards/margins": 0.20167077183723447, + "rewards/rejected": -0.013385552167892455, + "step": 2820 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 0.5459766387939453, + "kl": 6.137971878051758, + "learning_rate": 2.4122222222222224e-06, + "logits/chosen": 41590592.0, + "logits/rejected": 39072947.2, + "logps/chosen": -145.41527099609374, + "logps/rejected": -144.8617919921875, + "loss": 0.40869617462158203, + "rewards/chosen": 0.8426953315734863, + "rewards/margins": 0.9073116958141326, + "rewards/rejected": -0.06461636424064636, + "step": 2830 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.6354557275772095, + "kl": 5.928516387939453, + "learning_rate": 2.401111111111111e-06, + "logits/chosen": 34861424.0, + "logits/rejected": 34759747.2, + "logps/chosen": -129.4930908203125, + "logps/rejected": -143.89154052734375, + "loss": 0.44280567169189455, + "rewards/chosen": 0.5408330440521241, + "rewards/margins": 0.5000333577394486, + "rewards/rejected": 0.04079968631267548, + "step": 2840 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.5932331085205078, + "kl": 8.782042503356934, + "learning_rate": 2.39e-06, + "logits/chosen": 32643923.2, + "logits/rejected": 30833843.2, + "logps/chosen": -127.0966064453125, + "logps/rejected": -164.51356201171876, + "loss": 0.4796291351318359, + "rewards/chosen": 0.8198535919189454, + "rewards/margins": 0.17787570953369147, + "rewards/rejected": 0.6419778823852539, + "step": 2850 + }, + { + "epoch": 1.144, + "grad_norm": 0.4119536578655243, + "kl": 6.023087024688721, + "learning_rate": 2.3788888888888892e-06, + "logits/chosen": 34157424.0, + "logits/rejected": 33897881.6, + "logps/chosen": -142.58350830078126, + "logps/rejected": -112.63641357421875, + "loss": 0.44989490509033203, + "rewards/chosen": 0.5664341926574707, + "rewards/margins": 0.5420472577214241, + "rewards/rejected": 0.0243869349360466, + "step": 2860 + }, + { + "epoch": 1.148, + "grad_norm": 0.6733080744743347, + "kl": 7.073210716247559, + "learning_rate": 2.367777777777778e-06, + "logits/chosen": 32286940.8, + "logits/rejected": 32065385.6, + "logps/chosen": -144.9827392578125, + "logps/rejected": -150.515478515625, + "loss": 0.4916172981262207, + "rewards/chosen": 0.20184409618377686, + "rewards/margins": 0.11459586024284363, + "rewards/rejected": 0.08724823594093323, + "step": 2870 + }, + { + "epoch": 1.152, + "grad_norm": 1.03435480594635, + "kl": 6.8236589431762695, + "learning_rate": 2.356666666666667e-06, + "logits/chosen": 29720467.2, + "logits/rejected": 29326092.8, + "logps/chosen": -151.82506103515624, + "logps/rejected": -165.98399658203124, + "loss": 0.45775256156921384, + "rewards/chosen": 0.4520999908447266, + "rewards/margins": 0.2227289915084839, + "rewards/rejected": 0.22937099933624266, + "step": 2880 + }, + { + "epoch": 1.156, + "grad_norm": 0.5883368849754333, + "kl": 6.491732120513916, + "learning_rate": 2.3455555555555556e-06, + "logits/chosen": 23835811.2, + "logits/rejected": 23859849.6, + "logps/chosen": -170.26668701171874, + "logps/rejected": -150.8224609375, + "loss": 0.431504487991333, + "rewards/chosen": 0.614830207824707, + "rewards/margins": 0.6636435002088547, + "rewards/rejected": -0.048813292384147645, + "step": 2890 + }, + { + "epoch": 1.16, + "grad_norm": 0.7225193977355957, + "kl": 5.883708477020264, + "learning_rate": 2.334444444444445e-06, + "logits/chosen": 36251644.8, + "logits/rejected": 33482300.8, + "logps/chosen": -145.1569091796875, + "logps/rejected": -140.39755859375, + "loss": 0.4224736213684082, + "rewards/chosen": 0.4660323619842529, + "rewards/margins": 0.918277359008789, + "rewards/rejected": -0.45224499702453613, + "step": 2900 + }, + { + "epoch": 1.164, + "grad_norm": 0.550423800945282, + "kl": 6.711850643157959, + "learning_rate": 2.3233333333333337e-06, + "logits/chosen": 37449692.8, + "logits/rejected": 35041868.8, + "logps/chosen": -142.85999755859376, + "logps/rejected": -148.87371826171875, + "loss": 0.45527114868164065, + "rewards/chosen": 0.5728636741638183, + "rewards/margins": 0.5280719608068466, + "rewards/rejected": 0.04479171335697174, + "step": 2910 + }, + { + "epoch": 1.168, + "grad_norm": 0.8112925887107849, + "kl": 3.537787675857544, + "learning_rate": 2.3122222222222225e-06, + "logits/chosen": 24047856.0, + "logits/rejected": 23857336.0, + "logps/chosen": -164.10267333984376, + "logps/rejected": -137.96185302734375, + "loss": 0.44495596885681155, + "rewards/chosen": -0.09260135293006896, + "rewards/margins": 0.5166131436824798, + "rewards/rejected": -0.6092144966125488, + "step": 2920 + }, + { + "epoch": 1.172, + "grad_norm": 0.5182350277900696, + "kl": 5.1398210525512695, + "learning_rate": 2.3011111111111113e-06, + "logits/chosen": 24040510.4, + "logits/rejected": 25510265.6, + "logps/chosen": -170.7597412109375, + "logps/rejected": -124.09306640625, + "loss": 0.45585017204284667, + "rewards/chosen": 0.017396342754364014, + "rewards/margins": 0.4217635035514832, + "rewards/rejected": -0.40436716079711915, + "step": 2930 + }, + { + "epoch": 1.176, + "grad_norm": 0.7129570841789246, + "kl": 5.55086612701416, + "learning_rate": 2.29e-06, + "logits/chosen": 36301065.6, + "logits/rejected": 36730444.8, + "logps/chosen": -144.58111572265625, + "logps/rejected": -170.131298828125, + "loss": 0.45364060401916506, + "rewards/chosen": 0.47618856430053713, + "rewards/margins": 0.4523512840270996, + "rewards/rejected": 0.0238372802734375, + "step": 2940 + }, + { + "epoch": 1.18, + "grad_norm": 0.689380943775177, + "kl": 5.013358116149902, + "learning_rate": 2.278888888888889e-06, + "logits/chosen": 26987398.4, + "logits/rejected": 27705705.6, + "logps/chosen": -97.45775146484375, + "logps/rejected": -156.65115966796876, + "loss": 0.4797633171081543, + "rewards/chosen": 0.1765173554420471, + "rewards/margins": 0.27975412607192995, + "rewards/rejected": -0.10323677062988282, + "step": 2950 + }, + { + "epoch": 1.184, + "grad_norm": 0.5471240282058716, + "kl": 8.275094985961914, + "learning_rate": 2.2677777777777777e-06, + "logits/chosen": 40253052.8, + "logits/rejected": 37156057.6, + "logps/chosen": -157.5787109375, + "logps/rejected": -178.797021484375, + "loss": 0.46055126190185547, + "rewards/chosen": 0.7023271083831787, + "rewards/margins": 0.4890592336654663, + "rewards/rejected": 0.2132678747177124, + "step": 2960 + }, + { + "epoch": 1.188, + "grad_norm": 0.5812104344367981, + "kl": 6.715832710266113, + "learning_rate": 2.2566666666666665e-06, + "logits/chosen": 43975625.6, + "logits/rejected": 41445318.4, + "logps/chosen": -171.347216796875, + "logps/rejected": -180.57864990234376, + "loss": 0.4628589630126953, + "rewards/chosen": 0.22713685035705566, + "rewards/margins": 0.5415925025939942, + "rewards/rejected": -0.3144556522369385, + "step": 2970 + }, + { + "epoch": 1.192, + "grad_norm": 0.8072389364242554, + "kl": 5.419320583343506, + "learning_rate": 2.2455555555555557e-06, + "logits/chosen": 37647660.8, + "logits/rejected": 36517382.4, + "logps/chosen": -149.52344970703126, + "logps/rejected": -164.54786376953126, + "loss": 0.44607295989990237, + "rewards/chosen": 0.300301718711853, + "rewards/margins": 0.5973361253738403, + "rewards/rejected": -0.2970344066619873, + "step": 2980 + }, + { + "epoch": 1.196, + "grad_norm": 0.44364601373672485, + "kl": 6.054505825042725, + "learning_rate": 2.2344444444444446e-06, + "logits/chosen": 22235811.2, + "logits/rejected": 19985859.2, + "logps/chosen": -143.41585693359374, + "logps/rejected": -150.84788818359374, + "loss": 0.41753711700439455, + "rewards/chosen": 0.5826003074645996, + "rewards/margins": 0.9533658504486083, + "rewards/rejected": -0.37076554298400877, + "step": 2990 + }, + { + "epoch": 1.2, + "grad_norm": 0.541634202003479, + "kl": 8.647984504699707, + "learning_rate": 2.2233333333333334e-06, + "logits/chosen": 38845638.4, + "logits/rejected": 38343161.6, + "logps/chosen": -171.0084228515625, + "logps/rejected": -129.40447998046875, + "loss": 0.4108599662780762, + "rewards/chosen": 1.1798659324645997, + "rewards/margins": 0.8690209865570069, + "rewards/rejected": 0.31084494590759276, + "step": 3000 + }, + { + "epoch": 1.2, + "eval_kl": 5.96298360824585, + "eval_logits/chosen": 32690919.424, + "eval_logits/rejected": 32526981.12, + "eval_logps/chosen": -150.91603125, + "eval_logps/rejected": -147.333453125, + "eval_loss": 0.4783514738082886, + "eval_rewards/chosen": 0.3409757690429687, + "eval_rewards/margins": 0.21004846191406248, + "eval_rewards/rejected": 0.13092730712890624, + "eval_runtime": 216.7869, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 3000 + }, + { + "epoch": 1.204, + "grad_norm": 0.735106348991394, + "kl": 4.565227508544922, + "learning_rate": 2.212222222222222e-06, + "logits/chosen": 42755068.8, + "logits/rejected": 41808000.0, + "logps/chosen": -144.0309814453125, + "logps/rejected": -162.88955078125, + "loss": 0.4483033180236816, + "rewards/chosen": 0.3165964841842651, + "rewards/margins": 0.5639133214950561, + "rewards/rejected": -0.24731683731079102, + "step": 3010 + }, + { + "epoch": 1.208, + "grad_norm": 0.4859018623828888, + "kl": 6.566149711608887, + "learning_rate": 2.2011111111111114e-06, + "logits/chosen": 40423686.4, + "logits/rejected": 38088403.2, + "logps/chosen": -152.13575439453126, + "logps/rejected": -164.60054931640624, + "loss": 0.43638858795166013, + "rewards/chosen": 0.7344797134399415, + "rewards/margins": 0.7190215766429902, + "rewards/rejected": 0.015458136796951294, + "step": 3020 + }, + { + "epoch": 1.212, + "grad_norm": 0.5802381634712219, + "kl": 5.501416206359863, + "learning_rate": 2.19e-06, + "logits/chosen": 36018793.6, + "logits/rejected": 34128659.2, + "logps/chosen": -139.50609130859374, + "logps/rejected": -176.41171875, + "loss": 0.48515634536743163, + "rewards/chosen": 0.2475515365600586, + "rewards/margins": 0.1769363284111023, + "rewards/rejected": 0.0706152081489563, + "step": 3030 + }, + { + "epoch": 1.216, + "grad_norm": 0.9714307188987732, + "kl": 7.294144630432129, + "learning_rate": 2.178888888888889e-06, + "logits/chosen": 29871635.2, + "logits/rejected": 28148294.4, + "logps/chosen": -144.24107666015624, + "logps/rejected": -157.43570556640626, + "loss": 0.4299461364746094, + "rewards/chosen": 0.7857762336730957, + "rewards/margins": 0.6812341213226318, + "rewards/rejected": 0.10454211235046387, + "step": 3040 + }, + { + "epoch": 1.22, + "grad_norm": 0.6073058247566223, + "kl": 5.890301704406738, + "learning_rate": 2.1677777777777782e-06, + "logits/chosen": 36303308.8, + "logits/rejected": 36356560.0, + "logps/chosen": -151.838623046875, + "logps/rejected": -171.02061767578124, + "loss": 0.4609498977661133, + "rewards/chosen": 0.38731932640075684, + "rewards/margins": 0.6510258436203002, + "rewards/rejected": -0.26370651721954347, + "step": 3050 + }, + { + "epoch": 1.224, + "grad_norm": 0.5871905088424683, + "kl": 5.802463531494141, + "learning_rate": 2.156666666666667e-06, + "logits/chosen": 28059456.0, + "logits/rejected": 26678612.8, + "logps/chosen": -159.48240966796874, + "logps/rejected": -153.2945068359375, + "loss": 0.43854827880859376, + "rewards/chosen": 0.4585693359375, + "rewards/margins": 0.676022219657898, + "rewards/rejected": -0.21745288372039795, + "step": 3060 + }, + { + "epoch": 1.228, + "grad_norm": 0.4357960522174835, + "kl": 4.12323522567749, + "learning_rate": 2.145555555555556e-06, + "logits/chosen": 34136323.2, + "logits/rejected": 33183904.0, + "logps/chosen": -146.4657958984375, + "logps/rejected": -139.709375, + "loss": 0.43050317764282225, + "rewards/chosen": 0.3769852876663208, + "rewards/margins": 0.9553431749343871, + "rewards/rejected": -0.5783578872680664, + "step": 3070 + }, + { + "epoch": 1.232, + "grad_norm": 0.47064968943595886, + "kl": 7.40407657623291, + "learning_rate": 2.1344444444444447e-06, + "logits/chosen": 42982582.4, + "logits/rejected": 40440524.8, + "logps/chosen": -142.09869384765625, + "logps/rejected": -158.719091796875, + "loss": 0.42528462409973145, + "rewards/chosen": 0.7800788879394531, + "rewards/margins": 0.7373038113117218, + "rewards/rejected": 0.04277507662773132, + "step": 3080 + }, + { + "epoch": 1.236, + "grad_norm": 0.7308046221733093, + "kl": 7.210939884185791, + "learning_rate": 2.1233333333333335e-06, + "logits/chosen": 28548304.0, + "logits/rejected": 28032595.2, + "logps/chosen": -122.3899658203125, + "logps/rejected": -133.5687744140625, + "loss": 0.4277194976806641, + "rewards/chosen": 0.8977908134460449, + "rewards/margins": 0.6939298629760742, + "rewards/rejected": 0.20386095046997071, + "step": 3090 + }, + { + "epoch": 1.24, + "grad_norm": 0.8042078614234924, + "kl": 6.6927170753479, + "learning_rate": 2.1122222222222223e-06, + "logits/chosen": 27459798.4, + "logits/rejected": 29134211.2, + "logps/chosen": -143.5735107421875, + "logps/rejected": -149.86500244140626, + "loss": 0.46234517097473143, + "rewards/chosen": 0.5464875221252441, + "rewards/margins": 0.39129424095153803, + "rewards/rejected": 0.15519328117370607, + "step": 3100 + }, + { + "epoch": 1.244, + "grad_norm": 0.5080249309539795, + "kl": 7.83388614654541, + "learning_rate": 2.101111111111111e-06, + "logits/chosen": 44153635.2, + "logits/rejected": 41055212.8, + "logps/chosen": -187.93551025390624, + "logps/rejected": -171.871630859375, + "loss": 0.4076192378997803, + "rewards/chosen": 0.8624818801879883, + "rewards/margins": 1.0380724906921388, + "rewards/rejected": -0.1755906105041504, + "step": 3110 + }, + { + "epoch": 1.248, + "grad_norm": 0.7141006588935852, + "kl": 6.444394111633301, + "learning_rate": 2.09e-06, + "logits/chosen": 22851744.0, + "logits/rejected": 21574438.4, + "logps/chosen": -135.523486328125, + "logps/rejected": -137.11583251953124, + "loss": 0.4397727966308594, + "rewards/chosen": 0.6564054965972901, + "rewards/margins": 0.65584604293108, + "rewards/rejected": 0.0005594536662101746, + "step": 3120 + }, + { + "epoch": 1.252, + "grad_norm": 0.6408936977386475, + "kl": 8.526594161987305, + "learning_rate": 2.078888888888889e-06, + "logits/chosen": 37600892.8, + "logits/rejected": 38453331.2, + "logps/chosen": -134.79913330078125, + "logps/rejected": -169.9215087890625, + "loss": 0.47039794921875, + "rewards/chosen": 0.816160011291504, + "rewards/margins": 0.25816926956176767, + "rewards/rejected": 0.5579907417297363, + "step": 3130 + }, + { + "epoch": 1.256, + "grad_norm": 0.7661492228507996, + "kl": 7.337412357330322, + "learning_rate": 2.067777777777778e-06, + "logits/chosen": 37269350.4, + "logits/rejected": 36887142.4, + "logps/chosen": -185.44752197265626, + "logps/rejected": -150.3639404296875, + "loss": 0.4498098850250244, + "rewards/chosen": 0.6579087257385254, + "rewards/margins": 0.5174487590789796, + "rewards/rejected": 0.1404599666595459, + "step": 3140 + }, + { + "epoch": 1.26, + "grad_norm": 0.5819596648216248, + "kl": 9.288617134094238, + "learning_rate": 2.0566666666666667e-06, + "logits/chosen": 41466233.6, + "logits/rejected": 39139404.8, + "logps/chosen": -120.82750244140625, + "logps/rejected": -131.20218505859376, + "loss": 0.46291580200195315, + "rewards/chosen": 0.883179759979248, + "rewards/margins": 0.3774709701538086, + "rewards/rejected": 0.5057087898254394, + "step": 3150 + }, + { + "epoch": 1.264, + "grad_norm": 0.46087002754211426, + "kl": 8.114151000976562, + "learning_rate": 2.0455555555555555e-06, + "logits/chosen": 54930982.4, + "logits/rejected": 55207168.0, + "logps/chosen": -161.81624755859374, + "logps/rejected": -131.8754638671875, + "loss": 0.4237666130065918, + "rewards/chosen": 1.0106356620788575, + "rewards/margins": 0.6930557727813721, + "rewards/rejected": 0.3175798892974854, + "step": 3160 + }, + { + "epoch": 1.268, + "grad_norm": 0.6536068320274353, + "kl": 9.215258598327637, + "learning_rate": 2.0344444444444448e-06, + "logits/chosen": 40884854.4, + "logits/rejected": 41148396.8, + "logps/chosen": -174.63795166015626, + "logps/rejected": -183.98502197265626, + "loss": 0.410884428024292, + "rewards/chosen": 1.0293194770812988, + "rewards/margins": 0.9003942966461181, + "rewards/rejected": 0.12892518043518067, + "step": 3170 + }, + { + "epoch": 1.272, + "grad_norm": 0.6242780089378357, + "kl": 5.226318359375, + "learning_rate": 2.0233333333333336e-06, + "logits/chosen": 39783075.2, + "logits/rejected": 38742624.0, + "logps/chosen": -140.7747314453125, + "logps/rejected": -128.35888671875, + "loss": 0.4299330234527588, + "rewards/chosen": 0.5863895893096924, + "rewards/margins": 0.637285441160202, + "rewards/rejected": -0.05089585185050964, + "step": 3180 + }, + { + "epoch": 1.276, + "grad_norm": 0.46700039505958557, + "kl": 7.267230033874512, + "learning_rate": 2.0122222222222224e-06, + "logits/chosen": 41375708.8, + "logits/rejected": 38081993.6, + "logps/chosen": -153.80274658203126, + "logps/rejected": -176.299755859375, + "loss": 0.460453462600708, + "rewards/chosen": 0.6854756832122803, + "rewards/margins": 0.3931422710418701, + "rewards/rejected": 0.29233341217041015, + "step": 3190 + }, + { + "epoch": 1.28, + "grad_norm": 0.6310598850250244, + "kl": 5.743724346160889, + "learning_rate": 2.001111111111111e-06, + "logits/chosen": 34063660.8, + "logits/rejected": 31735603.2, + "logps/chosen": -153.7698974609375, + "logps/rejected": -144.71922607421874, + "loss": 0.46727771759033204, + "rewards/chosen": 0.4247100830078125, + "rewards/margins": 0.3391889691352844, + "rewards/rejected": 0.08552111387252807, + "step": 3200 + }, + { + "epoch": 1.28, + "eval_kl": 6.358611106872559, + "eval_logits/chosen": 35785789.44, + "eval_logits/rejected": 35595558.912, + "eval_logps/chosen": -149.448609375, + "eval_logps/rejected": -146.011734375, + "eval_loss": 0.47684431076049805, + "eval_rewards/chosen": 0.48771685791015623, + "eval_rewards/margins": 0.22461712646484372, + "eval_rewards/rejected": 0.2630997314453125, + "eval_runtime": 216.7977, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 3200 + }, + { + "epoch": 1.284, + "grad_norm": 0.5444361567497253, + "kl": 8.359007835388184, + "learning_rate": 1.9900000000000004e-06, + "logits/chosen": 38480896.0, + "logits/rejected": 37109968.0, + "logps/chosen": -166.1022705078125, + "logps/rejected": -135.0826416015625, + "loss": 0.4136789321899414, + "rewards/chosen": 1.1088358879089355, + "rewards/margins": 0.8896709442138672, + "rewards/rejected": 0.21916494369506836, + "step": 3210 + }, + { + "epoch": 1.288, + "grad_norm": 0.47785794734954834, + "kl": 5.674698829650879, + "learning_rate": 1.9788888888888892e-06, + "logits/chosen": 30067932.8, + "logits/rejected": 30675318.4, + "logps/chosen": -119.010009765625, + "logps/rejected": -125.2324462890625, + "loss": 0.4624322891235352, + "rewards/chosen": 0.3068151712417603, + "rewards/margins": 0.3403205394744873, + "rewards/rejected": -0.033505368232727054, + "step": 3220 + }, + { + "epoch": 1.292, + "grad_norm": 0.5500114560127258, + "kl": 6.791600704193115, + "learning_rate": 1.967777777777778e-06, + "logits/chosen": 34258720.0, + "logits/rejected": 31316112.0, + "logps/chosen": -148.19810791015624, + "logps/rejected": -154.41190185546876, + "loss": 0.449018669128418, + "rewards/chosen": 0.7993580818176269, + "rewards/margins": 0.4857671737670898, + "rewards/rejected": 0.31359090805053713, + "step": 3230 + }, + { + "epoch": 1.296, + "grad_norm": 0.5430236458778381, + "kl": 8.808231353759766, + "learning_rate": 1.956666666666667e-06, + "logits/chosen": 34977206.4, + "logits/rejected": 31934518.4, + "logps/chosen": -150.738330078125, + "logps/rejected": -182.1927490234375, + "loss": 0.45286874771118163, + "rewards/chosen": 0.8764358520507812, + "rewards/margins": 0.5628475189208983, + "rewards/rejected": 0.3135883331298828, + "step": 3240 + }, + { + "epoch": 1.3, + "grad_norm": 0.350079745054245, + "kl": 8.156396865844727, + "learning_rate": 1.9455555555555557e-06, + "logits/chosen": 38458761.6, + "logits/rejected": 36443888.0, + "logps/chosen": -153.3631103515625, + "logps/rejected": -182.10977783203126, + "loss": 0.4610450267791748, + "rewards/chosen": 0.7133102416992188, + "rewards/margins": 0.43579509258270266, + "rewards/rejected": 0.2775151491165161, + "step": 3250 + }, + { + "epoch": 1.304, + "grad_norm": 0.5987509489059448, + "kl": 4.856646537780762, + "learning_rate": 1.9344444444444445e-06, + "logits/chosen": 34302396.8, + "logits/rejected": 35170553.6, + "logps/chosen": -113.03687744140625, + "logps/rejected": -124.1705810546875, + "loss": 0.4647495269775391, + "rewards/chosen": 0.39636247158050536, + "rewards/margins": 0.2917425155639648, + "rewards/rejected": 0.10461995601654053, + "step": 3260 + }, + { + "epoch": 1.308, + "grad_norm": 0.6457140445709229, + "kl": 4.818373680114746, + "learning_rate": 1.9233333333333333e-06, + "logits/chosen": 43591721.6, + "logits/rejected": 44343180.8, + "logps/chosen": -137.19747314453124, + "logps/rejected": -153.1467529296875, + "loss": 0.42650656700134276, + "rewards/chosen": 0.6118185997009278, + "rewards/margins": 0.7147171497344971, + "rewards/rejected": -0.10289855003356933, + "step": 3270 + }, + { + "epoch": 1.312, + "grad_norm": 0.6024192571640015, + "kl": 4.331322193145752, + "learning_rate": 1.912222222222222e-06, + "logits/chosen": 40203308.8, + "logits/rejected": 41383084.8, + "logps/chosen": -156.63048095703124, + "logps/rejected": -144.2884033203125, + "loss": 0.4273221969604492, + "rewards/chosen": 0.6165127277374267, + "rewards/margins": 0.7079921424388885, + "rewards/rejected": -0.09147941470146179, + "step": 3280 + }, + { + "epoch": 1.316, + "grad_norm": 0.6954644918441772, + "kl": 4.1213908195495605, + "learning_rate": 1.9011111111111113e-06, + "logits/chosen": 28768950.4, + "logits/rejected": 24494809.6, + "logps/chosen": -143.09805908203126, + "logps/rejected": -139.8798095703125, + "loss": 0.4376859664916992, + "rewards/chosen": 0.23681788444519042, + "rewards/margins": 0.6746566295623779, + "rewards/rejected": -0.4378387451171875, + "step": 3290 + }, + { + "epoch": 1.32, + "grad_norm": 0.6745891571044922, + "kl": 4.750722408294678, + "learning_rate": 1.8900000000000001e-06, + "logits/chosen": 26675907.2, + "logits/rejected": 26277996.8, + "logps/chosen": -143.6562255859375, + "logps/rejected": -134.79100341796874, + "loss": 0.4433170795440674, + "rewards/chosen": 0.3683876276016235, + "rewards/margins": 0.5583112239837646, + "rewards/rejected": -0.18992359638214112, + "step": 3300 + }, + { + "epoch": 1.324, + "grad_norm": 0.5548813939094543, + "kl": 6.803833961486816, + "learning_rate": 1.878888888888889e-06, + "logits/chosen": 36996547.2, + "logits/rejected": 37710758.4, + "logps/chosen": -151.74556884765624, + "logps/rejected": -154.92069091796876, + "loss": 0.43219637870788574, + "rewards/chosen": 0.8092526435852051, + "rewards/margins": 0.6131466150283813, + "rewards/rejected": 0.19610602855682374, + "step": 3310 + }, + { + "epoch": 1.328, + "grad_norm": 0.5542740821838379, + "kl": 3.4922118186950684, + "learning_rate": 1.8677777777777777e-06, + "logits/chosen": 21493171.2, + "logits/rejected": 22701468.8, + "logps/chosen": -153.733984375, + "logps/rejected": -130.3265380859375, + "loss": 0.45499577522277834, + "rewards/chosen": 0.06440688967704773, + "rewards/margins": 0.4255514085292816, + "rewards/rejected": -0.36114451885223386, + "step": 3320 + }, + { + "epoch": 1.332, + "grad_norm": 0.5660212635993958, + "kl": 5.5110368728637695, + "learning_rate": 1.856666666666667e-06, + "logits/chosen": 25962280.0, + "logits/rejected": 25115568.0, + "logps/chosen": -157.3265625, + "logps/rejected": -127.7706298828125, + "loss": 0.45712642669677733, + "rewards/chosen": 0.25737948417663575, + "rewards/margins": 0.38217872381210327, + "rewards/rejected": -0.12479923963546753, + "step": 3330 + }, + { + "epoch": 1.336, + "grad_norm": 0.7863187193870544, + "kl": 4.013291835784912, + "learning_rate": 1.8455555555555558e-06, + "logits/chosen": 35347702.4, + "logits/rejected": 34501942.4, + "logps/chosen": -148.804345703125, + "logps/rejected": -153.26226806640625, + "loss": 0.4377838134765625, + "rewards/chosen": 0.25958924293518065, + "rewards/margins": 0.4975349426269531, + "rewards/rejected": -0.23794569969177246, + "step": 3340 + }, + { + "epoch": 1.34, + "grad_norm": 0.8637145757675171, + "kl": 6.854001522064209, + "learning_rate": 1.8344444444444446e-06, + "logits/chosen": 37165539.2, + "logits/rejected": 35276435.2, + "logps/chosen": -165.85059814453126, + "logps/rejected": -189.76441650390626, + "loss": 0.46552677154541017, + "rewards/chosen": 0.28020381927490234, + "rewards/margins": 0.23775566816329957, + "rewards/rejected": 0.042448151111602786, + "step": 3350 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.7173047661781311, + "kl": 6.264387130737305, + "learning_rate": 1.8233333333333334e-06, + "logits/chosen": 29046736.0, + "logits/rejected": 27349148.8, + "logps/chosen": -160.81956787109374, + "logps/rejected": -144.21968994140624, + "loss": 0.4449786186218262, + "rewards/chosen": 0.4654555320739746, + "rewards/margins": 0.5440494477748871, + "rewards/rejected": -0.07859391570091248, + "step": 3360 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 0.6456140279769897, + "kl": 3.3577468395233154, + "learning_rate": 1.8122222222222224e-06, + "logits/chosen": 32191424.0, + "logits/rejected": 29473795.2, + "logps/chosen": -138.01751708984375, + "logps/rejected": -148.85108642578126, + "loss": 0.4563105583190918, + "rewards/chosen": -0.22999329566955568, + "rewards/margins": 0.3565816402435303, + "rewards/rejected": -0.586574935913086, + "step": 3370 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.5196599960327148, + "kl": 4.913334369659424, + "learning_rate": 1.8011111111111112e-06, + "logits/chosen": 29135721.6, + "logits/rejected": 27177129.6, + "logps/chosen": -161.2333984375, + "logps/rejected": -136.623095703125, + "loss": 0.42957291603088377, + "rewards/chosen": 0.3112953186035156, + "rewards/margins": 0.6790343999862671, + "rewards/rejected": -0.36773908138275146, + "step": 3380 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 0.5491528511047363, + "kl": 2.574375867843628, + "learning_rate": 1.79e-06, + "logits/chosen": 39787881.6, + "logits/rejected": 37482313.6, + "logps/chosen": -132.85360107421874, + "logps/rejected": -138.88128662109375, + "loss": 0.4316267490386963, + "rewards/chosen": -0.03256496787071228, + "rewards/margins": 0.6733869135379791, + "rewards/rejected": -0.7059518814086914, + "step": 3390 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.5999844670295715, + "kl": 5.321146011352539, + "learning_rate": 1.7788888888888892e-06, + "logits/chosen": 34423936.0, + "logits/rejected": 35799561.6, + "logps/chosen": -169.89874267578125, + "logps/rejected": -162.8022705078125, + "loss": 0.4550165176391602, + "rewards/chosen": 0.13871285915374756, + "rewards/margins": 0.4647700548171997, + "rewards/rejected": -0.32605719566345215, + "step": 3400 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 3.4790494441986084, + "eval_logits/chosen": 29217603.584, + "eval_logits/rejected": 29389656.064, + "eval_logps/chosen": -156.61384375, + "eval_logps/rejected": -153.2220625, + "eval_loss": 0.47793951630592346, + "eval_rewards/chosen": -0.22880656433105467, + "eval_rewards/margins": 0.2291276092529297, + "eval_rewards/rejected": -0.45793417358398436, + "eval_runtime": 216.674, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 2.308, + "step": 3400 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 0.5353158712387085, + "kl": 3.89123272895813, + "learning_rate": 1.767777777777778e-06, + "logits/chosen": 30302416.0, + "logits/rejected": 30243564.8, + "logps/chosen": -163.517431640625, + "logps/rejected": -174.63206787109374, + "loss": 0.4478933334350586, + "rewards/chosen": 0.02733871340751648, + "rewards/margins": 0.6695918262004852, + "rewards/rejected": -0.6422531127929687, + "step": 3410 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.5308664441108704, + "kl": 3.7901718616485596, + "learning_rate": 1.7566666666666669e-06, + "logits/chosen": 30754256.0, + "logits/rejected": 27248163.2, + "logps/chosen": -130.69642333984376, + "logps/rejected": -149.67838134765626, + "loss": 0.454122257232666, + "rewards/chosen": 0.1454862356185913, + "rewards/margins": 0.4878753900527954, + "rewards/rejected": -0.3423891544342041, + "step": 3420 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 0.7273574471473694, + "kl": 4.090758323669434, + "learning_rate": 1.7455555555555557e-06, + "logits/chosen": 28009366.4, + "logits/rejected": 28281456.0, + "logps/chosen": -150.5, + "logps/rejected": -151.46649169921875, + "loss": 0.43383193016052246, + "rewards/chosen": 0.24967949390411376, + "rewards/margins": 0.7830450773239135, + "rewards/rejected": -0.5333655834197998, + "step": 3430 + }, + { + "epoch": 1.376, + "grad_norm": 0.3887825906276703, + "kl": 5.064081192016602, + "learning_rate": 1.7344444444444447e-06, + "logits/chosen": 27018854.4, + "logits/rejected": 24356123.2, + "logps/chosen": -130.98511962890626, + "logps/rejected": -151.60482177734374, + "loss": 0.43924894332885744, + "rewards/chosen": 0.3166049957275391, + "rewards/margins": 0.6815865993499757, + "rewards/rejected": -0.36498160362243653, + "step": 3440 + }, + { + "epoch": 1.38, + "grad_norm": 0.4775777757167816, + "kl": 7.2533087730407715, + "learning_rate": 1.7233333333333335e-06, + "logits/chosen": 38613673.6, + "logits/rejected": 41955670.4, + "logps/chosen": -176.4742919921875, + "logps/rejected": -157.8961181640625, + "loss": 0.4113172054290771, + "rewards/chosen": 0.7336381912231446, + "rewards/margins": 0.7889788269996644, + "rewards/rejected": -0.05534063577651978, + "step": 3450 + }, + { + "epoch": 1.384, + "grad_norm": 0.7163631916046143, + "kl": 5.157084941864014, + "learning_rate": 1.7122222222222223e-06, + "logits/chosen": 35435958.4, + "logits/rejected": 36708380.8, + "logps/chosen": -135.27044677734375, + "logps/rejected": -149.26287841796875, + "loss": 0.49365973472595215, + "rewards/chosen": 0.14373122453689574, + "rewards/margins": 0.07738589048385619, + "rewards/rejected": 0.06634533405303955, + "step": 3460 + }, + { + "epoch": 1.388, + "grad_norm": 0.6470320224761963, + "kl": 4.387451648712158, + "learning_rate": 1.7011111111111111e-06, + "logits/chosen": 29047792.0, + "logits/rejected": 28239174.4, + "logps/chosen": -131.78316650390624, + "logps/rejected": -120.0147216796875, + "loss": 0.4465984344482422, + "rewards/chosen": 0.3296776294708252, + "rewards/margins": 0.6029248237609863, + "rewards/rejected": -0.27324719429016114, + "step": 3470 + }, + { + "epoch": 1.392, + "grad_norm": 0.6309516429901123, + "kl": 7.3288164138793945, + "learning_rate": 1.6900000000000003e-06, + "logits/chosen": 36852099.2, + "logits/rejected": 35572294.4, + "logps/chosen": -133.08834228515624, + "logps/rejected": -156.20654296875, + "loss": 0.44939703941345216, + "rewards/chosen": 0.7472519397735595, + "rewards/margins": 0.5482589960098266, + "rewards/rejected": 0.19899294376373292, + "step": 3480 + }, + { + "epoch": 1.396, + "grad_norm": 0.7260765433311462, + "kl": 3.237916946411133, + "learning_rate": 1.6788888888888891e-06, + "logits/chosen": 30554035.2, + "logits/rejected": 28196780.8, + "logps/chosen": -123.18148193359374, + "logps/rejected": -137.39403076171874, + "loss": 0.43561625480651855, + "rewards/chosen": 0.26550750732421874, + "rewards/margins": 0.6841002464294433, + "rewards/rejected": -0.4185927391052246, + "step": 3490 + }, + { + "epoch": 1.4, + "grad_norm": 0.4278745949268341, + "kl": 5.290976047515869, + "learning_rate": 1.667777777777778e-06, + "logits/chosen": 29699056.0, + "logits/rejected": 30936454.4, + "logps/chosen": -153.85557861328124, + "logps/rejected": -122.64144287109374, + "loss": 0.4286449909210205, + "rewards/chosen": 0.49454541206359864, + "rewards/margins": 0.6557791233062744, + "rewards/rejected": -0.16123371124267577, + "step": 3500 + }, + { + "epoch": 1.404, + "grad_norm": 0.7629099488258362, + "kl": 6.994016170501709, + "learning_rate": 1.6566666666666668e-06, + "logits/chosen": 26804969.6, + "logits/rejected": 27539884.8, + "logps/chosen": -131.692041015625, + "logps/rejected": -185.0650146484375, + "loss": 0.4522398948669434, + "rewards/chosen": 0.5013983726501465, + "rewards/margins": 0.40837590694427495, + "rewards/rejected": 0.09302246570587158, + "step": 3510 + }, + { + "epoch": 1.408, + "grad_norm": 0.67551189661026, + "kl": 4.912568092346191, + "learning_rate": 1.6455555555555558e-06, + "logits/chosen": 27143923.2, + "logits/rejected": 26963395.2, + "logps/chosen": -149.304931640625, + "logps/rejected": -139.75704345703124, + "loss": 0.44557414054870603, + "rewards/chosen": 0.3085124731063843, + "rewards/margins": 0.6712681531906128, + "rewards/rejected": -0.36275568008422854, + "step": 3520 + }, + { + "epoch": 1.412, + "grad_norm": 0.7234175801277161, + "kl": 5.856083869934082, + "learning_rate": 1.6344444444444446e-06, + "logits/chosen": 33198009.6, + "logits/rejected": 32032864.0, + "logps/chosen": -150.87943115234376, + "logps/rejected": -153.37586669921876, + "loss": 0.4352092742919922, + "rewards/chosen": 0.3994121074676514, + "rewards/margins": 0.5765307188034058, + "rewards/rejected": -0.1771186113357544, + "step": 3530 + }, + { + "epoch": 1.416, + "grad_norm": 0.7202039361000061, + "kl": 5.610236167907715, + "learning_rate": 1.6233333333333334e-06, + "logits/chosen": 25017616.0, + "logits/rejected": 26410630.4, + "logps/chosen": -187.0184814453125, + "logps/rejected": -115.09561767578126, + "loss": 0.4591636657714844, + "rewards/chosen": 0.3089368104934692, + "rewards/margins": 0.34654129743576045, + "rewards/rejected": -0.03760448694229126, + "step": 3540 + }, + { + "epoch": 1.42, + "grad_norm": 0.7653972506523132, + "kl": 4.185455322265625, + "learning_rate": 1.6122222222222222e-06, + "logits/chosen": 27766281.6, + "logits/rejected": 24358944.0, + "logps/chosen": -125.4722412109375, + "logps/rejected": -162.280615234375, + "loss": 0.4453754901885986, + "rewards/chosen": 0.24888882637023926, + "rewards/margins": 0.640946888923645, + "rewards/rejected": -0.39205806255340575, + "step": 3550 + }, + { + "epoch": 1.424, + "grad_norm": 0.5244606137275696, + "kl": 5.084301948547363, + "learning_rate": 1.6011111111111114e-06, + "logits/chosen": 38255152.0, + "logits/rejected": 35147708.8, + "logps/chosen": -169.7523681640625, + "logps/rejected": -189.486328125, + "loss": 0.4723203659057617, + "rewards/chosen": 0.08335857987403869, + "rewards/margins": 0.32414146065711974, + "rewards/rejected": -0.24078288078308105, + "step": 3560 + }, + { + "epoch": 1.428, + "grad_norm": 0.7249192595481873, + "kl": 6.4514336585998535, + "learning_rate": 1.5900000000000002e-06, + "logits/chosen": 31364140.8, + "logits/rejected": 32550483.2, + "logps/chosen": -136.05145263671875, + "logps/rejected": -165.39798583984376, + "loss": 0.43005828857421874, + "rewards/chosen": 0.6245347499847412, + "rewards/margins": 0.6684352219104767, + "rewards/rejected": -0.04390047192573547, + "step": 3570 + }, + { + "epoch": 1.432, + "grad_norm": 0.5767175555229187, + "kl": 3.358916759490967, + "learning_rate": 1.578888888888889e-06, + "logits/chosen": 34861779.2, + "logits/rejected": 36117113.6, + "logps/chosen": -153.4178466796875, + "logps/rejected": -138.995556640625, + "loss": 0.46177167892456056, + "rewards/chosen": 0.06014393568038941, + "rewards/margins": 0.33000377416610716, + "rewards/rejected": -0.26985983848571776, + "step": 3580 + }, + { + "epoch": 1.436, + "grad_norm": 0.8270474076271057, + "kl": 3.5596280097961426, + "learning_rate": 1.5677777777777778e-06, + "logits/chosen": 24936995.2, + "logits/rejected": 25041536.0, + "logps/chosen": -142.94605712890626, + "logps/rejected": -115.97581787109375, + "loss": 0.46255645751953123, + "rewards/chosen": 0.09056978225708008, + "rewards/margins": 0.3586315393447876, + "rewards/rejected": -0.2680617570877075, + "step": 3590 + }, + { + "epoch": 1.44, + "grad_norm": 0.712232768535614, + "kl": 3.8768982887268066, + "learning_rate": 1.5566666666666669e-06, + "logits/chosen": 31608688.0, + "logits/rejected": 29185264.0, + "logps/chosen": -135.2075439453125, + "logps/rejected": -174.82825927734376, + "loss": 0.45167975425720214, + "rewards/chosen": 0.0786507248878479, + "rewards/margins": 0.565507709980011, + "rewards/rejected": -0.4868569850921631, + "step": 3600 + }, + { + "epoch": 1.44, + "eval_kl": 4.0966901779174805, + "eval_logits/chosen": 30547773.44, + "eval_logits/rejected": 30678024.192, + "eval_logps/chosen": -154.22709375, + "eval_logps/rejected": -150.8435625, + "eval_loss": 0.47759392857551575, + "eval_rewards/chosen": 0.009867694854736328, + "eval_rewards/margins": 0.22995056533813477, + "eval_rewards/rejected": -0.22008287048339845, + "eval_runtime": 216.6033, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 2.308, + "step": 3600 + }, + { + "epoch": 1.444, + "grad_norm": 0.6500243544578552, + "kl": 4.5812273025512695, + "learning_rate": 1.5455555555555557e-06, + "logits/chosen": 35394022.4, + "logits/rejected": 35018995.2, + "logps/chosen": -116.204833984375, + "logps/rejected": -184.157275390625, + "loss": 0.4943058490753174, + "rewards/chosen": -0.007693278789520264, + "rewards/margins": 0.12241411209106444, + "rewards/rejected": -0.1301073908805847, + "step": 3610 + }, + { + "epoch": 1.448, + "grad_norm": 0.6954056024551392, + "kl": 5.1914801597595215, + "learning_rate": 1.5344444444444445e-06, + "logits/chosen": 44866396.8, + "logits/rejected": 43301910.4, + "logps/chosen": -144.904541015625, + "logps/rejected": -164.3775390625, + "loss": 0.41394357681274413, + "rewards/chosen": 0.45650997161865237, + "rewards/margins": 0.8667933464050293, + "rewards/rejected": -0.41028337478637694, + "step": 3620 + }, + { + "epoch": 1.452, + "grad_norm": 0.6982813477516174, + "kl": 4.627970218658447, + "learning_rate": 1.5233333333333333e-06, + "logits/chosen": 33511337.6, + "logits/rejected": 33426937.6, + "logps/chosen": -176.5933837890625, + "logps/rejected": -148.85958251953124, + "loss": 0.43149843215942385, + "rewards/chosen": 0.27549741268157957, + "rewards/margins": 0.6807630777359008, + "rewards/rejected": -0.40526566505432127, + "step": 3630 + }, + { + "epoch": 1.456, + "grad_norm": 0.472672700881958, + "kl": 4.886686325073242, + "learning_rate": 1.5122222222222225e-06, + "logits/chosen": 21423132.8, + "logits/rejected": 20745464.0, + "logps/chosen": -100.18471069335938, + "logps/rejected": -142.314697265625, + "loss": 0.45476489067077636, + "rewards/chosen": 0.453489875793457, + "rewards/margins": 0.5046129763126374, + "rewards/rejected": -0.0511231005191803, + "step": 3640 + }, + { + "epoch": 1.46, + "grad_norm": 0.6913832426071167, + "kl": 4.06036901473999, + "learning_rate": 1.5011111111111113e-06, + "logits/chosen": 41536867.2, + "logits/rejected": 40642899.2, + "logps/chosen": -215.0681396484375, + "logps/rejected": -168.06806640625, + "loss": 0.4454173564910889, + "rewards/chosen": -0.10317556858062744, + "rewards/margins": 0.5138320684432983, + "rewards/rejected": -0.6170076370239258, + "step": 3650 + }, + { + "epoch": 1.464, + "grad_norm": 0.5362917184829712, + "kl": 3.2193520069122314, + "learning_rate": 1.4900000000000001e-06, + "logits/chosen": 25776776.0, + "logits/rejected": 22911784.0, + "logps/chosen": -130.6083984375, + "logps/rejected": -168.973876953125, + "loss": 0.41051359176635743, + "rewards/chosen": 0.19750649929046632, + "rewards/margins": 0.9899675607681275, + "rewards/rejected": -0.7924610614776612, + "step": 3660 + }, + { + "epoch": 1.468, + "grad_norm": 0.9869509935379028, + "kl": 4.485353946685791, + "learning_rate": 1.478888888888889e-06, + "logits/chosen": 27324668.8, + "logits/rejected": 27301977.6, + "logps/chosen": -132.47244873046876, + "logps/rejected": -162.38021240234374, + "loss": 0.45110092163085935, + "rewards/chosen": -0.08881351947784424, + "rewards/margins": 0.4330620527267456, + "rewards/rejected": -0.5218755722045898, + "step": 3670 + }, + { + "epoch": 1.472, + "grad_norm": 0.8374236822128296, + "kl": 5.5675249099731445, + "learning_rate": 1.467777777777778e-06, + "logits/chosen": 30133260.8, + "logits/rejected": 27502342.4, + "logps/chosen": -132.419970703125, + "logps/rejected": -164.09249267578124, + "loss": 0.4621445655822754, + "rewards/chosen": 0.29708335399627683, + "rewards/margins": 0.5296570777893066, + "rewards/rejected": -0.2325737237930298, + "step": 3680 + }, + { + "epoch": 1.476, + "grad_norm": 0.5651530623435974, + "kl": 3.6750998497009277, + "learning_rate": 1.4566666666666668e-06, + "logits/chosen": 28068777.6, + "logits/rejected": 24100668.8, + "logps/chosen": -176.51123046875, + "logps/rejected": -184.35858154296875, + "loss": 0.4131460666656494, + "rewards/chosen": 0.15955194234848022, + "rewards/margins": 1.0650185942649841, + "rewards/rejected": -0.9054666519165039, + "step": 3690 + }, + { + "epoch": 1.48, + "grad_norm": 0.6101991534233093, + "kl": 3.6094698905944824, + "learning_rate": 1.4455555555555556e-06, + "logits/chosen": 28100012.8, + "logits/rejected": 26111475.2, + "logps/chosen": -138.737109375, + "logps/rejected": -156.04112548828124, + "loss": 0.4490304470062256, + "rewards/chosen": 0.15077462196350097, + "rewards/margins": 0.5411154270172119, + "rewards/rejected": -0.39034080505371094, + "step": 3700 + }, + { + "epoch": 1.484, + "grad_norm": 0.8218708038330078, + "kl": 3.022378444671631, + "learning_rate": 1.4344444444444446e-06, + "logits/chosen": 18033281.6, + "logits/rejected": 19812489.6, + "logps/chosen": -134.19481201171874, + "logps/rejected": -143.6683837890625, + "loss": 0.4570739269256592, + "rewards/chosen": -0.3303727626800537, + "rewards/margins": 0.5034278392791749, + "rewards/rejected": -0.8338006019592286, + "step": 3710 + }, + { + "epoch": 1.488, + "grad_norm": 0.6318350434303284, + "kl": 3.771150588989258, + "learning_rate": 1.4233333333333336e-06, + "logits/chosen": 26958435.2, + "logits/rejected": 23352366.4, + "logps/chosen": -178.9782470703125, + "logps/rejected": -194.87041015625, + "loss": 0.4509871482849121, + "rewards/chosen": -0.14689927101135253, + "rewards/margins": 0.5733826160430908, + "rewards/rejected": -0.7202818870544434, + "step": 3720 + }, + { + "epoch": 1.492, + "grad_norm": 0.6234843730926514, + "kl": 5.08270263671875, + "learning_rate": 1.4122222222222224e-06, + "logits/chosen": 24577870.4, + "logits/rejected": 24378513.6, + "logps/chosen": -144.27496337890625, + "logps/rejected": -157.0753662109375, + "loss": 0.439809513092041, + "rewards/chosen": 0.2378466844558716, + "rewards/margins": 0.495815110206604, + "rewards/rejected": -0.2579684257507324, + "step": 3730 + }, + { + "epoch": 1.496, + "grad_norm": 0.6093852519989014, + "kl": 3.4414265155792236, + "learning_rate": 1.4011111111111112e-06, + "logits/chosen": 25166454.4, + "logits/rejected": 25530366.4, + "logps/chosen": -121.027001953125, + "logps/rejected": -122.510009765625, + "loss": 0.45766735076904297, + "rewards/chosen": 0.05086352825164795, + "rewards/margins": 0.45518562793731693, + "rewards/rejected": -0.40432209968566896, + "step": 3740 + }, + { + "epoch": 1.5, + "grad_norm": 0.6537352204322815, + "kl": 3.7078990936279297, + "learning_rate": 1.3900000000000002e-06, + "logits/chosen": 19157790.4, + "logits/rejected": 19428764.8, + "logps/chosen": -108.5373046875, + "logps/rejected": -131.371728515625, + "loss": 0.4350168228149414, + "rewards/chosen": 0.38070919513702395, + "rewards/margins": 0.6320278406143189, + "rewards/rejected": -0.25131864547729493, + "step": 3750 + }, + { + "epoch": 1.504, + "grad_norm": 0.7157226204872131, + "kl": 3.980473041534424, + "learning_rate": 1.378888888888889e-06, + "logits/chosen": 24798140.8, + "logits/rejected": 23861692.8, + "logps/chosen": -141.9797607421875, + "logps/rejected": -169.5621337890625, + "loss": 0.39097282886505125, + "rewards/chosen": 0.2861147403717041, + "rewards/margins": 1.1024574756622314, + "rewards/rejected": -0.8163427352905274, + "step": 3760 + }, + { + "epoch": 1.508, + "grad_norm": 0.5401111245155334, + "kl": 3.915759563446045, + "learning_rate": 1.3677777777777779e-06, + "logits/chosen": 27341155.2, + "logits/rejected": 23659843.2, + "logps/chosen": -167.150537109375, + "logps/rejected": -156.52796630859376, + "loss": 0.4424854278564453, + "rewards/chosen": 0.20697882175445556, + "rewards/margins": 0.6797100305557251, + "rewards/rejected": -0.4727312088012695, + "step": 3770 + }, + { + "epoch": 1.512, + "grad_norm": 0.5865006446838379, + "kl": 3.9945666790008545, + "learning_rate": 1.3566666666666667e-06, + "logits/chosen": 43287993.6, + "logits/rejected": 42994304.0, + "logps/chosen": -151.379150390625, + "logps/rejected": -170.74891357421876, + "loss": 0.46252665519714353, + "rewards/chosen": 0.28137707710266113, + "rewards/margins": 0.3768645763397217, + "rewards/rejected": -0.09548749923706054, + "step": 3780 + }, + { + "epoch": 1.516, + "grad_norm": 0.7108325362205505, + "kl": 5.5239386558532715, + "learning_rate": 1.3455555555555557e-06, + "logits/chosen": 25728556.8, + "logits/rejected": 25374808.0, + "logps/chosen": -126.28800048828126, + "logps/rejected": -141.21763916015624, + "loss": 0.4297455310821533, + "rewards/chosen": 0.6269711494445801, + "rewards/margins": 0.7478980660438538, + "rewards/rejected": -0.12092691659927368, + "step": 3790 + }, + { + "epoch": 1.52, + "grad_norm": 0.5811319947242737, + "kl": 5.248955726623535, + "learning_rate": 1.3344444444444447e-06, + "logits/chosen": 33503753.6, + "logits/rejected": 31553878.4, + "logps/chosen": -178.1581298828125, + "logps/rejected": -143.417431640625, + "loss": 0.45870108604431153, + "rewards/chosen": 0.4684587001800537, + "rewards/margins": 0.4509533554315567, + "rewards/rejected": 0.01750534474849701, + "step": 3800 + }, + { + "epoch": 1.52, + "eval_kl": 4.183420181274414, + "eval_logits/chosen": 30411423.744, + "eval_logits/rejected": 30603616.256, + "eval_logps/chosen": -154.112203125, + "eval_logps/rejected": -150.76103125, + "eval_loss": 0.4779178202152252, + "eval_rewards/chosen": 0.021359254837036133, + "eval_rewards/margins": 0.23318927192687988, + "eval_rewards/rejected": -0.21183001708984375, + "eval_runtime": 217.1598, + "eval_samples_per_second": 4.605, + "eval_steps_per_second": 2.302, + "step": 3800 + }, + { + "epoch": 1.524, + "grad_norm": 0.7062050700187683, + "kl": 4.107216835021973, + "learning_rate": 1.3233333333333335e-06, + "logits/chosen": 26509099.2, + "logits/rejected": 24838300.8, + "logps/chosen": -137.1124267578125, + "logps/rejected": -149.33499755859376, + "loss": 0.44512219429016114, + "rewards/chosen": 0.20196728706359862, + "rewards/margins": 0.5584580659866333, + "rewards/rejected": -0.35649077892303466, + "step": 3810 + }, + { + "epoch": 1.528, + "grad_norm": 0.43405744433403015, + "kl": 6.5067572593688965, + "learning_rate": 1.3122222222222223e-06, + "logits/chosen": 34846220.8, + "logits/rejected": 33152172.8, + "logps/chosen": -144.03416748046874, + "logps/rejected": -156.00250244140625, + "loss": 0.3955928564071655, + "rewards/chosen": 0.8109316825866699, + "rewards/margins": 1.0338047742843628, + "rewards/rejected": -0.22287309169769287, + "step": 3820 + }, + { + "epoch": 1.532, + "grad_norm": 0.48609739542007446, + "kl": 4.6954779624938965, + "learning_rate": 1.3011111111111113e-06, + "logits/chosen": 24615228.8, + "logits/rejected": 25253913.6, + "logps/chosen": -152.94261474609374, + "logps/rejected": -162.1392578125, + "loss": 0.44533653259277345, + "rewards/chosen": -0.09745782017707824, + "rewards/margins": 0.5474663436412811, + "rewards/rejected": -0.6449241638183594, + "step": 3830 + }, + { + "epoch": 1.536, + "grad_norm": 0.8033897280693054, + "kl": 4.541080951690674, + "learning_rate": 1.2900000000000001e-06, + "logits/chosen": 23824740.8, + "logits/rejected": 25749264.0, + "logps/chosen": -148.12535400390624, + "logps/rejected": -132.90029296875, + "loss": 0.4263105869293213, + "rewards/chosen": 0.34121017456054686, + "rewards/margins": 0.7844597339630126, + "rewards/rejected": -0.4432495594024658, + "step": 3840 + }, + { + "epoch": 1.54, + "grad_norm": 0.5979334115982056, + "kl": 3.1920642852783203, + "learning_rate": 1.278888888888889e-06, + "logits/chosen": 25940036.8, + "logits/rejected": 25332123.2, + "logps/chosen": -125.19688720703125, + "logps/rejected": -124.583349609375, + "loss": 0.42458858489990237, + "rewards/chosen": 0.11553690433502198, + "rewards/margins": 0.7247669935226441, + "rewards/rejected": -0.6092300891876221, + "step": 3850 + }, + { + "epoch": 1.544, + "grad_norm": 0.6010407209396362, + "kl": 7.0015411376953125, + "learning_rate": 1.2677777777777778e-06, + "logits/chosen": 28349667.2, + "logits/rejected": 29532544.0, + "logps/chosen": -160.0486083984375, + "logps/rejected": -148.159619140625, + "loss": 0.433948278427124, + "rewards/chosen": 0.6110920429229736, + "rewards/margins": 0.6498981416225433, + "rewards/rejected": -0.038806098699569705, + "step": 3860 + }, + { + "epoch": 1.548, + "grad_norm": 0.5501318573951721, + "kl": 5.648660659790039, + "learning_rate": 1.2566666666666668e-06, + "logits/chosen": 26864864.0, + "logits/rejected": 25404660.8, + "logps/chosen": -173.2337890625, + "logps/rejected": -160.03525390625, + "loss": 0.43099102973937986, + "rewards/chosen": 0.3765087604522705, + "rewards/margins": 0.7873351097106933, + "rewards/rejected": -0.4108263492584229, + "step": 3870 + }, + { + "epoch": 1.552, + "grad_norm": 0.5478479862213135, + "kl": 3.376481294631958, + "learning_rate": 1.2455555555555556e-06, + "logits/chosen": 32903283.2, + "logits/rejected": 32423433.6, + "logps/chosen": -146.17325439453126, + "logps/rejected": -139.81845703125, + "loss": 0.4483139991760254, + "rewards/chosen": 0.12561094760894775, + "rewards/margins": 0.3918390512466431, + "rewards/rejected": -0.26622810363769533, + "step": 3880 + }, + { + "epoch": 1.556, + "grad_norm": 0.6676125526428223, + "kl": 4.547513484954834, + "learning_rate": 1.2344444444444446e-06, + "logits/chosen": 37385532.8, + "logits/rejected": 38763891.2, + "logps/chosen": -155.23748779296875, + "logps/rejected": -168.7784912109375, + "loss": 0.4901449203491211, + "rewards/chosen": 0.16143620014190674, + "rewards/margins": 0.11462950706481934, + "rewards/rejected": 0.0468066930770874, + "step": 3890 + }, + { + "epoch": 1.56, + "grad_norm": 0.7040978670120239, + "kl": 5.753296852111816, + "learning_rate": 1.2233333333333334e-06, + "logits/chosen": 34341609.6, + "logits/rejected": 33446153.6, + "logps/chosen": -145.878369140625, + "logps/rejected": -180.95491943359374, + "loss": 0.4818913459777832, + "rewards/chosen": 0.37144837379455564, + "rewards/margins": 0.2124497532844543, + "rewards/rejected": 0.15899862051010133, + "step": 3900 + }, + { + "epoch": 1.564, + "grad_norm": 0.48866015672683716, + "kl": 5.357041358947754, + "learning_rate": 1.2122222222222222e-06, + "logits/chosen": 27682144.0, + "logits/rejected": 28275977.6, + "logps/chosen": -150.2932861328125, + "logps/rejected": -144.590478515625, + "loss": 0.4552725315093994, + "rewards/chosen": 0.34428427219390867, + "rewards/margins": 0.4334153890609741, + "rewards/rejected": -0.08913111686706543, + "step": 3910 + }, + { + "epoch": 1.568, + "grad_norm": 0.536604642868042, + "kl": 4.181241512298584, + "learning_rate": 1.2011111111111112e-06, + "logits/chosen": 33525574.4, + "logits/rejected": 32405542.4, + "logps/chosen": -137.73907470703125, + "logps/rejected": -178.59801025390624, + "loss": 0.4515504837036133, + "rewards/chosen": 0.20468955039978026, + "rewards/margins": 0.46134023666381835, + "rewards/rejected": -0.2566506862640381, + "step": 3920 + }, + { + "epoch": 1.572, + "grad_norm": 0.7674170136451721, + "kl": 4.251042366027832, + "learning_rate": 1.19e-06, + "logits/chosen": 40275475.2, + "logits/rejected": 36563088.0, + "logps/chosen": -187.68133544921875, + "logps/rejected": -170.8183349609375, + "loss": 0.406461238861084, + "rewards/chosen": 0.5473237037658691, + "rewards/margins": 1.11146821975708, + "rewards/rejected": -0.564144515991211, + "step": 3930 + }, + { + "epoch": 1.576, + "grad_norm": 0.483766108751297, + "kl": 4.224446773529053, + "learning_rate": 1.178888888888889e-06, + "logits/chosen": 25433030.4, + "logits/rejected": 25341385.6, + "logps/chosen": -117.7307861328125, + "logps/rejected": -139.6504150390625, + "loss": 0.42197356224060056, + "rewards/chosen": 0.24890828132629395, + "rewards/margins": 0.7226192951202393, + "rewards/rejected": -0.4737110137939453, + "step": 3940 + }, + { + "epoch": 1.58, + "grad_norm": 0.5180490016937256, + "kl": 3.8868117332458496, + "learning_rate": 1.1677777777777779e-06, + "logits/chosen": 33638163.2, + "logits/rejected": 34016854.4, + "logps/chosen": -129.69632568359376, + "logps/rejected": -143.5869384765625, + "loss": 0.43038105964660645, + "rewards/chosen": 0.3097927808761597, + "rewards/margins": 0.6986382722854614, + "rewards/rejected": -0.38884549140930175, + "step": 3950 + }, + { + "epoch": 1.584, + "grad_norm": 0.5631889700889587, + "kl": 3.5186545848846436, + "learning_rate": 1.1566666666666667e-06, + "logits/chosen": 30367190.4, + "logits/rejected": 29603392.0, + "logps/chosen": -153.10296630859375, + "logps/rejected": -163.72001953125, + "loss": 0.4525346279144287, + "rewards/chosen": -0.12654991149902345, + "rewards/margins": 0.4131012439727783, + "rewards/rejected": -0.5396511554718018, + "step": 3960 + }, + { + "epoch": 1.588, + "grad_norm": 0.5241718292236328, + "kl": 4.312124729156494, + "learning_rate": 1.1455555555555557e-06, + "logits/chosen": 24180620.8, + "logits/rejected": 26728572.8, + "logps/chosen": -140.080517578125, + "logps/rejected": -115.592919921875, + "loss": 0.4370166301727295, + "rewards/chosen": 0.2027698278427124, + "rewards/margins": 0.536671781539917, + "rewards/rejected": -0.3339019536972046, + "step": 3970 + }, + { + "epoch": 1.592, + "grad_norm": 0.5848884582519531, + "kl": 4.956355571746826, + "learning_rate": 1.1344444444444445e-06, + "logits/chosen": 29427676.8, + "logits/rejected": 25774260.8, + "logps/chosen": -147.395361328125, + "logps/rejected": -165.61865234375, + "loss": 0.4145470142364502, + "rewards/chosen": 0.4822521686553955, + "rewards/margins": 0.6998722791671753, + "rewards/rejected": -0.2176201105117798, + "step": 3980 + }, + { + "epoch": 1.596, + "grad_norm": 0.7140029668807983, + "kl": 5.854944705963135, + "learning_rate": 1.1233333333333333e-06, + "logits/chosen": 39306499.2, + "logits/rejected": 34811254.4, + "logps/chosen": -137.74361572265624, + "logps/rejected": -148.427978515625, + "loss": 0.41861691474914553, + "rewards/chosen": 0.6944310188293457, + "rewards/margins": 1.1321285247802733, + "rewards/rejected": -0.4376975059509277, + "step": 3990 + }, + { + "epoch": 1.6, + "grad_norm": 0.7309412360191345, + "kl": 3.7196297645568848, + "learning_rate": 1.1122222222222223e-06, + "logits/chosen": 29282588.8, + "logits/rejected": 28362857.6, + "logps/chosen": -168.0974365234375, + "logps/rejected": -160.11337890625, + "loss": 0.4827260971069336, + "rewards/chosen": -0.2844557285308838, + "rewards/margins": -0.075036096572876, + "rewards/rejected": -0.2094196319580078, + "step": 4000 + }, + { + "epoch": 1.6, + "eval_kl": 4.738241195678711, + "eval_logits/chosen": 29865963.52, + "eval_logits/rejected": 30057035.776, + "eval_logps/chosen": -153.191, + "eval_logps/rejected": -149.831890625, + "eval_loss": 0.4781652092933655, + "eval_rewards/chosen": 0.1134777603149414, + "eval_rewards/margins": 0.23239292907714842, + "eval_rewards/rejected": -0.11891516876220704, + "eval_runtime": 216.5956, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 2.308, + "step": 4000 + }, + { + "epoch": 1.604, + "grad_norm": 0.6143787503242493, + "kl": 5.21004581451416, + "learning_rate": 1.1011111111111113e-06, + "logits/chosen": 31273155.2, + "logits/rejected": 31765638.4, + "logps/chosen": -172.1530029296875, + "logps/rejected": -172.15975341796874, + "loss": 0.4306319713592529, + "rewards/chosen": 0.42154908180236816, + "rewards/margins": 0.6504992485046387, + "rewards/rejected": -0.2289501667022705, + "step": 4010 + }, + { + "epoch": 1.608, + "grad_norm": 0.3771494925022125, + "kl": 4.662692070007324, + "learning_rate": 1.0900000000000002e-06, + "logits/chosen": 33774937.6, + "logits/rejected": 34804582.4, + "logps/chosen": -162.08702392578124, + "logps/rejected": -170.07498779296876, + "loss": 0.4483354568481445, + "rewards/chosen": -0.026274442672729492, + "rewards/margins": 0.4997582912445069, + "rewards/rejected": -0.5260327339172364, + "step": 4020 + }, + { + "epoch": 1.612, + "grad_norm": 0.8712416291236877, + "kl": 4.398558616638184, + "learning_rate": 1.078888888888889e-06, + "logits/chosen": 23578403.2, + "logits/rejected": 21598651.2, + "logps/chosen": -131.9225830078125, + "logps/rejected": -196.6409423828125, + "loss": 0.441709041595459, + "rewards/chosen": 0.29367847442626954, + "rewards/margins": 0.6416111469268799, + "rewards/rejected": -0.34793267250061033, + "step": 4030 + }, + { + "epoch": 1.616, + "grad_norm": 0.7502478957176208, + "kl": 4.180516242980957, + "learning_rate": 1.0677777777777778e-06, + "logits/chosen": 24851664.0, + "logits/rejected": 24331059.2, + "logps/chosen": -134.76131591796874, + "logps/rejected": -146.1375, + "loss": 0.42934479713439944, + "rewards/chosen": 0.15300320386886596, + "rewards/margins": 0.7616443037986755, + "rewards/rejected": -0.6086410999298095, + "step": 4040 + }, + { + "epoch": 1.62, + "grad_norm": 0.6729795932769775, + "kl": 3.5344510078430176, + "learning_rate": 1.0566666666666668e-06, + "logits/chosen": 30835174.4, + "logits/rejected": 30916979.2, + "logps/chosen": -167.9162841796875, + "logps/rejected": -148.01121826171874, + "loss": 0.40194120407104494, + "rewards/chosen": 0.18118813037872314, + "rewards/margins": 1.300507092475891, + "rewards/rejected": -1.119318962097168, + "step": 4050 + }, + { + "epoch": 1.624, + "grad_norm": 0.8483315110206604, + "kl": 4.144981384277344, + "learning_rate": 1.0455555555555556e-06, + "logits/chosen": 23790352.0, + "logits/rejected": 21096166.4, + "logps/chosen": -167.572314453125, + "logps/rejected": -160.46065673828124, + "loss": 0.4239004135131836, + "rewards/chosen": 0.40131430625915526, + "rewards/margins": 0.9644507408142089, + "rewards/rejected": -0.5631364345550537, + "step": 4060 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 0.7283557653427124, + "kl": 5.704648971557617, + "learning_rate": 1.0344444444444446e-06, + "logits/chosen": 34577296.0, + "logits/rejected": 28464720.0, + "logps/chosen": -131.04425048828125, + "logps/rejected": -170.54326171875, + "loss": 0.3883501052856445, + "rewards/chosen": 0.8470425605773926, + "rewards/margins": 1.1756995677948, + "rewards/rejected": -0.3286570072174072, + "step": 4070 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.7542695999145508, + "kl": 5.014037132263184, + "learning_rate": 1.0233333333333334e-06, + "logits/chosen": 33581536.0, + "logits/rejected": 31188915.2, + "logps/chosen": -200.83006591796874, + "logps/rejected": -201.557470703125, + "loss": 0.4472477912902832, + "rewards/chosen": 0.1391082763671875, + "rewards/margins": 0.5890022277832031, + "rewards/rejected": -0.4498939514160156, + "step": 4080 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 0.699129045009613, + "kl": 6.782160758972168, + "learning_rate": 1.0122222222222224e-06, + "logits/chosen": 33043475.2, + "logits/rejected": 32764982.4, + "logps/chosen": -178.3090087890625, + "logps/rejected": -168.98370361328125, + "loss": 0.42490806579589846, + "rewards/chosen": 0.7757717132568359, + "rewards/margins": 0.9651144385337829, + "rewards/rejected": -0.189342725276947, + "step": 4090 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.7144243717193604, + "kl": 3.5951950550079346, + "learning_rate": 1.0011111111111112e-06, + "logits/chosen": 24177262.4, + "logits/rejected": 20001184.0, + "logps/chosen": -142.5383056640625, + "logps/rejected": -174.16627197265626, + "loss": 0.4106290817260742, + "rewards/chosen": 0.029287612438201903, + "rewards/margins": 1.2375101923942566, + "rewards/rejected": -1.2082225799560546, + "step": 4100 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 0.7295018434524536, + "kl": 3.5175278186798096, + "learning_rate": 9.9e-07, + "logits/chosen": 37862601.6, + "logits/rejected": 34241705.6, + "logps/chosen": -180.07008056640626, + "logps/rejected": -165.27158203125, + "loss": 0.41535110473632814, + "rewards/chosen": 0.3609702348709106, + "rewards/margins": 0.8377941370010376, + "rewards/rejected": -0.47682390213012693, + "step": 4110 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.687157154083252, + "kl": 4.200056076049805, + "learning_rate": 9.788888888888889e-07, + "logits/chosen": 30389494.4, + "logits/rejected": 33020348.8, + "logps/chosen": -205.723876953125, + "logps/rejected": -189.8884765625, + "loss": 0.4510765075683594, + "rewards/chosen": -0.2252514600753784, + "rewards/margins": 0.257360577583313, + "rewards/rejected": -0.48261203765869143, + "step": 4120 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 0.6598585844039917, + "kl": 5.19122838973999, + "learning_rate": 9.677777777777779e-07, + "logits/chosen": 25100822.4, + "logits/rejected": 22434776.0, + "logps/chosen": -139.93997802734376, + "logps/rejected": -148.89000244140624, + "loss": 0.42380781173706056, + "rewards/chosen": 0.25194945335388186, + "rewards/margins": 0.616188907623291, + "rewards/rejected": -0.3642394542694092, + "step": 4130 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.596368134021759, + "kl": 4.2246832847595215, + "learning_rate": 9.566666666666667e-07, + "logits/chosen": 23562168.0, + "logits/rejected": 19095112.0, + "logps/chosen": -167.05655517578126, + "logps/rejected": -163.794482421875, + "loss": 0.42250747680664064, + "rewards/chosen": 0.26157207489013673, + "rewards/margins": 0.9496047973632813, + "rewards/rejected": -0.6880327224731445, + "step": 4140 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.9235984683036804, + "kl": 4.401445388793945, + "learning_rate": 9.455555555555557e-07, + "logits/chosen": 23643918.4, + "logits/rejected": 26274195.2, + "logps/chosen": -147.10347900390624, + "logps/rejected": -122.74593505859374, + "loss": 0.4525291919708252, + "rewards/chosen": 0.2129079818725586, + "rewards/margins": 0.4618348360061646, + "rewards/rejected": -0.24892685413360596, + "step": 4150 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.5369182825088501, + "kl": 4.616759777069092, + "learning_rate": 9.344444444444445e-07, + "logits/chosen": 22211737.6, + "logits/rejected": 20181283.2, + "logps/chosen": -121.6099365234375, + "logps/rejected": -147.121728515625, + "loss": 0.4210421085357666, + "rewards/chosen": 0.3621690273284912, + "rewards/margins": 0.832556676864624, + "rewards/rejected": -0.4703876495361328, + "step": 4160 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 0.7842811942100525, + "kl": 3.4067413806915283, + "learning_rate": 9.233333333333334e-07, + "logits/chosen": 27008988.8, + "logits/rejected": 25419982.4, + "logps/chosen": -170.2637939453125, + "logps/rejected": -197.46827392578126, + "loss": 0.4370081424713135, + "rewards/chosen": -0.2569821834564209, + "rewards/margins": 0.6670010089874268, + "rewards/rejected": -0.9239831924438476, + "step": 4170 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.9903603196144104, + "kl": 2.7231457233428955, + "learning_rate": 9.122222222222222e-07, + "logits/chosen": 24366032.0, + "logits/rejected": 23620707.2, + "logps/chosen": -151.88421630859375, + "logps/rejected": -164.09239501953124, + "loss": 0.42814011573791505, + "rewards/chosen": 0.023269623517990112, + "rewards/margins": 0.6117547690868378, + "rewards/rejected": -0.5884851455688477, + "step": 4180 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 0.8565585613250732, + "kl": 3.8181614875793457, + "learning_rate": 9.011111111111112e-07, + "logits/chosen": 30655971.2, + "logits/rejected": 30687513.6, + "logps/chosen": -158.52818603515624, + "logps/rejected": -156.8181884765625, + "loss": 0.4161073684692383, + "rewards/chosen": 0.05024971961975098, + "rewards/margins": 0.9918828487396241, + "rewards/rejected": -0.9416331291198731, + "step": 4190 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.6923061609268188, + "kl": 3.7691681385040283, + "learning_rate": 8.900000000000001e-07, + "logits/chosen": 29314112.0, + "logits/rejected": 29080848.0, + "logps/chosen": -143.7644775390625, + "logps/rejected": -143.195703125, + "loss": 0.42991132736206056, + "rewards/chosen": 0.14495362043380738, + "rewards/margins": 0.5658432364463806, + "rewards/rejected": -0.42088961601257324, + "step": 4200 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 4.478702545166016, + "eval_logits/chosen": 27134529.536, + "eval_logits/rejected": 27462119.424, + "eval_logps/chosen": -154.735171875, + "eval_logps/rejected": -151.346375, + "eval_loss": 0.4791446030139923, + "eval_rewards/chosen": -0.040938720703125, + "eval_rewards/margins": 0.22942645263671874, + "eval_rewards/rejected": -0.2703651733398437, + "eval_runtime": 216.6724, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 2.308, + "step": 4200 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 0.8431246876716614, + "kl": 3.3740649223327637, + "learning_rate": 8.78888888888889e-07, + "logits/chosen": 24580116.8, + "logits/rejected": 23560889.6, + "logps/chosen": -130.23404541015626, + "logps/rejected": -173.274755859375, + "loss": 0.45294036865234377, + "rewards/chosen": -0.21662135124206544, + "rewards/margins": 0.6990167140960692, + "rewards/rejected": -0.9156380653381347, + "step": 4210 + }, + { + "epoch": 1.688, + "grad_norm": 1.0964020490646362, + "kl": 3.4218056201934814, + "learning_rate": 8.677777777777778e-07, + "logits/chosen": 25205835.2, + "logits/rejected": 24163204.8, + "logps/chosen": -144.6944580078125, + "logps/rejected": -165.13668212890624, + "loss": 0.41784143447875977, + "rewards/chosen": 0.14861660003662108, + "rewards/margins": 0.8323590278625488, + "rewards/rejected": -0.6837424278259278, + "step": 4220 + }, + { + "epoch": 1.692, + "grad_norm": 0.8997116088867188, + "kl": 5.5824151039123535, + "learning_rate": 8.566666666666668e-07, + "logits/chosen": 23316640.0, + "logits/rejected": 23185657.6, + "logps/chosen": -150.42138671875, + "logps/rejected": -176.9662109375, + "loss": 0.4408450126647949, + "rewards/chosen": 0.33873915672302246, + "rewards/margins": 0.5633441925048828, + "rewards/rejected": -0.22460503578186036, + "step": 4230 + }, + { + "epoch": 1.696, + "grad_norm": 0.9084079265594482, + "kl": 3.3602194786071777, + "learning_rate": 8.455555555555556e-07, + "logits/chosen": 20662596.8, + "logits/rejected": 21372936.0, + "logps/chosen": -158.28271484375, + "logps/rejected": -157.30823974609376, + "loss": 0.4337437152862549, + "rewards/chosen": -0.27880520820617677, + "rewards/margins": 0.39356427192687987, + "rewards/rejected": -0.6723694801330566, + "step": 4240 + }, + { + "epoch": 1.7, + "grad_norm": 0.822826623916626, + "kl": 6.255753993988037, + "learning_rate": 8.344444444444445e-07, + "logits/chosen": 28279952.0, + "logits/rejected": 30563142.4, + "logps/chosen": -207.4703369140625, + "logps/rejected": -156.87734375, + "loss": 0.401468563079834, + "rewards/chosen": 0.3442718505859375, + "rewards/margins": 0.6474948883056642, + "rewards/rejected": -0.3032230377197266, + "step": 4250 + }, + { + "epoch": 1.704, + "grad_norm": 0.7218330502510071, + "kl": 3.7593655586242676, + "learning_rate": 8.233333333333333e-07, + "logits/chosen": 21211801.6, + "logits/rejected": 25582251.2, + "logps/chosen": -181.3871337890625, + "logps/rejected": -158.4944580078125, + "loss": 0.4855056285858154, + "rewards/chosen": -0.661691427230835, + "rewards/margins": -0.19646124839782714, + "rewards/rejected": -0.4652301788330078, + "step": 4260 + }, + { + "epoch": 1.708, + "grad_norm": 0.5836480259895325, + "kl": 4.7980637550354, + "learning_rate": 8.122222222222223e-07, + "logits/chosen": 25395673.6, + "logits/rejected": 26470857.6, + "logps/chosen": -129.83922119140624, + "logps/rejected": -120.219775390625, + "loss": 0.4517657279968262, + "rewards/chosen": 0.2096014976501465, + "rewards/margins": 0.4321582317352295, + "rewards/rejected": -0.222556734085083, + "step": 4270 + }, + { + "epoch": 1.712, + "grad_norm": 0.6632907390594482, + "kl": 4.515078544616699, + "learning_rate": 8.011111111111111e-07, + "logits/chosen": 28016816.0, + "logits/rejected": 29217836.8, + "logps/chosen": -161.2439697265625, + "logps/rejected": -146.48646240234376, + "loss": 0.4749518871307373, + "rewards/chosen": -0.10194592475891114, + "rewards/margins": 0.19709014892578125, + "rewards/rejected": -0.29903607368469237, + "step": 4280 + }, + { + "epoch": 1.716, + "grad_norm": 0.9374505877494812, + "kl": 4.215886116027832, + "learning_rate": 7.900000000000001e-07, + "logits/chosen": 25599212.8, + "logits/rejected": 27952102.4, + "logps/chosen": -146.201220703125, + "logps/rejected": -121.0344482421875, + "loss": 0.4816310882568359, + "rewards/chosen": -0.015983200073242186, + "rewards/margins": 0.10800590515136718, + "rewards/rejected": -0.12398910522460938, + "step": 4290 + }, + { + "epoch": 1.72, + "grad_norm": 0.9015432596206665, + "kl": 3.52048921585083, + "learning_rate": 7.788888888888889e-07, + "logits/chosen": 19586571.2, + "logits/rejected": 17563038.4, + "logps/chosen": -147.8046142578125, + "logps/rejected": -163.8701416015625, + "loss": 0.4111928939819336, + "rewards/chosen": 0.09343934059143066, + "rewards/margins": 1.163926935195923, + "rewards/rejected": -1.0704875946044923, + "step": 4300 + }, + { + "epoch": 1.724, + "grad_norm": 0.6239180564880371, + "kl": 1.9489761590957642, + "learning_rate": 7.677777777777779e-07, + "logits/chosen": 29297952.0, + "logits/rejected": 25391588.8, + "logps/chosen": -142.50740966796874, + "logps/rejected": -180.68870849609374, + "loss": 0.43747854232788086, + "rewards/chosen": -0.7470763683319092, + "rewards/margins": 0.7179863452911378, + "rewards/rejected": -1.465062713623047, + "step": 4310 + }, + { + "epoch": 1.728, + "grad_norm": 0.6577679514884949, + "kl": 4.765759468078613, + "learning_rate": 7.566666666666667e-07, + "logits/chosen": 24857828.8, + "logits/rejected": 23441436.8, + "logps/chosen": -146.1468017578125, + "logps/rejected": -153.498046875, + "loss": 0.44129347801208496, + "rewards/chosen": -0.11145193576812744, + "rewards/margins": 0.5271980524063111, + "rewards/rejected": -0.6386499881744385, + "step": 4320 + }, + { + "epoch": 1.732, + "grad_norm": 0.6183480024337769, + "kl": 4.645040988922119, + "learning_rate": 7.455555555555556e-07, + "logits/chosen": 22485094.4, + "logits/rejected": 24854936.0, + "logps/chosen": -170.60594482421874, + "logps/rejected": -139.26231689453124, + "loss": 0.4434357166290283, + "rewards/chosen": -0.2083209276199341, + "rewards/margins": 0.2575597524642944, + "rewards/rejected": -0.4658806800842285, + "step": 4330 + }, + { + "epoch": 1.736, + "grad_norm": 0.727397084236145, + "kl": 3.468677520751953, + "learning_rate": 7.344444444444445e-07, + "logits/chosen": 19499200.0, + "logits/rejected": 15777052.8, + "logps/chosen": -133.76549072265624, + "logps/rejected": -177.62255859375, + "loss": 0.39086987972259524, + "rewards/chosen": 0.10505068302154541, + "rewards/margins": 1.318554902076721, + "rewards/rejected": -1.2135042190551757, + "step": 4340 + }, + { + "epoch": 1.74, + "grad_norm": 0.5569754242897034, + "kl": 3.695270538330078, + "learning_rate": 7.233333333333334e-07, + "logits/chosen": 21616838.4, + "logits/rejected": 25698086.4, + "logps/chosen": -166.606103515625, + "logps/rejected": -136.80845947265624, + "loss": 0.4519169807434082, + "rewards/chosen": -0.5317587852478027, + "rewards/margins": 0.12961096763610835, + "rewards/rejected": -0.6613697528839111, + "step": 4350 + }, + { + "epoch": 1.744, + "grad_norm": 0.5092763900756836, + "kl": 5.227725028991699, + "learning_rate": 7.122222222222223e-07, + "logits/chosen": 18400382.4, + "logits/rejected": 17152075.2, + "logps/chosen": -152.55262451171876, + "logps/rejected": -161.2753173828125, + "loss": 0.42585110664367676, + "rewards/chosen": 0.08888615369796753, + "rewards/margins": 1.0932955622673035, + "rewards/rejected": -1.004409408569336, + "step": 4360 + }, + { + "epoch": 1.748, + "grad_norm": 0.6089858412742615, + "kl": 6.389164924621582, + "learning_rate": 7.011111111111112e-07, + "logits/chosen": 23212915.2, + "logits/rejected": 23174060.8, + "logps/chosen": -148.427197265625, + "logps/rejected": -138.902783203125, + "loss": 0.43767833709716797, + "rewards/chosen": 0.34542050361633303, + "rewards/margins": 0.6011173248291015, + "rewards/rejected": -0.25569682121276854, + "step": 4370 + }, + { + "epoch": 1.752, + "grad_norm": 0.7319818735122681, + "kl": 3.631608486175537, + "learning_rate": 6.900000000000001e-07, + "logits/chosen": 24275633.6, + "logits/rejected": 25451712.0, + "logps/chosen": -130.7281982421875, + "logps/rejected": -169.6556884765625, + "loss": 0.4526735782623291, + "rewards/chosen": -0.1837414264678955, + "rewards/margins": 0.5917365074157716, + "rewards/rejected": -0.775477933883667, + "step": 4380 + }, + { + "epoch": 1.756, + "grad_norm": 0.7777149081230164, + "kl": 3.294254779815674, + "learning_rate": 6.78888888888889e-07, + "logits/chosen": 21825590.4, + "logits/rejected": 21820012.8, + "logps/chosen": -147.787158203125, + "logps/rejected": -144.9162841796875, + "loss": 0.43806142807006837, + "rewards/chosen": -0.11295137405395508, + "rewards/margins": 0.8744370460510255, + "rewards/rejected": -0.9873884201049805, + "step": 4390 + }, + { + "epoch": 1.76, + "grad_norm": 0.6188346147537231, + "kl": 1.1032154560089111, + "learning_rate": 6.677777777777779e-07, + "logits/chosen": 21570892.8, + "logits/rejected": 23144080.0, + "logps/chosen": -135.95819091796875, + "logps/rejected": -149.30087890625, + "loss": 0.44826564788818357, + "rewards/chosen": -0.7288064002990723, + "rewards/margins": 0.5491142272949218, + "rewards/rejected": -1.277920627593994, + "step": 4400 + }, + { + "epoch": 1.76, + "eval_kl": 3.972273826599121, + "eval_logits/chosen": 25631897.6, + "eval_logits/rejected": 26045097.984, + "eval_logps/chosen": -156.330625, + "eval_logps/rejected": -152.941625, + "eval_loss": 0.4793069362640381, + "eval_rewards/chosen": -0.2004847412109375, + "eval_rewards/margins": 0.22940472412109375, + "eval_rewards/rejected": -0.42988946533203126, + "eval_runtime": 216.7456, + "eval_samples_per_second": 4.614, + "eval_steps_per_second": 2.307, + "step": 4400 + }, + { + "epoch": 1.764, + "grad_norm": 0.8933963179588318, + "kl": 3.5940029621124268, + "learning_rate": 6.566666666666667e-07, + "logits/chosen": 23419489.6, + "logits/rejected": 19540764.8, + "logps/chosen": -159.04957275390626, + "logps/rejected": -170.299072265625, + "loss": 0.4445340633392334, + "rewards/chosen": -0.11221444606781006, + "rewards/margins": 0.6003352880477906, + "rewards/rejected": -0.7125497341156006, + "step": 4410 + }, + { + "epoch": 1.768, + "grad_norm": 1.043148159980774, + "kl": 4.92350959777832, + "learning_rate": 6.455555555555556e-07, + "logits/chosen": 23423225.6, + "logits/rejected": 20264896.0, + "logps/chosen": -134.481201171875, + "logps/rejected": -165.0137451171875, + "loss": 0.43820796012878416, + "rewards/chosen": 0.19546182155609132, + "rewards/margins": 0.7127113103866578, + "rewards/rejected": -0.5172494888305664, + "step": 4420 + }, + { + "epoch": 1.772, + "grad_norm": 0.6755536198616028, + "kl": 3.846719741821289, + "learning_rate": 6.344444444444445e-07, + "logits/chosen": 30165392.0, + "logits/rejected": 34583219.2, + "logps/chosen": -156.612158203125, + "logps/rejected": -142.9615234375, + "loss": 0.4490334510803223, + "rewards/chosen": -0.037669995427131654, + "rewards/margins": 0.416199442744255, + "rewards/rejected": -0.4538694381713867, + "step": 4430 + }, + { + "epoch": 1.776, + "grad_norm": 0.6009793877601624, + "kl": 3.6022000312805176, + "learning_rate": 6.233333333333333e-07, + "logits/chosen": 16789396.8, + "logits/rejected": 20022726.4, + "logps/chosen": -139.00106201171874, + "logps/rejected": -119.23135986328126, + "loss": 0.48907132148742677, + "rewards/chosen": -0.18124552965164184, + "rewards/margins": 0.10945202112197874, + "rewards/rejected": -0.2906975507736206, + "step": 4440 + }, + { + "epoch": 1.78, + "grad_norm": 0.5468002557754517, + "kl": 4.557890892028809, + "learning_rate": 6.122222222222222e-07, + "logits/chosen": 30898377.6, + "logits/rejected": 29239990.4, + "logps/chosen": -161.43404541015624, + "logps/rejected": -163.96580810546874, + "loss": 0.4502861499786377, + "rewards/chosen": 0.12291504144668579, + "rewards/margins": 0.5061401724815369, + "rewards/rejected": -0.38322513103485106, + "step": 4450 + }, + { + "epoch": 1.784, + "grad_norm": 0.5758063793182373, + "kl": 2.764960527420044, + "learning_rate": 6.011111111111112e-07, + "logits/chosen": 23224883.2, + "logits/rejected": 23461840.0, + "logps/chosen": -147.754541015625, + "logps/rejected": -139.2906494140625, + "loss": 0.44967427253723147, + "rewards/chosen": -0.2614432334899902, + "rewards/margins": 0.513807487487793, + "rewards/rejected": -0.7752507209777832, + "step": 4460 + }, + { + "epoch": 1.788, + "grad_norm": 0.7826879620552063, + "kl": 4.313460350036621, + "learning_rate": 5.900000000000001e-07, + "logits/chosen": 28475094.4, + "logits/rejected": 29499904.0, + "logps/chosen": -149.60347900390624, + "logps/rejected": -139.57032470703126, + "loss": 0.4358978748321533, + "rewards/chosen": 0.18888943195343016, + "rewards/margins": 0.7328470468521118, + "rewards/rejected": -0.5439576148986817, + "step": 4470 + }, + { + "epoch": 1.792, + "grad_norm": 0.7026771306991577, + "kl": 1.8280513286590576, + "learning_rate": 5.788888888888889e-07, + "logits/chosen": 15314908.8, + "logits/rejected": 14658353.6, + "logps/chosen": -146.75345458984376, + "logps/rejected": -157.1791259765625, + "loss": 0.4324824810028076, + "rewards/chosen": -0.4457141399383545, + "rewards/margins": 0.7849259853363036, + "rewards/rejected": -1.2306401252746582, + "step": 4480 + }, + { + "epoch": 1.796, + "grad_norm": 0.7853025197982788, + "kl": 5.189270496368408, + "learning_rate": 5.677777777777779e-07, + "logits/chosen": 26974720.0, + "logits/rejected": 25859792.0, + "logps/chosen": -164.4057861328125, + "logps/rejected": -164.3528076171875, + "loss": 0.41428799629211427, + "rewards/chosen": 0.3338757514953613, + "rewards/margins": 0.8068063259124756, + "rewards/rejected": -0.4729305744171143, + "step": 4490 + }, + { + "epoch": 1.8, + "grad_norm": 0.7289919853210449, + "kl": 3.4229626655578613, + "learning_rate": 5.566666666666667e-07, + "logits/chosen": 28305580.8, + "logits/rejected": 27415660.8, + "logps/chosen": -127.7979248046875, + "logps/rejected": -144.92559814453125, + "loss": 0.4419555187225342, + "rewards/chosen": -0.07625447511672974, + "rewards/margins": 0.5168057322502135, + "rewards/rejected": -0.5930602073669433, + "step": 4500 + }, + { + "epoch": 1.804, + "grad_norm": 0.7820873856544495, + "kl": 5.29005241394043, + "learning_rate": 5.455555555555556e-07, + "logits/chosen": 30357193.6, + "logits/rejected": 28641846.4, + "logps/chosen": -164.97662353515625, + "logps/rejected": -146.59793701171876, + "loss": 0.43038201332092285, + "rewards/chosen": 0.35901241302490233, + "rewards/margins": 0.7769660949707031, + "rewards/rejected": -0.4179536819458008, + "step": 4510 + }, + { + "epoch": 1.808, + "grad_norm": 0.8984478116035461, + "kl": 3.696812868118286, + "learning_rate": 5.344444444444445e-07, + "logits/chosen": 21213681.6, + "logits/rejected": 20043033.6, + "logps/chosen": -183.95279541015626, + "logps/rejected": -168.1439208984375, + "loss": 0.4274559020996094, + "rewards/chosen": -0.015572810173034668, + "rewards/margins": 0.6375526189804077, + "rewards/rejected": -0.6531254291534424, + "step": 4520 + }, + { + "epoch": 1.812, + "grad_norm": 0.5371900200843811, + "kl": 2.156186819076538, + "learning_rate": 5.233333333333334e-07, + "logits/chosen": 24134673.6, + "logits/rejected": 21031646.4, + "logps/chosen": -141.1941162109375, + "logps/rejected": -176.01248779296876, + "loss": 0.3860702276229858, + "rewards/chosen": -0.2229753017425537, + "rewards/margins": 1.4064032077789306, + "rewards/rejected": -1.6293785095214843, + "step": 4530 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.8296744227409363, + "kl": 3.7835755348205566, + "learning_rate": 5.122222222222222e-07, + "logits/chosen": 36388054.4, + "logits/rejected": 31688457.6, + "logps/chosen": -160.08714599609374, + "logps/rejected": -207.0276123046875, + "loss": 0.40818300247192385, + "rewards/chosen": 0.12663592100143434, + "rewards/margins": 1.2323408007621766, + "rewards/rejected": -1.1057048797607423, + "step": 4540 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.5752077102661133, + "kl": 3.985583543777466, + "learning_rate": 5.011111111111112e-07, + "logits/chosen": 21616614.4, + "logits/rejected": 22038934.4, + "logps/chosen": -139.400634765625, + "logps/rejected": -120.08944091796874, + "loss": 0.45534987449645997, + "rewards/chosen": -0.1639024496078491, + "rewards/margins": 0.37830965518951415, + "rewards/rejected": -0.5422121047973633, + "step": 4550 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.7708961367607117, + "kl": 3.2169101238250732, + "learning_rate": 4.900000000000001e-07, + "logits/chosen": 23925704.0, + "logits/rejected": 19160164.8, + "logps/chosen": -168.2747802734375, + "logps/rejected": -186.7814697265625, + "loss": 0.4789942741394043, + "rewards/chosen": -0.4888188362121582, + "rewards/margins": 0.5942277908325195, + "rewards/rejected": -1.0830466270446777, + "step": 4560 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 0.6243308186531067, + "kl": 4.417626857757568, + "learning_rate": 4.78888888888889e-07, + "logits/chosen": 27919126.4, + "logits/rejected": 26266936.0, + "logps/chosen": -180.739013671875, + "logps/rejected": -187.88717041015624, + "loss": 0.38467090129852294, + "rewards/chosen": -0.027477288246154787, + "rewards/margins": 1.3471161603927613, + "rewards/rejected": -1.374593448638916, + "step": 4570 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6753647327423096, + "kl": 2.572279930114746, + "learning_rate": 4.6777777777777785e-07, + "logits/chosen": 24245668.8, + "logits/rejected": 25570259.2, + "logps/chosen": -183.2251953125, + "logps/rejected": -144.81016845703124, + "loss": 0.47222309112548827, + "rewards/chosen": -0.7298378944396973, + "rewards/margins": -0.06857419013977051, + "rewards/rejected": -0.6612637042999268, + "step": 4580 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 0.6756667494773865, + "kl": 2.9830145835876465, + "learning_rate": 4.566666666666667e-07, + "logits/chosen": 19786300.8, + "logits/rejected": 18077294.4, + "logps/chosen": -142.7079833984375, + "logps/rejected": -167.11500244140626, + "loss": 0.42957119941711425, + "rewards/chosen": -0.10680264234542847, + "rewards/margins": 1.0903936505317688, + "rewards/rejected": -1.1971962928771973, + "step": 4590 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.5496138334274292, + "kl": 2.980377197265625, + "learning_rate": 4.455555555555556e-07, + "logits/chosen": 19038470.4, + "logits/rejected": 22191712.0, + "logps/chosen": -171.56102294921874, + "logps/rejected": -158.53353271484374, + "loss": 0.5080355644226074, + "rewards/chosen": -0.6536062240600586, + "rewards/margins": -0.24833087921142583, + "rewards/rejected": -0.4052753448486328, + "step": 4600 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.5254554748535156, + "eval_logits/chosen": 23809890.304, + "eval_logits/rejected": 24273334.272, + "eval_logps/chosen": -158.050265625, + "eval_logps/rejected": -154.70503125, + "eval_loss": 0.47947752475738525, + "eval_rewards/chosen": -0.37244818115234374, + "eval_rewards/margins": 0.2337820434570313, + "eval_rewards/rejected": -0.606230224609375, + "eval_runtime": 216.9986, + "eval_samples_per_second": 4.608, + "eval_steps_per_second": 2.304, + "step": 4600 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 0.53779536485672, + "kl": 2.9465878009796143, + "learning_rate": 4.344444444444445e-07, + "logits/chosen": 18342864.0, + "logits/rejected": 17376532.8, + "logps/chosen": -151.92774658203126, + "logps/rejected": -173.01671142578124, + "loss": 0.4371053218841553, + "rewards/chosen": 0.02389627695083618, + "rewards/margins": 0.8590038895606995, + "rewards/rejected": -0.8351076126098633, + "step": 4610 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.8398244380950928, + "kl": 4.876760482788086, + "learning_rate": 4.233333333333334e-07, + "logits/chosen": 18385000.0, + "logits/rejected": 16077065.6, + "logps/chosen": -145.19398193359376, + "logps/rejected": -156.1609619140625, + "loss": 0.4620822906494141, + "rewards/chosen": -0.28269295692443847, + "rewards/margins": 0.44993181228637696, + "rewards/rejected": -0.7326247692108154, + "step": 4620 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 0.5775973796844482, + "kl": 3.4174346923828125, + "learning_rate": 4.1222222222222225e-07, + "logits/chosen": 25656241.6, + "logits/rejected": 24305292.8, + "logps/chosen": -178.959912109375, + "logps/rejected": -173.44217529296876, + "loss": 0.42700676918029784, + "rewards/chosen": -0.3585548162460327, + "rewards/margins": 0.6643104791641236, + "rewards/rejected": -1.0228652954101562, + "step": 4630 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.5026371479034424, + "kl": 5.277353763580322, + "learning_rate": 4.0111111111111116e-07, + "logits/chosen": 23320708.8, + "logits/rejected": 26072505.6, + "logps/chosen": -162.64700927734376, + "logps/rejected": -163.052490234375, + "loss": 0.4349231719970703, + "rewards/chosen": 0.08882616758346558, + "rewards/margins": 0.8141135096549988, + "rewards/rejected": -0.7252873420715332, + "step": 4640 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 0.9511138796806335, + "kl": 3.9891674518585205, + "learning_rate": 3.9e-07, + "logits/chosen": 22138430.4, + "logits/rejected": 20103414.4, + "logps/chosen": -165.88394775390626, + "logps/rejected": -168.73675537109375, + "loss": 0.45732574462890624, + "rewards/chosen": -0.1767573595046997, + "rewards/margins": 0.7479500532150268, + "rewards/rejected": -0.9247074127197266, + "step": 4650 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.9161826968193054, + "kl": 4.220607280731201, + "learning_rate": 3.7888888888888894e-07, + "logits/chosen": 28969993.6, + "logits/rejected": 29838678.4, + "logps/chosen": -155.28150634765626, + "logps/rejected": -168.24874267578124, + "loss": 0.4191298961639404, + "rewards/chosen": 0.2649924993515015, + "rewards/margins": 0.8968584775924683, + "rewards/rejected": -0.6318659782409668, + "step": 4660 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 0.5795943140983582, + "kl": 2.4628450870513916, + "learning_rate": 3.677777777777778e-07, + "logits/chosen": 15824608.0, + "logits/rejected": 14315985.6, + "logps/chosen": -139.00059814453124, + "logps/rejected": -158.5366943359375, + "loss": 0.41784844398498533, + "rewards/chosen": -0.5109588146209717, + "rewards/margins": 0.7057264804840088, + "rewards/rejected": -1.2166852951049805, + "step": 4670 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 1.1482797861099243, + "kl": 5.052585601806641, + "learning_rate": 3.566666666666667e-07, + "logits/chosen": 31239507.2, + "logits/rejected": 31432432.0, + "logps/chosen": -179.59786376953124, + "logps/rejected": -199.677734375, + "loss": 0.4447749137878418, + "rewards/chosen": -0.503559160232544, + "rewards/margins": 0.2613923072814941, + "rewards/rejected": -0.7649514675140381, + "step": 4680 + }, + { + "epoch": 1.876, + "grad_norm": 0.4730696678161621, + "kl": 3.02178955078125, + "learning_rate": 3.4555555555555557e-07, + "logits/chosen": 31747740.8, + "logits/rejected": 31681862.4, + "logps/chosen": -140.8739990234375, + "logps/rejected": -135.987109375, + "loss": 0.48804163932800293, + "rewards/chosen": -0.5644313812255859, + "rewards/margins": -0.14893860816955562, + "rewards/rejected": -0.4154927730560303, + "step": 4690 + }, + { + "epoch": 1.88, + "grad_norm": 0.3857530653476715, + "kl": 4.020687103271484, + "learning_rate": 3.344444444444445e-07, + "logits/chosen": 22076350.4, + "logits/rejected": 21447452.8, + "logps/chosen": -153.19390869140625, + "logps/rejected": -149.92744140625, + "loss": 0.43955206871032715, + "rewards/chosen": -0.1915654182434082, + "rewards/margins": 0.6257681846618652, + "rewards/rejected": -0.8173336029052735, + "step": 4700 + }, + { + "epoch": 1.884, + "grad_norm": 0.6033211350440979, + "kl": 3.1054883003234863, + "learning_rate": 3.2333333333333334e-07, + "logits/chosen": 21611427.2, + "logits/rejected": 24669811.2, + "logps/chosen": -174.2831787109375, + "logps/rejected": -169.7625, + "loss": 0.46239595413208007, + "rewards/chosen": -0.480119514465332, + "rewards/margins": 0.46341266632080075, + "rewards/rejected": -0.9435321807861328, + "step": 4710 + }, + { + "epoch": 1.888, + "grad_norm": 0.4621000289916992, + "kl": 4.2236480712890625, + "learning_rate": 3.1222222222222226e-07, + "logits/chosen": 31168502.4, + "logits/rejected": 29779161.6, + "logps/chosen": -155.3637939453125, + "logps/rejected": -156.241455078125, + "loss": 0.47081618309020995, + "rewards/chosen": -0.27088658809661864, + "rewards/margins": 0.3101552248001099, + "rewards/rejected": -0.5810418128967285, + "step": 4720 + }, + { + "epoch": 1.892, + "grad_norm": 0.9124680161476135, + "kl": 3.5288634300231934, + "learning_rate": 3.0111111111111117e-07, + "logits/chosen": 21657052.8, + "logits/rejected": 22909753.6, + "logps/chosen": -122.133251953125, + "logps/rejected": -120.75472412109374, + "loss": 0.4612110137939453, + "rewards/chosen": -0.11650089025497437, + "rewards/margins": 0.5821841835975647, + "rewards/rejected": -0.6986850738525391, + "step": 4730 + }, + { + "epoch": 1.896, + "grad_norm": 0.6643645763397217, + "kl": 4.355770587921143, + "learning_rate": 2.9000000000000003e-07, + "logits/chosen": 18843272.0, + "logits/rejected": 15865353.6, + "logps/chosen": -149.32027587890624, + "logps/rejected": -151.32247314453124, + "loss": 0.43994879722595215, + "rewards/chosen": -0.003923875093460083, + "rewards/margins": 1.0778412997722626, + "rewards/rejected": -1.0817651748657227, + "step": 4740 + }, + { + "epoch": 1.9, + "grad_norm": 0.7815224528312683, + "kl": 3.7565338611602783, + "learning_rate": 2.7888888888888894e-07, + "logits/chosen": 17081561.6, + "logits/rejected": 20796334.4, + "logps/chosen": -143.27027587890626, + "logps/rejected": -158.528515625, + "loss": 0.4135895252227783, + "rewards/chosen": -0.08530845642089843, + "rewards/margins": 0.8226515769958496, + "rewards/rejected": -0.907960033416748, + "step": 4750 + }, + { + "epoch": 1.904, + "grad_norm": 0.6945417523384094, + "kl": 4.180878639221191, + "learning_rate": 2.677777777777778e-07, + "logits/chosen": 26961856.0, + "logits/rejected": 28458582.4, + "logps/chosen": -157.9197265625, + "logps/rejected": -175.12359619140625, + "loss": 0.4510028839111328, + "rewards/chosen": -0.5507327556610108, + "rewards/margins": 0.6062280178070069, + "rewards/rejected": -1.1569607734680176, + "step": 4760 + }, + { + "epoch": 1.908, + "grad_norm": 0.6717256903648376, + "kl": 2.790264844894409, + "learning_rate": 2.566666666666667e-07, + "logits/chosen": 26413414.4, + "logits/rejected": 26038793.6, + "logps/chosen": -155.70628662109374, + "logps/rejected": -127.781591796875, + "loss": 0.45128421783447265, + "rewards/chosen": -0.09372057914733886, + "rewards/margins": 0.4549129009246826, + "rewards/rejected": -0.5486334800720215, + "step": 4770 + }, + { + "epoch": 1.912, + "grad_norm": 0.7105498313903809, + "kl": 2.446235179901123, + "learning_rate": 2.455555555555556e-07, + "logits/chosen": 13721004.8, + "logits/rejected": 10354278.4, + "logps/chosen": -126.288916015625, + "logps/rejected": -146.79375, + "loss": 0.48598880767822267, + "rewards/chosen": -0.6219570159912109, + "rewards/margins": 0.45132789611816415, + "rewards/rejected": -1.073284912109375, + "step": 4780 + }, + { + "epoch": 1.916, + "grad_norm": 0.6213298439979553, + "kl": 3.837221622467041, + "learning_rate": 2.3444444444444446e-07, + "logits/chosen": 28986860.8, + "logits/rejected": 28953318.4, + "logps/chosen": -171.77513427734374, + "logps/rejected": -183.67852783203125, + "loss": 0.4227924346923828, + "rewards/chosen": 0.06139696836471557, + "rewards/margins": 1.0945536494255066, + "rewards/rejected": -1.033156681060791, + "step": 4790 + }, + { + "epoch": 1.92, + "grad_norm": 0.8169627785682678, + "kl": 2.6108901500701904, + "learning_rate": 2.2333333333333335e-07, + "logits/chosen": 25155881.6, + "logits/rejected": 27653331.2, + "logps/chosen": -169.9299072265625, + "logps/rejected": -174.208056640625, + "loss": 0.43941802978515626, + "rewards/chosen": -0.3946220397949219, + "rewards/margins": 0.4759023666381836, + "rewards/rejected": -0.8705244064331055, + "step": 4800 + }, + { + "epoch": 1.92, + "eval_kl": 3.7999300956726074, + "eval_logits/chosen": 24666335.232, + "eval_logits/rejected": 25062387.712, + "eval_logps/chosen": -157.002125, + "eval_logps/rejected": -153.64446875, + "eval_loss": 0.47916504740715027, + "eval_rewards/chosen": -0.2676349487304687, + "eval_rewards/margins": 0.23253808593749997, + "eval_rewards/rejected": -0.5001730346679687, + "eval_runtime": 221.7048, + "eval_samples_per_second": 4.511, + "eval_steps_per_second": 2.255, + "step": 4800 + } + ], + "logging_steps": 10, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-4800/training_args.bin b/v5/KTO/KTO_20k/lora/checkpoint-4800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b3df9314987039f6eb4aae71c1789a27c508f03 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-4800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b224910eb4f0913af2c07ef9b4ff545409726d7169b35fc1b136bed8f918d2c +size 5521 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/README.md b/v5/KTO/KTO_20k/lora/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/adapter_config.json b/v5/KTO/KTO_20k/lora/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5726eb3a65b963dd94788413b8a63d4accbb95c3 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "down_proj", + "up_proj", + "k_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/adapter_model.safetensors b/v5/KTO/KTO_20k/lora/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2d145714f141da8b8c325d40001999d52ab4dc29 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1845719370035a0d7d677dd5a6aa6b40410a8081b1032d304e630360d6c1b035 +size 180385008 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/chat_template.jinja b/v5/KTO/KTO_20k/lora/checkpoint-5000/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/optimizer.pt b/v5/KTO/KTO_20k/lora/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d30adace4e518bedc13cccfed8d261a4acddff6 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac1ccf0ca87c3b2c52f5b95a597a1f1f09cb27f5331627c573517543c3743ce4 +size 360902475 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/rng_state.pth b/v5/KTO/KTO_20k/lora/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2275456c138024633720ad98c6c63acbe9777008 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a95215f64b02d62fb58ace326ad670f1d16eb1761f7fa3b3478d43d2b8d6108 +size 14645 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/scaler.pt b/v5/KTO/KTO_20k/lora/checkpoint-5000/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..056c848f825c33132d06ef857f33ab84e8af1fd6 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ad8990572ad11a824b7db276c8af49c179ca7e7724b4e6906cd0ae480a80a8 +size 1383 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/scheduler.pt b/v5/KTO/KTO_20k/lora/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e220b1855e20ae1515d2ddecfab514913ce9e4a --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c360197721457bfdbbfbd6f0bf796c7ca2011bfb10be6c12ef13d7dcf6cb098 +size 1465 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/tokenizer.json b/v5/KTO/KTO_20k/lora/checkpoint-5000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/tokenizer_config.json b/v5/KTO/KTO_20k/lora/checkpoint-5000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/trainer_state.json b/v5/KTO/KTO_20k/lora/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7066fa6dc1442ff37ae071da85ad2dc3c9f2c9a7 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/trainer_state.json @@ -0,0 +1,7934 @@ +{ + "best_global_step": 4600, + "best_metric": 0.2337820434570313, + "best_model_checkpoint": "output/lora/checkpoint-4600", + "epoch": 2.0, + "eval_steps": 200, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004, + "grad_norm": 0.5129354000091553, + "kl": 0.01111381035298109, + "learning_rate": 9e-08, + "logits/chosen": 29682550.4, + "logits/rejected": 31339625.6, + "logps/chosen": -148.96693115234376, + "logps/rejected": -128.8356201171875, + "loss": 0.5001067161560059, + "rewards/chosen": -0.0005946397315710783, + "rewards/margins": -0.0008538532070815563, + "rewards/rejected": 0.000259213475510478, + "step": 10 + }, + { + "epoch": 0.008, + "grad_norm": 0.4323230981826782, + "kl": 0.015593004412949085, + "learning_rate": 1.9e-07, + "logits/chosen": 53384144.0, + "logits/rejected": 52884704.0, + "logps/chosen": -140.024853515625, + "logps/rejected": -151.92880859375, + "loss": 0.49987268447875977, + "rewards/chosen": 0.0006237029097974301, + "rewards/margins": 0.0010180996730923652, + "rewards/rejected": -0.0003943967632949352, + "step": 20 + }, + { + "epoch": 0.012, + "grad_norm": 0.4257548451423645, + "kl": 0.014815926551818848, + "learning_rate": 2.9000000000000003e-07, + "logits/chosen": 34151433.6, + "logits/rejected": 34198240.0, + "logps/chosen": -131.73375244140624, + "logps/rejected": -140.37911376953124, + "loss": 0.4998063087463379, + "rewards/chosen": 0.0004901790525764227, + "rewards/margins": 0.0015492869075387715, + "rewards/rejected": -0.0010591078549623489, + "step": 30 + }, + { + "epoch": 0.016, + "grad_norm": 0.36496493220329285, + "kl": 0.02263352833688259, + "learning_rate": 3.9e-07, + "logits/chosen": 43278188.8, + "logits/rejected": 43919286.4, + "logps/chosen": -144.2862060546875, + "logps/rejected": -146.0272705078125, + "loss": 0.4999645233154297, + "rewards/chosen": 0.0011271238327026367, + "rewards/margins": 0.00028378488495945926, + "rewards/rejected": 0.0008433389477431775, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.3303475081920624, + "kl": 0.018513035029172897, + "learning_rate": 4.900000000000001e-07, + "logits/chosen": 43083849.6, + "logits/rejected": 44890182.4, + "logps/chosen": -140.999267578125, + "logps/rejected": -154.3203369140625, + "loss": 0.4999688625335693, + "rewards/chosen": 0.0011019515804946423, + "rewards/margins": 0.0002493190579116345, + "rewards/rejected": 0.0008526325225830078, + "step": 50 + }, + { + "epoch": 0.024, + "grad_norm": 0.2820725739002228, + "kl": 0.01858975924551487, + "learning_rate": 5.900000000000001e-07, + "logits/chosen": 36625328.0, + "logits/rejected": 34144819.2, + "logps/chosen": -105.73199462890625, + "logps/rejected": -114.0021728515625, + "loss": 0.5000367164611816, + "rewards/chosen": 0.0006336641497910022, + "rewards/margins": -0.0002944803796708584, + "rewards/rejected": 0.0009281445294618606, + "step": 60 + }, + { + "epoch": 0.028, + "grad_norm": 0.3881119191646576, + "kl": 0.00938491802662611, + "learning_rate": 6.900000000000001e-07, + "logits/chosen": 47839347.2, + "logits/rejected": 46951110.4, + "logps/chosen": -165.68013916015624, + "logps/rejected": -175.11986083984374, + "loss": 0.5000545501708984, + "rewards/chosen": -0.004812383651733398, + "rewards/margins": -0.0004361916333436959, + "rewards/rejected": -0.004376192018389702, + "step": 70 + }, + { + "epoch": 0.032, + "grad_norm": 0.4655516743659973, + "kl": 0.011602235026657581, + "learning_rate": 7.900000000000001e-07, + "logits/chosen": 40787196.8, + "logits/rejected": 40853612.8, + "logps/chosen": -154.95506591796874, + "logps/rejected": -163.55113525390624, + "loss": 0.4999113082885742, + "rewards/chosen": -0.003601384162902832, + "rewards/margins": 0.0007092095911502838, + "rewards/rejected": -0.004310593754053116, + "step": 80 + }, + { + "epoch": 0.036, + "grad_norm": 0.3819780647754669, + "kl": 0.02207348309457302, + "learning_rate": 8.900000000000001e-07, + "logits/chosen": 44163190.4, + "logits/rejected": 44268889.6, + "logps/chosen": -169.8670166015625, + "logps/rejected": -159.19212646484374, + "loss": 0.4996920108795166, + "rewards/chosen": -0.0014049055054783822, + "rewards/margins": 0.002464146353304386, + "rewards/rejected": -0.003869051858782768, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 0.4587983191013336, + "kl": 0.056853484362363815, + "learning_rate": 9.9e-07, + "logits/chosen": 27709289.6, + "logits/rejected": 27346092.8, + "logps/chosen": -134.2815185546875, + "logps/rejected": -164.53704833984375, + "loss": 0.4997425556182861, + "rewards/chosen": 0.0017462443560361863, + "rewards/margins": 0.0020607755985111, + "rewards/rejected": -0.0003145312424749136, + "step": 100 + }, + { + "epoch": 0.044, + "grad_norm": 0.3832976818084717, + "kl": 0.052884578704833984, + "learning_rate": 1.0900000000000002e-06, + "logits/chosen": 38150864.0, + "logits/rejected": 37954022.4, + "logps/chosen": -130.54158935546874, + "logps/rejected": -135.6479248046875, + "loss": 0.49963693618774413, + "rewards/chosen": 0.002483482100069523, + "rewards/margins": 0.0029049014206975698, + "rewards/rejected": -0.000421419320628047, + "step": 110 + }, + { + "epoch": 0.048, + "grad_norm": 0.3761675953865051, + "kl": 0.06726250797510147, + "learning_rate": 1.19e-06, + "logits/chosen": 47769347.2, + "logits/rejected": 47376777.6, + "logps/chosen": -162.1564208984375, + "logps/rejected": -133.792041015625, + "loss": 0.5000278949737549, + "rewards/chosen": 0.00019950373098254204, + "rewards/margins": -0.00022385641932487488, + "rewards/rejected": 0.0004233601503074169, + "step": 120 + }, + { + "epoch": 0.052, + "grad_norm": 0.3125726580619812, + "kl": 0.15044990181922913, + "learning_rate": 1.2900000000000001e-06, + "logits/chosen": 40041744.0, + "logits/rejected": 39132192.0, + "logps/chosen": -162.06031494140626, + "logps/rejected": -140.18397216796876, + "loss": 0.5000635147094726, + "rewards/chosen": 0.011354871094226837, + "rewards/margins": -0.0005075931549072266, + "rewards/rejected": 0.011862464249134064, + "step": 130 + }, + { + "epoch": 0.056, + "grad_norm": 0.35332098603248596, + "kl": 0.2332003116607666, + "learning_rate": 1.3900000000000002e-06, + "logits/chosen": 40936960.0, + "logits/rejected": 42938476.8, + "logps/chosen": -142.440185546875, + "logps/rejected": -157.8077880859375, + "loss": 0.5003787040710449, + "rewards/chosen": 0.021105077862739564, + "rewards/margins": -0.003029544651508332, + "rewards/rejected": 0.024134622514247896, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 0.37256330251693726, + "kl": 0.25889211893081665, + "learning_rate": 1.4900000000000001e-06, + "logits/chosen": 41140886.4, + "logits/rejected": 42385772.8, + "logps/chosen": -130.3114013671875, + "logps/rejected": -117.353369140625, + "loss": 0.49920454025268557, + "rewards/chosen": 0.02739974558353424, + "rewards/margins": 0.006363460421562196, + "rewards/rejected": 0.021036285161972045, + "step": 150 + }, + { + "epoch": 0.064, + "grad_norm": 0.38222262263298035, + "kl": 0.3707125782966614, + "learning_rate": 1.5900000000000002e-06, + "logits/chosen": 34721360.0, + "logits/rejected": 35081372.8, + "logps/chosen": -143.524462890625, + "logps/rejected": -147.06605224609376, + "loss": 0.49993181228637695, + "rewards/chosen": 0.03622217178344726, + "rewards/margins": 0.0005454152822494465, + "rewards/rejected": 0.035676756501197816, + "step": 160 + }, + { + "epoch": 0.068, + "grad_norm": 0.3260433077812195, + "kl": 0.4656868577003479, + "learning_rate": 1.6900000000000003e-06, + "logits/chosen": 37774179.2, + "logits/rejected": 39969705.6, + "logps/chosen": -138.2024658203125, + "logps/rejected": -149.75228271484374, + "loss": 0.4999128818511963, + "rewards/chosen": 0.04691722691059112, + "rewards/margins": 0.0006973743438720703, + "rewards/rejected": 0.04621985256671905, + "step": 170 + }, + { + "epoch": 0.072, + "grad_norm": 0.47040465474128723, + "kl": 0.6031174659729004, + "learning_rate": 1.79e-06, + "logits/chosen": 44058707.2, + "logits/rejected": 45027283.2, + "logps/chosen": -145.10814208984374, + "logps/rejected": -170.75323486328125, + "loss": 0.5000656604766845, + "rewards/chosen": 0.060049277544021604, + "rewards/margins": -0.0005249440670013483, + "rewards/rejected": 0.06057422161102295, + "step": 180 + }, + { + "epoch": 0.076, + "grad_norm": 0.3483351767063141, + "kl": 0.5900982022285461, + "learning_rate": 1.8900000000000001e-06, + "logits/chosen": 41571993.6, + "logits/rejected": 43139596.8, + "logps/chosen": -124.036376953125, + "logps/rejected": -137.070751953125, + "loss": 0.5001267910003662, + "rewards/chosen": 0.05850306153297424, + "rewards/margins": -0.0010134875774383545, + "rewards/rejected": 0.0595165491104126, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 0.47156763076782227, + "kl": 0.654812753200531, + "learning_rate": 1.9900000000000004e-06, + "logits/chosen": 46899654.4, + "logits/rejected": 48147660.8, + "logps/chosen": -150.88424072265624, + "logps/rejected": -172.6162109375, + "loss": 0.4997762680053711, + "rewards/chosen": 0.06637628674507141, + "rewards/margins": 0.001789605617523199, + "rewards/rejected": 0.06458668112754821, + "step": 200 + }, + { + "epoch": 0.08, + "eval_kl": 0.4785654842853546, + "eval_logits/chosen": 39006478.336, + "eval_logits/rejected": 38887682.048, + "eval_logps/chosen": -153.8359375, + "eval_logps/rejected": -148.1899375, + "eval_loss": 0.49953681230545044, + "eval_rewards/chosen": 0.04898439407348633, + "eval_rewards/margins": 0.00370624542236328, + "eval_rewards/rejected": 0.04527814865112305, + "eval_runtime": 217.7826, + "eval_samples_per_second": 4.592, + "eval_steps_per_second": 2.296, + "step": 200 + }, + { + "epoch": 0.084, + "grad_norm": 0.3955392837524414, + "kl": 0.3641941249370575, + "learning_rate": 2.09e-06, + "logits/chosen": 34072531.2, + "logits/rejected": 34148444.8, + "logps/chosen": -140.24954833984376, + "logps/rejected": -132.308837890625, + "loss": 0.5002529621124268, + "rewards/chosen": 0.02951604127883911, + "rewards/margins": -0.002015212178230287, + "rewards/rejected": 0.0315312534570694, + "step": 210 + }, + { + "epoch": 0.088, + "grad_norm": 0.3798522651195526, + "kl": 0.2939055263996124, + "learning_rate": 2.19e-06, + "logits/chosen": 35659238.4, + "logits/rejected": 36517523.2, + "logps/chosen": -101.61099853515626, + "logps/rejected": -126.22640380859374, + "loss": 0.500004768371582, + "rewards/chosen": 0.023823246359825134, + "rewards/margins": -4.143416881561418e-05, + "rewards/rejected": 0.02386468052864075, + "step": 220 + }, + { + "epoch": 0.092, + "grad_norm": 0.4183703660964966, + "kl": 0.16193707287311554, + "learning_rate": 2.29e-06, + "logits/chosen": 47875980.8, + "logits/rejected": 46433056.0, + "logps/chosen": -185.33223876953124, + "logps/rejected": -163.7679443359375, + "loss": 0.5001229286193848, + "rewards/chosen": 0.002850056067109108, + "rewards/margins": -0.0010004475712776183, + "rewards/rejected": 0.003850503638386726, + "step": 230 + }, + { + "epoch": 0.096, + "grad_norm": 0.41121652722358704, + "kl": 0.2479170858860016, + "learning_rate": 2.39e-06, + "logits/chosen": 48178169.6, + "logits/rejected": 48277104.0, + "logps/chosen": -176.87537841796876, + "logps/rejected": -166.7927978515625, + "loss": 0.5001242637634278, + "rewards/chosen": 0.015346670150756836, + "rewards/margins": -0.0010146483778953556, + "rewards/rejected": 0.01636131852865219, + "step": 240 + }, + { + "epoch": 0.1, + "grad_norm": 0.49889811873435974, + "kl": 0.2833125591278076, + "learning_rate": 2.4900000000000003e-06, + "logits/chosen": 37073097.6, + "logits/rejected": 35158246.4, + "logps/chosen": -176.40118408203125, + "logps/rejected": -145.24967041015626, + "loss": 0.49751877784729004, + "rewards/chosen": 0.03347367346286774, + "rewards/margins": 0.01985820829868317, + "rewards/rejected": 0.01361546516418457, + "step": 250 + }, + { + "epoch": 0.104, + "grad_norm": 0.30958813428878784, + "kl": 0.262741357088089, + "learning_rate": 2.59e-06, + "logits/chosen": 30595280.0, + "logits/rejected": 29650486.4, + "logps/chosen": -129.1654296875, + "logps/rejected": -131.6908935546875, + "loss": 0.4989294528961182, + "rewards/chosen": 0.02159818708896637, + "rewards/margins": 0.008565258979797364, + "rewards/rejected": 0.013032928109169006, + "step": 260 + }, + { + "epoch": 0.108, + "grad_norm": 0.5211781859397888, + "kl": 0.45164403319358826, + "learning_rate": 2.6900000000000005e-06, + "logits/chosen": 39772844.8, + "logits/rejected": 40001942.4, + "logps/chosen": -157.1872314453125, + "logps/rejected": -151.93499755859375, + "loss": 0.498414421081543, + "rewards/chosen": 0.04374273419380188, + "rewards/margins": 0.012685334682464602, + "rewards/rejected": 0.03105739951133728, + "step": 270 + }, + { + "epoch": 0.112, + "grad_norm": 0.4277898073196411, + "kl": 0.4805964529514313, + "learning_rate": 2.7900000000000004e-06, + "logits/chosen": 33355596.8, + "logits/rejected": 27445552.0, + "logps/chosen": -167.81568603515626, + "logps/rejected": -140.00860595703125, + "loss": 0.5012622356414795, + "rewards/chosen": 0.035515934228897095, + "rewards/margins": -0.010110187530517581, + "rewards/rejected": 0.045626121759414676, + "step": 280 + }, + { + "epoch": 0.116, + "grad_norm": 0.34256765246391296, + "kl": 0.7766927480697632, + "learning_rate": 2.89e-06, + "logits/chosen": 31572070.4, + "logits/rejected": 30722460.8, + "logps/chosen": -145.83292236328126, + "logps/rejected": -147.77294921875, + "loss": 0.4969996452331543, + "rewards/chosen": 0.08674753904342651, + "rewards/margins": 0.024010515213012687, + "rewards/rejected": 0.06273702383041382, + "step": 290 + }, + { + "epoch": 0.12, + "grad_norm": 0.5291184782981873, + "kl": 0.9063084721565247, + "learning_rate": 2.99e-06, + "logits/chosen": 42822204.8, + "logits/rejected": 43076217.6, + "logps/chosen": -168.21903076171876, + "logps/rejected": -179.7796630859375, + "loss": 0.49880061149597166, + "rewards/chosen": 0.07154507040977479, + "rewards/margins": 0.009801769256591805, + "rewards/rejected": 0.06174330115318298, + "step": 300 + }, + { + "epoch": 0.124, + "grad_norm": 0.37273386120796204, + "kl": 0.7940840125083923, + "learning_rate": 3.09e-06, + "logits/chosen": 34380668.8, + "logits/rejected": 35391296.0, + "logps/chosen": -146.24031982421874, + "logps/rejected": -155.27100830078126, + "loss": 0.501917552947998, + "rewards/chosen": 0.06237313747406006, + "rewards/margins": -0.015355908870697023, + "rewards/rejected": 0.07772904634475708, + "step": 310 + }, + { + "epoch": 0.128, + "grad_norm": 0.606876015663147, + "kl": 0.3188292682170868, + "learning_rate": 3.1900000000000004e-06, + "logits/chosen": 38069395.2, + "logits/rejected": 35962320.0, + "logps/chosen": -160.331884765625, + "logps/rejected": -134.08292236328126, + "loss": 0.5004048347473145, + "rewards/chosen": -0.0012070264667272568, + "rewards/margins": -0.0032936643809080126, + "rewards/rejected": 0.0020866379141807555, + "step": 320 + }, + { + "epoch": 0.132, + "grad_norm": 0.37304550409317017, + "kl": 0.49732810258865356, + "learning_rate": 3.2900000000000003e-06, + "logits/chosen": 47658496.0, + "logits/rejected": 44939072.0, + "logps/chosen": -153.3678955078125, + "logps/rejected": -144.11009521484374, + "loss": 0.49539766311645506, + "rewards/chosen": 0.047433477640151975, + "rewards/margins": 0.036879205703735346, + "rewards/rejected": 0.010554271936416625, + "step": 330 + }, + { + "epoch": 0.136, + "grad_norm": 0.39304211735725403, + "kl": 0.6668508052825928, + "learning_rate": 3.3900000000000006e-06, + "logits/chosen": 40542966.4, + "logits/rejected": 41501622.4, + "logps/chosen": -134.35115966796874, + "logps/rejected": -143.7405029296875, + "loss": 0.500092887878418, + "rewards/chosen": 0.05502796769142151, + "rewards/margins": -0.0008756637573242229, + "rewards/rejected": 0.05590363144874573, + "step": 340 + }, + { + "epoch": 0.14, + "grad_norm": 0.40085893869400024, + "kl": 1.0664705038070679, + "learning_rate": 3.49e-06, + "logits/chosen": 38259638.4, + "logits/rejected": 37112304.0, + "logps/chosen": -134.30029296875, + "logps/rejected": -169.855859375, + "loss": 0.501033353805542, + "rewards/chosen": 0.09948489665985108, + "rewards/margins": -0.008264517784118644, + "rewards/rejected": 0.10774941444396972, + "step": 350 + }, + { + "epoch": 0.144, + "grad_norm": 0.3823564350605011, + "kl": 0.684799075126648, + "learning_rate": 3.5900000000000004e-06, + "logits/chosen": 41781318.4, + "logits/rejected": 39330675.2, + "logps/chosen": -147.53521728515625, + "logps/rejected": -112.3668701171875, + "loss": 0.4975080966949463, + "rewards/chosen": 0.07126325964927674, + "rewards/margins": 0.019923430681228642, + "rewards/rejected": 0.0513398289680481, + "step": 360 + }, + { + "epoch": 0.148, + "grad_norm": 0.3882247507572174, + "kl": 0.9012428522109985, + "learning_rate": 3.6900000000000002e-06, + "logits/chosen": 38998243.2, + "logits/rejected": 39052438.4, + "logps/chosen": -146.253564453125, + "logps/rejected": -150.4812744140625, + "loss": 0.5019874572753906, + "rewards/chosen": 0.07476127743721009, + "rewards/margins": -0.01590984463691711, + "rewards/rejected": 0.0906711220741272, + "step": 370 + }, + { + "epoch": 0.152, + "grad_norm": 0.6131926774978638, + "kl": 1.3055822849273682, + "learning_rate": 3.79e-06, + "logits/chosen": 34907878.4, + "logits/rejected": 35887766.4, + "logps/chosen": -155.17113037109374, + "logps/rejected": -166.9649658203125, + "loss": 0.501701831817627, + "rewards/chosen": 0.11749210357666015, + "rewards/margins": -0.013779759407043457, + "rewards/rejected": 0.1312718629837036, + "step": 380 + }, + { + "epoch": 0.156, + "grad_norm": 0.40174201130867004, + "kl": 1.3810280561447144, + "learning_rate": 3.89e-06, + "logits/chosen": 32044432.0, + "logits/rejected": 31293644.8, + "logps/chosen": -174.93670654296875, + "logps/rejected": -149.1033935546875, + "loss": 0.49690823554992675, + "rewards/chosen": 0.14782886505126952, + "rewards/margins": 0.024736273288726796, + "rewards/rejected": 0.12309259176254272, + "step": 390 + }, + { + "epoch": 0.16, + "grad_norm": 0.38557207584381104, + "kl": 1.346494197845459, + "learning_rate": 3.990000000000001e-06, + "logits/chosen": 43779216.0, + "logits/rejected": 41962329.6, + "logps/chosen": -148.337939453125, + "logps/rejected": -134.69471435546876, + "loss": 0.49627056121826174, + "rewards/chosen": 0.14792776107788086, + "rewards/margins": 0.029889833927154538, + "rewards/rejected": 0.11803792715072632, + "step": 400 + }, + { + "epoch": 0.16, + "eval_kl": 1.5156359672546387, + "eval_logits/chosen": 39535529.984, + "eval_logits/rejected": 39357890.56, + "eval_logps/chosen": -152.728359375, + "eval_logps/rejected": -147.235078125, + "eval_loss": 0.4976339638233185, + "eval_rewards/chosen": 0.15974119567871095, + "eval_rewards/margins": 0.01897695922851564, + "eval_rewards/rejected": 0.1407642364501953, + "eval_runtime": 217.4122, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 2.3, + "step": 400 + }, + { + "epoch": 0.164, + "grad_norm": 0.3281092047691345, + "kl": 1.7389549016952515, + "learning_rate": 4.09e-06, + "logits/chosen": 44133203.2, + "logits/rejected": 42124723.2, + "logps/chosen": -146.7071044921875, + "logps/rejected": -147.72525634765626, + "loss": 0.4964505672454834, + "rewards/chosen": 0.1881537079811096, + "rewards/margins": 0.028516340255737294, + "rewards/rejected": 0.15963736772537232, + "step": 410 + }, + { + "epoch": 0.168, + "grad_norm": 0.5361565947532654, + "kl": 1.5194333791732788, + "learning_rate": 4.1900000000000005e-06, + "logits/chosen": 35550390.4, + "logits/rejected": 33523052.8, + "logps/chosen": -161.477978515625, + "logps/rejected": -130.558740234375, + "loss": 0.4951611518859863, + "rewards/chosen": 0.16986674070358276, + "rewards/margins": 0.0387694001197815, + "rewards/rejected": 0.13109734058380126, + "step": 420 + }, + { + "epoch": 0.172, + "grad_norm": 0.3820321559906006, + "kl": 2.423292875289917, + "learning_rate": 4.2900000000000004e-06, + "logits/chosen": 38340924.8, + "logits/rejected": 34324185.6, + "logps/chosen": -168.2252685546875, + "logps/rejected": -117.912451171875, + "loss": 0.4928645133972168, + "rewards/chosen": 0.2708438873291016, + "rewards/margins": 0.05714957714080812, + "rewards/rejected": 0.21369431018829346, + "step": 430 + }, + { + "epoch": 0.176, + "grad_norm": 0.5985101461410522, + "kl": 3.3684749603271484, + "learning_rate": 4.39e-06, + "logits/chosen": 43800515.2, + "logits/rejected": 45902227.2, + "logps/chosen": -145.99703369140624, + "logps/rejected": -166.97867431640626, + "loss": 0.5005404472351074, + "rewards/chosen": 0.3345966339111328, + "rewards/margins": -0.004501628875732411, + "rewards/rejected": 0.3390982627868652, + "step": 440 + }, + { + "epoch": 0.18, + "grad_norm": 0.38066738843917847, + "kl": 3.7418315410614014, + "learning_rate": 4.49e-06, + "logits/chosen": 35791923.2, + "logits/rejected": 39352179.2, + "logps/chosen": -95.49920654296875, + "logps/rejected": -151.85885009765624, + "loss": 0.5004417419433593, + "rewards/chosen": 0.3723719596862793, + "rewards/margins": -0.00362257957458495, + "rewards/rejected": 0.37599453926086424, + "step": 450 + }, + { + "epoch": 0.184, + "grad_norm": 0.44641122221946716, + "kl": 4.462111473083496, + "learning_rate": 4.590000000000001e-06, + "logits/chosen": 47153696.0, + "logits/rejected": 45399382.4, + "logps/chosen": -160.3977294921875, + "logps/rejected": -176.209765625, + "loss": 0.5064189434051514, + "rewards/chosen": 0.420426607131958, + "rewards/margins": -0.05156884193420408, + "rewards/rejected": 0.4719954490661621, + "step": 460 + }, + { + "epoch": 0.188, + "grad_norm": 0.8194193840026855, + "kl": 3.437168836593628, + "learning_rate": 4.69e-06, + "logits/chosen": 56483507.2, + "logits/rejected": 53677011.2, + "logps/chosen": -170.1270751953125, + "logps/rejected": -174.05128173828126, + "loss": 0.4986457824707031, + "rewards/chosen": 0.3491526126861572, + "rewards/margins": 0.010871815681457508, + "rewards/rejected": 0.3382807970046997, + "step": 470 + }, + { + "epoch": 0.192, + "grad_norm": 0.4502066373825073, + "kl": 2.8006443977355957, + "learning_rate": 4.79e-06, + "logits/chosen": 44358038.4, + "logits/rejected": 43814537.6, + "logps/chosen": -149.608935546875, + "logps/rejected": -158.89376220703124, + "loss": 0.49710774421691895, + "rewards/chosen": 0.29175291061401365, + "rewards/margins": 0.02337703704833982, + "rewards/rejected": 0.26837587356567383, + "step": 480 + }, + { + "epoch": 0.196, + "grad_norm": 0.3172740638256073, + "kl": 2.634169816970825, + "learning_rate": 4.890000000000001e-06, + "logits/chosen": 30142633.6, + "logits/rejected": 28152640.0, + "logps/chosen": -146.2323974609375, + "logps/rejected": -144.8813720703125, + "loss": 0.49065570831298827, + "rewards/chosen": 0.30094659328460693, + "rewards/margins": 0.07505896091461181, + "rewards/rejected": 0.22588763236999512, + "step": 490 + }, + { + "epoch": 0.2, + "grad_norm": 0.5071095824241638, + "kl": 4.3001179695129395, + "learning_rate": 4.9900000000000005e-06, + "logits/chosen": 45352835.2, + "logits/rejected": 41344652.8, + "logps/chosen": -177.9995361328125, + "logps/rejected": -128.72022705078126, + "loss": 0.48739986419677733, + "rewards/chosen": 0.48075294494628906, + "rewards/margins": 0.10148224830627439, + "rewards/rejected": 0.37927069664001467, + "step": 500 + }, + { + "epoch": 0.204, + "grad_norm": 0.393530011177063, + "kl": 3.5296618938446045, + "learning_rate": 4.9900000000000005e-06, + "logits/chosen": 47881750.4, + "logits/rejected": 49526156.8, + "logps/chosen": -143.6640625, + "logps/rejected": -156.88994140625, + "loss": 0.49992995262145995, + "rewards/chosen": 0.3532871723175049, + "rewards/margins": 0.0006417512893676647, + "rewards/rejected": 0.35264542102813723, + "step": 510 + }, + { + "epoch": 0.208, + "grad_norm": 0.3692869544029236, + "kl": 4.48037576675415, + "learning_rate": 4.978888888888889e-06, + "logits/chosen": 46837849.6, + "logits/rejected": 45857177.6, + "logps/chosen": -154.83260498046874, + "logps/rejected": -160.442333984375, + "loss": 0.495820426940918, + "rewards/chosen": 0.464794921875, + "rewards/margins": 0.03351507186889646, + "rewards/rejected": 0.4312798500061035, + "step": 520 + }, + { + "epoch": 0.212, + "grad_norm": 0.44535931944847107, + "kl": 4.121534824371338, + "learning_rate": 4.967777777777778e-06, + "logits/chosen": 42945225.6, + "logits/rejected": 43357875.2, + "logps/chosen": -138.18310546875, + "logps/rejected": -172.6733154296875, + "loss": 0.5080226421356201, + "rewards/chosen": 0.3798489570617676, + "rewards/margins": -0.06460924148559571, + "rewards/rejected": 0.4444581985473633, + "step": 530 + }, + { + "epoch": 0.216, + "grad_norm": 0.5619053840637207, + "kl": 4.352797031402588, + "learning_rate": 4.956666666666667e-06, + "logits/chosen": 34937552.0, + "logits/rejected": 34883318.4, + "logps/chosen": -147.39837646484375, + "logps/rejected": -154.47596435546876, + "loss": 0.49129457473754884, + "rewards/chosen": 0.4700439929962158, + "rewards/margins": 0.0695285320281982, + "rewards/rejected": 0.4005154609680176, + "step": 540 + }, + { + "epoch": 0.22, + "grad_norm": 0.4256366193294525, + "kl": 3.3400237560272217, + "learning_rate": 4.945555555555557e-06, + "logits/chosen": 41670598.4, + "logits/rejected": 43236768.0, + "logps/chosen": -152.20511474609376, + "logps/rejected": -165.210205078125, + "loss": 0.4960598945617676, + "rewards/chosen": 0.3506686449050903, + "rewards/margins": 0.03333282470703125, + "rewards/rejected": 0.31733582019805906, + "step": 550 + }, + { + "epoch": 0.224, + "grad_norm": 0.42866551876068115, + "kl": 3.0413570404052734, + "learning_rate": 4.934444444444445e-06, + "logits/chosen": 36545302.4, + "logits/rejected": 34813177.6, + "logps/chosen": -161.16314697265625, + "logps/rejected": -148.3569091796875, + "loss": 0.4982303619384766, + "rewards/chosen": 0.2904952049255371, + "rewards/margins": 0.014188337326049794, + "rewards/rejected": 0.2763068675994873, + "step": 560 + }, + { + "epoch": 0.228, + "grad_norm": 0.3665854334831238, + "kl": 2.66752290725708, + "learning_rate": 4.923333333333334e-06, + "logits/chosen": 41975648.0, + "logits/rejected": 40743257.6, + "logps/chosen": -147.2247802734375, + "logps/rejected": -131.81417236328124, + "loss": 0.4888582706451416, + "rewards/chosen": 0.3010892391204834, + "rewards/margins": 0.08992741107940674, + "rewards/rejected": 0.21116182804107667, + "step": 570 + }, + { + "epoch": 0.232, + "grad_norm": 0.42764145135879517, + "kl": 2.8396944999694824, + "learning_rate": 4.912222222222223e-06, + "logits/chosen": 47665238.4, + "logits/rejected": 46761827.2, + "logps/chosen": -147.21837158203124, + "logps/rejected": -156.8475830078125, + "loss": 0.4951943874359131, + "rewards/chosen": 0.2681096315383911, + "rewards/margins": 0.03818519115447999, + "rewards/rejected": 0.22992444038391113, + "step": 580 + }, + { + "epoch": 0.236, + "grad_norm": 0.45218735933303833, + "kl": 2.9479668140411377, + "learning_rate": 4.901111111111112e-06, + "logits/chosen": 30179158.4, + "logits/rejected": 30914195.2, + "logps/chosen": -128.025927734375, + "logps/rejected": -133.37138671875, + "loss": 0.4864190101623535, + "rewards/chosen": 0.3341956615447998, + "rewards/margins": 0.1105940818786621, + "rewards/rejected": 0.2236015796661377, + "step": 590 + }, + { + "epoch": 0.24, + "grad_norm": 0.5611497759819031, + "kl": 2.7078356742858887, + "learning_rate": 4.890000000000001e-06, + "logits/chosen": 29134601.6, + "logits/rejected": 31641536.0, + "logps/chosen": -147.10302734375, + "logps/rejected": -148.70350341796876, + "loss": 0.5095005035400391, + "rewards/chosen": 0.1935347557067871, + "rewards/margins": -0.07780742645263675, + "rewards/rejected": 0.27134218215942385, + "step": 600 + }, + { + "epoch": 0.24, + "eval_kl": 2.6268301010131836, + "eval_logits/chosen": 38577520.64, + "eval_logits/rejected": 38429237.248, + "eval_logps/chosen": -151.8366875, + "eval_logps/rejected": -146.522453125, + "eval_loss": 0.49543091654777527, + "eval_rewards/chosen": 0.24890777587890625, + "eval_rewards/margins": 0.036881546020507805, + "eval_rewards/rejected": 0.21202622985839845, + "eval_runtime": 216.8269, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 2.306, + "step": 600 + }, + { + "epoch": 0.244, + "grad_norm": 0.43945616483688354, + "kl": 3.3616116046905518, + "learning_rate": 4.878888888888889e-06, + "logits/chosen": 45687324.8, + "logits/rejected": 41788883.2, + "logps/chosen": -193.040185546875, + "logps/rejected": -168.3222900390625, + "loss": 0.48148174285888673, + "rewards/chosen": 0.35201478004455566, + "rewards/margins": 0.17267082929611205, + "rewards/rejected": 0.17934395074844361, + "step": 610 + }, + { + "epoch": 0.248, + "grad_norm": 0.47497129440307617, + "kl": 3.055345058441162, + "learning_rate": 4.867777777777778e-06, + "logits/chosen": 27710918.4, + "logits/rejected": 26318662.4, + "logps/chosen": -138.644775390625, + "logps/rejected": -134.76346435546876, + "loss": 0.4866151809692383, + "rewards/chosen": 0.3442774772644043, + "rewards/margins": 0.10848057270050046, + "rewards/rejected": 0.23579690456390381, + "step": 620 + }, + { + "epoch": 0.252, + "grad_norm": 0.4793793559074402, + "kl": 3.6866455078125, + "learning_rate": 4.856666666666667e-06, + "logits/chosen": 39017129.6, + "logits/rejected": 41635366.4, + "logps/chosen": -139.60113525390625, + "logps/rejected": -171.87177734375, + "loss": 0.5033475399017334, + "rewards/chosen": 0.33596067428588866, + "rewards/margins": -0.02700204849243165, + "rewards/rejected": 0.3629627227783203, + "step": 630 + }, + { + "epoch": 0.256, + "grad_norm": 0.5821816921234131, + "kl": 3.2494399547576904, + "learning_rate": 4.845555555555556e-06, + "logits/chosen": 41812588.8, + "logits/rejected": 40030470.4, + "logps/chosen": -188.748583984375, + "logps/rejected": -149.2949951171875, + "loss": 0.4903052806854248, + "rewards/chosen": 0.3278029918670654, + "rewards/margins": 0.08044664859771727, + "rewards/rejected": 0.24735634326934813, + "step": 640 + }, + { + "epoch": 0.26, + "grad_norm": 0.4971711337566376, + "kl": 3.889043092727661, + "learning_rate": 4.834444444444445e-06, + "logits/chosen": 43703507.2, + "logits/rejected": 42196211.2, + "logps/chosen": -125.8577392578125, + "logps/rejected": -132.8236328125, + "loss": 0.4952712535858154, + "rewards/chosen": 0.3801560878753662, + "rewards/margins": 0.03659126758575437, + "rewards/rejected": 0.3435648202896118, + "step": 650 + }, + { + "epoch": 0.264, + "grad_norm": 0.37012672424316406, + "kl": 2.9793968200683594, + "learning_rate": 4.8233333333333335e-06, + "logits/chosen": 57068806.4, + "logits/rejected": 55593145.6, + "logps/chosen": -168.82774658203124, + "logps/rejected": -132.77369384765626, + "loss": 0.4897792339324951, + "rewards/chosen": 0.3094865083694458, + "rewards/margins": 0.08172969818115233, + "rewards/rejected": 0.22775681018829347, + "step": 660 + }, + { + "epoch": 0.268, + "grad_norm": 0.5025138258934021, + "kl": 3.803910493850708, + "learning_rate": 4.812222222222222e-06, + "logits/chosen": 42714249.6, + "logits/rejected": 43013395.2, + "logps/chosen": -180.80955810546874, + "logps/rejected": -182.71279296875, + "loss": 0.48125367164611815, + "rewards/chosen": 0.4121575832366943, + "rewards/margins": 0.15600955486297607, + "rewards/rejected": 0.25614802837371825, + "step": 670 + }, + { + "epoch": 0.272, + "grad_norm": 0.47364944219589233, + "kl": 2.744506597518921, + "learning_rate": 4.801111111111111e-06, + "logits/chosen": 41662313.6, + "logits/rejected": 40533548.8, + "logps/chosen": -143.998193359375, + "logps/rejected": -125.735693359375, + "loss": 0.493405818939209, + "rewards/chosen": 0.2640446662902832, + "rewards/margins": 0.05262007713317873, + "rewards/rejected": 0.2114245891571045, + "step": 680 + }, + { + "epoch": 0.276, + "grad_norm": 0.378603994846344, + "kl": 4.192839622497559, + "learning_rate": 4.79e-06, + "logits/chosen": 42741641.6, + "logits/rejected": 40729804.8, + "logps/chosen": -156.440087890625, + "logps/rejected": -175.48450927734376, + "loss": 0.4940618991851807, + "rewards/chosen": 0.42174320220947265, + "rewards/margins": 0.04788670539855955, + "rewards/rejected": 0.3738564968109131, + "step": 690 + }, + { + "epoch": 0.28, + "grad_norm": 0.4181530773639679, + "kl": 3.237612247467041, + "learning_rate": 4.778888888888889e-06, + "logits/chosen": 38295142.4, + "logits/rejected": 35257382.4, + "logps/chosen": -155.112744140625, + "logps/rejected": -143.0890869140625, + "loss": 0.49517078399658204, + "rewards/chosen": 0.2904268026351929, + "rewards/margins": 0.041890859603881836, + "rewards/rejected": 0.24853594303131105, + "step": 700 + }, + { + "epoch": 0.284, + "grad_norm": 0.41070079803466797, + "kl": 4.423883438110352, + "learning_rate": 4.767777777777778e-06, + "logits/chosen": 40968300.8, + "logits/rejected": 39222742.4, + "logps/chosen": -172.097021484375, + "logps/rejected": -133.7148193359375, + "loss": 0.4814108371734619, + "rewards/chosen": 0.5093639373779297, + "rewards/margins": 0.1534171581268311, + "rewards/rejected": 0.35594677925109863, + "step": 710 + }, + { + "epoch": 0.288, + "grad_norm": 0.374776691198349, + "kl": 3.082357406616211, + "learning_rate": 4.756666666666667e-06, + "logits/chosen": 34210566.4, + "logits/rejected": 35579948.8, + "logps/chosen": -120.33433837890625, + "logps/rejected": -122.443017578125, + "loss": 0.5063377857208252, + "rewards/chosen": 0.1743820548057556, + "rewards/margins": -0.07105478048324584, + "rewards/rejected": 0.24543683528900145, + "step": 720 + }, + { + "epoch": 0.292, + "grad_norm": 0.3790406882762909, + "kl": 3.781698226928711, + "learning_rate": 4.745555555555556e-06, + "logits/chosen": 37423852.8, + "logits/rejected": 34581126.4, + "logps/chosen": -152.25120849609374, + "logps/rejected": -154.15382080078126, + "loss": 0.493280029296875, + "rewards/chosen": 0.39404921531677245, + "rewards/margins": 0.05465142726898192, + "rewards/rejected": 0.3393977880477905, + "step": 730 + }, + { + "epoch": 0.296, + "grad_norm": 0.47025066614151, + "kl": 4.019077301025391, + "learning_rate": 4.734444444444445e-06, + "logits/chosen": 34749833.6, + "logits/rejected": 33696905.6, + "logps/chosen": -156.05477294921874, + "logps/rejected": -182.29608154296875, + "loss": 0.49593114852905273, + "rewards/chosen": 0.34479031562805174, + "rewards/margins": 0.04153461456298824, + "rewards/rejected": 0.3032557010650635, + "step": 740 + }, + { + "epoch": 0.3, + "grad_norm": 0.2891245484352112, + "kl": 2.7223880290985107, + "learning_rate": 4.7233333333333336e-06, + "logits/chosen": 39233993.6, + "logits/rejected": 38955433.6, + "logps/chosen": -159.49801025390624, + "logps/rejected": -183.92698974609374, + "loss": 0.5023125648498535, + "rewards/chosen": 0.09982055425643921, + "rewards/margins": 0.004025018215179449, + "rewards/rejected": 0.09579553604125976, + "step": 750 + }, + { + "epoch": 0.304, + "grad_norm": 0.4833555817604065, + "kl": 1.6150490045547485, + "learning_rate": 4.712222222222222e-06, + "logits/chosen": 36941795.2, + "logits/rejected": 37837513.6, + "logps/chosen": -115.98502197265626, + "logps/rejected": -125.23421630859374, + "loss": 0.487321662902832, + "rewards/chosen": 0.10154855251312256, + "rewards/margins": 0.10329384654760361, + "rewards/rejected": -0.0017452940344810485, + "step": 760 + }, + { + "epoch": 0.308, + "grad_norm": 0.48121944069862366, + "kl": 1.2522486448287964, + "learning_rate": 4.701111111111111e-06, + "logits/chosen": 44757888.0, + "logits/rejected": 45581593.6, + "logps/chosen": -143.2324951171875, + "logps/rejected": -153.2897216796875, + "loss": 0.4856001377105713, + "rewards/chosen": 0.008315862715244293, + "rewards/margins": 0.1255118027329445, + "rewards/rejected": -0.1171959400177002, + "step": 770 + }, + { + "epoch": 0.312, + "grad_norm": 0.4913221001625061, + "kl": 0.6294690370559692, + "learning_rate": 4.69e-06, + "logits/chosen": 41557488.0, + "logits/rejected": 42490796.8, + "logps/chosen": -163.6885498046875, + "logps/rejected": -145.68963623046875, + "loss": 0.48305044174194334, + "rewards/chosen": -0.08929510116577148, + "rewards/margins": 0.14230823516845703, + "rewards/rejected": -0.23160333633422853, + "step": 780 + }, + { + "epoch": 0.316, + "grad_norm": 0.4478524327278137, + "kl": 1.0363415479660034, + "learning_rate": 4.67888888888889e-06, + "logits/chosen": 31436777.6, + "logits/rejected": 27623001.6, + "logps/chosen": -147.31231689453125, + "logps/rejected": -139.38282470703126, + "loss": 0.478118371963501, + "rewards/chosen": -0.18460922241210936, + "rewards/margins": 0.20353126525878906, + "rewards/rejected": -0.3881404876708984, + "step": 790 + }, + { + "epoch": 0.32, + "grad_norm": 0.5910397171974182, + "kl": 0.9226576089859009, + "learning_rate": 4.6677777777777785e-06, + "logits/chosen": 28333865.6, + "logits/rejected": 28280502.4, + "logps/chosen": -150.2372802734375, + "logps/rejected": -135.2026123046875, + "loss": 0.5036224842071533, + "rewards/chosen": -0.28971683979034424, + "rewards/margins": -0.058632898330688465, + "rewards/rejected": -0.23108394145965577, + "step": 800 + }, + { + "epoch": 0.32, + "eval_kl": 1.187766432762146, + "eval_logits/chosen": 34635034.624, + "eval_logits/rejected": 34656247.808, + "eval_logps/chosen": -156.55328125, + "eval_logps/rejected": -151.3069375, + "eval_loss": 0.4932977855205536, + "eval_rewards/chosen": -0.2227491455078125, + "eval_rewards/margins": 0.04367276000976561, + "eval_rewards/rejected": -0.2664219055175781, + "eval_runtime": 216.6151, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 800 + }, + { + "epoch": 0.324, + "grad_norm": 0.5326458811759949, + "kl": 1.5910537242889404, + "learning_rate": 4.656666666666667e-06, + "logits/chosen": 38717907.2, + "logits/rejected": 39118412.8, + "logps/chosen": -159.7564453125, + "logps/rejected": -157.7111572265625, + "loss": 0.49022369384765624, + "rewards/chosen": 0.008166373521089555, + "rewards/margins": 0.09110548570752143, + "rewards/rejected": -0.08293911218643188, + "step": 810 + }, + { + "epoch": 0.328, + "grad_norm": 0.4700395464897156, + "kl": 1.0638387203216553, + "learning_rate": 4.645555555555556e-06, + "logits/chosen": 24480648.0, + "logits/rejected": 24138777.6, + "logps/chosen": -157.56463623046875, + "logps/rejected": -129.4783935546875, + "loss": 0.5027867794036865, + "rewards/chosen": -0.3186595916748047, + "rewards/margins": -0.04232857227325437, + "rewards/rejected": -0.2763310194015503, + "step": 820 + }, + { + "epoch": 0.332, + "grad_norm": 0.5291322469711304, + "kl": 1.9198650121688843, + "learning_rate": 4.634444444444445e-06, + "logits/chosen": 31482019.2, + "logits/rejected": 30204611.2, + "logps/chosen": -160.241064453125, + "logps/rejected": -127.0677490234375, + "loss": 0.49699864387512205, + "rewards/chosen": -0.03406925797462464, + "rewards/margins": 0.020440274477005, + "rewards/rejected": -0.054509532451629636, + "step": 830 + }, + { + "epoch": 0.336, + "grad_norm": 0.5016227960586548, + "kl": 1.5567735433578491, + "learning_rate": 4.623333333333334e-06, + "logits/chosen": 39855260.8, + "logits/rejected": 39533484.8, + "logps/chosen": -151.4185302734375, + "logps/rejected": -151.45257568359375, + "loss": 0.4913910388946533, + "rewards/chosen": -0.0018289029598236085, + "rewards/margins": 0.05514721870422363, + "rewards/rejected": -0.05697612166404724, + "step": 840 + }, + { + "epoch": 0.34, + "grad_norm": 0.5913689732551575, + "kl": 2.8981661796569824, + "learning_rate": 4.6122222222222225e-06, + "logits/chosen": 39872361.6, + "logits/rejected": 38837766.4, + "logps/chosen": -168.816357421875, + "logps/rejected": -189.29534912109375, + "loss": 0.5085726261138916, + "rewards/chosen": -0.016373127698898315, + "rewards/margins": -0.10572689771652222, + "rewards/rejected": 0.0893537700176239, + "step": 850 + }, + { + "epoch": 0.344, + "grad_norm": 0.4795176088809967, + "kl": 1.9737087488174438, + "learning_rate": 4.601111111111112e-06, + "logits/chosen": 31763212.8, + "logits/rejected": 30045996.8, + "logps/chosen": -165.5231689453125, + "logps/rejected": -144.56939697265625, + "loss": 0.4861030101776123, + "rewards/chosen": -0.004905380308628082, + "rewards/margins": 0.10866030305624008, + "rewards/rejected": -0.11356568336486816, + "step": 860 + }, + { + "epoch": 0.348, + "grad_norm": 0.5342633128166199, + "kl": 0.8907498121261597, + "learning_rate": 4.590000000000001e-06, + "logits/chosen": 36357731.2, + "logits/rejected": 34329561.6, + "logps/chosen": -139.342138671875, + "logps/rejected": -147.3021728515625, + "loss": 0.4868171691894531, + "rewards/chosen": -0.3624546766281128, + "rewards/margins": 0.06923034191131588, + "rewards/rejected": -0.4316850185394287, + "step": 870 + }, + { + "epoch": 0.352, + "grad_norm": 0.4725877046585083, + "kl": 0.8254868388175964, + "learning_rate": 4.57888888888889e-06, + "logits/chosen": 31605561.6, + "logits/rejected": 30546553.6, + "logps/chosen": -167.25799560546875, + "logps/rejected": -137.0789794921875, + "loss": 0.4910862922668457, + "rewards/chosen": -0.2911639451980591, + "rewards/margins": 0.12216508388519287, + "rewards/rejected": -0.413329029083252, + "step": 880 + }, + { + "epoch": 0.356, + "grad_norm": 0.40740740299224854, + "kl": 0.5278605222702026, + "learning_rate": 4.5677777777777786e-06, + "logits/chosen": 42018956.8, + "logits/rejected": 41023542.4, + "logps/chosen": -135.9039306640625, + "logps/rejected": -137.1427001953125, + "loss": 0.48065881729125975, + "rewards/chosen": -0.33759872913360595, + "rewards/margins": 0.1944932222366333, + "rewards/rejected": -0.5320919513702392, + "step": 890 + }, + { + "epoch": 0.36, + "grad_norm": 0.506528377532959, + "kl": 0.6414504647254944, + "learning_rate": 4.556666666666667e-06, + "logits/chosen": 37310515.2, + "logits/rejected": 38113939.2, + "logps/chosen": -177.546728515625, + "logps/rejected": -165.55384521484376, + "loss": 0.4943400382995605, + "rewards/chosen": -0.6260869026184082, + "rewards/margins": -0.024873304367065363, + "rewards/rejected": -0.6012135982513428, + "step": 900 + }, + { + "epoch": 0.364, + "grad_norm": 0.4898208975791931, + "kl": 0.6840685606002808, + "learning_rate": 4.545555555555556e-06, + "logits/chosen": 31999705.6, + "logits/rejected": 32116691.2, + "logps/chosen": -168.6631591796875, + "logps/rejected": -174.8232421875, + "loss": 0.49452638626098633, + "rewards/chosen": -0.4872349739074707, + "rewards/margins": 0.17413578033447263, + "rewards/rejected": -0.6613707542419434, + "step": 910 + }, + { + "epoch": 0.368, + "grad_norm": 0.42078927159309387, + "kl": 0.8319946527481079, + "learning_rate": 4.534444444444445e-06, + "logits/chosen": 33578412.8, + "logits/rejected": 30967862.4, + "logps/chosen": -136.42371826171876, + "logps/rejected": -151.80224609375, + "loss": 0.4958657741546631, + "rewards/chosen": -0.4272448539733887, + "rewards/margins": 0.1275300025939941, + "rewards/rejected": -0.5547748565673828, + "step": 920 + }, + { + "epoch": 0.372, + "grad_norm": 0.4784943163394928, + "kl": 1.0423786640167236, + "learning_rate": 4.523333333333334e-06, + "logits/chosen": 32027404.8, + "logits/rejected": 32098508.8, + "logps/chosen": -155.27005615234376, + "logps/rejected": -149.925537109375, + "loss": 0.48459978103637696, + "rewards/chosen": -0.22732582092285156, + "rewards/margins": 0.15194621086120605, + "rewards/rejected": -0.3792720317840576, + "step": 930 + }, + { + "epoch": 0.376, + "grad_norm": 0.3698587119579315, + "kl": 1.5603997707366943, + "learning_rate": 4.512222222222223e-06, + "logits/chosen": 27671084.8, + "logits/rejected": 25976814.4, + "logps/chosen": -136.42069091796876, + "logps/rejected": -152.234033203125, + "loss": 0.4825894355773926, + "rewards/chosen": -0.22695178985595704, + "rewards/margins": 0.20094985961914064, + "rewards/rejected": -0.4279016494750977, + "step": 940 + }, + { + "epoch": 0.38, + "grad_norm": 0.40601083636283875, + "kl": 2.8003079891204834, + "learning_rate": 4.501111111111111e-06, + "logits/chosen": 40676198.4, + "logits/rejected": 43776691.2, + "logps/chosen": -183.81981201171874, + "logps/rejected": -158.6890380859375, + "loss": 0.4809588432312012, + "rewards/chosen": -0.0009134054183959961, + "rewards/margins": 0.13371984958648683, + "rewards/rejected": -0.13463325500488282, + "step": 950 + }, + { + "epoch": 0.384, + "grad_norm": 0.5407139658927917, + "kl": 2.086998224258423, + "learning_rate": 4.49e-06, + "logits/chosen": 37516128.0, + "logits/rejected": 39039443.2, + "logps/chosen": -137.39747314453126, + "logps/rejected": -150.29815673828125, + "loss": 0.5065193176269531, + "rewards/chosen": -0.06897132396697998, + "rewards/margins": -0.031788992881774905, + "rewards/rejected": -0.03718233108520508, + "step": 960 + }, + { + "epoch": 0.388, + "grad_norm": 0.4142342209815979, + "kl": 1.7635319232940674, + "learning_rate": 4.478888888888889e-06, + "logits/chosen": 32190848.0, + "logits/rejected": 30587993.6, + "logps/chosen": -135.15673828125, + "logps/rejected": -118.86663818359375, + "loss": 0.48531031608581543, + "rewards/chosen": -0.007679381966590881, + "rewards/margins": 0.15075800120830538, + "rewards/rejected": -0.15843738317489625, + "step": 970 + }, + { + "epoch": 0.392, + "grad_norm": 0.6221582889556885, + "kl": 3.294914722442627, + "learning_rate": 4.467777777777778e-06, + "logits/chosen": 39407718.4, + "logits/rejected": 39707820.8, + "logps/chosen": -138.0949462890625, + "logps/rejected": -157.43453369140624, + "loss": 0.4826664447784424, + "rewards/chosen": 0.24659197330474852, + "rewards/margins": 0.17039816975593566, + "rewards/rejected": 0.07619380354881286, + "step": 980 + }, + { + "epoch": 0.396, + "grad_norm": 0.5434563755989075, + "kl": 1.082279920578003, + "learning_rate": 4.456666666666667e-06, + "logits/chosen": 31821408.0, + "logits/rejected": 31118764.8, + "logps/chosen": -127.49896240234375, + "logps/rejected": -136.4136962890625, + "loss": 0.48195528984069824, + "rewards/chosen": -0.16623904705047607, + "rewards/margins": 0.15431931018829345, + "rewards/rejected": -0.3205583572387695, + "step": 990 + }, + { + "epoch": 0.4, + "grad_norm": 0.37851211428642273, + "kl": 1.7661035060882568, + "learning_rate": 4.4455555555555554e-06, + "logits/chosen": 31584358.4, + "logits/rejected": 32753641.6, + "logps/chosen": -159.625048828125, + "logps/rejected": -122.3124267578125, + "loss": 0.49321880340576174, + "rewards/chosen": -0.08240060806274414, + "rewards/margins": 0.045932340621948245, + "rewards/rejected": -0.1283329486846924, + "step": 1000 + }, + { + "epoch": 0.4, + "eval_kl": 1.6750891208648682, + "eval_logits/chosen": 33939980.288, + "eval_logits/rejected": 34026332.16, + "eval_logps/chosen": -156.401671875, + "eval_logps/rejected": -151.40996875, + "eval_loss": 0.4904634356498718, + "eval_rewards/chosen": -0.20758909606933593, + "eval_rewards/margins": 0.06913508605957033, + "eval_rewards/rejected": -0.27672418212890626, + "eval_runtime": 216.882, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 2.305, + "step": 1000 + }, + { + "epoch": 0.404, + "grad_norm": 0.6070407032966614, + "kl": 2.5559988021850586, + "learning_rate": 4.434444444444444e-06, + "logits/chosen": 27506348.8, + "logits/rejected": 30075392.0, + "logps/chosen": -138.08692626953126, + "logps/rejected": -187.8861083984375, + "loss": 0.4917243480682373, + "rewards/chosen": -0.13809033632278442, + "rewards/margins": 0.05099650621414184, + "rewards/rejected": -0.18908684253692626, + "step": 1010 + }, + { + "epoch": 0.408, + "grad_norm": 0.5809450149536133, + "kl": 2.151444911956787, + "learning_rate": 4.423333333333334e-06, + "logits/chosen": 28877452.8, + "logits/rejected": 28118339.2, + "logps/chosen": -154.03690185546876, + "logps/rejected": -139.09736328125, + "loss": 0.4888266086578369, + "rewards/chosen": -0.16468460559844972, + "rewards/margins": 0.13210070133209229, + "rewards/rejected": -0.296785306930542, + "step": 1020 + }, + { + "epoch": 0.412, + "grad_norm": 0.4774882197380066, + "kl": 2.590153217315674, + "learning_rate": 4.412222222222223e-06, + "logits/chosen": 33996892.8, + "logits/rejected": 33160086.4, + "logps/chosen": -154.9474609375, + "logps/rejected": -153.31983642578126, + "loss": 0.47945427894592285, + "rewards/chosen": -0.007390469312667847, + "rewards/margins": 0.16412567496299743, + "rewards/rejected": -0.17151614427566528, + "step": 1030 + }, + { + "epoch": 0.416, + "grad_norm": 0.5751529335975647, + "kl": 2.692246913909912, + "learning_rate": 4.4011111111111115e-06, + "logits/chosen": 27060502.4, + "logits/rejected": 26739710.4, + "logps/chosen": -191.5754150390625, + "logps/rejected": -115.40594482421875, + "loss": 0.504734468460083, + "rewards/chosen": -0.14675636291503907, + "rewards/margins": -0.07811862826347352, + "rewards/rejected": -0.06863773465156556, + "step": 1040 + }, + { + "epoch": 0.42, + "grad_norm": 0.5552707314491272, + "kl": 2.183612108230591, + "learning_rate": 4.39e-06, + "logits/chosen": 30119641.6, + "logits/rejected": 27840016.0, + "logps/chosen": -129.14801025390625, + "logps/rejected": -159.921337890625, + "loss": 0.4975595951080322, + "rewards/chosen": -0.11868793964385986, + "rewards/margins": 0.03744263648986816, + "rewards/rejected": -0.15613057613372802, + "step": 1050 + }, + { + "epoch": 0.424, + "grad_norm": 0.4398513436317444, + "kl": 2.019963026046753, + "learning_rate": 4.378888888888889e-06, + "logits/chosen": 39605126.4, + "logits/rejected": 37338668.8, + "logps/chosen": -173.10638427734375, + "logps/rejected": -188.400146484375, + "loss": 0.5177321434020996, + "rewards/chosen": -0.252044153213501, + "rewards/margins": -0.1198780655860901, + "rewards/rejected": -0.1321660876274109, + "step": 1060 + }, + { + "epoch": 0.428, + "grad_norm": 0.6157165765762329, + "kl": 1.4567959308624268, + "learning_rate": 4.367777777777778e-06, + "logits/chosen": 31087238.4, + "logits/rejected": 32085881.6, + "logps/chosen": -145.3509521484375, + "logps/rejected": -170.3815185546875, + "loss": 0.4887071132659912, + "rewards/chosen": -0.3054164171218872, + "rewards/margins": 0.236836838722229, + "rewards/rejected": -0.5422532558441162, + "step": 1070 + }, + { + "epoch": 0.432, + "grad_norm": 0.3502284288406372, + "kl": 0.8787339925765991, + "learning_rate": 4.356666666666667e-06, + "logits/chosen": 34486451.2, + "logits/rejected": 36169574.4, + "logps/chosen": -158.5971923828125, + "logps/rejected": -139.94036865234375, + "loss": 0.5054500579833985, + "rewards/chosen": -0.45778846740722656, + "rewards/margins": -0.09344666004180907, + "rewards/rejected": -0.3643418073654175, + "step": 1080 + }, + { + "epoch": 0.436, + "grad_norm": 0.624359667301178, + "kl": 0.525427520275116, + "learning_rate": 4.3455555555555555e-06, + "logits/chosen": 26498083.2, + "logits/rejected": 25839392.0, + "logps/chosen": -149.42689208984376, + "logps/rejected": -118.26563720703125, + "loss": 0.5076635360717774, + "rewards/chosen": -0.557512617111206, + "rewards/margins": -0.06046972274780271, + "rewards/rejected": -0.4970428943634033, + "step": 1090 + }, + { + "epoch": 0.44, + "grad_norm": 0.549114465713501, + "kl": 0.8173803091049194, + "learning_rate": 4.334444444444445e-06, + "logits/chosen": 34397792.0, + "logits/rejected": 33410729.6, + "logps/chosen": -140.214111328125, + "logps/rejected": -176.107958984375, + "loss": 0.48507490158081057, + "rewards/chosen": -0.4220071792602539, + "rewards/margins": 0.19281878471374514, + "rewards/rejected": -0.614825963973999, + "step": 1100 + }, + { + "epoch": 0.444, + "grad_norm": 0.5036312937736511, + "kl": 0.7975673675537109, + "learning_rate": 4.323333333333334e-06, + "logits/chosen": 36466489.6, + "logits/rejected": 38251465.6, + "logps/chosen": -120.687255859375, + "logps/rejected": -187.36099853515626, + "loss": 0.5016714572906494, + "rewards/chosen": -0.4559361457824707, + "rewards/margins": -0.0054581642150878795, + "rewards/rejected": -0.45047798156738283, + "step": 1110 + }, + { + "epoch": 0.448, + "grad_norm": 0.5358121395111084, + "kl": 1.3897031545639038, + "learning_rate": 4.312222222222223e-06, + "logits/chosen": 46269334.4, + "logits/rejected": 45639856.0, + "logps/chosen": -151.86192626953124, + "logps/rejected": -165.55286865234376, + "loss": 0.4728604793548584, + "rewards/chosen": -0.23922853469848632, + "rewards/margins": 0.28858757019042974, + "rewards/rejected": -0.5278161048889161, + "step": 1120 + }, + { + "epoch": 0.452, + "grad_norm": 0.5269862413406372, + "kl": 1.1441795825958252, + "learning_rate": 4.301111111111112e-06, + "logits/chosen": 35708649.6, + "logits/rejected": 34836294.4, + "logps/chosen": -183.39061279296874, + "logps/rejected": -150.405078125, + "loss": 0.4849276065826416, + "rewards/chosen": -0.40422697067260743, + "rewards/margins": 0.15558710098266598, + "rewards/rejected": -0.5598140716552734, + "step": 1130 + }, + { + "epoch": 0.456, + "grad_norm": 0.3800269067287445, + "kl": 0.8876265287399292, + "learning_rate": 4.2900000000000004e-06, + "logits/chosen": 22079545.6, + "logits/rejected": 22083444.8, + "logps/chosen": -109.09158935546876, + "logps/rejected": -147.06978759765624, + "loss": 0.4905365467071533, + "rewards/chosen": -0.43719801902770994, + "rewards/margins": 0.08943343162536621, + "rewards/rejected": -0.5266314506530761, + "step": 1140 + }, + { + "epoch": 0.46, + "grad_norm": 0.4693025052547455, + "kl": 0.49091872572898865, + "learning_rate": 4.278888888888889e-06, + "logits/chosen": 42701616.0, + "logits/rejected": 40578803.2, + "logps/chosen": -223.8361328125, + "logps/rejected": -172.5617431640625, + "loss": 0.4969001293182373, + "rewards/chosen": -0.9799749374389648, + "rewards/margins": 0.08639993667602552, + "rewards/rejected": -1.0663748741149903, + "step": 1150 + }, + { + "epoch": 0.464, + "grad_norm": 0.4174056053161621, + "kl": 0.7064284682273865, + "learning_rate": 4.267777777777778e-06, + "logits/chosen": 26652801.6, + "logits/rejected": 24801236.8, + "logps/chosen": -138.50279541015624, + "logps/rejected": -171.82225341796874, + "loss": 0.4671950817108154, + "rewards/chosen": -0.5919324398040772, + "rewards/margins": 0.4853674411773682, + "rewards/rejected": -1.0772998809814454, + "step": 1160 + }, + { + "epoch": 0.468, + "grad_norm": 0.629512369632721, + "kl": 1.179760217666626, + "learning_rate": 4.256666666666668e-06, + "logits/chosen": 28567804.8, + "logits/rejected": 29090739.2, + "logps/chosen": -140.1174560546875, + "logps/rejected": -165.07166748046876, + "loss": 0.49239435195922854, + "rewards/chosen": -0.8533164024353027, + "rewards/margins": -0.06229524612426751, + "rewards/rejected": -0.7910211563110352, + "step": 1170 + }, + { + "epoch": 0.472, + "grad_norm": 0.4868221580982208, + "kl": 0.9739119410514832, + "learning_rate": 4.2455555555555565e-06, + "logits/chosen": 30410720.0, + "logits/rejected": 28420300.8, + "logps/chosen": -140.90198974609376, + "logps/rejected": -170.1091552734375, + "loss": 0.48679437637329104, + "rewards/chosen": -0.551117992401123, + "rewards/margins": 0.2831212997436524, + "rewards/rejected": -0.8342392921447754, + "step": 1180 + }, + { + "epoch": 0.476, + "grad_norm": 0.47686856985092163, + "kl": 0.48265019059181213, + "learning_rate": 4.234444444444445e-06, + "logits/chosen": 29930240.0, + "logits/rejected": 25699152.0, + "logps/chosen": -184.52239990234375, + "logps/rejected": -187.303173828125, + "loss": 0.4651634693145752, + "rewards/chosen": -0.6415619850158691, + "rewards/margins": 0.5583641052246093, + "rewards/rejected": -1.1999260902404785, + "step": 1190 + }, + { + "epoch": 0.48, + "grad_norm": 0.49188584089279175, + "kl": 0.6842840909957886, + "learning_rate": 4.223333333333334e-06, + "logits/chosen": 30387222.4, + "logits/rejected": 28950054.4, + "logps/chosen": -146.3315673828125, + "logps/rejected": -159.7215576171875, + "loss": 0.4879584789276123, + "rewards/chosen": -0.608671236038208, + "rewards/margins": 0.14971170425415037, + "rewards/rejected": -0.7583829402923584, + "step": 1200 + }, + { + "epoch": 0.48, + "eval_kl": 0.6333972215652466, + "eval_logits/chosen": 31495737.344, + "eval_logits/rejected": 31723335.68, + "eval_logps/chosen": -160.925171875, + "eval_logps/rejected": -156.03046875, + "eval_loss": 0.48919567465782166, + "eval_rewards/chosen": -0.659940185546875, + "eval_rewards/margins": 0.07883386230468759, + "eval_rewards/rejected": -0.7387740478515625, + "eval_runtime": 217.7778, + "eval_samples_per_second": 4.592, + "eval_steps_per_second": 2.296, + "step": 1200 + }, + { + "epoch": 0.484, + "grad_norm": 0.5108934640884399, + "kl": 0.7764253616333008, + "learning_rate": 4.212222222222223e-06, + "logits/chosen": 22495624.0, + "logits/rejected": 24596646.4, + "logps/chosen": -138.95447998046876, + "logps/rejected": -143.8316162109375, + "loss": 0.5000998020172119, + "rewards/chosen": -0.806338119506836, + "rewards/margins": 0.04378585815429681, + "rewards/rejected": -0.8501239776611328, + "step": 1210 + }, + { + "epoch": 0.488, + "grad_norm": 0.5415228009223938, + "kl": 1.1236222982406616, + "learning_rate": 4.201111111111112e-06, + "logits/chosen": 31318569.6, + "logits/rejected": 28306940.8, + "logps/chosen": -183.03011474609374, + "logps/rejected": -194.442236328125, + "loss": 0.4961515426635742, + "rewards/chosen": -0.552086067199707, + "rewards/margins": 0.12537789344787598, + "rewards/rejected": -0.677463960647583, + "step": 1220 + }, + { + "epoch": 0.492, + "grad_norm": 0.4574231505393982, + "kl": 1.6093800067901611, + "learning_rate": 4.1900000000000005e-06, + "logits/chosen": 28117484.8, + "logits/rejected": 27773507.2, + "logps/chosen": -148.87581787109374, + "logps/rejected": -157.650537109375, + "loss": 0.48798060417175293, + "rewards/chosen": -0.22223844528198242, + "rewards/margins": 0.0932478666305542, + "rewards/rejected": -0.3154863119125366, + "step": 1230 + }, + { + "epoch": 0.496, + "grad_norm": 0.455790638923645, + "kl": 1.5425220727920532, + "learning_rate": 4.178888888888889e-06, + "logits/chosen": 30485878.4, + "logits/rejected": 30765398.4, + "logps/chosen": -123.1314208984375, + "logps/rejected": -120.400634765625, + "loss": 0.4938685894012451, + "rewards/chosen": -0.15957858562469482, + "rewards/margins": 0.033807253837585455, + "rewards/rejected": -0.19338583946228027, + "step": 1240 + }, + { + "epoch": 0.5, + "grad_norm": 0.5282999873161316, + "kl": 1.4079266786575317, + "learning_rate": 4.167777777777778e-06, + "logits/chosen": 22162673.6, + "logits/rejected": 22915948.8, + "logps/chosen": -113.82423095703125, + "logps/rejected": -131.29176025390626, + "loss": 0.49029102325439455, + "rewards/chosen": -0.14798271656036377, + "rewards/margins": 0.09533922672271727, + "rewards/rejected": -0.24332194328308104, + "step": 1250 + }, + { + "epoch": 0.504, + "grad_norm": 0.5007496476173401, + "kl": 1.7635902166366577, + "learning_rate": 4.156666666666667e-06, + "logits/chosen": 27436982.4, + "logits/rejected": 27643766.4, + "logps/chosen": -147.7771484375, + "logps/rejected": -167.73524169921876, + "loss": 0.46639671325683596, + "rewards/chosen": -0.29362332820892334, + "rewards/margins": 0.3400294542312622, + "rewards/rejected": -0.6336527824401855, + "step": 1260 + }, + { + "epoch": 0.508, + "grad_norm": 0.4727869927883148, + "kl": 1.2690056562423706, + "learning_rate": 4.145555555555556e-06, + "logits/chosen": 29958118.4, + "logits/rejected": 26773496.0, + "logps/chosen": -172.3375732421875, + "logps/rejected": -155.7744140625, + "loss": 0.4935513973236084, + "rewards/chosen": -0.3117243528366089, + "rewards/margins": 0.08564956188201905, + "rewards/rejected": -0.39737391471862793, + "step": 1270 + }, + { + "epoch": 0.512, + "grad_norm": 0.4609099328517914, + "kl": 1.781589150428772, + "learning_rate": 4.1344444444444446e-06, + "logits/chosen": 45966684.8, + "logits/rejected": 46560012.8, + "logps/chosen": -154.45379638671875, + "logps/rejected": -171.34287109375, + "loss": 0.49072775840759275, + "rewards/chosen": -0.026087772846221925, + "rewards/margins": 0.12879917621612547, + "rewards/rejected": -0.1548869490623474, + "step": 1280 + }, + { + "epoch": 0.516, + "grad_norm": 0.5082091093063354, + "kl": 1.657065749168396, + "learning_rate": 4.123333333333333e-06, + "logits/chosen": 27531948.8, + "logits/rejected": 28266195.2, + "logps/chosen": -133.0970947265625, + "logps/rejected": -141.93575439453124, + "loss": 0.4872725486755371, + "rewards/chosen": -0.0539365291595459, + "rewards/margins": 0.13880285024642947, + "rewards/rejected": -0.19273937940597535, + "step": 1290 + }, + { + "epoch": 0.52, + "grad_norm": 0.5041593909263611, + "kl": 2.0696568489074707, + "learning_rate": 4.112222222222222e-06, + "logits/chosen": 37664678.4, + "logits/rejected": 34784227.2, + "logps/chosen": -183.206103515625, + "logps/rejected": -145.39962158203124, + "loss": 0.48463997840881345, + "rewards/chosen": -0.03633859157562256, + "rewards/margins": 0.14437620639801027, + "rewards/rejected": -0.18071479797363282, + "step": 1300 + }, + { + "epoch": 0.524, + "grad_norm": 0.6096036434173584, + "kl": 2.0778517723083496, + "learning_rate": 4.101111111111111e-06, + "logits/chosen": 30281945.6, + "logits/rejected": 30007484.8, + "logps/chosen": -140.6286865234375, + "logps/rejected": -148.30921630859376, + "loss": 0.49010205268859863, + "rewards/chosen": -0.1496596097946167, + "rewards/margins": 0.10425436496734616, + "rewards/rejected": -0.25391397476196287, + "step": 1310 + }, + { + "epoch": 0.528, + "grad_norm": 0.3967672884464264, + "kl": 3.4023184776306152, + "learning_rate": 4.09e-06, + "logits/chosen": 38450940.8, + "logits/rejected": 36835715.2, + "logps/chosen": -149.884423828125, + "logps/rejected": -155.450439453125, + "loss": 0.455477237701416, + "rewards/chosen": 0.22590782642364501, + "rewards/margins": 0.39357452392578124, + "rewards/rejected": -0.16766669750213622, + "step": 1320 + }, + { + "epoch": 0.532, + "grad_norm": 0.39660006761550903, + "kl": 1.7329654693603516, + "learning_rate": 4.0788888888888895e-06, + "logits/chosen": 29744569.6, + "logits/rejected": 30137328.0, + "logps/chosen": -155.14593505859375, + "logps/rejected": -160.4675048828125, + "loss": 0.4830836296081543, + "rewards/chosen": -0.31779026985168457, + "rewards/margins": 0.1599587440490723, + "rewards/rejected": -0.47774901390075686, + "step": 1330 + }, + { + "epoch": 0.536, + "grad_norm": 0.6326448917388916, + "kl": 2.0254123210906982, + "learning_rate": 4.067777777777778e-06, + "logits/chosen": 26790800.0, + "logits/rejected": 28456883.2, + "logps/chosen": -151.97984619140624, + "logps/rejected": -130.97991943359375, + "loss": 0.4777104377746582, + "rewards/chosen": -0.04423903226852417, + "rewards/margins": 0.20697282552719115, + "rewards/rejected": -0.2512118577957153, + "step": 1340 + }, + { + "epoch": 0.54, + "grad_norm": 0.4449482858181, + "kl": 1.457157015800476, + "learning_rate": 4.056666666666667e-06, + "logits/chosen": 29013564.8, + "logits/rejected": 28593001.6, + "logps/chosen": -128.33466796875, + "logps/rejected": -122.49200439453125, + "loss": 0.4760580539703369, + "rewards/chosen": -0.19824122190475463, + "rewards/margins": 0.20185590982437135, + "rewards/rejected": -0.400097131729126, + "step": 1350 + }, + { + "epoch": 0.544, + "grad_norm": 0.45084336400032043, + "kl": 3.831247329711914, + "learning_rate": 4.045555555555556e-06, + "logits/chosen": 31035744.0, + "logits/rejected": 32034198.4, + "logps/chosen": -164.5341064453125, + "logps/rejected": -147.6629638671875, + "loss": 0.48288540840148925, + "rewards/chosen": 0.16254035234451295, + "rewards/margins": 0.1516798198223114, + "rewards/rejected": 0.010860532522201538, + "step": 1360 + }, + { + "epoch": 0.548, + "grad_norm": 0.5451259613037109, + "kl": 3.273149013519287, + "learning_rate": 4.034444444444445e-06, + "logits/chosen": 28394259.2, + "logits/rejected": 25613750.4, + "logps/chosen": -176.49615478515625, + "logps/rejected": -159.41533203125, + "loss": 0.463987922668457, + "rewards/chosen": 0.05027390718460083, + "rewards/margins": 0.3991087079048157, + "rewards/rejected": -0.34883480072021483, + "step": 1370 + }, + { + "epoch": 0.552, + "grad_norm": 0.4214652180671692, + "kl": 2.222465991973877, + "learning_rate": 4.0233333333333335e-06, + "logits/chosen": 34603212.8, + "logits/rejected": 34498118.4, + "logps/chosen": -148.70673828125, + "logps/rejected": -138.68626708984374, + "loss": 0.4929951667785645, + "rewards/chosen": -0.12773821353912354, + "rewards/margins": 0.0252701163291931, + "rewards/rejected": -0.15300832986831664, + "step": 1380 + }, + { + "epoch": 0.556, + "grad_norm": 0.5307957530021667, + "kl": 2.981513500213623, + "learning_rate": 4.012222222222222e-06, + "logits/chosen": 39500022.4, + "logits/rejected": 41076224.0, + "logps/chosen": -156.5267333984375, + "logps/rejected": -168.4097900390625, + "loss": 0.5046597480773926, + "rewards/chosen": 0.03251245319843292, + "rewards/margins": -0.05116569101810456, + "rewards/rejected": 0.08367814421653748, + "step": 1390 + }, + { + "epoch": 0.56, + "grad_norm": 0.5756453275680542, + "kl": 3.648423671722412, + "learning_rate": 4.001111111111111e-06, + "logits/chosen": 36128464.0, + "logits/rejected": 36108208.0, + "logps/chosen": -147.972119140625, + "logps/rejected": -180.87730712890624, + "loss": 0.49908957481384275, + "rewards/chosen": 0.16207314729690553, + "rewards/margins": -0.004686105251312245, + "rewards/rejected": 0.16675925254821777, + "step": 1400 + }, + { + "epoch": 0.56, + "eval_kl": 3.1687636375427246, + "eval_logits/chosen": 33501499.392, + "eval_logits/rejected": 33484677.12, + "eval_logps/chosen": -154.072703125, + "eval_logps/rejected": -149.52703125, + "eval_loss": 0.486517995595932, + "eval_rewards/chosen": 0.025308061599731445, + "eval_rewards/margins": 0.11373865699768065, + "eval_rewards/rejected": -0.08843059539794922, + "eval_runtime": 217.6832, + "eval_samples_per_second": 4.594, + "eval_steps_per_second": 2.297, + "step": 1400 + }, + { + "epoch": 0.564, + "grad_norm": 0.5075347423553467, + "kl": 3.5038933753967285, + "learning_rate": 3.990000000000001e-06, + "logits/chosen": 28111721.6, + "logits/rejected": 28974422.4, + "logps/chosen": -153.78463134765624, + "logps/rejected": -143.94952392578125, + "loss": 0.4918965816497803, + "rewards/chosen": -0.004850611090660095, + "rewards/margins": 0.020186284184455873, + "rewards/rejected": -0.025036895275115968, + "step": 1410 + }, + { + "epoch": 0.568, + "grad_norm": 0.5109780430793762, + "kl": 2.3000378608703613, + "learning_rate": 3.9788888888888896e-06, + "logits/chosen": 33186614.4, + "logits/rejected": 33699417.6, + "logps/chosen": -141.6215087890625, + "logps/rejected": -178.1355712890625, + "loss": 0.4941215991973877, + "rewards/chosen": -0.18355293273925782, + "rewards/margins": 0.026853704452514643, + "rewards/rejected": -0.21040663719177247, + "step": 1420 + }, + { + "epoch": 0.572, + "grad_norm": 0.6244523525238037, + "kl": 1.984100580215454, + "learning_rate": 3.967777777777778e-06, + "logits/chosen": 39980752.0, + "logits/rejected": 35690995.2, + "logps/chosen": -194.154638671875, + "logps/rejected": -171.23074951171876, + "loss": 0.46099395751953126, + "rewards/chosen": -0.10000758171081543, + "rewards/margins": 0.5053775310516357, + "rewards/rejected": -0.6053851127624512, + "step": 1430 + }, + { + "epoch": 0.576, + "grad_norm": 0.41846802830696106, + "kl": 2.4030237197875977, + "learning_rate": 3.956666666666667e-06, + "logits/chosen": 25522387.2, + "logits/rejected": 26411580.8, + "logps/chosen": -122.002734375, + "logps/rejected": -139.70345458984374, + "loss": 0.4711480617523193, + "rewards/chosen": -0.17828741073608398, + "rewards/margins": 0.30072832107543945, + "rewards/rejected": -0.47901573181152346, + "step": 1440 + }, + { + "epoch": 0.58, + "grad_norm": 0.3383093774318695, + "kl": 2.3522331714630127, + "learning_rate": 3.945555555555556e-06, + "logits/chosen": 34720166.4, + "logits/rejected": 35382691.2, + "logps/chosen": -134.02327880859374, + "logps/rejected": -143.1811767578125, + "loss": 0.48169522285461425, + "rewards/chosen": -0.12290234565734863, + "rewards/margins": 0.22536482810974118, + "rewards/rejected": -0.3482671737670898, + "step": 1450 + }, + { + "epoch": 0.584, + "grad_norm": 0.47618529200553894, + "kl": 1.4843952655792236, + "learning_rate": 3.934444444444445e-06, + "logits/chosen": 29917385.6, + "logits/rejected": 29642912.0, + "logps/chosen": -157.6127197265625, + "logps/rejected": -164.204248046875, + "loss": 0.48633370399475095, + "rewards/chosen": -0.5775248527526855, + "rewards/margins": 0.010549926757812522, + "rewards/rejected": -0.588074779510498, + "step": 1460 + }, + { + "epoch": 0.588, + "grad_norm": 0.4691362977027893, + "kl": 1.8532390594482422, + "learning_rate": 3.923333333333334e-06, + "logits/chosen": 24143035.2, + "logits/rejected": 26696252.8, + "logps/chosen": -145.52325439453125, + "logps/rejected": -114.97313232421875, + "loss": 0.4996927261352539, + "rewards/chosen": -0.3415048837661743, + "rewards/margins": -0.0695812225341797, + "rewards/rejected": -0.27192366123199463, + "step": 1470 + }, + { + "epoch": 0.592, + "grad_norm": 0.49410581588745117, + "kl": 2.910165309906006, + "learning_rate": 3.912222222222222e-06, + "logits/chosen": 29227424.0, + "logits/rejected": 26583780.8, + "logps/chosen": -153.87852783203124, + "logps/rejected": -167.90714111328126, + "loss": 0.4679962158203125, + "rewards/chosen": -0.16606519222259522, + "rewards/margins": 0.28040225505828853, + "rewards/rejected": -0.4464674472808838, + "step": 1480 + }, + { + "epoch": 0.596, + "grad_norm": 0.6437669992446899, + "kl": 4.011757850646973, + "learning_rate": 3.901111111111111e-06, + "logits/chosen": 40104499.2, + "logits/rejected": 35466915.2, + "logps/chosen": -141.6960693359375, + "logps/rejected": -148.79417724609374, + "loss": 0.45351347923278806, + "rewards/chosen": 0.29918632507324217, + "rewards/margins": 0.7735027313232421, + "rewards/rejected": -0.47431640625, + "step": 1490 + }, + { + "epoch": 0.6, + "grad_norm": 0.598638653755188, + "kl": 2.5277042388916016, + "learning_rate": 3.89e-06, + "logits/chosen": 30581568.0, + "logits/rejected": 29237926.4, + "logps/chosen": -170.3593017578125, + "logps/rejected": -161.714111328125, + "loss": 0.5054315567016602, + "rewards/chosen": -0.5106431007385254, + "rewards/margins": -0.14114959239959712, + "rewards/rejected": -0.36949350833892824, + "step": 1500 + }, + { + "epoch": 0.604, + "grad_norm": 0.5450658202171326, + "kl": 3.1822094917297363, + "learning_rate": 3.87888888888889e-06, + "logits/chosen": 30121491.2, + "logits/rejected": 30883408.0, + "logps/chosen": -177.3155029296875, + "logps/rejected": -172.50675048828126, + "loss": 0.4777498722076416, + "rewards/chosen": -0.09470235109329224, + "rewards/margins": 0.16894682645797732, + "rewards/rejected": -0.26364917755126954, + "step": 1510 + }, + { + "epoch": 0.608, + "grad_norm": 0.32850226759910583, + "kl": 3.073251724243164, + "learning_rate": 3.8677777777777785e-06, + "logits/chosen": 32764054.4, + "logits/rejected": 33643142.4, + "logps/chosen": -167.7408447265625, + "logps/rejected": -171.3683349609375, + "loss": 0.4882831573486328, + "rewards/chosen": -0.5916567325592041, + "rewards/margins": 0.06371226310729972, + "rewards/rejected": -0.6553689956665039, + "step": 1520 + }, + { + "epoch": 0.612, + "grad_norm": 0.776578426361084, + "kl": 2.1626362800598145, + "learning_rate": 3.856666666666667e-06, + "logits/chosen": 20513964.8, + "logits/rejected": 19167148.8, + "logps/chosen": -138.76737060546876, + "logps/rejected": -200.4188232421875, + "loss": 0.47345700263977053, + "rewards/chosen": -0.39080009460449217, + "rewards/margins": 0.3349196434020997, + "rewards/rejected": -0.7257197380065918, + "step": 1530 + }, + { + "epoch": 0.616, + "grad_norm": 0.7884080410003662, + "kl": 2.2347629070281982, + "learning_rate": 3.845555555555556e-06, + "logits/chosen": 21506472.0, + "logits/rejected": 20219934.4, + "logps/chosen": -141.54342041015624, + "logps/rejected": -150.6858154296875, + "loss": 0.46153483390808103, + "rewards/chosen": -0.5252087116241455, + "rewards/margins": 0.5382626056671143, + "rewards/rejected": -1.0634713172912598, + "step": 1540 + }, + { + "epoch": 0.62, + "grad_norm": 0.6161748766899109, + "kl": 1.0965118408203125, + "learning_rate": 3.834444444444445e-06, + "logits/chosen": 24290136.0, + "logits/rejected": 24614228.8, + "logps/chosen": -178.11864013671874, + "logps/rejected": -153.21026611328125, + "loss": 0.4584649085998535, + "rewards/chosen": -0.8390473365783692, + "rewards/margins": 0.8001769065856933, + "rewards/rejected": -1.6392242431640625, + "step": 1550 + }, + { + "epoch": 0.624, + "grad_norm": 0.6851525902748108, + "kl": 0.8686630129814148, + "learning_rate": 3.823333333333334e-06, + "logits/chosen": 17298494.4, + "logits/rejected": 14839955.2, + "logps/chosen": -181.07423095703126, + "logps/rejected": -169.425439453125, + "loss": 0.49303278923034666, + "rewards/chosen": -0.948878288269043, + "rewards/margins": 0.510734748840332, + "rewards/rejected": -1.459613037109375, + "step": 1560 + }, + { + "epoch": 0.628, + "grad_norm": 0.6733571290969849, + "kl": 2.8747103214263916, + "learning_rate": 3.8122222222222225e-06, + "logits/chosen": 29427056.0, + "logits/rejected": 24132025.6, + "logps/chosen": -138.527490234375, + "logps/rejected": -174.9707763671875, + "loss": 0.4342005729675293, + "rewards/chosen": 0.09871820211410523, + "rewards/margins": 0.8701262354850768, + "rewards/rejected": -0.7714080333709716, + "step": 1570 + }, + { + "epoch": 0.632, + "grad_norm": 0.5916578769683838, + "kl": 1.7868465185165405, + "learning_rate": 3.8011111111111113e-06, + "logits/chosen": 26572758.4, + "logits/rejected": 23886825.6, + "logps/chosen": -212.365966796875, + "logps/rejected": -208.8218505859375, + "loss": 0.48333086967468264, + "rewards/chosen": -1.0144821166992188, + "rewards/margins": 0.16185150146484362, + "rewards/rejected": -1.1763336181640625, + "step": 1580 + }, + { + "epoch": 0.636, + "grad_norm": 0.5978784561157227, + "kl": 2.861311435699463, + "learning_rate": 3.79e-06, + "logits/chosen": 27835440.0, + "logits/rejected": 27677552.0, + "logps/chosen": -188.40328369140624, + "logps/rejected": -173.6368896484375, + "loss": 0.4893380641937256, + "rewards/chosen": -0.2336580753326416, + "rewards/margins": 0.4210014343261719, + "rewards/rejected": -0.6546595096588135, + "step": 1590 + }, + { + "epoch": 0.64, + "grad_norm": 0.574272632598877, + "kl": 1.9626449346542358, + "learning_rate": 3.7788888888888894e-06, + "logits/chosen": 20977523.2, + "logits/rejected": 17175705.6, + "logps/chosen": -150.7464111328125, + "logps/rejected": -180.0102294921875, + "loss": 0.45204753875732423, + "rewards/chosen": -0.791524600982666, + "rewards/margins": 1.0010954856872558, + "rewards/rejected": -1.7926200866699218, + "step": 1600 + }, + { + "epoch": 0.64, + "eval_kl": 2.5991017818450928, + "eval_logits/chosen": 24918573.056, + "eval_logits/rejected": 25185402.88, + "eval_logps/chosen": -159.12659375, + "eval_logps/rejected": -154.646546875, + "eval_loss": 0.4858725666999817, + "eval_rewards/chosen": -0.480081787109375, + "eval_rewards/margins": 0.12030004882812506, + "eval_rewards/rejected": -0.6003818359375, + "eval_runtime": 217.5803, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 1600 + }, + { + "epoch": 0.644, + "grad_norm": 0.6671045422554016, + "kl": 2.1377835273742676, + "learning_rate": 3.767777777777778e-06, + "logits/chosen": 35406131.2, + "logits/rejected": 31205331.2, + "logps/chosen": -184.4550537109375, + "logps/rejected": -165.74014892578126, + "loss": 0.4641073703765869, + "rewards/chosen": -0.07752754688262939, + "rewards/margins": 0.44615256786346436, + "rewards/rejected": -0.5236801147460938, + "step": 1610 + }, + { + "epoch": 0.648, + "grad_norm": 0.6126360297203064, + "kl": 2.7373385429382324, + "learning_rate": 3.756666666666667e-06, + "logits/chosen": 29280086.4, + "logits/rejected": 32377139.2, + "logps/chosen": -212.76513671875, + "logps/rejected": -190.2232666015625, + "loss": 0.49649949073791505, + "rewards/chosen": -0.929378604888916, + "rewards/margins": -0.4132873058319092, + "rewards/rejected": -0.5160912990570068, + "step": 1620 + }, + { + "epoch": 0.652, + "grad_norm": 0.5612730979919434, + "kl": 2.997823476791382, + "learning_rate": 3.7455555555555558e-06, + "logits/chosen": 21078843.2, + "logits/rejected": 19578608.0, + "logps/chosen": -149.2550537109375, + "logps/rejected": -154.4807861328125, + "loss": 0.45549612045288085, + "rewards/chosen": -0.679559326171875, + "rewards/margins": 0.24375944137573247, + "rewards/rejected": -0.9233187675476074, + "step": 1630 + }, + { + "epoch": 0.656, + "grad_norm": 0.44815537333488464, + "kl": 2.8388514518737793, + "learning_rate": 3.734444444444445e-06, + "logits/chosen": 22369004.8, + "logits/rejected": 18537880.0, + "logps/chosen": -171.47335205078124, + "logps/rejected": -165.7509521484375, + "loss": 0.4616579532623291, + "rewards/chosen": -0.18010754585266114, + "rewards/margins": 0.7035699367523194, + "rewards/rejected": -0.8836774826049805, + "step": 1640 + }, + { + "epoch": 0.66, + "grad_norm": 0.8153596520423889, + "kl": 3.4207565784454346, + "learning_rate": 3.723333333333334e-06, + "logits/chosen": 24538766.4, + "logits/rejected": 26516555.2, + "logps/chosen": -150.9359619140625, + "logps/rejected": -122.23038330078126, + "loss": 0.48783044815063475, + "rewards/chosen": -0.1703397035598755, + "rewards/margins": 0.0270324468612671, + "rewards/rejected": -0.1973721504211426, + "step": 1650 + }, + { + "epoch": 0.664, + "grad_norm": 0.45013901591300964, + "kl": 3.9859955310821533, + "learning_rate": 3.7122222222222226e-06, + "logits/chosen": 22145710.4, + "logits/rejected": 20591705.6, + "logps/chosen": -124.9506591796875, + "logps/rejected": -146.586279296875, + "loss": 0.4657421112060547, + "rewards/chosen": 0.028095448017120363, + "rewards/margins": 0.4449395298957825, + "rewards/rejected": -0.4168440818786621, + "step": 1660 + }, + { + "epoch": 0.668, + "grad_norm": 0.5674629211425781, + "kl": 3.48918080329895, + "learning_rate": 3.7011111111111114e-06, + "logits/chosen": 28042899.2, + "logits/rejected": 27730080.0, + "logps/chosen": -170.19053955078124, + "logps/rejected": -192.9843017578125, + "loss": 0.4754744052886963, + "rewards/chosen": -0.249656343460083, + "rewards/margins": 0.22593021392822268, + "rewards/rejected": -0.4755865573883057, + "step": 1670 + }, + { + "epoch": 0.672, + "grad_norm": 0.7562563419342041, + "kl": 3.0691466331481934, + "learning_rate": 3.6900000000000002e-06, + "logits/chosen": 26120401.6, + "logits/rejected": 25757414.4, + "logps/chosen": -154.65439453125, + "logps/rejected": -162.4265869140625, + "loss": 0.4740549087524414, + "rewards/chosen": -0.2537501811981201, + "rewards/margins": 0.16815314292907718, + "rewards/rejected": -0.4219033241271973, + "step": 1680 + }, + { + "epoch": 0.676, + "grad_norm": 0.6189448237419128, + "kl": 3.324810028076172, + "learning_rate": 3.678888888888889e-06, + "logits/chosen": 30877590.4, + "logits/rejected": 29492992.0, + "logps/chosen": -162.27489013671874, + "logps/rejected": -156.6115234375, + "loss": 0.46096296310424806, + "rewards/chosen": -0.32442150115966795, + "rewards/margins": 0.5965461730957031, + "rewards/rejected": -0.9209676742553711, + "step": 1690 + }, + { + "epoch": 0.68, + "grad_norm": 0.5689833760261536, + "kl": 3.417942762374878, + "learning_rate": 3.667777777777778e-06, + "logits/chosen": 30880060.8, + "logits/rejected": 30390649.6, + "logps/chosen": -146.108203125, + "logps/rejected": -140.8014892578125, + "loss": 0.48299012184143064, + "rewards/chosen": -0.08942080736160278, + "rewards/margins": 0.09204813241958619, + "rewards/rejected": -0.18146893978118897, + "step": 1700 + }, + { + "epoch": 0.684, + "grad_norm": 0.7074683904647827, + "kl": 2.7843141555786133, + "learning_rate": 3.6566666666666667e-06, + "logits/chosen": 23799224.0, + "logits/rejected": 23789908.8, + "logps/chosen": -132.7684326171875, + "logps/rejected": -171.96357421875, + "loss": 0.4795389652252197, + "rewards/chosen": -0.4700624942779541, + "rewards/margins": 0.3144543647766114, + "rewards/rejected": -0.7845168590545655, + "step": 1710 + }, + { + "epoch": 0.688, + "grad_norm": 0.8114802241325378, + "kl": 2.740182876586914, + "learning_rate": 3.645555555555556e-06, + "logits/chosen": 25693836.8, + "logits/rejected": 25391835.2, + "logps/chosen": -147.47672119140626, + "logps/rejected": -162.006640625, + "loss": 0.47942562103271485, + "rewards/chosen": -0.12961168289184571, + "rewards/margins": 0.2411248922348022, + "rewards/rejected": -0.3707365751266479, + "step": 1720 + }, + { + "epoch": 0.692, + "grad_norm": 0.6404406428337097, + "kl": 5.742056369781494, + "learning_rate": 3.6344444444444447e-06, + "logits/chosen": 23561008.0, + "logits/rejected": 24549129.6, + "logps/chosen": -152.6004150390625, + "logps/rejected": -174.73006591796874, + "loss": 0.4857301712036133, + "rewards/chosen": 0.12083638906478882, + "rewards/margins": 0.12182764708995819, + "rewards/rejected": -0.0009912580251693725, + "step": 1730 + }, + { + "epoch": 0.696, + "grad_norm": 0.8152211308479309, + "kl": 2.7150015830993652, + "learning_rate": 3.6233333333333335e-06, + "logits/chosen": 20060864.0, + "logits/rejected": 21277550.4, + "logps/chosen": -161.858642578125, + "logps/rejected": -154.7493408203125, + "loss": 0.49634590148925783, + "rewards/chosen": -0.6363963603973388, + "rewards/margins": -0.21991643905639646, + "rewards/rejected": -0.41647992134094236, + "step": 1740 + }, + { + "epoch": 0.7, + "grad_norm": 0.5856395959854126, + "kl": 3.9709296226501465, + "learning_rate": 3.6122222222222223e-06, + "logits/chosen": 24168908.8, + "logits/rejected": 26363808.0, + "logps/chosen": -216.2795654296875, + "logps/rejected": -159.5930908203125, + "loss": 0.4658236026763916, + "rewards/chosen": -0.5366491794586181, + "rewards/margins": 0.038147354125976585, + "rewards/rejected": -0.5747965335845947, + "step": 1750 + }, + { + "epoch": 0.704, + "grad_norm": 0.6619251370429993, + "kl": 3.0937228202819824, + "learning_rate": 3.601111111111111e-06, + "logits/chosen": 17747806.4, + "logits/rejected": 22547065.6, + "logps/chosen": -187.3780517578125, + "logps/rejected": -159.7064453125, + "loss": 0.5088288307189941, + "rewards/chosen": -1.2607831954956055, + "rewards/margins": -0.6743541717529297, + "rewards/rejected": -0.5864290237426758, + "step": 1760 + }, + { + "epoch": 0.708, + "grad_norm": 0.5218913555145264, + "kl": 2.901822566986084, + "learning_rate": 3.5900000000000004e-06, + "logits/chosen": 22269254.4, + "logits/rejected": 23662136.0, + "logps/chosen": -136.06712646484374, + "logps/rejected": -121.06827392578126, + "loss": 0.497973108291626, + "rewards/chosen": -0.4131883144378662, + "rewards/margins": -0.10578060150146484, + "rewards/rejected": -0.30740771293640134, + "step": 1770 + }, + { + "epoch": 0.712, + "grad_norm": 0.6656368970870972, + "kl": 3.2692978382110596, + "learning_rate": 3.578888888888889e-06, + "logits/chosen": 25755620.8, + "logits/rejected": 26518835.2, + "logps/chosen": -165.65780029296874, + "logps/rejected": -147.2259033203125, + "loss": 0.506129789352417, + "rewards/chosen": -0.5433285236358643, + "rewards/margins": -0.17034811973571778, + "rewards/rejected": -0.3729804039001465, + "step": 1780 + }, + { + "epoch": 0.716, + "grad_norm": 0.771259069442749, + "kl": 3.0249366760253906, + "learning_rate": 3.5677777777777784e-06, + "logits/chosen": 23546620.8, + "logits/rejected": 25753550.4, + "logps/chosen": -151.70357666015624, + "logps/rejected": -122.8987548828125, + "loss": 0.5249699592590332, + "rewards/chosen": -0.5662184715270996, + "rewards/margins": -0.2557974576950073, + "rewards/rejected": -0.3104210138320923, + "step": 1790 + }, + { + "epoch": 0.72, + "grad_norm": 0.872774064540863, + "kl": 3.2898342609405518, + "learning_rate": 3.556666666666667e-06, + "logits/chosen": 18870168.0, + "logits/rejected": 17117038.4, + "logps/chosen": -150.25985107421874, + "logps/rejected": -161.5357666015625, + "loss": 0.451005744934082, + "rewards/chosen": -0.1520848512649536, + "rewards/margins": 0.6849667310714721, + "rewards/rejected": -0.8370515823364257, + "step": 1800 + }, + { + "epoch": 0.72, + "eval_kl": 3.5156476497650146, + "eval_logits/chosen": 26424913.92, + "eval_logits/rejected": 26601347.072, + "eval_logps/chosen": -156.963453125, + "eval_logps/rejected": -152.583296875, + "eval_loss": 0.48428651690483093, + "eval_rewards/chosen": -0.2637669677734375, + "eval_rewards/margins": 0.13028942871093752, + "eval_rewards/rejected": -0.394056396484375, + "eval_runtime": 217.3905, + "eval_samples_per_second": 4.6, + "eval_steps_per_second": 2.3, + "step": 1800 + }, + { + "epoch": 0.724, + "grad_norm": 0.6279663443565369, + "kl": 1.9814598560333252, + "learning_rate": 3.545555555555556e-06, + "logits/chosen": 31417820.8, + "logits/rejected": 29248547.2, + "logps/chosen": -143.02147216796874, + "logps/rejected": -176.2625732421875, + "loss": 0.47081918716430665, + "rewards/chosen": -0.7984821319580078, + "rewards/margins": 0.2239703178405763, + "rewards/rejected": -1.022452449798584, + "step": 1810 + }, + { + "epoch": 0.728, + "grad_norm": 0.6332824230194092, + "kl": 3.8811469078063965, + "learning_rate": 3.534444444444445e-06, + "logits/chosen": 25455878.4, + "logits/rejected": 24060284.8, + "logps/chosen": -148.748779296875, + "logps/rejected": -152.15390625, + "loss": 0.48478074073791505, + "rewards/chosen": -0.37164936065673826, + "rewards/margins": 0.1325855255126953, + "rewards/rejected": -0.5042348861694336, + "step": 1820 + }, + { + "epoch": 0.732, + "grad_norm": 0.570693850517273, + "kl": 3.4417755603790283, + "learning_rate": 3.5233333333333336e-06, + "logits/chosen": 21330112.0, + "logits/rejected": 23791145.6, + "logps/chosen": -175.6027099609375, + "logps/rejected": -139.43577880859374, + "loss": 0.4806610107421875, + "rewards/chosen": -0.7079993724822998, + "rewards/margins": -0.224769401550293, + "rewards/rejected": -0.4832299709320068, + "step": 1830 + }, + { + "epoch": 0.736, + "grad_norm": 0.6215969920158386, + "kl": 2.516907215118408, + "learning_rate": 3.5122222222222224e-06, + "logits/chosen": 19252992.0, + "logits/rejected": 17279195.2, + "logps/chosen": -138.5216552734375, + "logps/rejected": -175.40498046875, + "loss": 0.4477705955505371, + "rewards/chosen": -0.37056674957275393, + "rewards/margins": 0.6211806297302246, + "rewards/rejected": -0.9917473793029785, + "step": 1840 + }, + { + "epoch": 0.74, + "grad_norm": 0.477038711309433, + "kl": 2.8053412437438965, + "learning_rate": 3.5011111111111112e-06, + "logits/chosen": 21869585.6, + "logits/rejected": 25226084.8, + "logps/chosen": -169.3609130859375, + "logps/rejected": -135.9068603515625, + "loss": 0.49389004707336426, + "rewards/chosen": -0.8072388648986817, + "rewards/margins": -0.23602757453918466, + "rewards/rejected": -0.571211290359497, + "step": 1850 + }, + { + "epoch": 0.744, + "grad_norm": 0.4190019369125366, + "kl": 4.664608955383301, + "learning_rate": 3.49e-06, + "logits/chosen": 18998553.6, + "logits/rejected": 18716126.4, + "logps/chosen": -154.63216552734374, + "logps/rejected": -157.9931884765625, + "loss": 0.45591115951538086, + "rewards/chosen": -0.11906745433807372, + "rewards/margins": 0.5571311235427856, + "rewards/rejected": -0.6761985778808594, + "step": 1860 + }, + { + "epoch": 0.748, + "grad_norm": 0.5092635154724121, + "kl": 5.426673412322998, + "learning_rate": 3.4788888888888893e-06, + "logits/chosen": 24268691.2, + "logits/rejected": 23287683.2, + "logps/chosen": -150.13511962890624, + "logps/rejected": -137.98375244140624, + "loss": 0.4658195018768311, + "rewards/chosen": 0.17463077306747438, + "rewards/margins": 0.33842480182647705, + "rewards/rejected": -0.16379402875900267, + "step": 1870 + }, + { + "epoch": 0.752, + "grad_norm": 0.5116318464279175, + "kl": 3.4443411827087402, + "learning_rate": 3.467777777777778e-06, + "logits/chosen": 25825232.0, + "logits/rejected": 28601868.8, + "logps/chosen": -131.28408203125, + "logps/rejected": -165.76693115234374, + "loss": 0.4923543453216553, + "rewards/chosen": -0.23933188915252684, + "rewards/margins": 0.14726905822753905, + "rewards/rejected": -0.3866009473800659, + "step": 1880 + }, + { + "epoch": 0.756, + "grad_norm": 0.6366556286811829, + "kl": 3.051987409591675, + "learning_rate": 3.456666666666667e-06, + "logits/chosen": 24197241.6, + "logits/rejected": 24256118.4, + "logps/chosen": -149.84852294921876, + "logps/rejected": -140.0214599609375, + "loss": 0.4848769664764404, + "rewards/chosen": -0.3190887212753296, + "rewards/margins": 0.17881777286529538, + "rewards/rejected": -0.497906494140625, + "step": 1890 + }, + { + "epoch": 0.76, + "grad_norm": 0.4979274570941925, + "kl": 1.4698994159698486, + "learning_rate": 3.4455555555555557e-06, + "logits/chosen": 23184480.0, + "logits/rejected": 25139280.0, + "logps/chosen": -134.97877197265626, + "logps/rejected": -144.17740478515626, + "loss": 0.4794943809509277, + "rewards/chosen": -0.6308645248413086, + "rewards/margins": 0.13470888137817383, + "rewards/rejected": -0.7655734062194824, + "step": 1900 + }, + { + "epoch": 0.764, + "grad_norm": 0.6274532079696655, + "kl": 3.6468818187713623, + "learning_rate": 3.4344444444444445e-06, + "logits/chosen": 24657672.0, + "logits/rejected": 21508489.6, + "logps/chosen": -160.151904296875, + "logps/rejected": -166.94481201171874, + "loss": 0.4795567512512207, + "rewards/chosen": -0.22244927883148194, + "rewards/margins": 0.15467300415039062, + "rewards/rejected": -0.37712228298187256, + "step": 1910 + }, + { + "epoch": 0.768, + "grad_norm": 0.7713479399681091, + "kl": 4.167417049407959, + "learning_rate": 3.4233333333333333e-06, + "logits/chosen": 23603747.2, + "logits/rejected": 21209184.0, + "logps/chosen": -134.7058837890625, + "logps/rejected": -163.1764404296875, + "loss": 0.4544349193572998, + "rewards/chosen": 0.17299318313598633, + "rewards/margins": 0.5065126180648803, + "rewards/rejected": -0.333519434928894, + "step": 1920 + }, + { + "epoch": 0.772, + "grad_norm": 0.5262131690979004, + "kl": 2.8361663818359375, + "learning_rate": 3.412222222222222e-06, + "logits/chosen": 29174873.6, + "logits/rejected": 33003203.2, + "logps/chosen": -158.76817626953124, + "logps/rejected": -142.29862060546876, + "loss": 0.47826762199401857, + "rewards/chosen": -0.2532700300216675, + "rewards/margins": 0.13431007862091066, + "rewards/rejected": -0.38758010864257814, + "step": 1930 + }, + { + "epoch": 0.776, + "grad_norm": 0.610528826713562, + "kl": 1.9879090785980225, + "learning_rate": 3.4011111111111113e-06, + "logits/chosen": 14738179.2, + "logits/rejected": 17543468.8, + "logps/chosen": -144.6372314453125, + "logps/rejected": -121.0155517578125, + "loss": 0.5197708129882812, + "rewards/chosen": -0.7448621273040772, + "rewards/margins": -0.2757446765899659, + "rewards/rejected": -0.4691174507141113, + "step": 1940 + }, + { + "epoch": 0.78, + "grad_norm": 0.4867253601551056, + "kl": 2.61750864982605, + "learning_rate": 3.3900000000000006e-06, + "logits/chosen": 29278848.0, + "logits/rejected": 27723072.0, + "logps/chosen": -167.261474609375, + "logps/rejected": -166.44840087890626, + "loss": 0.48351154327392576, + "rewards/chosen": -0.45982890129089354, + "rewards/margins": 0.17165498733520507, + "rewards/rejected": -0.6314838886260986, + "step": 1950 + }, + { + "epoch": 0.784, + "grad_norm": 0.507047176361084, + "kl": 1.4705009460449219, + "learning_rate": 3.3788888888888894e-06, + "logits/chosen": 21861384.0, + "logits/rejected": 22609652.8, + "logps/chosen": -154.51759033203126, + "logps/rejected": -141.37017822265625, + "loss": 0.4911977291107178, + "rewards/chosen": -0.9377481460571289, + "rewards/margins": 0.045456314086914085, + "rewards/rejected": -0.983204460144043, + "step": 1960 + }, + { + "epoch": 0.788, + "grad_norm": 0.5638304352760315, + "kl": 2.8000810146331787, + "learning_rate": 3.367777777777778e-06, + "logits/chosen": 29543376.0, + "logits/rejected": 30959481.6, + "logps/chosen": -154.2559814453125, + "logps/rejected": -140.09403076171876, + "loss": 0.4743481636047363, + "rewards/chosen": -0.27635998725891114, + "rewards/margins": 0.31996994018554686, + "rewards/rejected": -0.596329927444458, + "step": 1970 + }, + { + "epoch": 0.792, + "grad_norm": 0.622689962387085, + "kl": 1.304429292678833, + "learning_rate": 3.356666666666667e-06, + "logits/chosen": 16238214.4, + "logits/rejected": 15864859.2, + "logps/chosen": -148.69432373046874, + "logps/rejected": -155.14200439453126, + "loss": 0.4647815227508545, + "rewards/chosen": -0.6397994041442872, + "rewards/margins": 0.38712730407714835, + "rewards/rejected": -1.0269267082214355, + "step": 1980 + }, + { + "epoch": 0.796, + "grad_norm": 0.5903355479240417, + "kl": 3.8611984252929688, + "learning_rate": 3.345555555555556e-06, + "logits/chosen": 26873817.6, + "logits/rejected": 25962048.0, + "logps/chosen": -168.3064208984375, + "logps/rejected": -165.02401123046874, + "loss": 0.44381189346313477, + "rewards/chosen": -0.056187999248504636, + "rewards/margins": 0.48386293649673456, + "rewards/rejected": -0.5400509357452392, + "step": 1990 + }, + { + "epoch": 0.8, + "grad_norm": 0.6087274551391602, + "kl": 2.4798474311828613, + "learning_rate": 3.3344444444444446e-06, + "logits/chosen": 28899868.8, + "logits/rejected": 28327043.2, + "logps/chosen": -131.373046875, + "logps/rejected": -144.835546875, + "loss": 0.4636848449707031, + "rewards/chosen": -0.4337655544281006, + "rewards/margins": 0.1502884864807129, + "rewards/rejected": -0.5840540409088135, + "step": 2000 + }, + { + "epoch": 0.8, + "eval_kl": 2.3182120323181152, + "eval_logits/chosen": 23415599.104, + "eval_logits/rejected": 23816060.928, + "eval_logps/chosen": -161.6585625, + "eval_logps/rejected": -157.55559375, + "eval_loss": 0.48174571990966797, + "eval_rewards/chosen": -0.7332791137695313, + "eval_rewards/margins": 0.1580091552734375, + "eval_rewards/rejected": -0.8912882690429688, + "eval_runtime": 216.8959, + "eval_samples_per_second": 4.611, + "eval_steps_per_second": 2.305, + "step": 2000 + }, + { + "epoch": 0.804, + "grad_norm": 0.6840182542800903, + "kl": 3.759185791015625, + "learning_rate": 3.3233333333333334e-06, + "logits/chosen": 29734800.0, + "logits/rejected": 27820688.0, + "logps/chosen": -171.00633544921874, + "logps/rejected": -149.1771240234375, + "loss": 0.4692417621612549, + "rewards/chosen": -0.2439584493637085, + "rewards/margins": 0.4319137811660767, + "rewards/rejected": -0.6758722305297852, + "step": 2010 + }, + { + "epoch": 0.808, + "grad_norm": 0.4128756523132324, + "kl": 2.642878770828247, + "learning_rate": 3.3122222222222222e-06, + "logits/chosen": 19955732.8, + "logits/rejected": 18758494.4, + "logps/chosen": -189.38092041015625, + "logps/rejected": -170.4149658203125, + "loss": 0.46123080253601073, + "rewards/chosen": -0.5583849906921386, + "rewards/margins": 0.32184505462646484, + "rewards/rejected": -0.8802300453186035, + "step": 2020 + }, + { + "epoch": 0.812, + "grad_norm": 0.5455370545387268, + "kl": 1.1196393966674805, + "learning_rate": 3.3011111111111115e-06, + "logits/chosen": 22045115.2, + "logits/rejected": 18838947.2, + "logps/chosen": -148.83717041015626, + "logps/rejected": -179.09134521484376, + "loss": 0.4277163505554199, + "rewards/chosen": -0.9872810363769531, + "rewards/margins": 0.9499824523925782, + "rewards/rejected": -1.9372634887695312, + "step": 2030 + }, + { + "epoch": 0.816, + "grad_norm": 0.5655795335769653, + "kl": 2.0870370864868164, + "learning_rate": 3.2900000000000003e-06, + "logits/chosen": 30604160.0, + "logits/rejected": 25881659.2, + "logps/chosen": -170.49140625, + "logps/rejected": -214.0181640625, + "loss": 0.4611818790435791, + "rewards/chosen": -0.9137911796569824, + "rewards/margins": 0.8909661293029785, + "rewards/rejected": -1.804757308959961, + "step": 2040 + }, + { + "epoch": 0.82, + "grad_norm": 0.48172426223754883, + "kl": 2.043773651123047, + "learning_rate": 3.278888888888889e-06, + "logits/chosen": 16779667.2, + "logits/rejected": 17778121.6, + "logps/chosen": -148.56708984375, + "logps/rejected": -125.018115234375, + "loss": 0.49151906967163084, + "rewards/chosen": -1.0805482864379883, + "rewards/margins": -0.045468139648437544, + "rewards/rejected": -1.0350801467895507, + "step": 2050 + }, + { + "epoch": 0.824, + "grad_norm": 0.5591869950294495, + "kl": 1.8221423625946045, + "learning_rate": 3.267777777777778e-06, + "logits/chosen": 18840448.0, + "logits/rejected": 14656315.2, + "logps/chosen": -178.58017578125, + "logps/rejected": -194.3201171875, + "loss": 0.5037118434906006, + "rewards/chosen": -1.5193581581115723, + "rewards/margins": 0.3175524711608886, + "rewards/rejected": -1.8369106292724608, + "step": 2060 + }, + { + "epoch": 0.828, + "grad_norm": 0.6082685589790344, + "kl": 2.7332985401153564, + "learning_rate": 3.2566666666666667e-06, + "logits/chosen": 19653870.4, + "logits/rejected": 18607360.0, + "logps/chosen": -191.222802734375, + "logps/rejected": -195.412109375, + "loss": 0.43700380325317384, + "rewards/chosen": -1.0758570671081542, + "rewards/margins": 1.051231098175049, + "rewards/rejected": -2.127088165283203, + "step": 2070 + }, + { + "epoch": 0.832, + "grad_norm": 0.8018869161605835, + "kl": 1.3849284648895264, + "learning_rate": 3.2455555555555555e-06, + "logits/chosen": 18399478.4, + "logits/rejected": 19887457.6, + "logps/chosen": -193.88709716796876, + "logps/rejected": -149.70872802734374, + "loss": 0.5033087730407715, + "rewards/chosen": -1.796027946472168, + "rewards/margins": -0.644907569885254, + "rewards/rejected": -1.151120376586914, + "step": 2080 + }, + { + "epoch": 0.836, + "grad_norm": 0.6100642681121826, + "kl": 1.6638615131378174, + "learning_rate": 3.2344444444444443e-06, + "logits/chosen": 13364839.2, + "logits/rejected": 12106027.2, + "logps/chosen": -150.7623291015625, + "logps/rejected": -176.14700927734376, + "loss": 0.45767946243286134, + "rewards/chosen": -0.9122394561767578, + "rewards/margins": 1.188156890869141, + "rewards/rejected": -2.1003963470458986, + "step": 2090 + }, + { + "epoch": 0.84, + "grad_norm": 0.4774913191795349, + "kl": 2.0765693187713623, + "learning_rate": 3.223333333333334e-06, + "logits/chosen": 14278204.8, + "logits/rejected": 16952772.8, + "logps/chosen": -180.37510986328124, + "logps/rejected": -164.04486083984375, + "loss": 0.5365061283111572, + "rewards/chosen": -1.5350143432617187, + "rewards/margins": -0.5786049842834472, + "rewards/rejected": -0.9564093589782715, + "step": 2100 + }, + { + "epoch": 0.844, + "grad_norm": 0.424125999212265, + "kl": 1.1270596981048584, + "learning_rate": 3.2122222222222228e-06, + "logits/chosen": 11056914.4, + "logits/rejected": 10464643.2, + "logps/chosen": -159.667138671875, + "logps/rejected": -179.2357177734375, + "loss": 0.4685384750366211, + "rewards/chosen": -0.7500426292419433, + "rewards/margins": 0.7069652557373047, + "rewards/rejected": -1.457007884979248, + "step": 2110 + }, + { + "epoch": 0.848, + "grad_norm": 0.6812456846237183, + "kl": 3.2760558128356934, + "learning_rate": 3.2011111111111116e-06, + "logits/chosen": 12226829.6, + "logits/rejected": 9145164.0, + "logps/chosen": -152.55595703125, + "logps/rejected": -163.9876953125, + "loss": 0.4829984664916992, + "rewards/chosen": -1.0188889503479004, + "rewards/margins": 0.4964068412780762, + "rewards/rejected": -1.5152957916259766, + "step": 2120 + }, + { + "epoch": 0.852, + "grad_norm": 0.521295964717865, + "kl": 1.6184799671173096, + "learning_rate": 3.1900000000000004e-06, + "logits/chosen": 19296177.6, + "logits/rejected": 18037955.2, + "logps/chosen": -190.2342041015625, + "logps/rejected": -179.3730224609375, + "loss": 0.4534940719604492, + "rewards/chosen": -1.4859835624694824, + "rewards/margins": 0.12996721267700195, + "rewards/rejected": -1.6159507751464843, + "step": 2130 + }, + { + "epoch": 0.856, + "grad_norm": 0.3901250660419464, + "kl": 2.1943907737731934, + "learning_rate": 3.178888888888889e-06, + "logits/chosen": 14292169.6, + "logits/rejected": 16561420.8, + "logps/chosen": -172.7265869140625, + "logps/rejected": -174.55390625, + "loss": 0.46813135147094725, + "rewards/chosen": -0.9191327095031738, + "rewards/margins": 0.9562966346740722, + "rewards/rejected": -1.875429344177246, + "step": 2140 + }, + { + "epoch": 0.86, + "grad_norm": 1.2093825340270996, + "kl": 2.53037691116333, + "learning_rate": 3.167777777777778e-06, + "logits/chosen": 15527200.0, + "logits/rejected": 14247240.0, + "logps/chosen": -175.73638916015625, + "logps/rejected": -177.11685791015626, + "loss": 0.4883676052093506, + "rewards/chosen": -1.1620004653930665, + "rewards/margins": 0.6007183074951172, + "rewards/rejected": -1.7627187728881837, + "step": 2150 + }, + { + "epoch": 0.864, + "grad_norm": 0.7262481451034546, + "kl": 2.6998825073242188, + "learning_rate": 3.156666666666667e-06, + "logits/chosen": 25509097.6, + "logits/rejected": 26202662.4, + "logps/chosen": -159.83707275390626, + "logps/rejected": -170.9850341796875, + "loss": 0.44543633460998533, + "rewards/chosen": -0.19056529998779298, + "rewards/margins": 0.7149291038513184, + "rewards/rejected": -0.9054944038391113, + "step": 2160 + }, + { + "epoch": 0.868, + "grad_norm": 0.4244597554206848, + "kl": 1.9011032581329346, + "learning_rate": 3.1455555555555556e-06, + "logits/chosen": 13862787.2, + "logits/rejected": 13148918.4, + "logps/chosen": -144.288720703125, + "logps/rejected": -159.27891845703124, + "loss": 0.46286282539367674, + "rewards/chosen": -1.0397714614868163, + "rewards/margins": 0.25113573074340834, + "rewards/rejected": -1.2909071922302247, + "step": 2170 + }, + { + "epoch": 0.872, + "grad_norm": 0.7587819695472717, + "kl": 4.540980339050293, + "learning_rate": 3.134444444444445e-06, + "logits/chosen": 29813209.6, + "logits/rejected": 31323004.8, + "logps/chosen": -183.304345703125, + "logps/rejected": -198.9873779296875, + "loss": 0.4856124401092529, + "rewards/chosen": -0.8742061614990234, + "rewards/margins": -0.17829103469848628, + "rewards/rejected": -0.6959151268005371, + "step": 2180 + }, + { + "epoch": 0.876, + "grad_norm": 0.36061376333236694, + "kl": 2.879594326019287, + "learning_rate": 3.1233333333333336e-06, + "logits/chosen": 29873868.8, + "logits/rejected": 30440390.4, + "logps/chosen": -143.857861328125, + "logps/rejected": -136.5346435546875, + "loss": 0.5109179496765137, + "rewards/chosen": -0.8628176689147949, + "rewards/margins": -0.39257164001464845, + "rewards/rejected": -0.4702460289001465, + "step": 2190 + }, + { + "epoch": 0.88, + "grad_norm": 0.39630356431007385, + "kl": 2.539196729660034, + "learning_rate": 3.1122222222222224e-06, + "logits/chosen": 20060600.0, + "logits/rejected": 18553404.8, + "logps/chosen": -159.40810546875, + "logps/rejected": -152.76990966796876, + "loss": 0.47215023040771487, + "rewards/chosen": -0.8129859924316406, + "rewards/margins": 0.28859338760375974, + "rewards/rejected": -1.1015793800354003, + "step": 2200 + }, + { + "epoch": 0.88, + "eval_kl": 2.5812811851501465, + "eval_logits/chosen": 21032757.248, + "eval_logits/rejected": 21261236.224, + "eval_logps/chosen": -162.41909375, + "eval_logps/rejected": -158.462875, + "eval_loss": 0.48150432109832764, + "eval_rewards/chosen": -0.8093319091796874, + "eval_rewards/margins": 0.17268438720703128, + "eval_rewards/rejected": -0.9820162963867187, + "eval_runtime": 216.8473, + "eval_samples_per_second": 4.612, + "eval_steps_per_second": 2.306, + "step": 2200 + }, + { + "epoch": 0.884, + "grad_norm": 0.6626996994018555, + "kl": 2.7219512462615967, + "learning_rate": 3.1011111111111113e-06, + "logits/chosen": 20549030.4, + "logits/rejected": 23361507.2, + "logps/chosen": -180.177783203125, + "logps/rejected": -172.0116943359375, + "loss": 0.5009243011474609, + "rewards/chosen": -1.0695799827575683, + "rewards/margins": 0.0988718032836915, + "rewards/rejected": -1.1684517860412598, + "step": 2210 + }, + { + "epoch": 0.888, + "grad_norm": 0.40188467502593994, + "kl": 4.388433933258057, + "learning_rate": 3.09e-06, + "logits/chosen": 31197132.8, + "logits/rejected": 29223571.2, + "logps/chosen": -158.33319091796875, + "logps/rejected": -156.518505859375, + "loss": 0.49659576416015627, + "rewards/chosen": -0.5678246021270752, + "rewards/margins": 0.040923357009887695, + "rewards/rejected": -0.6087479591369629, + "step": 2220 + }, + { + "epoch": 0.892, + "grad_norm": 0.7662191390991211, + "kl": 3.7732715606689453, + "learning_rate": 3.078888888888889e-06, + "logits/chosen": 23728937.6, + "logits/rejected": 24493553.6, + "logps/chosen": -122.2478759765625, + "logps/rejected": -116.500439453125, + "loss": 0.4903532028198242, + "rewards/chosen": -0.12796418666839598, + "rewards/margins": 0.14529306888580323, + "rewards/rejected": -0.2732572555541992, + "step": 2230 + }, + { + "epoch": 0.896, + "grad_norm": 0.5434762835502625, + "kl": 5.346643924713135, + "learning_rate": 3.0677777777777777e-06, + "logits/chosen": 23763382.4, + "logits/rejected": 20131742.4, + "logps/chosen": -148.9446044921875, + "logps/rejected": -145.7754638671875, + "loss": 0.4672962188720703, + "rewards/chosen": 0.033642816543579104, + "rewards/margins": 0.5607096195220947, + "rewards/rejected": -0.5270668029785156, + "step": 2240 + }, + { + "epoch": 0.9, + "grad_norm": 0.5850833058357239, + "kl": 4.739095211029053, + "learning_rate": 3.0566666666666665e-06, + "logits/chosen": 20819936.0, + "logits/rejected": 24134200.0, + "logps/chosen": -142.128466796875, + "logps/rejected": -151.9432861328125, + "loss": 0.4636435031890869, + "rewards/chosen": 0.028873807191848753, + "rewards/margins": 0.2783109962940216, + "rewards/rejected": -0.24943718910217286, + "step": 2250 + }, + { + "epoch": 0.904, + "grad_norm": 0.6144809126853943, + "kl": 4.373375415802002, + "learning_rate": 3.045555555555556e-06, + "logits/chosen": 33130332.8, + "logits/rejected": 34606784.0, + "logps/chosen": -155.8130126953125, + "logps/rejected": -168.7025390625, + "loss": 0.47957863807678225, + "rewards/chosen": -0.34006266593933104, + "rewards/margins": 0.1747920036315918, + "rewards/rejected": -0.5148546695709229, + "step": 2260 + }, + { + "epoch": 0.908, + "grad_norm": 0.48172423243522644, + "kl": 3.806690216064453, + "learning_rate": 3.034444444444445e-06, + "logits/chosen": 29197280.0, + "logits/rejected": 26733576.0, + "logps/chosen": -156.34307861328125, + "logps/rejected": -124.57420654296875, + "loss": 0.486788272857666, + "rewards/chosen": -0.15740108489990234, + "rewards/margins": 0.07049424648284913, + "rewards/rejected": -0.22789533138275148, + "step": 2270 + }, + { + "epoch": 0.912, + "grad_norm": 0.5201888680458069, + "kl": 2.4590580463409424, + "learning_rate": 3.0233333333333338e-06, + "logits/chosen": 13540443.2, + "logits/rejected": 11543592.0, + "logps/chosen": -128.15758056640624, + "logps/rejected": -144.884130859375, + "loss": 0.5047108173370362, + "rewards/chosen": -0.8088220596313477, + "rewards/margins": 0.07349948883056634, + "rewards/rejected": -0.882321548461914, + "step": 2280 + }, + { + "epoch": 0.916, + "grad_norm": 0.5650275945663452, + "kl": 4.004490852355957, + "learning_rate": 3.0122222222222226e-06, + "logits/chosen": 30858310.4, + "logits/rejected": 30752073.6, + "logps/chosen": -173.15472412109375, + "logps/rejected": -179.914208984375, + "loss": 0.45772509574890136, + "rewards/chosen": -0.07655960321426392, + "rewards/margins": 0.580165708065033, + "rewards/rejected": -0.6567253112792969, + "step": 2290 + }, + { + "epoch": 0.92, + "grad_norm": 0.6002667546272278, + "kl": 2.4904167652130127, + "learning_rate": 3.0011111111111114e-06, + "logits/chosen": 27612214.4, + "logits/rejected": 29905420.8, + "logps/chosen": -170.78812255859376, + "logps/rejected": -171.15484619140625, + "loss": 0.48928098678588866, + "rewards/chosen": -0.48044404983520506, + "rewards/margins": 0.08476023674011235, + "rewards/rejected": -0.5652042865753174, + "step": 2300 + }, + { + "epoch": 0.924, + "grad_norm": 0.7137225866317749, + "kl": 2.9995059967041016, + "learning_rate": 2.99e-06, + "logits/chosen": 33246598.4, + "logits/rejected": 31494838.4, + "logps/chosen": -124.85633544921875, + "logps/rejected": -151.764404296875, + "loss": 0.46476993560791013, + "rewards/chosen": -0.3835261344909668, + "rewards/margins": 0.35444231033325196, + "rewards/rejected": -0.7379684448242188, + "step": 2310 + }, + { + "epoch": 0.928, + "grad_norm": 0.48665422201156616, + "kl": 4.963588714599609, + "learning_rate": 2.978888888888889e-06, + "logits/chosen": 26167496.0, + "logits/rejected": 26003188.8, + "logps/chosen": -154.3181640625, + "logps/rejected": -193.70732421875, + "loss": 0.4603987216949463, + "rewards/chosen": -0.019819003343582154, + "rewards/margins": 0.6258892238140107, + "rewards/rejected": -0.6457082271575928, + "step": 2320 + }, + { + "epoch": 0.932, + "grad_norm": 0.6779302954673767, + "kl": 3.996805191040039, + "learning_rate": 2.9677777777777778e-06, + "logits/chosen": 26639760.0, + "logits/rejected": 24185547.2, + "logps/chosen": -145.71864013671876, + "logps/rejected": -165.16456298828126, + "loss": 0.41465444564819337, + "rewards/chosen": 0.16978931427001953, + "rewards/margins": 0.8076234340667725, + "rewards/rejected": -0.637834119796753, + "step": 2330 + }, + { + "epoch": 0.936, + "grad_norm": 0.8533156514167786, + "kl": 3.2005672454833984, + "learning_rate": 2.956666666666667e-06, + "logits/chosen": 17311833.6, + "logits/rejected": 18152035.2, + "logps/chosen": -139.579248046875, + "logps/rejected": -141.84622802734376, + "loss": 0.4770832538604736, + "rewards/chosen": -0.5391797542572021, + "rewards/margins": 0.2623293399810791, + "rewards/rejected": -0.8015090942382812, + "step": 2340 + }, + { + "epoch": 0.94, + "grad_norm": 0.7501420974731445, + "kl": 5.2280778884887695, + "learning_rate": 2.945555555555556e-06, + "logits/chosen": 19459948.8, + "logits/rejected": 19488014.4, + "logps/chosen": -181.2943603515625, + "logps/rejected": -141.72918701171875, + "loss": 0.4741385459899902, + "rewards/chosen": -0.3969358682632446, + "rewards/margins": 0.048303866386413596, + "rewards/rejected": -0.4452397346496582, + "step": 2350 + }, + { + "epoch": 0.944, + "grad_norm": 0.47924181818962097, + "kl": 5.855168342590332, + "learning_rate": 2.9344444444444446e-06, + "logits/chosen": 25643113.6, + "logits/rejected": 21872040.0, + "logps/chosen": -146.56474609375, + "logps/rejected": -149.1546630859375, + "loss": 0.4523441314697266, + "rewards/chosen": 0.34644312858581544, + "rewards/margins": 0.6119464874267578, + "rewards/rejected": -0.2655033588409424, + "step": 2360 + }, + { + "epoch": 0.948, + "grad_norm": 0.6821103692054749, + "kl": 7.001960754394531, + "learning_rate": 2.9233333333333334e-06, + "logits/chosen": 26589932.8, + "logits/rejected": 24771849.6, + "logps/chosen": -132.04959716796876, + "logps/rejected": -197.91544189453126, + "loss": 0.5003488063812256, + "rewards/chosen": -0.008898758888244629, + "rewards/margins": 0.02103534936904907, + "rewards/rejected": -0.0299341082572937, + "step": 2370 + }, + { + "epoch": 0.952, + "grad_norm": 0.6097027063369751, + "kl": 6.377338409423828, + "learning_rate": 2.9122222222222222e-06, + "logits/chosen": 40187350.4, + "logits/rejected": 39877142.4, + "logps/chosen": -171.79857177734374, + "logps/rejected": -151.0936767578125, + "loss": 0.46953182220458983, + "rewards/chosen": 0.39073307514190675, + "rewards/margins": 0.4624105989933014, + "rewards/rejected": -0.07167752385139466, + "step": 2380 + }, + { + "epoch": 0.956, + "grad_norm": 0.8344343900680542, + "kl": 5.0947136878967285, + "learning_rate": 2.901111111111111e-06, + "logits/chosen": 27525568.0, + "logits/rejected": 27525084.8, + "logps/chosen": -174.18642578125, + "logps/rejected": -169.3951171875, + "loss": 0.4775404453277588, + "rewards/chosen": -0.11374995708465577, + "rewards/margins": 0.36258018016815186, + "rewards/rejected": -0.4763301372528076, + "step": 2390 + }, + { + "epoch": 0.96, + "grad_norm": 0.5999415516853333, + "kl": 5.195433616638184, + "learning_rate": 2.89e-06, + "logits/chosen": 32106156.8, + "logits/rejected": 31147836.8, + "logps/chosen": -166.06192626953126, + "logps/rejected": -175.74422607421874, + "loss": 0.4666886329650879, + "rewards/chosen": 0.09231564402580261, + "rewards/margins": 0.445311564207077, + "rewards/rejected": -0.3529959201812744, + "step": 2400 + }, + { + "epoch": 0.96, + "eval_kl": 5.085776329040527, + "eval_logits/chosen": 27241426.944, + "eval_logits/rejected": 27194333.184, + "eval_logps/chosen": -154.09196875, + "eval_logps/rejected": -150.0654375, + "eval_loss": 0.4826502501964569, + "eval_rewards/chosen": 0.023380521774291993, + "eval_rewards/margins": 0.16565044975280763, + "eval_rewards/rejected": -0.14226992797851562, + "eval_runtime": 216.6502, + "eval_samples_per_second": 4.616, + "eval_steps_per_second": 2.308, + "step": 2400 + }, + { + "epoch": 0.964, + "grad_norm": 0.6591479182243347, + "kl": 4.856285095214844, + "learning_rate": 2.8788888888888895e-06, + "logits/chosen": 33843148.8, + "logits/rejected": 33023673.6, + "logps/chosen": -162.89716796875, + "logps/rejected": -152.8024169921875, + "loss": 0.4535430908203125, + "rewards/chosen": 0.35766189098358153, + "rewards/margins": 0.5051510214805603, + "rewards/rejected": -0.14748913049697876, + "step": 2410 + }, + { + "epoch": 0.968, + "grad_norm": 0.649363100528717, + "kl": 5.67615270614624, + "learning_rate": 2.8677777777777783e-06, + "logits/chosen": 28120470.4, + "logits/rejected": 28187414.4, + "logps/chosen": -148.36553955078125, + "logps/rejected": -164.37691650390624, + "loss": 0.49935593605041506, + "rewards/chosen": -0.13897392749786378, + "rewards/margins": 0.09212601184844971, + "rewards/rejected": -0.23109993934631348, + "step": 2420 + }, + { + "epoch": 0.972, + "grad_norm": 0.7029784321784973, + "kl": 5.981629848480225, + "learning_rate": 2.856666666666667e-06, + "logits/chosen": 33376736.0, + "logits/rejected": 35385472.0, + "logps/chosen": -160.655419921875, + "logps/rejected": -113.14091796875, + "loss": 0.5014323711395263, + "rewards/chosen": -0.06511507034301758, + "rewards/margins": -0.10962846279144288, + "rewards/rejected": 0.044513392448425296, + "step": 2430 + }, + { + "epoch": 0.976, + "grad_norm": 0.5741814970970154, + "kl": 7.015416145324707, + "learning_rate": 2.845555555555556e-06, + "logits/chosen": 24375812.8, + "logits/rejected": 23925715.2, + "logps/chosen": -142.3306884765625, + "logps/rejected": -145.99935302734374, + "loss": 0.47071352005004885, + "rewards/chosen": 0.3844744205474854, + "rewards/margins": 0.33515343666076663, + "rewards/rejected": 0.04932098388671875, + "step": 2440 + }, + { + "epoch": 0.98, + "grad_norm": 0.708365261554718, + "kl": 7.640904426574707, + "learning_rate": 2.8344444444444447e-06, + "logits/chosen": 36292083.2, + "logits/rejected": 33427609.6, + "logps/chosen": -175.033447265625, + "logps/rejected": -175.34300537109374, + "loss": 0.46329379081726074, + "rewards/chosen": 0.707914161682129, + "rewards/margins": 0.43219349384307865, + "rewards/rejected": 0.2757206678390503, + "step": 2450 + }, + { + "epoch": 0.984, + "grad_norm": 0.8229350447654724, + "kl": 6.793179512023926, + "learning_rate": 2.8233333333333335e-06, + "logits/chosen": 34248473.6, + "logits/rejected": 34939712.0, + "logps/chosen": -144.50880126953126, + "logps/rejected": -149.553759765625, + "loss": 0.49341444969177245, + "rewards/chosen": 0.4542993545532227, + "rewards/margins": 0.14487073421478275, + "rewards/rejected": 0.30942862033843993, + "step": 2460 + }, + { + "epoch": 0.988, + "grad_norm": 0.8729678392410278, + "kl": 6.059536933898926, + "learning_rate": 2.8122222222222224e-06, + "logits/chosen": 39128422.4, + "logits/rejected": 35834524.8, + "logps/chosen": -160.21749267578124, + "logps/rejected": -118.927099609375, + "loss": 0.4856616973876953, + "rewards/chosen": 0.2684544324874878, + "rewards/margins": 0.2460126757621765, + "rewards/rejected": 0.02244175672531128, + "step": 2470 + }, + { + "epoch": 0.992, + "grad_norm": 0.7808408737182617, + "kl": 4.119040489196777, + "learning_rate": 2.801111111111111e-06, + "logits/chosen": 14563339.2, + "logits/rejected": 14386867.2, + "logps/chosen": -131.0562255859375, + "logps/rejected": -109.05662841796875, + "loss": 0.50515718460083, + "rewards/chosen": -0.17694272994995117, + "rewards/margins": -0.02005159854888916, + "rewards/rejected": -0.156891131401062, + "step": 2480 + }, + { + "epoch": 0.996, + "grad_norm": 0.7683461904525757, + "kl": 5.681182861328125, + "learning_rate": 2.7900000000000004e-06, + "logits/chosen": 34791257.6, + "logits/rejected": 35001116.8, + "logps/chosen": -116.57052001953124, + "logps/rejected": -133.0627197265625, + "loss": 0.46341490745544434, + "rewards/chosen": 0.3942615032196045, + "rewards/margins": 0.4242114990949631, + "rewards/rejected": -0.02994999587535858, + "step": 2490 + }, + { + "epoch": 1.0, + "grad_norm": 0.7146331667900085, + "kl": 7.186850547790527, + "learning_rate": 2.778888888888889e-06, + "logits/chosen": 27759424.0, + "logits/rejected": 28190390.4, + "logps/chosen": -158.3378173828125, + "logps/rejected": -122.17666015625, + "loss": 0.4907883644104004, + "rewards/chosen": 0.33272812366485593, + "rewards/margins": 3.4856796264637335e-05, + "rewards/rejected": 0.3326932668685913, + "step": 2500 + }, + { + "epoch": 1.004, + "grad_norm": 0.7267434597015381, + "kl": 7.022622108459473, + "learning_rate": 2.767777777777778e-06, + "logits/chosen": 23414056.0, + "logits/rejected": 23530460.8, + "logps/chosen": -142.120947265625, + "logps/rejected": -126.23233642578126, + "loss": 0.450551700592041, + "rewards/chosen": 0.6840017318725586, + "rewards/margins": 0.4234133481979371, + "rewards/rejected": 0.26058838367462156, + "step": 2510 + }, + { + "epoch": 1.008, + "grad_norm": 0.613120436668396, + "kl": 7.363889217376709, + "learning_rate": 2.756666666666667e-06, + "logits/chosen": 42853379.2, + "logits/rejected": 42718368.0, + "logps/chosen": -133.13275146484375, + "logps/rejected": -147.75242919921874, + "loss": 0.4773738384246826, + "rewards/chosen": 0.6898352622985839, + "rewards/margins": 0.2725923061370849, + "rewards/rejected": 0.41724295616149903, + "step": 2520 + }, + { + "epoch": 1.012, + "grad_norm": 0.4656667113304138, + "kl": 6.543205261230469, + "learning_rate": 2.7455555555555556e-06, + "logits/chosen": 24894561.6, + "logits/rejected": 23945945.6, + "logps/chosen": -130.03875732421875, + "logps/rejected": -139.270556640625, + "loss": 0.48987507820129395, + "rewards/chosen": 0.16998794078826904, + "rewards/margins": 0.060191738605499256, + "rewards/rejected": 0.10979620218276978, + "step": 2530 + }, + { + "epoch": 1.016, + "grad_norm": 0.6344980597496033, + "kl": 8.745767593383789, + "learning_rate": 2.7344444444444444e-06, + "logits/chosen": 33636630.4, + "logits/rejected": 33898816.0, + "logps/chosen": -137.0499755859375, + "logps/rejected": -142.44915771484375, + "loss": 0.46440706253051756, + "rewards/chosen": 0.7247509479522705, + "rewards/margins": 0.3660990238189697, + "rewards/rejected": 0.35865192413330077, + "step": 2540 + }, + { + "epoch": 1.02, + "grad_norm": 0.5636667013168335, + "kl": 5.15373420715332, + "learning_rate": 2.7233333333333332e-06, + "logits/chosen": 32778352.0, + "logits/rejected": 34006931.2, + "logps/chosen": -138.170361328125, + "logps/rejected": -156.81767578125, + "loss": 0.4536026954650879, + "rewards/chosen": 0.28399336338043213, + "rewards/margins": 0.5328751564025879, + "rewards/rejected": -0.24888179302215577, + "step": 2550 + }, + { + "epoch": 1.024, + "grad_norm": 0.5508406758308411, + "kl": 4.445399284362793, + "learning_rate": 2.712222222222222e-06, + "logits/chosen": 24235310.4, + "logits/rejected": 20467011.2, + "logps/chosen": -102.88858642578126, + "logps/rejected": -117.8940673828125, + "loss": 0.4521032333374023, + "rewards/chosen": 0.2849747180938721, + "rewards/margins": 0.6732351303100585, + "rewards/rejected": -0.3882604122161865, + "step": 2560 + }, + { + "epoch": 1.028, + "grad_norm": 0.6794329881668091, + "kl": 7.5771074295043945, + "learning_rate": 2.7011111111111117e-06, + "logits/chosen": 39230246.4, + "logits/rejected": 36269590.4, + "logps/chosen": -160.77816162109374, + "logps/rejected": -175.435302734375, + "loss": 0.4510225296020508, + "rewards/chosen": 0.48538646697998045, + "rewards/margins": 0.5213055074214935, + "rewards/rejected": -0.03591904044151306, + "step": 2570 + }, + { + "epoch": 1.032, + "grad_norm": 0.873762845993042, + "kl": 7.515707969665527, + "learning_rate": 2.6900000000000005e-06, + "logits/chosen": 34034691.2, + "logits/rejected": 32782438.4, + "logps/chosen": -146.67379150390624, + "logps/rejected": -160.89415283203124, + "loss": 0.4409791946411133, + "rewards/chosen": 0.8245258331298828, + "rewards/margins": 0.5631396770477295, + "rewards/rejected": 0.2613861560821533, + "step": 2580 + }, + { + "epoch": 1.036, + "grad_norm": 0.8786899447441101, + "kl": 7.477902889251709, + "learning_rate": 2.6788888888888893e-06, + "logits/chosen": 31405507.2, + "logits/rejected": 30546566.4, + "logps/chosen": -162.37747802734376, + "logps/rejected": -158.5367919921875, + "loss": 0.429317569732666, + "rewards/chosen": 0.7475490093231201, + "rewards/margins": 0.6858846783638, + "rewards/rejected": 0.06166433095932007, + "step": 2590 + }, + { + "epoch": 1.04, + "grad_norm": 0.9408835768699646, + "kl": 6.193826198577881, + "learning_rate": 2.667777777777778e-06, + "logits/chosen": 18314294.4, + "logits/rejected": 17261046.4, + "logps/chosen": -133.769482421875, + "logps/rejected": -166.3760498046875, + "loss": 0.4750513553619385, + "rewards/chosen": 0.05294798612594605, + "rewards/margins": 0.23716256618499754, + "rewards/rejected": -0.1842145800590515, + "step": 2600 + }, + { + "epoch": 1.04, + "eval_kl": 6.557363986968994, + "eval_logits/chosen": 28159451.136, + "eval_logits/rejected": 27912509.44, + "eval_logps/chosen": -150.601921875, + "eval_logps/rejected": -146.599171875, + "eval_loss": 0.4829034209251404, + "eval_rewards/chosen": 0.3723853454589844, + "eval_rewards/margins": 0.16802960205078127, + "eval_rewards/rejected": 0.20435574340820312, + "eval_runtime": 217.1791, + "eval_samples_per_second": 4.604, + "eval_steps_per_second": 2.302, + "step": 2600 + }, + { + "epoch": 1.044, + "grad_norm": 0.6559078693389893, + "kl": 6.465074062347412, + "learning_rate": 2.656666666666667e-06, + "logits/chosen": 29268515.2, + "logits/rejected": 27382860.8, + "logps/chosen": -124.25496826171874, + "logps/rejected": -132.88599853515626, + "loss": 0.46654496192932127, + "rewards/chosen": 0.6311461448669433, + "rewards/margins": 0.35537390708923333, + "rewards/rejected": 0.27577223777771, + "step": 2610 + }, + { + "epoch": 1.048, + "grad_norm": 0.8091041445732117, + "kl": 10.043633460998535, + "learning_rate": 2.6455555555555557e-06, + "logits/chosen": 36276444.8, + "logits/rejected": 36037961.6, + "logps/chosen": -156.52952880859374, + "logps/rejected": -128.44246826171874, + "loss": 0.49141683578491213, + "rewards/chosen": 0.5628880023956299, + "rewards/margins": 0.02750706672668457, + "rewards/rejected": 0.5353809356689453, + "step": 2620 + }, + { + "epoch": 1.052, + "grad_norm": 0.6571462154388428, + "kl": 9.852654457092285, + "learning_rate": 2.6344444444444445e-06, + "logits/chosen": 31615228.8, + "logits/rejected": 32248979.2, + "logps/chosen": -154.07755126953126, + "logps/rejected": -132.39072265625, + "loss": 0.49523077011108396, + "rewards/chosen": 0.8096317291259766, + "rewards/margins": 0.018443870544433638, + "rewards/rejected": 0.791187858581543, + "step": 2630 + }, + { + "epoch": 1.056, + "grad_norm": 0.7390360832214355, + "kl": 6.219546318054199, + "learning_rate": 2.6233333333333333e-06, + "logits/chosen": 34434326.4, + "logits/rejected": 35229926.4, + "logps/chosen": -137.31668701171876, + "logps/rejected": -155.1895751953125, + "loss": 0.4699239730834961, + "rewards/chosen": 0.5334546089172363, + "rewards/margins": 0.24749846458435054, + "rewards/rejected": 0.28595614433288574, + "step": 2640 + }, + { + "epoch": 1.06, + "grad_norm": 0.4814999997615814, + "kl": 7.876091003417969, + "learning_rate": 2.6122222222222226e-06, + "logits/chosen": 31323043.2, + "logits/rejected": 32345257.6, + "logps/chosen": -123.53333740234375, + "logps/rejected": -115.850341796875, + "loss": 0.43686504364013673, + "rewards/chosen": 0.7052061557769775, + "rewards/margins": 0.533865237236023, + "rewards/rejected": 0.1713409185409546, + "step": 2650 + }, + { + "epoch": 1.064, + "grad_norm": 0.690242350101471, + "kl": 6.12372350692749, + "learning_rate": 2.6011111111111114e-06, + "logits/chosen": 25323872.0, + "logits/rejected": 25615576.0, + "logps/chosen": -141.92086181640624, + "logps/rejected": -147.47308349609375, + "loss": 0.4814923763275146, + "rewards/chosen": 0.1965832829475403, + "rewards/margins": 0.2016111582517624, + "rewards/rejected": -0.005027875304222107, + "step": 2660 + }, + { + "epoch": 1.068, + "grad_norm": 0.5838690400123596, + "kl": 5.5848894119262695, + "learning_rate": 2.59e-06, + "logits/chosen": 26498441.6, + "logits/rejected": 25519251.2, + "logps/chosen": -134.1501220703125, + "logps/rejected": -152.97462158203126, + "loss": 0.44647746086120604, + "rewards/chosen": 0.45215396881103515, + "rewards/margins": 0.7281685590744018, + "rewards/rejected": -0.2760145902633667, + "step": 2670 + }, + { + "epoch": 1.072, + "grad_norm": 0.6714196801185608, + "kl": 6.437767028808594, + "learning_rate": 2.578888888888889e-06, + "logits/chosen": 33809168.0, + "logits/rejected": 33002240.0, + "logps/chosen": -140.62269287109376, + "logps/rejected": -169.9681640625, + "loss": 0.46931910514831543, + "rewards/chosen": 0.5085949420928955, + "rewards/margins": 0.36951395273208615, + "rewards/rejected": 0.13908098936080932, + "step": 2680 + }, + { + "epoch": 1.076, + "grad_norm": 0.6106992959976196, + "kl": 4.450573921203613, + "learning_rate": 2.567777777777778e-06, + "logits/chosen": 30799760.0, + "logits/rejected": 30721590.4, + "logps/chosen": -122.03265380859375, + "logps/rejected": -137.297802734375, + "loss": 0.4664300441741943, + "rewards/chosen": 0.25887534618377683, + "rewards/margins": 0.22206425368785856, + "rewards/rejected": 0.036811092495918275, + "step": 2690 + }, + { + "epoch": 1.08, + "grad_norm": 0.9053173661231995, + "kl": 6.650594234466553, + "learning_rate": 2.5566666666666666e-06, + "logits/chosen": 38100038.4, + "logits/rejected": 34380816.0, + "logps/chosen": -145.046533203125, + "logps/rejected": -173.3657958984375, + "loss": 0.43851666450500487, + "rewards/chosen": 0.6501460552215577, + "rewards/margins": 0.6605178594589234, + "rewards/rejected": -0.010371804237365723, + "step": 2700 + }, + { + "epoch": 1.084, + "grad_norm": 0.676274836063385, + "kl": 3.7379002571105957, + "learning_rate": 2.5455555555555554e-06, + "logits/chosen": 19580768.0, + "logits/rejected": 20510683.2, + "logps/chosen": -145.21326904296876, + "logps/rejected": -136.80472412109376, + "loss": 0.5046684741973877, + "rewards/chosen": -0.46685400009155276, + "rewards/margins": -0.04879570007324219, + "rewards/rejected": -0.41805830001831057, + "step": 2710 + }, + { + "epoch": 1.088, + "grad_norm": 0.7222055792808533, + "kl": 5.816348075866699, + "learning_rate": 2.534444444444445e-06, + "logits/chosen": 22407427.2, + "logits/rejected": 21474601.6, + "logps/chosen": -100.11134643554688, + "logps/rejected": -127.4869384765625, + "loss": 0.4877651214599609, + "rewards/chosen": 0.17378766536712648, + "rewards/margins": 0.2759766340255737, + "rewards/rejected": -0.10218896865844726, + "step": 2720 + }, + { + "epoch": 1.092, + "grad_norm": 0.6274256110191345, + "kl": 5.171383857727051, + "learning_rate": 2.523333333333334e-06, + "logits/chosen": 35835520.0, + "logits/rejected": 35165875.2, + "logps/chosen": -186.44775390625, + "logps/rejected": -167.0265869140625, + "loss": 0.466900634765625, + "rewards/chosen": -0.10870237350463867, + "rewards/margins": 0.21330931186676022, + "rewards/rejected": -0.3220116853713989, + "step": 2730 + }, + { + "epoch": 1.096, + "grad_norm": 0.6451675295829773, + "kl": 5.268320083618164, + "learning_rate": 2.5122222222222227e-06, + "logits/chosen": 32315388.8, + "logits/rejected": 34424963.2, + "logps/chosen": -177.06943359375, + "logps/rejected": -168.0710205078125, + "loss": 0.47601852416992185, + "rewards/chosen": -0.004057984054088593, + "rewards/margins": 0.10740263015031815, + "rewards/rejected": -0.11146061420440674, + "step": 2740 + }, + { + "epoch": 1.1, + "grad_norm": 0.9512624740600586, + "kl": 5.623786926269531, + "learning_rate": 2.5011111111111115e-06, + "logits/chosen": 24552736.0, + "logits/rejected": 23781004.8, + "logps/chosen": -174.98658447265626, + "logps/rejected": -152.33095703125, + "loss": 0.42731657028198244, + "rewards/chosen": 0.17493221759796143, + "rewards/margins": 0.8694432020187378, + "rewards/rejected": -0.6945109844207764, + "step": 2750 + }, + { + "epoch": 1.104, + "grad_norm": 0.8605503439903259, + "kl": 3.258263111114502, + "learning_rate": 2.4900000000000003e-06, + "logits/chosen": 17540195.2, + "logits/rejected": 14481008.0, + "logps/chosen": -130.9520263671875, + "logps/rejected": -140.87447509765624, + "loss": 0.4474031925201416, + "rewards/chosen": -0.1570604920387268, + "rewards/margins": 0.7482632040977477, + "rewards/rejected": -0.9053236961364746, + "step": 2760 + }, + { + "epoch": 1.108, + "grad_norm": 0.8902762532234192, + "kl": 6.445823669433594, + "learning_rate": 2.478888888888889e-06, + "logits/chosen": 25843875.2, + "logits/rejected": 25419281.6, + "logps/chosen": -155.84178466796874, + "logps/rejected": -156.02589111328126, + "loss": 0.464084529876709, + "rewards/chosen": 0.1782880425453186, + "rewards/margins": 0.5563196301460266, + "rewards/rejected": -0.378031587600708, + "step": 2770 + }, + { + "epoch": 1.112, + "grad_norm": 0.7916592359542847, + "kl": 5.980124473571777, + "learning_rate": 2.467777777777778e-06, + "logits/chosen": 23357057.6, + "logits/rejected": 19639329.6, + "logps/chosen": -165.2984619140625, + "logps/rejected": -144.48494873046874, + "loss": 0.44487595558166504, + "rewards/chosen": 0.28723764419555664, + "rewards/margins": 0.6892455577850342, + "rewards/rejected": -0.4020079135894775, + "step": 2780 + }, + { + "epoch": 1.116, + "grad_norm": 0.5942727327346802, + "kl": 7.625303745269775, + "learning_rate": 2.4566666666666667e-06, + "logits/chosen": 22922739.2, + "logits/rejected": 22430774.4, + "logps/chosen": -139.4244873046875, + "logps/rejected": -147.5355224609375, + "loss": 0.43770174980163573, + "rewards/chosen": 0.7275904655456543, + "rewards/margins": 0.641109848022461, + "rewards/rejected": 0.08648061752319336, + "step": 2790 + }, + { + "epoch": 1.12, + "grad_norm": 0.6265588402748108, + "kl": 5.79810094833374, + "learning_rate": 2.4455555555555555e-06, + "logits/chosen": 33326956.8, + "logits/rejected": 33234153.6, + "logps/chosen": -165.5775634765625, + "logps/rejected": -181.19012451171875, + "loss": 0.46999220848083495, + "rewards/chosen": 0.3356909275054932, + "rewards/margins": 0.41499342918396, + "rewards/rejected": -0.0793025016784668, + "step": 2800 + }, + { + "epoch": 1.12, + "eval_kl": 6.51247501373291, + "eval_logits/chosen": 31125557.248, + "eval_logits/rejected": 30991392.768, + "eval_logps/chosen": -150.6563125, + "eval_logps/rejected": -146.877875, + "eval_loss": 0.480685293674469, + "eval_rewards/chosen": 0.36694525146484375, + "eval_rewards/margins": 0.19045977783203125, + "eval_rewards/rejected": 0.1764854736328125, + "eval_runtime": 217.0415, + "eval_samples_per_second": 4.607, + "eval_steps_per_second": 2.304, + "step": 2800 + }, + { + "epoch": 1.124, + "grad_norm": 0.6718530654907227, + "kl": 7.274069309234619, + "learning_rate": 2.4344444444444448e-06, + "logits/chosen": 28781395.2, + "logits/rejected": 28910336.0, + "logps/chosen": -142.17664794921876, + "logps/rejected": -153.1429443359375, + "loss": 0.48170881271362304, + "rewards/chosen": 0.4687415599822998, + "rewards/margins": 0.17820630073547367, + "rewards/rejected": 0.29053525924682616, + "step": 2810 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.8993642330169678, + "kl": 5.114150047302246, + "learning_rate": 2.4233333333333336e-06, + "logits/chosen": 31094944.0, + "logits/rejected": 29708396.8, + "logps/chosen": -158.43695068359375, + "logps/rejected": -134.23763427734374, + "loss": 0.4788343906402588, + "rewards/chosen": 0.18828521966934203, + "rewards/margins": 0.20167077183723447, + "rewards/rejected": -0.013385552167892455, + "step": 2820 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 0.5459766387939453, + "kl": 6.137971878051758, + "learning_rate": 2.4122222222222224e-06, + "logits/chosen": 41590592.0, + "logits/rejected": 39072947.2, + "logps/chosen": -145.41527099609374, + "logps/rejected": -144.8617919921875, + "loss": 0.40869617462158203, + "rewards/chosen": 0.8426953315734863, + "rewards/margins": 0.9073116958141326, + "rewards/rejected": -0.06461636424064636, + "step": 2830 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.6354557275772095, + "kl": 5.928516387939453, + "learning_rate": 2.401111111111111e-06, + "logits/chosen": 34861424.0, + "logits/rejected": 34759747.2, + "logps/chosen": -129.4930908203125, + "logps/rejected": -143.89154052734375, + "loss": 0.44280567169189455, + "rewards/chosen": 0.5408330440521241, + "rewards/margins": 0.5000333577394486, + "rewards/rejected": 0.04079968631267548, + "step": 2840 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.5932331085205078, + "kl": 8.782042503356934, + "learning_rate": 2.39e-06, + "logits/chosen": 32643923.2, + "logits/rejected": 30833843.2, + "logps/chosen": -127.0966064453125, + "logps/rejected": -164.51356201171876, + "loss": 0.4796291351318359, + "rewards/chosen": 0.8198535919189454, + "rewards/margins": 0.17787570953369147, + "rewards/rejected": 0.6419778823852539, + "step": 2850 + }, + { + "epoch": 1.144, + "grad_norm": 0.4119536578655243, + "kl": 6.023087024688721, + "learning_rate": 2.3788888888888892e-06, + "logits/chosen": 34157424.0, + "logits/rejected": 33897881.6, + "logps/chosen": -142.58350830078126, + "logps/rejected": -112.63641357421875, + "loss": 0.44989490509033203, + "rewards/chosen": 0.5664341926574707, + "rewards/margins": 0.5420472577214241, + "rewards/rejected": 0.0243869349360466, + "step": 2860 + }, + { + "epoch": 1.148, + "grad_norm": 0.6733080744743347, + "kl": 7.073210716247559, + "learning_rate": 2.367777777777778e-06, + "logits/chosen": 32286940.8, + "logits/rejected": 32065385.6, + "logps/chosen": -144.9827392578125, + "logps/rejected": -150.515478515625, + "loss": 0.4916172981262207, + "rewards/chosen": 0.20184409618377686, + "rewards/margins": 0.11459586024284363, + "rewards/rejected": 0.08724823594093323, + "step": 2870 + }, + { + "epoch": 1.152, + "grad_norm": 1.03435480594635, + "kl": 6.8236589431762695, + "learning_rate": 2.356666666666667e-06, + "logits/chosen": 29720467.2, + "logits/rejected": 29326092.8, + "logps/chosen": -151.82506103515624, + "logps/rejected": -165.98399658203124, + "loss": 0.45775256156921384, + "rewards/chosen": 0.4520999908447266, + "rewards/margins": 0.2227289915084839, + "rewards/rejected": 0.22937099933624266, + "step": 2880 + }, + { + "epoch": 1.156, + "grad_norm": 0.5883368849754333, + "kl": 6.491732120513916, + "learning_rate": 2.3455555555555556e-06, + "logits/chosen": 23835811.2, + "logits/rejected": 23859849.6, + "logps/chosen": -170.26668701171874, + "logps/rejected": -150.8224609375, + "loss": 0.431504487991333, + "rewards/chosen": 0.614830207824707, + "rewards/margins": 0.6636435002088547, + "rewards/rejected": -0.048813292384147645, + "step": 2890 + }, + { + "epoch": 1.16, + "grad_norm": 0.7225193977355957, + "kl": 5.883708477020264, + "learning_rate": 2.334444444444445e-06, + "logits/chosen": 36251644.8, + "logits/rejected": 33482300.8, + "logps/chosen": -145.1569091796875, + "logps/rejected": -140.39755859375, + "loss": 0.4224736213684082, + "rewards/chosen": 0.4660323619842529, + "rewards/margins": 0.918277359008789, + "rewards/rejected": -0.45224499702453613, + "step": 2900 + }, + { + "epoch": 1.164, + "grad_norm": 0.550423800945282, + "kl": 6.711850643157959, + "learning_rate": 2.3233333333333337e-06, + "logits/chosen": 37449692.8, + "logits/rejected": 35041868.8, + "logps/chosen": -142.85999755859376, + "logps/rejected": -148.87371826171875, + "loss": 0.45527114868164065, + "rewards/chosen": 0.5728636741638183, + "rewards/margins": 0.5280719608068466, + "rewards/rejected": 0.04479171335697174, + "step": 2910 + }, + { + "epoch": 1.168, + "grad_norm": 0.8112925887107849, + "kl": 3.537787675857544, + "learning_rate": 2.3122222222222225e-06, + "logits/chosen": 24047856.0, + "logits/rejected": 23857336.0, + "logps/chosen": -164.10267333984376, + "logps/rejected": -137.96185302734375, + "loss": 0.44495596885681155, + "rewards/chosen": -0.09260135293006896, + "rewards/margins": 0.5166131436824798, + "rewards/rejected": -0.6092144966125488, + "step": 2920 + }, + { + "epoch": 1.172, + "grad_norm": 0.5182350277900696, + "kl": 5.1398210525512695, + "learning_rate": 2.3011111111111113e-06, + "logits/chosen": 24040510.4, + "logits/rejected": 25510265.6, + "logps/chosen": -170.7597412109375, + "logps/rejected": -124.09306640625, + "loss": 0.45585017204284667, + "rewards/chosen": 0.017396342754364014, + "rewards/margins": 0.4217635035514832, + "rewards/rejected": -0.40436716079711915, + "step": 2930 + }, + { + "epoch": 1.176, + "grad_norm": 0.7129570841789246, + "kl": 5.55086612701416, + "learning_rate": 2.29e-06, + "logits/chosen": 36301065.6, + "logits/rejected": 36730444.8, + "logps/chosen": -144.58111572265625, + "logps/rejected": -170.131298828125, + "loss": 0.45364060401916506, + "rewards/chosen": 0.47618856430053713, + "rewards/margins": 0.4523512840270996, + "rewards/rejected": 0.0238372802734375, + "step": 2940 + }, + { + "epoch": 1.18, + "grad_norm": 0.689380943775177, + "kl": 5.013358116149902, + "learning_rate": 2.278888888888889e-06, + "logits/chosen": 26987398.4, + "logits/rejected": 27705705.6, + "logps/chosen": -97.45775146484375, + "logps/rejected": -156.65115966796876, + "loss": 0.4797633171081543, + "rewards/chosen": 0.1765173554420471, + "rewards/margins": 0.27975412607192995, + "rewards/rejected": -0.10323677062988282, + "step": 2950 + }, + { + "epoch": 1.184, + "grad_norm": 0.5471240282058716, + "kl": 8.275094985961914, + "learning_rate": 2.2677777777777777e-06, + "logits/chosen": 40253052.8, + "logits/rejected": 37156057.6, + "logps/chosen": -157.5787109375, + "logps/rejected": -178.797021484375, + "loss": 0.46055126190185547, + "rewards/chosen": 0.7023271083831787, + "rewards/margins": 0.4890592336654663, + "rewards/rejected": 0.2132678747177124, + "step": 2960 + }, + { + "epoch": 1.188, + "grad_norm": 0.5812104344367981, + "kl": 6.715832710266113, + "learning_rate": 2.2566666666666665e-06, + "logits/chosen": 43975625.6, + "logits/rejected": 41445318.4, + "logps/chosen": -171.347216796875, + "logps/rejected": -180.57864990234376, + "loss": 0.4628589630126953, + "rewards/chosen": 0.22713685035705566, + "rewards/margins": 0.5415925025939942, + "rewards/rejected": -0.3144556522369385, + "step": 2970 + }, + { + "epoch": 1.192, + "grad_norm": 0.8072389364242554, + "kl": 5.419320583343506, + "learning_rate": 2.2455555555555557e-06, + "logits/chosen": 37647660.8, + "logits/rejected": 36517382.4, + "logps/chosen": -149.52344970703126, + "logps/rejected": -164.54786376953126, + "loss": 0.44607295989990237, + "rewards/chosen": 0.300301718711853, + "rewards/margins": 0.5973361253738403, + "rewards/rejected": -0.2970344066619873, + "step": 2980 + }, + { + "epoch": 1.196, + "grad_norm": 0.44364601373672485, + "kl": 6.054505825042725, + "learning_rate": 2.2344444444444446e-06, + "logits/chosen": 22235811.2, + "logits/rejected": 19985859.2, + "logps/chosen": -143.41585693359374, + "logps/rejected": -150.84788818359374, + "loss": 0.41753711700439455, + "rewards/chosen": 0.5826003074645996, + "rewards/margins": 0.9533658504486083, + "rewards/rejected": -0.37076554298400877, + "step": 2990 + }, + { + "epoch": 1.2, + "grad_norm": 0.541634202003479, + "kl": 8.647984504699707, + "learning_rate": 2.2233333333333334e-06, + "logits/chosen": 38845638.4, + "logits/rejected": 38343161.6, + "logps/chosen": -171.0084228515625, + "logps/rejected": -129.40447998046875, + "loss": 0.4108599662780762, + "rewards/chosen": 1.1798659324645997, + "rewards/margins": 0.8690209865570069, + "rewards/rejected": 0.31084494590759276, + "step": 3000 + }, + { + "epoch": 1.2, + "eval_kl": 5.96298360824585, + "eval_logits/chosen": 32690919.424, + "eval_logits/rejected": 32526981.12, + "eval_logps/chosen": -150.91603125, + "eval_logps/rejected": -147.333453125, + "eval_loss": 0.4783514738082886, + "eval_rewards/chosen": 0.3409757690429687, + "eval_rewards/margins": 0.21004846191406248, + "eval_rewards/rejected": 0.13092730712890624, + "eval_runtime": 216.7869, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 3000 + }, + { + "epoch": 1.204, + "grad_norm": 0.735106348991394, + "kl": 4.565227508544922, + "learning_rate": 2.212222222222222e-06, + "logits/chosen": 42755068.8, + "logits/rejected": 41808000.0, + "logps/chosen": -144.0309814453125, + "logps/rejected": -162.88955078125, + "loss": 0.4483033180236816, + "rewards/chosen": 0.3165964841842651, + "rewards/margins": 0.5639133214950561, + "rewards/rejected": -0.24731683731079102, + "step": 3010 + }, + { + "epoch": 1.208, + "grad_norm": 0.4859018623828888, + "kl": 6.566149711608887, + "learning_rate": 2.2011111111111114e-06, + "logits/chosen": 40423686.4, + "logits/rejected": 38088403.2, + "logps/chosen": -152.13575439453126, + "logps/rejected": -164.60054931640624, + "loss": 0.43638858795166013, + "rewards/chosen": 0.7344797134399415, + "rewards/margins": 0.7190215766429902, + "rewards/rejected": 0.015458136796951294, + "step": 3020 + }, + { + "epoch": 1.212, + "grad_norm": 0.5802381634712219, + "kl": 5.501416206359863, + "learning_rate": 2.19e-06, + "logits/chosen": 36018793.6, + "logits/rejected": 34128659.2, + "logps/chosen": -139.50609130859374, + "logps/rejected": -176.41171875, + "loss": 0.48515634536743163, + "rewards/chosen": 0.2475515365600586, + "rewards/margins": 0.1769363284111023, + "rewards/rejected": 0.0706152081489563, + "step": 3030 + }, + { + "epoch": 1.216, + "grad_norm": 0.9714307188987732, + "kl": 7.294144630432129, + "learning_rate": 2.178888888888889e-06, + "logits/chosen": 29871635.2, + "logits/rejected": 28148294.4, + "logps/chosen": -144.24107666015624, + "logps/rejected": -157.43570556640626, + "loss": 0.4299461364746094, + "rewards/chosen": 0.7857762336730957, + "rewards/margins": 0.6812341213226318, + "rewards/rejected": 0.10454211235046387, + "step": 3040 + }, + { + "epoch": 1.22, + "grad_norm": 0.6073058247566223, + "kl": 5.890301704406738, + "learning_rate": 2.1677777777777782e-06, + "logits/chosen": 36303308.8, + "logits/rejected": 36356560.0, + "logps/chosen": -151.838623046875, + "logps/rejected": -171.02061767578124, + "loss": 0.4609498977661133, + "rewards/chosen": 0.38731932640075684, + "rewards/margins": 0.6510258436203002, + "rewards/rejected": -0.26370651721954347, + "step": 3050 + }, + { + "epoch": 1.224, + "grad_norm": 0.5871905088424683, + "kl": 5.802463531494141, + "learning_rate": 2.156666666666667e-06, + "logits/chosen": 28059456.0, + "logits/rejected": 26678612.8, + "logps/chosen": -159.48240966796874, + "logps/rejected": -153.2945068359375, + "loss": 0.43854827880859376, + "rewards/chosen": 0.4585693359375, + "rewards/margins": 0.676022219657898, + "rewards/rejected": -0.21745288372039795, + "step": 3060 + }, + { + "epoch": 1.228, + "grad_norm": 0.4357960522174835, + "kl": 4.12323522567749, + "learning_rate": 2.145555555555556e-06, + "logits/chosen": 34136323.2, + "logits/rejected": 33183904.0, + "logps/chosen": -146.4657958984375, + "logps/rejected": -139.709375, + "loss": 0.43050317764282225, + "rewards/chosen": 0.3769852876663208, + "rewards/margins": 0.9553431749343871, + "rewards/rejected": -0.5783578872680664, + "step": 3070 + }, + { + "epoch": 1.232, + "grad_norm": 0.47064968943595886, + "kl": 7.40407657623291, + "learning_rate": 2.1344444444444447e-06, + "logits/chosen": 42982582.4, + "logits/rejected": 40440524.8, + "logps/chosen": -142.09869384765625, + "logps/rejected": -158.719091796875, + "loss": 0.42528462409973145, + "rewards/chosen": 0.7800788879394531, + "rewards/margins": 0.7373038113117218, + "rewards/rejected": 0.04277507662773132, + "step": 3080 + }, + { + "epoch": 1.236, + "grad_norm": 0.7308046221733093, + "kl": 7.210939884185791, + "learning_rate": 2.1233333333333335e-06, + "logits/chosen": 28548304.0, + "logits/rejected": 28032595.2, + "logps/chosen": -122.3899658203125, + "logps/rejected": -133.5687744140625, + "loss": 0.4277194976806641, + "rewards/chosen": 0.8977908134460449, + "rewards/margins": 0.6939298629760742, + "rewards/rejected": 0.20386095046997071, + "step": 3090 + }, + { + "epoch": 1.24, + "grad_norm": 0.8042078614234924, + "kl": 6.6927170753479, + "learning_rate": 2.1122222222222223e-06, + "logits/chosen": 27459798.4, + "logits/rejected": 29134211.2, + "logps/chosen": -143.5735107421875, + "logps/rejected": -149.86500244140626, + "loss": 0.46234517097473143, + "rewards/chosen": 0.5464875221252441, + "rewards/margins": 0.39129424095153803, + "rewards/rejected": 0.15519328117370607, + "step": 3100 + }, + { + "epoch": 1.244, + "grad_norm": 0.5080249309539795, + "kl": 7.83388614654541, + "learning_rate": 2.101111111111111e-06, + "logits/chosen": 44153635.2, + "logits/rejected": 41055212.8, + "logps/chosen": -187.93551025390624, + "logps/rejected": -171.871630859375, + "loss": 0.4076192378997803, + "rewards/chosen": 0.8624818801879883, + "rewards/margins": 1.0380724906921388, + "rewards/rejected": -0.1755906105041504, + "step": 3110 + }, + { + "epoch": 1.248, + "grad_norm": 0.7141006588935852, + "kl": 6.444394111633301, + "learning_rate": 2.09e-06, + "logits/chosen": 22851744.0, + "logits/rejected": 21574438.4, + "logps/chosen": -135.523486328125, + "logps/rejected": -137.11583251953124, + "loss": 0.4397727966308594, + "rewards/chosen": 0.6564054965972901, + "rewards/margins": 0.65584604293108, + "rewards/rejected": 0.0005594536662101746, + "step": 3120 + }, + { + "epoch": 1.252, + "grad_norm": 0.6408936977386475, + "kl": 8.526594161987305, + "learning_rate": 2.078888888888889e-06, + "logits/chosen": 37600892.8, + "logits/rejected": 38453331.2, + "logps/chosen": -134.79913330078125, + "logps/rejected": -169.9215087890625, + "loss": 0.47039794921875, + "rewards/chosen": 0.816160011291504, + "rewards/margins": 0.25816926956176767, + "rewards/rejected": 0.5579907417297363, + "step": 3130 + }, + { + "epoch": 1.256, + "grad_norm": 0.7661492228507996, + "kl": 7.337412357330322, + "learning_rate": 2.067777777777778e-06, + "logits/chosen": 37269350.4, + "logits/rejected": 36887142.4, + "logps/chosen": -185.44752197265626, + "logps/rejected": -150.3639404296875, + "loss": 0.4498098850250244, + "rewards/chosen": 0.6579087257385254, + "rewards/margins": 0.5174487590789796, + "rewards/rejected": 0.1404599666595459, + "step": 3140 + }, + { + "epoch": 1.26, + "grad_norm": 0.5819596648216248, + "kl": 9.288617134094238, + "learning_rate": 2.0566666666666667e-06, + "logits/chosen": 41466233.6, + "logits/rejected": 39139404.8, + "logps/chosen": -120.82750244140625, + "logps/rejected": -131.20218505859376, + "loss": 0.46291580200195315, + "rewards/chosen": 0.883179759979248, + "rewards/margins": 0.3774709701538086, + "rewards/rejected": 0.5057087898254394, + "step": 3150 + }, + { + "epoch": 1.264, + "grad_norm": 0.46087002754211426, + "kl": 8.114151000976562, + "learning_rate": 2.0455555555555555e-06, + "logits/chosen": 54930982.4, + "logits/rejected": 55207168.0, + "logps/chosen": -161.81624755859374, + "logps/rejected": -131.8754638671875, + "loss": 0.4237666130065918, + "rewards/chosen": 1.0106356620788575, + "rewards/margins": 0.6930557727813721, + "rewards/rejected": 0.3175798892974854, + "step": 3160 + }, + { + "epoch": 1.268, + "grad_norm": 0.6536068320274353, + "kl": 9.215258598327637, + "learning_rate": 2.0344444444444448e-06, + "logits/chosen": 40884854.4, + "logits/rejected": 41148396.8, + "logps/chosen": -174.63795166015626, + "logps/rejected": -183.98502197265626, + "loss": 0.410884428024292, + "rewards/chosen": 1.0293194770812988, + "rewards/margins": 0.9003942966461181, + "rewards/rejected": 0.12892518043518067, + "step": 3170 + }, + { + "epoch": 1.272, + "grad_norm": 0.6242780089378357, + "kl": 5.226318359375, + "learning_rate": 2.0233333333333336e-06, + "logits/chosen": 39783075.2, + "logits/rejected": 38742624.0, + "logps/chosen": -140.7747314453125, + "logps/rejected": -128.35888671875, + "loss": 0.4299330234527588, + "rewards/chosen": 0.5863895893096924, + "rewards/margins": 0.637285441160202, + "rewards/rejected": -0.05089585185050964, + "step": 3180 + }, + { + "epoch": 1.276, + "grad_norm": 0.46700039505958557, + "kl": 7.267230033874512, + "learning_rate": 2.0122222222222224e-06, + "logits/chosen": 41375708.8, + "logits/rejected": 38081993.6, + "logps/chosen": -153.80274658203126, + "logps/rejected": -176.299755859375, + "loss": 0.460453462600708, + "rewards/chosen": 0.6854756832122803, + "rewards/margins": 0.3931422710418701, + "rewards/rejected": 0.29233341217041015, + "step": 3190 + }, + { + "epoch": 1.28, + "grad_norm": 0.6310598850250244, + "kl": 5.743724346160889, + "learning_rate": 2.001111111111111e-06, + "logits/chosen": 34063660.8, + "logits/rejected": 31735603.2, + "logps/chosen": -153.7698974609375, + "logps/rejected": -144.71922607421874, + "loss": 0.46727771759033204, + "rewards/chosen": 0.4247100830078125, + "rewards/margins": 0.3391889691352844, + "rewards/rejected": 0.08552111387252807, + "step": 3200 + }, + { + "epoch": 1.28, + "eval_kl": 6.358611106872559, + "eval_logits/chosen": 35785789.44, + "eval_logits/rejected": 35595558.912, + "eval_logps/chosen": -149.448609375, + "eval_logps/rejected": -146.011734375, + "eval_loss": 0.47684431076049805, + "eval_rewards/chosen": 0.48771685791015623, + "eval_rewards/margins": 0.22461712646484372, + "eval_rewards/rejected": 0.2630997314453125, + "eval_runtime": 216.7977, + "eval_samples_per_second": 4.613, + "eval_steps_per_second": 2.306, + "step": 3200 + }, + { + "epoch": 1.284, + "grad_norm": 0.5444361567497253, + "kl": 8.359007835388184, + "learning_rate": 1.9900000000000004e-06, + "logits/chosen": 38480896.0, + "logits/rejected": 37109968.0, + "logps/chosen": -166.1022705078125, + "logps/rejected": -135.0826416015625, + "loss": 0.4136789321899414, + "rewards/chosen": 1.1088358879089355, + "rewards/margins": 0.8896709442138672, + "rewards/rejected": 0.21916494369506836, + "step": 3210 + }, + { + "epoch": 1.288, + "grad_norm": 0.47785794734954834, + "kl": 5.674698829650879, + "learning_rate": 1.9788888888888892e-06, + "logits/chosen": 30067932.8, + "logits/rejected": 30675318.4, + "logps/chosen": -119.010009765625, + "logps/rejected": -125.2324462890625, + "loss": 0.4624322891235352, + "rewards/chosen": 0.3068151712417603, + "rewards/margins": 0.3403205394744873, + "rewards/rejected": -0.033505368232727054, + "step": 3220 + }, + { + "epoch": 1.292, + "grad_norm": 0.5500114560127258, + "kl": 6.791600704193115, + "learning_rate": 1.967777777777778e-06, + "logits/chosen": 34258720.0, + "logits/rejected": 31316112.0, + "logps/chosen": -148.19810791015624, + "logps/rejected": -154.41190185546876, + "loss": 0.449018669128418, + "rewards/chosen": 0.7993580818176269, + "rewards/margins": 0.4857671737670898, + "rewards/rejected": 0.31359090805053713, + "step": 3230 + }, + { + "epoch": 1.296, + "grad_norm": 0.5430236458778381, + "kl": 8.808231353759766, + "learning_rate": 1.956666666666667e-06, + "logits/chosen": 34977206.4, + "logits/rejected": 31934518.4, + "logps/chosen": -150.738330078125, + "logps/rejected": -182.1927490234375, + "loss": 0.45286874771118163, + "rewards/chosen": 0.8764358520507812, + "rewards/margins": 0.5628475189208983, + "rewards/rejected": 0.3135883331298828, + "step": 3240 + }, + { + "epoch": 1.3, + "grad_norm": 0.350079745054245, + "kl": 8.156396865844727, + "learning_rate": 1.9455555555555557e-06, + "logits/chosen": 38458761.6, + "logits/rejected": 36443888.0, + "logps/chosen": -153.3631103515625, + "logps/rejected": -182.10977783203126, + "loss": 0.4610450267791748, + "rewards/chosen": 0.7133102416992188, + "rewards/margins": 0.43579509258270266, + "rewards/rejected": 0.2775151491165161, + "step": 3250 + }, + { + "epoch": 1.304, + "grad_norm": 0.5987509489059448, + "kl": 4.856646537780762, + "learning_rate": 1.9344444444444445e-06, + "logits/chosen": 34302396.8, + "logits/rejected": 35170553.6, + "logps/chosen": -113.03687744140625, + "logps/rejected": -124.1705810546875, + "loss": 0.4647495269775391, + "rewards/chosen": 0.39636247158050536, + "rewards/margins": 0.2917425155639648, + "rewards/rejected": 0.10461995601654053, + "step": 3260 + }, + { + "epoch": 1.308, + "grad_norm": 0.6457140445709229, + "kl": 4.818373680114746, + "learning_rate": 1.9233333333333333e-06, + "logits/chosen": 43591721.6, + "logits/rejected": 44343180.8, + "logps/chosen": -137.19747314453124, + "logps/rejected": -153.1467529296875, + "loss": 0.42650656700134276, + "rewards/chosen": 0.6118185997009278, + "rewards/margins": 0.7147171497344971, + "rewards/rejected": -0.10289855003356933, + "step": 3270 + }, + { + "epoch": 1.312, + "grad_norm": 0.6024192571640015, + "kl": 4.331322193145752, + "learning_rate": 1.912222222222222e-06, + "logits/chosen": 40203308.8, + "logits/rejected": 41383084.8, + "logps/chosen": -156.63048095703124, + "logps/rejected": -144.2884033203125, + "loss": 0.4273221969604492, + "rewards/chosen": 0.6165127277374267, + "rewards/margins": 0.7079921424388885, + "rewards/rejected": -0.09147941470146179, + "step": 3280 + }, + { + "epoch": 1.316, + "grad_norm": 0.6954644918441772, + "kl": 4.1213908195495605, + "learning_rate": 1.9011111111111113e-06, + "logits/chosen": 28768950.4, + "logits/rejected": 24494809.6, + "logps/chosen": -143.09805908203126, + "logps/rejected": -139.8798095703125, + "loss": 0.4376859664916992, + "rewards/chosen": 0.23681788444519042, + "rewards/margins": 0.6746566295623779, + "rewards/rejected": -0.4378387451171875, + "step": 3290 + }, + { + "epoch": 1.32, + "grad_norm": 0.6745891571044922, + "kl": 4.750722408294678, + "learning_rate": 1.8900000000000001e-06, + "logits/chosen": 26675907.2, + "logits/rejected": 26277996.8, + "logps/chosen": -143.6562255859375, + "logps/rejected": -134.79100341796874, + "loss": 0.4433170795440674, + "rewards/chosen": 0.3683876276016235, + "rewards/margins": 0.5583112239837646, + "rewards/rejected": -0.18992359638214112, + "step": 3300 + }, + { + "epoch": 1.324, + "grad_norm": 0.5548813939094543, + "kl": 6.803833961486816, + "learning_rate": 1.878888888888889e-06, + "logits/chosen": 36996547.2, + "logits/rejected": 37710758.4, + "logps/chosen": -151.74556884765624, + "logps/rejected": -154.92069091796876, + "loss": 0.43219637870788574, + "rewards/chosen": 0.8092526435852051, + "rewards/margins": 0.6131466150283813, + "rewards/rejected": 0.19610602855682374, + "step": 3310 + }, + { + "epoch": 1.328, + "grad_norm": 0.5542740821838379, + "kl": 3.4922118186950684, + "learning_rate": 1.8677777777777777e-06, + "logits/chosen": 21493171.2, + "logits/rejected": 22701468.8, + "logps/chosen": -153.733984375, + "logps/rejected": -130.3265380859375, + "loss": 0.45499577522277834, + "rewards/chosen": 0.06440688967704773, + "rewards/margins": 0.4255514085292816, + "rewards/rejected": -0.36114451885223386, + "step": 3320 + }, + { + "epoch": 1.332, + "grad_norm": 0.5660212635993958, + "kl": 5.5110368728637695, + "learning_rate": 1.856666666666667e-06, + "logits/chosen": 25962280.0, + "logits/rejected": 25115568.0, + "logps/chosen": -157.3265625, + "logps/rejected": -127.7706298828125, + "loss": 0.45712642669677733, + "rewards/chosen": 0.25737948417663575, + "rewards/margins": 0.38217872381210327, + "rewards/rejected": -0.12479923963546753, + "step": 3330 + }, + { + "epoch": 1.336, + "grad_norm": 0.7863187193870544, + "kl": 4.013291835784912, + "learning_rate": 1.8455555555555558e-06, + "logits/chosen": 35347702.4, + "logits/rejected": 34501942.4, + "logps/chosen": -148.804345703125, + "logps/rejected": -153.26226806640625, + "loss": 0.4377838134765625, + "rewards/chosen": 0.25958924293518065, + "rewards/margins": 0.4975349426269531, + "rewards/rejected": -0.23794569969177246, + "step": 3340 + }, + { + "epoch": 1.34, + "grad_norm": 0.8637145757675171, + "kl": 6.854001522064209, + "learning_rate": 1.8344444444444446e-06, + "logits/chosen": 37165539.2, + "logits/rejected": 35276435.2, + "logps/chosen": -165.85059814453126, + "logps/rejected": -189.76441650390626, + "loss": 0.46552677154541017, + "rewards/chosen": 0.28020381927490234, + "rewards/margins": 0.23775566816329957, + "rewards/rejected": 0.042448151111602786, + "step": 3350 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.7173047661781311, + "kl": 6.264387130737305, + "learning_rate": 1.8233333333333334e-06, + "logits/chosen": 29046736.0, + "logits/rejected": 27349148.8, + "logps/chosen": -160.81956787109374, + "logps/rejected": -144.21968994140624, + "loss": 0.4449786186218262, + "rewards/chosen": 0.4654555320739746, + "rewards/margins": 0.5440494477748871, + "rewards/rejected": -0.07859391570091248, + "step": 3360 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 0.6456140279769897, + "kl": 3.3577468395233154, + "learning_rate": 1.8122222222222224e-06, + "logits/chosen": 32191424.0, + "logits/rejected": 29473795.2, + "logps/chosen": -138.01751708984375, + "logps/rejected": -148.85108642578126, + "loss": 0.4563105583190918, + "rewards/chosen": -0.22999329566955568, + "rewards/margins": 0.3565816402435303, + "rewards/rejected": -0.586574935913086, + "step": 3370 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.5196599960327148, + "kl": 4.913334369659424, + "learning_rate": 1.8011111111111112e-06, + "logits/chosen": 29135721.6, + "logits/rejected": 27177129.6, + "logps/chosen": -161.2333984375, + "logps/rejected": -136.623095703125, + "loss": 0.42957291603088377, + "rewards/chosen": 0.3112953186035156, + "rewards/margins": 0.6790343999862671, + "rewards/rejected": -0.36773908138275146, + "step": 3380 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 0.5491528511047363, + "kl": 2.574375867843628, + "learning_rate": 1.79e-06, + "logits/chosen": 39787881.6, + "logits/rejected": 37482313.6, + "logps/chosen": -132.85360107421874, + "logps/rejected": -138.88128662109375, + "loss": 0.4316267490386963, + "rewards/chosen": -0.03256496787071228, + "rewards/margins": 0.6733869135379791, + "rewards/rejected": -0.7059518814086914, + "step": 3390 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.5999844670295715, + "kl": 5.321146011352539, + "learning_rate": 1.7788888888888892e-06, + "logits/chosen": 34423936.0, + "logits/rejected": 35799561.6, + "logps/chosen": -169.89874267578125, + "logps/rejected": -162.8022705078125, + "loss": 0.4550165176391602, + "rewards/chosen": 0.13871285915374756, + "rewards/margins": 0.4647700548171997, + "rewards/rejected": -0.32605719566345215, + "step": 3400 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 3.4790494441986084, + "eval_logits/chosen": 29217603.584, + "eval_logits/rejected": 29389656.064, + "eval_logps/chosen": -156.61384375, + "eval_logps/rejected": -153.2220625, + "eval_loss": 0.47793951630592346, + "eval_rewards/chosen": -0.22880656433105467, + "eval_rewards/margins": 0.2291276092529297, + "eval_rewards/rejected": -0.45793417358398436, + "eval_runtime": 216.674, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 2.308, + "step": 3400 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 0.5353158712387085, + "kl": 3.89123272895813, + "learning_rate": 1.767777777777778e-06, + "logits/chosen": 30302416.0, + "logits/rejected": 30243564.8, + "logps/chosen": -163.517431640625, + "logps/rejected": -174.63206787109374, + "loss": 0.4478933334350586, + "rewards/chosen": 0.02733871340751648, + "rewards/margins": 0.6695918262004852, + "rewards/rejected": -0.6422531127929687, + "step": 3410 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.5308664441108704, + "kl": 3.7901718616485596, + "learning_rate": 1.7566666666666669e-06, + "logits/chosen": 30754256.0, + "logits/rejected": 27248163.2, + "logps/chosen": -130.69642333984376, + "logps/rejected": -149.67838134765626, + "loss": 0.454122257232666, + "rewards/chosen": 0.1454862356185913, + "rewards/margins": 0.4878753900527954, + "rewards/rejected": -0.3423891544342041, + "step": 3420 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 0.7273574471473694, + "kl": 4.090758323669434, + "learning_rate": 1.7455555555555557e-06, + "logits/chosen": 28009366.4, + "logits/rejected": 28281456.0, + "logps/chosen": -150.5, + "logps/rejected": -151.46649169921875, + "loss": 0.43383193016052246, + "rewards/chosen": 0.24967949390411376, + "rewards/margins": 0.7830450773239135, + "rewards/rejected": -0.5333655834197998, + "step": 3430 + }, + { + "epoch": 1.376, + "grad_norm": 0.3887825906276703, + "kl": 5.064081192016602, + "learning_rate": 1.7344444444444447e-06, + "logits/chosen": 27018854.4, + "logits/rejected": 24356123.2, + "logps/chosen": -130.98511962890626, + "logps/rejected": -151.60482177734374, + "loss": 0.43924894332885744, + "rewards/chosen": 0.3166049957275391, + "rewards/margins": 0.6815865993499757, + "rewards/rejected": -0.36498160362243653, + "step": 3440 + }, + { + "epoch": 1.38, + "grad_norm": 0.4775777757167816, + "kl": 7.2533087730407715, + "learning_rate": 1.7233333333333335e-06, + "logits/chosen": 38613673.6, + "logits/rejected": 41955670.4, + "logps/chosen": -176.4742919921875, + "logps/rejected": -157.8961181640625, + "loss": 0.4113172054290771, + "rewards/chosen": 0.7336381912231446, + "rewards/margins": 0.7889788269996644, + "rewards/rejected": -0.05534063577651978, + "step": 3450 + }, + { + "epoch": 1.384, + "grad_norm": 0.7163631916046143, + "kl": 5.157084941864014, + "learning_rate": 1.7122222222222223e-06, + "logits/chosen": 35435958.4, + "logits/rejected": 36708380.8, + "logps/chosen": -135.27044677734375, + "logps/rejected": -149.26287841796875, + "loss": 0.49365973472595215, + "rewards/chosen": 0.14373122453689574, + "rewards/margins": 0.07738589048385619, + "rewards/rejected": 0.06634533405303955, + "step": 3460 + }, + { + "epoch": 1.388, + "grad_norm": 0.6470320224761963, + "kl": 4.387451648712158, + "learning_rate": 1.7011111111111111e-06, + "logits/chosen": 29047792.0, + "logits/rejected": 28239174.4, + "logps/chosen": -131.78316650390624, + "logps/rejected": -120.0147216796875, + "loss": 0.4465984344482422, + "rewards/chosen": 0.3296776294708252, + "rewards/margins": 0.6029248237609863, + "rewards/rejected": -0.27324719429016114, + "step": 3470 + }, + { + "epoch": 1.392, + "grad_norm": 0.6309516429901123, + "kl": 7.3288164138793945, + "learning_rate": 1.6900000000000003e-06, + "logits/chosen": 36852099.2, + "logits/rejected": 35572294.4, + "logps/chosen": -133.08834228515624, + "logps/rejected": -156.20654296875, + "loss": 0.44939703941345216, + "rewards/chosen": 0.7472519397735595, + "rewards/margins": 0.5482589960098266, + "rewards/rejected": 0.19899294376373292, + "step": 3480 + }, + { + "epoch": 1.396, + "grad_norm": 0.7260765433311462, + "kl": 3.237916946411133, + "learning_rate": 1.6788888888888891e-06, + "logits/chosen": 30554035.2, + "logits/rejected": 28196780.8, + "logps/chosen": -123.18148193359374, + "logps/rejected": -137.39403076171874, + "loss": 0.43561625480651855, + "rewards/chosen": 0.26550750732421874, + "rewards/margins": 0.6841002464294433, + "rewards/rejected": -0.4185927391052246, + "step": 3490 + }, + { + "epoch": 1.4, + "grad_norm": 0.4278745949268341, + "kl": 5.290976047515869, + "learning_rate": 1.667777777777778e-06, + "logits/chosen": 29699056.0, + "logits/rejected": 30936454.4, + "logps/chosen": -153.85557861328124, + "logps/rejected": -122.64144287109374, + "loss": 0.4286449909210205, + "rewards/chosen": 0.49454541206359864, + "rewards/margins": 0.6557791233062744, + "rewards/rejected": -0.16123371124267577, + "step": 3500 + }, + { + "epoch": 1.404, + "grad_norm": 0.7629099488258362, + "kl": 6.994016170501709, + "learning_rate": 1.6566666666666668e-06, + "logits/chosen": 26804969.6, + "logits/rejected": 27539884.8, + "logps/chosen": -131.692041015625, + "logps/rejected": -185.0650146484375, + "loss": 0.4522398948669434, + "rewards/chosen": 0.5013983726501465, + "rewards/margins": 0.40837590694427495, + "rewards/rejected": 0.09302246570587158, + "step": 3510 + }, + { + "epoch": 1.408, + "grad_norm": 0.67551189661026, + "kl": 4.912568092346191, + "learning_rate": 1.6455555555555558e-06, + "logits/chosen": 27143923.2, + "logits/rejected": 26963395.2, + "logps/chosen": -149.304931640625, + "logps/rejected": -139.75704345703124, + "loss": 0.44557414054870603, + "rewards/chosen": 0.3085124731063843, + "rewards/margins": 0.6712681531906128, + "rewards/rejected": -0.36275568008422854, + "step": 3520 + }, + { + "epoch": 1.412, + "grad_norm": 0.7234175801277161, + "kl": 5.856083869934082, + "learning_rate": 1.6344444444444446e-06, + "logits/chosen": 33198009.6, + "logits/rejected": 32032864.0, + "logps/chosen": -150.87943115234376, + "logps/rejected": -153.37586669921876, + "loss": 0.4352092742919922, + "rewards/chosen": 0.3994121074676514, + "rewards/margins": 0.5765307188034058, + "rewards/rejected": -0.1771186113357544, + "step": 3530 + }, + { + "epoch": 1.416, + "grad_norm": 0.7202039361000061, + "kl": 5.610236167907715, + "learning_rate": 1.6233333333333334e-06, + "logits/chosen": 25017616.0, + "logits/rejected": 26410630.4, + "logps/chosen": -187.0184814453125, + "logps/rejected": -115.09561767578126, + "loss": 0.4591636657714844, + "rewards/chosen": 0.3089368104934692, + "rewards/margins": 0.34654129743576045, + "rewards/rejected": -0.03760448694229126, + "step": 3540 + }, + { + "epoch": 1.42, + "grad_norm": 0.7653972506523132, + "kl": 4.185455322265625, + "learning_rate": 1.6122222222222222e-06, + "logits/chosen": 27766281.6, + "logits/rejected": 24358944.0, + "logps/chosen": -125.4722412109375, + "logps/rejected": -162.280615234375, + "loss": 0.4453754901885986, + "rewards/chosen": 0.24888882637023926, + "rewards/margins": 0.640946888923645, + "rewards/rejected": -0.39205806255340575, + "step": 3550 + }, + { + "epoch": 1.424, + "grad_norm": 0.5244606137275696, + "kl": 5.084301948547363, + "learning_rate": 1.6011111111111114e-06, + "logits/chosen": 38255152.0, + "logits/rejected": 35147708.8, + "logps/chosen": -169.7523681640625, + "logps/rejected": -189.486328125, + "loss": 0.4723203659057617, + "rewards/chosen": 0.08335857987403869, + "rewards/margins": 0.32414146065711974, + "rewards/rejected": -0.24078288078308105, + "step": 3560 + }, + { + "epoch": 1.428, + "grad_norm": 0.7249192595481873, + "kl": 6.4514336585998535, + "learning_rate": 1.5900000000000002e-06, + "logits/chosen": 31364140.8, + "logits/rejected": 32550483.2, + "logps/chosen": -136.05145263671875, + "logps/rejected": -165.39798583984376, + "loss": 0.43005828857421874, + "rewards/chosen": 0.6245347499847412, + "rewards/margins": 0.6684352219104767, + "rewards/rejected": -0.04390047192573547, + "step": 3570 + }, + { + "epoch": 1.432, + "grad_norm": 0.5767175555229187, + "kl": 3.358916759490967, + "learning_rate": 1.578888888888889e-06, + "logits/chosen": 34861779.2, + "logits/rejected": 36117113.6, + "logps/chosen": -153.4178466796875, + "logps/rejected": -138.995556640625, + "loss": 0.46177167892456056, + "rewards/chosen": 0.06014393568038941, + "rewards/margins": 0.33000377416610716, + "rewards/rejected": -0.26985983848571776, + "step": 3580 + }, + { + "epoch": 1.436, + "grad_norm": 0.8270474076271057, + "kl": 3.5596280097961426, + "learning_rate": 1.5677777777777778e-06, + "logits/chosen": 24936995.2, + "logits/rejected": 25041536.0, + "logps/chosen": -142.94605712890626, + "logps/rejected": -115.97581787109375, + "loss": 0.46255645751953123, + "rewards/chosen": 0.09056978225708008, + "rewards/margins": 0.3586315393447876, + "rewards/rejected": -0.2680617570877075, + "step": 3590 + }, + { + "epoch": 1.44, + "grad_norm": 0.712232768535614, + "kl": 3.8768982887268066, + "learning_rate": 1.5566666666666669e-06, + "logits/chosen": 31608688.0, + "logits/rejected": 29185264.0, + "logps/chosen": -135.2075439453125, + "logps/rejected": -174.82825927734376, + "loss": 0.45167975425720214, + "rewards/chosen": 0.0786507248878479, + "rewards/margins": 0.565507709980011, + "rewards/rejected": -0.4868569850921631, + "step": 3600 + }, + { + "epoch": 1.44, + "eval_kl": 4.0966901779174805, + "eval_logits/chosen": 30547773.44, + "eval_logits/rejected": 30678024.192, + "eval_logps/chosen": -154.22709375, + "eval_logps/rejected": -150.8435625, + "eval_loss": 0.47759392857551575, + "eval_rewards/chosen": 0.009867694854736328, + "eval_rewards/margins": 0.22995056533813477, + "eval_rewards/rejected": -0.22008287048339845, + "eval_runtime": 216.6033, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 2.308, + "step": 3600 + }, + { + "epoch": 1.444, + "grad_norm": 0.6500243544578552, + "kl": 4.5812273025512695, + "learning_rate": 1.5455555555555557e-06, + "logits/chosen": 35394022.4, + "logits/rejected": 35018995.2, + "logps/chosen": -116.204833984375, + "logps/rejected": -184.157275390625, + "loss": 0.4943058490753174, + "rewards/chosen": -0.007693278789520264, + "rewards/margins": 0.12241411209106444, + "rewards/rejected": -0.1301073908805847, + "step": 3610 + }, + { + "epoch": 1.448, + "grad_norm": 0.6954056024551392, + "kl": 5.1914801597595215, + "learning_rate": 1.5344444444444445e-06, + "logits/chosen": 44866396.8, + "logits/rejected": 43301910.4, + "logps/chosen": -144.904541015625, + "logps/rejected": -164.3775390625, + "loss": 0.41394357681274413, + "rewards/chosen": 0.45650997161865237, + "rewards/margins": 0.8667933464050293, + "rewards/rejected": -0.41028337478637694, + "step": 3620 + }, + { + "epoch": 1.452, + "grad_norm": 0.6982813477516174, + "kl": 4.627970218658447, + "learning_rate": 1.5233333333333333e-06, + "logits/chosen": 33511337.6, + "logits/rejected": 33426937.6, + "logps/chosen": -176.5933837890625, + "logps/rejected": -148.85958251953124, + "loss": 0.43149843215942385, + "rewards/chosen": 0.27549741268157957, + "rewards/margins": 0.6807630777359008, + "rewards/rejected": -0.40526566505432127, + "step": 3630 + }, + { + "epoch": 1.456, + "grad_norm": 0.472672700881958, + "kl": 4.886686325073242, + "learning_rate": 1.5122222222222225e-06, + "logits/chosen": 21423132.8, + "logits/rejected": 20745464.0, + "logps/chosen": -100.18471069335938, + "logps/rejected": -142.314697265625, + "loss": 0.45476489067077636, + "rewards/chosen": 0.453489875793457, + "rewards/margins": 0.5046129763126374, + "rewards/rejected": -0.0511231005191803, + "step": 3640 + }, + { + "epoch": 1.46, + "grad_norm": 0.6913832426071167, + "kl": 4.06036901473999, + "learning_rate": 1.5011111111111113e-06, + "logits/chosen": 41536867.2, + "logits/rejected": 40642899.2, + "logps/chosen": -215.0681396484375, + "logps/rejected": -168.06806640625, + "loss": 0.4454173564910889, + "rewards/chosen": -0.10317556858062744, + "rewards/margins": 0.5138320684432983, + "rewards/rejected": -0.6170076370239258, + "step": 3650 + }, + { + "epoch": 1.464, + "grad_norm": 0.5362917184829712, + "kl": 3.2193520069122314, + "learning_rate": 1.4900000000000001e-06, + "logits/chosen": 25776776.0, + "logits/rejected": 22911784.0, + "logps/chosen": -130.6083984375, + "logps/rejected": -168.973876953125, + "loss": 0.41051359176635743, + "rewards/chosen": 0.19750649929046632, + "rewards/margins": 0.9899675607681275, + "rewards/rejected": -0.7924610614776612, + "step": 3660 + }, + { + "epoch": 1.468, + "grad_norm": 0.9869509935379028, + "kl": 4.485353946685791, + "learning_rate": 1.478888888888889e-06, + "logits/chosen": 27324668.8, + "logits/rejected": 27301977.6, + "logps/chosen": -132.47244873046876, + "logps/rejected": -162.38021240234374, + "loss": 0.45110092163085935, + "rewards/chosen": -0.08881351947784424, + "rewards/margins": 0.4330620527267456, + "rewards/rejected": -0.5218755722045898, + "step": 3670 + }, + { + "epoch": 1.472, + "grad_norm": 0.8374236822128296, + "kl": 5.5675249099731445, + "learning_rate": 1.467777777777778e-06, + "logits/chosen": 30133260.8, + "logits/rejected": 27502342.4, + "logps/chosen": -132.419970703125, + "logps/rejected": -164.09249267578124, + "loss": 0.4621445655822754, + "rewards/chosen": 0.29708335399627683, + "rewards/margins": 0.5296570777893066, + "rewards/rejected": -0.2325737237930298, + "step": 3680 + }, + { + "epoch": 1.476, + "grad_norm": 0.5651530623435974, + "kl": 3.6750998497009277, + "learning_rate": 1.4566666666666668e-06, + "logits/chosen": 28068777.6, + "logits/rejected": 24100668.8, + "logps/chosen": -176.51123046875, + "logps/rejected": -184.35858154296875, + "loss": 0.4131460666656494, + "rewards/chosen": 0.15955194234848022, + "rewards/margins": 1.0650185942649841, + "rewards/rejected": -0.9054666519165039, + "step": 3690 + }, + { + "epoch": 1.48, + "grad_norm": 0.6101991534233093, + "kl": 3.6094698905944824, + "learning_rate": 1.4455555555555556e-06, + "logits/chosen": 28100012.8, + "logits/rejected": 26111475.2, + "logps/chosen": -138.737109375, + "logps/rejected": -156.04112548828124, + "loss": 0.4490304470062256, + "rewards/chosen": 0.15077462196350097, + "rewards/margins": 0.5411154270172119, + "rewards/rejected": -0.39034080505371094, + "step": 3700 + }, + { + "epoch": 1.484, + "grad_norm": 0.8218708038330078, + "kl": 3.022378444671631, + "learning_rate": 1.4344444444444446e-06, + "logits/chosen": 18033281.6, + "logits/rejected": 19812489.6, + "logps/chosen": -134.19481201171874, + "logps/rejected": -143.6683837890625, + "loss": 0.4570739269256592, + "rewards/chosen": -0.3303727626800537, + "rewards/margins": 0.5034278392791749, + "rewards/rejected": -0.8338006019592286, + "step": 3710 + }, + { + "epoch": 1.488, + "grad_norm": 0.6318350434303284, + "kl": 3.771150588989258, + "learning_rate": 1.4233333333333336e-06, + "logits/chosen": 26958435.2, + "logits/rejected": 23352366.4, + "logps/chosen": -178.9782470703125, + "logps/rejected": -194.87041015625, + "loss": 0.4509871482849121, + "rewards/chosen": -0.14689927101135253, + "rewards/margins": 0.5733826160430908, + "rewards/rejected": -0.7202818870544434, + "step": 3720 + }, + { + "epoch": 1.492, + "grad_norm": 0.6234843730926514, + "kl": 5.08270263671875, + "learning_rate": 1.4122222222222224e-06, + "logits/chosen": 24577870.4, + "logits/rejected": 24378513.6, + "logps/chosen": -144.27496337890625, + "logps/rejected": -157.0753662109375, + "loss": 0.439809513092041, + "rewards/chosen": 0.2378466844558716, + "rewards/margins": 0.495815110206604, + "rewards/rejected": -0.2579684257507324, + "step": 3730 + }, + { + "epoch": 1.496, + "grad_norm": 0.6093852519989014, + "kl": 3.4414265155792236, + "learning_rate": 1.4011111111111112e-06, + "logits/chosen": 25166454.4, + "logits/rejected": 25530366.4, + "logps/chosen": -121.027001953125, + "logps/rejected": -122.510009765625, + "loss": 0.45766735076904297, + "rewards/chosen": 0.05086352825164795, + "rewards/margins": 0.45518562793731693, + "rewards/rejected": -0.40432209968566896, + "step": 3740 + }, + { + "epoch": 1.5, + "grad_norm": 0.6537352204322815, + "kl": 3.7078990936279297, + "learning_rate": 1.3900000000000002e-06, + "logits/chosen": 19157790.4, + "logits/rejected": 19428764.8, + "logps/chosen": -108.5373046875, + "logps/rejected": -131.371728515625, + "loss": 0.4350168228149414, + "rewards/chosen": 0.38070919513702395, + "rewards/margins": 0.6320278406143189, + "rewards/rejected": -0.25131864547729493, + "step": 3750 + }, + { + "epoch": 1.504, + "grad_norm": 0.7157226204872131, + "kl": 3.980473041534424, + "learning_rate": 1.378888888888889e-06, + "logits/chosen": 24798140.8, + "logits/rejected": 23861692.8, + "logps/chosen": -141.9797607421875, + "logps/rejected": -169.5621337890625, + "loss": 0.39097282886505125, + "rewards/chosen": 0.2861147403717041, + "rewards/margins": 1.1024574756622314, + "rewards/rejected": -0.8163427352905274, + "step": 3760 + }, + { + "epoch": 1.508, + "grad_norm": 0.5401111245155334, + "kl": 3.915759563446045, + "learning_rate": 1.3677777777777779e-06, + "logits/chosen": 27341155.2, + "logits/rejected": 23659843.2, + "logps/chosen": -167.150537109375, + "logps/rejected": -156.52796630859376, + "loss": 0.4424854278564453, + "rewards/chosen": 0.20697882175445556, + "rewards/margins": 0.6797100305557251, + "rewards/rejected": -0.4727312088012695, + "step": 3770 + }, + { + "epoch": 1.512, + "grad_norm": 0.5865006446838379, + "kl": 3.9945666790008545, + "learning_rate": 1.3566666666666667e-06, + "logits/chosen": 43287993.6, + "logits/rejected": 42994304.0, + "logps/chosen": -151.379150390625, + "logps/rejected": -170.74891357421876, + "loss": 0.46252665519714353, + "rewards/chosen": 0.28137707710266113, + "rewards/margins": 0.3768645763397217, + "rewards/rejected": -0.09548749923706054, + "step": 3780 + }, + { + "epoch": 1.516, + "grad_norm": 0.7108325362205505, + "kl": 5.5239386558532715, + "learning_rate": 1.3455555555555557e-06, + "logits/chosen": 25728556.8, + "logits/rejected": 25374808.0, + "logps/chosen": -126.28800048828126, + "logps/rejected": -141.21763916015624, + "loss": 0.4297455310821533, + "rewards/chosen": 0.6269711494445801, + "rewards/margins": 0.7478980660438538, + "rewards/rejected": -0.12092691659927368, + "step": 3790 + }, + { + "epoch": 1.52, + "grad_norm": 0.5811319947242737, + "kl": 5.248955726623535, + "learning_rate": 1.3344444444444447e-06, + "logits/chosen": 33503753.6, + "logits/rejected": 31553878.4, + "logps/chosen": -178.1581298828125, + "logps/rejected": -143.417431640625, + "loss": 0.45870108604431153, + "rewards/chosen": 0.4684587001800537, + "rewards/margins": 0.4509533554315567, + "rewards/rejected": 0.01750534474849701, + "step": 3800 + }, + { + "epoch": 1.52, + "eval_kl": 4.183420181274414, + "eval_logits/chosen": 30411423.744, + "eval_logits/rejected": 30603616.256, + "eval_logps/chosen": -154.112203125, + "eval_logps/rejected": -150.76103125, + "eval_loss": 0.4779178202152252, + "eval_rewards/chosen": 0.021359254837036133, + "eval_rewards/margins": 0.23318927192687988, + "eval_rewards/rejected": -0.21183001708984375, + "eval_runtime": 217.1598, + "eval_samples_per_second": 4.605, + "eval_steps_per_second": 2.302, + "step": 3800 + }, + { + "epoch": 1.524, + "grad_norm": 0.7062050700187683, + "kl": 4.107216835021973, + "learning_rate": 1.3233333333333335e-06, + "logits/chosen": 26509099.2, + "logits/rejected": 24838300.8, + "logps/chosen": -137.1124267578125, + "logps/rejected": -149.33499755859376, + "loss": 0.44512219429016114, + "rewards/chosen": 0.20196728706359862, + "rewards/margins": 0.5584580659866333, + "rewards/rejected": -0.35649077892303466, + "step": 3810 + }, + { + "epoch": 1.528, + "grad_norm": 0.43405744433403015, + "kl": 6.5067572593688965, + "learning_rate": 1.3122222222222223e-06, + "logits/chosen": 34846220.8, + "logits/rejected": 33152172.8, + "logps/chosen": -144.03416748046874, + "logps/rejected": -156.00250244140625, + "loss": 0.3955928564071655, + "rewards/chosen": 0.8109316825866699, + "rewards/margins": 1.0338047742843628, + "rewards/rejected": -0.22287309169769287, + "step": 3820 + }, + { + "epoch": 1.532, + "grad_norm": 0.48609739542007446, + "kl": 4.6954779624938965, + "learning_rate": 1.3011111111111113e-06, + "logits/chosen": 24615228.8, + "logits/rejected": 25253913.6, + "logps/chosen": -152.94261474609374, + "logps/rejected": -162.1392578125, + "loss": 0.44533653259277345, + "rewards/chosen": -0.09745782017707824, + "rewards/margins": 0.5474663436412811, + "rewards/rejected": -0.6449241638183594, + "step": 3830 + }, + { + "epoch": 1.536, + "grad_norm": 0.8033897280693054, + "kl": 4.541080951690674, + "learning_rate": 1.2900000000000001e-06, + "logits/chosen": 23824740.8, + "logits/rejected": 25749264.0, + "logps/chosen": -148.12535400390624, + "logps/rejected": -132.90029296875, + "loss": 0.4263105869293213, + "rewards/chosen": 0.34121017456054686, + "rewards/margins": 0.7844597339630126, + "rewards/rejected": -0.4432495594024658, + "step": 3840 + }, + { + "epoch": 1.54, + "grad_norm": 0.5979334115982056, + "kl": 3.1920642852783203, + "learning_rate": 1.278888888888889e-06, + "logits/chosen": 25940036.8, + "logits/rejected": 25332123.2, + "logps/chosen": -125.19688720703125, + "logps/rejected": -124.583349609375, + "loss": 0.42458858489990237, + "rewards/chosen": 0.11553690433502198, + "rewards/margins": 0.7247669935226441, + "rewards/rejected": -0.6092300891876221, + "step": 3850 + }, + { + "epoch": 1.544, + "grad_norm": 0.6010407209396362, + "kl": 7.0015411376953125, + "learning_rate": 1.2677777777777778e-06, + "logits/chosen": 28349667.2, + "logits/rejected": 29532544.0, + "logps/chosen": -160.0486083984375, + "logps/rejected": -148.159619140625, + "loss": 0.433948278427124, + "rewards/chosen": 0.6110920429229736, + "rewards/margins": 0.6498981416225433, + "rewards/rejected": -0.038806098699569705, + "step": 3860 + }, + { + "epoch": 1.548, + "grad_norm": 0.5501318573951721, + "kl": 5.648660659790039, + "learning_rate": 1.2566666666666668e-06, + "logits/chosen": 26864864.0, + "logits/rejected": 25404660.8, + "logps/chosen": -173.2337890625, + "logps/rejected": -160.03525390625, + "loss": 0.43099102973937986, + "rewards/chosen": 0.3765087604522705, + "rewards/margins": 0.7873351097106933, + "rewards/rejected": -0.4108263492584229, + "step": 3870 + }, + { + "epoch": 1.552, + "grad_norm": 0.5478479862213135, + "kl": 3.376481294631958, + "learning_rate": 1.2455555555555556e-06, + "logits/chosen": 32903283.2, + "logits/rejected": 32423433.6, + "logps/chosen": -146.17325439453126, + "logps/rejected": -139.81845703125, + "loss": 0.4483139991760254, + "rewards/chosen": 0.12561094760894775, + "rewards/margins": 0.3918390512466431, + "rewards/rejected": -0.26622810363769533, + "step": 3880 + }, + { + "epoch": 1.556, + "grad_norm": 0.6676125526428223, + "kl": 4.547513484954834, + "learning_rate": 1.2344444444444446e-06, + "logits/chosen": 37385532.8, + "logits/rejected": 38763891.2, + "logps/chosen": -155.23748779296875, + "logps/rejected": -168.7784912109375, + "loss": 0.4901449203491211, + "rewards/chosen": 0.16143620014190674, + "rewards/margins": 0.11462950706481934, + "rewards/rejected": 0.0468066930770874, + "step": 3890 + }, + { + "epoch": 1.56, + "grad_norm": 0.7040978670120239, + "kl": 5.753296852111816, + "learning_rate": 1.2233333333333334e-06, + "logits/chosen": 34341609.6, + "logits/rejected": 33446153.6, + "logps/chosen": -145.878369140625, + "logps/rejected": -180.95491943359374, + "loss": 0.4818913459777832, + "rewards/chosen": 0.37144837379455564, + "rewards/margins": 0.2124497532844543, + "rewards/rejected": 0.15899862051010133, + "step": 3900 + }, + { + "epoch": 1.564, + "grad_norm": 0.48866015672683716, + "kl": 5.357041358947754, + "learning_rate": 1.2122222222222222e-06, + "logits/chosen": 27682144.0, + "logits/rejected": 28275977.6, + "logps/chosen": -150.2932861328125, + "logps/rejected": -144.590478515625, + "loss": 0.4552725315093994, + "rewards/chosen": 0.34428427219390867, + "rewards/margins": 0.4334153890609741, + "rewards/rejected": -0.08913111686706543, + "step": 3910 + }, + { + "epoch": 1.568, + "grad_norm": 0.536604642868042, + "kl": 4.181241512298584, + "learning_rate": 1.2011111111111112e-06, + "logits/chosen": 33525574.4, + "logits/rejected": 32405542.4, + "logps/chosen": -137.73907470703125, + "logps/rejected": -178.59801025390624, + "loss": 0.4515504837036133, + "rewards/chosen": 0.20468955039978026, + "rewards/margins": 0.46134023666381835, + "rewards/rejected": -0.2566506862640381, + "step": 3920 + }, + { + "epoch": 1.572, + "grad_norm": 0.7674170136451721, + "kl": 4.251042366027832, + "learning_rate": 1.19e-06, + "logits/chosen": 40275475.2, + "logits/rejected": 36563088.0, + "logps/chosen": -187.68133544921875, + "logps/rejected": -170.8183349609375, + "loss": 0.406461238861084, + "rewards/chosen": 0.5473237037658691, + "rewards/margins": 1.11146821975708, + "rewards/rejected": -0.564144515991211, + "step": 3930 + }, + { + "epoch": 1.576, + "grad_norm": 0.483766108751297, + "kl": 4.224446773529053, + "learning_rate": 1.178888888888889e-06, + "logits/chosen": 25433030.4, + "logits/rejected": 25341385.6, + "logps/chosen": -117.7307861328125, + "logps/rejected": -139.6504150390625, + "loss": 0.42197356224060056, + "rewards/chosen": 0.24890828132629395, + "rewards/margins": 0.7226192951202393, + "rewards/rejected": -0.4737110137939453, + "step": 3940 + }, + { + "epoch": 1.58, + "grad_norm": 0.5180490016937256, + "kl": 3.8868117332458496, + "learning_rate": 1.1677777777777779e-06, + "logits/chosen": 33638163.2, + "logits/rejected": 34016854.4, + "logps/chosen": -129.69632568359376, + "logps/rejected": -143.5869384765625, + "loss": 0.43038105964660645, + "rewards/chosen": 0.3097927808761597, + "rewards/margins": 0.6986382722854614, + "rewards/rejected": -0.38884549140930175, + "step": 3950 + }, + { + "epoch": 1.584, + "grad_norm": 0.5631889700889587, + "kl": 3.5186545848846436, + "learning_rate": 1.1566666666666667e-06, + "logits/chosen": 30367190.4, + "logits/rejected": 29603392.0, + "logps/chosen": -153.10296630859375, + "logps/rejected": -163.72001953125, + "loss": 0.4525346279144287, + "rewards/chosen": -0.12654991149902345, + "rewards/margins": 0.4131012439727783, + "rewards/rejected": -0.5396511554718018, + "step": 3960 + }, + { + "epoch": 1.588, + "grad_norm": 0.5241718292236328, + "kl": 4.312124729156494, + "learning_rate": 1.1455555555555557e-06, + "logits/chosen": 24180620.8, + "logits/rejected": 26728572.8, + "logps/chosen": -140.080517578125, + "logps/rejected": -115.592919921875, + "loss": 0.4370166301727295, + "rewards/chosen": 0.2027698278427124, + "rewards/margins": 0.536671781539917, + "rewards/rejected": -0.3339019536972046, + "step": 3970 + }, + { + "epoch": 1.592, + "grad_norm": 0.5848884582519531, + "kl": 4.956355571746826, + "learning_rate": 1.1344444444444445e-06, + "logits/chosen": 29427676.8, + "logits/rejected": 25774260.8, + "logps/chosen": -147.395361328125, + "logps/rejected": -165.61865234375, + "loss": 0.4145470142364502, + "rewards/chosen": 0.4822521686553955, + "rewards/margins": 0.6998722791671753, + "rewards/rejected": -0.2176201105117798, + "step": 3980 + }, + { + "epoch": 1.596, + "grad_norm": 0.7140029668807983, + "kl": 5.854944705963135, + "learning_rate": 1.1233333333333333e-06, + "logits/chosen": 39306499.2, + "logits/rejected": 34811254.4, + "logps/chosen": -137.74361572265624, + "logps/rejected": -148.427978515625, + "loss": 0.41861691474914553, + "rewards/chosen": 0.6944310188293457, + "rewards/margins": 1.1321285247802733, + "rewards/rejected": -0.4376975059509277, + "step": 3990 + }, + { + "epoch": 1.6, + "grad_norm": 0.7309412360191345, + "kl": 3.7196297645568848, + "learning_rate": 1.1122222222222223e-06, + "logits/chosen": 29282588.8, + "logits/rejected": 28362857.6, + "logps/chosen": -168.0974365234375, + "logps/rejected": -160.11337890625, + "loss": 0.4827260971069336, + "rewards/chosen": -0.2844557285308838, + "rewards/margins": -0.075036096572876, + "rewards/rejected": -0.2094196319580078, + "step": 4000 + }, + { + "epoch": 1.6, + "eval_kl": 4.738241195678711, + "eval_logits/chosen": 29865963.52, + "eval_logits/rejected": 30057035.776, + "eval_logps/chosen": -153.191, + "eval_logps/rejected": -149.831890625, + "eval_loss": 0.4781652092933655, + "eval_rewards/chosen": 0.1134777603149414, + "eval_rewards/margins": 0.23239292907714842, + "eval_rewards/rejected": -0.11891516876220704, + "eval_runtime": 216.5956, + "eval_samples_per_second": 4.617, + "eval_steps_per_second": 2.308, + "step": 4000 + }, + { + "epoch": 1.604, + "grad_norm": 0.6143787503242493, + "kl": 5.21004581451416, + "learning_rate": 1.1011111111111113e-06, + "logits/chosen": 31273155.2, + "logits/rejected": 31765638.4, + "logps/chosen": -172.1530029296875, + "logps/rejected": -172.15975341796874, + "loss": 0.4306319713592529, + "rewards/chosen": 0.42154908180236816, + "rewards/margins": 0.6504992485046387, + "rewards/rejected": -0.2289501667022705, + "step": 4010 + }, + { + "epoch": 1.608, + "grad_norm": 0.3771494925022125, + "kl": 4.662692070007324, + "learning_rate": 1.0900000000000002e-06, + "logits/chosen": 33774937.6, + "logits/rejected": 34804582.4, + "logps/chosen": -162.08702392578124, + "logps/rejected": -170.07498779296876, + "loss": 0.4483354568481445, + "rewards/chosen": -0.026274442672729492, + "rewards/margins": 0.4997582912445069, + "rewards/rejected": -0.5260327339172364, + "step": 4020 + }, + { + "epoch": 1.612, + "grad_norm": 0.8712416291236877, + "kl": 4.398558616638184, + "learning_rate": 1.078888888888889e-06, + "logits/chosen": 23578403.2, + "logits/rejected": 21598651.2, + "logps/chosen": -131.9225830078125, + "logps/rejected": -196.6409423828125, + "loss": 0.441709041595459, + "rewards/chosen": 0.29367847442626954, + "rewards/margins": 0.6416111469268799, + "rewards/rejected": -0.34793267250061033, + "step": 4030 + }, + { + "epoch": 1.616, + "grad_norm": 0.7502478957176208, + "kl": 4.180516242980957, + "learning_rate": 1.0677777777777778e-06, + "logits/chosen": 24851664.0, + "logits/rejected": 24331059.2, + "logps/chosen": -134.76131591796874, + "logps/rejected": -146.1375, + "loss": 0.42934479713439944, + "rewards/chosen": 0.15300320386886596, + "rewards/margins": 0.7616443037986755, + "rewards/rejected": -0.6086410999298095, + "step": 4040 + }, + { + "epoch": 1.62, + "grad_norm": 0.6729795932769775, + "kl": 3.5344510078430176, + "learning_rate": 1.0566666666666668e-06, + "logits/chosen": 30835174.4, + "logits/rejected": 30916979.2, + "logps/chosen": -167.9162841796875, + "logps/rejected": -148.01121826171874, + "loss": 0.40194120407104494, + "rewards/chosen": 0.18118813037872314, + "rewards/margins": 1.300507092475891, + "rewards/rejected": -1.119318962097168, + "step": 4050 + }, + { + "epoch": 1.624, + "grad_norm": 0.8483315110206604, + "kl": 4.144981384277344, + "learning_rate": 1.0455555555555556e-06, + "logits/chosen": 23790352.0, + "logits/rejected": 21096166.4, + "logps/chosen": -167.572314453125, + "logps/rejected": -160.46065673828124, + "loss": 0.4239004135131836, + "rewards/chosen": 0.40131430625915526, + "rewards/margins": 0.9644507408142089, + "rewards/rejected": -0.5631364345550537, + "step": 4060 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 0.7283557653427124, + "kl": 5.704648971557617, + "learning_rate": 1.0344444444444446e-06, + "logits/chosen": 34577296.0, + "logits/rejected": 28464720.0, + "logps/chosen": -131.04425048828125, + "logps/rejected": -170.54326171875, + "loss": 0.3883501052856445, + "rewards/chosen": 0.8470425605773926, + "rewards/margins": 1.1756995677948, + "rewards/rejected": -0.3286570072174072, + "step": 4070 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.7542695999145508, + "kl": 5.014037132263184, + "learning_rate": 1.0233333333333334e-06, + "logits/chosen": 33581536.0, + "logits/rejected": 31188915.2, + "logps/chosen": -200.83006591796874, + "logps/rejected": -201.557470703125, + "loss": 0.4472477912902832, + "rewards/chosen": 0.1391082763671875, + "rewards/margins": 0.5890022277832031, + "rewards/rejected": -0.4498939514160156, + "step": 4080 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 0.699129045009613, + "kl": 6.782160758972168, + "learning_rate": 1.0122222222222224e-06, + "logits/chosen": 33043475.2, + "logits/rejected": 32764982.4, + "logps/chosen": -178.3090087890625, + "logps/rejected": -168.98370361328125, + "loss": 0.42490806579589846, + "rewards/chosen": 0.7757717132568359, + "rewards/margins": 0.9651144385337829, + "rewards/rejected": -0.189342725276947, + "step": 4090 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.7144243717193604, + "kl": 3.5951950550079346, + "learning_rate": 1.0011111111111112e-06, + "logits/chosen": 24177262.4, + "logits/rejected": 20001184.0, + "logps/chosen": -142.5383056640625, + "logps/rejected": -174.16627197265626, + "loss": 0.4106290817260742, + "rewards/chosen": 0.029287612438201903, + "rewards/margins": 1.2375101923942566, + "rewards/rejected": -1.2082225799560546, + "step": 4100 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 0.7295018434524536, + "kl": 3.5175278186798096, + "learning_rate": 9.9e-07, + "logits/chosen": 37862601.6, + "logits/rejected": 34241705.6, + "logps/chosen": -180.07008056640626, + "logps/rejected": -165.27158203125, + "loss": 0.41535110473632814, + "rewards/chosen": 0.3609702348709106, + "rewards/margins": 0.8377941370010376, + "rewards/rejected": -0.47682390213012693, + "step": 4110 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.687157154083252, + "kl": 4.200056076049805, + "learning_rate": 9.788888888888889e-07, + "logits/chosen": 30389494.4, + "logits/rejected": 33020348.8, + "logps/chosen": -205.723876953125, + "logps/rejected": -189.8884765625, + "loss": 0.4510765075683594, + "rewards/chosen": -0.2252514600753784, + "rewards/margins": 0.257360577583313, + "rewards/rejected": -0.48261203765869143, + "step": 4120 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 0.6598585844039917, + "kl": 5.19122838973999, + "learning_rate": 9.677777777777779e-07, + "logits/chosen": 25100822.4, + "logits/rejected": 22434776.0, + "logps/chosen": -139.93997802734376, + "logps/rejected": -148.89000244140624, + "loss": 0.42380781173706056, + "rewards/chosen": 0.25194945335388186, + "rewards/margins": 0.616188907623291, + "rewards/rejected": -0.3642394542694092, + "step": 4130 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.596368134021759, + "kl": 4.2246832847595215, + "learning_rate": 9.566666666666667e-07, + "logits/chosen": 23562168.0, + "logits/rejected": 19095112.0, + "logps/chosen": -167.05655517578126, + "logps/rejected": -163.794482421875, + "loss": 0.42250747680664064, + "rewards/chosen": 0.26157207489013673, + "rewards/margins": 0.9496047973632813, + "rewards/rejected": -0.6880327224731445, + "step": 4140 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.9235984683036804, + "kl": 4.401445388793945, + "learning_rate": 9.455555555555557e-07, + "logits/chosen": 23643918.4, + "logits/rejected": 26274195.2, + "logps/chosen": -147.10347900390624, + "logps/rejected": -122.74593505859374, + "loss": 0.4525291919708252, + "rewards/chosen": 0.2129079818725586, + "rewards/margins": 0.4618348360061646, + "rewards/rejected": -0.24892685413360596, + "step": 4150 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.5369182825088501, + "kl": 4.616759777069092, + "learning_rate": 9.344444444444445e-07, + "logits/chosen": 22211737.6, + "logits/rejected": 20181283.2, + "logps/chosen": -121.6099365234375, + "logps/rejected": -147.121728515625, + "loss": 0.4210421085357666, + "rewards/chosen": 0.3621690273284912, + "rewards/margins": 0.832556676864624, + "rewards/rejected": -0.4703876495361328, + "step": 4160 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 0.7842811942100525, + "kl": 3.4067413806915283, + "learning_rate": 9.233333333333334e-07, + "logits/chosen": 27008988.8, + "logits/rejected": 25419982.4, + "logps/chosen": -170.2637939453125, + "logps/rejected": -197.46827392578126, + "loss": 0.4370081424713135, + "rewards/chosen": -0.2569821834564209, + "rewards/margins": 0.6670010089874268, + "rewards/rejected": -0.9239831924438476, + "step": 4170 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.9903603196144104, + "kl": 2.7231457233428955, + "learning_rate": 9.122222222222222e-07, + "logits/chosen": 24366032.0, + "logits/rejected": 23620707.2, + "logps/chosen": -151.88421630859375, + "logps/rejected": -164.09239501953124, + "loss": 0.42814011573791505, + "rewards/chosen": 0.023269623517990112, + "rewards/margins": 0.6117547690868378, + "rewards/rejected": -0.5884851455688477, + "step": 4180 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 0.8565585613250732, + "kl": 3.8181614875793457, + "learning_rate": 9.011111111111112e-07, + "logits/chosen": 30655971.2, + "logits/rejected": 30687513.6, + "logps/chosen": -158.52818603515624, + "logps/rejected": -156.8181884765625, + "loss": 0.4161073684692383, + "rewards/chosen": 0.05024971961975098, + "rewards/margins": 0.9918828487396241, + "rewards/rejected": -0.9416331291198731, + "step": 4190 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.6923061609268188, + "kl": 3.7691681385040283, + "learning_rate": 8.900000000000001e-07, + "logits/chosen": 29314112.0, + "logits/rejected": 29080848.0, + "logps/chosen": -143.7644775390625, + "logps/rejected": -143.195703125, + "loss": 0.42991132736206056, + "rewards/chosen": 0.14495362043380738, + "rewards/margins": 0.5658432364463806, + "rewards/rejected": -0.42088961601257324, + "step": 4200 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 4.478702545166016, + "eval_logits/chosen": 27134529.536, + "eval_logits/rejected": 27462119.424, + "eval_logps/chosen": -154.735171875, + "eval_logps/rejected": -151.346375, + "eval_loss": 0.4791446030139923, + "eval_rewards/chosen": -0.040938720703125, + "eval_rewards/margins": 0.22942645263671874, + "eval_rewards/rejected": -0.2703651733398437, + "eval_runtime": 216.6724, + "eval_samples_per_second": 4.615, + "eval_steps_per_second": 2.308, + "step": 4200 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 0.8431246876716614, + "kl": 3.3740649223327637, + "learning_rate": 8.78888888888889e-07, + "logits/chosen": 24580116.8, + "logits/rejected": 23560889.6, + "logps/chosen": -130.23404541015626, + "logps/rejected": -173.274755859375, + "loss": 0.45294036865234377, + "rewards/chosen": -0.21662135124206544, + "rewards/margins": 0.6990167140960692, + "rewards/rejected": -0.9156380653381347, + "step": 4210 + }, + { + "epoch": 1.688, + "grad_norm": 1.0964020490646362, + "kl": 3.4218056201934814, + "learning_rate": 8.677777777777778e-07, + "logits/chosen": 25205835.2, + "logits/rejected": 24163204.8, + "logps/chosen": -144.6944580078125, + "logps/rejected": -165.13668212890624, + "loss": 0.41784143447875977, + "rewards/chosen": 0.14861660003662108, + "rewards/margins": 0.8323590278625488, + "rewards/rejected": -0.6837424278259278, + "step": 4220 + }, + { + "epoch": 1.692, + "grad_norm": 0.8997116088867188, + "kl": 5.5824151039123535, + "learning_rate": 8.566666666666668e-07, + "logits/chosen": 23316640.0, + "logits/rejected": 23185657.6, + "logps/chosen": -150.42138671875, + "logps/rejected": -176.9662109375, + "loss": 0.4408450126647949, + "rewards/chosen": 0.33873915672302246, + "rewards/margins": 0.5633441925048828, + "rewards/rejected": -0.22460503578186036, + "step": 4230 + }, + { + "epoch": 1.696, + "grad_norm": 0.9084079265594482, + "kl": 3.3602194786071777, + "learning_rate": 8.455555555555556e-07, + "logits/chosen": 20662596.8, + "logits/rejected": 21372936.0, + "logps/chosen": -158.28271484375, + "logps/rejected": -157.30823974609376, + "loss": 0.4337437152862549, + "rewards/chosen": -0.27880520820617677, + "rewards/margins": 0.39356427192687987, + "rewards/rejected": -0.6723694801330566, + "step": 4240 + }, + { + "epoch": 1.7, + "grad_norm": 0.822826623916626, + "kl": 6.255753993988037, + "learning_rate": 8.344444444444445e-07, + "logits/chosen": 28279952.0, + "logits/rejected": 30563142.4, + "logps/chosen": -207.4703369140625, + "logps/rejected": -156.87734375, + "loss": 0.401468563079834, + "rewards/chosen": 0.3442718505859375, + "rewards/margins": 0.6474948883056642, + "rewards/rejected": -0.3032230377197266, + "step": 4250 + }, + { + "epoch": 1.704, + "grad_norm": 0.7218330502510071, + "kl": 3.7593655586242676, + "learning_rate": 8.233333333333333e-07, + "logits/chosen": 21211801.6, + "logits/rejected": 25582251.2, + "logps/chosen": -181.3871337890625, + "logps/rejected": -158.4944580078125, + "loss": 0.4855056285858154, + "rewards/chosen": -0.661691427230835, + "rewards/margins": -0.19646124839782714, + "rewards/rejected": -0.4652301788330078, + "step": 4260 + }, + { + "epoch": 1.708, + "grad_norm": 0.5836480259895325, + "kl": 4.7980637550354, + "learning_rate": 8.122222222222223e-07, + "logits/chosen": 25395673.6, + "logits/rejected": 26470857.6, + "logps/chosen": -129.83922119140624, + "logps/rejected": -120.219775390625, + "loss": 0.4517657279968262, + "rewards/chosen": 0.2096014976501465, + "rewards/margins": 0.4321582317352295, + "rewards/rejected": -0.222556734085083, + "step": 4270 + }, + { + "epoch": 1.712, + "grad_norm": 0.6632907390594482, + "kl": 4.515078544616699, + "learning_rate": 8.011111111111111e-07, + "logits/chosen": 28016816.0, + "logits/rejected": 29217836.8, + "logps/chosen": -161.2439697265625, + "logps/rejected": -146.48646240234376, + "loss": 0.4749518871307373, + "rewards/chosen": -0.10194592475891114, + "rewards/margins": 0.19709014892578125, + "rewards/rejected": -0.29903607368469237, + "step": 4280 + }, + { + "epoch": 1.716, + "grad_norm": 0.9374505877494812, + "kl": 4.215886116027832, + "learning_rate": 7.900000000000001e-07, + "logits/chosen": 25599212.8, + "logits/rejected": 27952102.4, + "logps/chosen": -146.201220703125, + "logps/rejected": -121.0344482421875, + "loss": 0.4816310882568359, + "rewards/chosen": -0.015983200073242186, + "rewards/margins": 0.10800590515136718, + "rewards/rejected": -0.12398910522460938, + "step": 4290 + }, + { + "epoch": 1.72, + "grad_norm": 0.9015432596206665, + "kl": 3.52048921585083, + "learning_rate": 7.788888888888889e-07, + "logits/chosen": 19586571.2, + "logits/rejected": 17563038.4, + "logps/chosen": -147.8046142578125, + "logps/rejected": -163.8701416015625, + "loss": 0.4111928939819336, + "rewards/chosen": 0.09343934059143066, + "rewards/margins": 1.163926935195923, + "rewards/rejected": -1.0704875946044923, + "step": 4300 + }, + { + "epoch": 1.724, + "grad_norm": 0.6239180564880371, + "kl": 1.9489761590957642, + "learning_rate": 7.677777777777779e-07, + "logits/chosen": 29297952.0, + "logits/rejected": 25391588.8, + "logps/chosen": -142.50740966796874, + "logps/rejected": -180.68870849609374, + "loss": 0.43747854232788086, + "rewards/chosen": -0.7470763683319092, + "rewards/margins": 0.7179863452911378, + "rewards/rejected": -1.465062713623047, + "step": 4310 + }, + { + "epoch": 1.728, + "grad_norm": 0.6577679514884949, + "kl": 4.765759468078613, + "learning_rate": 7.566666666666667e-07, + "logits/chosen": 24857828.8, + "logits/rejected": 23441436.8, + "logps/chosen": -146.1468017578125, + "logps/rejected": -153.498046875, + "loss": 0.44129347801208496, + "rewards/chosen": -0.11145193576812744, + "rewards/margins": 0.5271980524063111, + "rewards/rejected": -0.6386499881744385, + "step": 4320 + }, + { + "epoch": 1.732, + "grad_norm": 0.6183480024337769, + "kl": 4.645040988922119, + "learning_rate": 7.455555555555556e-07, + "logits/chosen": 22485094.4, + "logits/rejected": 24854936.0, + "logps/chosen": -170.60594482421874, + "logps/rejected": -139.26231689453124, + "loss": 0.4434357166290283, + "rewards/chosen": -0.2083209276199341, + "rewards/margins": 0.2575597524642944, + "rewards/rejected": -0.4658806800842285, + "step": 4330 + }, + { + "epoch": 1.736, + "grad_norm": 0.727397084236145, + "kl": 3.468677520751953, + "learning_rate": 7.344444444444445e-07, + "logits/chosen": 19499200.0, + "logits/rejected": 15777052.8, + "logps/chosen": -133.76549072265624, + "logps/rejected": -177.62255859375, + "loss": 0.39086987972259524, + "rewards/chosen": 0.10505068302154541, + "rewards/margins": 1.318554902076721, + "rewards/rejected": -1.2135042190551757, + "step": 4340 + }, + { + "epoch": 1.74, + "grad_norm": 0.5569754242897034, + "kl": 3.695270538330078, + "learning_rate": 7.233333333333334e-07, + "logits/chosen": 21616838.4, + "logits/rejected": 25698086.4, + "logps/chosen": -166.606103515625, + "logps/rejected": -136.80845947265624, + "loss": 0.4519169807434082, + "rewards/chosen": -0.5317587852478027, + "rewards/margins": 0.12961096763610835, + "rewards/rejected": -0.6613697528839111, + "step": 4350 + }, + { + "epoch": 1.744, + "grad_norm": 0.5092763900756836, + "kl": 5.227725028991699, + "learning_rate": 7.122222222222223e-07, + "logits/chosen": 18400382.4, + "logits/rejected": 17152075.2, + "logps/chosen": -152.55262451171876, + "logps/rejected": -161.2753173828125, + "loss": 0.42585110664367676, + "rewards/chosen": 0.08888615369796753, + "rewards/margins": 1.0932955622673035, + "rewards/rejected": -1.004409408569336, + "step": 4360 + }, + { + "epoch": 1.748, + "grad_norm": 0.6089858412742615, + "kl": 6.389164924621582, + "learning_rate": 7.011111111111112e-07, + "logits/chosen": 23212915.2, + "logits/rejected": 23174060.8, + "logps/chosen": -148.427197265625, + "logps/rejected": -138.902783203125, + "loss": 0.43767833709716797, + "rewards/chosen": 0.34542050361633303, + "rewards/margins": 0.6011173248291015, + "rewards/rejected": -0.25569682121276854, + "step": 4370 + }, + { + "epoch": 1.752, + "grad_norm": 0.7319818735122681, + "kl": 3.631608486175537, + "learning_rate": 6.900000000000001e-07, + "logits/chosen": 24275633.6, + "logits/rejected": 25451712.0, + "logps/chosen": -130.7281982421875, + "logps/rejected": -169.6556884765625, + "loss": 0.4526735782623291, + "rewards/chosen": -0.1837414264678955, + "rewards/margins": 0.5917365074157716, + "rewards/rejected": -0.775477933883667, + "step": 4380 + }, + { + "epoch": 1.756, + "grad_norm": 0.7777149081230164, + "kl": 3.294254779815674, + "learning_rate": 6.78888888888889e-07, + "logits/chosen": 21825590.4, + "logits/rejected": 21820012.8, + "logps/chosen": -147.787158203125, + "logps/rejected": -144.9162841796875, + "loss": 0.43806142807006837, + "rewards/chosen": -0.11295137405395508, + "rewards/margins": 0.8744370460510255, + "rewards/rejected": -0.9873884201049805, + "step": 4390 + }, + { + "epoch": 1.76, + "grad_norm": 0.6188346147537231, + "kl": 1.1032154560089111, + "learning_rate": 6.677777777777779e-07, + "logits/chosen": 21570892.8, + "logits/rejected": 23144080.0, + "logps/chosen": -135.95819091796875, + "logps/rejected": -149.30087890625, + "loss": 0.44826564788818357, + "rewards/chosen": -0.7288064002990723, + "rewards/margins": 0.5491142272949218, + "rewards/rejected": -1.277920627593994, + "step": 4400 + }, + { + "epoch": 1.76, + "eval_kl": 3.972273826599121, + "eval_logits/chosen": 25631897.6, + "eval_logits/rejected": 26045097.984, + "eval_logps/chosen": -156.330625, + "eval_logps/rejected": -152.941625, + "eval_loss": 0.4793069362640381, + "eval_rewards/chosen": -0.2004847412109375, + "eval_rewards/margins": 0.22940472412109375, + "eval_rewards/rejected": -0.42988946533203126, + "eval_runtime": 216.7456, + "eval_samples_per_second": 4.614, + "eval_steps_per_second": 2.307, + "step": 4400 + }, + { + "epoch": 1.764, + "grad_norm": 0.8933963179588318, + "kl": 3.5940029621124268, + "learning_rate": 6.566666666666667e-07, + "logits/chosen": 23419489.6, + "logits/rejected": 19540764.8, + "logps/chosen": -159.04957275390626, + "logps/rejected": -170.299072265625, + "loss": 0.4445340633392334, + "rewards/chosen": -0.11221444606781006, + "rewards/margins": 0.6003352880477906, + "rewards/rejected": -0.7125497341156006, + "step": 4410 + }, + { + "epoch": 1.768, + "grad_norm": 1.043148159980774, + "kl": 4.92350959777832, + "learning_rate": 6.455555555555556e-07, + "logits/chosen": 23423225.6, + "logits/rejected": 20264896.0, + "logps/chosen": -134.481201171875, + "logps/rejected": -165.0137451171875, + "loss": 0.43820796012878416, + "rewards/chosen": 0.19546182155609132, + "rewards/margins": 0.7127113103866578, + "rewards/rejected": -0.5172494888305664, + "step": 4420 + }, + { + "epoch": 1.772, + "grad_norm": 0.6755536198616028, + "kl": 3.846719741821289, + "learning_rate": 6.344444444444445e-07, + "logits/chosen": 30165392.0, + "logits/rejected": 34583219.2, + "logps/chosen": -156.612158203125, + "logps/rejected": -142.9615234375, + "loss": 0.4490334510803223, + "rewards/chosen": -0.037669995427131654, + "rewards/margins": 0.416199442744255, + "rewards/rejected": -0.4538694381713867, + "step": 4430 + }, + { + "epoch": 1.776, + "grad_norm": 0.6009793877601624, + "kl": 3.6022000312805176, + "learning_rate": 6.233333333333333e-07, + "logits/chosen": 16789396.8, + "logits/rejected": 20022726.4, + "logps/chosen": -139.00106201171874, + "logps/rejected": -119.23135986328126, + "loss": 0.48907132148742677, + "rewards/chosen": -0.18124552965164184, + "rewards/margins": 0.10945202112197874, + "rewards/rejected": -0.2906975507736206, + "step": 4440 + }, + { + "epoch": 1.78, + "grad_norm": 0.5468002557754517, + "kl": 4.557890892028809, + "learning_rate": 6.122222222222222e-07, + "logits/chosen": 30898377.6, + "logits/rejected": 29239990.4, + "logps/chosen": -161.43404541015624, + "logps/rejected": -163.96580810546874, + "loss": 0.4502861499786377, + "rewards/chosen": 0.12291504144668579, + "rewards/margins": 0.5061401724815369, + "rewards/rejected": -0.38322513103485106, + "step": 4450 + }, + { + "epoch": 1.784, + "grad_norm": 0.5758063793182373, + "kl": 2.764960527420044, + "learning_rate": 6.011111111111112e-07, + "logits/chosen": 23224883.2, + "logits/rejected": 23461840.0, + "logps/chosen": -147.754541015625, + "logps/rejected": -139.2906494140625, + "loss": 0.44967427253723147, + "rewards/chosen": -0.2614432334899902, + "rewards/margins": 0.513807487487793, + "rewards/rejected": -0.7752507209777832, + "step": 4460 + }, + { + "epoch": 1.788, + "grad_norm": 0.7826879620552063, + "kl": 4.313460350036621, + "learning_rate": 5.900000000000001e-07, + "logits/chosen": 28475094.4, + "logits/rejected": 29499904.0, + "logps/chosen": -149.60347900390624, + "logps/rejected": -139.57032470703126, + "loss": 0.4358978748321533, + "rewards/chosen": 0.18888943195343016, + "rewards/margins": 0.7328470468521118, + "rewards/rejected": -0.5439576148986817, + "step": 4470 + }, + { + "epoch": 1.792, + "grad_norm": 0.7026771306991577, + "kl": 1.8280513286590576, + "learning_rate": 5.788888888888889e-07, + "logits/chosen": 15314908.8, + "logits/rejected": 14658353.6, + "logps/chosen": -146.75345458984376, + "logps/rejected": -157.1791259765625, + "loss": 0.4324824810028076, + "rewards/chosen": -0.4457141399383545, + "rewards/margins": 0.7849259853363036, + "rewards/rejected": -1.2306401252746582, + "step": 4480 + }, + { + "epoch": 1.796, + "grad_norm": 0.7853025197982788, + "kl": 5.189270496368408, + "learning_rate": 5.677777777777779e-07, + "logits/chosen": 26974720.0, + "logits/rejected": 25859792.0, + "logps/chosen": -164.4057861328125, + "logps/rejected": -164.3528076171875, + "loss": 0.41428799629211427, + "rewards/chosen": 0.3338757514953613, + "rewards/margins": 0.8068063259124756, + "rewards/rejected": -0.4729305744171143, + "step": 4490 + }, + { + "epoch": 1.8, + "grad_norm": 0.7289919853210449, + "kl": 3.4229626655578613, + "learning_rate": 5.566666666666667e-07, + "logits/chosen": 28305580.8, + "logits/rejected": 27415660.8, + "logps/chosen": -127.7979248046875, + "logps/rejected": -144.92559814453125, + "loss": 0.4419555187225342, + "rewards/chosen": -0.07625447511672974, + "rewards/margins": 0.5168057322502135, + "rewards/rejected": -0.5930602073669433, + "step": 4500 + }, + { + "epoch": 1.804, + "grad_norm": 0.7820873856544495, + "kl": 5.29005241394043, + "learning_rate": 5.455555555555556e-07, + "logits/chosen": 30357193.6, + "logits/rejected": 28641846.4, + "logps/chosen": -164.97662353515625, + "logps/rejected": -146.59793701171876, + "loss": 0.43038201332092285, + "rewards/chosen": 0.35901241302490233, + "rewards/margins": 0.7769660949707031, + "rewards/rejected": -0.4179536819458008, + "step": 4510 + }, + { + "epoch": 1.808, + "grad_norm": 0.8984478116035461, + "kl": 3.696812868118286, + "learning_rate": 5.344444444444445e-07, + "logits/chosen": 21213681.6, + "logits/rejected": 20043033.6, + "logps/chosen": -183.95279541015626, + "logps/rejected": -168.1439208984375, + "loss": 0.4274559020996094, + "rewards/chosen": -0.015572810173034668, + "rewards/margins": 0.6375526189804077, + "rewards/rejected": -0.6531254291534424, + "step": 4520 + }, + { + "epoch": 1.812, + "grad_norm": 0.5371900200843811, + "kl": 2.156186819076538, + "learning_rate": 5.233333333333334e-07, + "logits/chosen": 24134673.6, + "logits/rejected": 21031646.4, + "logps/chosen": -141.1941162109375, + "logps/rejected": -176.01248779296876, + "loss": 0.3860702276229858, + "rewards/chosen": -0.2229753017425537, + "rewards/margins": 1.4064032077789306, + "rewards/rejected": -1.6293785095214843, + "step": 4530 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.8296744227409363, + "kl": 3.7835755348205566, + "learning_rate": 5.122222222222222e-07, + "logits/chosen": 36388054.4, + "logits/rejected": 31688457.6, + "logps/chosen": -160.08714599609374, + "logps/rejected": -207.0276123046875, + "loss": 0.40818300247192385, + "rewards/chosen": 0.12663592100143434, + "rewards/margins": 1.2323408007621766, + "rewards/rejected": -1.1057048797607423, + "step": 4540 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.5752077102661133, + "kl": 3.985583543777466, + "learning_rate": 5.011111111111112e-07, + "logits/chosen": 21616614.4, + "logits/rejected": 22038934.4, + "logps/chosen": -139.400634765625, + "logps/rejected": -120.08944091796874, + "loss": 0.45534987449645997, + "rewards/chosen": -0.1639024496078491, + "rewards/margins": 0.37830965518951415, + "rewards/rejected": -0.5422121047973633, + "step": 4550 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.7708961367607117, + "kl": 3.2169101238250732, + "learning_rate": 4.900000000000001e-07, + "logits/chosen": 23925704.0, + "logits/rejected": 19160164.8, + "logps/chosen": -168.2747802734375, + "logps/rejected": -186.7814697265625, + "loss": 0.4789942741394043, + "rewards/chosen": -0.4888188362121582, + "rewards/margins": 0.5942277908325195, + "rewards/rejected": -1.0830466270446777, + "step": 4560 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 0.6243308186531067, + "kl": 4.417626857757568, + "learning_rate": 4.78888888888889e-07, + "logits/chosen": 27919126.4, + "logits/rejected": 26266936.0, + "logps/chosen": -180.739013671875, + "logps/rejected": -187.88717041015624, + "loss": 0.38467090129852294, + "rewards/chosen": -0.027477288246154787, + "rewards/margins": 1.3471161603927613, + "rewards/rejected": -1.374593448638916, + "step": 4570 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6753647327423096, + "kl": 2.572279930114746, + "learning_rate": 4.6777777777777785e-07, + "logits/chosen": 24245668.8, + "logits/rejected": 25570259.2, + "logps/chosen": -183.2251953125, + "logps/rejected": -144.81016845703124, + "loss": 0.47222309112548827, + "rewards/chosen": -0.7298378944396973, + "rewards/margins": -0.06857419013977051, + "rewards/rejected": -0.6612637042999268, + "step": 4580 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 0.6756667494773865, + "kl": 2.9830145835876465, + "learning_rate": 4.566666666666667e-07, + "logits/chosen": 19786300.8, + "logits/rejected": 18077294.4, + "logps/chosen": -142.7079833984375, + "logps/rejected": -167.11500244140626, + "loss": 0.42957119941711425, + "rewards/chosen": -0.10680264234542847, + "rewards/margins": 1.0903936505317688, + "rewards/rejected": -1.1971962928771973, + "step": 4590 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.5496138334274292, + "kl": 2.980377197265625, + "learning_rate": 4.455555555555556e-07, + "logits/chosen": 19038470.4, + "logits/rejected": 22191712.0, + "logps/chosen": -171.56102294921874, + "logps/rejected": -158.53353271484374, + "loss": 0.5080355644226074, + "rewards/chosen": -0.6536062240600586, + "rewards/margins": -0.24833087921142583, + "rewards/rejected": -0.4052753448486328, + "step": 4600 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.5254554748535156, + "eval_logits/chosen": 23809890.304, + "eval_logits/rejected": 24273334.272, + "eval_logps/chosen": -158.050265625, + "eval_logps/rejected": -154.70503125, + "eval_loss": 0.47947752475738525, + "eval_rewards/chosen": -0.37244818115234374, + "eval_rewards/margins": 0.2337820434570313, + "eval_rewards/rejected": -0.606230224609375, + "eval_runtime": 216.9986, + "eval_samples_per_second": 4.608, + "eval_steps_per_second": 2.304, + "step": 4600 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 0.53779536485672, + "kl": 2.9465878009796143, + "learning_rate": 4.344444444444445e-07, + "logits/chosen": 18342864.0, + "logits/rejected": 17376532.8, + "logps/chosen": -151.92774658203126, + "logps/rejected": -173.01671142578124, + "loss": 0.4371053218841553, + "rewards/chosen": 0.02389627695083618, + "rewards/margins": 0.8590038895606995, + "rewards/rejected": -0.8351076126098633, + "step": 4610 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.8398244380950928, + "kl": 4.876760482788086, + "learning_rate": 4.233333333333334e-07, + "logits/chosen": 18385000.0, + "logits/rejected": 16077065.6, + "logps/chosen": -145.19398193359376, + "logps/rejected": -156.1609619140625, + "loss": 0.4620822906494141, + "rewards/chosen": -0.28269295692443847, + "rewards/margins": 0.44993181228637696, + "rewards/rejected": -0.7326247692108154, + "step": 4620 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 0.5775973796844482, + "kl": 3.4174346923828125, + "learning_rate": 4.1222222222222225e-07, + "logits/chosen": 25656241.6, + "logits/rejected": 24305292.8, + "logps/chosen": -178.959912109375, + "logps/rejected": -173.44217529296876, + "loss": 0.42700676918029784, + "rewards/chosen": -0.3585548162460327, + "rewards/margins": 0.6643104791641236, + "rewards/rejected": -1.0228652954101562, + "step": 4630 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.5026371479034424, + "kl": 5.277353763580322, + "learning_rate": 4.0111111111111116e-07, + "logits/chosen": 23320708.8, + "logits/rejected": 26072505.6, + "logps/chosen": -162.64700927734376, + "logps/rejected": -163.052490234375, + "loss": 0.4349231719970703, + "rewards/chosen": 0.08882616758346558, + "rewards/margins": 0.8141135096549988, + "rewards/rejected": -0.7252873420715332, + "step": 4640 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 0.9511138796806335, + "kl": 3.9891674518585205, + "learning_rate": 3.9e-07, + "logits/chosen": 22138430.4, + "logits/rejected": 20103414.4, + "logps/chosen": -165.88394775390626, + "logps/rejected": -168.73675537109375, + "loss": 0.45732574462890624, + "rewards/chosen": -0.1767573595046997, + "rewards/margins": 0.7479500532150268, + "rewards/rejected": -0.9247074127197266, + "step": 4650 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.9161826968193054, + "kl": 4.220607280731201, + "learning_rate": 3.7888888888888894e-07, + "logits/chosen": 28969993.6, + "logits/rejected": 29838678.4, + "logps/chosen": -155.28150634765626, + "logps/rejected": -168.24874267578124, + "loss": 0.4191298961639404, + "rewards/chosen": 0.2649924993515015, + "rewards/margins": 0.8968584775924683, + "rewards/rejected": -0.6318659782409668, + "step": 4660 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 0.5795943140983582, + "kl": 2.4628450870513916, + "learning_rate": 3.677777777777778e-07, + "logits/chosen": 15824608.0, + "logits/rejected": 14315985.6, + "logps/chosen": -139.00059814453124, + "logps/rejected": -158.5366943359375, + "loss": 0.41784844398498533, + "rewards/chosen": -0.5109588146209717, + "rewards/margins": 0.7057264804840088, + "rewards/rejected": -1.2166852951049805, + "step": 4670 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 1.1482797861099243, + "kl": 5.052585601806641, + "learning_rate": 3.566666666666667e-07, + "logits/chosen": 31239507.2, + "logits/rejected": 31432432.0, + "logps/chosen": -179.59786376953124, + "logps/rejected": -199.677734375, + "loss": 0.4447749137878418, + "rewards/chosen": -0.503559160232544, + "rewards/margins": 0.2613923072814941, + "rewards/rejected": -0.7649514675140381, + "step": 4680 + }, + { + "epoch": 1.876, + "grad_norm": 0.4730696678161621, + "kl": 3.02178955078125, + "learning_rate": 3.4555555555555557e-07, + "logits/chosen": 31747740.8, + "logits/rejected": 31681862.4, + "logps/chosen": -140.8739990234375, + "logps/rejected": -135.987109375, + "loss": 0.48804163932800293, + "rewards/chosen": -0.5644313812255859, + "rewards/margins": -0.14893860816955562, + "rewards/rejected": -0.4154927730560303, + "step": 4690 + }, + { + "epoch": 1.88, + "grad_norm": 0.3857530653476715, + "kl": 4.020687103271484, + "learning_rate": 3.344444444444445e-07, + "logits/chosen": 22076350.4, + "logits/rejected": 21447452.8, + "logps/chosen": -153.19390869140625, + "logps/rejected": -149.92744140625, + "loss": 0.43955206871032715, + "rewards/chosen": -0.1915654182434082, + "rewards/margins": 0.6257681846618652, + "rewards/rejected": -0.8173336029052735, + "step": 4700 + }, + { + "epoch": 1.884, + "grad_norm": 0.6033211350440979, + "kl": 3.1054883003234863, + "learning_rate": 3.2333333333333334e-07, + "logits/chosen": 21611427.2, + "logits/rejected": 24669811.2, + "logps/chosen": -174.2831787109375, + "logps/rejected": -169.7625, + "loss": 0.46239595413208007, + "rewards/chosen": -0.480119514465332, + "rewards/margins": 0.46341266632080075, + "rewards/rejected": -0.9435321807861328, + "step": 4710 + }, + { + "epoch": 1.888, + "grad_norm": 0.4621000289916992, + "kl": 4.2236480712890625, + "learning_rate": 3.1222222222222226e-07, + "logits/chosen": 31168502.4, + "logits/rejected": 29779161.6, + "logps/chosen": -155.3637939453125, + "logps/rejected": -156.241455078125, + "loss": 0.47081618309020995, + "rewards/chosen": -0.27088658809661864, + "rewards/margins": 0.3101552248001099, + "rewards/rejected": -0.5810418128967285, + "step": 4720 + }, + { + "epoch": 1.892, + "grad_norm": 0.9124680161476135, + "kl": 3.5288634300231934, + "learning_rate": 3.0111111111111117e-07, + "logits/chosen": 21657052.8, + "logits/rejected": 22909753.6, + "logps/chosen": -122.133251953125, + "logps/rejected": -120.75472412109374, + "loss": 0.4612110137939453, + "rewards/chosen": -0.11650089025497437, + "rewards/margins": 0.5821841835975647, + "rewards/rejected": -0.6986850738525391, + "step": 4730 + }, + { + "epoch": 1.896, + "grad_norm": 0.6643645763397217, + "kl": 4.355770587921143, + "learning_rate": 2.9000000000000003e-07, + "logits/chosen": 18843272.0, + "logits/rejected": 15865353.6, + "logps/chosen": -149.32027587890624, + "logps/rejected": -151.32247314453124, + "loss": 0.43994879722595215, + "rewards/chosen": -0.003923875093460083, + "rewards/margins": 1.0778412997722626, + "rewards/rejected": -1.0817651748657227, + "step": 4740 + }, + { + "epoch": 1.9, + "grad_norm": 0.7815224528312683, + "kl": 3.7565338611602783, + "learning_rate": 2.7888888888888894e-07, + "logits/chosen": 17081561.6, + "logits/rejected": 20796334.4, + "logps/chosen": -143.27027587890626, + "logps/rejected": -158.528515625, + "loss": 0.4135895252227783, + "rewards/chosen": -0.08530845642089843, + "rewards/margins": 0.8226515769958496, + "rewards/rejected": -0.907960033416748, + "step": 4750 + }, + { + "epoch": 1.904, + "grad_norm": 0.6945417523384094, + "kl": 4.180878639221191, + "learning_rate": 2.677777777777778e-07, + "logits/chosen": 26961856.0, + "logits/rejected": 28458582.4, + "logps/chosen": -157.9197265625, + "logps/rejected": -175.12359619140625, + "loss": 0.4510028839111328, + "rewards/chosen": -0.5507327556610108, + "rewards/margins": 0.6062280178070069, + "rewards/rejected": -1.1569607734680176, + "step": 4760 + }, + { + "epoch": 1.908, + "grad_norm": 0.6717256903648376, + "kl": 2.790264844894409, + "learning_rate": 2.566666666666667e-07, + "logits/chosen": 26413414.4, + "logits/rejected": 26038793.6, + "logps/chosen": -155.70628662109374, + "logps/rejected": -127.781591796875, + "loss": 0.45128421783447265, + "rewards/chosen": -0.09372057914733886, + "rewards/margins": 0.4549129009246826, + "rewards/rejected": -0.5486334800720215, + "step": 4770 + }, + { + "epoch": 1.912, + "grad_norm": 0.7105498313903809, + "kl": 2.446235179901123, + "learning_rate": 2.455555555555556e-07, + "logits/chosen": 13721004.8, + "logits/rejected": 10354278.4, + "logps/chosen": -126.288916015625, + "logps/rejected": -146.79375, + "loss": 0.48598880767822267, + "rewards/chosen": -0.6219570159912109, + "rewards/margins": 0.45132789611816415, + "rewards/rejected": -1.073284912109375, + "step": 4780 + }, + { + "epoch": 1.916, + "grad_norm": 0.6213298439979553, + "kl": 3.837221622467041, + "learning_rate": 2.3444444444444446e-07, + "logits/chosen": 28986860.8, + "logits/rejected": 28953318.4, + "logps/chosen": -171.77513427734374, + "logps/rejected": -183.67852783203125, + "loss": 0.4227924346923828, + "rewards/chosen": 0.06139696836471557, + "rewards/margins": 1.0945536494255066, + "rewards/rejected": -1.033156681060791, + "step": 4790 + }, + { + "epoch": 1.92, + "grad_norm": 0.8169627785682678, + "kl": 2.6108901500701904, + "learning_rate": 2.2333333333333335e-07, + "logits/chosen": 25155881.6, + "logits/rejected": 27653331.2, + "logps/chosen": -169.9299072265625, + "logps/rejected": -174.208056640625, + "loss": 0.43941802978515626, + "rewards/chosen": -0.3946220397949219, + "rewards/margins": 0.4759023666381836, + "rewards/rejected": -0.8705244064331055, + "step": 4800 + }, + { + "epoch": 1.92, + "eval_kl": 3.7999300956726074, + "eval_logits/chosen": 24666335.232, + "eval_logits/rejected": 25062387.712, + "eval_logps/chosen": -157.002125, + "eval_logps/rejected": -153.64446875, + "eval_loss": 0.47916504740715027, + "eval_rewards/chosen": -0.2676349487304687, + "eval_rewards/margins": 0.23253808593749997, + "eval_rewards/rejected": -0.5001730346679687, + "eval_runtime": 221.7048, + "eval_samples_per_second": 4.511, + "eval_steps_per_second": 2.255, + "step": 4800 + }, + { + "epoch": 1.924, + "grad_norm": 0.7637141942977905, + "kl": 2.9708378314971924, + "learning_rate": 2.1222222222222223e-07, + "logits/chosen": 29937721.6, + "logits/rejected": 27126444.8, + "logps/chosen": -123.94395751953125, + "logps/rejected": -154.766162109375, + "loss": 0.41828279495239257, + "rewards/chosen": -0.29229035377502444, + "rewards/margins": 0.7458555698394774, + "rewards/rejected": -1.0381459236145019, + "step": 4810 + }, + { + "epoch": 1.928, + "grad_norm": 0.5419637560844421, + "kl": 4.58957576751709, + "learning_rate": 2.0111111111111112e-07, + "logits/chosen": 24804246.4, + "logits/rejected": 23061873.6, + "logps/chosen": -153.1744873046875, + "logps/rejected": -197.04478759765624, + "loss": 0.4111452102661133, + "rewards/chosen": 0.09454797506332398, + "rewards/margins": 1.0740018248558045, + "rewards/rejected": -0.9794538497924805, + "step": 4820 + }, + { + "epoch": 1.932, + "grad_norm": 0.8519544005393982, + "kl": 4.268065452575684, + "learning_rate": 1.9e-07, + "logits/chosen": 26309577.6, + "logits/rejected": 23776206.4, + "logps/chosen": -143.24700927734375, + "logps/rejected": -166.7473388671875, + "loss": 0.38274445533752444, + "rewards/chosen": 0.41695270538330076, + "rewards/margins": 1.2130630970001222, + "rewards/rejected": -0.7961103916168213, + "step": 4830 + }, + { + "epoch": 1.936, + "grad_norm": 0.7942814826965332, + "kl": 3.378706455230713, + "learning_rate": 1.788888888888889e-07, + "logits/chosen": 17424446.4, + "logits/rejected": 18293580.8, + "logps/chosen": -138.1279296875, + "logps/rejected": -144.323291015625, + "loss": 0.4470942497253418, + "rewards/chosen": -0.39404864311218263, + "rewards/margins": 0.655165147781372, + "rewards/rejected": -1.0492137908935546, + "step": 4840 + }, + { + "epoch": 1.94, + "grad_norm": 0.8591831922531128, + "kl": 5.319828510284424, + "learning_rate": 1.6777777777777778e-07, + "logits/chosen": 18527142.4, + "logits/rejected": 19455331.2, + "logps/chosen": -178.698828125, + "logps/rejected": -143.05250244140626, + "loss": 0.44331812858581543, + "rewards/chosen": -0.13738110065460205, + "rewards/margins": 0.4401890993118286, + "rewards/rejected": -0.5775701999664307, + "step": 4850 + }, + { + "epoch": 1.944, + "grad_norm": 0.533585250377655, + "kl": 5.78318977355957, + "learning_rate": 1.5666666666666667e-07, + "logits/chosen": 23592966.4, + "logits/rejected": 20490028.8, + "logps/chosen": -146.7453857421875, + "logps/rejected": -152.55186767578124, + "loss": 0.42699480056762695, + "rewards/chosen": 0.32837748527526855, + "rewards/margins": 0.9336020469665527, + "rewards/rejected": -0.6052245616912841, + "step": 4860 + }, + { + "epoch": 1.948, + "grad_norm": 0.9099907279014587, + "kl": 6.092495918273926, + "learning_rate": 1.4555555555555558e-07, + "logits/chosen": 25231241.6, + "logits/rejected": 20059145.6, + "logps/chosen": -132.9916259765625, + "logps/rejected": -202.9581298828125, + "loss": 0.4632194995880127, + "rewards/chosen": -0.10310151576995849, + "rewards/margins": 0.43110449314117427, + "rewards/rejected": -0.5342060089111328, + "step": 4870 + }, + { + "epoch": 1.952, + "grad_norm": 0.7092785835266113, + "kl": 4.797276020050049, + "learning_rate": 1.3444444444444447e-07, + "logits/chosen": 37765456.0, + "logits/rejected": 38072672.0, + "logps/chosen": -172.77252197265625, + "logps/rejected": -154.9837890625, + "loss": 0.4410356044769287, + "rewards/chosen": 0.2933366775512695, + "rewards/margins": 0.7540252685546875, + "rewards/rejected": -0.46068859100341797, + "step": 4880 + }, + { + "epoch": 1.956, + "grad_norm": 0.8571650981903076, + "kl": 4.5116286277771, + "learning_rate": 1.2333333333333335e-07, + "logits/chosen": 26100118.4, + "logits/rejected": 26903241.6, + "logps/chosen": -173.86666259765624, + "logps/rejected": -172.926513671875, + "loss": 0.44402332305908204, + "rewards/chosen": -0.08177288174629212, + "rewards/margins": 0.747695654630661, + "rewards/rejected": -0.8294685363769532, + "step": 4890 + }, + { + "epoch": 1.96, + "grad_norm": 0.5442430377006531, + "kl": 4.051485538482666, + "learning_rate": 1.1222222222222223e-07, + "logits/chosen": 29020172.8, + "logits/rejected": 27422496.0, + "logps/chosen": -167.26827392578124, + "logps/rejected": -181.95833740234374, + "loss": 0.4251366138458252, + "rewards/chosen": -0.028320443630218507, + "rewards/margins": 0.9460867524147034, + "rewards/rejected": -0.9744071960449219, + "step": 4900 + }, + { + "epoch": 1.964, + "grad_norm": 0.7448293566703796, + "kl": 3.2710437774658203, + "learning_rate": 1.0111111111111111e-07, + "logits/chosen": 30350928.0, + "logits/rejected": 30090121.6, + "logps/chosen": -165.0755615234375, + "logps/rejected": -158.35567626953124, + "loss": 0.42624425888061523, + "rewards/chosen": 0.13982071876525878, + "rewards/margins": 0.8426364898681641, + "rewards/rejected": -0.7028157711029053, + "step": 4910 + }, + { + "epoch": 1.968, + "grad_norm": 0.7627419829368591, + "kl": 3.8884456157684326, + "learning_rate": 9e-08, + "logits/chosen": 24701544.0, + "logits/rejected": 24283966.4, + "logps/chosen": -150.92520751953126, + "logps/rejected": -170.37452392578126, + "loss": 0.47156991958618166, + "rewards/chosen": -0.3949418067932129, + "rewards/margins": 0.43591785430908203, + "rewards/rejected": -0.8308596611022949, + "step": 4920 + }, + { + "epoch": 1.972, + "grad_norm": 0.7276384830474854, + "kl": 4.388872146606445, + "learning_rate": 7.88888888888889e-08, + "logits/chosen": 28244620.8, + "logits/rejected": 33328278.4, + "logps/chosen": -163.6046630859375, + "logps/rejected": -118.111083984375, + "loss": 0.4811519145965576, + "rewards/chosen": -0.36003918647766114, + "rewards/margins": 0.0924633979797363, + "rewards/rejected": -0.45250258445739744, + "step": 4930 + }, + { + "epoch": 1.976, + "grad_norm": 0.7117029428482056, + "kl": 5.032760143280029, + "learning_rate": 6.777777777777778e-08, + "logits/chosen": 22939393.6, + "logits/rejected": 21714008.0, + "logps/chosen": -145.11226806640624, + "logps/rejected": -152.2065185546875, + "loss": 0.44475903511047366, + "rewards/chosen": 0.10631499290466309, + "rewards/margins": 0.6777109622955322, + "rewards/rejected": -0.5713959693908691, + "step": 4940 + }, + { + "epoch": 1.98, + "grad_norm": 0.9050812125205994, + "kl": 4.289942741394043, + "learning_rate": 5.666666666666668e-08, + "logits/chosen": 30201168.0, + "logits/rejected": 28422313.6, + "logps/chosen": -178.32493896484374, + "logps/rejected": -183.266015625, + "loss": 0.42292218208312987, + "rewards/chosen": 0.3787633657455444, + "rewards/margins": 0.8953429460525513, + "rewards/rejected": -0.5165795803070068, + "step": 4950 + }, + { + "epoch": 1.984, + "grad_norm": 1.1372522115707397, + "kl": 3.6693832874298096, + "learning_rate": 4.5555555555555564e-08, + "logits/chosen": 32653657.6, + "logits/rejected": 30764761.6, + "logps/chosen": -148.58985595703126, + "logps/rejected": -156.90859375, + "loss": 0.4652153491973877, + "rewards/chosen": 0.04619384109973908, + "rewards/margins": 0.47224822342395784, + "rewards/rejected": -0.42605438232421877, + "step": 4960 + }, + { + "epoch": 1.988, + "grad_norm": 1.0090582370758057, + "kl": 4.217179775238037, + "learning_rate": 3.4444444444444444e-08, + "logits/chosen": 35080169.6, + "logits/rejected": 34306604.8, + "logps/chosen": -163.31170654296875, + "logps/rejected": -124.7479736328125, + "loss": 0.46434483528137205, + "rewards/chosen": -0.040966635942459105, + "rewards/margins": 0.5186804473400116, + "rewards/rejected": -0.5596470832824707, + "step": 4970 + }, + { + "epoch": 1.992, + "grad_norm": 0.8154371380805969, + "kl": 3.197679281234741, + "learning_rate": 2.3333333333333337e-08, + "logits/chosen": 14189470.4, + "logits/rejected": 14436078.4, + "logps/chosen": -132.91417236328124, + "logps/rejected": -114.66080322265626, + "loss": 0.4718677520751953, + "rewards/chosen": -0.3627371311187744, + "rewards/margins": 0.35457229614257807, + "rewards/rejected": -0.7173094272613525, + "step": 4980 + }, + { + "epoch": 1.996, + "grad_norm": 0.7544147372245789, + "kl": 3.230269193649292, + "learning_rate": 1.2222222222222224e-08, + "logits/chosen": 31088787.2, + "logits/rejected": 30435852.8, + "logps/chosen": -119.33135986328125, + "logps/rejected": -141.03900146484375, + "loss": 0.42151288986206054, + "rewards/chosen": 0.11817736625671386, + "rewards/margins": 0.9457557201385498, + "rewards/rejected": -0.827578353881836, + "step": 4990 + }, + { + "epoch": 2.0, + "grad_norm": 0.8810012340545654, + "kl": 3.4651100635528564, + "learning_rate": 1.1111111111111113e-09, + "logits/chosen": 21273099.2, + "logits/rejected": 24867558.4, + "logps/chosen": -164.9623779296875, + "logps/rejected": -131.0203125, + "loss": 0.4696086883544922, + "rewards/chosen": -0.329726505279541, + "rewards/margins": 0.22194571495056153, + "rewards/rejected": -0.5516722202301025, + "step": 5000 + }, + { + "epoch": 2.0, + "eval_kl": 3.9559381008148193, + "eval_logits/chosen": 24913780.736, + "eval_logits/rejected": 25290629.12, + "eval_logps/chosen": -156.587265625, + "eval_logps/rejected": -153.22965625, + "eval_loss": 0.47909215092658997, + "eval_rewards/chosen": -0.22614874267578125, + "eval_rewards/margins": 0.23254312133789065, + "eval_rewards/rejected": -0.4586918640136719, + "eval_runtime": 217.5944, + "eval_samples_per_second": 4.596, + "eval_steps_per_second": 2.298, + "step": 5000 + } + ], + "logging_steps": 10, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_20k/lora/checkpoint-5000/training_args.bin b/v5/KTO/KTO_20k/lora/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..3b3df9314987039f6eb4aae71c1789a27c508f03 --- /dev/null +++ b/v5/KTO/KTO_20k/lora/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b224910eb4f0913af2c07ef9b4ff545409726d7169b35fc1b136bed8f918d2c +size 5521 diff --git a/v5/KTO/KTO_2k/KTO_2k/README.md b/v5/KTO/KTO_2k/KTO_2k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_2k/KTO_2k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_2k/KTO_2k/adapter_config.json b/v5/KTO/KTO_2k/KTO_2k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..79c87a5e26c750982ea826f71569628c0700f71f --- /dev/null +++ b/v5/KTO/KTO_2k/KTO_2k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "gate_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_2k/KTO_2k/adapter_model.safetensors b/v5/KTO/KTO_2k/KTO_2k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dcefc15cae8ecc7bb912fe8bde2505a59d0eebcb --- /dev/null +++ b/v5/KTO/KTO_2k/KTO_2k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72b3b7d336ff6daaf659b5ae99796ada35d4a1c485657c3327204ea5455fee5 +size 180385008 diff --git a/v5/KTO/KTO_2k/MKTO_2k/chat_template.jinja b/v5/KTO/KTO_2k/MKTO_2k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_2k/MKTO_2k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_2k/MKTO_2k/config.json b/v5/KTO/KTO_2k/MKTO_2k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ec18bc1ae18922052a57480e28401f3b9c6b84 --- /dev/null +++ b/v5/KTO/KTO_2k/MKTO_2k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/KTO/KTO_2k/MKTO_2k/generation_config.json b/v5/KTO/KTO_2k/MKTO_2k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..898a2e748a41a4b7a931b754f8abbdac02039fae --- /dev/null +++ b/v5/KTO/KTO_2k/MKTO_2k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.2.0" +} diff --git a/v5/KTO/KTO_2k/MKTO_2k/model.safetensors b/v5/KTO/KTO_2k/MKTO_2k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a09aceaa34e40d55d3984f3e5a54947e556e156a --- /dev/null +++ b/v5/KTO/KTO_2k/MKTO_2k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a66114971dc79e07c58a643a7cfbd71f2d8f0181cfb2acc1a400cccebd4165a +size 2471645464 diff --git a/v5/KTO/KTO_2k/MKTO_2k/tokenizer.json b/v5/KTO/KTO_2k/MKTO_2k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_2k/MKTO_2k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_2k/MKTO_2k/tokenizer_config.json b/v5/KTO/KTO_2k/MKTO_2k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_2k/MKTO_2k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_2k/lora/README.md b/v5/KTO/KTO_2k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5af818503142d6d030deaf75b24dc337cfcdd19a --- /dev/null +++ b/v5/KTO/KTO_2k/lora/README.md @@ -0,0 +1,66 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- kto +- trl +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/zb6z96g9) + + +This model was trained with KTO, a method introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306). + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 5.2.0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite KTO as: + +```bibtex +@article{ethayarajh2024kto, + title = {{KTO: Model Alignment as Prospect Theoretic Optimization}}, + author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela}, + year = 2024, + eprint = {arXiv:2402.01306}, +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/README.md b/v5/KTO/KTO_2k/lora/checkpoint-180/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/adapter_config.json b/v5/KTO/KTO_2k/lora/checkpoint-180/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..79c87a5e26c750982ea826f71569628c0700f71f --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "gate_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/adapter_model.safetensors b/v5/KTO/KTO_2k/lora/checkpoint-180/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dcefc15cae8ecc7bb912fe8bde2505a59d0eebcb --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d72b3b7d336ff6daaf659b5ae99796ada35d4a1c485657c3327204ea5455fee5 +size 180385008 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/chat_template.jinja b/v5/KTO/KTO_2k/lora/checkpoint-180/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/optimizer.pt b/v5/KTO/KTO_2k/lora/checkpoint-180/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc56f8a0af5a15736cd7c97431f91d41882d8e0f --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a2b67093d1b3d5fffb16340cc8311ffda90eb77df07cac2cbc9d023663b9268 +size 360902475 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/rng_state.pth b/v5/KTO/KTO_2k/lora/checkpoint-180/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9cd53ddd51087a39d299073b9d407413ac1f02a5 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17cd930da9783ca70bad4b9cdeee6a06c0acea8f34645a333c93341f487f66a3 +size 14645 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/scaler.pt b/v5/KTO/KTO_2k/lora/checkpoint-180/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..909b4368b345625d18b5bb74870c91f6211af7f4 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a46dab75a56f27dde2e8e3e67ded80f1f613cc87f428da2078839cef043ade4 +size 1383 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/scheduler.pt b/v5/KTO/KTO_2k/lora/checkpoint-180/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..11fdfd7854210bac954b9c94933e4470b3bbd670 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06928441d7e7b9bb5e28e7f5db9adc8a92b5bc74cf3e1fe2743f66b9f31aff5a +size 1465 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/tokenizer.json b/v5/KTO/KTO_2k/lora/checkpoint-180/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/tokenizer_config.json b/v5/KTO/KTO_2k/lora/checkpoint-180/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/trainer_state.json b/v5/KTO/KTO_2k/lora/checkpoint-180/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..643ab1df353b9cd912a59780392b9566ef0e7d3d --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/trainer_state.json @@ -0,0 +1,448 @@ +{ + "best_global_step": 180, + "best_metric": 0.010626815795898442, + "best_model_checkpoint": "output/lora/checkpoint-180", + "epoch": 0.72, + "eval_steps": 20, + "global_step": 180, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04, + "grad_norm": 0.5028387904167175, + "kl": 0.01572093926370144, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": 30264627.2, + "logits/rejected": 32013382.4, + "logps/chosen": -148.62435302734374, + "logps/rejected": -128.52413330078124, + "loss": 0.5001428127288818, + "rewards/chosen": 0.000409355154260993, + "rewards/margins": -0.0011432173196226358, + "rewards/rejected": 0.0015525724738836288, + "step": 10 + }, + { + "epoch": 0.08, + "grad_norm": 0.4273326098918915, + "kl": 0.02463815174996853, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": 54053068.8, + "logits/rejected": 53587827.2, + "logps/chosen": -139.9073974609375, + "logps/rejected": -151.66370849609376, + "loss": 0.5000814437866211, + "rewards/chosen": 0.0014432575553655624, + "rewards/margins": -0.0006511589512228967, + "rewards/rejected": 0.002094416506588459, + "step": 20 + }, + { + "epoch": 0.08, + "eval_kl": 0.019881391897797585, + "eval_logits/chosen": 38914011.136, + "eval_logits/rejected": 38816776.192, + "eval_logps/chosen": -154.32296875, + "eval_logps/rejected": -148.641921875, + "eval_loss": 0.49997514486312866, + "eval_rewards/chosen": 0.00028007772564888, + "eval_rewards/margins": 0.00019879454374313355, + "eval_rewards/rejected": 8.128318190574646e-05, + "eval_runtime": 214.9745, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 2.326, + "step": 20 + }, + { + "epoch": 0.12, + "grad_norm": 0.4189560115337372, + "kl": 0.011721396818757057, + "learning_rate": 2.9e-06, + "logits/chosen": 34689024.0, + "logits/rejected": 34760614.4, + "logps/chosen": -131.55169677734375, + "logps/rejected": -140.08671875, + "loss": 0.4998650550842285, + "rewards/chosen": -0.0027512358501553535, + "rewards/margins": 0.0010801648721098902, + "rewards/rejected": -0.0038314007222652437, + "step": 30 + }, + { + "epoch": 0.16, + "grad_norm": 0.35710522532463074, + "kl": 0.014891624450683594, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": 43734198.4, + "logits/rejected": 44358838.4, + "logps/chosen": -144.14744873046874, + "logps/rejected": -145.7405517578125, + "loss": 0.49999523162841797, + "rewards/chosen": -0.003765878826379776, + "rewards/margins": 3.8427859544754115e-05, + "rewards/rejected": -0.00380430668592453, + "step": 40 + }, + { + "epoch": 0.16, + "eval_kl": 0.010086631402373314, + "eval_logits/chosen": 38739668.992, + "eval_logits/rejected": 38645895.168, + "eval_logps/chosen": -154.4100625, + "eval_logps/rejected": -148.72634375, + "eval_loss": 0.5000083446502686, + "eval_rewards/chosen": -0.00842873191833496, + "eval_rewards/margins": -6.65245056152338e-05, + "eval_rewards/rejected": -0.008362207412719726, + "eval_runtime": 214.0434, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 2.336, + "step": 40 + }, + { + "epoch": 0.2, + "grad_norm": 0.32243308424949646, + "kl": 0.03667576238512993, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": 43570592.0, + "logits/rejected": 45423254.4, + "logps/chosen": -140.7910400390625, + "logps/rejected": -154.12264404296874, + "loss": 0.4997419357299805, + "rewards/chosen": -0.0029973506927490233, + "rewards/margins": 0.0020649198442697523, + "rewards/rejected": -0.005062270537018776, + "step": 50 + }, + { + "epoch": 0.24, + "grad_norm": 0.2825469970703125, + "kl": 0.11386795341968536, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": 37111891.2, + "logits/rejected": 34546041.6, + "logps/chosen": -105.48106689453125, + "logps/rejected": -113.80091552734375, + "loss": 0.4988723278045654, + "rewards/chosen": 0.008526696264743805, + "rewards/margins": 0.009030784061178566, + "rewards/rejected": -0.0005040877964347601, + "step": 60 + }, + { + "epoch": 0.24, + "eval_kl": 0.07935438305139542, + "eval_logits/chosen": 38417367.04, + "eval_logits/rejected": 38329237.504, + "eval_logps/chosen": -154.4071875, + "eval_logps/rejected": -148.7261875, + "eval_loss": 0.49997425079345703, + "eval_rewards/chosen": -0.008140082359313966, + "eval_rewards/margins": 0.0002057380676269531, + "eval_rewards/rejected": -0.008345820426940919, + "eval_runtime": 214.5644, + "eval_samples_per_second": 4.661, + "eval_steps_per_second": 2.33, + "step": 60 + }, + { + "epoch": 0.28, + "grad_norm": 0.38684579730033875, + "kl": 0.07920856773853302, + "learning_rate": 4.7888888888888894e-06, + "logits/chosen": 47922112.0, + "logits/rejected": 46894880.0, + "logps/chosen": -165.71815185546876, + "logps/rejected": -175.34261474609374, + "loss": 0.498098087310791, + "rewards/chosen": -0.0340001106262207, + "rewards/margins": 0.01531563997268677, + "rewards/rejected": -0.04931575059890747, + "step": 70 + }, + { + "epoch": 0.32, + "grad_norm": 0.46789467334747314, + "kl": 0.07494600117206573, + "learning_rate": 4.677777777777778e-06, + "logits/chosen": 40747894.4, + "logits/rejected": 40754576.0, + "logps/chosen": -155.0984619140625, + "logps/rejected": -163.70439453125, + "loss": 0.4992827415466309, + "rewards/chosen": -0.0378705084323883, + "rewards/margins": 0.005790993571281433, + "rewards/rejected": -0.043661502003669736, + "step": 80 + }, + { + "epoch": 0.32, + "eval_kl": 0.08320723474025726, + "eval_logits/chosen": 37684162.56, + "eval_logits/rejected": 37622898.688, + "eval_logps/chosen": -154.907359375, + "eval_logps/rejected": -149.193203125, + "eval_loss": 0.5003835558891296, + "eval_rewards/chosen": -0.05815739822387695, + "eval_rewards/margins": -0.0031089820861816414, + "eval_rewards/rejected": -0.05504841613769531, + "eval_runtime": 214.9467, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 2.326, + "step": 80 + }, + { + "epoch": 0.36, + "grad_norm": 0.3978089690208435, + "kl": 0.17448857426643372, + "learning_rate": 4.566666666666667e-06, + "logits/chosen": 43566720.0, + "logits/rejected": 43788972.8, + "logps/chosen": -169.9933837890625, + "logps/rejected": -159.5298828125, + "loss": 0.4963120460510254, + "rewards/chosen": -0.02978883981704712, + "rewards/margins": 0.029619407653808594, + "rewards/rejected": -0.05940824747085571, + "step": 90 + }, + { + "epoch": 0.4, + "grad_norm": 0.48971185088157654, + "kl": 0.3320249021053314, + "learning_rate": 4.455555555555555e-06, + "logits/chosen": 27562105.6, + "logits/rejected": 26916403.2, + "logps/chosen": -133.9698486328125, + "logps/rejected": -164.26551513671876, + "loss": 0.49673967361450194, + "rewards/chosen": -0.009067486226558685, + "rewards/margins": 0.026504097878932955, + "rewards/rejected": -0.03557158410549164, + "step": 100 + }, + { + "epoch": 0.4, + "eval_kl": 0.2823386490345001, + "eval_logits/chosen": 37584830.464, + "eval_logits/rejected": 37518467.072, + "eval_logps/chosen": -154.656765625, + "eval_logps/rejected": -148.958828125, + "eval_loss": 0.500180184841156, + "eval_rewards/chosen": -0.033099037170410156, + "eval_rewards/margins": -0.001489009857177731, + "eval_rewards/rejected": -0.031610027313232425, + "eval_runtime": 214.534, + "eval_samples_per_second": 4.661, + "eval_steps_per_second": 2.331, + "step": 100 + }, + { + "epoch": 0.44, + "grad_norm": 0.38800352811813354, + "kl": 0.35814735293388367, + "learning_rate": 4.344444444444445e-06, + "logits/chosen": 37732076.8, + "logits/rejected": 37432582.4, + "logps/chosen": -130.15322265625, + "logps/rejected": -135.52431640625, + "loss": 0.4978357791900635, + "rewards/chosen": 0.0017490973696112633, + "rewards/margins": 0.017441909573972226, + "rewards/rejected": -0.015692812204360963, + "step": 110 + }, + { + "epoch": 0.48, + "grad_norm": 0.3928312063217163, + "kl": 0.5314055681228638, + "learning_rate": 4.233333333333334e-06, + "logits/chosen": 46644518.4, + "logits/rejected": 46458028.8, + "logps/chosen": -161.97745361328126, + "logps/rejected": -133.66865234375, + "loss": 0.5009183883666992, + "rewards/chosen": -0.017161448299884797, + "rewards/margins": -0.007284644991159439, + "rewards/rejected": -0.009876803308725358, + "step": 120 + }, + { + "epoch": 0.48, + "eval_kl": 0.3733839988708496, + "eval_logits/chosen": 37409923.072, + "eval_logits/rejected": 37337829.376, + "eval_logps/chosen": -154.6501875, + "eval_logps/rejected": -148.94265625, + "eval_loss": 0.5002961754798889, + "eval_rewards/chosen": -0.032439395904541014, + "eval_rewards/margins": -0.002446096420288084, + "eval_rewards/rejected": -0.02999329948425293, + "eval_runtime": 214.7717, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 120 + }, + { + "epoch": 0.52, + "grad_norm": 0.31859076023101807, + "kl": 0.7786873579025269, + "learning_rate": 4.122222222222222e-06, + "logits/chosen": 39394198.4, + "logits/rejected": 38848038.4, + "logps/chosen": -161.5689453125, + "logps/rejected": -139.7646484375, + "loss": 0.5006535530090332, + "rewards/chosen": 0.037287008762359616, + "rewards/margins": -0.0053613424301147475, + "rewards/rejected": 0.042648351192474364, + "step": 130 + }, + { + "epoch": 0.56, + "grad_norm": 0.35979101061820984, + "kl": 0.9413240551948547, + "learning_rate": 4.011111111111111e-06, + "logits/chosen": 40873280.0, + "logits/rejected": 42775779.2, + "logps/chosen": -141.7545654296875, + "logps/rejected": -156.98377685546876, + "loss": 0.5007112503051758, + "rewards/chosen": 0.06499210000038147, + "rewards/margins": -0.0058412253856658936, + "rewards/rejected": 0.07083332538604736, + "step": 140 + }, + { + "epoch": 0.56, + "eval_kl": 0.8415165543556213, + "eval_logits/chosen": 37861306.368, + "eval_logits/rejected": 37748256.768, + "eval_logps/chosen": -153.833046875, + "eval_logps/rejected": -148.1711875, + "eval_loss": 0.49973729252815247, + "eval_rewards/chosen": 0.04927331161499023, + "eval_rewards/margins": 0.0021207618713378895, + "eval_rewards/rejected": 0.04715254974365234, + "eval_runtime": 213.9529, + "eval_samples_per_second": 4.674, + "eval_steps_per_second": 2.337, + "step": 140 + }, + { + "epoch": 0.6, + "grad_norm": 0.3864636719226837, + "kl": 1.0000280141830444, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": 40998265.6, + "logits/rejected": 42312035.2, + "logps/chosen": -129.3378662109375, + "logps/rejected": -116.82637939453124, + "loss": 0.495453929901123, + "rewards/chosen": 0.08338069915771484, + "rewards/margins": 0.03556356728076935, + "rewards/rejected": 0.047817131876945494, + "step": 150 + }, + { + "epoch": 0.64, + "grad_norm": 0.3939324915409088, + "kl": 1.3347299098968506, + "learning_rate": 3.7888888888888893e-06, + "logits/chosen": 34671974.4, + "logits/rejected": 35068432.0, + "logps/chosen": -142.6143798828125, + "logps/rejected": -146.22757568359376, + "loss": 0.4980185508728027, + "rewards/chosen": 0.10691760778427124, + "rewards/margins": 0.016442364454269415, + "rewards/rejected": 0.09047524333000183, + "step": 160 + }, + { + "epoch": 0.64, + "eval_kl": 1.297573208808899, + "eval_logits/chosen": 38283317.248, + "eval_logits/rejected": 38130507.776, + "eval_logps/chosen": -153.150921875, + "eval_logps/rejected": -147.5410625, + "eval_loss": 0.4990925192832947, + "eval_rewards/chosen": 0.11748551940917969, + "eval_rewards/margins": 0.007319358825683589, + "eval_rewards/rejected": 0.1101661605834961, + "eval_runtime": 214.7809, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 160 + }, + { + "epoch": 0.68, + "grad_norm": 0.32822251319885254, + "kl": 1.4739606380462646, + "learning_rate": 3.6777777777777778e-06, + "logits/chosen": 38105820.8, + "logits/rejected": 39919369.6, + "logps/chosen": -137.03099365234374, + "logps/rejected": -148.72579345703124, + "loss": 0.49805259704589844, + "rewards/chosen": 0.14163793325424195, + "rewards/margins": 0.015950965881347673, + "rewards/rejected": 0.12568696737289428, + "step": 170 + }, + { + "epoch": 0.72, + "grad_norm": 0.48161256313323975, + "kl": 1.9404491186141968, + "learning_rate": 3.566666666666667e-06, + "logits/chosen": 44436915.2, + "logits/rejected": 45288496.0, + "logps/chosen": -143.67095947265625, + "logps/rejected": -169.277587890625, + "loss": 0.4995573997497559, + "rewards/chosen": 0.18708930015563965, + "rewards/margins": 0.003508448600769043, + "rewards/rejected": 0.1835808515548706, + "step": 180 + }, + { + "epoch": 0.72, + "eval_kl": 1.780542254447937, + "eval_logits/chosen": 38592557.056, + "eval_logits/rejected": 38413033.472, + "eval_logps/chosen": -152.56709375, + "eval_logps/rejected": -146.9903125, + "eval_loss": 0.4986813962459564, + "eval_rewards/chosen": 0.1758671875, + "eval_rewards/margins": 0.010626815795898442, + "eval_rewards/rejected": 0.16524037170410155, + "eval_runtime": 214.9135, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 2.327, + "step": 180 + } + ], + "logging_steps": 10, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-180/training_args.bin b/v5/KTO/KTO_2k/lora/checkpoint-180/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4bd98b4c087a91a6868c0d02be1d3fadc2a8cce1 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-180/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a7ce213d9d8d780414e78695f2513e61226fb7b06531a5bcdf434ae993c976 +size 5521 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/README.md b/v5/KTO/KTO_2k/lora/checkpoint-480/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/adapter_config.json b/v5/KTO/KTO_2k/lora/checkpoint-480/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..79c87a5e26c750982ea826f71569628c0700f71f --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "gate_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/adapter_model.safetensors b/v5/KTO/KTO_2k/lora/checkpoint-480/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2653a01edda02910b78e6fe6f1306b22d29a326c --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a460c68ef77af9f4d55f352649d150803e71fd81f8c2b85163e99bb680d0d185 +size 180385008 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/chat_template.jinja b/v5/KTO/KTO_2k/lora/checkpoint-480/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/optimizer.pt b/v5/KTO/KTO_2k/lora/checkpoint-480/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fdae6faf6a0a4a4ab2c9623c0f1fa0e6a2715c4 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53f9bfc4b0a8e588a60df9d117dc1fcb193aa3f5818df370940965850cae4445 +size 360902475 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/rng_state.pth b/v5/KTO/KTO_2k/lora/checkpoint-480/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1062af8f13c012194172bd08b0cf5acef1661de7 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe17a5fe328cdb084ec10afd49d2cb7bb9035b6736206385c2c19bf9f66c41 +size 14645 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/scaler.pt b/v5/KTO/KTO_2k/lora/checkpoint-480/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..97b7fefa051328c77bd749295f057c75bab00507 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d48ee9d9909680ca611f0a95c8cefcadb338dd2851b722337f41dd0606fbe3b +size 1383 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/scheduler.pt b/v5/KTO/KTO_2k/lora/checkpoint-480/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d29300914f3c32a9e3c4c7fe4696a81b2b553579 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:805b92651a554ce3dc994fa308e5874d2eb050fadf0b3c50e5a8d186de5a65dd +size 1465 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/tokenizer.json b/v5/KTO/KTO_2k/lora/checkpoint-480/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/tokenizer_config.json b/v5/KTO/KTO_2k/lora/checkpoint-480/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/trainer_state.json b/v5/KTO/KTO_2k/lora/checkpoint-480/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f8c2f32abb6d8c028b96fca383d6d63ae7d73c21 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/trainer_state.json @@ -0,0 +1,1138 @@ +{ + "best_global_step": 180, + "best_metric": 0.010626815795898442, + "best_model_checkpoint": "output/lora/checkpoint-180", + "epoch": 1.92, + "eval_steps": 20, + "global_step": 480, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04, + "grad_norm": 0.5028387904167175, + "kl": 0.01572093926370144, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": 30264627.2, + "logits/rejected": 32013382.4, + "logps/chosen": -148.62435302734374, + "logps/rejected": -128.52413330078124, + "loss": 0.5001428127288818, + "rewards/chosen": 0.000409355154260993, + "rewards/margins": -0.0011432173196226358, + "rewards/rejected": 0.0015525724738836288, + "step": 10 + }, + { + "epoch": 0.08, + "grad_norm": 0.4273326098918915, + "kl": 0.02463815174996853, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": 54053068.8, + "logits/rejected": 53587827.2, + "logps/chosen": -139.9073974609375, + "logps/rejected": -151.66370849609376, + "loss": 0.5000814437866211, + "rewards/chosen": 0.0014432575553655624, + "rewards/margins": -0.0006511589512228967, + "rewards/rejected": 0.002094416506588459, + "step": 20 + }, + { + "epoch": 0.08, + "eval_kl": 0.019881391897797585, + "eval_logits/chosen": 38914011.136, + "eval_logits/rejected": 38816776.192, + "eval_logps/chosen": -154.32296875, + "eval_logps/rejected": -148.641921875, + "eval_loss": 0.49997514486312866, + "eval_rewards/chosen": 0.00028007772564888, + "eval_rewards/margins": 0.00019879454374313355, + "eval_rewards/rejected": 8.128318190574646e-05, + "eval_runtime": 214.9745, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 2.326, + "step": 20 + }, + { + "epoch": 0.12, + "grad_norm": 0.4189560115337372, + "kl": 0.011721396818757057, + "learning_rate": 2.9e-06, + "logits/chosen": 34689024.0, + "logits/rejected": 34760614.4, + "logps/chosen": -131.55169677734375, + "logps/rejected": -140.08671875, + "loss": 0.4998650550842285, + "rewards/chosen": -0.0027512358501553535, + "rewards/margins": 0.0010801648721098902, + "rewards/rejected": -0.0038314007222652437, + "step": 30 + }, + { + "epoch": 0.16, + "grad_norm": 0.35710522532463074, + "kl": 0.014891624450683594, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": 43734198.4, + "logits/rejected": 44358838.4, + "logps/chosen": -144.14744873046874, + "logps/rejected": -145.7405517578125, + "loss": 0.49999523162841797, + "rewards/chosen": -0.003765878826379776, + "rewards/margins": 3.8427859544754115e-05, + "rewards/rejected": -0.00380430668592453, + "step": 40 + }, + { + "epoch": 0.16, + "eval_kl": 0.010086631402373314, + "eval_logits/chosen": 38739668.992, + "eval_logits/rejected": 38645895.168, + "eval_logps/chosen": -154.4100625, + "eval_logps/rejected": -148.72634375, + "eval_loss": 0.5000083446502686, + "eval_rewards/chosen": -0.00842873191833496, + "eval_rewards/margins": -6.65245056152338e-05, + "eval_rewards/rejected": -0.008362207412719726, + "eval_runtime": 214.0434, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 2.336, + "step": 40 + }, + { + "epoch": 0.2, + "grad_norm": 0.32243308424949646, + "kl": 0.03667576238512993, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": 43570592.0, + "logits/rejected": 45423254.4, + "logps/chosen": -140.7910400390625, + "logps/rejected": -154.12264404296874, + "loss": 0.4997419357299805, + "rewards/chosen": -0.0029973506927490233, + "rewards/margins": 0.0020649198442697523, + "rewards/rejected": -0.005062270537018776, + "step": 50 + }, + { + "epoch": 0.24, + "grad_norm": 0.2825469970703125, + "kl": 0.11386795341968536, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": 37111891.2, + "logits/rejected": 34546041.6, + "logps/chosen": -105.48106689453125, + "logps/rejected": -113.80091552734375, + "loss": 0.4988723278045654, + "rewards/chosen": 0.008526696264743805, + "rewards/margins": 0.009030784061178566, + "rewards/rejected": -0.0005040877964347601, + "step": 60 + }, + { + "epoch": 0.24, + "eval_kl": 0.07935438305139542, + "eval_logits/chosen": 38417367.04, + "eval_logits/rejected": 38329237.504, + "eval_logps/chosen": -154.4071875, + "eval_logps/rejected": -148.7261875, + "eval_loss": 0.49997425079345703, + "eval_rewards/chosen": -0.008140082359313966, + "eval_rewards/margins": 0.0002057380676269531, + "eval_rewards/rejected": -0.008345820426940919, + "eval_runtime": 214.5644, + "eval_samples_per_second": 4.661, + "eval_steps_per_second": 2.33, + "step": 60 + }, + { + "epoch": 0.28, + "grad_norm": 0.38684579730033875, + "kl": 0.07920856773853302, + "learning_rate": 4.7888888888888894e-06, + "logits/chosen": 47922112.0, + "logits/rejected": 46894880.0, + "logps/chosen": -165.71815185546876, + "logps/rejected": -175.34261474609374, + "loss": 0.498098087310791, + "rewards/chosen": -0.0340001106262207, + "rewards/margins": 0.01531563997268677, + "rewards/rejected": -0.04931575059890747, + "step": 70 + }, + { + "epoch": 0.32, + "grad_norm": 0.46789467334747314, + "kl": 0.07494600117206573, + "learning_rate": 4.677777777777778e-06, + "logits/chosen": 40747894.4, + "logits/rejected": 40754576.0, + "logps/chosen": -155.0984619140625, + "logps/rejected": -163.70439453125, + "loss": 0.4992827415466309, + "rewards/chosen": -0.0378705084323883, + "rewards/margins": 0.005790993571281433, + "rewards/rejected": -0.043661502003669736, + "step": 80 + }, + { + "epoch": 0.32, + "eval_kl": 0.08320723474025726, + "eval_logits/chosen": 37684162.56, + "eval_logits/rejected": 37622898.688, + "eval_logps/chosen": -154.907359375, + "eval_logps/rejected": -149.193203125, + "eval_loss": 0.5003835558891296, + "eval_rewards/chosen": -0.05815739822387695, + "eval_rewards/margins": -0.0031089820861816414, + "eval_rewards/rejected": -0.05504841613769531, + "eval_runtime": 214.9467, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 2.326, + "step": 80 + }, + { + "epoch": 0.36, + "grad_norm": 0.3978089690208435, + "kl": 0.17448857426643372, + "learning_rate": 4.566666666666667e-06, + "logits/chosen": 43566720.0, + "logits/rejected": 43788972.8, + "logps/chosen": -169.9933837890625, + "logps/rejected": -159.5298828125, + "loss": 0.4963120460510254, + "rewards/chosen": -0.02978883981704712, + "rewards/margins": 0.029619407653808594, + "rewards/rejected": -0.05940824747085571, + "step": 90 + }, + { + "epoch": 0.4, + "grad_norm": 0.48971185088157654, + "kl": 0.3320249021053314, + "learning_rate": 4.455555555555555e-06, + "logits/chosen": 27562105.6, + "logits/rejected": 26916403.2, + "logps/chosen": -133.9698486328125, + "logps/rejected": -164.26551513671876, + "loss": 0.49673967361450194, + "rewards/chosen": -0.009067486226558685, + "rewards/margins": 0.026504097878932955, + "rewards/rejected": -0.03557158410549164, + "step": 100 + }, + { + "epoch": 0.4, + "eval_kl": 0.2823386490345001, + "eval_logits/chosen": 37584830.464, + "eval_logits/rejected": 37518467.072, + "eval_logps/chosen": -154.656765625, + "eval_logps/rejected": -148.958828125, + "eval_loss": 0.500180184841156, + "eval_rewards/chosen": -0.033099037170410156, + "eval_rewards/margins": -0.001489009857177731, + "eval_rewards/rejected": -0.031610027313232425, + "eval_runtime": 214.534, + "eval_samples_per_second": 4.661, + "eval_steps_per_second": 2.331, + "step": 100 + }, + { + "epoch": 0.44, + "grad_norm": 0.38800352811813354, + "kl": 0.35814735293388367, + "learning_rate": 4.344444444444445e-06, + "logits/chosen": 37732076.8, + "logits/rejected": 37432582.4, + "logps/chosen": -130.15322265625, + "logps/rejected": -135.52431640625, + "loss": 0.4978357791900635, + "rewards/chosen": 0.0017490973696112633, + "rewards/margins": 0.017441909573972226, + "rewards/rejected": -0.015692812204360963, + "step": 110 + }, + { + "epoch": 0.48, + "grad_norm": 0.3928312063217163, + "kl": 0.5314055681228638, + "learning_rate": 4.233333333333334e-06, + "logits/chosen": 46644518.4, + "logits/rejected": 46458028.8, + "logps/chosen": -161.97745361328126, + "logps/rejected": -133.66865234375, + "loss": 0.5009183883666992, + "rewards/chosen": -0.017161448299884797, + "rewards/margins": -0.007284644991159439, + "rewards/rejected": -0.009876803308725358, + "step": 120 + }, + { + "epoch": 0.48, + "eval_kl": 0.3733839988708496, + "eval_logits/chosen": 37409923.072, + "eval_logits/rejected": 37337829.376, + "eval_logps/chosen": -154.6501875, + "eval_logps/rejected": -148.94265625, + "eval_loss": 0.5002961754798889, + "eval_rewards/chosen": -0.032439395904541014, + "eval_rewards/margins": -0.002446096420288084, + "eval_rewards/rejected": -0.02999329948425293, + "eval_runtime": 214.7717, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 120 + }, + { + "epoch": 0.52, + "grad_norm": 0.31859076023101807, + "kl": 0.7786873579025269, + "learning_rate": 4.122222222222222e-06, + "logits/chosen": 39394198.4, + "logits/rejected": 38848038.4, + "logps/chosen": -161.5689453125, + "logps/rejected": -139.7646484375, + "loss": 0.5006535530090332, + "rewards/chosen": 0.037287008762359616, + "rewards/margins": -0.0053613424301147475, + "rewards/rejected": 0.042648351192474364, + "step": 130 + }, + { + "epoch": 0.56, + "grad_norm": 0.35979101061820984, + "kl": 0.9413240551948547, + "learning_rate": 4.011111111111111e-06, + "logits/chosen": 40873280.0, + "logits/rejected": 42775779.2, + "logps/chosen": -141.7545654296875, + "logps/rejected": -156.98377685546876, + "loss": 0.5007112503051758, + "rewards/chosen": 0.06499210000038147, + "rewards/margins": -0.0058412253856658936, + "rewards/rejected": 0.07083332538604736, + "step": 140 + }, + { + "epoch": 0.56, + "eval_kl": 0.8415165543556213, + "eval_logits/chosen": 37861306.368, + "eval_logits/rejected": 37748256.768, + "eval_logps/chosen": -153.833046875, + "eval_logps/rejected": -148.1711875, + "eval_loss": 0.49973729252815247, + "eval_rewards/chosen": 0.04927331161499023, + "eval_rewards/margins": 0.0021207618713378895, + "eval_rewards/rejected": 0.04715254974365234, + "eval_runtime": 213.9529, + "eval_samples_per_second": 4.674, + "eval_steps_per_second": 2.337, + "step": 140 + }, + { + "epoch": 0.6, + "grad_norm": 0.3864636719226837, + "kl": 1.0000280141830444, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": 40998265.6, + "logits/rejected": 42312035.2, + "logps/chosen": -129.3378662109375, + "logps/rejected": -116.82637939453124, + "loss": 0.495453929901123, + "rewards/chosen": 0.08338069915771484, + "rewards/margins": 0.03556356728076935, + "rewards/rejected": 0.047817131876945494, + "step": 150 + }, + { + "epoch": 0.64, + "grad_norm": 0.3939324915409088, + "kl": 1.3347299098968506, + "learning_rate": 3.7888888888888893e-06, + "logits/chosen": 34671974.4, + "logits/rejected": 35068432.0, + "logps/chosen": -142.6143798828125, + "logps/rejected": -146.22757568359376, + "loss": 0.4980185508728027, + "rewards/chosen": 0.10691760778427124, + "rewards/margins": 0.016442364454269415, + "rewards/rejected": 0.09047524333000183, + "step": 160 + }, + { + "epoch": 0.64, + "eval_kl": 1.297573208808899, + "eval_logits/chosen": 38283317.248, + "eval_logits/rejected": 38130507.776, + "eval_logps/chosen": -153.150921875, + "eval_logps/rejected": -147.5410625, + "eval_loss": 0.4990925192832947, + "eval_rewards/chosen": 0.11748551940917969, + "eval_rewards/margins": 0.007319358825683589, + "eval_rewards/rejected": 0.1101661605834961, + "eval_runtime": 214.7809, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 160 + }, + { + "epoch": 0.68, + "grad_norm": 0.32822251319885254, + "kl": 1.4739606380462646, + "learning_rate": 3.6777777777777778e-06, + "logits/chosen": 38105820.8, + "logits/rejected": 39919369.6, + "logps/chosen": -137.03099365234374, + "logps/rejected": -148.72579345703124, + "loss": 0.49805259704589844, + "rewards/chosen": 0.14163793325424195, + "rewards/margins": 0.015950965881347673, + "rewards/rejected": 0.12568696737289428, + "step": 170 + }, + { + "epoch": 0.72, + "grad_norm": 0.48161256313323975, + "kl": 1.9404491186141968, + "learning_rate": 3.566666666666667e-06, + "logits/chosen": 44436915.2, + "logits/rejected": 45288496.0, + "logps/chosen": -143.67095947265625, + "logps/rejected": -169.277587890625, + "loss": 0.4995573997497559, + "rewards/chosen": 0.18708930015563965, + "rewards/margins": 0.003508448600769043, + "rewards/rejected": 0.1835808515548706, + "step": 180 + }, + { + "epoch": 0.72, + "eval_kl": 1.780542254447937, + "eval_logits/chosen": 38592557.056, + "eval_logits/rejected": 38413033.472, + "eval_logps/chosen": -152.56709375, + "eval_logps/rejected": -146.9903125, + "eval_loss": 0.4986813962459564, + "eval_rewards/chosen": 0.1758671875, + "eval_rewards/margins": 0.010626815795898442, + "eval_rewards/rejected": 0.16524037170410155, + "eval_runtime": 214.9135, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 2.327, + "step": 180 + }, + { + "epoch": 0.76, + "grad_norm": 0.3721677362918854, + "kl": 1.7214577198028564, + "learning_rate": 3.455555555555556e-06, + "logits/chosen": 41783456.0, + "logits/rejected": 43276217.6, + "logps/chosen": -122.831103515625, + "logps/rejected": -135.89095458984374, + "loss": 0.49962491989135743, + "rewards/chosen": 0.16950526237487792, + "rewards/margins": 0.0030477762222289873, + "rewards/rejected": 0.16645748615264894, + "step": 190 + }, + { + "epoch": 0.8, + "grad_norm": 0.4780789911746979, + "kl": 2.060234785079956, + "learning_rate": 3.3444444444444445e-06, + "logits/chosen": 47242921.6, + "logits/rejected": 48330630.4, + "logps/chosen": -149.18778076171876, + "logps/rejected": -171.2810546875, + "loss": 0.49495983123779297, + "rewards/chosen": 0.21567411422729493, + "rewards/margins": 0.04064606428146364, + "rewards/rejected": 0.1750280499458313, + "step": 200 + }, + { + "epoch": 0.8, + "eval_kl": 1.5634552240371704, + "eval_logits/chosen": 38330896.384, + "eval_logits/rejected": 38168498.176, + "eval_logps/chosen": -152.89178125, + "eval_logps/rejected": -147.299953125, + "eval_loss": 0.498869389295578, + "eval_rewards/chosen": 0.143399169921875, + "eval_rewards/margins": 0.009122253417968768, + "eval_rewards/rejected": 0.13427691650390625, + "eval_runtime": 214.6785, + "eval_samples_per_second": 4.658, + "eval_steps_per_second": 2.329, + "step": 200 + }, + { + "epoch": 0.84, + "grad_norm": 0.42962703108787537, + "kl": 1.3932136297225952, + "learning_rate": 3.2333333333333334e-06, + "logits/chosen": 33657414.4, + "logits/rejected": 33685497.6, + "logps/chosen": -139.3125244140625, + "logps/rejected": -131.3388671875, + "loss": 0.5001070499420166, + "rewards/chosen": 0.09355279803276062, + "rewards/margins": 0.0002027988433837835, + "rewards/rejected": 0.09334999918937684, + "step": 210 + }, + { + "epoch": 0.88, + "grad_norm": 0.4065878987312317, + "kl": 1.480474829673767, + "learning_rate": 3.1222222222222228e-06, + "logits/chosen": 35385318.4, + "logits/rejected": 36077315.2, + "logps/chosen": -100.69112548828124, + "logps/rejected": -125.24149169921876, + "loss": 0.500240707397461, + "rewards/chosen": 0.10761514902114869, + "rewards/margins": -0.0023266911506652777, + "rewards/rejected": 0.10994184017181396, + "step": 220 + }, + { + "epoch": 0.88, + "eval_kl": 1.0575237274169922, + "eval_logits/chosen": 37674934.272, + "eval_logits/rejected": 37547470.848, + "eval_logps/chosen": -153.71990625, + "eval_logps/rejected": -148.08821875, + "eval_loss": 0.4993574917316437, + "eval_rewards/chosen": 0.060586246490478515, + "eval_rewards/margins": 0.005133884429931637, + "eval_rewards/rejected": 0.05545236206054688, + "eval_runtime": 214.7758, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 220 + }, + { + "epoch": 0.92, + "grad_norm": 0.41962286829948425, + "kl": 1.0945483446121216, + "learning_rate": 3.0111111111111113e-06, + "logits/chosen": 47781094.4, + "logits/rejected": 46534304.0, + "logps/chosen": -184.4533447265625, + "logps/rejected": -162.75020751953124, + "loss": 0.5014426708221436, + "rewards/chosen": 0.061765891313552854, + "rewards/margins": -0.012319356203079224, + "rewards/rejected": 0.07408524751663208, + "step": 230 + }, + { + "epoch": 0.96, + "grad_norm": 0.40805885195732117, + "kl": 1.362263560295105, + "learning_rate": 2.9e-06, + "logits/chosen": 47934688.0, + "logits/rejected": 48291654.4, + "logps/chosen": -175.8403076171875, + "logps/rejected": -165.47391357421876, + "loss": 0.5020192623138428, + "rewards/chosen": 0.08795768022537231, + "rewards/margins": -0.018206548690795896, + "rewards/rejected": 0.10616422891616821, + "step": 240 + }, + { + "epoch": 0.96, + "eval_kl": 1.1043673753738403, + "eval_logits/chosen": 37919653.888, + "eval_logits/rejected": 37776138.24, + "eval_logps/chosen": -153.59246875, + "eval_logps/rejected": -147.980203125, + "eval_loss": 0.49911773204803467, + "eval_rewards/chosen": 0.07333123779296875, + "eval_rewards/margins": 0.007078071594238289, + "eval_rewards/rejected": 0.06625316619873046, + "eval_runtime": 213.224, + "eval_samples_per_second": 4.69, + "eval_steps_per_second": 2.345, + "step": 240 + }, + { + "epoch": 1.0, + "grad_norm": 0.5161420702934265, + "kl": 1.3074615001678467, + "learning_rate": 2.788888888888889e-06, + "logits/chosen": 36994057.6, + "logits/rejected": 35017337.6, + "logps/chosen": -175.2834716796875, + "logps/rejected": -144.57603759765624, + "loss": 0.49155316352844236, + "rewards/chosen": 0.13294192552566528, + "rewards/margins": 0.06808240413665771, + "rewards/rejected": 0.06485952138900757, + "step": 250 + }, + { + "epoch": 1.04, + "grad_norm": 0.5176857113838196, + "kl": 1.1041967868804932, + "learning_rate": 2.677777777777778e-06, + "logits/chosen": 29588192.0, + "logits/rejected": 30928668.8, + "logps/chosen": -146.99859619140625, + "logps/rejected": -128.4593994140625, + "loss": 0.48071441650390623, + "rewards/chosen": 0.16298424005508422, + "rewards/margins": 0.15495829358696936, + "rewards/rejected": 0.008025946468114853, + "step": 260 + }, + { + "epoch": 1.04, + "eval_kl": 1.3196667432785034, + "eval_logits/chosen": 37925871.616, + "eval_logits/rejected": 37763534.848, + "eval_logps/chosen": -153.3530625, + "eval_logps/rejected": -147.759578125, + "eval_loss": 0.4988880455493927, + "eval_rewards/chosen": 0.09727115631103515, + "eval_rewards/margins": 0.008956184387207022, + "eval_rewards/rejected": 0.08831497192382813, + "eval_runtime": 213.7741, + "eval_samples_per_second": 4.678, + "eval_steps_per_second": 2.339, + "step": 260 + }, + { + "epoch": 1.08, + "grad_norm": 0.46143728494644165, + "kl": 1.3379521369934082, + "learning_rate": 2.566666666666667e-06, + "logits/chosen": 52862003.2, + "logits/rejected": 52381398.4, + "logps/chosen": -138.23076171875, + "logps/rejected": -151.10428466796876, + "loss": 0.48619937896728516, + "rewards/chosen": 0.16910784244537352, + "rewards/margins": 0.11107043027877807, + "rewards/rejected": 0.058037412166595456, + "step": 270 + }, + { + "epoch": 1.12, + "grad_norm": 0.4464263319969177, + "kl": 1.270957589149475, + "learning_rate": 2.455555555555556e-06, + "logits/chosen": 33784115.2, + "logits/rejected": 33531318.4, + "logps/chosen": -130.073583984375, + "logps/rejected": -140.40438232421874, + "loss": 0.47771358489990234, + "rewards/chosen": 0.14505974054336548, + "rewards/margins": 0.18065866827964783, + "rewards/rejected": -0.03559892773628235, + "step": 280 + }, + { + "epoch": 1.12, + "eval_kl": 1.1183552742004395, + "eval_logits/chosen": 37143470.08, + "eval_logits/rejected": 37001617.408, + "eval_logps/chosen": -153.897140625, + "eval_logps/rejected": -148.2568125, + "eval_loss": 0.4994434118270874, + "eval_rewards/chosen": 0.04286346435546875, + "eval_rewards/margins": 0.00427254486083984, + "eval_rewards/rejected": 0.03859091949462891, + "eval_runtime": 213.5941, + "eval_samples_per_second": 4.682, + "eval_steps_per_second": 2.341, + "step": 280 + }, + { + "epoch": 1.16, + "grad_norm": 0.37523216009140015, + "kl": 1.423752784729004, + "learning_rate": 2.3444444444444448e-06, + "logits/chosen": 42669203.2, + "logits/rejected": 43161667.2, + "logps/chosen": -142.2610595703125, + "logps/rejected": -145.66318359375, + "loss": 0.47769603729248045, + "rewards/chosen": 0.18487329483032228, + "rewards/margins": 0.18094245791435243, + "rewards/rejected": 0.003930836915969849, + "step": 290 + }, + { + "epoch": 1.2, + "grad_norm": 0.3394813537597656, + "kl": 0.895855724811554, + "learning_rate": 2.2333333333333333e-06, + "logits/chosen": 41978041.6, + "logits/rejected": 43551177.6, + "logps/chosen": -139.19925537109376, + "logps/rejected": -155.9151123046875, + "loss": 0.4623682498931885, + "rewards/chosen": 0.15618009567260743, + "rewards/margins": 0.3404909610748291, + "rewards/rejected": -0.1843108654022217, + "step": 300 + }, + { + "epoch": 1.2, + "eval_kl": 1.0962754487991333, + "eval_logits/chosen": 36886769.664, + "eval_logits/rejected": 36750012.416, + "eval_logps/chosen": -154.00446875, + "eval_logps/rejected": -148.363828125, + "eval_loss": 0.4994364380836487, + "eval_rewards/chosen": 0.03213003921508789, + "eval_rewards/margins": 0.004239551544189455, + "eval_rewards/rejected": 0.027890487670898436, + "eval_runtime": 213.2494, + "eval_samples_per_second": 4.689, + "eval_steps_per_second": 2.345, + "step": 300 + }, + { + "epoch": 1.24, + "grad_norm": 0.3372270166873932, + "kl": 1.0476510524749756, + "learning_rate": 2.1222222222222226e-06, + "logits/chosen": 35257689.6, + "logits/rejected": 32474739.2, + "logps/chosen": -104.17430419921875, + "logps/rejected": -115.5439453125, + "loss": 0.4642783641815186, + "rewards/chosen": 0.139203941822052, + "rewards/margins": 0.31401048898696904, + "rewards/rejected": -0.174806547164917, + "step": 310 + }, + { + "epoch": 1.28, + "grad_norm": 0.4338719844818115, + "kl": 1.3322747945785522, + "learning_rate": 2.011111111111111e-06, + "logits/chosen": 46726528.0, + "logits/rejected": 45261251.2, + "logps/chosen": -163.8673583984375, + "logps/rejected": -176.94974365234376, + "loss": 0.45936293601989747, + "rewards/chosen": 0.15107860565185546, + "rewards/margins": 0.3611048460006714, + "rewards/rejected": -0.21002624034881592, + "step": 320 + }, + { + "epoch": 1.28, + "eval_kl": 1.2043012380599976, + "eval_logits/chosen": 36777713.664, + "eval_logits/rejected": 36639129.6, + "eval_logps/chosen": -153.916890625, + "eval_logps/rejected": -148.2699375, + "eval_loss": 0.4995039105415344, + "eval_rewards/chosen": 0.0408895263671875, + "eval_rewards/margins": 0.003611473083496089, + "eval_rewards/rejected": 0.03727805328369141, + "eval_runtime": 213.3704, + "eval_samples_per_second": 4.687, + "eval_steps_per_second": 2.343, + "step": 320 + }, + { + "epoch": 1.32, + "grad_norm": 0.5007278323173523, + "kl": 1.4502718448638916, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": 40259123.2, + "logits/rejected": 39875241.6, + "logps/chosen": -152.3288818359375, + "logps/rejected": -163.76326904296874, + "loss": 0.46443448066711424, + "rewards/chosen": 0.2390885829925537, + "rewards/margins": 0.2886385679244995, + "rewards/rejected": -0.049549984931945804, + "step": 330 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.42771467566490173, + "kl": 1.447667121887207, + "learning_rate": 1.788888888888889e-06, + "logits/chosen": 42535587.2, + "logits/rejected": 42390444.8, + "logps/chosen": -167.4290283203125, + "logps/rejected": -159.91485595703125, + "loss": 0.4603309631347656, + "rewards/chosen": 0.22664895057678222, + "rewards/margins": 0.32455546855926515, + "rewards/rejected": -0.09790651798248291, + "step": 340 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 1.338826060295105, + "eval_logits/chosen": 36785934.336, + "eval_logits/rejected": 36644188.16, + "eval_logps/chosen": -153.77146875, + "eval_logps/rejected": -148.13078125, + "eval_loss": 0.4994281530380249, + "eval_rewards/chosen": 0.05543093872070313, + "eval_rewards/margins": 0.0042367248535156304, + "eval_rewards/rejected": 0.0511942138671875, + "eval_runtime": 213.5161, + "eval_samples_per_second": 4.683, + "eval_steps_per_second": 2.342, + "step": 340 + }, + { + "epoch": 1.4, + "grad_norm": 0.5768368244171143, + "kl": 1.390388011932373, + "learning_rate": 1.6777777777777779e-06, + "logits/chosen": 26650705.6, + "logits/rejected": 25887854.4, + "logps/chosen": -132.03568115234376, + "logps/rejected": -165.2656005859375, + "loss": 0.4619337558746338, + "rewards/chosen": 0.18434877395629884, + "rewards/margins": 0.31992815732955937, + "rewards/rejected": -0.1355793833732605, + "step": 350 + }, + { + "epoch": 1.44, + "grad_norm": 0.4281937777996063, + "kl": 1.585924506187439, + "learning_rate": 1.566666666666667e-06, + "logits/chosen": 37149299.2, + "logits/rejected": 36698345.6, + "logps/chosen": -127.9746337890625, + "logps/rejected": -135.4433837890625, + "loss": 0.47208099365234374, + "rewards/chosen": 0.21960780620574952, + "rewards/margins": 0.22720773071050646, + "rewards/rejected": -0.007599924504756927, + "step": 360 + }, + { + "epoch": 1.44, + "eval_kl": 1.311928391456604, + "eval_logits/chosen": 36534513.664, + "eval_logits/rejected": 36399624.192, + "eval_logps/chosen": -153.9535, + "eval_logps/rejected": -148.298265625, + "eval_loss": 0.49958622455596924, + "eval_rewards/chosen": 0.037226760864257816, + "eval_rewards/margins": 0.0027809829711914064, + "eval_rewards/rejected": 0.03444577789306641, + "eval_runtime": 213.3999, + "eval_samples_per_second": 4.686, + "eval_steps_per_second": 2.343, + "step": 360 + }, + { + "epoch": 1.48, + "grad_norm": 0.4531302750110626, + "kl": 2.1966803073883057, + "learning_rate": 1.4555555555555557e-06, + "logits/chosen": 45897555.2, + "logits/rejected": 45395225.6, + "logps/chosen": -159.5014404296875, + "logps/rejected": -133.12310791015625, + "loss": 0.47774505615234375, + "rewards/chosen": 0.23044068813323976, + "rewards/margins": 0.185763356089592, + "rewards/rejected": 0.044677332043647766, + "step": 370 + }, + { + "epoch": 1.52, + "grad_norm": 0.3507835268974304, + "kl": 1.9232231378555298, + "learning_rate": 1.3444444444444446e-06, + "logits/chosen": 38330899.2, + "logits/rejected": 37796579.2, + "logps/chosen": -159.63173828125, + "logps/rejected": -139.68382568359374, + "loss": 0.4776346206665039, + "rewards/chosen": 0.23100655078887938, + "rewards/margins": 0.18027588725090027, + "rewards/rejected": 0.05073066353797913, + "step": 380 + }, + { + "epoch": 1.52, + "eval_kl": 1.5263408422470093, + "eval_logits/chosen": 36685828.096, + "eval_logits/rejected": 36531130.368, + "eval_logps/chosen": -153.611984375, + "eval_logps/rejected": -147.968125, + "eval_loss": 0.4994567334651947, + "eval_rewards/chosen": 0.07137952423095703, + "eval_rewards/margins": 0.003920288085937501, + "eval_rewards/rejected": 0.06745923614501953, + "eval_runtime": 213.3185, + "eval_samples_per_second": 4.688, + "eval_steps_per_second": 2.344, + "step": 380 + }, + { + "epoch": 1.56, + "grad_norm": 0.3990439772605896, + "kl": 1.5738338232040405, + "learning_rate": 1.2333333333333335e-06, + "logits/chosen": 39818672.0, + "logits/rejected": 41608393.6, + "logps/chosen": -140.68485107421876, + "logps/rejected": -158.0736328125, + "loss": 0.47451934814453123, + "rewards/chosen": 0.1719655990600586, + "rewards/margins": 0.21011758744716644, + "rewards/rejected": -0.03815198838710785, + "step": 390 + }, + { + "epoch": 1.6, + "grad_norm": 0.45497065782546997, + "kl": 1.9209775924682617, + "learning_rate": 1.1222222222222222e-06, + "logits/chosen": 39674937.6, + "logits/rejected": 40975612.8, + "logps/chosen": -128.1379638671875, + "logps/rejected": -117.67076416015625, + "loss": 0.4702040672302246, + "rewards/chosen": 0.20337235927581787, + "rewards/margins": 0.2399928867816925, + "rewards/rejected": -0.036620527505874634, + "step": 400 + }, + { + "epoch": 1.6, + "eval_kl": 1.4976035356521606, + "eval_logits/chosen": 36562317.312, + "eval_logits/rejected": 36410597.376, + "eval_logps/chosen": -153.700203125, + "eval_logps/rejected": -148.05290625, + "eval_loss": 0.49948906898498535, + "eval_rewards/chosen": 0.06255731201171875, + "eval_rewards/margins": 0.0035767364501953156, + "eval_rewards/rejected": 0.05898057556152344, + "eval_runtime": 213.3061, + "eval_samples_per_second": 4.688, + "eval_steps_per_second": 2.344, + "step": 400 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.44851529598236084, + "kl": 1.8833658695220947, + "learning_rate": 1.0111111111111111e-06, + "logits/chosen": 33354137.6, + "logits/rejected": 33615910.4, + "logps/chosen": -141.6330078125, + "logps/rejected": -147.7822021484375, + "loss": 0.4692805290222168, + "rewards/chosen": 0.20505664348602295, + "rewards/margins": 0.2700430333614349, + "rewards/rejected": -0.06498638987541198, + "step": 410 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.3704666197299957, + "kl": 1.809345006942749, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": 36638368.0, + "logits/rejected": 38020054.4, + "logps/chosen": -136.44818115234375, + "logps/rejected": -150.01075439453126, + "loss": 0.47684297561645506, + "rewards/chosen": 0.1999206304550171, + "rewards/margins": 0.20272973477840425, + "rewards/rejected": -0.002809104323387146, + "step": 420 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 1.6867021322250366, + "eval_logits/chosen": 36786245.632, + "eval_logits/rejected": 36618121.216, + "eval_logps/chosen": -153.361515625, + "eval_logps/rejected": -147.73934375, + "eval_loss": 0.49919602274894714, + "eval_rewards/chosen": 0.0964253921508789, + "eval_rewards/margins": 0.006087821960449213, + "eval_rewards/rejected": 0.09033757019042969, + "eval_runtime": 213.6555, + "eval_samples_per_second": 4.68, + "eval_steps_per_second": 2.34, + "step": 420 + }, + { + "epoch": 1.72, + "grad_norm": 0.4963972270488739, + "kl": 2.1293835639953613, + "learning_rate": 7.888888888888889e-07, + "logits/chosen": 42904048.0, + "logits/rejected": 43455846.4, + "logps/chosen": -143.107568359375, + "logps/rejected": -170.5847900390625, + "loss": 0.4767764568328857, + "rewards/chosen": 0.24343018531799315, + "rewards/margins": 0.19057121276855468, + "rewards/rejected": 0.05285897254943848, + "step": 430 + }, + { + "epoch": 1.76, + "grad_norm": 0.4412926137447357, + "kl": 1.6203444004058838, + "learning_rate": 6.777777777777779e-07, + "logits/chosen": 40034080.0, + "logits/rejected": 41308224.0, + "logps/chosen": -122.6408203125, + "logps/rejected": -137.3405029296875, + "loss": 0.47936244010925294, + "rewards/chosen": 0.18853185176849366, + "rewards/margins": 0.16702898889780046, + "rewards/rejected": 0.021502862870693206, + "step": 440 + }, + { + "epoch": 1.76, + "eval_kl": 1.7451139688491821, + "eval_logits/chosen": 36817911.808, + "eval_logits/rejected": 36649046.016, + "eval_logps/chosen": -153.282765625, + "eval_logps/rejected": -147.663953125, + "eval_loss": 0.4991537928581238, + "eval_rewards/chosen": 0.10430096435546875, + "eval_rewards/margins": 0.006423851013183587, + "eval_rewards/rejected": 0.09787711334228516, + "eval_runtime": 213.1314, + "eval_samples_per_second": 4.692, + "eval_steps_per_second": 2.346, + "step": 440 + }, + { + "epoch": 1.8, + "grad_norm": 0.5419949293136597, + "kl": 2.0249862670898438, + "learning_rate": 5.666666666666667e-07, + "logits/chosen": 45477520.0, + "logits/rejected": 45914156.8, + "logps/chosen": -148.67772216796874, + "logps/rejected": -173.6037353515625, + "loss": 0.4622932434082031, + "rewards/chosen": 0.2666788101196289, + "rewards/margins": 0.32391947507858276, + "rewards/rejected": -0.05724066495895386, + "step": 450 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.5022501945495605, + "kl": 1.6355278491973877, + "learning_rate": 4.5555555555555563e-07, + "logits/chosen": 31932201.6, + "logits/rejected": 31943212.8, + "logps/chosen": -139.18756103515625, + "logps/rejected": -133.04072265625, + "loss": 0.4802652359008789, + "rewards/chosen": 0.10604774951934814, + "rewards/margins": 0.18288437128067017, + "rewards/rejected": -0.07683662176132203, + "step": 460 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 1.667972207069397, + "eval_logits/chosen": 36711571.456, + "eval_logits/rejected": 36549623.808, + "eval_logps/chosen": -153.430859375, + "eval_logps/rejected": -147.80959375, + "eval_loss": 0.4991794526576996, + "eval_rewards/chosen": 0.08949153137207032, + "eval_rewards/margins": 0.006177841186523439, + "eval_rewards/rejected": 0.08331369018554688, + "eval_runtime": 213.7973, + "eval_samples_per_second": 4.677, + "eval_steps_per_second": 2.339, + "step": 460 + }, + { + "epoch": 1.88, + "grad_norm": 0.4620107412338257, + "kl": 2.087587833404541, + "learning_rate": 3.444444444444445e-07, + "logits/chosen": 34332934.4, + "logits/rejected": 34831878.4, + "logps/chosen": -99.85589599609375, + "logps/rejected": -125.88466796875, + "loss": 0.4824401378631592, + "rewards/chosen": 0.19113779067993164, + "rewards/margins": 0.1455146014690399, + "rewards/rejected": 0.04562318921089172, + "step": 470 + }, + { + "epoch": 1.92, + "grad_norm": 0.45210617780685425, + "kl": 1.7846978902816772, + "learning_rate": 2.3333333333333336e-07, + "logits/chosen": 46964457.6, + "logits/rejected": 45760390.4, + "logps/chosen": -183.32412109375, + "logps/rejected": -163.24814453125, + "loss": 0.4812910079956055, + "rewards/chosen": 0.17468774318695068, + "rewards/margins": 0.1503958523273468, + "rewards/rejected": 0.02429189085960388, + "step": 480 + }, + { + "epoch": 1.92, + "eval_kl": 1.6339410543441772, + "eval_logits/chosen": 36667240.448, + "eval_logits/rejected": 36505169.92, + "eval_logps/chosen": -153.504015625, + "eval_logps/rejected": -147.870328125, + "eval_loss": 0.4993217885494232, + "eval_rewards/chosen": 0.08217547607421875, + "eval_rewards/margins": 0.0049354019165039065, + "eval_rewards/rejected": 0.07724007415771485, + "eval_runtime": 213.9159, + "eval_samples_per_second": 4.675, + "eval_steps_per_second": 2.337, + "step": 480 + } + ], + "logging_steps": 10, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-480/training_args.bin b/v5/KTO/KTO_2k/lora/checkpoint-480/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4bd98b4c087a91a6868c0d02be1d3fadc2a8cce1 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-480/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a7ce213d9d8d780414e78695f2513e61226fb7b06531a5bcdf434ae993c976 +size 5521 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/README.md b/v5/KTO/KTO_2k/lora/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/adapter_config.json b/v5/KTO/KTO_2k/lora/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..79c87a5e26c750982ea826f71569628c0700f71f --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "gate_proj", + "k_proj", + "q_proj", + "v_proj", + "down_proj", + "up_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/adapter_model.safetensors b/v5/KTO/KTO_2k/lora/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..30582034649d7c4496327fd6263b6d51f7ea87cb --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d7123cb64167407354c7a41453d4666839d9a9539f8c6773ebefb3d72bfb43 +size 180385008 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/chat_template.jinja b/v5/KTO/KTO_2k/lora/checkpoint-500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/optimizer.pt b/v5/KTO/KTO_2k/lora/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ae1bdeba7cdb3f923684c23459a9aed534a832b --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92cf211604b4face955a8400a57e6e381ec30264e7178063fb4b6c5c04e0e898 +size 360902475 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/rng_state.pth b/v5/KTO/KTO_2k/lora/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2275456c138024633720ad98c6c63acbe9777008 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a95215f64b02d62fb58ace326ad670f1d16eb1761f7fa3b3478d43d2b8d6108 +size 14645 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/scaler.pt b/v5/KTO/KTO_2k/lora/checkpoint-500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..86c857e8514e5db52a765434b135695dac4c9c36 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f77569c2e850b04af982cc8c1389f1430851448915c593b69e5da36ce05b71d7 +size 1383 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/scheduler.pt b/v5/KTO/KTO_2k/lora/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..11f58cb4016a7f75e45b4d50b8f5db52fdf4c0e8 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f369d8647b02587ea9d43b517f5ba41d7c98e522781f1b62206828d755e77757 +size 1465 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/tokenizer.json b/v5/KTO/KTO_2k/lora/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/tokenizer_config.json b/v5/KTO/KTO_2k/lora/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/trainer_state.json b/v5/KTO/KTO_2k/lora/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3783bb7629d8f37d8f5aa77834895d2036da2e4b --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/trainer_state.json @@ -0,0 +1,1184 @@ +{ + "best_global_step": 180, + "best_metric": 0.010626815795898442, + "best_model_checkpoint": "output/lora/checkpoint-180", + "epoch": 2.0, + "eval_steps": 20, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.04, + "grad_norm": 0.5028387904167175, + "kl": 0.01572093926370144, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": 30264627.2, + "logits/rejected": 32013382.4, + "logps/chosen": -148.62435302734374, + "logps/rejected": -128.52413330078124, + "loss": 0.5001428127288818, + "rewards/chosen": 0.000409355154260993, + "rewards/margins": -0.0011432173196226358, + "rewards/rejected": 0.0015525724738836288, + "step": 10 + }, + { + "epoch": 0.08, + "grad_norm": 0.4273326098918915, + "kl": 0.02463815174996853, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": 54053068.8, + "logits/rejected": 53587827.2, + "logps/chosen": -139.9073974609375, + "logps/rejected": -151.66370849609376, + "loss": 0.5000814437866211, + "rewards/chosen": 0.0014432575553655624, + "rewards/margins": -0.0006511589512228967, + "rewards/rejected": 0.002094416506588459, + "step": 20 + }, + { + "epoch": 0.08, + "eval_kl": 0.019881391897797585, + "eval_logits/chosen": 38914011.136, + "eval_logits/rejected": 38816776.192, + "eval_logps/chosen": -154.32296875, + "eval_logps/rejected": -148.641921875, + "eval_loss": 0.49997514486312866, + "eval_rewards/chosen": 0.00028007772564888, + "eval_rewards/margins": 0.00019879454374313355, + "eval_rewards/rejected": 8.128318190574646e-05, + "eval_runtime": 214.9745, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 2.326, + "step": 20 + }, + { + "epoch": 0.12, + "grad_norm": 0.4189560115337372, + "kl": 0.011721396818757057, + "learning_rate": 2.9e-06, + "logits/chosen": 34689024.0, + "logits/rejected": 34760614.4, + "logps/chosen": -131.55169677734375, + "logps/rejected": -140.08671875, + "loss": 0.4998650550842285, + "rewards/chosen": -0.0027512358501553535, + "rewards/margins": 0.0010801648721098902, + "rewards/rejected": -0.0038314007222652437, + "step": 30 + }, + { + "epoch": 0.16, + "grad_norm": 0.35710522532463074, + "kl": 0.014891624450683594, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": 43734198.4, + "logits/rejected": 44358838.4, + "logps/chosen": -144.14744873046874, + "logps/rejected": -145.7405517578125, + "loss": 0.49999523162841797, + "rewards/chosen": -0.003765878826379776, + "rewards/margins": 3.8427859544754115e-05, + "rewards/rejected": -0.00380430668592453, + "step": 40 + }, + { + "epoch": 0.16, + "eval_kl": 0.010086631402373314, + "eval_logits/chosen": 38739668.992, + "eval_logits/rejected": 38645895.168, + "eval_logps/chosen": -154.4100625, + "eval_logps/rejected": -148.72634375, + "eval_loss": 0.5000083446502686, + "eval_rewards/chosen": -0.00842873191833496, + "eval_rewards/margins": -6.65245056152338e-05, + "eval_rewards/rejected": -0.008362207412719726, + "eval_runtime": 214.0434, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 2.336, + "step": 40 + }, + { + "epoch": 0.2, + "grad_norm": 0.32243308424949646, + "kl": 0.03667576238512993, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": 43570592.0, + "logits/rejected": 45423254.4, + "logps/chosen": -140.7910400390625, + "logps/rejected": -154.12264404296874, + "loss": 0.4997419357299805, + "rewards/chosen": -0.0029973506927490233, + "rewards/margins": 0.0020649198442697523, + "rewards/rejected": -0.005062270537018776, + "step": 50 + }, + { + "epoch": 0.24, + "grad_norm": 0.2825469970703125, + "kl": 0.11386795341968536, + "learning_rate": 4.9000000000000005e-06, + "logits/chosen": 37111891.2, + "logits/rejected": 34546041.6, + "logps/chosen": -105.48106689453125, + "logps/rejected": -113.80091552734375, + "loss": 0.4988723278045654, + "rewards/chosen": 0.008526696264743805, + "rewards/margins": 0.009030784061178566, + "rewards/rejected": -0.0005040877964347601, + "step": 60 + }, + { + "epoch": 0.24, + "eval_kl": 0.07935438305139542, + "eval_logits/chosen": 38417367.04, + "eval_logits/rejected": 38329237.504, + "eval_logps/chosen": -154.4071875, + "eval_logps/rejected": -148.7261875, + "eval_loss": 0.49997425079345703, + "eval_rewards/chosen": -0.008140082359313966, + "eval_rewards/margins": 0.0002057380676269531, + "eval_rewards/rejected": -0.008345820426940919, + "eval_runtime": 214.5644, + "eval_samples_per_second": 4.661, + "eval_steps_per_second": 2.33, + "step": 60 + }, + { + "epoch": 0.28, + "grad_norm": 0.38684579730033875, + "kl": 0.07920856773853302, + "learning_rate": 4.7888888888888894e-06, + "logits/chosen": 47922112.0, + "logits/rejected": 46894880.0, + "logps/chosen": -165.71815185546876, + "logps/rejected": -175.34261474609374, + "loss": 0.498098087310791, + "rewards/chosen": -0.0340001106262207, + "rewards/margins": 0.01531563997268677, + "rewards/rejected": -0.04931575059890747, + "step": 70 + }, + { + "epoch": 0.32, + "grad_norm": 0.46789467334747314, + "kl": 0.07494600117206573, + "learning_rate": 4.677777777777778e-06, + "logits/chosen": 40747894.4, + "logits/rejected": 40754576.0, + "logps/chosen": -155.0984619140625, + "logps/rejected": -163.70439453125, + "loss": 0.4992827415466309, + "rewards/chosen": -0.0378705084323883, + "rewards/margins": 0.005790993571281433, + "rewards/rejected": -0.043661502003669736, + "step": 80 + }, + { + "epoch": 0.32, + "eval_kl": 0.08320723474025726, + "eval_logits/chosen": 37684162.56, + "eval_logits/rejected": 37622898.688, + "eval_logps/chosen": -154.907359375, + "eval_logps/rejected": -149.193203125, + "eval_loss": 0.5003835558891296, + "eval_rewards/chosen": -0.05815739822387695, + "eval_rewards/margins": -0.0031089820861816414, + "eval_rewards/rejected": -0.05504841613769531, + "eval_runtime": 214.9467, + "eval_samples_per_second": 4.652, + "eval_steps_per_second": 2.326, + "step": 80 + }, + { + "epoch": 0.36, + "grad_norm": 0.3978089690208435, + "kl": 0.17448857426643372, + "learning_rate": 4.566666666666667e-06, + "logits/chosen": 43566720.0, + "logits/rejected": 43788972.8, + "logps/chosen": -169.9933837890625, + "logps/rejected": -159.5298828125, + "loss": 0.4963120460510254, + "rewards/chosen": -0.02978883981704712, + "rewards/margins": 0.029619407653808594, + "rewards/rejected": -0.05940824747085571, + "step": 90 + }, + { + "epoch": 0.4, + "grad_norm": 0.48971185088157654, + "kl": 0.3320249021053314, + "learning_rate": 4.455555555555555e-06, + "logits/chosen": 27562105.6, + "logits/rejected": 26916403.2, + "logps/chosen": -133.9698486328125, + "logps/rejected": -164.26551513671876, + "loss": 0.49673967361450194, + "rewards/chosen": -0.009067486226558685, + "rewards/margins": 0.026504097878932955, + "rewards/rejected": -0.03557158410549164, + "step": 100 + }, + { + "epoch": 0.4, + "eval_kl": 0.2823386490345001, + "eval_logits/chosen": 37584830.464, + "eval_logits/rejected": 37518467.072, + "eval_logps/chosen": -154.656765625, + "eval_logps/rejected": -148.958828125, + "eval_loss": 0.500180184841156, + "eval_rewards/chosen": -0.033099037170410156, + "eval_rewards/margins": -0.001489009857177731, + "eval_rewards/rejected": -0.031610027313232425, + "eval_runtime": 214.534, + "eval_samples_per_second": 4.661, + "eval_steps_per_second": 2.331, + "step": 100 + }, + { + "epoch": 0.44, + "grad_norm": 0.38800352811813354, + "kl": 0.35814735293388367, + "learning_rate": 4.344444444444445e-06, + "logits/chosen": 37732076.8, + "logits/rejected": 37432582.4, + "logps/chosen": -130.15322265625, + "logps/rejected": -135.52431640625, + "loss": 0.4978357791900635, + "rewards/chosen": 0.0017490973696112633, + "rewards/margins": 0.017441909573972226, + "rewards/rejected": -0.015692812204360963, + "step": 110 + }, + { + "epoch": 0.48, + "grad_norm": 0.3928312063217163, + "kl": 0.5314055681228638, + "learning_rate": 4.233333333333334e-06, + "logits/chosen": 46644518.4, + "logits/rejected": 46458028.8, + "logps/chosen": -161.97745361328126, + "logps/rejected": -133.66865234375, + "loss": 0.5009183883666992, + "rewards/chosen": -0.017161448299884797, + "rewards/margins": -0.007284644991159439, + "rewards/rejected": -0.009876803308725358, + "step": 120 + }, + { + "epoch": 0.48, + "eval_kl": 0.3733839988708496, + "eval_logits/chosen": 37409923.072, + "eval_logits/rejected": 37337829.376, + "eval_logps/chosen": -154.6501875, + "eval_logps/rejected": -148.94265625, + "eval_loss": 0.5002961754798889, + "eval_rewards/chosen": -0.032439395904541014, + "eval_rewards/margins": -0.002446096420288084, + "eval_rewards/rejected": -0.02999329948425293, + "eval_runtime": 214.7717, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 120 + }, + { + "epoch": 0.52, + "grad_norm": 0.31859076023101807, + "kl": 0.7786873579025269, + "learning_rate": 4.122222222222222e-06, + "logits/chosen": 39394198.4, + "logits/rejected": 38848038.4, + "logps/chosen": -161.5689453125, + "logps/rejected": -139.7646484375, + "loss": 0.5006535530090332, + "rewards/chosen": 0.037287008762359616, + "rewards/margins": -0.0053613424301147475, + "rewards/rejected": 0.042648351192474364, + "step": 130 + }, + { + "epoch": 0.56, + "grad_norm": 0.35979101061820984, + "kl": 0.9413240551948547, + "learning_rate": 4.011111111111111e-06, + "logits/chosen": 40873280.0, + "logits/rejected": 42775779.2, + "logps/chosen": -141.7545654296875, + "logps/rejected": -156.98377685546876, + "loss": 0.5007112503051758, + "rewards/chosen": 0.06499210000038147, + "rewards/margins": -0.0058412253856658936, + "rewards/rejected": 0.07083332538604736, + "step": 140 + }, + { + "epoch": 0.56, + "eval_kl": 0.8415165543556213, + "eval_logits/chosen": 37861306.368, + "eval_logits/rejected": 37748256.768, + "eval_logps/chosen": -153.833046875, + "eval_logps/rejected": -148.1711875, + "eval_loss": 0.49973729252815247, + "eval_rewards/chosen": 0.04927331161499023, + "eval_rewards/margins": 0.0021207618713378895, + "eval_rewards/rejected": 0.04715254974365234, + "eval_runtime": 213.9529, + "eval_samples_per_second": 4.674, + "eval_steps_per_second": 2.337, + "step": 140 + }, + { + "epoch": 0.6, + "grad_norm": 0.3864636719226837, + "kl": 1.0000280141830444, + "learning_rate": 3.900000000000001e-06, + "logits/chosen": 40998265.6, + "logits/rejected": 42312035.2, + "logps/chosen": -129.3378662109375, + "logps/rejected": -116.82637939453124, + "loss": 0.495453929901123, + "rewards/chosen": 0.08338069915771484, + "rewards/margins": 0.03556356728076935, + "rewards/rejected": 0.047817131876945494, + "step": 150 + }, + { + "epoch": 0.64, + "grad_norm": 0.3939324915409088, + "kl": 1.3347299098968506, + "learning_rate": 3.7888888888888893e-06, + "logits/chosen": 34671974.4, + "logits/rejected": 35068432.0, + "logps/chosen": -142.6143798828125, + "logps/rejected": -146.22757568359376, + "loss": 0.4980185508728027, + "rewards/chosen": 0.10691760778427124, + "rewards/margins": 0.016442364454269415, + "rewards/rejected": 0.09047524333000183, + "step": 160 + }, + { + "epoch": 0.64, + "eval_kl": 1.297573208808899, + "eval_logits/chosen": 38283317.248, + "eval_logits/rejected": 38130507.776, + "eval_logps/chosen": -153.150921875, + "eval_logps/rejected": -147.5410625, + "eval_loss": 0.4990925192832947, + "eval_rewards/chosen": 0.11748551940917969, + "eval_rewards/margins": 0.007319358825683589, + "eval_rewards/rejected": 0.1101661605834961, + "eval_runtime": 214.7809, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 160 + }, + { + "epoch": 0.68, + "grad_norm": 0.32822251319885254, + "kl": 1.4739606380462646, + "learning_rate": 3.6777777777777778e-06, + "logits/chosen": 38105820.8, + "logits/rejected": 39919369.6, + "logps/chosen": -137.03099365234374, + "logps/rejected": -148.72579345703124, + "loss": 0.49805259704589844, + "rewards/chosen": 0.14163793325424195, + "rewards/margins": 0.015950965881347673, + "rewards/rejected": 0.12568696737289428, + "step": 170 + }, + { + "epoch": 0.72, + "grad_norm": 0.48161256313323975, + "kl": 1.9404491186141968, + "learning_rate": 3.566666666666667e-06, + "logits/chosen": 44436915.2, + "logits/rejected": 45288496.0, + "logps/chosen": -143.67095947265625, + "logps/rejected": -169.277587890625, + "loss": 0.4995573997497559, + "rewards/chosen": 0.18708930015563965, + "rewards/margins": 0.003508448600769043, + "rewards/rejected": 0.1835808515548706, + "step": 180 + }, + { + "epoch": 0.72, + "eval_kl": 1.780542254447937, + "eval_logits/chosen": 38592557.056, + "eval_logits/rejected": 38413033.472, + "eval_logps/chosen": -152.56709375, + "eval_logps/rejected": -146.9903125, + "eval_loss": 0.4986813962459564, + "eval_rewards/chosen": 0.1758671875, + "eval_rewards/margins": 0.010626815795898442, + "eval_rewards/rejected": 0.16524037170410155, + "eval_runtime": 214.9135, + "eval_samples_per_second": 4.653, + "eval_steps_per_second": 2.327, + "step": 180 + }, + { + "epoch": 0.76, + "grad_norm": 0.3721677362918854, + "kl": 1.7214577198028564, + "learning_rate": 3.455555555555556e-06, + "logits/chosen": 41783456.0, + "logits/rejected": 43276217.6, + "logps/chosen": -122.831103515625, + "logps/rejected": -135.89095458984374, + "loss": 0.49962491989135743, + "rewards/chosen": 0.16950526237487792, + "rewards/margins": 0.0030477762222289873, + "rewards/rejected": 0.16645748615264894, + "step": 190 + }, + { + "epoch": 0.8, + "grad_norm": 0.4780789911746979, + "kl": 2.060234785079956, + "learning_rate": 3.3444444444444445e-06, + "logits/chosen": 47242921.6, + "logits/rejected": 48330630.4, + "logps/chosen": -149.18778076171876, + "logps/rejected": -171.2810546875, + "loss": 0.49495983123779297, + "rewards/chosen": 0.21567411422729493, + "rewards/margins": 0.04064606428146364, + "rewards/rejected": 0.1750280499458313, + "step": 200 + }, + { + "epoch": 0.8, + "eval_kl": 1.5634552240371704, + "eval_logits/chosen": 38330896.384, + "eval_logits/rejected": 38168498.176, + "eval_logps/chosen": -152.89178125, + "eval_logps/rejected": -147.299953125, + "eval_loss": 0.498869389295578, + "eval_rewards/chosen": 0.143399169921875, + "eval_rewards/margins": 0.009122253417968768, + "eval_rewards/rejected": 0.13427691650390625, + "eval_runtime": 214.6785, + "eval_samples_per_second": 4.658, + "eval_steps_per_second": 2.329, + "step": 200 + }, + { + "epoch": 0.84, + "grad_norm": 0.42962703108787537, + "kl": 1.3932136297225952, + "learning_rate": 3.2333333333333334e-06, + "logits/chosen": 33657414.4, + "logits/rejected": 33685497.6, + "logps/chosen": -139.3125244140625, + "logps/rejected": -131.3388671875, + "loss": 0.5001070499420166, + "rewards/chosen": 0.09355279803276062, + "rewards/margins": 0.0002027988433837835, + "rewards/rejected": 0.09334999918937684, + "step": 210 + }, + { + "epoch": 0.88, + "grad_norm": 0.4065878987312317, + "kl": 1.480474829673767, + "learning_rate": 3.1222222222222228e-06, + "logits/chosen": 35385318.4, + "logits/rejected": 36077315.2, + "logps/chosen": -100.69112548828124, + "logps/rejected": -125.24149169921876, + "loss": 0.500240707397461, + "rewards/chosen": 0.10761514902114869, + "rewards/margins": -0.0023266911506652777, + "rewards/rejected": 0.10994184017181396, + "step": 220 + }, + { + "epoch": 0.88, + "eval_kl": 1.0575237274169922, + "eval_logits/chosen": 37674934.272, + "eval_logits/rejected": 37547470.848, + "eval_logps/chosen": -153.71990625, + "eval_logps/rejected": -148.08821875, + "eval_loss": 0.4993574917316437, + "eval_rewards/chosen": 0.060586246490478515, + "eval_rewards/margins": 0.005133884429931637, + "eval_rewards/rejected": 0.05545236206054688, + "eval_runtime": 214.7758, + "eval_samples_per_second": 4.656, + "eval_steps_per_second": 2.328, + "step": 220 + }, + { + "epoch": 0.92, + "grad_norm": 0.41962286829948425, + "kl": 1.0945483446121216, + "learning_rate": 3.0111111111111113e-06, + "logits/chosen": 47781094.4, + "logits/rejected": 46534304.0, + "logps/chosen": -184.4533447265625, + "logps/rejected": -162.75020751953124, + "loss": 0.5014426708221436, + "rewards/chosen": 0.061765891313552854, + "rewards/margins": -0.012319356203079224, + "rewards/rejected": 0.07408524751663208, + "step": 230 + }, + { + "epoch": 0.96, + "grad_norm": 0.40805885195732117, + "kl": 1.362263560295105, + "learning_rate": 2.9e-06, + "logits/chosen": 47934688.0, + "logits/rejected": 48291654.4, + "logps/chosen": -175.8403076171875, + "logps/rejected": -165.47391357421876, + "loss": 0.5020192623138428, + "rewards/chosen": 0.08795768022537231, + "rewards/margins": -0.018206548690795896, + "rewards/rejected": 0.10616422891616821, + "step": 240 + }, + { + "epoch": 0.96, + "eval_kl": 1.1043673753738403, + "eval_logits/chosen": 37919653.888, + "eval_logits/rejected": 37776138.24, + "eval_logps/chosen": -153.59246875, + "eval_logps/rejected": -147.980203125, + "eval_loss": 0.49911773204803467, + "eval_rewards/chosen": 0.07333123779296875, + "eval_rewards/margins": 0.007078071594238289, + "eval_rewards/rejected": 0.06625316619873046, + "eval_runtime": 213.224, + "eval_samples_per_second": 4.69, + "eval_steps_per_second": 2.345, + "step": 240 + }, + { + "epoch": 1.0, + "grad_norm": 0.5161420702934265, + "kl": 1.3074615001678467, + "learning_rate": 2.788888888888889e-06, + "logits/chosen": 36994057.6, + "logits/rejected": 35017337.6, + "logps/chosen": -175.2834716796875, + "logps/rejected": -144.57603759765624, + "loss": 0.49155316352844236, + "rewards/chosen": 0.13294192552566528, + "rewards/margins": 0.06808240413665771, + "rewards/rejected": 0.06485952138900757, + "step": 250 + }, + { + "epoch": 1.04, + "grad_norm": 0.5176857113838196, + "kl": 1.1041967868804932, + "learning_rate": 2.677777777777778e-06, + "logits/chosen": 29588192.0, + "logits/rejected": 30928668.8, + "logps/chosen": -146.99859619140625, + "logps/rejected": -128.4593994140625, + "loss": 0.48071441650390623, + "rewards/chosen": 0.16298424005508422, + "rewards/margins": 0.15495829358696936, + "rewards/rejected": 0.008025946468114853, + "step": 260 + }, + { + "epoch": 1.04, + "eval_kl": 1.3196667432785034, + "eval_logits/chosen": 37925871.616, + "eval_logits/rejected": 37763534.848, + "eval_logps/chosen": -153.3530625, + "eval_logps/rejected": -147.759578125, + "eval_loss": 0.4988880455493927, + "eval_rewards/chosen": 0.09727115631103515, + "eval_rewards/margins": 0.008956184387207022, + "eval_rewards/rejected": 0.08831497192382813, + "eval_runtime": 213.7741, + "eval_samples_per_second": 4.678, + "eval_steps_per_second": 2.339, + "step": 260 + }, + { + "epoch": 1.08, + "grad_norm": 0.46143728494644165, + "kl": 1.3379521369934082, + "learning_rate": 2.566666666666667e-06, + "logits/chosen": 52862003.2, + "logits/rejected": 52381398.4, + "logps/chosen": -138.23076171875, + "logps/rejected": -151.10428466796876, + "loss": 0.48619937896728516, + "rewards/chosen": 0.16910784244537352, + "rewards/margins": 0.11107043027877807, + "rewards/rejected": 0.058037412166595456, + "step": 270 + }, + { + "epoch": 1.12, + "grad_norm": 0.4464263319969177, + "kl": 1.270957589149475, + "learning_rate": 2.455555555555556e-06, + "logits/chosen": 33784115.2, + "logits/rejected": 33531318.4, + "logps/chosen": -130.073583984375, + "logps/rejected": -140.40438232421874, + "loss": 0.47771358489990234, + "rewards/chosen": 0.14505974054336548, + "rewards/margins": 0.18065866827964783, + "rewards/rejected": -0.03559892773628235, + "step": 280 + }, + { + "epoch": 1.12, + "eval_kl": 1.1183552742004395, + "eval_logits/chosen": 37143470.08, + "eval_logits/rejected": 37001617.408, + "eval_logps/chosen": -153.897140625, + "eval_logps/rejected": -148.2568125, + "eval_loss": 0.4994434118270874, + "eval_rewards/chosen": 0.04286346435546875, + "eval_rewards/margins": 0.00427254486083984, + "eval_rewards/rejected": 0.03859091949462891, + "eval_runtime": 213.5941, + "eval_samples_per_second": 4.682, + "eval_steps_per_second": 2.341, + "step": 280 + }, + { + "epoch": 1.16, + "grad_norm": 0.37523216009140015, + "kl": 1.423752784729004, + "learning_rate": 2.3444444444444448e-06, + "logits/chosen": 42669203.2, + "logits/rejected": 43161667.2, + "logps/chosen": -142.2610595703125, + "logps/rejected": -145.66318359375, + "loss": 0.47769603729248045, + "rewards/chosen": 0.18487329483032228, + "rewards/margins": 0.18094245791435243, + "rewards/rejected": 0.003930836915969849, + "step": 290 + }, + { + "epoch": 1.2, + "grad_norm": 0.3394813537597656, + "kl": 0.895855724811554, + "learning_rate": 2.2333333333333333e-06, + "logits/chosen": 41978041.6, + "logits/rejected": 43551177.6, + "logps/chosen": -139.19925537109376, + "logps/rejected": -155.9151123046875, + "loss": 0.4623682498931885, + "rewards/chosen": 0.15618009567260743, + "rewards/margins": 0.3404909610748291, + "rewards/rejected": -0.1843108654022217, + "step": 300 + }, + { + "epoch": 1.2, + "eval_kl": 1.0962754487991333, + "eval_logits/chosen": 36886769.664, + "eval_logits/rejected": 36750012.416, + "eval_logps/chosen": -154.00446875, + "eval_logps/rejected": -148.363828125, + "eval_loss": 0.4994364380836487, + "eval_rewards/chosen": 0.03213003921508789, + "eval_rewards/margins": 0.004239551544189455, + "eval_rewards/rejected": 0.027890487670898436, + "eval_runtime": 213.2494, + "eval_samples_per_second": 4.689, + "eval_steps_per_second": 2.345, + "step": 300 + }, + { + "epoch": 1.24, + "grad_norm": 0.3372270166873932, + "kl": 1.0476510524749756, + "learning_rate": 2.1222222222222226e-06, + "logits/chosen": 35257689.6, + "logits/rejected": 32474739.2, + "logps/chosen": -104.17430419921875, + "logps/rejected": -115.5439453125, + "loss": 0.4642783641815186, + "rewards/chosen": 0.139203941822052, + "rewards/margins": 0.31401048898696904, + "rewards/rejected": -0.174806547164917, + "step": 310 + }, + { + "epoch": 1.28, + "grad_norm": 0.4338719844818115, + "kl": 1.3322747945785522, + "learning_rate": 2.011111111111111e-06, + "logits/chosen": 46726528.0, + "logits/rejected": 45261251.2, + "logps/chosen": -163.8673583984375, + "logps/rejected": -176.94974365234376, + "loss": 0.45936293601989747, + "rewards/chosen": 0.15107860565185546, + "rewards/margins": 0.3611048460006714, + "rewards/rejected": -0.21002624034881592, + "step": 320 + }, + { + "epoch": 1.28, + "eval_kl": 1.2043012380599976, + "eval_logits/chosen": 36777713.664, + "eval_logits/rejected": 36639129.6, + "eval_logps/chosen": -153.916890625, + "eval_logps/rejected": -148.2699375, + "eval_loss": 0.4995039105415344, + "eval_rewards/chosen": 0.0408895263671875, + "eval_rewards/margins": 0.003611473083496089, + "eval_rewards/rejected": 0.03727805328369141, + "eval_runtime": 213.3704, + "eval_samples_per_second": 4.687, + "eval_steps_per_second": 2.343, + "step": 320 + }, + { + "epoch": 1.32, + "grad_norm": 0.5007278323173523, + "kl": 1.4502718448638916, + "learning_rate": 1.9000000000000002e-06, + "logits/chosen": 40259123.2, + "logits/rejected": 39875241.6, + "logps/chosen": -152.3288818359375, + "logps/rejected": -163.76326904296874, + "loss": 0.46443448066711424, + "rewards/chosen": 0.2390885829925537, + "rewards/margins": 0.2886385679244995, + "rewards/rejected": -0.049549984931945804, + "step": 330 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.42771467566490173, + "kl": 1.447667121887207, + "learning_rate": 1.788888888888889e-06, + "logits/chosen": 42535587.2, + "logits/rejected": 42390444.8, + "logps/chosen": -167.4290283203125, + "logps/rejected": -159.91485595703125, + "loss": 0.4603309631347656, + "rewards/chosen": 0.22664895057678222, + "rewards/margins": 0.32455546855926515, + "rewards/rejected": -0.09790651798248291, + "step": 340 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 1.338826060295105, + "eval_logits/chosen": 36785934.336, + "eval_logits/rejected": 36644188.16, + "eval_logps/chosen": -153.77146875, + "eval_logps/rejected": -148.13078125, + "eval_loss": 0.4994281530380249, + "eval_rewards/chosen": 0.05543093872070313, + "eval_rewards/margins": 0.0042367248535156304, + "eval_rewards/rejected": 0.0511942138671875, + "eval_runtime": 213.5161, + "eval_samples_per_second": 4.683, + "eval_steps_per_second": 2.342, + "step": 340 + }, + { + "epoch": 1.4, + "grad_norm": 0.5768368244171143, + "kl": 1.390388011932373, + "learning_rate": 1.6777777777777779e-06, + "logits/chosen": 26650705.6, + "logits/rejected": 25887854.4, + "logps/chosen": -132.03568115234376, + "logps/rejected": -165.2656005859375, + "loss": 0.4619337558746338, + "rewards/chosen": 0.18434877395629884, + "rewards/margins": 0.31992815732955937, + "rewards/rejected": -0.1355793833732605, + "step": 350 + }, + { + "epoch": 1.44, + "grad_norm": 0.4281937777996063, + "kl": 1.585924506187439, + "learning_rate": 1.566666666666667e-06, + "logits/chosen": 37149299.2, + "logits/rejected": 36698345.6, + "logps/chosen": -127.9746337890625, + "logps/rejected": -135.4433837890625, + "loss": 0.47208099365234374, + "rewards/chosen": 0.21960780620574952, + "rewards/margins": 0.22720773071050646, + "rewards/rejected": -0.007599924504756927, + "step": 360 + }, + { + "epoch": 1.44, + "eval_kl": 1.311928391456604, + "eval_logits/chosen": 36534513.664, + "eval_logits/rejected": 36399624.192, + "eval_logps/chosen": -153.9535, + "eval_logps/rejected": -148.298265625, + "eval_loss": 0.49958622455596924, + "eval_rewards/chosen": 0.037226760864257816, + "eval_rewards/margins": 0.0027809829711914064, + "eval_rewards/rejected": 0.03444577789306641, + "eval_runtime": 213.3999, + "eval_samples_per_second": 4.686, + "eval_steps_per_second": 2.343, + "step": 360 + }, + { + "epoch": 1.48, + "grad_norm": 0.4531302750110626, + "kl": 2.1966803073883057, + "learning_rate": 1.4555555555555557e-06, + "logits/chosen": 45897555.2, + "logits/rejected": 45395225.6, + "logps/chosen": -159.5014404296875, + "logps/rejected": -133.12310791015625, + "loss": 0.47774505615234375, + "rewards/chosen": 0.23044068813323976, + "rewards/margins": 0.185763356089592, + "rewards/rejected": 0.044677332043647766, + "step": 370 + }, + { + "epoch": 1.52, + "grad_norm": 0.3507835268974304, + "kl": 1.9232231378555298, + "learning_rate": 1.3444444444444446e-06, + "logits/chosen": 38330899.2, + "logits/rejected": 37796579.2, + "logps/chosen": -159.63173828125, + "logps/rejected": -139.68382568359374, + "loss": 0.4776346206665039, + "rewards/chosen": 0.23100655078887938, + "rewards/margins": 0.18027588725090027, + "rewards/rejected": 0.05073066353797913, + "step": 380 + }, + { + "epoch": 1.52, + "eval_kl": 1.5263408422470093, + "eval_logits/chosen": 36685828.096, + "eval_logits/rejected": 36531130.368, + "eval_logps/chosen": -153.611984375, + "eval_logps/rejected": -147.968125, + "eval_loss": 0.4994567334651947, + "eval_rewards/chosen": 0.07137952423095703, + "eval_rewards/margins": 0.003920288085937501, + "eval_rewards/rejected": 0.06745923614501953, + "eval_runtime": 213.3185, + "eval_samples_per_second": 4.688, + "eval_steps_per_second": 2.344, + "step": 380 + }, + { + "epoch": 1.56, + "grad_norm": 0.3990439772605896, + "kl": 1.5738338232040405, + "learning_rate": 1.2333333333333335e-06, + "logits/chosen": 39818672.0, + "logits/rejected": 41608393.6, + "logps/chosen": -140.68485107421876, + "logps/rejected": -158.0736328125, + "loss": 0.47451934814453123, + "rewards/chosen": 0.1719655990600586, + "rewards/margins": 0.21011758744716644, + "rewards/rejected": -0.03815198838710785, + "step": 390 + }, + { + "epoch": 1.6, + "grad_norm": 0.45497065782546997, + "kl": 1.9209775924682617, + "learning_rate": 1.1222222222222222e-06, + "logits/chosen": 39674937.6, + "logits/rejected": 40975612.8, + "logps/chosen": -128.1379638671875, + "logps/rejected": -117.67076416015625, + "loss": 0.4702040672302246, + "rewards/chosen": 0.20337235927581787, + "rewards/margins": 0.2399928867816925, + "rewards/rejected": -0.036620527505874634, + "step": 400 + }, + { + "epoch": 1.6, + "eval_kl": 1.4976035356521606, + "eval_logits/chosen": 36562317.312, + "eval_logits/rejected": 36410597.376, + "eval_logps/chosen": -153.700203125, + "eval_logps/rejected": -148.05290625, + "eval_loss": 0.49948906898498535, + "eval_rewards/chosen": 0.06255731201171875, + "eval_rewards/margins": 0.0035767364501953156, + "eval_rewards/rejected": 0.05898057556152344, + "eval_runtime": 213.3061, + "eval_samples_per_second": 4.688, + "eval_steps_per_second": 2.344, + "step": 400 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.44851529598236084, + "kl": 1.8833658695220947, + "learning_rate": 1.0111111111111111e-06, + "logits/chosen": 33354137.6, + "logits/rejected": 33615910.4, + "logps/chosen": -141.6330078125, + "logps/rejected": -147.7822021484375, + "loss": 0.4692805290222168, + "rewards/chosen": 0.20505664348602295, + "rewards/margins": 0.2700430333614349, + "rewards/rejected": -0.06498638987541198, + "step": 410 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.3704666197299957, + "kl": 1.809345006942749, + "learning_rate": 9.000000000000001e-07, + "logits/chosen": 36638368.0, + "logits/rejected": 38020054.4, + "logps/chosen": -136.44818115234375, + "logps/rejected": -150.01075439453126, + "loss": 0.47684297561645506, + "rewards/chosen": 0.1999206304550171, + "rewards/margins": 0.20272973477840425, + "rewards/rejected": -0.002809104323387146, + "step": 420 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 1.6867021322250366, + "eval_logits/chosen": 36786245.632, + "eval_logits/rejected": 36618121.216, + "eval_logps/chosen": -153.361515625, + "eval_logps/rejected": -147.73934375, + "eval_loss": 0.49919602274894714, + "eval_rewards/chosen": 0.0964253921508789, + "eval_rewards/margins": 0.006087821960449213, + "eval_rewards/rejected": 0.09033757019042969, + "eval_runtime": 213.6555, + "eval_samples_per_second": 4.68, + "eval_steps_per_second": 2.34, + "step": 420 + }, + { + "epoch": 1.72, + "grad_norm": 0.4963972270488739, + "kl": 2.1293835639953613, + "learning_rate": 7.888888888888889e-07, + "logits/chosen": 42904048.0, + "logits/rejected": 43455846.4, + "logps/chosen": -143.107568359375, + "logps/rejected": -170.5847900390625, + "loss": 0.4767764568328857, + "rewards/chosen": 0.24343018531799315, + "rewards/margins": 0.19057121276855468, + "rewards/rejected": 0.05285897254943848, + "step": 430 + }, + { + "epoch": 1.76, + "grad_norm": 0.4412926137447357, + "kl": 1.6203444004058838, + "learning_rate": 6.777777777777779e-07, + "logits/chosen": 40034080.0, + "logits/rejected": 41308224.0, + "logps/chosen": -122.6408203125, + "logps/rejected": -137.3405029296875, + "loss": 0.47936244010925294, + "rewards/chosen": 0.18853185176849366, + "rewards/margins": 0.16702898889780046, + "rewards/rejected": 0.021502862870693206, + "step": 440 + }, + { + "epoch": 1.76, + "eval_kl": 1.7451139688491821, + "eval_logits/chosen": 36817911.808, + "eval_logits/rejected": 36649046.016, + "eval_logps/chosen": -153.282765625, + "eval_logps/rejected": -147.663953125, + "eval_loss": 0.4991537928581238, + "eval_rewards/chosen": 0.10430096435546875, + "eval_rewards/margins": 0.006423851013183587, + "eval_rewards/rejected": 0.09787711334228516, + "eval_runtime": 213.1314, + "eval_samples_per_second": 4.692, + "eval_steps_per_second": 2.346, + "step": 440 + }, + { + "epoch": 1.8, + "grad_norm": 0.5419949293136597, + "kl": 2.0249862670898438, + "learning_rate": 5.666666666666667e-07, + "logits/chosen": 45477520.0, + "logits/rejected": 45914156.8, + "logps/chosen": -148.67772216796874, + "logps/rejected": -173.6037353515625, + "loss": 0.4622932434082031, + "rewards/chosen": 0.2666788101196289, + "rewards/margins": 0.32391947507858276, + "rewards/rejected": -0.05724066495895386, + "step": 450 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.5022501945495605, + "kl": 1.6355278491973877, + "learning_rate": 4.5555555555555563e-07, + "logits/chosen": 31932201.6, + "logits/rejected": 31943212.8, + "logps/chosen": -139.18756103515625, + "logps/rejected": -133.04072265625, + "loss": 0.4802652359008789, + "rewards/chosen": 0.10604774951934814, + "rewards/margins": 0.18288437128067017, + "rewards/rejected": -0.07683662176132203, + "step": 460 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 1.667972207069397, + "eval_logits/chosen": 36711571.456, + "eval_logits/rejected": 36549623.808, + "eval_logps/chosen": -153.430859375, + "eval_logps/rejected": -147.80959375, + "eval_loss": 0.4991794526576996, + "eval_rewards/chosen": 0.08949153137207032, + "eval_rewards/margins": 0.006177841186523439, + "eval_rewards/rejected": 0.08331369018554688, + "eval_runtime": 213.7973, + "eval_samples_per_second": 4.677, + "eval_steps_per_second": 2.339, + "step": 460 + }, + { + "epoch": 1.88, + "grad_norm": 0.4620107412338257, + "kl": 2.087587833404541, + "learning_rate": 3.444444444444445e-07, + "logits/chosen": 34332934.4, + "logits/rejected": 34831878.4, + "logps/chosen": -99.85589599609375, + "logps/rejected": -125.88466796875, + "loss": 0.4824401378631592, + "rewards/chosen": 0.19113779067993164, + "rewards/margins": 0.1455146014690399, + "rewards/rejected": 0.04562318921089172, + "step": 470 + }, + { + "epoch": 1.92, + "grad_norm": 0.45210617780685425, + "kl": 1.7846978902816772, + "learning_rate": 2.3333333333333336e-07, + "logits/chosen": 46964457.6, + "logits/rejected": 45760390.4, + "logps/chosen": -183.32412109375, + "logps/rejected": -163.24814453125, + "loss": 0.4812910079956055, + "rewards/chosen": 0.17468774318695068, + "rewards/margins": 0.1503958523273468, + "rewards/rejected": 0.02429189085960388, + "step": 480 + }, + { + "epoch": 1.92, + "eval_kl": 1.6339410543441772, + "eval_logits/chosen": 36667240.448, + "eval_logits/rejected": 36505169.92, + "eval_logps/chosen": -153.504015625, + "eval_logps/rejected": -147.870328125, + "eval_loss": 0.4993217885494232, + "eval_rewards/chosen": 0.08217547607421875, + "eval_rewards/margins": 0.0049354019165039065, + "eval_rewards/rejected": 0.07724007415771485, + "eval_runtime": 213.9159, + "eval_samples_per_second": 4.675, + "eval_steps_per_second": 2.337, + "step": 480 + }, + { + "epoch": 1.96, + "grad_norm": 0.4333805441856384, + "kl": 2.0376548767089844, + "learning_rate": 1.2222222222222225e-07, + "logits/chosen": 46712921.6, + "logits/rejected": 47160374.4, + "logps/chosen": -174.75072021484374, + "logps/rejected": -165.94029541015624, + "loss": 0.48319125175476074, + "rewards/chosen": 0.19691554307937623, + "rewards/margins": 0.13738937377929689, + "rewards/rejected": 0.05952616930007935, + "step": 490 + }, + { + "epoch": 2.0, + "grad_norm": 0.5714597105979919, + "kl": 1.8620449304580688, + "learning_rate": 1.1111111111111112e-08, + "logits/chosen": 35965187.2, + "logits/rejected": 33827209.6, + "logps/chosen": -174.20643310546876, + "logps/rejected": -145.505029296875, + "loss": 0.46739583015441893, + "rewards/chosen": 0.2406463384628296, + "rewards/margins": 0.2686867892742157, + "rewards/rejected": -0.02804045081138611, + "step": 500 + }, + { + "epoch": 2.0, + "eval_kl": 1.6334943771362305, + "eval_logits/chosen": 36681326.592, + "eval_logits/rejected": 36518973.44, + "eval_logps/chosen": -153.4953125, + "eval_logps/rejected": -147.870140625, + "eval_loss": 0.4992177486419678, + "eval_rewards/chosen": 0.08304692840576172, + "eval_rewards/margins": 0.005788291931152351, + "eval_rewards/rejected": 0.07725863647460937, + "eval_runtime": 213.4167, + "eval_samples_per_second": 4.686, + "eval_steps_per_second": 2.343, + "step": 500 + } + ], + "logging_steps": 10, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_2k/lora/checkpoint-500/training_args.bin b/v5/KTO/KTO_2k/lora/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4bd98b4c087a91a6868c0d02be1d3fadc2a8cce1 --- /dev/null +++ b/v5/KTO/KTO_2k/lora/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a7ce213d9d8d780414e78695f2513e61226fb7b06531a5bcdf434ae993c976 +size 5521 diff --git a/v5/KTO/KTO_5k/KTO_5k/README.md b/v5/KTO/KTO_5k/KTO_5k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_5k/KTO_5k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_5k/KTO_5k/adapter_config.json b/v5/KTO/KTO_5k/KTO_5k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a2c7fb826a92b8c340dc085ae4ee70addde7e565 --- /dev/null +++ b/v5/KTO/KTO_5k/KTO_5k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_5k/KTO_5k/adapter_model.safetensors b/v5/KTO/KTO_5k/KTO_5k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..77b9f5e435613cd045beb0f66319704b5505b36c --- /dev/null +++ b/v5/KTO/KTO_5k/KTO_5k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720b371b30c818649afd37be34c4b78ee171efcd914d588a3b556c79b1f46c4b +size 180385008 diff --git a/v5/KTO/KTO_5k/MKTO_5k/chat_template.jinja b/v5/KTO/KTO_5k/MKTO_5k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_5k/MKTO_5k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_5k/MKTO_5k/config.json b/v5/KTO/KTO_5k/MKTO_5k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..269c2ffa2c365f594cb5e44218192c94b419a0cb --- /dev/null +++ b/v5/KTO/KTO_5k/MKTO_5k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.0.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/KTO/KTO_5k/MKTO_5k/generation_config.json b/v5/KTO/KTO_5k/MKTO_5k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7c9c2224cd391437f7236b3f36305dd39a63ab0a --- /dev/null +++ b/v5/KTO/KTO_5k/MKTO_5k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.0.0" +} diff --git a/v5/KTO/KTO_5k/MKTO_5k/model.safetensors b/v5/KTO/KTO_5k/MKTO_5k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..947171b5a234274c2144af1218ff1a533c952fbb --- /dev/null +++ b/v5/KTO/KTO_5k/MKTO_5k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:604e89985e45d90351204fa63ff605f362eb471193fe22950d2e18b4a9ea1857 +size 2471645464 diff --git a/v5/KTO/KTO_5k/MKTO_5k/tokenizer.json b/v5/KTO/KTO_5k/MKTO_5k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_5k/MKTO_5k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_5k/MKTO_5k/tokenizer_config.json b/v5/KTO/KTO_5k/MKTO_5k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_5k/MKTO_5k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_5k/lora/README.md b/v5/KTO/KTO_5k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ee8f70bc547822f927fc49aff017dd02cd4f9872 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/README.md @@ -0,0 +1,67 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- kto +- trl +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/kt79h7ud) + + +This model was trained with KTO, a method introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306). + +### Framework versions + +- TRL: 0.27.2 +- Transformers: 5.0.0 +- Pytorch: 2.8.0+cu128 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite KTO as: + +```bibtex +@article{ethayarajh2024kto, + title = {{KTO: Model Alignment as Prospect Theoretic Optimization}}, + author = {Kawin Ethayarajh and Winnie Xu and Niklas Muennighoff and Dan Jurafsky and Douwe Kiela}, + year = 2024, + eprint = {arXiv:2402.01306}, +} +``` + +Cite TRL as: + +```bibtex +@misc{vonwerra2022trl, + title = {{TRL: Transformer Reinforcement Learning}}, + author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec}, + year = 2020, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/huggingface/trl}} +} +``` \ No newline at end of file diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/README.md b/v5/KTO/KTO_5k/lora/checkpoint-1150/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/adapter_config.json b/v5/KTO/KTO_5k/lora/checkpoint-1150/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a2c7fb826a92b8c340dc085ae4ee70addde7e565 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/adapter_model.safetensors b/v5/KTO/KTO_5k/lora/checkpoint-1150/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a69ae1f51ab74482a806e7ba68256a88c51b8654 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f95522e0caefe86ec47dcdd65ca27799e9c5266d656cdb2f11c6d7f113d57529 +size 180385008 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/chat_template.jinja b/v5/KTO/KTO_5k/lora/checkpoint-1150/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/optimizer.pt b/v5/KTO/KTO_5k/lora/checkpoint-1150/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0f449949ea58575479b07da73b6e7bb7c75381c8 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6bfb7d37817b6cc37958062f243a1e07de691853d7a7b3a8838b76c220f7909 +size 360902475 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/rng_state.pth b/v5/KTO/KTO_5k/lora/checkpoint-1150/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8ed093d588068f9dbfd74c539e39e9699211bff7 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c52a1d71e1557ac186ac99828ae2743580a49ee952817aecfe62308e6f8bd3fd +size 14645 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/scaler.pt b/v5/KTO/KTO_5k/lora/checkpoint-1150/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..918a788376fc85ce860b67f3fb0e54efa428f205 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9987b1cc31b296257172b3a916ce5dfd78ca159370cb8d913165c130a86e4e7d +size 1383 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/scheduler.pt b/v5/KTO/KTO_5k/lora/checkpoint-1150/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab785cc57b1293f2cf641c3ae6acce6a26cc7d39 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4632980b5d2143613d7bfecb4bf9b018381aa2cc79d06a6882cb313b20896a70 +size 1465 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/tokenizer.json b/v5/KTO/KTO_5k/lora/checkpoint-1150/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/tokenizer_config.json b/v5/KTO/KTO_5k/lora/checkpoint-1150/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/trainer_state.json b/v5/KTO/KTO_5k/lora/checkpoint-1150/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7d2b16c9ed438d8a6b7a924442401e02950978da --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/trainer_state.json @@ -0,0 +1,2127 @@ +{ + "best_global_step": 1150, + "best_metric": 0.057583449840545656, + "best_model_checkpoint": "output/lora/checkpoint-1150", + "epoch": 1.8399999999999999, + "eval_steps": 50, + "global_step": 1150, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 2.1381592750549316, + "kl": 0.01649792119860649, + "learning_rate": 3.6e-07, + "logits/chosen": 28205651.2, + "logits/rejected": 29669123.2, + "logps/chosen": -150.3176025390625, + "logps/rejected": -130.385302734375, + "loss": 0.4999302864074707, + "rewards/chosen": 0.0005133629310876131, + "rewards/margins": 0.0005571890709688887, + "rewards/rejected": -4.382613988127559e-05, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 1.9010003805160522, + "kl": 0.020609140396118164, + "learning_rate": 7.6e-07, + "logits/chosen": 52049945.6, + "logits/rejected": 51142873.6, + "logps/chosen": -140.97896728515624, + "logps/rejected": -153.13775634765625, + "loss": 0.49991936683654786, + "rewards/chosen": 0.0004100656602531672, + "rewards/margins": 0.0006456565577536821, + "rewards/rejected": -0.00023559089750051497, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 1.82417893409729, + "kl": 0.01093914546072483, + "learning_rate": 1.1600000000000001e-06, + "logits/chosen": 32594544.0, + "logits/rejected": 32500614.4, + "logps/chosen": -133.37647705078126, + "logps/rejected": -142.03988037109374, + "loss": 0.5000736713409424, + "rewards/chosen": -0.002948903851211071, + "rewards/margins": -0.0005901286378502844, + "rewards/rejected": -0.0023587752133607865, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 1.6120444536209106, + "kl": 0.010594606399536133, + "learning_rate": 1.56e-06, + "logits/chosen": 41530739.2, + "logits/rejected": 42298668.8, + "logps/chosen": -145.56357421875, + "logps/rejected": -147.24957275390625, + "loss": 0.5001413822174072, + "rewards/chosen": -0.0032692715525627137, + "rewards/margins": -0.0011312337592244148, + "rewards/rejected": -0.002138037793338299, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 1.3366488218307495, + "kl": 0.01903839036822319, + "learning_rate": 1.9600000000000003e-06, + "logits/chosen": 40986995.2, + "logits/rejected": 42846995.2, + "logps/chosen": -142.60504150390625, + "logps/rejected": -155.85986328125, + "loss": 0.500172233581543, + "rewards/chosen": -0.002861199527978897, + "rewards/margins": -0.0013772012665867806, + "rewards/rejected": -0.0014839982613921165, + "step": 50 + }, + { + "epoch": 0.08, + "eval_kl": 0.03923250734806061, + "eval_logits/chosen": 37010317.312, + "eval_logits/rejected": 36932890.624, + "eval_logps/chosen": -155.7828125, + "eval_logps/rejected": -149.957953125, + "eval_loss": 0.4999832808971405, + "eval_rewards/chosen": 0.0007290065288543702, + "eval_rewards/margins": 0.00013377457857131968, + "eval_rewards/rejected": 0.0005952319502830505, + "eval_runtime": 211.9346, + "eval_samples_per_second": 4.718, + "eval_steps_per_second": 2.359, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 1.2370270490646362, + "kl": 0.042702484875917435, + "learning_rate": 2.3600000000000003e-06, + "logits/chosen": 34543625.6, + "logits/rejected": 31963372.8, + "logps/chosen": -106.8656494140625, + "logps/rejected": -115.66375732421875, + "loss": 0.499523401260376, + "rewards/chosen": 0.00055726058781147, + "rewards/margins": 0.003814282640814781, + "rewards/rejected": -0.003257022053003311, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 1.6257128715515137, + "kl": 0.025073956698179245, + "learning_rate": 2.7600000000000003e-06, + "logits/chosen": 45796806.4, + "logits/rejected": 44777859.2, + "logps/chosen": -167.59599609375, + "logps/rejected": -176.96552734375, + "loss": 0.4996492862701416, + "rewards/chosen": -0.017988091707229613, + "rewards/margins": 0.0028077125549316427, + "rewards/rejected": -0.020795804262161256, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 1.9464651346206665, + "kl": 0.032842040061950684, + "learning_rate": 3.1600000000000002e-06, + "logits/chosen": 39214451.2, + "logits/rejected": 39159056.0, + "logps/chosen": -156.45654296875, + "logps/rejected": -164.982177734375, + "loss": 0.4997319221496582, + "rewards/chosen": -0.012030959129333496, + "rewards/margins": 0.0021469339728355415, + "rewards/rejected": -0.014177893102169038, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 1.664642572402954, + "kl": 0.13523416221141815, + "learning_rate": 3.5600000000000002e-06, + "logits/chosen": 41776288.0, + "logits/rejected": 41958195.2, + "logps/chosen": -171.09915771484376, + "logps/rejected": -160.55670166015625, + "loss": 0.4979794979095459, + "rewards/chosen": 0.0015864329412579536, + "rewards/margins": 0.016174097545444965, + "rewards/rejected": -0.014587664604187011, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 2.069972515106201, + "kl": 0.2824670374393463, + "learning_rate": 3.96e-06, + "logits/chosen": 26015552.0, + "logits/rejected": 25218312.0, + "logps/chosen": -135.43885498046876, + "logps/rejected": -166.34676513671874, + "loss": 0.4990866184234619, + "rewards/chosen": 0.013756407797336579, + "rewards/margins": 0.007336309552192688, + "rewards/rejected": 0.00642009824514389, + "step": 100 + }, + { + "epoch": 0.16, + "eval_kl": 0.26922619342803955, + "eval_logits/chosen": 36658610.176, + "eval_logits/rejected": 36587118.592, + "eval_logps/chosen": -155.710015625, + "eval_logps/rejected": -149.88709375, + "eval_loss": 0.49995896220207214, + "eval_rewards/chosen": 0.008008602142333985, + "eval_rewards/margins": 0.0003274984359741221, + "eval_rewards/rejected": 0.007681103706359863, + "eval_runtime": 211.5606, + "eval_samples_per_second": 4.727, + "eval_steps_per_second": 2.363, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 1.6185622215270996, + "kl": 0.2616101801395416, + "learning_rate": 4.360000000000001e-06, + "logits/chosen": 36089392.0, + "logits/rejected": 35749190.4, + "logps/chosen": -131.4165283203125, + "logps/rejected": -136.50457763671875, + "loss": 0.4985805511474609, + "rewards/chosen": 0.014740067720413207, + "rewards/margins": 0.011374564468860626, + "rewards/rejected": 0.003365503251552582, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 1.6586893796920776, + "kl": 0.3709116578102112, + "learning_rate": 4.76e-06, + "logits/chosen": 44621308.8, + "logits/rejected": 44430220.8, + "logps/chosen": -163.35196533203126, + "logps/rejected": -134.7572998046875, + "loss": 0.500658369064331, + "rewards/chosen": 0.003282211720943451, + "rewards/margins": -0.0052413523197174065, + "rewards/rejected": 0.008523564040660857, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 1.287909746170044, + "kl": 0.7775768041610718, + "learning_rate": 4.982222222222222e-06, + "logits/chosen": 37691072.0, + "logits/rejected": 37058822.4, + "logps/chosen": -162.92593994140626, + "logps/rejected": -140.751171875, + "loss": 0.5000242233276367, + "rewards/chosen": 0.06439838409423829, + "rewards/margins": -0.00020327568054198664, + "rewards/rejected": 0.06460165977478027, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 1.4811052083969116, + "kl": 1.0720264911651611, + "learning_rate": 4.937777777777778e-06, + "logits/chosen": 39377881.6, + "logits/rejected": 41394512.0, + "logps/chosen": -142.946826171875, + "logps/rejected": -158.21910400390624, + "loss": 0.501332950592041, + "rewards/chosen": 0.0980217456817627, + "rewards/margins": -0.010658252239227298, + "rewards/rejected": 0.10867999792098999, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 1.6531552076339722, + "kl": 1.1869957447052002, + "learning_rate": 4.893333333333334e-06, + "logits/chosen": 39561158.4, + "logits/rejected": 40957766.4, + "logps/chosen": -130.68802490234376, + "logps/rejected": -117.63740234375, + "loss": 0.4958030223846436, + "rewards/chosen": 0.12626923322677613, + "rewards/margins": 0.0335154950618744, + "rewards/rejected": 0.09275373816490173, + "step": 150 + }, + { + "epoch": 0.24, + "eval_kl": 1.516471266746521, + "eval_logits/chosen": 37183991.808, + "eval_logits/rejected": 37034459.136, + "eval_logps/chosen": -154.240734375, + "eval_logps/rejected": -148.51396875, + "eval_loss": 0.49876031279563904, + "eval_rewards/chosen": 0.154937744140625, + "eval_rewards/margins": 0.009945236206054697, + "eval_rewards/rejected": 0.1449925079345703, + "eval_runtime": 211.5801, + "eval_samples_per_second": 4.726, + "eval_steps_per_second": 2.363, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 1.6023988723754883, + "kl": 1.7867761850357056, + "learning_rate": 4.848888888888889e-06, + "logits/chosen": 33342684.8, + "logits/rejected": 33521395.2, + "logps/chosen": -143.76165771484375, + "logps/rejected": -147.3908447265625, + "loss": 0.49936847686767577, + "rewards/chosen": 0.17880032062530518, + "rewards/margins": 0.005077493190765392, + "rewards/rejected": 0.1737228274345398, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 1.3449970483779907, + "kl": 2.119621753692627, + "learning_rate": 4.804444444444445e-06, + "logits/chosen": 36148233.6, + "logits/rejected": 38120403.2, + "logps/chosen": -137.6266845703125, + "logps/rejected": -149.0926025390625, + "loss": 0.4978325843811035, + "rewards/chosen": 0.22067615985870362, + "rewards/margins": 0.01742777824401856, + "rewards/rejected": 0.20324838161468506, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 1.9490461349487305, + "kl": 2.847586154937744, + "learning_rate": 4.76e-06, + "logits/chosen": 43064544.0, + "logits/rejected": 43962390.4, + "logps/chosen": -143.948681640625, + "logps/rejected": -169.59462890625, + "loss": 0.5009199619293213, + "rewards/chosen": 0.28105921745300294, + "rewards/margins": -0.007398319244384777, + "rewards/rejected": 0.2884575366973877, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 1.550258755683899, + "kl": 2.792905330657959, + "learning_rate": 4.715555555555556e-06, + "logits/chosen": 40556035.2, + "logits/rejected": 42005014.4, + "logps/chosen": -123.0137451171875, + "logps/rejected": -136.06087646484374, + "loss": 0.49814720153808595, + "rewards/chosen": 0.2867321491241455, + "rewards/margins": 0.014883160591125488, + "rewards/rejected": 0.27184898853302003, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 2.0262064933776855, + "kl": 3.2026119232177734, + "learning_rate": 4.6711111111111115e-06, + "logits/chosen": 45602153.6, + "logits/rejected": 46734368.0, + "logps/chosen": -149.51041259765626, + "logps/rejected": -171.689697265625, + "loss": 0.5007596492767334, + "rewards/chosen": 0.3171941041946411, + "rewards/margins": -0.006134414672851585, + "rewards/rejected": 0.3233285188674927, + "step": 200 + }, + { + "epoch": 0.32, + "eval_kl": 2.7445971965789795, + "eval_logits/chosen": 37825519.616, + "eval_logits/rejected": 37620092.928, + "eval_logps/chosen": -152.96146875, + "eval_logps/rejected": -147.30584375, + "eval_loss": 0.4978778660297394, + "eval_rewards/chosen": 0.2828658447265625, + "eval_rewards/margins": 0.017060607910156234, + "eval_rewards/rejected": 0.2658052368164063, + "eval_runtime": 211.7592, + "eval_samples_per_second": 4.722, + "eval_steps_per_second": 2.361, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 1.9593161344528198, + "kl": 2.349473714828491, + "learning_rate": 4.626666666666667e-06, + "logits/chosen": 32432160.0, + "logits/rejected": 32459036.8, + "logps/chosen": -140.2314453125, + "logps/rejected": -132.20750732421874, + "loss": 0.5013795852661133, + "rewards/chosen": 0.22654273509979247, + "rewards/margins": -0.011085557937622087, + "rewards/rejected": 0.23762829303741456, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 1.6649645566940308, + "kl": 2.177248239517212, + "learning_rate": 4.582222222222223e-06, + "logits/chosen": 34417107.2, + "logits/rejected": 35237868.8, + "logps/chosen": -101.06302490234376, + "logps/rejected": -125.54276123046876, + "loss": 0.5004647254943848, + "rewards/chosen": 0.20845344066619872, + "rewards/margins": -0.003761553764343256, + "rewards/rejected": 0.21221499443054198, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 1.7191715240478516, + "kl": 1.6887900829315186, + "learning_rate": 4.537777777777778e-06, + "logits/chosen": 46128198.4, + "logits/rejected": 45076755.2, + "logps/chosen": -185.54364013671875, + "logps/rejected": -163.69344482421874, + "loss": 0.5005609512329101, + "rewards/chosen": 0.1496042490005493, + "rewards/margins": -0.00507398843765261, + "rewards/rejected": 0.15467823743820192, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 1.6625752449035645, + "kl": 1.9195034503936768, + "learning_rate": 4.493333333333333e-06, + "logits/chosen": 46306035.2, + "logits/rejected": 46461657.6, + "logps/chosen": -176.698291015625, + "logps/rejected": -166.738232421875, + "loss": 0.5003566741943359, + "rewards/chosen": 0.1773249626159668, + "rewards/margins": -0.0034087777137756237, + "rewards/rejected": 0.18073374032974243, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 2.08888578414917, + "kl": 1.9199730157852173, + "learning_rate": 4.448888888888889e-06, + "logits/chosen": 35625705.6, + "logits/rejected": 33570604.8, + "logps/chosen": -176.07003173828124, + "logps/rejected": -145.513232421875, + "loss": 0.4901569366455078, + "rewards/chosen": 0.2223743438720703, + "rewards/margins": 0.078991961479187, + "rewards/rejected": 0.1433823823928833, + "step": 250 + }, + { + "epoch": 0.4, + "eval_kl": 1.8728376626968384, + "eval_logits/chosen": 37098446.848, + "eval_logits/rejected": 36913745.92, + "eval_logps/chosen": -153.93390625, + "eval_logps/rejected": -148.244546875, + "eval_loss": 0.4982966184616089, + "eval_rewards/chosen": 0.18562066650390624, + "eval_rewards/margins": 0.013686828613281243, + "eval_rewards/rejected": 0.171933837890625, + "eval_runtime": 212.9007, + "eval_samples_per_second": 4.697, + "eval_steps_per_second": 2.349, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 1.585038423538208, + "kl": 1.6053378582000732, + "learning_rate": 4.404444444444445e-06, + "logits/chosen": 28695961.6, + "logits/rejected": 27433849.6, + "logps/chosen": -129.23218994140626, + "logps/rejected": -132.16243896484374, + "loss": 0.4940999984741211, + "rewards/chosen": 0.16844781637191772, + "rewards/margins": 0.04744429588317871, + "rewards/rejected": 0.12100352048873901, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 2.185063362121582, + "kl": 2.0674309730529785, + "learning_rate": 4.360000000000001e-06, + "logits/chosen": 37351123.2, + "logits/rejected": 37435251.2, + "logps/chosen": -157.37740478515624, + "logps/rejected": -152.574072265625, + "loss": 0.4948906421661377, + "rewards/chosen": 0.18760323524475098, + "rewards/margins": 0.04087167978286743, + "rewards/rejected": 0.14673155546188354, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 1.842838168144226, + "kl": 1.880658745765686, + "learning_rate": 4.315555555555556e-06, + "logits/chosen": 31317900.8, + "logits/rejected": 25257848.0, + "logps/chosen": -168.01707763671874, + "logps/rejected": -140.544775390625, + "loss": 0.49900665283203127, + "rewards/chosen": 0.16304240226745606, + "rewards/margins": 0.008140754699707042, + "rewards/rejected": 0.15490164756774902, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 1.4501603841781616, + "kl": 2.7841668128967285, + "learning_rate": 4.271111111111111e-06, + "logits/chosen": 29001040.0, + "logits/rejected": 28279708.8, + "logps/chosen": -144.5850830078125, + "logps/rejected": -147.846533203125, + "loss": 0.48676314353942873, + "rewards/chosen": 0.322883677482605, + "rewards/margins": 0.10617766380310059, + "rewards/rejected": 0.2167060136795044, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 2.2522799968719482, + "kl": 2.674313545227051, + "learning_rate": 4.226666666666667e-06, + "logits/chosen": 39827664.0, + "logits/rejected": 40046345.6, + "logps/chosen": -168.2622314453125, + "logps/rejected": -181.0140869140625, + "loss": 0.4900949001312256, + "rewards/chosen": 0.22891669273376464, + "rewards/margins": 0.08893496990203856, + "rewards/rejected": 0.13998172283172608, + "step": 300 + }, + { + "epoch": 0.48, + "eval_kl": 2.443347215652466, + "eval_logits/chosen": 35836559.36, + "eval_logits/rejected": 35640664.064, + "eval_logps/chosen": -153.67809375, + "eval_logps/rejected": -147.97525, + "eval_loss": 0.49841761589050293, + "eval_rewards/chosen": 0.21120219421386718, + "eval_rewards/margins": 0.012337142944335938, + "eval_rewards/rejected": 0.19886505126953125, + "eval_runtime": 212.5056, + "eval_samples_per_second": 4.706, + "eval_steps_per_second": 2.353, + "step": 300 + }, + { + "epoch": 0.496, + "grad_norm": 1.8431649208068848, + "kl": 2.4258689880371094, + "learning_rate": 4.182222222222222e-06, + "logits/chosen": 31914422.4, + "logits/rejected": 32899113.6, + "logps/chosen": -146.6043701171875, + "logps/rejected": -155.2097412109375, + "loss": 0.5078470706939697, + "rewards/chosen": 0.1587265133857727, + "rewards/margins": -0.06373668909072877, + "rewards/rejected": 0.22246320247650148, + "step": 310 + }, + { + "epoch": 0.512, + "grad_norm": 2.756876230239868, + "kl": 1.3744744062423706, + "learning_rate": 4.1377777777777784e-06, + "logits/chosen": 35366806.4, + "logits/rejected": 33021190.4, + "logps/chosen": -161.6685302734375, + "logps/rejected": -134.85858154296875, + "loss": 0.5033583641052246, + "rewards/chosen": 0.047298938035964966, + "rewards/margins": -0.02729131579399109, + "rewards/rejected": 0.07459025382995606, + "step": 320 + }, + { + "epoch": 0.528, + "grad_norm": 1.6488304138183594, + "kl": 1.8090463876724243, + "learning_rate": 4.093333333333334e-06, + "logits/chosen": 44740480.0, + "logits/rejected": 41858704.0, + "logps/chosen": -153.429052734375, + "logps/rejected": -145.27188720703126, + "loss": 0.4838115692138672, + "rewards/chosen": 0.16980862617492676, + "rewards/margins": 0.13209896087646483, + "rewards/rejected": 0.03770966529846191, + "step": 330 + }, + { + "epoch": 0.544, + "grad_norm": 1.8334484100341797, + "kl": 1.624638557434082, + "learning_rate": 4.04888888888889e-06, + "logits/chosen": 36759168.0, + "logits/rejected": 37663475.2, + "logps/chosen": -135.3531005859375, + "logps/rejected": -144.48751220703124, + "loss": 0.49995737075805663, + "rewards/chosen": 0.11444320678710937, + "rewards/margins": -0.002312994003295904, + "rewards/rejected": 0.11675620079040527, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 1.7367093563079834, + "kl": 2.787097930908203, + "learning_rate": 4.004444444444445e-06, + "logits/chosen": 35158457.6, + "logits/rejected": 34068956.8, + "logps/chosen": -134.371630859375, + "logps/rejected": -169.529150390625, + "loss": 0.5052088737487793, + "rewards/chosen": 0.2354206323623657, + "rewards/margins": -0.04196789264678957, + "rewards/rejected": 0.2773885250091553, + "step": 350 + }, + { + "epoch": 0.56, + "eval_kl": 2.2334909439086914, + "eval_logits/chosen": 35659685.888, + "eval_logits/rejected": 35460202.496, + "eval_logps/chosen": -154.1329375, + "eval_logps/rejected": -148.447578125, + "eval_loss": 0.4981686472892761, + "eval_rewards/chosen": 0.16571835327148438, + "eval_rewards/margins": 0.014086547851562492, + "eval_rewards/rejected": 0.15163180541992188, + "eval_runtime": 210.7265, + "eval_samples_per_second": 4.745, + "eval_steps_per_second": 2.373, + "step": 350 + }, + { + "epoch": 0.576, + "grad_norm": 2.0100009441375732, + "kl": 2.0726542472839355, + "learning_rate": 3.96e-06, + "logits/chosen": 37896438.4, + "logits/rejected": 35973884.8, + "logps/chosen": -147.7560546875, + "logps/rejected": -113.3482177734375, + "loss": 0.48930912017822265, + "rewards/chosen": 0.16988544464111327, + "rewards/margins": 0.08941116333007812, + "rewards/rejected": 0.08047428131103515, + "step": 360 + }, + { + "epoch": 0.592, + "grad_norm": 1.6454890966415405, + "kl": 2.1270346641540527, + "learning_rate": 3.9155555555555554e-06, + "logits/chosen": 34360019.2, + "logits/rejected": 34796140.8, + "logps/chosen": -147.8162353515625, + "logps/rejected": -151.31751708984376, + "loss": 0.5053381443023681, + "rewards/chosen": 0.08656104803085327, + "rewards/margins": -0.053690028190612804, + "rewards/rejected": 0.14025107622146607, + "step": 370 + }, + { + "epoch": 0.608, + "grad_norm": 2.241021156311035, + "kl": 2.6244335174560547, + "learning_rate": 3.8711111111111115e-06, + "logits/chosen": 31317593.6, + "logits/rejected": 32079606.4, + "logps/chosen": -155.6551513671875, + "logps/rejected": -167.29332275390624, + "loss": 0.5026909828186035, + "rewards/chosen": 0.18726186752319335, + "rewards/margins": -0.035893630981445324, + "rewards/rejected": 0.22315549850463867, + "step": 380 + }, + { + "epoch": 0.624, + "grad_norm": 1.744504690170288, + "kl": 2.1470510959625244, + "learning_rate": 3.826666666666667e-06, + "logits/chosen": 27632387.2, + "logits/rejected": 26959638.4, + "logps/chosen": -176.283740234375, + "logps/rejected": -151.25721435546876, + "loss": 0.48785767555236814, + "rewards/chosen": 0.16686009168624877, + "rewards/margins": 0.09664145708084106, + "rewards/rejected": 0.07021863460540771, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 2.052776575088501, + "kl": 1.9149287939071655, + "learning_rate": 3.782222222222223e-06, + "logits/chosen": 38708992.0, + "logits/rejected": 36759104.0, + "logps/chosen": -150.0161376953125, + "logps/rejected": -137.032666015625, + "loss": 0.48592147827148435, + "rewards/chosen": 0.13577580451965332, + "rewards/margins": 0.11200562268495559, + "rewards/rejected": 0.023770181834697722, + "step": 400 + }, + { + "epoch": 0.64, + "eval_kl": 1.9213757514953613, + "eval_logits/chosen": 34004680.704, + "eval_logits/rejected": 33861222.4, + "eval_logps/chosen": -155.130359375, + "eval_logps/rejected": -149.4599375, + "eval_loss": 0.4978408217430115, + "eval_rewards/chosen": 0.06597476959228515, + "eval_rewards/margins": 0.015579383850097654, + "eval_rewards/rejected": 0.0503953857421875, + "eval_runtime": 215.3729, + "eval_samples_per_second": 4.643, + "eval_steps_per_second": 2.322, + "step": 400 + }, + { + "epoch": 0.656, + "grad_norm": 1.505508542060852, + "kl": 2.1655712127685547, + "learning_rate": 3.737777777777778e-06, + "logits/chosen": 39197555.2, + "logits/rejected": 36953779.2, + "logps/chosen": -148.135498046875, + "logps/rejected": -150.240234375, + "loss": 0.48557405471801757, + "rewards/chosen": 0.17278852462768554, + "rewards/margins": 0.12511927187442778, + "rewards/rejected": 0.04766925275325775, + "step": 410 + }, + { + "epoch": 0.672, + "grad_norm": 2.4999916553497314, + "kl": 1.624707579612732, + "learning_rate": 3.6933333333333337e-06, + "logits/chosen": 27496678.4, + "logits/rejected": 26063419.2, + "logps/chosen": -165.584228515625, + "logps/rejected": -133.94266357421876, + "loss": 0.49836010932922364, + "rewards/chosen": -0.088151615858078, + "rewards/margins": -0.013219672441482547, + "rewards/rejected": -0.07493194341659545, + "step": 420 + }, + { + "epoch": 0.688, + "grad_norm": 1.6226006746292114, + "kl": 1.6943508386611938, + "learning_rate": 3.648888888888889e-06, + "logits/chosen": 28216393.6, + "logits/rejected": 26552371.2, + "logps/chosen": -172.898681640625, + "logps/rejected": -121.724560546875, + "loss": 0.4999542236328125, + "rewards/chosen": -0.05118745565414429, + "rewards/margins": -0.011095824837684634, + "rewards/rejected": -0.04009163081645965, + "step": 430 + }, + { + "epoch": 0.704, + "grad_norm": 2.3899142742156982, + "kl": 2.0509917736053467, + "learning_rate": 3.604444444444445e-06, + "logits/chosen": 35301628.8, + "logits/rejected": 36549334.4, + "logps/chosen": -150.047705078125, + "logps/rejected": -170.48095703125, + "loss": 0.504191255569458, + "rewards/chosen": 0.07901791334152222, + "rewards/margins": -0.03352437019348144, + "rewards/rejected": 0.11254228353500366, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 1.915216088294983, + "kl": 2.2649385929107666, + "learning_rate": 3.5600000000000002e-06, + "logits/chosen": 27320227.2, + "logits/rejected": 30460025.6, + "logps/chosen": -100.75572509765625, + "logps/rejected": -155.90166015625, + "loss": 0.5083817481994629, + "rewards/chosen": 0.032135069370269775, + "rewards/margins": -0.08187388181686402, + "rewards/rejected": 0.11400895118713379, + "step": 450 + }, + { + "epoch": 0.72, + "eval_kl": 2.8508355617523193, + "eval_logits/chosen": 33711845.376, + "eval_logits/rejected": 33509806.08, + "eval_logps/chosen": -153.81721875, + "eval_logps/rejected": -148.221515625, + "eval_loss": 0.49701353907585144, + "eval_rewards/chosen": 0.19728890991210937, + "eval_rewards/margins": 0.02305152893066406, + "eval_rewards/rejected": 0.1742373809814453, + "eval_runtime": 210.8637, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 2.371, + "step": 450 + }, + { + "epoch": 0.736, + "grad_norm": 2.420854091644287, + "kl": 3.208031415939331, + "learning_rate": 3.515555555555556e-06, + "logits/chosen": 39778067.2, + "logits/rejected": 36642828.8, + "logps/chosen": -164.1375732421875, + "logps/rejected": -180.76805419921874, + "loss": 0.49746012687683105, + "rewards/chosen": 0.20578148365020751, + "rewards/margins": 0.009352195262908924, + "rewards/rejected": 0.1964292883872986, + "step": 460 + }, + { + "epoch": 0.752, + "grad_norm": 1.615047812461853, + "kl": 2.750185489654541, + "learning_rate": 3.471111111111111e-06, + "logits/chosen": 47119552.0, + "logits/rejected": 44422067.2, + "logps/chosen": -175.47635498046876, + "logps/rejected": -178.90897216796876, + "loss": 0.5061192512512207, + "rewards/chosen": -0.03337647318840027, + "rewards/margins": -0.035451799631118774, + "rewards/rejected": 0.0020753264427185057, + "step": 470 + }, + { + "epoch": 0.768, + "grad_norm": 2.1674153804779053, + "kl": 1.8968321084976196, + "learning_rate": 3.426666666666667e-06, + "logits/chosen": 37006256.0, + "logits/rejected": 36102249.6, + "logps/chosen": -153.7917724609375, + "logps/rejected": -165.3869140625, + "loss": 0.47748627662658694, + "rewards/chosen": 0.003384724259376526, + "rewards/margins": 0.18434576094150543, + "rewards/rejected": -0.1809610366821289, + "step": 480 + }, + { + "epoch": 0.784, + "grad_norm": 1.6082895994186401, + "kl": 2.0536258220672607, + "learning_rate": 3.3822222222222224e-06, + "logits/chosen": 22186331.2, + "logits/rejected": 20350340.8, + "logps/chosen": -151.07054443359374, + "logps/rejected": -150.1708740234375, + "loss": 0.4888655185699463, + "rewards/chosen": -0.01819072961807251, + "rewards/margins": 0.14195933341979983, + "rewards/rejected": -0.16015006303787233, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 2.310601234436035, + "kl": 3.129138469696045, + "learning_rate": 3.337777777777778e-06, + "logits/chosen": 37795449.6, + "logits/rejected": 35220675.2, + "logps/chosen": -181.1957763671875, + "logps/rejected": -132.0344970703125, + "loss": 0.48585872650146483, + "rewards/chosen": 0.3015714168548584, + "rewards/margins": 0.12076919078826903, + "rewards/rejected": 0.18080222606658936, + "step": 500 + }, + { + "epoch": 0.8, + "eval_kl": 2.6557276248931885, + "eval_logits/chosen": 32642598.912, + "eval_logits/rejected": 32467406.848, + "eval_logps/chosen": -154.477703125, + "eval_logps/rejected": -148.869640625, + "eval_loss": 0.49703630805015564, + "eval_rewards/chosen": 0.13124107360839843, + "eval_rewards/margins": 0.021815811157226556, + "eval_rewards/rejected": 0.10942526245117187, + "eval_runtime": 211.0284, + "eval_samples_per_second": 4.739, + "eval_steps_per_second": 2.369, + "step": 500 + }, + { + "epoch": 0.816, + "grad_norm": 2.234365224838257, + "kl": 1.7492139339447021, + "learning_rate": 3.2933333333333333e-06, + "logits/chosen": 39694166.4, + "logits/rejected": 40852192.0, + "logps/chosen": -148.5533447265625, + "logps/rejected": -162.5984375, + "loss": 0.4872898101806641, + "rewards/chosen": 0.013645458221435546, + "rewards/margins": 0.10560911893844604, + "rewards/rejected": -0.0919636607170105, + "step": 510 + }, + { + "epoch": 0.832, + "grad_norm": 2.3625874519348145, + "kl": 3.2840332984924316, + "learning_rate": 3.2488888888888894e-06, + "logits/chosen": 37915008.0, + "logits/rejected": 36570806.4, + "logps/chosen": -157.36220703125, + "logps/rejected": -164.6677734375, + "loss": 0.4821781635284424, + "rewards/chosen": 0.32392158508300783, + "rewards/margins": 0.17874917984008792, + "rewards/rejected": 0.1451724052429199, + "step": 520 + }, + { + "epoch": 0.848, + "grad_norm": 2.0702106952667236, + "kl": 2.1847689151763916, + "learning_rate": 3.2044444444444446e-06, + "logits/chosen": 33132352.0, + "logits/rejected": 32190089.6, + "logps/chosen": -144.52484130859375, + "logps/rejected": -179.6159912109375, + "loss": 0.5058434486389161, + "rewards/chosen": -0.07189960479736328, + "rewards/margins": -0.031065639853477475, + "rewards/rejected": -0.0408339649438858, + "step": 530 + }, + { + "epoch": 0.864, + "grad_norm": 2.4268131256103516, + "kl": 3.71620512008667, + "learning_rate": 3.1600000000000002e-06, + "logits/chosen": 27596662.4, + "logits/rejected": 26981862.4, + "logps/chosen": -150.15238037109376, + "logps/rejected": -159.4679443359375, + "loss": 0.46803932189941405, + "rewards/chosen": 0.3402720928192139, + "rewards/margins": 0.2841139912605286, + "rewards/rejected": 0.0561581015586853, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 2.0024309158325195, + "kl": 2.6835620403289795, + "learning_rate": 3.1155555555555555e-06, + "logits/chosen": 32613926.4, + "logits/rejected": 33945558.4, + "logps/chosen": -157.42744140625, + "logps/rejected": -171.3582275390625, + "loss": 0.49735183715820314, + "rewards/chosen": -0.0414805144071579, + "rewards/margins": 0.10379274189472197, + "rewards/rejected": -0.14527325630187987, + "step": 550 + }, + { + "epoch": 0.88, + "eval_kl": 2.534240245819092, + "eval_logits/chosen": 30600110.08, + "eval_logits/rejected": 30418843.648, + "eval_logps/chosen": -155.587328125, + "eval_logps/rejected": -149.98478125, + "eval_loss": 0.49648168683052063, + "eval_rewards/chosen": 0.02027870178222656, + "eval_rewards/margins": 0.022367488861083983, + "eval_rewards/rejected": -0.002088787078857422, + "eval_runtime": 210.9141, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 2.371, + "step": 550 + }, + { + "epoch": 0.896, + "grad_norm": 2.065328598022461, + "kl": 2.3084464073181152, + "learning_rate": 3.0711111111111115e-06, + "logits/chosen": 26840976.0, + "logits/rejected": 25480748.8, + "logps/chosen": -165.23271484375, + "logps/rejected": -152.96129150390624, + "loss": 0.49146738052368166, + "rewards/chosen": 0.038733655214309694, + "rewards/margins": 0.05340470671653748, + "rewards/rejected": -0.014671051502227783, + "step": 560 + }, + { + "epoch": 0.912, + "grad_norm": 1.7352122068405151, + "kl": 1.976243019104004, + "learning_rate": 3.0266666666666668e-06, + "logits/chosen": 30859900.8, + "logits/rejected": 29282716.8, + "logps/chosen": -153.5951904296875, + "logps/rejected": -138.8325927734375, + "loss": 0.48922386169433596, + "rewards/chosen": -0.16246808767318727, + "rewards/margins": 0.173832905292511, + "rewards/rejected": -0.33630099296569826, + "step": 570 + }, + { + "epoch": 0.928, + "grad_norm": 1.950156331062317, + "kl": 3.2299084663391113, + "learning_rate": 2.9822222222222224e-06, + "logits/chosen": 37093142.4, + "logits/rejected": 35527721.6, + "logps/chosen": -151.84559326171876, + "logps/rejected": -162.35234375, + "loss": 0.48923511505126954, + "rewards/chosen": -0.03407045304775238, + "rewards/margins": 0.13557693064212797, + "rewards/rejected": -0.16964738368988036, + "step": 580 + }, + { + "epoch": 0.944, + "grad_norm": 2.4236228466033936, + "kl": 3.0165860652923584, + "learning_rate": 2.937777777777778e-06, + "logits/chosen": 21686569.6, + "logits/rejected": 22070460.8, + "logps/chosen": -130.4562744140625, + "logps/rejected": -136.6388916015625, + "loss": 0.4813654899597168, + "rewards/chosen": 0.219277286529541, + "rewards/margins": 0.19798394441604614, + "rewards/rejected": 0.021293342113494873, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 2.492583990097046, + "kl": 2.3715996742248535, + "learning_rate": 2.8933333333333337e-06, + "logits/chosen": 20666136.0, + "logits/rejected": 23334753.6, + "logps/chosen": -153.04947509765626, + "logps/rejected": -153.43118896484376, + "loss": 0.522442102432251, + "rewards/chosen": -0.225014066696167, + "rewards/margins": -0.17056108117103577, + "rewards/rejected": -0.054452985525131226, + "step": 600 + }, + { + "epoch": 0.96, + "eval_kl": 2.428438425064087, + "eval_logits/chosen": 29508644.864, + "eval_logits/rejected": 29339983.872, + "eval_logps/chosen": -156.279359375, + "eval_logps/rejected": -150.735375, + "eval_loss": 0.49547284841537476, + "eval_rewards/chosen": -0.048923690795898436, + "eval_rewards/margins": 0.02822325134277344, + "eval_rewards/rejected": -0.07714694213867188, + "eval_runtime": 211.6125, + "eval_samples_per_second": 4.726, + "eval_steps_per_second": 2.363, + "step": 600 + }, + { + "epoch": 0.976, + "grad_norm": 1.849755883216858, + "kl": 4.1999735832214355, + "learning_rate": 2.8488888888888894e-06, + "logits/chosen": 35977270.4, + "logits/rejected": 32739616.0, + "logps/chosen": -196.6281982421875, + "logps/rejected": -171.48328857421876, + "loss": 0.4923978805541992, + "rewards/chosen": 0.14797959327697754, + "rewards/margins": 0.12893219590187074, + "rewards/rejected": 0.01904739737510681, + "step": 610 + }, + { + "epoch": 0.992, + "grad_norm": 2.2259228229522705, + "kl": 2.290156602859497, + "learning_rate": 2.8044444444444446e-06, + "logits/chosen": 18237147.2, + "logits/rejected": 16839369.6, + "logps/chosen": -143.46595458984376, + "logps/rejected": -140.927001953125, + "loss": 0.4779216766357422, + "rewards/chosen": 0.04038125276565552, + "rewards/margins": 0.21843221187591552, + "rewards/rejected": -0.17805095911026, + "step": 620 + }, + { + "epoch": 1.008, + "grad_norm": 1.9715921878814697, + "kl": 2.682957172393799, + "learning_rate": 2.7600000000000003e-06, + "logits/chosen": 28436249.6, + "logits/rejected": 29027222.4, + "logps/chosen": -155.50318603515626, + "logps/rejected": -156.95904541015625, + "loss": 0.4716252326965332, + "rewards/chosen": 0.20086636543273925, + "rewards/margins": 0.32287436723709106, + "rewards/rejected": -0.12200800180435181, + "step": 630 + }, + { + "epoch": 1.024, + "grad_norm": 1.9490185976028442, + "kl": 2.792576551437378, + "learning_rate": 2.715555555555556e-06, + "logits/chosen": 35926758.4, + "logits/rejected": 36004332.8, + "logps/chosen": -139.33636474609375, + "logps/rejected": -143.99090576171875, + "loss": 0.4631006717681885, + "rewards/chosen": 0.2187732219696045, + "rewards/margins": 0.31144561171531676, + "rewards/rejected": -0.09267238974571228, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 2.6189019680023193, + "kl": 2.2690906524658203, + "learning_rate": 2.6711111111111116e-06, + "logits/chosen": 32591907.2, + "logits/rejected": 32146668.8, + "logps/chosen": -138.1519775390625, + "logps/rejected": -154.90008544921875, + "loss": 0.47899231910705564, + "rewards/chosen": 0.09387065768241883, + "rewards/margins": 0.19471864104270936, + "rewards/rejected": -0.10084798336029052, + "step": 650 + }, + { + "epoch": 1.04, + "eval_kl": 2.578673839569092, + "eval_logits/chosen": 28904480.768, + "eval_logits/rejected": 28698251.264, + "eval_logps/chosen": -156.2849375, + "eval_logps/rejected": -150.820046875, + "eval_loss": 0.4945332705974579, + "eval_rewards/chosen": -0.049482513427734375, + "eval_rewards/margins": 0.03613288116455079, + "eval_rewards/rejected": -0.08561539459228516, + "eval_runtime": 210.9586, + "eval_samples_per_second": 4.74, + "eval_steps_per_second": 2.37, + "step": 650 + }, + { + "epoch": 1.056, + "grad_norm": 1.7106302976608276, + "kl": 3.502686023712158, + "learning_rate": 2.6266666666666668e-06, + "logits/chosen": 35256064.0, + "logits/rejected": 34360857.6, + "logps/chosen": -150.1806884765625, + "logps/rejected": -148.415771484375, + "loss": 0.5002040386199951, + "rewards/chosen": 0.014912448823451996, + "rewards/margins": 0.010746008902788162, + "rewards/rejected": 0.0041664399206638334, + "step": 660 + }, + { + "epoch": 1.072, + "grad_norm": 2.5150351524353027, + "kl": 1.832489013671875, + "learning_rate": 2.5822222222222224e-06, + "logits/chosen": 34642908.8, + "logits/rejected": 36512502.4, + "logps/chosen": -145.1234375, + "logps/rejected": -152.358642578125, + "loss": 0.4734458923339844, + "rewards/chosen": 0.06824090480804443, + "rewards/margins": 0.24784770011901855, + "rewards/rejected": -0.17960679531097412, + "step": 670 + }, + { + "epoch": 1.088, + "grad_norm": 1.9682056903839111, + "kl": 2.000276565551758, + "learning_rate": 2.537777777777778e-06, + "logits/chosen": 27413356.8, + "logits/rejected": 24885939.2, + "logps/chosen": -121.7287841796875, + "logps/rejected": -141.73358154296875, + "loss": 0.44886274337768556, + "rewards/chosen": 0.1158550500869751, + "rewards/margins": 0.5507017374038696, + "rewards/rejected": -0.4348466873168945, + "step": 680 + }, + { + "epoch": 1.104, + "grad_norm": 1.9337836503982544, + "kl": 3.4022421836853027, + "learning_rate": 2.4933333333333333e-06, + "logits/chosen": 31807184.0, + "logits/rejected": 30969852.8, + "logps/chosen": -124.63392333984375, + "logps/rejected": -149.852197265625, + "loss": 0.46492581367492675, + "rewards/chosen": 0.14620821475982665, + "rewards/margins": 0.39136860370635984, + "rewards/rejected": -0.2451603889465332, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 2.3496735095977783, + "kl": 2.625760078430176, + "learning_rate": 2.448888888888889e-06, + "logits/chosen": 31142732.8, + "logits/rejected": 31784934.4, + "logps/chosen": -136.10140380859374, + "logps/rejected": -160.45810546875, + "loss": 0.4712203025817871, + "rewards/chosen": 0.030043387413024904, + "rewards/margins": 0.2471763849258423, + "rewards/rejected": -0.2171329975128174, + "step": 700 + }, + { + "epoch": 1.12, + "eval_kl": 2.867997169494629, + "eval_logits/chosen": 28665548.8, + "eval_logits/rejected": 28371339.264, + "eval_logps/chosen": -155.87825, + "eval_logps/rejected": -150.4365625, + "eval_loss": 0.49442729353904724, + "eval_rewards/chosen": -0.008813528060913086, + "eval_rewards/margins": 0.038452112197875976, + "eval_rewards/rejected": -0.047265640258789064, + "eval_runtime": 210.8987, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 2.371, + "step": 700 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 1.8647392988204956, + "kl": 3.5727601051330566, + "learning_rate": 2.4044444444444446e-06, + "logits/chosen": 23887081.6, + "logits/rejected": 20803046.4, + "logps/chosen": -186.822802734375, + "logps/rejected": -185.6416259765625, + "loss": 0.43730711936950684, + "rewards/chosen": 0.37336575984954834, + "rewards/margins": 0.6151942014694214, + "rewards/rejected": -0.24182844161987305, + "step": 710 + }, + { + "epoch": 1.152, + "grad_norm": 1.630876064300537, + "kl": 3.1620936393737793, + "learning_rate": 2.3600000000000003e-06, + "logits/chosen": 31887926.4, + "logits/rejected": 30896633.6, + "logps/chosen": -148.16866455078124, + "logps/rejected": -147.6019287109375, + "loss": 0.46859025955200195, + "rewards/chosen": 0.26157245635986326, + "rewards/margins": 0.22991548180580137, + "rewards/rejected": 0.03165697455406189, + "step": 720 + }, + { + "epoch": 1.168, + "grad_norm": 1.9033029079437256, + "kl": 2.6529040336608887, + "learning_rate": 2.3155555555555555e-06, + "logits/chosen": 21981940.8, + "logits/rejected": 20872329.6, + "logps/chosen": -118.38714599609375, + "logps/rejected": -131.70751953125, + "loss": 0.46525821685791013, + "rewards/chosen": -0.042069154977798465, + "rewards/margins": 0.3900075852870941, + "rewards/rejected": -0.43207674026489257, + "step": 730 + }, + { + "epoch": 1.184, + "grad_norm": 1.7811493873596191, + "kl": 4.9487690925598145, + "learning_rate": 2.2711111111111116e-06, + "logits/chosen": 32023660.8, + "logits/rejected": 32334473.6, + "logps/chosen": -160.63001708984376, + "logps/rejected": -155.8528076171875, + "loss": 0.4717572212219238, + "rewards/chosen": 0.43720192909240724, + "rewards/margins": 0.3068490982055664, + "rewards/rejected": 0.13035283088684083, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 2.2068121433258057, + "kl": 5.166882514953613, + "learning_rate": 2.226666666666667e-06, + "logits/chosen": 37952841.6, + "logits/rejected": 37582118.4, + "logps/chosen": -158.07510986328126, + "logps/rejected": -136.27884521484376, + "loss": 0.45236787796020506, + "rewards/chosen": 0.5586390972137452, + "rewards/margins": 0.447090494632721, + "rewards/rejected": 0.11154860258102417, + "step": 750 + }, + { + "epoch": 1.2, + "eval_kl": 4.169778823852539, + "eval_logits/chosen": 30752735.232, + "eval_logits/rejected": 30298806.272, + "eval_logps/chosen": -153.261625, + "eval_logps/rejected": -147.9041875, + "eval_loss": 0.49414026737213135, + "eval_rewards/chosen": 0.2528500061035156, + "eval_rewards/margins": 0.046879043579101526, + "eval_rewards/rejected": 0.20597096252441408, + "eval_runtime": 210.5527, + "eval_samples_per_second": 4.749, + "eval_steps_per_second": 2.375, + "step": 750 + }, + { + "epoch": 1.216, + "grad_norm": 1.7532247304916382, + "kl": 5.59327507019043, + "learning_rate": 2.1822222222222225e-06, + "logits/chosen": 34789904.0, + "logits/rejected": 35887366.4, + "logps/chosen": -140.38455810546876, + "logps/rejected": -154.4322021484375, + "loss": 0.4671647548675537, + "rewards/chosen": 0.6628150463104248, + "rewards/margins": 0.32444992065429684, + "rewards/rejected": 0.3383651256561279, + "step": 760 + }, + { + "epoch": 1.232, + "grad_norm": 1.8754905462265015, + "kl": 3.97855806350708, + "learning_rate": 2.137777777777778e-06, + "logits/chosen": 27867337.6, + "logits/rejected": 29447075.2, + "logps/chosen": -128.08131103515626, + "logps/rejected": -140.113671875, + "loss": 0.47170114517211914, + "rewards/chosen": 0.3410197257995605, + "rewards/margins": 0.21830989122390745, + "rewards/rejected": 0.12270983457565307, + "step": 770 + }, + { + "epoch": 1.248, + "grad_norm": 2.736323833465576, + "kl": 4.552127838134766, + "learning_rate": 2.0933333333333338e-06, + "logits/chosen": 33397673.6, + "logits/rejected": 34484364.8, + "logps/chosen": -137.85001220703126, + "logps/rejected": -139.0888427734375, + "loss": 0.45501227378845216, + "rewards/chosen": 0.355634069442749, + "rewards/margins": 0.35818901062011715, + "rewards/rejected": -0.0025549411773681642, + "step": 780 + }, + { + "epoch": 1.264, + "grad_norm": 1.9888898134231567, + "kl": 3.6503052711486816, + "learning_rate": 2.048888888888889e-06, + "logits/chosen": 29773580.8, + "logits/rejected": 28645248.0, + "logps/chosen": -151.4457275390625, + "logps/rejected": -138.25706787109374, + "loss": 0.46457924842834475, + "rewards/chosen": 0.29306089878082275, + "rewards/margins": 0.400502347946167, + "rewards/rejected": -0.10744144916534423, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 1.5264906883239746, + "kl": 5.461276531219482, + "learning_rate": 2.0044444444444446e-06, + "logits/chosen": 32210188.8, + "logits/rejected": 31754496.0, + "logps/chosen": -139.758837890625, + "logps/rejected": -154.13162841796876, + "loss": 0.47219176292419435, + "rewards/chosen": 0.6297782897949219, + "rewards/margins": 0.23436756134033204, + "rewards/rejected": 0.39541072845458985, + "step": 800 + }, + { + "epoch": 1.28, + "eval_kl": 4.377986907958984, + "eval_logits/chosen": 31938863.104, + "eval_logits/rejected": 31473901.568, + "eval_logps/chosen": -152.64290625, + "eval_logps/rejected": -147.31334375, + "eval_loss": 0.49390554428100586, + "eval_rewards/chosen": 0.3147206115722656, + "eval_rewards/margins": 0.04966500854492184, + "eval_rewards/rejected": 0.26505560302734377, + "eval_runtime": 210.9274, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 2.37, + "step": 800 + }, + { + "epoch": 1.296, + "grad_norm": 1.8228152990341187, + "kl": 4.066787242889404, + "learning_rate": 1.9600000000000003e-06, + "logits/chosen": 35085334.4, + "logits/rejected": 34852982.4, + "logps/chosen": -132.74925537109374, + "logps/rejected": -158.0354736328125, + "loss": 0.44358067512512206, + "rewards/chosen": 0.4759791374206543, + "rewards/margins": 0.4843965947628021, + "rewards/rejected": -0.008417457342147827, + "step": 810 + }, + { + "epoch": 1.312, + "grad_norm": 2.1346709728240967, + "kl": 4.6763811111450195, + "learning_rate": 1.915555555555556e-06, + "logits/chosen": 34945942.4, + "logits/rejected": 36177011.2, + "logps/chosen": -132.50538330078126, + "logps/rejected": -161.39248046875, + "loss": 0.47060041427612304, + "rewards/chosen": 0.4741304874420166, + "rewards/margins": 0.28036924600601193, + "rewards/rejected": 0.19376124143600465, + "step": 820 + }, + { + "epoch": 1.328, + "grad_norm": 2.9988934993743896, + "kl": 3.6898865699768066, + "learning_rate": 1.8711111111111114e-06, + "logits/chosen": 36008406.4, + "logits/rejected": 37406822.4, + "logps/chosen": -136.77215576171875, + "logps/rejected": -147.55556640625, + "loss": 0.46726012229919434, + "rewards/chosen": 0.3493239164352417, + "rewards/margins": 0.2635639488697052, + "rewards/rejected": 0.0857599675655365, + "step": 830 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 1.762466311454773, + "kl": 4.097973346710205, + "learning_rate": 1.8266666666666668e-06, + "logits/chosen": 27684899.2, + "logits/rejected": 28798355.2, + "logps/chosen": -118.63145751953125, + "logps/rejected": -141.59239501953124, + "loss": 0.47371621131896974, + "rewards/chosen": 0.31828267574310304, + "rewards/margins": 0.29973786473274233, + "rewards/rejected": 0.018544811010360717, + "step": 840 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.833742380142212, + "kl": 4.407935619354248, + "learning_rate": 1.7822222222222225e-06, + "logits/chosen": 33654512.0, + "logits/rejected": 33191171.2, + "logps/chosen": -151.17977294921874, + "logps/rejected": -145.46224365234374, + "loss": 0.47383294105529783, + "rewards/chosen": 0.16252880096435546, + "rewards/margins": 0.22959471344947813, + "rewards/rejected": -0.06706591248512268, + "step": 850 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 3.920696973800659, + "eval_logits/chosen": 31450167.296, + "eval_logits/rejected": 31058980.864, + "eval_logps/chosen": -153.5180625, + "eval_logps/rejected": -148.185328125, + "eval_loss": 0.49386003613471985, + "eval_rewards/chosen": 0.22720516967773438, + "eval_rewards/margins": 0.04934913635253907, + "eval_rewards/rejected": 0.1778560333251953, + "eval_runtime": 211.4228, + "eval_samples_per_second": 4.73, + "eval_steps_per_second": 2.365, + "step": 850 + }, + { + "epoch": 1.376, + "grad_norm": 2.183370351791382, + "kl": 4.991496562957764, + "learning_rate": 1.737777777777778e-06, + "logits/chosen": 47378035.2, + "logits/rejected": 45498502.4, + "logps/chosen": -179.4380126953125, + "logps/rejected": -169.1109375, + "loss": 0.4720784664154053, + "rewards/chosen": 0.43294267654418944, + "rewards/margins": 0.21851625442504882, + "rewards/rejected": 0.21442642211914062, + "step": 860 + }, + { + "epoch": 1.392, + "grad_norm": 2.1924335956573486, + "kl": 5.118699073791504, + "learning_rate": 1.6933333333333336e-06, + "logits/chosen": 29969433.6, + "logits/rejected": 28471737.6, + "logps/chosen": -192.4035888671875, + "logps/rejected": -151.5296875, + "loss": 0.46271333694458006, + "rewards/chosen": 0.5544761657714844, + "rewards/margins": 0.31934370994567873, + "rewards/rejected": 0.23513245582580566, + "step": 870 + }, + { + "epoch": 1.408, + "grad_norm": 2.2703323364257812, + "kl": 2.6695058345794678, + "learning_rate": 1.648888888888889e-06, + "logits/chosen": 26682553.6, + "logits/rejected": 24626574.4, + "logps/chosen": -133.742041015625, + "logps/rejected": -147.77803955078124, + "loss": 0.43658647537231443, + "rewards/chosen": 0.262941312789917, + "rewards/margins": 0.628947639465332, + "rewards/rejected": -0.36600632667541505, + "step": 880 + }, + { + "epoch": 1.424, + "grad_norm": 1.7647465467453003, + "kl": 4.138214111328125, + "learning_rate": 1.6044444444444447e-06, + "logits/chosen": 27142838.4, + "logits/rejected": 26620787.2, + "logps/chosen": -133.81441650390624, + "logps/rejected": -135.07823486328124, + "loss": 0.4540394306182861, + "rewards/chosen": 0.3629532098770142, + "rewards/margins": 0.4822005391120911, + "rewards/rejected": -0.1192473292350769, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 2.0833559036254883, + "kl": 3.821770191192627, + "learning_rate": 1.56e-06, + "logits/chosen": 25844046.4, + "logits/rejected": 22660449.6, + "logps/chosen": -145.89912109375, + "logps/rejected": -130.87916259765626, + "loss": 0.44003853797912595, + "rewards/chosen": 0.4248363018035889, + "rewards/margins": 0.5529402971267701, + "rewards/rejected": -0.12810399532318115, + "step": 900 + }, + { + "epoch": 1.44, + "eval_kl": 4.193332672119141, + "eval_logits/chosen": 32452849.664, + "eval_logits/rejected": 32089155.584, + "eval_logps/chosen": -152.94290625, + "eval_logps/rejected": -147.62978125, + "eval_loss": 0.4937511086463928, + "eval_rewards/chosen": 0.28472021484375, + "eval_rewards/margins": 0.05130918884277341, + "eval_rewards/rejected": 0.23341102600097657, + "eval_runtime": 211.0632, + "eval_samples_per_second": 4.738, + "eval_steps_per_second": 2.369, + "step": 900 + }, + { + "epoch": 1.456, + "grad_norm": 1.9122978448867798, + "kl": 5.044002056121826, + "learning_rate": 1.5155555555555558e-06, + "logits/chosen": 28371500.8, + "logits/rejected": 26429561.6, + "logps/chosen": -170.57901611328126, + "logps/rejected": -170.0248046875, + "loss": 0.47440948486328127, + "rewards/chosen": 0.4807882308959961, + "rewards/margins": 0.20360822677612306, + "rewards/rejected": 0.27718000411987304, + "step": 910 + }, + { + "epoch": 1.472, + "grad_norm": 2.096123218536377, + "kl": 4.188933372497559, + "learning_rate": 1.4711111111111112e-06, + "logits/chosen": 34476659.2, + "logits/rejected": 31294201.6, + "logps/chosen": -165.4956787109375, + "logps/rejected": -143.78316650390624, + "loss": 0.44411406517028806, + "rewards/chosen": 0.5467419147491455, + "rewards/margins": 0.49016233682632443, + "rewards/rejected": 0.05657957792282105, + "step": 920 + }, + { + "epoch": 1.488, + "grad_norm": 2.136502265930176, + "kl": 5.916023254394531, + "learning_rate": 1.4266666666666668e-06, + "logits/chosen": 28660502.4, + "logits/rejected": 31565062.4, + "logps/chosen": -145.20224609375, + "logps/rejected": -184.6095947265625, + "loss": 0.4747187614440918, + "rewards/chosen": 0.5481678962707519, + "rewards/margins": 0.3383267402648925, + "rewards/rejected": 0.20984115600585937, + "step": 930 + }, + { + "epoch": 1.504, + "grad_norm": 3.4681079387664795, + "kl": 3.9104812145233154, + "learning_rate": 1.3822222222222223e-06, + "logits/chosen": 32520064.0, + "logits/rejected": 28152707.2, + "logps/chosen": -149.12630615234374, + "logps/rejected": -132.30379638671874, + "loss": 0.4755962371826172, + "rewards/chosen": 0.34991438388824464, + "rewards/margins": 0.19766778945922853, + "rewards/rejected": 0.1522465944290161, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 2.1049246788024902, + "kl": 4.365435600280762, + "learning_rate": 1.337777777777778e-06, + "logits/chosen": 37928726.4, + "logits/rejected": 36957033.6, + "logps/chosen": -154.08798828125, + "logps/rejected": -146.7669189453125, + "loss": 0.4579151630401611, + "rewards/chosen": 0.46179609298706054, + "rewards/margins": 0.37595014572143554, + "rewards/rejected": 0.085845947265625, + "step": 950 + }, + { + "epoch": 1.52, + "eval_kl": 4.171284198760986, + "eval_logits/chosen": 32833683.456, + "eval_logits/rejected": 32522022.912, + "eval_logps/chosen": -152.9865, + "eval_logps/rejected": -147.679296875, + "eval_loss": 0.4936215281486511, + "eval_rewards/chosen": 0.2803621826171875, + "eval_rewards/margins": 0.051901611328124986, + "eval_rewards/rejected": 0.2284605712890625, + "eval_runtime": 211.1202, + "eval_samples_per_second": 4.737, + "eval_steps_per_second": 2.368, + "step": 950 + }, + { + "epoch": 1.536, + "grad_norm": 2.297563076019287, + "kl": 4.221343994140625, + "learning_rate": 1.2933333333333334e-06, + "logits/chosen": 41744796.8, + "logits/rejected": 40177462.4, + "logps/chosen": -140.44161376953124, + "logps/rejected": -148.71304931640626, + "loss": 0.44645137786865235, + "rewards/chosen": 0.4949165344238281, + "rewards/margins": 0.46811245679855346, + "rewards/rejected": 0.026804077625274658, + "step": 960 + }, + { + "epoch": 1.552, + "grad_norm": 2.0365242958068848, + "kl": 4.4261579513549805, + "learning_rate": 1.248888888888889e-06, + "logits/chosen": 32556515.2, + "logits/rejected": 33512262.4, + "logps/chosen": -133.8440673828125, + "logps/rejected": -171.82977294921875, + "loss": 0.4730066776275635, + "rewards/chosen": 0.4945687294006348, + "rewards/margins": 0.2293097019195557, + "rewards/rejected": 0.2652590274810791, + "step": 970 + }, + { + "epoch": 1.568, + "grad_norm": 1.5643013715744019, + "kl": 4.663653373718262, + "learning_rate": 1.2044444444444447e-06, + "logits/chosen": 32883987.2, + "logits/rejected": 30414611.2, + "logps/chosen": -126.985400390625, + "logps/rejected": -116.391650390625, + "loss": 0.4877506732940674, + "rewards/chosen": 0.48381505012512205, + "rewards/margins": 0.09749135971069334, + "rewards/rejected": 0.3863236904144287, + "step": 980 + }, + { + "epoch": 1.584, + "grad_norm": 2.706939458847046, + "kl": 4.062044620513916, + "learning_rate": 1.1600000000000001e-06, + "logits/chosen": 32006976.0, + "logits/rejected": 31020704.0, + "logps/chosen": -166.25194091796874, + "logps/rejected": -154.0007080078125, + "loss": 0.44759297370910645, + "rewards/chosen": 0.35017178058624265, + "rewards/margins": 0.5452085971832275, + "rewards/rejected": -0.19503681659698485, + "step": 990 + }, + { + "epoch": 1.6, + "grad_norm": 1.8194115161895752, + "kl": 3.5774059295654297, + "learning_rate": 1.1155555555555558e-06, + "logits/chosen": 28698640.0, + "logits/rejected": 29143193.6, + "logps/chosen": -139.20194091796876, + "logps/rejected": -158.261376953125, + "loss": 0.4849833965301514, + "rewards/chosen": 0.1898583173751831, + "rewards/margins": 0.07131674289703369, + "rewards/rejected": 0.11854157447814942, + "step": 1000 + }, + { + "epoch": 1.6, + "eval_kl": 4.220986366271973, + "eval_logits/chosen": 32568942.592, + "eval_logits/rejected": 32248098.816, + "eval_logps/chosen": -152.99434375, + "eval_logps/rejected": -147.7251875, + "eval_loss": 0.4932064116001129, + "eval_rewards/chosen": 0.2795771179199219, + "eval_rewards/margins": 0.055706237792968766, + "eval_rewards/rejected": 0.22387088012695314, + "eval_runtime": 901.9357, + "eval_samples_per_second": 1.109, + "eval_steps_per_second": 0.554, + "step": 1000 + }, + { + "epoch": 1.616, + "grad_norm": 2.4502618312835693, + "kl": 3.438178539276123, + "learning_rate": 1.0711111111111112e-06, + "logits/chosen": 23637744.0, + "logits/rejected": 21885137.6, + "logps/chosen": -154.96070556640626, + "logps/rejected": -135.8352783203125, + "loss": 0.4609940528869629, + "rewards/chosen": 0.33989131450653076, + "rewards/margins": 0.2776340961456299, + "rewards/rejected": 0.06225721836090088, + "step": 1010 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 2.483098268508911, + "kl": 4.026124000549316, + "learning_rate": 1.0266666666666669e-06, + "logits/chosen": 33672102.4, + "logits/rejected": 33149174.4, + "logps/chosen": -172.960546875, + "logps/rejected": -169.11124267578126, + "loss": 0.4497981548309326, + "rewards/chosen": 0.3961763620376587, + "rewards/margins": 0.43121243715286256, + "rewards/rejected": -0.03503607511520386, + "step": 1020 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 1.9396111965179443, + "kl": 3.11645770072937, + "learning_rate": 9.822222222222223e-07, + "logits/chosen": 33916867.2, + "logits/rejected": 29841721.6, + "logps/chosen": -149.1454345703125, + "logps/rejected": -127.8354248046875, + "loss": 0.4286343574523926, + "rewards/chosen": 0.36662404537200927, + "rewards/margins": 0.662821626663208, + "rewards/rejected": -0.2961975812911987, + "step": 1030 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 2.473919153213501, + "kl": 3.834186553955078, + "learning_rate": 9.377777777777778e-07, + "logits/chosen": 34663897.6, + "logits/rejected": 32536246.4, + "logps/chosen": -142.38626708984376, + "logps/rejected": -151.28388671875, + "loss": 0.4545116901397705, + "rewards/chosen": 0.403075122833252, + "rewards/margins": 0.4584430515766144, + "rewards/rejected": -0.055367928743362424, + "step": 1040 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 2.401204824447632, + "kl": 2.8307433128356934, + "learning_rate": 8.933333333333334e-07, + "logits/chosen": 25615622.4, + "logits/rejected": 24212544.0, + "logps/chosen": -194.39169921875, + "logps/rejected": -139.34288330078124, + "loss": 0.48009257316589354, + "rewards/chosen": -0.09855471849441529, + "rewards/margins": 0.04281153678894044, + "rewards/rejected": -0.14136625528335572, + "step": 1050 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 3.395029306411743, + "eval_logits/chosen": 30017314.816, + "eval_logits/rejected": 29781239.808, + "eval_logps/chosen": -155.0224375, + "eval_logps/rejected": -149.753125, + "eval_loss": 0.4927977919578552, + "eval_rewards/chosen": 0.07676624298095704, + "eval_rewards/margins": 0.0556891098022461, + "eval_rewards/rejected": 0.021077133178710936, + "eval_runtime": 211.7163, + "eval_samples_per_second": 4.723, + "eval_steps_per_second": 2.362, + "step": 1050 + }, + { + "epoch": 1.696, + "grad_norm": 2.213663101196289, + "kl": 2.5274927616119385, + "learning_rate": 8.488888888888889e-07, + "logits/chosen": 21050780.8, + "logits/rejected": 22174214.4, + "logps/chosen": -143.71790771484376, + "logps/rejected": -136.63013916015626, + "loss": 0.4820300579071045, + "rewards/chosen": 0.13502249717712403, + "rewards/margins": 0.1883419156074524, + "rewards/rejected": -0.05331941843032837, + "step": 1060 + }, + { + "epoch": 1.712, + "grad_norm": 2.839602470397949, + "kl": 2.8527681827545166, + "learning_rate": 8.044444444444445e-07, + "logits/chosen": 31860320.0, + "logits/rejected": 34545088.0, + "logps/chosen": -123.39615478515626, + "logps/rejected": -144.3958740234375, + "loss": 0.48537321090698243, + "rewards/chosen": 0.06347188949584961, + "rewards/margins": 0.1278951048851013, + "rewards/rejected": -0.0644232153892517, + "step": 1070 + }, + { + "epoch": 1.728, + "grad_norm": 2.238354206085205, + "kl": 3.4803032875061035, + "learning_rate": 7.6e-07, + "logits/chosen": 30298761.6, + "logits/rejected": 28377660.8, + "logps/chosen": -127.83525390625, + "logps/rejected": -187.0820556640625, + "loss": 0.46401171684265136, + "rewards/chosen": 0.19893896579742432, + "rewards/margins": 0.3267621874809265, + "rewards/rejected": -0.1278232216835022, + "step": 1080 + }, + { + "epoch": 1.744, + "grad_norm": 2.201462507247925, + "kl": 3.8757426738739014, + "learning_rate": 7.155555555555556e-07, + "logits/chosen": 45421788.8, + "logits/rejected": 42151324.8, + "logps/chosen": -175.1874755859375, + "logps/rejected": -169.148291015625, + "loss": 0.48148083686828613, + "rewards/chosen": 0.16172538995742797, + "rewards/margins": 0.2750619053840637, + "rewards/rejected": -0.11333651542663574, + "step": 1090 + }, + { + "epoch": 1.76, + "grad_norm": 1.8805228471755981, + "kl": 3.8151164054870605, + "learning_rate": 6.711111111111111e-07, + "logits/chosen": 31197808.0, + "logits/rejected": 27353356.8, + "logps/chosen": -152.6833740234375, + "logps/rejected": -185.17493896484376, + "loss": 0.4390877723693848, + "rewards/chosen": 0.15401217937469483, + "rewards/margins": 0.5842344522476196, + "rewards/rejected": -0.4302222728729248, + "step": 1100 + }, + { + "epoch": 1.76, + "eval_kl": 3.337947368621826, + "eval_logits/chosen": 29766119.424, + "eval_logits/rejected": 29534177.28, + "eval_logps/chosen": -155.123390625, + "eval_logps/rejected": -149.8595625, + "eval_loss": 0.4927149713039398, + "eval_rewards/chosen": 0.06667286682128906, + "eval_rewards/margins": 0.056239251136779786, + "eval_rewards/rejected": 0.010433615684509278, + "eval_runtime": 211.2573, + "eval_samples_per_second": 4.734, + "eval_steps_per_second": 2.367, + "step": 1100 + }, + { + "epoch": 1.776, + "grad_norm": 2.611490249633789, + "kl": 4.028485298156738, + "learning_rate": 6.266666666666667e-07, + "logits/chosen": 30264723.2, + "logits/rejected": 30910204.8, + "logps/chosen": -174.413818359375, + "logps/rejected": -188.7349853515625, + "loss": 0.4568845272064209, + "rewards/chosen": 0.035149258375167844, + "rewards/margins": 0.44202625155448916, + "rewards/rejected": -0.4068769931793213, + "step": 1110 + }, + { + "epoch": 1.792, + "grad_norm": 2.5337791442871094, + "kl": 3.7211251258850098, + "learning_rate": 5.822222222222223e-07, + "logits/chosen": 26241504.0, + "logits/rejected": 23940459.2, + "logps/chosen": -151.94140625, + "logps/rejected": -115.9620849609375, + "loss": 0.4610316276550293, + "rewards/chosen": 0.3789072036743164, + "rewards/margins": 0.3300951421260834, + "rewards/rejected": 0.04881206154823303, + "step": 1120 + }, + { + "epoch": 1.808, + "grad_norm": 1.5708776712417603, + "kl": 3.1767425537109375, + "learning_rate": 5.377777777777779e-07, + "logits/chosen": 42052073.6, + "logits/rejected": 39844899.2, + "logps/chosen": -169.72109375, + "logps/rejected": -151.2173583984375, + "loss": 0.4554294109344482, + "rewards/chosen": 0.2683689832687378, + "rewards/margins": 0.47904453277587894, + "rewards/rejected": -0.21067554950714112, + "step": 1130 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 2.6482839584350586, + "kl": 2.7335541248321533, + "learning_rate": 4.933333333333334e-07, + "logits/chosen": 37147670.4, + "logits/rejected": 37500460.8, + "logps/chosen": -143.32291259765626, + "logps/rejected": -160.43739013671876, + "loss": 0.46424403190612795, + "rewards/chosen": 0.15881721973419188, + "rewards/margins": 0.3237978339195251, + "rewards/rejected": -0.16498061418533325, + "step": 1140 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 2.6286239624023438, + "kl": 2.9139907360076904, + "learning_rate": 4.488888888888889e-07, + "logits/chosen": 22594232.0, + "logits/rejected": 20993777.6, + "logps/chosen": -151.5849365234375, + "logps/rejected": -194.24169921875, + "loss": 0.4465163230895996, + "rewards/chosen": 0.19300849437713624, + "rewards/margins": 0.5745944738388062, + "rewards/rejected": -0.38158597946166994, + "step": 1150 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.375143051147461, + "eval_logits/chosen": 29445828.608, + "eval_logits/rejected": 29209438.208, + "eval_logps/chosen": -155.187921875, + "eval_logps/rejected": -149.93753125, + "eval_loss": 0.4925803244113922, + "eval_rewards/chosen": 0.06021894836425781, + "eval_rewards/margins": 0.057583449840545656, + "eval_rewards/rejected": 0.0026354985237121583, + "eval_runtime": 211.2149, + "eval_samples_per_second": 4.735, + "eval_steps_per_second": 2.367, + "step": 1150 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1150/training_args.bin b/v5/KTO/KTO_5k/lora/checkpoint-1150/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4b0593b1fb99fd0ef500fd051a7332500d83f31 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1150/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb38612e474b2b75329a529c7bd7e818140a323dc202e6e5201e7c6648635d30 +size 5649 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/README.md b/v5/KTO/KTO_5k/lora/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/adapter_config.json b/v5/KTO/KTO_5k/lora/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a2c7fb826a92b8c340dc085ae4ee70addde7e565 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/adapter_model.safetensors b/v5/KTO/KTO_5k/lora/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9b27ca00a7938fc9d1570995dfce17421a5ede1e --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50c3b8f59060de10b6325c2c58dca2ebfd3ba71f257febc012898902b91ab380 +size 180385008 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/chat_template.jinja b/v5/KTO/KTO_5k/lora/checkpoint-1200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/optimizer.pt b/v5/KTO/KTO_5k/lora/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..42cf132873fa4417e20e589d1bb00d466976302a --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f73f9c3f0e0e1c172a4fa581df5aba22b92443276176736502a2c95095c6379a +size 360902475 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/rng_state.pth b/v5/KTO/KTO_5k/lora/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..13e11a54e352d8a7149df1f88c1b023ee9973959 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7affab63b271ed0f59a5b53056fc0a581226a41dcdf2fc2b80b669e7c3cf714 +size 14645 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/scaler.pt b/v5/KTO/KTO_5k/lora/checkpoint-1200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3529b9e1021ddc95e3af7b2d72233fab602a2d19 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18783150ac09b6b81cea5af47876a10bfe5f36c3d76aca4ffce5382bdfaf7b28 +size 1383 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/scheduler.pt b/v5/KTO/KTO_5k/lora/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..16288dae153973b38b84616b601cb8389affa442 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84d1c98b7b9f7b3b2ee683e65fc0ab14b23bb5caf8d8b157bf55b37ce1bf3f2b +size 1465 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/tokenizer.json b/v5/KTO/KTO_5k/lora/checkpoint-1200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/tokenizer_config.json b/v5/KTO/KTO_5k/lora/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/trainer_state.json b/v5/KTO/KTO_5k/lora/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f2b268f8b43d21bf164d764899e17a4265fce89a --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/trainer_state.json @@ -0,0 +1,2218 @@ +{ + "best_global_step": 1150, + "best_metric": 0.057583449840545656, + "best_model_checkpoint": "output/lora/checkpoint-1150", + "epoch": 1.92, + "eval_steps": 50, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 2.1381592750549316, + "kl": 0.01649792119860649, + "learning_rate": 3.6e-07, + "logits/chosen": 28205651.2, + "logits/rejected": 29669123.2, + "logps/chosen": -150.3176025390625, + "logps/rejected": -130.385302734375, + "loss": 0.4999302864074707, + "rewards/chosen": 0.0005133629310876131, + "rewards/margins": 0.0005571890709688887, + "rewards/rejected": -4.382613988127559e-05, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 1.9010003805160522, + "kl": 0.020609140396118164, + "learning_rate": 7.6e-07, + "logits/chosen": 52049945.6, + "logits/rejected": 51142873.6, + "logps/chosen": -140.97896728515624, + "logps/rejected": -153.13775634765625, + "loss": 0.49991936683654786, + "rewards/chosen": 0.0004100656602531672, + "rewards/margins": 0.0006456565577536821, + "rewards/rejected": -0.00023559089750051497, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 1.82417893409729, + "kl": 0.01093914546072483, + "learning_rate": 1.1600000000000001e-06, + "logits/chosen": 32594544.0, + "logits/rejected": 32500614.4, + "logps/chosen": -133.37647705078126, + "logps/rejected": -142.03988037109374, + "loss": 0.5000736713409424, + "rewards/chosen": -0.002948903851211071, + "rewards/margins": -0.0005901286378502844, + "rewards/rejected": -0.0023587752133607865, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 1.6120444536209106, + "kl": 0.010594606399536133, + "learning_rate": 1.56e-06, + "logits/chosen": 41530739.2, + "logits/rejected": 42298668.8, + "logps/chosen": -145.56357421875, + "logps/rejected": -147.24957275390625, + "loss": 0.5001413822174072, + "rewards/chosen": -0.0032692715525627137, + "rewards/margins": -0.0011312337592244148, + "rewards/rejected": -0.002138037793338299, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 1.3366488218307495, + "kl": 0.01903839036822319, + "learning_rate": 1.9600000000000003e-06, + "logits/chosen": 40986995.2, + "logits/rejected": 42846995.2, + "logps/chosen": -142.60504150390625, + "logps/rejected": -155.85986328125, + "loss": 0.500172233581543, + "rewards/chosen": -0.002861199527978897, + "rewards/margins": -0.0013772012665867806, + "rewards/rejected": -0.0014839982613921165, + "step": 50 + }, + { + "epoch": 0.08, + "eval_kl": 0.03923250734806061, + "eval_logits/chosen": 37010317.312, + "eval_logits/rejected": 36932890.624, + "eval_logps/chosen": -155.7828125, + "eval_logps/rejected": -149.957953125, + "eval_loss": 0.4999832808971405, + "eval_rewards/chosen": 0.0007290065288543702, + "eval_rewards/margins": 0.00013377457857131968, + "eval_rewards/rejected": 0.0005952319502830505, + "eval_runtime": 211.9346, + "eval_samples_per_second": 4.718, + "eval_steps_per_second": 2.359, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 1.2370270490646362, + "kl": 0.042702484875917435, + "learning_rate": 2.3600000000000003e-06, + "logits/chosen": 34543625.6, + "logits/rejected": 31963372.8, + "logps/chosen": -106.8656494140625, + "logps/rejected": -115.66375732421875, + "loss": 0.499523401260376, + "rewards/chosen": 0.00055726058781147, + "rewards/margins": 0.003814282640814781, + "rewards/rejected": -0.003257022053003311, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 1.6257128715515137, + "kl": 0.025073956698179245, + "learning_rate": 2.7600000000000003e-06, + "logits/chosen": 45796806.4, + "logits/rejected": 44777859.2, + "logps/chosen": -167.59599609375, + "logps/rejected": -176.96552734375, + "loss": 0.4996492862701416, + "rewards/chosen": -0.017988091707229613, + "rewards/margins": 0.0028077125549316427, + "rewards/rejected": -0.020795804262161256, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 1.9464651346206665, + "kl": 0.032842040061950684, + "learning_rate": 3.1600000000000002e-06, + "logits/chosen": 39214451.2, + "logits/rejected": 39159056.0, + "logps/chosen": -156.45654296875, + "logps/rejected": -164.982177734375, + "loss": 0.4997319221496582, + "rewards/chosen": -0.012030959129333496, + "rewards/margins": 0.0021469339728355415, + "rewards/rejected": -0.014177893102169038, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 1.664642572402954, + "kl": 0.13523416221141815, + "learning_rate": 3.5600000000000002e-06, + "logits/chosen": 41776288.0, + "logits/rejected": 41958195.2, + "logps/chosen": -171.09915771484376, + "logps/rejected": -160.55670166015625, + "loss": 0.4979794979095459, + "rewards/chosen": 0.0015864329412579536, + "rewards/margins": 0.016174097545444965, + "rewards/rejected": -0.014587664604187011, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 2.069972515106201, + "kl": 0.2824670374393463, + "learning_rate": 3.96e-06, + "logits/chosen": 26015552.0, + "logits/rejected": 25218312.0, + "logps/chosen": -135.43885498046876, + "logps/rejected": -166.34676513671874, + "loss": 0.4990866184234619, + "rewards/chosen": 0.013756407797336579, + "rewards/margins": 0.007336309552192688, + "rewards/rejected": 0.00642009824514389, + "step": 100 + }, + { + "epoch": 0.16, + "eval_kl": 0.26922619342803955, + "eval_logits/chosen": 36658610.176, + "eval_logits/rejected": 36587118.592, + "eval_logps/chosen": -155.710015625, + "eval_logps/rejected": -149.88709375, + "eval_loss": 0.49995896220207214, + "eval_rewards/chosen": 0.008008602142333985, + "eval_rewards/margins": 0.0003274984359741221, + "eval_rewards/rejected": 0.007681103706359863, + "eval_runtime": 211.5606, + "eval_samples_per_second": 4.727, + "eval_steps_per_second": 2.363, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 1.6185622215270996, + "kl": 0.2616101801395416, + "learning_rate": 4.360000000000001e-06, + "logits/chosen": 36089392.0, + "logits/rejected": 35749190.4, + "logps/chosen": -131.4165283203125, + "logps/rejected": -136.50457763671875, + "loss": 0.4985805511474609, + "rewards/chosen": 0.014740067720413207, + "rewards/margins": 0.011374564468860626, + "rewards/rejected": 0.003365503251552582, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 1.6586893796920776, + "kl": 0.3709116578102112, + "learning_rate": 4.76e-06, + "logits/chosen": 44621308.8, + "logits/rejected": 44430220.8, + "logps/chosen": -163.35196533203126, + "logps/rejected": -134.7572998046875, + "loss": 0.500658369064331, + "rewards/chosen": 0.003282211720943451, + "rewards/margins": -0.0052413523197174065, + "rewards/rejected": 0.008523564040660857, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 1.287909746170044, + "kl": 0.7775768041610718, + "learning_rate": 4.982222222222222e-06, + "logits/chosen": 37691072.0, + "logits/rejected": 37058822.4, + "logps/chosen": -162.92593994140626, + "logps/rejected": -140.751171875, + "loss": 0.5000242233276367, + "rewards/chosen": 0.06439838409423829, + "rewards/margins": -0.00020327568054198664, + "rewards/rejected": 0.06460165977478027, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 1.4811052083969116, + "kl": 1.0720264911651611, + "learning_rate": 4.937777777777778e-06, + "logits/chosen": 39377881.6, + "logits/rejected": 41394512.0, + "logps/chosen": -142.946826171875, + "logps/rejected": -158.21910400390624, + "loss": 0.501332950592041, + "rewards/chosen": 0.0980217456817627, + "rewards/margins": -0.010658252239227298, + "rewards/rejected": 0.10867999792098999, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 1.6531552076339722, + "kl": 1.1869957447052002, + "learning_rate": 4.893333333333334e-06, + "logits/chosen": 39561158.4, + "logits/rejected": 40957766.4, + "logps/chosen": -130.68802490234376, + "logps/rejected": -117.63740234375, + "loss": 0.4958030223846436, + "rewards/chosen": 0.12626923322677613, + "rewards/margins": 0.0335154950618744, + "rewards/rejected": 0.09275373816490173, + "step": 150 + }, + { + "epoch": 0.24, + "eval_kl": 1.516471266746521, + "eval_logits/chosen": 37183991.808, + "eval_logits/rejected": 37034459.136, + "eval_logps/chosen": -154.240734375, + "eval_logps/rejected": -148.51396875, + "eval_loss": 0.49876031279563904, + "eval_rewards/chosen": 0.154937744140625, + "eval_rewards/margins": 0.009945236206054697, + "eval_rewards/rejected": 0.1449925079345703, + "eval_runtime": 211.5801, + "eval_samples_per_second": 4.726, + "eval_steps_per_second": 2.363, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 1.6023988723754883, + "kl": 1.7867761850357056, + "learning_rate": 4.848888888888889e-06, + "logits/chosen": 33342684.8, + "logits/rejected": 33521395.2, + "logps/chosen": -143.76165771484375, + "logps/rejected": -147.3908447265625, + "loss": 0.49936847686767577, + "rewards/chosen": 0.17880032062530518, + "rewards/margins": 0.005077493190765392, + "rewards/rejected": 0.1737228274345398, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 1.3449970483779907, + "kl": 2.119621753692627, + "learning_rate": 4.804444444444445e-06, + "logits/chosen": 36148233.6, + "logits/rejected": 38120403.2, + "logps/chosen": -137.6266845703125, + "logps/rejected": -149.0926025390625, + "loss": 0.4978325843811035, + "rewards/chosen": 0.22067615985870362, + "rewards/margins": 0.01742777824401856, + "rewards/rejected": 0.20324838161468506, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 1.9490461349487305, + "kl": 2.847586154937744, + "learning_rate": 4.76e-06, + "logits/chosen": 43064544.0, + "logits/rejected": 43962390.4, + "logps/chosen": -143.948681640625, + "logps/rejected": -169.59462890625, + "loss": 0.5009199619293213, + "rewards/chosen": 0.28105921745300294, + "rewards/margins": -0.007398319244384777, + "rewards/rejected": 0.2884575366973877, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 1.550258755683899, + "kl": 2.792905330657959, + "learning_rate": 4.715555555555556e-06, + "logits/chosen": 40556035.2, + "logits/rejected": 42005014.4, + "logps/chosen": -123.0137451171875, + "logps/rejected": -136.06087646484374, + "loss": 0.49814720153808595, + "rewards/chosen": 0.2867321491241455, + "rewards/margins": 0.014883160591125488, + "rewards/rejected": 0.27184898853302003, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 2.0262064933776855, + "kl": 3.2026119232177734, + "learning_rate": 4.6711111111111115e-06, + "logits/chosen": 45602153.6, + "logits/rejected": 46734368.0, + "logps/chosen": -149.51041259765626, + "logps/rejected": -171.689697265625, + "loss": 0.5007596492767334, + "rewards/chosen": 0.3171941041946411, + "rewards/margins": -0.006134414672851585, + "rewards/rejected": 0.3233285188674927, + "step": 200 + }, + { + "epoch": 0.32, + "eval_kl": 2.7445971965789795, + "eval_logits/chosen": 37825519.616, + "eval_logits/rejected": 37620092.928, + "eval_logps/chosen": -152.96146875, + "eval_logps/rejected": -147.30584375, + "eval_loss": 0.4978778660297394, + "eval_rewards/chosen": 0.2828658447265625, + "eval_rewards/margins": 0.017060607910156234, + "eval_rewards/rejected": 0.2658052368164063, + "eval_runtime": 211.7592, + "eval_samples_per_second": 4.722, + "eval_steps_per_second": 2.361, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 1.9593161344528198, + "kl": 2.349473714828491, + "learning_rate": 4.626666666666667e-06, + "logits/chosen": 32432160.0, + "logits/rejected": 32459036.8, + "logps/chosen": -140.2314453125, + "logps/rejected": -132.20750732421874, + "loss": 0.5013795852661133, + "rewards/chosen": 0.22654273509979247, + "rewards/margins": -0.011085557937622087, + "rewards/rejected": 0.23762829303741456, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 1.6649645566940308, + "kl": 2.177248239517212, + "learning_rate": 4.582222222222223e-06, + "logits/chosen": 34417107.2, + "logits/rejected": 35237868.8, + "logps/chosen": -101.06302490234376, + "logps/rejected": -125.54276123046876, + "loss": 0.5004647254943848, + "rewards/chosen": 0.20845344066619872, + "rewards/margins": -0.003761553764343256, + "rewards/rejected": 0.21221499443054198, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 1.7191715240478516, + "kl": 1.6887900829315186, + "learning_rate": 4.537777777777778e-06, + "logits/chosen": 46128198.4, + "logits/rejected": 45076755.2, + "logps/chosen": -185.54364013671875, + "logps/rejected": -163.69344482421874, + "loss": 0.5005609512329101, + "rewards/chosen": 0.1496042490005493, + "rewards/margins": -0.00507398843765261, + "rewards/rejected": 0.15467823743820192, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 1.6625752449035645, + "kl": 1.9195034503936768, + "learning_rate": 4.493333333333333e-06, + "logits/chosen": 46306035.2, + "logits/rejected": 46461657.6, + "logps/chosen": -176.698291015625, + "logps/rejected": -166.738232421875, + "loss": 0.5003566741943359, + "rewards/chosen": 0.1773249626159668, + "rewards/margins": -0.0034087777137756237, + "rewards/rejected": 0.18073374032974243, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 2.08888578414917, + "kl": 1.9199730157852173, + "learning_rate": 4.448888888888889e-06, + "logits/chosen": 35625705.6, + "logits/rejected": 33570604.8, + "logps/chosen": -176.07003173828124, + "logps/rejected": -145.513232421875, + "loss": 0.4901569366455078, + "rewards/chosen": 0.2223743438720703, + "rewards/margins": 0.078991961479187, + "rewards/rejected": 0.1433823823928833, + "step": 250 + }, + { + "epoch": 0.4, + "eval_kl": 1.8728376626968384, + "eval_logits/chosen": 37098446.848, + "eval_logits/rejected": 36913745.92, + "eval_logps/chosen": -153.93390625, + "eval_logps/rejected": -148.244546875, + "eval_loss": 0.4982966184616089, + "eval_rewards/chosen": 0.18562066650390624, + "eval_rewards/margins": 0.013686828613281243, + "eval_rewards/rejected": 0.171933837890625, + "eval_runtime": 212.9007, + "eval_samples_per_second": 4.697, + "eval_steps_per_second": 2.349, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 1.585038423538208, + "kl": 1.6053378582000732, + "learning_rate": 4.404444444444445e-06, + "logits/chosen": 28695961.6, + "logits/rejected": 27433849.6, + "logps/chosen": -129.23218994140626, + "logps/rejected": -132.16243896484374, + "loss": 0.4940999984741211, + "rewards/chosen": 0.16844781637191772, + "rewards/margins": 0.04744429588317871, + "rewards/rejected": 0.12100352048873901, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 2.185063362121582, + "kl": 2.0674309730529785, + "learning_rate": 4.360000000000001e-06, + "logits/chosen": 37351123.2, + "logits/rejected": 37435251.2, + "logps/chosen": -157.37740478515624, + "logps/rejected": -152.574072265625, + "loss": 0.4948906421661377, + "rewards/chosen": 0.18760323524475098, + "rewards/margins": 0.04087167978286743, + "rewards/rejected": 0.14673155546188354, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 1.842838168144226, + "kl": 1.880658745765686, + "learning_rate": 4.315555555555556e-06, + "logits/chosen": 31317900.8, + "logits/rejected": 25257848.0, + "logps/chosen": -168.01707763671874, + "logps/rejected": -140.544775390625, + "loss": 0.49900665283203127, + "rewards/chosen": 0.16304240226745606, + "rewards/margins": 0.008140754699707042, + "rewards/rejected": 0.15490164756774902, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 1.4501603841781616, + "kl": 2.7841668128967285, + "learning_rate": 4.271111111111111e-06, + "logits/chosen": 29001040.0, + "logits/rejected": 28279708.8, + "logps/chosen": -144.5850830078125, + "logps/rejected": -147.846533203125, + "loss": 0.48676314353942873, + "rewards/chosen": 0.322883677482605, + "rewards/margins": 0.10617766380310059, + "rewards/rejected": 0.2167060136795044, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 2.2522799968719482, + "kl": 2.674313545227051, + "learning_rate": 4.226666666666667e-06, + "logits/chosen": 39827664.0, + "logits/rejected": 40046345.6, + "logps/chosen": -168.2622314453125, + "logps/rejected": -181.0140869140625, + "loss": 0.4900949001312256, + "rewards/chosen": 0.22891669273376464, + "rewards/margins": 0.08893496990203856, + "rewards/rejected": 0.13998172283172608, + "step": 300 + }, + { + "epoch": 0.48, + "eval_kl": 2.443347215652466, + "eval_logits/chosen": 35836559.36, + "eval_logits/rejected": 35640664.064, + "eval_logps/chosen": -153.67809375, + "eval_logps/rejected": -147.97525, + "eval_loss": 0.49841761589050293, + "eval_rewards/chosen": 0.21120219421386718, + "eval_rewards/margins": 0.012337142944335938, + "eval_rewards/rejected": 0.19886505126953125, + "eval_runtime": 212.5056, + "eval_samples_per_second": 4.706, + "eval_steps_per_second": 2.353, + "step": 300 + }, + { + "epoch": 0.496, + "grad_norm": 1.8431649208068848, + "kl": 2.4258689880371094, + "learning_rate": 4.182222222222222e-06, + "logits/chosen": 31914422.4, + "logits/rejected": 32899113.6, + "logps/chosen": -146.6043701171875, + "logps/rejected": -155.2097412109375, + "loss": 0.5078470706939697, + "rewards/chosen": 0.1587265133857727, + "rewards/margins": -0.06373668909072877, + "rewards/rejected": 0.22246320247650148, + "step": 310 + }, + { + "epoch": 0.512, + "grad_norm": 2.756876230239868, + "kl": 1.3744744062423706, + "learning_rate": 4.1377777777777784e-06, + "logits/chosen": 35366806.4, + "logits/rejected": 33021190.4, + "logps/chosen": -161.6685302734375, + "logps/rejected": -134.85858154296875, + "loss": 0.5033583641052246, + "rewards/chosen": 0.047298938035964966, + "rewards/margins": -0.02729131579399109, + "rewards/rejected": 0.07459025382995606, + "step": 320 + }, + { + "epoch": 0.528, + "grad_norm": 1.6488304138183594, + "kl": 1.8090463876724243, + "learning_rate": 4.093333333333334e-06, + "logits/chosen": 44740480.0, + "logits/rejected": 41858704.0, + "logps/chosen": -153.429052734375, + "logps/rejected": -145.27188720703126, + "loss": 0.4838115692138672, + "rewards/chosen": 0.16980862617492676, + "rewards/margins": 0.13209896087646483, + "rewards/rejected": 0.03770966529846191, + "step": 330 + }, + { + "epoch": 0.544, + "grad_norm": 1.8334484100341797, + "kl": 1.624638557434082, + "learning_rate": 4.04888888888889e-06, + "logits/chosen": 36759168.0, + "logits/rejected": 37663475.2, + "logps/chosen": -135.3531005859375, + "logps/rejected": -144.48751220703124, + "loss": 0.49995737075805663, + "rewards/chosen": 0.11444320678710937, + "rewards/margins": -0.002312994003295904, + "rewards/rejected": 0.11675620079040527, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 1.7367093563079834, + "kl": 2.787097930908203, + "learning_rate": 4.004444444444445e-06, + "logits/chosen": 35158457.6, + "logits/rejected": 34068956.8, + "logps/chosen": -134.371630859375, + "logps/rejected": -169.529150390625, + "loss": 0.5052088737487793, + "rewards/chosen": 0.2354206323623657, + "rewards/margins": -0.04196789264678957, + "rewards/rejected": 0.2773885250091553, + "step": 350 + }, + { + "epoch": 0.56, + "eval_kl": 2.2334909439086914, + "eval_logits/chosen": 35659685.888, + "eval_logits/rejected": 35460202.496, + "eval_logps/chosen": -154.1329375, + "eval_logps/rejected": -148.447578125, + "eval_loss": 0.4981686472892761, + "eval_rewards/chosen": 0.16571835327148438, + "eval_rewards/margins": 0.014086547851562492, + "eval_rewards/rejected": 0.15163180541992188, + "eval_runtime": 210.7265, + "eval_samples_per_second": 4.745, + "eval_steps_per_second": 2.373, + "step": 350 + }, + { + "epoch": 0.576, + "grad_norm": 2.0100009441375732, + "kl": 2.0726542472839355, + "learning_rate": 3.96e-06, + "logits/chosen": 37896438.4, + "logits/rejected": 35973884.8, + "logps/chosen": -147.7560546875, + "logps/rejected": -113.3482177734375, + "loss": 0.48930912017822265, + "rewards/chosen": 0.16988544464111327, + "rewards/margins": 0.08941116333007812, + "rewards/rejected": 0.08047428131103515, + "step": 360 + }, + { + "epoch": 0.592, + "grad_norm": 1.6454890966415405, + "kl": 2.1270346641540527, + "learning_rate": 3.9155555555555554e-06, + "logits/chosen": 34360019.2, + "logits/rejected": 34796140.8, + "logps/chosen": -147.8162353515625, + "logps/rejected": -151.31751708984376, + "loss": 0.5053381443023681, + "rewards/chosen": 0.08656104803085327, + "rewards/margins": -0.053690028190612804, + "rewards/rejected": 0.14025107622146607, + "step": 370 + }, + { + "epoch": 0.608, + "grad_norm": 2.241021156311035, + "kl": 2.6244335174560547, + "learning_rate": 3.8711111111111115e-06, + "logits/chosen": 31317593.6, + "logits/rejected": 32079606.4, + "logps/chosen": -155.6551513671875, + "logps/rejected": -167.29332275390624, + "loss": 0.5026909828186035, + "rewards/chosen": 0.18726186752319335, + "rewards/margins": -0.035893630981445324, + "rewards/rejected": 0.22315549850463867, + "step": 380 + }, + { + "epoch": 0.624, + "grad_norm": 1.744504690170288, + "kl": 2.1470510959625244, + "learning_rate": 3.826666666666667e-06, + "logits/chosen": 27632387.2, + "logits/rejected": 26959638.4, + "logps/chosen": -176.283740234375, + "logps/rejected": -151.25721435546876, + "loss": 0.48785767555236814, + "rewards/chosen": 0.16686009168624877, + "rewards/margins": 0.09664145708084106, + "rewards/rejected": 0.07021863460540771, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 2.052776575088501, + "kl": 1.9149287939071655, + "learning_rate": 3.782222222222223e-06, + "logits/chosen": 38708992.0, + "logits/rejected": 36759104.0, + "logps/chosen": -150.0161376953125, + "logps/rejected": -137.032666015625, + "loss": 0.48592147827148435, + "rewards/chosen": 0.13577580451965332, + "rewards/margins": 0.11200562268495559, + "rewards/rejected": 0.023770181834697722, + "step": 400 + }, + { + "epoch": 0.64, + "eval_kl": 1.9213757514953613, + "eval_logits/chosen": 34004680.704, + "eval_logits/rejected": 33861222.4, + "eval_logps/chosen": -155.130359375, + "eval_logps/rejected": -149.4599375, + "eval_loss": 0.4978408217430115, + "eval_rewards/chosen": 0.06597476959228515, + "eval_rewards/margins": 0.015579383850097654, + "eval_rewards/rejected": 0.0503953857421875, + "eval_runtime": 215.3729, + "eval_samples_per_second": 4.643, + "eval_steps_per_second": 2.322, + "step": 400 + }, + { + "epoch": 0.656, + "grad_norm": 1.505508542060852, + "kl": 2.1655712127685547, + "learning_rate": 3.737777777777778e-06, + "logits/chosen": 39197555.2, + "logits/rejected": 36953779.2, + "logps/chosen": -148.135498046875, + "logps/rejected": -150.240234375, + "loss": 0.48557405471801757, + "rewards/chosen": 0.17278852462768554, + "rewards/margins": 0.12511927187442778, + "rewards/rejected": 0.04766925275325775, + "step": 410 + }, + { + "epoch": 0.672, + "grad_norm": 2.4999916553497314, + "kl": 1.624707579612732, + "learning_rate": 3.6933333333333337e-06, + "logits/chosen": 27496678.4, + "logits/rejected": 26063419.2, + "logps/chosen": -165.584228515625, + "logps/rejected": -133.94266357421876, + "loss": 0.49836010932922364, + "rewards/chosen": -0.088151615858078, + "rewards/margins": -0.013219672441482547, + "rewards/rejected": -0.07493194341659545, + "step": 420 + }, + { + "epoch": 0.688, + "grad_norm": 1.6226006746292114, + "kl": 1.6943508386611938, + "learning_rate": 3.648888888888889e-06, + "logits/chosen": 28216393.6, + "logits/rejected": 26552371.2, + "logps/chosen": -172.898681640625, + "logps/rejected": -121.724560546875, + "loss": 0.4999542236328125, + "rewards/chosen": -0.05118745565414429, + "rewards/margins": -0.011095824837684634, + "rewards/rejected": -0.04009163081645965, + "step": 430 + }, + { + "epoch": 0.704, + "grad_norm": 2.3899142742156982, + "kl": 2.0509917736053467, + "learning_rate": 3.604444444444445e-06, + "logits/chosen": 35301628.8, + "logits/rejected": 36549334.4, + "logps/chosen": -150.047705078125, + "logps/rejected": -170.48095703125, + "loss": 0.504191255569458, + "rewards/chosen": 0.07901791334152222, + "rewards/margins": -0.03352437019348144, + "rewards/rejected": 0.11254228353500366, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 1.915216088294983, + "kl": 2.2649385929107666, + "learning_rate": 3.5600000000000002e-06, + "logits/chosen": 27320227.2, + "logits/rejected": 30460025.6, + "logps/chosen": -100.75572509765625, + "logps/rejected": -155.90166015625, + "loss": 0.5083817481994629, + "rewards/chosen": 0.032135069370269775, + "rewards/margins": -0.08187388181686402, + "rewards/rejected": 0.11400895118713379, + "step": 450 + }, + { + "epoch": 0.72, + "eval_kl": 2.8508355617523193, + "eval_logits/chosen": 33711845.376, + "eval_logits/rejected": 33509806.08, + "eval_logps/chosen": -153.81721875, + "eval_logps/rejected": -148.221515625, + "eval_loss": 0.49701353907585144, + "eval_rewards/chosen": 0.19728890991210937, + "eval_rewards/margins": 0.02305152893066406, + "eval_rewards/rejected": 0.1742373809814453, + "eval_runtime": 210.8637, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 2.371, + "step": 450 + }, + { + "epoch": 0.736, + "grad_norm": 2.420854091644287, + "kl": 3.208031415939331, + "learning_rate": 3.515555555555556e-06, + "logits/chosen": 39778067.2, + "logits/rejected": 36642828.8, + "logps/chosen": -164.1375732421875, + "logps/rejected": -180.76805419921874, + "loss": 0.49746012687683105, + "rewards/chosen": 0.20578148365020751, + "rewards/margins": 0.009352195262908924, + "rewards/rejected": 0.1964292883872986, + "step": 460 + }, + { + "epoch": 0.752, + "grad_norm": 1.615047812461853, + "kl": 2.750185489654541, + "learning_rate": 3.471111111111111e-06, + "logits/chosen": 47119552.0, + "logits/rejected": 44422067.2, + "logps/chosen": -175.47635498046876, + "logps/rejected": -178.90897216796876, + "loss": 0.5061192512512207, + "rewards/chosen": -0.03337647318840027, + "rewards/margins": -0.035451799631118774, + "rewards/rejected": 0.0020753264427185057, + "step": 470 + }, + { + "epoch": 0.768, + "grad_norm": 2.1674153804779053, + "kl": 1.8968321084976196, + "learning_rate": 3.426666666666667e-06, + "logits/chosen": 37006256.0, + "logits/rejected": 36102249.6, + "logps/chosen": -153.7917724609375, + "logps/rejected": -165.3869140625, + "loss": 0.47748627662658694, + "rewards/chosen": 0.003384724259376526, + "rewards/margins": 0.18434576094150543, + "rewards/rejected": -0.1809610366821289, + "step": 480 + }, + { + "epoch": 0.784, + "grad_norm": 1.6082895994186401, + "kl": 2.0536258220672607, + "learning_rate": 3.3822222222222224e-06, + "logits/chosen": 22186331.2, + "logits/rejected": 20350340.8, + "logps/chosen": -151.07054443359374, + "logps/rejected": -150.1708740234375, + "loss": 0.4888655185699463, + "rewards/chosen": -0.01819072961807251, + "rewards/margins": 0.14195933341979983, + "rewards/rejected": -0.16015006303787233, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 2.310601234436035, + "kl": 3.129138469696045, + "learning_rate": 3.337777777777778e-06, + "logits/chosen": 37795449.6, + "logits/rejected": 35220675.2, + "logps/chosen": -181.1957763671875, + "logps/rejected": -132.0344970703125, + "loss": 0.48585872650146483, + "rewards/chosen": 0.3015714168548584, + "rewards/margins": 0.12076919078826903, + "rewards/rejected": 0.18080222606658936, + "step": 500 + }, + { + "epoch": 0.8, + "eval_kl": 2.6557276248931885, + "eval_logits/chosen": 32642598.912, + "eval_logits/rejected": 32467406.848, + "eval_logps/chosen": -154.477703125, + "eval_logps/rejected": -148.869640625, + "eval_loss": 0.49703630805015564, + "eval_rewards/chosen": 0.13124107360839843, + "eval_rewards/margins": 0.021815811157226556, + "eval_rewards/rejected": 0.10942526245117187, + "eval_runtime": 211.0284, + "eval_samples_per_second": 4.739, + "eval_steps_per_second": 2.369, + "step": 500 + }, + { + "epoch": 0.816, + "grad_norm": 2.234365224838257, + "kl": 1.7492139339447021, + "learning_rate": 3.2933333333333333e-06, + "logits/chosen": 39694166.4, + "logits/rejected": 40852192.0, + "logps/chosen": -148.5533447265625, + "logps/rejected": -162.5984375, + "loss": 0.4872898101806641, + "rewards/chosen": 0.013645458221435546, + "rewards/margins": 0.10560911893844604, + "rewards/rejected": -0.0919636607170105, + "step": 510 + }, + { + "epoch": 0.832, + "grad_norm": 2.3625874519348145, + "kl": 3.2840332984924316, + "learning_rate": 3.2488888888888894e-06, + "logits/chosen": 37915008.0, + "logits/rejected": 36570806.4, + "logps/chosen": -157.36220703125, + "logps/rejected": -164.6677734375, + "loss": 0.4821781635284424, + "rewards/chosen": 0.32392158508300783, + "rewards/margins": 0.17874917984008792, + "rewards/rejected": 0.1451724052429199, + "step": 520 + }, + { + "epoch": 0.848, + "grad_norm": 2.0702106952667236, + "kl": 2.1847689151763916, + "learning_rate": 3.2044444444444446e-06, + "logits/chosen": 33132352.0, + "logits/rejected": 32190089.6, + "logps/chosen": -144.52484130859375, + "logps/rejected": -179.6159912109375, + "loss": 0.5058434486389161, + "rewards/chosen": -0.07189960479736328, + "rewards/margins": -0.031065639853477475, + "rewards/rejected": -0.0408339649438858, + "step": 530 + }, + { + "epoch": 0.864, + "grad_norm": 2.4268131256103516, + "kl": 3.71620512008667, + "learning_rate": 3.1600000000000002e-06, + "logits/chosen": 27596662.4, + "logits/rejected": 26981862.4, + "logps/chosen": -150.15238037109376, + "logps/rejected": -159.4679443359375, + "loss": 0.46803932189941405, + "rewards/chosen": 0.3402720928192139, + "rewards/margins": 0.2841139912605286, + "rewards/rejected": 0.0561581015586853, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 2.0024309158325195, + "kl": 2.6835620403289795, + "learning_rate": 3.1155555555555555e-06, + "logits/chosen": 32613926.4, + "logits/rejected": 33945558.4, + "logps/chosen": -157.42744140625, + "logps/rejected": -171.3582275390625, + "loss": 0.49735183715820314, + "rewards/chosen": -0.0414805144071579, + "rewards/margins": 0.10379274189472197, + "rewards/rejected": -0.14527325630187987, + "step": 550 + }, + { + "epoch": 0.88, + "eval_kl": 2.534240245819092, + "eval_logits/chosen": 30600110.08, + "eval_logits/rejected": 30418843.648, + "eval_logps/chosen": -155.587328125, + "eval_logps/rejected": -149.98478125, + "eval_loss": 0.49648168683052063, + "eval_rewards/chosen": 0.02027870178222656, + "eval_rewards/margins": 0.022367488861083983, + "eval_rewards/rejected": -0.002088787078857422, + "eval_runtime": 210.9141, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 2.371, + "step": 550 + }, + { + "epoch": 0.896, + "grad_norm": 2.065328598022461, + "kl": 2.3084464073181152, + "learning_rate": 3.0711111111111115e-06, + "logits/chosen": 26840976.0, + "logits/rejected": 25480748.8, + "logps/chosen": -165.23271484375, + "logps/rejected": -152.96129150390624, + "loss": 0.49146738052368166, + "rewards/chosen": 0.038733655214309694, + "rewards/margins": 0.05340470671653748, + "rewards/rejected": -0.014671051502227783, + "step": 560 + }, + { + "epoch": 0.912, + "grad_norm": 1.7352122068405151, + "kl": 1.976243019104004, + "learning_rate": 3.0266666666666668e-06, + "logits/chosen": 30859900.8, + "logits/rejected": 29282716.8, + "logps/chosen": -153.5951904296875, + "logps/rejected": -138.8325927734375, + "loss": 0.48922386169433596, + "rewards/chosen": -0.16246808767318727, + "rewards/margins": 0.173832905292511, + "rewards/rejected": -0.33630099296569826, + "step": 570 + }, + { + "epoch": 0.928, + "grad_norm": 1.950156331062317, + "kl": 3.2299084663391113, + "learning_rate": 2.9822222222222224e-06, + "logits/chosen": 37093142.4, + "logits/rejected": 35527721.6, + "logps/chosen": -151.84559326171876, + "logps/rejected": -162.35234375, + "loss": 0.48923511505126954, + "rewards/chosen": -0.03407045304775238, + "rewards/margins": 0.13557693064212797, + "rewards/rejected": -0.16964738368988036, + "step": 580 + }, + { + "epoch": 0.944, + "grad_norm": 2.4236228466033936, + "kl": 3.0165860652923584, + "learning_rate": 2.937777777777778e-06, + "logits/chosen": 21686569.6, + "logits/rejected": 22070460.8, + "logps/chosen": -130.4562744140625, + "logps/rejected": -136.6388916015625, + "loss": 0.4813654899597168, + "rewards/chosen": 0.219277286529541, + "rewards/margins": 0.19798394441604614, + "rewards/rejected": 0.021293342113494873, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 2.492583990097046, + "kl": 2.3715996742248535, + "learning_rate": 2.8933333333333337e-06, + "logits/chosen": 20666136.0, + "logits/rejected": 23334753.6, + "logps/chosen": -153.04947509765626, + "logps/rejected": -153.43118896484376, + "loss": 0.522442102432251, + "rewards/chosen": -0.225014066696167, + "rewards/margins": -0.17056108117103577, + "rewards/rejected": -0.054452985525131226, + "step": 600 + }, + { + "epoch": 0.96, + "eval_kl": 2.428438425064087, + "eval_logits/chosen": 29508644.864, + "eval_logits/rejected": 29339983.872, + "eval_logps/chosen": -156.279359375, + "eval_logps/rejected": -150.735375, + "eval_loss": 0.49547284841537476, + "eval_rewards/chosen": -0.048923690795898436, + "eval_rewards/margins": 0.02822325134277344, + "eval_rewards/rejected": -0.07714694213867188, + "eval_runtime": 211.6125, + "eval_samples_per_second": 4.726, + "eval_steps_per_second": 2.363, + "step": 600 + }, + { + "epoch": 0.976, + "grad_norm": 1.849755883216858, + "kl": 4.1999735832214355, + "learning_rate": 2.8488888888888894e-06, + "logits/chosen": 35977270.4, + "logits/rejected": 32739616.0, + "logps/chosen": -196.6281982421875, + "logps/rejected": -171.48328857421876, + "loss": 0.4923978805541992, + "rewards/chosen": 0.14797959327697754, + "rewards/margins": 0.12893219590187074, + "rewards/rejected": 0.01904739737510681, + "step": 610 + }, + { + "epoch": 0.992, + "grad_norm": 2.2259228229522705, + "kl": 2.290156602859497, + "learning_rate": 2.8044444444444446e-06, + "logits/chosen": 18237147.2, + "logits/rejected": 16839369.6, + "logps/chosen": -143.46595458984376, + "logps/rejected": -140.927001953125, + "loss": 0.4779216766357422, + "rewards/chosen": 0.04038125276565552, + "rewards/margins": 0.21843221187591552, + "rewards/rejected": -0.17805095911026, + "step": 620 + }, + { + "epoch": 1.008, + "grad_norm": 1.9715921878814697, + "kl": 2.682957172393799, + "learning_rate": 2.7600000000000003e-06, + "logits/chosen": 28436249.6, + "logits/rejected": 29027222.4, + "logps/chosen": -155.50318603515626, + "logps/rejected": -156.95904541015625, + "loss": 0.4716252326965332, + "rewards/chosen": 0.20086636543273925, + "rewards/margins": 0.32287436723709106, + "rewards/rejected": -0.12200800180435181, + "step": 630 + }, + { + "epoch": 1.024, + "grad_norm": 1.9490185976028442, + "kl": 2.792576551437378, + "learning_rate": 2.715555555555556e-06, + "logits/chosen": 35926758.4, + "logits/rejected": 36004332.8, + "logps/chosen": -139.33636474609375, + "logps/rejected": -143.99090576171875, + "loss": 0.4631006717681885, + "rewards/chosen": 0.2187732219696045, + "rewards/margins": 0.31144561171531676, + "rewards/rejected": -0.09267238974571228, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 2.6189019680023193, + "kl": 2.2690906524658203, + "learning_rate": 2.6711111111111116e-06, + "logits/chosen": 32591907.2, + "logits/rejected": 32146668.8, + "logps/chosen": -138.1519775390625, + "logps/rejected": -154.90008544921875, + "loss": 0.47899231910705564, + "rewards/chosen": 0.09387065768241883, + "rewards/margins": 0.19471864104270936, + "rewards/rejected": -0.10084798336029052, + "step": 650 + }, + { + "epoch": 1.04, + "eval_kl": 2.578673839569092, + "eval_logits/chosen": 28904480.768, + "eval_logits/rejected": 28698251.264, + "eval_logps/chosen": -156.2849375, + "eval_logps/rejected": -150.820046875, + "eval_loss": 0.4945332705974579, + "eval_rewards/chosen": -0.049482513427734375, + "eval_rewards/margins": 0.03613288116455079, + "eval_rewards/rejected": -0.08561539459228516, + "eval_runtime": 210.9586, + "eval_samples_per_second": 4.74, + "eval_steps_per_second": 2.37, + "step": 650 + }, + { + "epoch": 1.056, + "grad_norm": 1.7106302976608276, + "kl": 3.502686023712158, + "learning_rate": 2.6266666666666668e-06, + "logits/chosen": 35256064.0, + "logits/rejected": 34360857.6, + "logps/chosen": -150.1806884765625, + "logps/rejected": -148.415771484375, + "loss": 0.5002040386199951, + "rewards/chosen": 0.014912448823451996, + "rewards/margins": 0.010746008902788162, + "rewards/rejected": 0.0041664399206638334, + "step": 660 + }, + { + "epoch": 1.072, + "grad_norm": 2.5150351524353027, + "kl": 1.832489013671875, + "learning_rate": 2.5822222222222224e-06, + "logits/chosen": 34642908.8, + "logits/rejected": 36512502.4, + "logps/chosen": -145.1234375, + "logps/rejected": -152.358642578125, + "loss": 0.4734458923339844, + "rewards/chosen": 0.06824090480804443, + "rewards/margins": 0.24784770011901855, + "rewards/rejected": -0.17960679531097412, + "step": 670 + }, + { + "epoch": 1.088, + "grad_norm": 1.9682056903839111, + "kl": 2.000276565551758, + "learning_rate": 2.537777777777778e-06, + "logits/chosen": 27413356.8, + "logits/rejected": 24885939.2, + "logps/chosen": -121.7287841796875, + "logps/rejected": -141.73358154296875, + "loss": 0.44886274337768556, + "rewards/chosen": 0.1158550500869751, + "rewards/margins": 0.5507017374038696, + "rewards/rejected": -0.4348466873168945, + "step": 680 + }, + { + "epoch": 1.104, + "grad_norm": 1.9337836503982544, + "kl": 3.4022421836853027, + "learning_rate": 2.4933333333333333e-06, + "logits/chosen": 31807184.0, + "logits/rejected": 30969852.8, + "logps/chosen": -124.63392333984375, + "logps/rejected": -149.852197265625, + "loss": 0.46492581367492675, + "rewards/chosen": 0.14620821475982665, + "rewards/margins": 0.39136860370635984, + "rewards/rejected": -0.2451603889465332, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 2.3496735095977783, + "kl": 2.625760078430176, + "learning_rate": 2.448888888888889e-06, + "logits/chosen": 31142732.8, + "logits/rejected": 31784934.4, + "logps/chosen": -136.10140380859374, + "logps/rejected": -160.45810546875, + "loss": 0.4712203025817871, + "rewards/chosen": 0.030043387413024904, + "rewards/margins": 0.2471763849258423, + "rewards/rejected": -0.2171329975128174, + "step": 700 + }, + { + "epoch": 1.12, + "eval_kl": 2.867997169494629, + "eval_logits/chosen": 28665548.8, + "eval_logits/rejected": 28371339.264, + "eval_logps/chosen": -155.87825, + "eval_logps/rejected": -150.4365625, + "eval_loss": 0.49442729353904724, + "eval_rewards/chosen": -0.008813528060913086, + "eval_rewards/margins": 0.038452112197875976, + "eval_rewards/rejected": -0.047265640258789064, + "eval_runtime": 210.8987, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 2.371, + "step": 700 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 1.8647392988204956, + "kl": 3.5727601051330566, + "learning_rate": 2.4044444444444446e-06, + "logits/chosen": 23887081.6, + "logits/rejected": 20803046.4, + "logps/chosen": -186.822802734375, + "logps/rejected": -185.6416259765625, + "loss": 0.43730711936950684, + "rewards/chosen": 0.37336575984954834, + "rewards/margins": 0.6151942014694214, + "rewards/rejected": -0.24182844161987305, + "step": 710 + }, + { + "epoch": 1.152, + "grad_norm": 1.630876064300537, + "kl": 3.1620936393737793, + "learning_rate": 2.3600000000000003e-06, + "logits/chosen": 31887926.4, + "logits/rejected": 30896633.6, + "logps/chosen": -148.16866455078124, + "logps/rejected": -147.6019287109375, + "loss": 0.46859025955200195, + "rewards/chosen": 0.26157245635986326, + "rewards/margins": 0.22991548180580137, + "rewards/rejected": 0.03165697455406189, + "step": 720 + }, + { + "epoch": 1.168, + "grad_norm": 1.9033029079437256, + "kl": 2.6529040336608887, + "learning_rate": 2.3155555555555555e-06, + "logits/chosen": 21981940.8, + "logits/rejected": 20872329.6, + "logps/chosen": -118.38714599609375, + "logps/rejected": -131.70751953125, + "loss": 0.46525821685791013, + "rewards/chosen": -0.042069154977798465, + "rewards/margins": 0.3900075852870941, + "rewards/rejected": -0.43207674026489257, + "step": 730 + }, + { + "epoch": 1.184, + "grad_norm": 1.7811493873596191, + "kl": 4.9487690925598145, + "learning_rate": 2.2711111111111116e-06, + "logits/chosen": 32023660.8, + "logits/rejected": 32334473.6, + "logps/chosen": -160.63001708984376, + "logps/rejected": -155.8528076171875, + "loss": 0.4717572212219238, + "rewards/chosen": 0.43720192909240724, + "rewards/margins": 0.3068490982055664, + "rewards/rejected": 0.13035283088684083, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 2.2068121433258057, + "kl": 5.166882514953613, + "learning_rate": 2.226666666666667e-06, + "logits/chosen": 37952841.6, + "logits/rejected": 37582118.4, + "logps/chosen": -158.07510986328126, + "logps/rejected": -136.27884521484376, + "loss": 0.45236787796020506, + "rewards/chosen": 0.5586390972137452, + "rewards/margins": 0.447090494632721, + "rewards/rejected": 0.11154860258102417, + "step": 750 + }, + { + "epoch": 1.2, + "eval_kl": 4.169778823852539, + "eval_logits/chosen": 30752735.232, + "eval_logits/rejected": 30298806.272, + "eval_logps/chosen": -153.261625, + "eval_logps/rejected": -147.9041875, + "eval_loss": 0.49414026737213135, + "eval_rewards/chosen": 0.2528500061035156, + "eval_rewards/margins": 0.046879043579101526, + "eval_rewards/rejected": 0.20597096252441408, + "eval_runtime": 210.5527, + "eval_samples_per_second": 4.749, + "eval_steps_per_second": 2.375, + "step": 750 + }, + { + "epoch": 1.216, + "grad_norm": 1.7532247304916382, + "kl": 5.59327507019043, + "learning_rate": 2.1822222222222225e-06, + "logits/chosen": 34789904.0, + "logits/rejected": 35887366.4, + "logps/chosen": -140.38455810546876, + "logps/rejected": -154.4322021484375, + "loss": 0.4671647548675537, + "rewards/chosen": 0.6628150463104248, + "rewards/margins": 0.32444992065429684, + "rewards/rejected": 0.3383651256561279, + "step": 760 + }, + { + "epoch": 1.232, + "grad_norm": 1.8754905462265015, + "kl": 3.97855806350708, + "learning_rate": 2.137777777777778e-06, + "logits/chosen": 27867337.6, + "logits/rejected": 29447075.2, + "logps/chosen": -128.08131103515626, + "logps/rejected": -140.113671875, + "loss": 0.47170114517211914, + "rewards/chosen": 0.3410197257995605, + "rewards/margins": 0.21830989122390745, + "rewards/rejected": 0.12270983457565307, + "step": 770 + }, + { + "epoch": 1.248, + "grad_norm": 2.736323833465576, + "kl": 4.552127838134766, + "learning_rate": 2.0933333333333338e-06, + "logits/chosen": 33397673.6, + "logits/rejected": 34484364.8, + "logps/chosen": -137.85001220703126, + "logps/rejected": -139.0888427734375, + "loss": 0.45501227378845216, + "rewards/chosen": 0.355634069442749, + "rewards/margins": 0.35818901062011715, + "rewards/rejected": -0.0025549411773681642, + "step": 780 + }, + { + "epoch": 1.264, + "grad_norm": 1.9888898134231567, + "kl": 3.6503052711486816, + "learning_rate": 2.048888888888889e-06, + "logits/chosen": 29773580.8, + "logits/rejected": 28645248.0, + "logps/chosen": -151.4457275390625, + "logps/rejected": -138.25706787109374, + "loss": 0.46457924842834475, + "rewards/chosen": 0.29306089878082275, + "rewards/margins": 0.400502347946167, + "rewards/rejected": -0.10744144916534423, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 1.5264906883239746, + "kl": 5.461276531219482, + "learning_rate": 2.0044444444444446e-06, + "logits/chosen": 32210188.8, + "logits/rejected": 31754496.0, + "logps/chosen": -139.758837890625, + "logps/rejected": -154.13162841796876, + "loss": 0.47219176292419435, + "rewards/chosen": 0.6297782897949219, + "rewards/margins": 0.23436756134033204, + "rewards/rejected": 0.39541072845458985, + "step": 800 + }, + { + "epoch": 1.28, + "eval_kl": 4.377986907958984, + "eval_logits/chosen": 31938863.104, + "eval_logits/rejected": 31473901.568, + "eval_logps/chosen": -152.64290625, + "eval_logps/rejected": -147.31334375, + "eval_loss": 0.49390554428100586, + "eval_rewards/chosen": 0.3147206115722656, + "eval_rewards/margins": 0.04966500854492184, + "eval_rewards/rejected": 0.26505560302734377, + "eval_runtime": 210.9274, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 2.37, + "step": 800 + }, + { + "epoch": 1.296, + "grad_norm": 1.8228152990341187, + "kl": 4.066787242889404, + "learning_rate": 1.9600000000000003e-06, + "logits/chosen": 35085334.4, + "logits/rejected": 34852982.4, + "logps/chosen": -132.74925537109374, + "logps/rejected": -158.0354736328125, + "loss": 0.44358067512512206, + "rewards/chosen": 0.4759791374206543, + "rewards/margins": 0.4843965947628021, + "rewards/rejected": -0.008417457342147827, + "step": 810 + }, + { + "epoch": 1.312, + "grad_norm": 2.1346709728240967, + "kl": 4.6763811111450195, + "learning_rate": 1.915555555555556e-06, + "logits/chosen": 34945942.4, + "logits/rejected": 36177011.2, + "logps/chosen": -132.50538330078126, + "logps/rejected": -161.39248046875, + "loss": 0.47060041427612304, + "rewards/chosen": 0.4741304874420166, + "rewards/margins": 0.28036924600601193, + "rewards/rejected": 0.19376124143600465, + "step": 820 + }, + { + "epoch": 1.328, + "grad_norm": 2.9988934993743896, + "kl": 3.6898865699768066, + "learning_rate": 1.8711111111111114e-06, + "logits/chosen": 36008406.4, + "logits/rejected": 37406822.4, + "logps/chosen": -136.77215576171875, + "logps/rejected": -147.55556640625, + "loss": 0.46726012229919434, + "rewards/chosen": 0.3493239164352417, + "rewards/margins": 0.2635639488697052, + "rewards/rejected": 0.0857599675655365, + "step": 830 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 1.762466311454773, + "kl": 4.097973346710205, + "learning_rate": 1.8266666666666668e-06, + "logits/chosen": 27684899.2, + "logits/rejected": 28798355.2, + "logps/chosen": -118.63145751953125, + "logps/rejected": -141.59239501953124, + "loss": 0.47371621131896974, + "rewards/chosen": 0.31828267574310304, + "rewards/margins": 0.29973786473274233, + "rewards/rejected": 0.018544811010360717, + "step": 840 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.833742380142212, + "kl": 4.407935619354248, + "learning_rate": 1.7822222222222225e-06, + "logits/chosen": 33654512.0, + "logits/rejected": 33191171.2, + "logps/chosen": -151.17977294921874, + "logps/rejected": -145.46224365234374, + "loss": 0.47383294105529783, + "rewards/chosen": 0.16252880096435546, + "rewards/margins": 0.22959471344947813, + "rewards/rejected": -0.06706591248512268, + "step": 850 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 3.920696973800659, + "eval_logits/chosen": 31450167.296, + "eval_logits/rejected": 31058980.864, + "eval_logps/chosen": -153.5180625, + "eval_logps/rejected": -148.185328125, + "eval_loss": 0.49386003613471985, + "eval_rewards/chosen": 0.22720516967773438, + "eval_rewards/margins": 0.04934913635253907, + "eval_rewards/rejected": 0.1778560333251953, + "eval_runtime": 211.4228, + "eval_samples_per_second": 4.73, + "eval_steps_per_second": 2.365, + "step": 850 + }, + { + "epoch": 1.376, + "grad_norm": 2.183370351791382, + "kl": 4.991496562957764, + "learning_rate": 1.737777777777778e-06, + "logits/chosen": 47378035.2, + "logits/rejected": 45498502.4, + "logps/chosen": -179.4380126953125, + "logps/rejected": -169.1109375, + "loss": 0.4720784664154053, + "rewards/chosen": 0.43294267654418944, + "rewards/margins": 0.21851625442504882, + "rewards/rejected": 0.21442642211914062, + "step": 860 + }, + { + "epoch": 1.392, + "grad_norm": 2.1924335956573486, + "kl": 5.118699073791504, + "learning_rate": 1.6933333333333336e-06, + "logits/chosen": 29969433.6, + "logits/rejected": 28471737.6, + "logps/chosen": -192.4035888671875, + "logps/rejected": -151.5296875, + "loss": 0.46271333694458006, + "rewards/chosen": 0.5544761657714844, + "rewards/margins": 0.31934370994567873, + "rewards/rejected": 0.23513245582580566, + "step": 870 + }, + { + "epoch": 1.408, + "grad_norm": 2.2703323364257812, + "kl": 2.6695058345794678, + "learning_rate": 1.648888888888889e-06, + "logits/chosen": 26682553.6, + "logits/rejected": 24626574.4, + "logps/chosen": -133.742041015625, + "logps/rejected": -147.77803955078124, + "loss": 0.43658647537231443, + "rewards/chosen": 0.262941312789917, + "rewards/margins": 0.628947639465332, + "rewards/rejected": -0.36600632667541505, + "step": 880 + }, + { + "epoch": 1.424, + "grad_norm": 1.7647465467453003, + "kl": 4.138214111328125, + "learning_rate": 1.6044444444444447e-06, + "logits/chosen": 27142838.4, + "logits/rejected": 26620787.2, + "logps/chosen": -133.81441650390624, + "logps/rejected": -135.07823486328124, + "loss": 0.4540394306182861, + "rewards/chosen": 0.3629532098770142, + "rewards/margins": 0.4822005391120911, + "rewards/rejected": -0.1192473292350769, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 2.0833559036254883, + "kl": 3.821770191192627, + "learning_rate": 1.56e-06, + "logits/chosen": 25844046.4, + "logits/rejected": 22660449.6, + "logps/chosen": -145.89912109375, + "logps/rejected": -130.87916259765626, + "loss": 0.44003853797912595, + "rewards/chosen": 0.4248363018035889, + "rewards/margins": 0.5529402971267701, + "rewards/rejected": -0.12810399532318115, + "step": 900 + }, + { + "epoch": 1.44, + "eval_kl": 4.193332672119141, + "eval_logits/chosen": 32452849.664, + "eval_logits/rejected": 32089155.584, + "eval_logps/chosen": -152.94290625, + "eval_logps/rejected": -147.62978125, + "eval_loss": 0.4937511086463928, + "eval_rewards/chosen": 0.28472021484375, + "eval_rewards/margins": 0.05130918884277341, + "eval_rewards/rejected": 0.23341102600097657, + "eval_runtime": 211.0632, + "eval_samples_per_second": 4.738, + "eval_steps_per_second": 2.369, + "step": 900 + }, + { + "epoch": 1.456, + "grad_norm": 1.9122978448867798, + "kl": 5.044002056121826, + "learning_rate": 1.5155555555555558e-06, + "logits/chosen": 28371500.8, + "logits/rejected": 26429561.6, + "logps/chosen": -170.57901611328126, + "logps/rejected": -170.0248046875, + "loss": 0.47440948486328127, + "rewards/chosen": 0.4807882308959961, + "rewards/margins": 0.20360822677612306, + "rewards/rejected": 0.27718000411987304, + "step": 910 + }, + { + "epoch": 1.472, + "grad_norm": 2.096123218536377, + "kl": 4.188933372497559, + "learning_rate": 1.4711111111111112e-06, + "logits/chosen": 34476659.2, + "logits/rejected": 31294201.6, + "logps/chosen": -165.4956787109375, + "logps/rejected": -143.78316650390624, + "loss": 0.44411406517028806, + "rewards/chosen": 0.5467419147491455, + "rewards/margins": 0.49016233682632443, + "rewards/rejected": 0.05657957792282105, + "step": 920 + }, + { + "epoch": 1.488, + "grad_norm": 2.136502265930176, + "kl": 5.916023254394531, + "learning_rate": 1.4266666666666668e-06, + "logits/chosen": 28660502.4, + "logits/rejected": 31565062.4, + "logps/chosen": -145.20224609375, + "logps/rejected": -184.6095947265625, + "loss": 0.4747187614440918, + "rewards/chosen": 0.5481678962707519, + "rewards/margins": 0.3383267402648925, + "rewards/rejected": 0.20984115600585937, + "step": 930 + }, + { + "epoch": 1.504, + "grad_norm": 3.4681079387664795, + "kl": 3.9104812145233154, + "learning_rate": 1.3822222222222223e-06, + "logits/chosen": 32520064.0, + "logits/rejected": 28152707.2, + "logps/chosen": -149.12630615234374, + "logps/rejected": -132.30379638671874, + "loss": 0.4755962371826172, + "rewards/chosen": 0.34991438388824464, + "rewards/margins": 0.19766778945922853, + "rewards/rejected": 0.1522465944290161, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 2.1049246788024902, + "kl": 4.365435600280762, + "learning_rate": 1.337777777777778e-06, + "logits/chosen": 37928726.4, + "logits/rejected": 36957033.6, + "logps/chosen": -154.08798828125, + "logps/rejected": -146.7669189453125, + "loss": 0.4579151630401611, + "rewards/chosen": 0.46179609298706054, + "rewards/margins": 0.37595014572143554, + "rewards/rejected": 0.085845947265625, + "step": 950 + }, + { + "epoch": 1.52, + "eval_kl": 4.171284198760986, + "eval_logits/chosen": 32833683.456, + "eval_logits/rejected": 32522022.912, + "eval_logps/chosen": -152.9865, + "eval_logps/rejected": -147.679296875, + "eval_loss": 0.4936215281486511, + "eval_rewards/chosen": 0.2803621826171875, + "eval_rewards/margins": 0.051901611328124986, + "eval_rewards/rejected": 0.2284605712890625, + "eval_runtime": 211.1202, + "eval_samples_per_second": 4.737, + "eval_steps_per_second": 2.368, + "step": 950 + }, + { + "epoch": 1.536, + "grad_norm": 2.297563076019287, + "kl": 4.221343994140625, + "learning_rate": 1.2933333333333334e-06, + "logits/chosen": 41744796.8, + "logits/rejected": 40177462.4, + "logps/chosen": -140.44161376953124, + "logps/rejected": -148.71304931640626, + "loss": 0.44645137786865235, + "rewards/chosen": 0.4949165344238281, + "rewards/margins": 0.46811245679855346, + "rewards/rejected": 0.026804077625274658, + "step": 960 + }, + { + "epoch": 1.552, + "grad_norm": 2.0365242958068848, + "kl": 4.4261579513549805, + "learning_rate": 1.248888888888889e-06, + "logits/chosen": 32556515.2, + "logits/rejected": 33512262.4, + "logps/chosen": -133.8440673828125, + "logps/rejected": -171.82977294921875, + "loss": 0.4730066776275635, + "rewards/chosen": 0.4945687294006348, + "rewards/margins": 0.2293097019195557, + "rewards/rejected": 0.2652590274810791, + "step": 970 + }, + { + "epoch": 1.568, + "grad_norm": 1.5643013715744019, + "kl": 4.663653373718262, + "learning_rate": 1.2044444444444447e-06, + "logits/chosen": 32883987.2, + "logits/rejected": 30414611.2, + "logps/chosen": -126.985400390625, + "logps/rejected": -116.391650390625, + "loss": 0.4877506732940674, + "rewards/chosen": 0.48381505012512205, + "rewards/margins": 0.09749135971069334, + "rewards/rejected": 0.3863236904144287, + "step": 980 + }, + { + "epoch": 1.584, + "grad_norm": 2.706939458847046, + "kl": 4.062044620513916, + "learning_rate": 1.1600000000000001e-06, + "logits/chosen": 32006976.0, + "logits/rejected": 31020704.0, + "logps/chosen": -166.25194091796874, + "logps/rejected": -154.0007080078125, + "loss": 0.44759297370910645, + "rewards/chosen": 0.35017178058624265, + "rewards/margins": 0.5452085971832275, + "rewards/rejected": -0.19503681659698485, + "step": 990 + }, + { + "epoch": 1.6, + "grad_norm": 1.8194115161895752, + "kl": 3.5774059295654297, + "learning_rate": 1.1155555555555558e-06, + "logits/chosen": 28698640.0, + "logits/rejected": 29143193.6, + "logps/chosen": -139.20194091796876, + "logps/rejected": -158.261376953125, + "loss": 0.4849833965301514, + "rewards/chosen": 0.1898583173751831, + "rewards/margins": 0.07131674289703369, + "rewards/rejected": 0.11854157447814942, + "step": 1000 + }, + { + "epoch": 1.6, + "eval_kl": 4.220986366271973, + "eval_logits/chosen": 32568942.592, + "eval_logits/rejected": 32248098.816, + "eval_logps/chosen": -152.99434375, + "eval_logps/rejected": -147.7251875, + "eval_loss": 0.4932064116001129, + "eval_rewards/chosen": 0.2795771179199219, + "eval_rewards/margins": 0.055706237792968766, + "eval_rewards/rejected": 0.22387088012695314, + "eval_runtime": 901.9357, + "eval_samples_per_second": 1.109, + "eval_steps_per_second": 0.554, + "step": 1000 + }, + { + "epoch": 1.616, + "grad_norm": 2.4502618312835693, + "kl": 3.438178539276123, + "learning_rate": 1.0711111111111112e-06, + "logits/chosen": 23637744.0, + "logits/rejected": 21885137.6, + "logps/chosen": -154.96070556640626, + "logps/rejected": -135.8352783203125, + "loss": 0.4609940528869629, + "rewards/chosen": 0.33989131450653076, + "rewards/margins": 0.2776340961456299, + "rewards/rejected": 0.06225721836090088, + "step": 1010 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 2.483098268508911, + "kl": 4.026124000549316, + "learning_rate": 1.0266666666666669e-06, + "logits/chosen": 33672102.4, + "logits/rejected": 33149174.4, + "logps/chosen": -172.960546875, + "logps/rejected": -169.11124267578126, + "loss": 0.4497981548309326, + "rewards/chosen": 0.3961763620376587, + "rewards/margins": 0.43121243715286256, + "rewards/rejected": -0.03503607511520386, + "step": 1020 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 1.9396111965179443, + "kl": 3.11645770072937, + "learning_rate": 9.822222222222223e-07, + "logits/chosen": 33916867.2, + "logits/rejected": 29841721.6, + "logps/chosen": -149.1454345703125, + "logps/rejected": -127.8354248046875, + "loss": 0.4286343574523926, + "rewards/chosen": 0.36662404537200927, + "rewards/margins": 0.662821626663208, + "rewards/rejected": -0.2961975812911987, + "step": 1030 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 2.473919153213501, + "kl": 3.834186553955078, + "learning_rate": 9.377777777777778e-07, + "logits/chosen": 34663897.6, + "logits/rejected": 32536246.4, + "logps/chosen": -142.38626708984376, + "logps/rejected": -151.28388671875, + "loss": 0.4545116901397705, + "rewards/chosen": 0.403075122833252, + "rewards/margins": 0.4584430515766144, + "rewards/rejected": -0.055367928743362424, + "step": 1040 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 2.401204824447632, + "kl": 2.8307433128356934, + "learning_rate": 8.933333333333334e-07, + "logits/chosen": 25615622.4, + "logits/rejected": 24212544.0, + "logps/chosen": -194.39169921875, + "logps/rejected": -139.34288330078124, + "loss": 0.48009257316589354, + "rewards/chosen": -0.09855471849441529, + "rewards/margins": 0.04281153678894044, + "rewards/rejected": -0.14136625528335572, + "step": 1050 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 3.395029306411743, + "eval_logits/chosen": 30017314.816, + "eval_logits/rejected": 29781239.808, + "eval_logps/chosen": -155.0224375, + "eval_logps/rejected": -149.753125, + "eval_loss": 0.4927977919578552, + "eval_rewards/chosen": 0.07676624298095704, + "eval_rewards/margins": 0.0556891098022461, + "eval_rewards/rejected": 0.021077133178710936, + "eval_runtime": 211.7163, + "eval_samples_per_second": 4.723, + "eval_steps_per_second": 2.362, + "step": 1050 + }, + { + "epoch": 1.696, + "grad_norm": 2.213663101196289, + "kl": 2.5274927616119385, + "learning_rate": 8.488888888888889e-07, + "logits/chosen": 21050780.8, + "logits/rejected": 22174214.4, + "logps/chosen": -143.71790771484376, + "logps/rejected": -136.63013916015626, + "loss": 0.4820300579071045, + "rewards/chosen": 0.13502249717712403, + "rewards/margins": 0.1883419156074524, + "rewards/rejected": -0.05331941843032837, + "step": 1060 + }, + { + "epoch": 1.712, + "grad_norm": 2.839602470397949, + "kl": 2.8527681827545166, + "learning_rate": 8.044444444444445e-07, + "logits/chosen": 31860320.0, + "logits/rejected": 34545088.0, + "logps/chosen": -123.39615478515626, + "logps/rejected": -144.3958740234375, + "loss": 0.48537321090698243, + "rewards/chosen": 0.06347188949584961, + "rewards/margins": 0.1278951048851013, + "rewards/rejected": -0.0644232153892517, + "step": 1070 + }, + { + "epoch": 1.728, + "grad_norm": 2.238354206085205, + "kl": 3.4803032875061035, + "learning_rate": 7.6e-07, + "logits/chosen": 30298761.6, + "logits/rejected": 28377660.8, + "logps/chosen": -127.83525390625, + "logps/rejected": -187.0820556640625, + "loss": 0.46401171684265136, + "rewards/chosen": 0.19893896579742432, + "rewards/margins": 0.3267621874809265, + "rewards/rejected": -0.1278232216835022, + "step": 1080 + }, + { + "epoch": 1.744, + "grad_norm": 2.201462507247925, + "kl": 3.8757426738739014, + "learning_rate": 7.155555555555556e-07, + "logits/chosen": 45421788.8, + "logits/rejected": 42151324.8, + "logps/chosen": -175.1874755859375, + "logps/rejected": -169.148291015625, + "loss": 0.48148083686828613, + "rewards/chosen": 0.16172538995742797, + "rewards/margins": 0.2750619053840637, + "rewards/rejected": -0.11333651542663574, + "step": 1090 + }, + { + "epoch": 1.76, + "grad_norm": 1.8805228471755981, + "kl": 3.8151164054870605, + "learning_rate": 6.711111111111111e-07, + "logits/chosen": 31197808.0, + "logits/rejected": 27353356.8, + "logps/chosen": -152.6833740234375, + "logps/rejected": -185.17493896484376, + "loss": 0.4390877723693848, + "rewards/chosen": 0.15401217937469483, + "rewards/margins": 0.5842344522476196, + "rewards/rejected": -0.4302222728729248, + "step": 1100 + }, + { + "epoch": 1.76, + "eval_kl": 3.337947368621826, + "eval_logits/chosen": 29766119.424, + "eval_logits/rejected": 29534177.28, + "eval_logps/chosen": -155.123390625, + "eval_logps/rejected": -149.8595625, + "eval_loss": 0.4927149713039398, + "eval_rewards/chosen": 0.06667286682128906, + "eval_rewards/margins": 0.056239251136779786, + "eval_rewards/rejected": 0.010433615684509278, + "eval_runtime": 211.2573, + "eval_samples_per_second": 4.734, + "eval_steps_per_second": 2.367, + "step": 1100 + }, + { + "epoch": 1.776, + "grad_norm": 2.611490249633789, + "kl": 4.028485298156738, + "learning_rate": 6.266666666666667e-07, + "logits/chosen": 30264723.2, + "logits/rejected": 30910204.8, + "logps/chosen": -174.413818359375, + "logps/rejected": -188.7349853515625, + "loss": 0.4568845272064209, + "rewards/chosen": 0.035149258375167844, + "rewards/margins": 0.44202625155448916, + "rewards/rejected": -0.4068769931793213, + "step": 1110 + }, + { + "epoch": 1.792, + "grad_norm": 2.5337791442871094, + "kl": 3.7211251258850098, + "learning_rate": 5.822222222222223e-07, + "logits/chosen": 26241504.0, + "logits/rejected": 23940459.2, + "logps/chosen": -151.94140625, + "logps/rejected": -115.9620849609375, + "loss": 0.4610316276550293, + "rewards/chosen": 0.3789072036743164, + "rewards/margins": 0.3300951421260834, + "rewards/rejected": 0.04881206154823303, + "step": 1120 + }, + { + "epoch": 1.808, + "grad_norm": 1.5708776712417603, + "kl": 3.1767425537109375, + "learning_rate": 5.377777777777779e-07, + "logits/chosen": 42052073.6, + "logits/rejected": 39844899.2, + "logps/chosen": -169.72109375, + "logps/rejected": -151.2173583984375, + "loss": 0.4554294109344482, + "rewards/chosen": 0.2683689832687378, + "rewards/margins": 0.47904453277587894, + "rewards/rejected": -0.21067554950714112, + "step": 1130 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 2.6482839584350586, + "kl": 2.7335541248321533, + "learning_rate": 4.933333333333334e-07, + "logits/chosen": 37147670.4, + "logits/rejected": 37500460.8, + "logps/chosen": -143.32291259765626, + "logps/rejected": -160.43739013671876, + "loss": 0.46424403190612795, + "rewards/chosen": 0.15881721973419188, + "rewards/margins": 0.3237978339195251, + "rewards/rejected": -0.16498061418533325, + "step": 1140 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 2.6286239624023438, + "kl": 2.9139907360076904, + "learning_rate": 4.488888888888889e-07, + "logits/chosen": 22594232.0, + "logits/rejected": 20993777.6, + "logps/chosen": -151.5849365234375, + "logps/rejected": -194.24169921875, + "loss": 0.4465163230895996, + "rewards/chosen": 0.19300849437713624, + "rewards/margins": 0.5745944738388062, + "rewards/rejected": -0.38158597946166994, + "step": 1150 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.375143051147461, + "eval_logits/chosen": 29445828.608, + "eval_logits/rejected": 29209438.208, + "eval_logps/chosen": -155.187921875, + "eval_logps/rejected": -149.93753125, + "eval_loss": 0.4925803244113922, + "eval_rewards/chosen": 0.06021894836425781, + "eval_rewards/margins": 0.057583449840545656, + "eval_rewards/rejected": 0.0026354985237121583, + "eval_runtime": 211.2149, + "eval_samples_per_second": 4.735, + "eval_steps_per_second": 2.367, + "step": 1150 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 2.6833443641662598, + "kl": 3.5917434692382812, + "learning_rate": 4.0444444444444445e-07, + "logits/chosen": 33232332.8, + "logits/rejected": 32122198.4, + "logps/chosen": -146.055859375, + "logps/rejected": -152.858740234375, + "loss": 0.46697273254394533, + "rewards/chosen": 0.24598314762115478, + "rewards/margins": 0.3101099610328674, + "rewards/rejected": -0.06412681341171264, + "step": 1160 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 2.8079652786254883, + "kl": 5.262423992156982, + "learning_rate": 3.6e-07, + "logits/chosen": 26948995.2, + "logits/rejected": 25488443.2, + "logps/chosen": -135.2403564453125, + "logps/rejected": -163.37462158203124, + "loss": 0.4623889923095703, + "rewards/chosen": 0.43314542770385744, + "rewards/margins": 0.40463062524795534, + "rewards/rejected": 0.0285148024559021, + "step": 1170 + }, + { + "epoch": 1.888, + "grad_norm": 2.987678289413452, + "kl": 2.790511131286621, + "learning_rate": 3.155555555555556e-07, + "logits/chosen": 29730611.2, + "logits/rejected": 29972553.6, + "logps/chosen": -167.71156005859376, + "logps/rejected": -171.79833984375, + "loss": 0.45536341667175295, + "rewards/chosen": 0.05922438502311707, + "rewards/margins": 0.4953587710857391, + "rewards/rejected": -0.43613438606262206, + "step": 1180 + }, + { + "epoch": 1.904, + "grad_norm": 2.937406063079834, + "kl": 3.834505796432495, + "learning_rate": 2.7111111111111114e-07, + "logits/chosen": 30097795.2, + "logits/rejected": 28979769.6, + "logps/chosen": -167.03443603515626, + "logps/rejected": -143.852685546875, + "loss": 0.473237133026123, + "rewards/chosen": 0.20266783237457275, + "rewards/margins": 0.254498028755188, + "rewards/rejected": -0.051830196380615236, + "step": 1190 + }, + { + "epoch": 1.92, + "grad_norm": 2.744769334793091, + "kl": 3.5745787620544434, + "learning_rate": 2.266666666666667e-07, + "logits/chosen": 36340227.2, + "logits/rejected": 34575104.0, + "logps/chosen": -149.65133056640624, + "logps/rejected": -160.3142333984375, + "loss": 0.472868013381958, + "rewards/chosen": 0.06828058958053589, + "rewards/margins": 0.3730572104454041, + "rewards/rejected": -0.3047766208648682, + "step": 1200 + }, + { + "epoch": 1.92, + "eval_kl": 3.3557679653167725, + "eval_logits/chosen": 29278171.136, + "eval_logits/rejected": 29042731.008, + "eval_logps/chosen": -155.309109375, + "eval_logps/rejected": -150.058, + "eval_loss": 0.4926023483276367, + "eval_rewards/chosen": 0.048100093841552734, + "eval_rewards/margins": 0.05750944900512695, + "eval_rewards/rejected": -0.00940935516357422, + "eval_runtime": 211.1245, + "eval_samples_per_second": 4.737, + "eval_steps_per_second": 2.368, + "step": 1200 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1200/training_args.bin b/v5/KTO/KTO_5k/lora/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4b0593b1fb99fd0ef500fd051a7332500d83f31 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb38612e474b2b75329a529c7bd7e818140a323dc202e6e5201e7c6648635d30 +size 5649 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/README.md b/v5/KTO/KTO_5k/lora/checkpoint-1250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..438ce918727f679034b316b95d9dfc6ff83e3c62 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- kto +- lora +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/adapter_config.json b/v5/KTO/KTO_5k/lora/checkpoint-1250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a2c7fb826a92b8c340dc085ae4ee70addde7e565 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "q_proj", + "v_proj", + "gate_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/adapter_model.safetensors b/v5/KTO/KTO_5k/lora/checkpoint-1250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..77b9f5e435613cd045beb0f66319704b5505b36c --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720b371b30c818649afd37be34c4b78ee171efcd914d588a3b556c79b1f46c4b +size 180385008 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/chat_template.jinja b/v5/KTO/KTO_5k/lora/checkpoint-1250/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/optimizer.pt b/v5/KTO/KTO_5k/lora/checkpoint-1250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2342d3702b80f57159194ca66fe6c152b6a68d0 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dd3fb9ce4b891d9f2cfb1c07af592f6465ac691fcc8e402261f1cfd0518b0c8 +size 360902475 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/rng_state.pth b/v5/KTO/KTO_5k/lora/checkpoint-1250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..68c0411dd375a388cbc8c58bea912cb904778ab8 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1786ad2057a678cc204dadc7fc5d1a4f939be477df219f770c7d40e9270281 +size 14645 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/scaler.pt b/v5/KTO/KTO_5k/lora/checkpoint-1250/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..07ee4b50f44b17bb5b8227eea1d6870fb9256838 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81f7e3f1db34425f7108cfef030ac75cf6192318c9d437aad0ccd8bfd7f16788 +size 1383 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/scheduler.pt b/v5/KTO/KTO_5k/lora/checkpoint-1250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ceab9c79a1cd63a88945d7b9b92786ca3e8c2b4 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8e77066e3080c50c9f5b35f7795c00de5a8bcb8beea0621e00addd66cf437a +size 1465 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/tokenizer.json b/v5/KTO/KTO_5k/lora/checkpoint-1250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/tokenizer_config.json b/v5/KTO/KTO_5k/lora/checkpoint-1250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/trainer_state.json b/v5/KTO/KTO_5k/lora/checkpoint-1250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..091ffeb21d82b103936b1c0bd374dd3e5504b9f5 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/trainer_state.json @@ -0,0 +1,2309 @@ +{ + "best_global_step": 1250, + "best_metric": 0.05818451833724976, + "best_model_checkpoint": "output/lora/checkpoint-1250", + "epoch": 2.0, + "eval_steps": 50, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 2.1381592750549316, + "kl": 0.01649792119860649, + "learning_rate": 3.6e-07, + "logits/chosen": 28205651.2, + "logits/rejected": 29669123.2, + "logps/chosen": -150.3176025390625, + "logps/rejected": -130.385302734375, + "loss": 0.4999302864074707, + "rewards/chosen": 0.0005133629310876131, + "rewards/margins": 0.0005571890709688887, + "rewards/rejected": -4.382613988127559e-05, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 1.9010003805160522, + "kl": 0.020609140396118164, + "learning_rate": 7.6e-07, + "logits/chosen": 52049945.6, + "logits/rejected": 51142873.6, + "logps/chosen": -140.97896728515624, + "logps/rejected": -153.13775634765625, + "loss": 0.49991936683654786, + "rewards/chosen": 0.0004100656602531672, + "rewards/margins": 0.0006456565577536821, + "rewards/rejected": -0.00023559089750051497, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 1.82417893409729, + "kl": 0.01093914546072483, + "learning_rate": 1.1600000000000001e-06, + "logits/chosen": 32594544.0, + "logits/rejected": 32500614.4, + "logps/chosen": -133.37647705078126, + "logps/rejected": -142.03988037109374, + "loss": 0.5000736713409424, + "rewards/chosen": -0.002948903851211071, + "rewards/margins": -0.0005901286378502844, + "rewards/rejected": -0.0023587752133607865, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 1.6120444536209106, + "kl": 0.010594606399536133, + "learning_rate": 1.56e-06, + "logits/chosen": 41530739.2, + "logits/rejected": 42298668.8, + "logps/chosen": -145.56357421875, + "logps/rejected": -147.24957275390625, + "loss": 0.5001413822174072, + "rewards/chosen": -0.0032692715525627137, + "rewards/margins": -0.0011312337592244148, + "rewards/rejected": -0.002138037793338299, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 1.3366488218307495, + "kl": 0.01903839036822319, + "learning_rate": 1.9600000000000003e-06, + "logits/chosen": 40986995.2, + "logits/rejected": 42846995.2, + "logps/chosen": -142.60504150390625, + "logps/rejected": -155.85986328125, + "loss": 0.500172233581543, + "rewards/chosen": -0.002861199527978897, + "rewards/margins": -0.0013772012665867806, + "rewards/rejected": -0.0014839982613921165, + "step": 50 + }, + { + "epoch": 0.08, + "eval_kl": 0.03923250734806061, + "eval_logits/chosen": 37010317.312, + "eval_logits/rejected": 36932890.624, + "eval_logps/chosen": -155.7828125, + "eval_logps/rejected": -149.957953125, + "eval_loss": 0.4999832808971405, + "eval_rewards/chosen": 0.0007290065288543702, + "eval_rewards/margins": 0.00013377457857131968, + "eval_rewards/rejected": 0.0005952319502830505, + "eval_runtime": 211.9346, + "eval_samples_per_second": 4.718, + "eval_steps_per_second": 2.359, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 1.2370270490646362, + "kl": 0.042702484875917435, + "learning_rate": 2.3600000000000003e-06, + "logits/chosen": 34543625.6, + "logits/rejected": 31963372.8, + "logps/chosen": -106.8656494140625, + "logps/rejected": -115.66375732421875, + "loss": 0.499523401260376, + "rewards/chosen": 0.00055726058781147, + "rewards/margins": 0.003814282640814781, + "rewards/rejected": -0.003257022053003311, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 1.6257128715515137, + "kl": 0.025073956698179245, + "learning_rate": 2.7600000000000003e-06, + "logits/chosen": 45796806.4, + "logits/rejected": 44777859.2, + "logps/chosen": -167.59599609375, + "logps/rejected": -176.96552734375, + "loss": 0.4996492862701416, + "rewards/chosen": -0.017988091707229613, + "rewards/margins": 0.0028077125549316427, + "rewards/rejected": -0.020795804262161256, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 1.9464651346206665, + "kl": 0.032842040061950684, + "learning_rate": 3.1600000000000002e-06, + "logits/chosen": 39214451.2, + "logits/rejected": 39159056.0, + "logps/chosen": -156.45654296875, + "logps/rejected": -164.982177734375, + "loss": 0.4997319221496582, + "rewards/chosen": -0.012030959129333496, + "rewards/margins": 0.0021469339728355415, + "rewards/rejected": -0.014177893102169038, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 1.664642572402954, + "kl": 0.13523416221141815, + "learning_rate": 3.5600000000000002e-06, + "logits/chosen": 41776288.0, + "logits/rejected": 41958195.2, + "logps/chosen": -171.09915771484376, + "logps/rejected": -160.55670166015625, + "loss": 0.4979794979095459, + "rewards/chosen": 0.0015864329412579536, + "rewards/margins": 0.016174097545444965, + "rewards/rejected": -0.014587664604187011, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 2.069972515106201, + "kl": 0.2824670374393463, + "learning_rate": 3.96e-06, + "logits/chosen": 26015552.0, + "logits/rejected": 25218312.0, + "logps/chosen": -135.43885498046876, + "logps/rejected": -166.34676513671874, + "loss": 0.4990866184234619, + "rewards/chosen": 0.013756407797336579, + "rewards/margins": 0.007336309552192688, + "rewards/rejected": 0.00642009824514389, + "step": 100 + }, + { + "epoch": 0.16, + "eval_kl": 0.26922619342803955, + "eval_logits/chosen": 36658610.176, + "eval_logits/rejected": 36587118.592, + "eval_logps/chosen": -155.710015625, + "eval_logps/rejected": -149.88709375, + "eval_loss": 0.49995896220207214, + "eval_rewards/chosen": 0.008008602142333985, + "eval_rewards/margins": 0.0003274984359741221, + "eval_rewards/rejected": 0.007681103706359863, + "eval_runtime": 211.5606, + "eval_samples_per_second": 4.727, + "eval_steps_per_second": 2.363, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 1.6185622215270996, + "kl": 0.2616101801395416, + "learning_rate": 4.360000000000001e-06, + "logits/chosen": 36089392.0, + "logits/rejected": 35749190.4, + "logps/chosen": -131.4165283203125, + "logps/rejected": -136.50457763671875, + "loss": 0.4985805511474609, + "rewards/chosen": 0.014740067720413207, + "rewards/margins": 0.011374564468860626, + "rewards/rejected": 0.003365503251552582, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 1.6586893796920776, + "kl": 0.3709116578102112, + "learning_rate": 4.76e-06, + "logits/chosen": 44621308.8, + "logits/rejected": 44430220.8, + "logps/chosen": -163.35196533203126, + "logps/rejected": -134.7572998046875, + "loss": 0.500658369064331, + "rewards/chosen": 0.003282211720943451, + "rewards/margins": -0.0052413523197174065, + "rewards/rejected": 0.008523564040660857, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 1.287909746170044, + "kl": 0.7775768041610718, + "learning_rate": 4.982222222222222e-06, + "logits/chosen": 37691072.0, + "logits/rejected": 37058822.4, + "logps/chosen": -162.92593994140626, + "logps/rejected": -140.751171875, + "loss": 0.5000242233276367, + "rewards/chosen": 0.06439838409423829, + "rewards/margins": -0.00020327568054198664, + "rewards/rejected": 0.06460165977478027, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 1.4811052083969116, + "kl": 1.0720264911651611, + "learning_rate": 4.937777777777778e-06, + "logits/chosen": 39377881.6, + "logits/rejected": 41394512.0, + "logps/chosen": -142.946826171875, + "logps/rejected": -158.21910400390624, + "loss": 0.501332950592041, + "rewards/chosen": 0.0980217456817627, + "rewards/margins": -0.010658252239227298, + "rewards/rejected": 0.10867999792098999, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 1.6531552076339722, + "kl": 1.1869957447052002, + "learning_rate": 4.893333333333334e-06, + "logits/chosen": 39561158.4, + "logits/rejected": 40957766.4, + "logps/chosen": -130.68802490234376, + "logps/rejected": -117.63740234375, + "loss": 0.4958030223846436, + "rewards/chosen": 0.12626923322677613, + "rewards/margins": 0.0335154950618744, + "rewards/rejected": 0.09275373816490173, + "step": 150 + }, + { + "epoch": 0.24, + "eval_kl": 1.516471266746521, + "eval_logits/chosen": 37183991.808, + "eval_logits/rejected": 37034459.136, + "eval_logps/chosen": -154.240734375, + "eval_logps/rejected": -148.51396875, + "eval_loss": 0.49876031279563904, + "eval_rewards/chosen": 0.154937744140625, + "eval_rewards/margins": 0.009945236206054697, + "eval_rewards/rejected": 0.1449925079345703, + "eval_runtime": 211.5801, + "eval_samples_per_second": 4.726, + "eval_steps_per_second": 2.363, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 1.6023988723754883, + "kl": 1.7867761850357056, + "learning_rate": 4.848888888888889e-06, + "logits/chosen": 33342684.8, + "logits/rejected": 33521395.2, + "logps/chosen": -143.76165771484375, + "logps/rejected": -147.3908447265625, + "loss": 0.49936847686767577, + "rewards/chosen": 0.17880032062530518, + "rewards/margins": 0.005077493190765392, + "rewards/rejected": 0.1737228274345398, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 1.3449970483779907, + "kl": 2.119621753692627, + "learning_rate": 4.804444444444445e-06, + "logits/chosen": 36148233.6, + "logits/rejected": 38120403.2, + "logps/chosen": -137.6266845703125, + "logps/rejected": -149.0926025390625, + "loss": 0.4978325843811035, + "rewards/chosen": 0.22067615985870362, + "rewards/margins": 0.01742777824401856, + "rewards/rejected": 0.20324838161468506, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 1.9490461349487305, + "kl": 2.847586154937744, + "learning_rate": 4.76e-06, + "logits/chosen": 43064544.0, + "logits/rejected": 43962390.4, + "logps/chosen": -143.948681640625, + "logps/rejected": -169.59462890625, + "loss": 0.5009199619293213, + "rewards/chosen": 0.28105921745300294, + "rewards/margins": -0.007398319244384777, + "rewards/rejected": 0.2884575366973877, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 1.550258755683899, + "kl": 2.792905330657959, + "learning_rate": 4.715555555555556e-06, + "logits/chosen": 40556035.2, + "logits/rejected": 42005014.4, + "logps/chosen": -123.0137451171875, + "logps/rejected": -136.06087646484374, + "loss": 0.49814720153808595, + "rewards/chosen": 0.2867321491241455, + "rewards/margins": 0.014883160591125488, + "rewards/rejected": 0.27184898853302003, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 2.0262064933776855, + "kl": 3.2026119232177734, + "learning_rate": 4.6711111111111115e-06, + "logits/chosen": 45602153.6, + "logits/rejected": 46734368.0, + "logps/chosen": -149.51041259765626, + "logps/rejected": -171.689697265625, + "loss": 0.5007596492767334, + "rewards/chosen": 0.3171941041946411, + "rewards/margins": -0.006134414672851585, + "rewards/rejected": 0.3233285188674927, + "step": 200 + }, + { + "epoch": 0.32, + "eval_kl": 2.7445971965789795, + "eval_logits/chosen": 37825519.616, + "eval_logits/rejected": 37620092.928, + "eval_logps/chosen": -152.96146875, + "eval_logps/rejected": -147.30584375, + "eval_loss": 0.4978778660297394, + "eval_rewards/chosen": 0.2828658447265625, + "eval_rewards/margins": 0.017060607910156234, + "eval_rewards/rejected": 0.2658052368164063, + "eval_runtime": 211.7592, + "eval_samples_per_second": 4.722, + "eval_steps_per_second": 2.361, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 1.9593161344528198, + "kl": 2.349473714828491, + "learning_rate": 4.626666666666667e-06, + "logits/chosen": 32432160.0, + "logits/rejected": 32459036.8, + "logps/chosen": -140.2314453125, + "logps/rejected": -132.20750732421874, + "loss": 0.5013795852661133, + "rewards/chosen": 0.22654273509979247, + "rewards/margins": -0.011085557937622087, + "rewards/rejected": 0.23762829303741456, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 1.6649645566940308, + "kl": 2.177248239517212, + "learning_rate": 4.582222222222223e-06, + "logits/chosen": 34417107.2, + "logits/rejected": 35237868.8, + "logps/chosen": -101.06302490234376, + "logps/rejected": -125.54276123046876, + "loss": 0.5004647254943848, + "rewards/chosen": 0.20845344066619872, + "rewards/margins": -0.003761553764343256, + "rewards/rejected": 0.21221499443054198, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 1.7191715240478516, + "kl": 1.6887900829315186, + "learning_rate": 4.537777777777778e-06, + "logits/chosen": 46128198.4, + "logits/rejected": 45076755.2, + "logps/chosen": -185.54364013671875, + "logps/rejected": -163.69344482421874, + "loss": 0.5005609512329101, + "rewards/chosen": 0.1496042490005493, + "rewards/margins": -0.00507398843765261, + "rewards/rejected": 0.15467823743820192, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 1.6625752449035645, + "kl": 1.9195034503936768, + "learning_rate": 4.493333333333333e-06, + "logits/chosen": 46306035.2, + "logits/rejected": 46461657.6, + "logps/chosen": -176.698291015625, + "logps/rejected": -166.738232421875, + "loss": 0.5003566741943359, + "rewards/chosen": 0.1773249626159668, + "rewards/margins": -0.0034087777137756237, + "rewards/rejected": 0.18073374032974243, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 2.08888578414917, + "kl": 1.9199730157852173, + "learning_rate": 4.448888888888889e-06, + "logits/chosen": 35625705.6, + "logits/rejected": 33570604.8, + "logps/chosen": -176.07003173828124, + "logps/rejected": -145.513232421875, + "loss": 0.4901569366455078, + "rewards/chosen": 0.2223743438720703, + "rewards/margins": 0.078991961479187, + "rewards/rejected": 0.1433823823928833, + "step": 250 + }, + { + "epoch": 0.4, + "eval_kl": 1.8728376626968384, + "eval_logits/chosen": 37098446.848, + "eval_logits/rejected": 36913745.92, + "eval_logps/chosen": -153.93390625, + "eval_logps/rejected": -148.244546875, + "eval_loss": 0.4982966184616089, + "eval_rewards/chosen": 0.18562066650390624, + "eval_rewards/margins": 0.013686828613281243, + "eval_rewards/rejected": 0.171933837890625, + "eval_runtime": 212.9007, + "eval_samples_per_second": 4.697, + "eval_steps_per_second": 2.349, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 1.585038423538208, + "kl": 1.6053378582000732, + "learning_rate": 4.404444444444445e-06, + "logits/chosen": 28695961.6, + "logits/rejected": 27433849.6, + "logps/chosen": -129.23218994140626, + "logps/rejected": -132.16243896484374, + "loss": 0.4940999984741211, + "rewards/chosen": 0.16844781637191772, + "rewards/margins": 0.04744429588317871, + "rewards/rejected": 0.12100352048873901, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 2.185063362121582, + "kl": 2.0674309730529785, + "learning_rate": 4.360000000000001e-06, + "logits/chosen": 37351123.2, + "logits/rejected": 37435251.2, + "logps/chosen": -157.37740478515624, + "logps/rejected": -152.574072265625, + "loss": 0.4948906421661377, + "rewards/chosen": 0.18760323524475098, + "rewards/margins": 0.04087167978286743, + "rewards/rejected": 0.14673155546188354, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 1.842838168144226, + "kl": 1.880658745765686, + "learning_rate": 4.315555555555556e-06, + "logits/chosen": 31317900.8, + "logits/rejected": 25257848.0, + "logps/chosen": -168.01707763671874, + "logps/rejected": -140.544775390625, + "loss": 0.49900665283203127, + "rewards/chosen": 0.16304240226745606, + "rewards/margins": 0.008140754699707042, + "rewards/rejected": 0.15490164756774902, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 1.4501603841781616, + "kl": 2.7841668128967285, + "learning_rate": 4.271111111111111e-06, + "logits/chosen": 29001040.0, + "logits/rejected": 28279708.8, + "logps/chosen": -144.5850830078125, + "logps/rejected": -147.846533203125, + "loss": 0.48676314353942873, + "rewards/chosen": 0.322883677482605, + "rewards/margins": 0.10617766380310059, + "rewards/rejected": 0.2167060136795044, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 2.2522799968719482, + "kl": 2.674313545227051, + "learning_rate": 4.226666666666667e-06, + "logits/chosen": 39827664.0, + "logits/rejected": 40046345.6, + "logps/chosen": -168.2622314453125, + "logps/rejected": -181.0140869140625, + "loss": 0.4900949001312256, + "rewards/chosen": 0.22891669273376464, + "rewards/margins": 0.08893496990203856, + "rewards/rejected": 0.13998172283172608, + "step": 300 + }, + { + "epoch": 0.48, + "eval_kl": 2.443347215652466, + "eval_logits/chosen": 35836559.36, + "eval_logits/rejected": 35640664.064, + "eval_logps/chosen": -153.67809375, + "eval_logps/rejected": -147.97525, + "eval_loss": 0.49841761589050293, + "eval_rewards/chosen": 0.21120219421386718, + "eval_rewards/margins": 0.012337142944335938, + "eval_rewards/rejected": 0.19886505126953125, + "eval_runtime": 212.5056, + "eval_samples_per_second": 4.706, + "eval_steps_per_second": 2.353, + "step": 300 + }, + { + "epoch": 0.496, + "grad_norm": 1.8431649208068848, + "kl": 2.4258689880371094, + "learning_rate": 4.182222222222222e-06, + "logits/chosen": 31914422.4, + "logits/rejected": 32899113.6, + "logps/chosen": -146.6043701171875, + "logps/rejected": -155.2097412109375, + "loss": 0.5078470706939697, + "rewards/chosen": 0.1587265133857727, + "rewards/margins": -0.06373668909072877, + "rewards/rejected": 0.22246320247650148, + "step": 310 + }, + { + "epoch": 0.512, + "grad_norm": 2.756876230239868, + "kl": 1.3744744062423706, + "learning_rate": 4.1377777777777784e-06, + "logits/chosen": 35366806.4, + "logits/rejected": 33021190.4, + "logps/chosen": -161.6685302734375, + "logps/rejected": -134.85858154296875, + "loss": 0.5033583641052246, + "rewards/chosen": 0.047298938035964966, + "rewards/margins": -0.02729131579399109, + "rewards/rejected": 0.07459025382995606, + "step": 320 + }, + { + "epoch": 0.528, + "grad_norm": 1.6488304138183594, + "kl": 1.8090463876724243, + "learning_rate": 4.093333333333334e-06, + "logits/chosen": 44740480.0, + "logits/rejected": 41858704.0, + "logps/chosen": -153.429052734375, + "logps/rejected": -145.27188720703126, + "loss": 0.4838115692138672, + "rewards/chosen": 0.16980862617492676, + "rewards/margins": 0.13209896087646483, + "rewards/rejected": 0.03770966529846191, + "step": 330 + }, + { + "epoch": 0.544, + "grad_norm": 1.8334484100341797, + "kl": 1.624638557434082, + "learning_rate": 4.04888888888889e-06, + "logits/chosen": 36759168.0, + "logits/rejected": 37663475.2, + "logps/chosen": -135.3531005859375, + "logps/rejected": -144.48751220703124, + "loss": 0.49995737075805663, + "rewards/chosen": 0.11444320678710937, + "rewards/margins": -0.002312994003295904, + "rewards/rejected": 0.11675620079040527, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 1.7367093563079834, + "kl": 2.787097930908203, + "learning_rate": 4.004444444444445e-06, + "logits/chosen": 35158457.6, + "logits/rejected": 34068956.8, + "logps/chosen": -134.371630859375, + "logps/rejected": -169.529150390625, + "loss": 0.5052088737487793, + "rewards/chosen": 0.2354206323623657, + "rewards/margins": -0.04196789264678957, + "rewards/rejected": 0.2773885250091553, + "step": 350 + }, + { + "epoch": 0.56, + "eval_kl": 2.2334909439086914, + "eval_logits/chosen": 35659685.888, + "eval_logits/rejected": 35460202.496, + "eval_logps/chosen": -154.1329375, + "eval_logps/rejected": -148.447578125, + "eval_loss": 0.4981686472892761, + "eval_rewards/chosen": 0.16571835327148438, + "eval_rewards/margins": 0.014086547851562492, + "eval_rewards/rejected": 0.15163180541992188, + "eval_runtime": 210.7265, + "eval_samples_per_second": 4.745, + "eval_steps_per_second": 2.373, + "step": 350 + }, + { + "epoch": 0.576, + "grad_norm": 2.0100009441375732, + "kl": 2.0726542472839355, + "learning_rate": 3.96e-06, + "logits/chosen": 37896438.4, + "logits/rejected": 35973884.8, + "logps/chosen": -147.7560546875, + "logps/rejected": -113.3482177734375, + "loss": 0.48930912017822265, + "rewards/chosen": 0.16988544464111327, + "rewards/margins": 0.08941116333007812, + "rewards/rejected": 0.08047428131103515, + "step": 360 + }, + { + "epoch": 0.592, + "grad_norm": 1.6454890966415405, + "kl": 2.1270346641540527, + "learning_rate": 3.9155555555555554e-06, + "logits/chosen": 34360019.2, + "logits/rejected": 34796140.8, + "logps/chosen": -147.8162353515625, + "logps/rejected": -151.31751708984376, + "loss": 0.5053381443023681, + "rewards/chosen": 0.08656104803085327, + "rewards/margins": -0.053690028190612804, + "rewards/rejected": 0.14025107622146607, + "step": 370 + }, + { + "epoch": 0.608, + "grad_norm": 2.241021156311035, + "kl": 2.6244335174560547, + "learning_rate": 3.8711111111111115e-06, + "logits/chosen": 31317593.6, + "logits/rejected": 32079606.4, + "logps/chosen": -155.6551513671875, + "logps/rejected": -167.29332275390624, + "loss": 0.5026909828186035, + "rewards/chosen": 0.18726186752319335, + "rewards/margins": -0.035893630981445324, + "rewards/rejected": 0.22315549850463867, + "step": 380 + }, + { + "epoch": 0.624, + "grad_norm": 1.744504690170288, + "kl": 2.1470510959625244, + "learning_rate": 3.826666666666667e-06, + "logits/chosen": 27632387.2, + "logits/rejected": 26959638.4, + "logps/chosen": -176.283740234375, + "logps/rejected": -151.25721435546876, + "loss": 0.48785767555236814, + "rewards/chosen": 0.16686009168624877, + "rewards/margins": 0.09664145708084106, + "rewards/rejected": 0.07021863460540771, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 2.052776575088501, + "kl": 1.9149287939071655, + "learning_rate": 3.782222222222223e-06, + "logits/chosen": 38708992.0, + "logits/rejected": 36759104.0, + "logps/chosen": -150.0161376953125, + "logps/rejected": -137.032666015625, + "loss": 0.48592147827148435, + "rewards/chosen": 0.13577580451965332, + "rewards/margins": 0.11200562268495559, + "rewards/rejected": 0.023770181834697722, + "step": 400 + }, + { + "epoch": 0.64, + "eval_kl": 1.9213757514953613, + "eval_logits/chosen": 34004680.704, + "eval_logits/rejected": 33861222.4, + "eval_logps/chosen": -155.130359375, + "eval_logps/rejected": -149.4599375, + "eval_loss": 0.4978408217430115, + "eval_rewards/chosen": 0.06597476959228515, + "eval_rewards/margins": 0.015579383850097654, + "eval_rewards/rejected": 0.0503953857421875, + "eval_runtime": 215.3729, + "eval_samples_per_second": 4.643, + "eval_steps_per_second": 2.322, + "step": 400 + }, + { + "epoch": 0.656, + "grad_norm": 1.505508542060852, + "kl": 2.1655712127685547, + "learning_rate": 3.737777777777778e-06, + "logits/chosen": 39197555.2, + "logits/rejected": 36953779.2, + "logps/chosen": -148.135498046875, + "logps/rejected": -150.240234375, + "loss": 0.48557405471801757, + "rewards/chosen": 0.17278852462768554, + "rewards/margins": 0.12511927187442778, + "rewards/rejected": 0.04766925275325775, + "step": 410 + }, + { + "epoch": 0.672, + "grad_norm": 2.4999916553497314, + "kl": 1.624707579612732, + "learning_rate": 3.6933333333333337e-06, + "logits/chosen": 27496678.4, + "logits/rejected": 26063419.2, + "logps/chosen": -165.584228515625, + "logps/rejected": -133.94266357421876, + "loss": 0.49836010932922364, + "rewards/chosen": -0.088151615858078, + "rewards/margins": -0.013219672441482547, + "rewards/rejected": -0.07493194341659545, + "step": 420 + }, + { + "epoch": 0.688, + "grad_norm": 1.6226006746292114, + "kl": 1.6943508386611938, + "learning_rate": 3.648888888888889e-06, + "logits/chosen": 28216393.6, + "logits/rejected": 26552371.2, + "logps/chosen": -172.898681640625, + "logps/rejected": -121.724560546875, + "loss": 0.4999542236328125, + "rewards/chosen": -0.05118745565414429, + "rewards/margins": -0.011095824837684634, + "rewards/rejected": -0.04009163081645965, + "step": 430 + }, + { + "epoch": 0.704, + "grad_norm": 2.3899142742156982, + "kl": 2.0509917736053467, + "learning_rate": 3.604444444444445e-06, + "logits/chosen": 35301628.8, + "logits/rejected": 36549334.4, + "logps/chosen": -150.047705078125, + "logps/rejected": -170.48095703125, + "loss": 0.504191255569458, + "rewards/chosen": 0.07901791334152222, + "rewards/margins": -0.03352437019348144, + "rewards/rejected": 0.11254228353500366, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 1.915216088294983, + "kl": 2.2649385929107666, + "learning_rate": 3.5600000000000002e-06, + "logits/chosen": 27320227.2, + "logits/rejected": 30460025.6, + "logps/chosen": -100.75572509765625, + "logps/rejected": -155.90166015625, + "loss": 0.5083817481994629, + "rewards/chosen": 0.032135069370269775, + "rewards/margins": -0.08187388181686402, + "rewards/rejected": 0.11400895118713379, + "step": 450 + }, + { + "epoch": 0.72, + "eval_kl": 2.8508355617523193, + "eval_logits/chosen": 33711845.376, + "eval_logits/rejected": 33509806.08, + "eval_logps/chosen": -153.81721875, + "eval_logps/rejected": -148.221515625, + "eval_loss": 0.49701353907585144, + "eval_rewards/chosen": 0.19728890991210937, + "eval_rewards/margins": 0.02305152893066406, + "eval_rewards/rejected": 0.1742373809814453, + "eval_runtime": 210.8637, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 2.371, + "step": 450 + }, + { + "epoch": 0.736, + "grad_norm": 2.420854091644287, + "kl": 3.208031415939331, + "learning_rate": 3.515555555555556e-06, + "logits/chosen": 39778067.2, + "logits/rejected": 36642828.8, + "logps/chosen": -164.1375732421875, + "logps/rejected": -180.76805419921874, + "loss": 0.49746012687683105, + "rewards/chosen": 0.20578148365020751, + "rewards/margins": 0.009352195262908924, + "rewards/rejected": 0.1964292883872986, + "step": 460 + }, + { + "epoch": 0.752, + "grad_norm": 1.615047812461853, + "kl": 2.750185489654541, + "learning_rate": 3.471111111111111e-06, + "logits/chosen": 47119552.0, + "logits/rejected": 44422067.2, + "logps/chosen": -175.47635498046876, + "logps/rejected": -178.90897216796876, + "loss": 0.5061192512512207, + "rewards/chosen": -0.03337647318840027, + "rewards/margins": -0.035451799631118774, + "rewards/rejected": 0.0020753264427185057, + "step": 470 + }, + { + "epoch": 0.768, + "grad_norm": 2.1674153804779053, + "kl": 1.8968321084976196, + "learning_rate": 3.426666666666667e-06, + "logits/chosen": 37006256.0, + "logits/rejected": 36102249.6, + "logps/chosen": -153.7917724609375, + "logps/rejected": -165.3869140625, + "loss": 0.47748627662658694, + "rewards/chosen": 0.003384724259376526, + "rewards/margins": 0.18434576094150543, + "rewards/rejected": -0.1809610366821289, + "step": 480 + }, + { + "epoch": 0.784, + "grad_norm": 1.6082895994186401, + "kl": 2.0536258220672607, + "learning_rate": 3.3822222222222224e-06, + "logits/chosen": 22186331.2, + "logits/rejected": 20350340.8, + "logps/chosen": -151.07054443359374, + "logps/rejected": -150.1708740234375, + "loss": 0.4888655185699463, + "rewards/chosen": -0.01819072961807251, + "rewards/margins": 0.14195933341979983, + "rewards/rejected": -0.16015006303787233, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 2.310601234436035, + "kl": 3.129138469696045, + "learning_rate": 3.337777777777778e-06, + "logits/chosen": 37795449.6, + "logits/rejected": 35220675.2, + "logps/chosen": -181.1957763671875, + "logps/rejected": -132.0344970703125, + "loss": 0.48585872650146483, + "rewards/chosen": 0.3015714168548584, + "rewards/margins": 0.12076919078826903, + "rewards/rejected": 0.18080222606658936, + "step": 500 + }, + { + "epoch": 0.8, + "eval_kl": 2.6557276248931885, + "eval_logits/chosen": 32642598.912, + "eval_logits/rejected": 32467406.848, + "eval_logps/chosen": -154.477703125, + "eval_logps/rejected": -148.869640625, + "eval_loss": 0.49703630805015564, + "eval_rewards/chosen": 0.13124107360839843, + "eval_rewards/margins": 0.021815811157226556, + "eval_rewards/rejected": 0.10942526245117187, + "eval_runtime": 211.0284, + "eval_samples_per_second": 4.739, + "eval_steps_per_second": 2.369, + "step": 500 + }, + { + "epoch": 0.816, + "grad_norm": 2.234365224838257, + "kl": 1.7492139339447021, + "learning_rate": 3.2933333333333333e-06, + "logits/chosen": 39694166.4, + "logits/rejected": 40852192.0, + "logps/chosen": -148.5533447265625, + "logps/rejected": -162.5984375, + "loss": 0.4872898101806641, + "rewards/chosen": 0.013645458221435546, + "rewards/margins": 0.10560911893844604, + "rewards/rejected": -0.0919636607170105, + "step": 510 + }, + { + "epoch": 0.832, + "grad_norm": 2.3625874519348145, + "kl": 3.2840332984924316, + "learning_rate": 3.2488888888888894e-06, + "logits/chosen": 37915008.0, + "logits/rejected": 36570806.4, + "logps/chosen": -157.36220703125, + "logps/rejected": -164.6677734375, + "loss": 0.4821781635284424, + "rewards/chosen": 0.32392158508300783, + "rewards/margins": 0.17874917984008792, + "rewards/rejected": 0.1451724052429199, + "step": 520 + }, + { + "epoch": 0.848, + "grad_norm": 2.0702106952667236, + "kl": 2.1847689151763916, + "learning_rate": 3.2044444444444446e-06, + "logits/chosen": 33132352.0, + "logits/rejected": 32190089.6, + "logps/chosen": -144.52484130859375, + "logps/rejected": -179.6159912109375, + "loss": 0.5058434486389161, + "rewards/chosen": -0.07189960479736328, + "rewards/margins": -0.031065639853477475, + "rewards/rejected": -0.0408339649438858, + "step": 530 + }, + { + "epoch": 0.864, + "grad_norm": 2.4268131256103516, + "kl": 3.71620512008667, + "learning_rate": 3.1600000000000002e-06, + "logits/chosen": 27596662.4, + "logits/rejected": 26981862.4, + "logps/chosen": -150.15238037109376, + "logps/rejected": -159.4679443359375, + "loss": 0.46803932189941405, + "rewards/chosen": 0.3402720928192139, + "rewards/margins": 0.2841139912605286, + "rewards/rejected": 0.0561581015586853, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 2.0024309158325195, + "kl": 2.6835620403289795, + "learning_rate": 3.1155555555555555e-06, + "logits/chosen": 32613926.4, + "logits/rejected": 33945558.4, + "logps/chosen": -157.42744140625, + "logps/rejected": -171.3582275390625, + "loss": 0.49735183715820314, + "rewards/chosen": -0.0414805144071579, + "rewards/margins": 0.10379274189472197, + "rewards/rejected": -0.14527325630187987, + "step": 550 + }, + { + "epoch": 0.88, + "eval_kl": 2.534240245819092, + "eval_logits/chosen": 30600110.08, + "eval_logits/rejected": 30418843.648, + "eval_logps/chosen": -155.587328125, + "eval_logps/rejected": -149.98478125, + "eval_loss": 0.49648168683052063, + "eval_rewards/chosen": 0.02027870178222656, + "eval_rewards/margins": 0.022367488861083983, + "eval_rewards/rejected": -0.002088787078857422, + "eval_runtime": 210.9141, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 2.371, + "step": 550 + }, + { + "epoch": 0.896, + "grad_norm": 2.065328598022461, + "kl": 2.3084464073181152, + "learning_rate": 3.0711111111111115e-06, + "logits/chosen": 26840976.0, + "logits/rejected": 25480748.8, + "logps/chosen": -165.23271484375, + "logps/rejected": -152.96129150390624, + "loss": 0.49146738052368166, + "rewards/chosen": 0.038733655214309694, + "rewards/margins": 0.05340470671653748, + "rewards/rejected": -0.014671051502227783, + "step": 560 + }, + { + "epoch": 0.912, + "grad_norm": 1.7352122068405151, + "kl": 1.976243019104004, + "learning_rate": 3.0266666666666668e-06, + "logits/chosen": 30859900.8, + "logits/rejected": 29282716.8, + "logps/chosen": -153.5951904296875, + "logps/rejected": -138.8325927734375, + "loss": 0.48922386169433596, + "rewards/chosen": -0.16246808767318727, + "rewards/margins": 0.173832905292511, + "rewards/rejected": -0.33630099296569826, + "step": 570 + }, + { + "epoch": 0.928, + "grad_norm": 1.950156331062317, + "kl": 3.2299084663391113, + "learning_rate": 2.9822222222222224e-06, + "logits/chosen": 37093142.4, + "logits/rejected": 35527721.6, + "logps/chosen": -151.84559326171876, + "logps/rejected": -162.35234375, + "loss": 0.48923511505126954, + "rewards/chosen": -0.03407045304775238, + "rewards/margins": 0.13557693064212797, + "rewards/rejected": -0.16964738368988036, + "step": 580 + }, + { + "epoch": 0.944, + "grad_norm": 2.4236228466033936, + "kl": 3.0165860652923584, + "learning_rate": 2.937777777777778e-06, + "logits/chosen": 21686569.6, + "logits/rejected": 22070460.8, + "logps/chosen": -130.4562744140625, + "logps/rejected": -136.6388916015625, + "loss": 0.4813654899597168, + "rewards/chosen": 0.219277286529541, + "rewards/margins": 0.19798394441604614, + "rewards/rejected": 0.021293342113494873, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 2.492583990097046, + "kl": 2.3715996742248535, + "learning_rate": 2.8933333333333337e-06, + "logits/chosen": 20666136.0, + "logits/rejected": 23334753.6, + "logps/chosen": -153.04947509765626, + "logps/rejected": -153.43118896484376, + "loss": 0.522442102432251, + "rewards/chosen": -0.225014066696167, + "rewards/margins": -0.17056108117103577, + "rewards/rejected": -0.054452985525131226, + "step": 600 + }, + { + "epoch": 0.96, + "eval_kl": 2.428438425064087, + "eval_logits/chosen": 29508644.864, + "eval_logits/rejected": 29339983.872, + "eval_logps/chosen": -156.279359375, + "eval_logps/rejected": -150.735375, + "eval_loss": 0.49547284841537476, + "eval_rewards/chosen": -0.048923690795898436, + "eval_rewards/margins": 0.02822325134277344, + "eval_rewards/rejected": -0.07714694213867188, + "eval_runtime": 211.6125, + "eval_samples_per_second": 4.726, + "eval_steps_per_second": 2.363, + "step": 600 + }, + { + "epoch": 0.976, + "grad_norm": 1.849755883216858, + "kl": 4.1999735832214355, + "learning_rate": 2.8488888888888894e-06, + "logits/chosen": 35977270.4, + "logits/rejected": 32739616.0, + "logps/chosen": -196.6281982421875, + "logps/rejected": -171.48328857421876, + "loss": 0.4923978805541992, + "rewards/chosen": 0.14797959327697754, + "rewards/margins": 0.12893219590187074, + "rewards/rejected": 0.01904739737510681, + "step": 610 + }, + { + "epoch": 0.992, + "grad_norm": 2.2259228229522705, + "kl": 2.290156602859497, + "learning_rate": 2.8044444444444446e-06, + "logits/chosen": 18237147.2, + "logits/rejected": 16839369.6, + "logps/chosen": -143.46595458984376, + "logps/rejected": -140.927001953125, + "loss": 0.4779216766357422, + "rewards/chosen": 0.04038125276565552, + "rewards/margins": 0.21843221187591552, + "rewards/rejected": -0.17805095911026, + "step": 620 + }, + { + "epoch": 1.008, + "grad_norm": 1.9715921878814697, + "kl": 2.682957172393799, + "learning_rate": 2.7600000000000003e-06, + "logits/chosen": 28436249.6, + "logits/rejected": 29027222.4, + "logps/chosen": -155.50318603515626, + "logps/rejected": -156.95904541015625, + "loss": 0.4716252326965332, + "rewards/chosen": 0.20086636543273925, + "rewards/margins": 0.32287436723709106, + "rewards/rejected": -0.12200800180435181, + "step": 630 + }, + { + "epoch": 1.024, + "grad_norm": 1.9490185976028442, + "kl": 2.792576551437378, + "learning_rate": 2.715555555555556e-06, + "logits/chosen": 35926758.4, + "logits/rejected": 36004332.8, + "logps/chosen": -139.33636474609375, + "logps/rejected": -143.99090576171875, + "loss": 0.4631006717681885, + "rewards/chosen": 0.2187732219696045, + "rewards/margins": 0.31144561171531676, + "rewards/rejected": -0.09267238974571228, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 2.6189019680023193, + "kl": 2.2690906524658203, + "learning_rate": 2.6711111111111116e-06, + "logits/chosen": 32591907.2, + "logits/rejected": 32146668.8, + "logps/chosen": -138.1519775390625, + "logps/rejected": -154.90008544921875, + "loss": 0.47899231910705564, + "rewards/chosen": 0.09387065768241883, + "rewards/margins": 0.19471864104270936, + "rewards/rejected": -0.10084798336029052, + "step": 650 + }, + { + "epoch": 1.04, + "eval_kl": 2.578673839569092, + "eval_logits/chosen": 28904480.768, + "eval_logits/rejected": 28698251.264, + "eval_logps/chosen": -156.2849375, + "eval_logps/rejected": -150.820046875, + "eval_loss": 0.4945332705974579, + "eval_rewards/chosen": -0.049482513427734375, + "eval_rewards/margins": 0.03613288116455079, + "eval_rewards/rejected": -0.08561539459228516, + "eval_runtime": 210.9586, + "eval_samples_per_second": 4.74, + "eval_steps_per_second": 2.37, + "step": 650 + }, + { + "epoch": 1.056, + "grad_norm": 1.7106302976608276, + "kl": 3.502686023712158, + "learning_rate": 2.6266666666666668e-06, + "logits/chosen": 35256064.0, + "logits/rejected": 34360857.6, + "logps/chosen": -150.1806884765625, + "logps/rejected": -148.415771484375, + "loss": 0.5002040386199951, + "rewards/chosen": 0.014912448823451996, + "rewards/margins": 0.010746008902788162, + "rewards/rejected": 0.0041664399206638334, + "step": 660 + }, + { + "epoch": 1.072, + "grad_norm": 2.5150351524353027, + "kl": 1.832489013671875, + "learning_rate": 2.5822222222222224e-06, + "logits/chosen": 34642908.8, + "logits/rejected": 36512502.4, + "logps/chosen": -145.1234375, + "logps/rejected": -152.358642578125, + "loss": 0.4734458923339844, + "rewards/chosen": 0.06824090480804443, + "rewards/margins": 0.24784770011901855, + "rewards/rejected": -0.17960679531097412, + "step": 670 + }, + { + "epoch": 1.088, + "grad_norm": 1.9682056903839111, + "kl": 2.000276565551758, + "learning_rate": 2.537777777777778e-06, + "logits/chosen": 27413356.8, + "logits/rejected": 24885939.2, + "logps/chosen": -121.7287841796875, + "logps/rejected": -141.73358154296875, + "loss": 0.44886274337768556, + "rewards/chosen": 0.1158550500869751, + "rewards/margins": 0.5507017374038696, + "rewards/rejected": -0.4348466873168945, + "step": 680 + }, + { + "epoch": 1.104, + "grad_norm": 1.9337836503982544, + "kl": 3.4022421836853027, + "learning_rate": 2.4933333333333333e-06, + "logits/chosen": 31807184.0, + "logits/rejected": 30969852.8, + "logps/chosen": -124.63392333984375, + "logps/rejected": -149.852197265625, + "loss": 0.46492581367492675, + "rewards/chosen": 0.14620821475982665, + "rewards/margins": 0.39136860370635984, + "rewards/rejected": -0.2451603889465332, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 2.3496735095977783, + "kl": 2.625760078430176, + "learning_rate": 2.448888888888889e-06, + "logits/chosen": 31142732.8, + "logits/rejected": 31784934.4, + "logps/chosen": -136.10140380859374, + "logps/rejected": -160.45810546875, + "loss": 0.4712203025817871, + "rewards/chosen": 0.030043387413024904, + "rewards/margins": 0.2471763849258423, + "rewards/rejected": -0.2171329975128174, + "step": 700 + }, + { + "epoch": 1.12, + "eval_kl": 2.867997169494629, + "eval_logits/chosen": 28665548.8, + "eval_logits/rejected": 28371339.264, + "eval_logps/chosen": -155.87825, + "eval_logps/rejected": -150.4365625, + "eval_loss": 0.49442729353904724, + "eval_rewards/chosen": -0.008813528060913086, + "eval_rewards/margins": 0.038452112197875976, + "eval_rewards/rejected": -0.047265640258789064, + "eval_runtime": 210.8987, + "eval_samples_per_second": 4.742, + "eval_steps_per_second": 2.371, + "step": 700 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 1.8647392988204956, + "kl": 3.5727601051330566, + "learning_rate": 2.4044444444444446e-06, + "logits/chosen": 23887081.6, + "logits/rejected": 20803046.4, + "logps/chosen": -186.822802734375, + "logps/rejected": -185.6416259765625, + "loss": 0.43730711936950684, + "rewards/chosen": 0.37336575984954834, + "rewards/margins": 0.6151942014694214, + "rewards/rejected": -0.24182844161987305, + "step": 710 + }, + { + "epoch": 1.152, + "grad_norm": 1.630876064300537, + "kl": 3.1620936393737793, + "learning_rate": 2.3600000000000003e-06, + "logits/chosen": 31887926.4, + "logits/rejected": 30896633.6, + "logps/chosen": -148.16866455078124, + "logps/rejected": -147.6019287109375, + "loss": 0.46859025955200195, + "rewards/chosen": 0.26157245635986326, + "rewards/margins": 0.22991548180580137, + "rewards/rejected": 0.03165697455406189, + "step": 720 + }, + { + "epoch": 1.168, + "grad_norm": 1.9033029079437256, + "kl": 2.6529040336608887, + "learning_rate": 2.3155555555555555e-06, + "logits/chosen": 21981940.8, + "logits/rejected": 20872329.6, + "logps/chosen": -118.38714599609375, + "logps/rejected": -131.70751953125, + "loss": 0.46525821685791013, + "rewards/chosen": -0.042069154977798465, + "rewards/margins": 0.3900075852870941, + "rewards/rejected": -0.43207674026489257, + "step": 730 + }, + { + "epoch": 1.184, + "grad_norm": 1.7811493873596191, + "kl": 4.9487690925598145, + "learning_rate": 2.2711111111111116e-06, + "logits/chosen": 32023660.8, + "logits/rejected": 32334473.6, + "logps/chosen": -160.63001708984376, + "logps/rejected": -155.8528076171875, + "loss": 0.4717572212219238, + "rewards/chosen": 0.43720192909240724, + "rewards/margins": 0.3068490982055664, + "rewards/rejected": 0.13035283088684083, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 2.2068121433258057, + "kl": 5.166882514953613, + "learning_rate": 2.226666666666667e-06, + "logits/chosen": 37952841.6, + "logits/rejected": 37582118.4, + "logps/chosen": -158.07510986328126, + "logps/rejected": -136.27884521484376, + "loss": 0.45236787796020506, + "rewards/chosen": 0.5586390972137452, + "rewards/margins": 0.447090494632721, + "rewards/rejected": 0.11154860258102417, + "step": 750 + }, + { + "epoch": 1.2, + "eval_kl": 4.169778823852539, + "eval_logits/chosen": 30752735.232, + "eval_logits/rejected": 30298806.272, + "eval_logps/chosen": -153.261625, + "eval_logps/rejected": -147.9041875, + "eval_loss": 0.49414026737213135, + "eval_rewards/chosen": 0.2528500061035156, + "eval_rewards/margins": 0.046879043579101526, + "eval_rewards/rejected": 0.20597096252441408, + "eval_runtime": 210.5527, + "eval_samples_per_second": 4.749, + "eval_steps_per_second": 2.375, + "step": 750 + }, + { + "epoch": 1.216, + "grad_norm": 1.7532247304916382, + "kl": 5.59327507019043, + "learning_rate": 2.1822222222222225e-06, + "logits/chosen": 34789904.0, + "logits/rejected": 35887366.4, + "logps/chosen": -140.38455810546876, + "logps/rejected": -154.4322021484375, + "loss": 0.4671647548675537, + "rewards/chosen": 0.6628150463104248, + "rewards/margins": 0.32444992065429684, + "rewards/rejected": 0.3383651256561279, + "step": 760 + }, + { + "epoch": 1.232, + "grad_norm": 1.8754905462265015, + "kl": 3.97855806350708, + "learning_rate": 2.137777777777778e-06, + "logits/chosen": 27867337.6, + "logits/rejected": 29447075.2, + "logps/chosen": -128.08131103515626, + "logps/rejected": -140.113671875, + "loss": 0.47170114517211914, + "rewards/chosen": 0.3410197257995605, + "rewards/margins": 0.21830989122390745, + "rewards/rejected": 0.12270983457565307, + "step": 770 + }, + { + "epoch": 1.248, + "grad_norm": 2.736323833465576, + "kl": 4.552127838134766, + "learning_rate": 2.0933333333333338e-06, + "logits/chosen": 33397673.6, + "logits/rejected": 34484364.8, + "logps/chosen": -137.85001220703126, + "logps/rejected": -139.0888427734375, + "loss": 0.45501227378845216, + "rewards/chosen": 0.355634069442749, + "rewards/margins": 0.35818901062011715, + "rewards/rejected": -0.0025549411773681642, + "step": 780 + }, + { + "epoch": 1.264, + "grad_norm": 1.9888898134231567, + "kl": 3.6503052711486816, + "learning_rate": 2.048888888888889e-06, + "logits/chosen": 29773580.8, + "logits/rejected": 28645248.0, + "logps/chosen": -151.4457275390625, + "logps/rejected": -138.25706787109374, + "loss": 0.46457924842834475, + "rewards/chosen": 0.29306089878082275, + "rewards/margins": 0.400502347946167, + "rewards/rejected": -0.10744144916534423, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 1.5264906883239746, + "kl": 5.461276531219482, + "learning_rate": 2.0044444444444446e-06, + "logits/chosen": 32210188.8, + "logits/rejected": 31754496.0, + "logps/chosen": -139.758837890625, + "logps/rejected": -154.13162841796876, + "loss": 0.47219176292419435, + "rewards/chosen": 0.6297782897949219, + "rewards/margins": 0.23436756134033204, + "rewards/rejected": 0.39541072845458985, + "step": 800 + }, + { + "epoch": 1.28, + "eval_kl": 4.377986907958984, + "eval_logits/chosen": 31938863.104, + "eval_logits/rejected": 31473901.568, + "eval_logps/chosen": -152.64290625, + "eval_logps/rejected": -147.31334375, + "eval_loss": 0.49390554428100586, + "eval_rewards/chosen": 0.3147206115722656, + "eval_rewards/margins": 0.04966500854492184, + "eval_rewards/rejected": 0.26505560302734377, + "eval_runtime": 210.9274, + "eval_samples_per_second": 4.741, + "eval_steps_per_second": 2.37, + "step": 800 + }, + { + "epoch": 1.296, + "grad_norm": 1.8228152990341187, + "kl": 4.066787242889404, + "learning_rate": 1.9600000000000003e-06, + "logits/chosen": 35085334.4, + "logits/rejected": 34852982.4, + "logps/chosen": -132.74925537109374, + "logps/rejected": -158.0354736328125, + "loss": 0.44358067512512206, + "rewards/chosen": 0.4759791374206543, + "rewards/margins": 0.4843965947628021, + "rewards/rejected": -0.008417457342147827, + "step": 810 + }, + { + "epoch": 1.312, + "grad_norm": 2.1346709728240967, + "kl": 4.6763811111450195, + "learning_rate": 1.915555555555556e-06, + "logits/chosen": 34945942.4, + "logits/rejected": 36177011.2, + "logps/chosen": -132.50538330078126, + "logps/rejected": -161.39248046875, + "loss": 0.47060041427612304, + "rewards/chosen": 0.4741304874420166, + "rewards/margins": 0.28036924600601193, + "rewards/rejected": 0.19376124143600465, + "step": 820 + }, + { + "epoch": 1.328, + "grad_norm": 2.9988934993743896, + "kl": 3.6898865699768066, + "learning_rate": 1.8711111111111114e-06, + "logits/chosen": 36008406.4, + "logits/rejected": 37406822.4, + "logps/chosen": -136.77215576171875, + "logps/rejected": -147.55556640625, + "loss": 0.46726012229919434, + "rewards/chosen": 0.3493239164352417, + "rewards/margins": 0.2635639488697052, + "rewards/rejected": 0.0857599675655365, + "step": 830 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 1.762466311454773, + "kl": 4.097973346710205, + "learning_rate": 1.8266666666666668e-06, + "logits/chosen": 27684899.2, + "logits/rejected": 28798355.2, + "logps/chosen": -118.63145751953125, + "logps/rejected": -141.59239501953124, + "loss": 0.47371621131896974, + "rewards/chosen": 0.31828267574310304, + "rewards/margins": 0.29973786473274233, + "rewards/rejected": 0.018544811010360717, + "step": 840 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.833742380142212, + "kl": 4.407935619354248, + "learning_rate": 1.7822222222222225e-06, + "logits/chosen": 33654512.0, + "logits/rejected": 33191171.2, + "logps/chosen": -151.17977294921874, + "logps/rejected": -145.46224365234374, + "loss": 0.47383294105529783, + "rewards/chosen": 0.16252880096435546, + "rewards/margins": 0.22959471344947813, + "rewards/rejected": -0.06706591248512268, + "step": 850 + }, + { + "epoch": 1.3599999999999999, + "eval_kl": 3.920696973800659, + "eval_logits/chosen": 31450167.296, + "eval_logits/rejected": 31058980.864, + "eval_logps/chosen": -153.5180625, + "eval_logps/rejected": -148.185328125, + "eval_loss": 0.49386003613471985, + "eval_rewards/chosen": 0.22720516967773438, + "eval_rewards/margins": 0.04934913635253907, + "eval_rewards/rejected": 0.1778560333251953, + "eval_runtime": 211.4228, + "eval_samples_per_second": 4.73, + "eval_steps_per_second": 2.365, + "step": 850 + }, + { + "epoch": 1.376, + "grad_norm": 2.183370351791382, + "kl": 4.991496562957764, + "learning_rate": 1.737777777777778e-06, + "logits/chosen": 47378035.2, + "logits/rejected": 45498502.4, + "logps/chosen": -179.4380126953125, + "logps/rejected": -169.1109375, + "loss": 0.4720784664154053, + "rewards/chosen": 0.43294267654418944, + "rewards/margins": 0.21851625442504882, + "rewards/rejected": 0.21442642211914062, + "step": 860 + }, + { + "epoch": 1.392, + "grad_norm": 2.1924335956573486, + "kl": 5.118699073791504, + "learning_rate": 1.6933333333333336e-06, + "logits/chosen": 29969433.6, + "logits/rejected": 28471737.6, + "logps/chosen": -192.4035888671875, + "logps/rejected": -151.5296875, + "loss": 0.46271333694458006, + "rewards/chosen": 0.5544761657714844, + "rewards/margins": 0.31934370994567873, + "rewards/rejected": 0.23513245582580566, + "step": 870 + }, + { + "epoch": 1.408, + "grad_norm": 2.2703323364257812, + "kl": 2.6695058345794678, + "learning_rate": 1.648888888888889e-06, + "logits/chosen": 26682553.6, + "logits/rejected": 24626574.4, + "logps/chosen": -133.742041015625, + "logps/rejected": -147.77803955078124, + "loss": 0.43658647537231443, + "rewards/chosen": 0.262941312789917, + "rewards/margins": 0.628947639465332, + "rewards/rejected": -0.36600632667541505, + "step": 880 + }, + { + "epoch": 1.424, + "grad_norm": 1.7647465467453003, + "kl": 4.138214111328125, + "learning_rate": 1.6044444444444447e-06, + "logits/chosen": 27142838.4, + "logits/rejected": 26620787.2, + "logps/chosen": -133.81441650390624, + "logps/rejected": -135.07823486328124, + "loss": 0.4540394306182861, + "rewards/chosen": 0.3629532098770142, + "rewards/margins": 0.4822005391120911, + "rewards/rejected": -0.1192473292350769, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 2.0833559036254883, + "kl": 3.821770191192627, + "learning_rate": 1.56e-06, + "logits/chosen": 25844046.4, + "logits/rejected": 22660449.6, + "logps/chosen": -145.89912109375, + "logps/rejected": -130.87916259765626, + "loss": 0.44003853797912595, + "rewards/chosen": 0.4248363018035889, + "rewards/margins": 0.5529402971267701, + "rewards/rejected": -0.12810399532318115, + "step": 900 + }, + { + "epoch": 1.44, + "eval_kl": 4.193332672119141, + "eval_logits/chosen": 32452849.664, + "eval_logits/rejected": 32089155.584, + "eval_logps/chosen": -152.94290625, + "eval_logps/rejected": -147.62978125, + "eval_loss": 0.4937511086463928, + "eval_rewards/chosen": 0.28472021484375, + "eval_rewards/margins": 0.05130918884277341, + "eval_rewards/rejected": 0.23341102600097657, + "eval_runtime": 211.0632, + "eval_samples_per_second": 4.738, + "eval_steps_per_second": 2.369, + "step": 900 + }, + { + "epoch": 1.456, + "grad_norm": 1.9122978448867798, + "kl": 5.044002056121826, + "learning_rate": 1.5155555555555558e-06, + "logits/chosen": 28371500.8, + "logits/rejected": 26429561.6, + "logps/chosen": -170.57901611328126, + "logps/rejected": -170.0248046875, + "loss": 0.47440948486328127, + "rewards/chosen": 0.4807882308959961, + "rewards/margins": 0.20360822677612306, + "rewards/rejected": 0.27718000411987304, + "step": 910 + }, + { + "epoch": 1.472, + "grad_norm": 2.096123218536377, + "kl": 4.188933372497559, + "learning_rate": 1.4711111111111112e-06, + "logits/chosen": 34476659.2, + "logits/rejected": 31294201.6, + "logps/chosen": -165.4956787109375, + "logps/rejected": -143.78316650390624, + "loss": 0.44411406517028806, + "rewards/chosen": 0.5467419147491455, + "rewards/margins": 0.49016233682632443, + "rewards/rejected": 0.05657957792282105, + "step": 920 + }, + { + "epoch": 1.488, + "grad_norm": 2.136502265930176, + "kl": 5.916023254394531, + "learning_rate": 1.4266666666666668e-06, + "logits/chosen": 28660502.4, + "logits/rejected": 31565062.4, + "logps/chosen": -145.20224609375, + "logps/rejected": -184.6095947265625, + "loss": 0.4747187614440918, + "rewards/chosen": 0.5481678962707519, + "rewards/margins": 0.3383267402648925, + "rewards/rejected": 0.20984115600585937, + "step": 930 + }, + { + "epoch": 1.504, + "grad_norm": 3.4681079387664795, + "kl": 3.9104812145233154, + "learning_rate": 1.3822222222222223e-06, + "logits/chosen": 32520064.0, + "logits/rejected": 28152707.2, + "logps/chosen": -149.12630615234374, + "logps/rejected": -132.30379638671874, + "loss": 0.4755962371826172, + "rewards/chosen": 0.34991438388824464, + "rewards/margins": 0.19766778945922853, + "rewards/rejected": 0.1522465944290161, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 2.1049246788024902, + "kl": 4.365435600280762, + "learning_rate": 1.337777777777778e-06, + "logits/chosen": 37928726.4, + "logits/rejected": 36957033.6, + "logps/chosen": -154.08798828125, + "logps/rejected": -146.7669189453125, + "loss": 0.4579151630401611, + "rewards/chosen": 0.46179609298706054, + "rewards/margins": 0.37595014572143554, + "rewards/rejected": 0.085845947265625, + "step": 950 + }, + { + "epoch": 1.52, + "eval_kl": 4.171284198760986, + "eval_logits/chosen": 32833683.456, + "eval_logits/rejected": 32522022.912, + "eval_logps/chosen": -152.9865, + "eval_logps/rejected": -147.679296875, + "eval_loss": 0.4936215281486511, + "eval_rewards/chosen": 0.2803621826171875, + "eval_rewards/margins": 0.051901611328124986, + "eval_rewards/rejected": 0.2284605712890625, + "eval_runtime": 211.1202, + "eval_samples_per_second": 4.737, + "eval_steps_per_second": 2.368, + "step": 950 + }, + { + "epoch": 1.536, + "grad_norm": 2.297563076019287, + "kl": 4.221343994140625, + "learning_rate": 1.2933333333333334e-06, + "logits/chosen": 41744796.8, + "logits/rejected": 40177462.4, + "logps/chosen": -140.44161376953124, + "logps/rejected": -148.71304931640626, + "loss": 0.44645137786865235, + "rewards/chosen": 0.4949165344238281, + "rewards/margins": 0.46811245679855346, + "rewards/rejected": 0.026804077625274658, + "step": 960 + }, + { + "epoch": 1.552, + "grad_norm": 2.0365242958068848, + "kl": 4.4261579513549805, + "learning_rate": 1.248888888888889e-06, + "logits/chosen": 32556515.2, + "logits/rejected": 33512262.4, + "logps/chosen": -133.8440673828125, + "logps/rejected": -171.82977294921875, + "loss": 0.4730066776275635, + "rewards/chosen": 0.4945687294006348, + "rewards/margins": 0.2293097019195557, + "rewards/rejected": 0.2652590274810791, + "step": 970 + }, + { + "epoch": 1.568, + "grad_norm": 1.5643013715744019, + "kl": 4.663653373718262, + "learning_rate": 1.2044444444444447e-06, + "logits/chosen": 32883987.2, + "logits/rejected": 30414611.2, + "logps/chosen": -126.985400390625, + "logps/rejected": -116.391650390625, + "loss": 0.4877506732940674, + "rewards/chosen": 0.48381505012512205, + "rewards/margins": 0.09749135971069334, + "rewards/rejected": 0.3863236904144287, + "step": 980 + }, + { + "epoch": 1.584, + "grad_norm": 2.706939458847046, + "kl": 4.062044620513916, + "learning_rate": 1.1600000000000001e-06, + "logits/chosen": 32006976.0, + "logits/rejected": 31020704.0, + "logps/chosen": -166.25194091796874, + "logps/rejected": -154.0007080078125, + "loss": 0.44759297370910645, + "rewards/chosen": 0.35017178058624265, + "rewards/margins": 0.5452085971832275, + "rewards/rejected": -0.19503681659698485, + "step": 990 + }, + { + "epoch": 1.6, + "grad_norm": 1.8194115161895752, + "kl": 3.5774059295654297, + "learning_rate": 1.1155555555555558e-06, + "logits/chosen": 28698640.0, + "logits/rejected": 29143193.6, + "logps/chosen": -139.20194091796876, + "logps/rejected": -158.261376953125, + "loss": 0.4849833965301514, + "rewards/chosen": 0.1898583173751831, + "rewards/margins": 0.07131674289703369, + "rewards/rejected": 0.11854157447814942, + "step": 1000 + }, + { + "epoch": 1.6, + "eval_kl": 4.220986366271973, + "eval_logits/chosen": 32568942.592, + "eval_logits/rejected": 32248098.816, + "eval_logps/chosen": -152.99434375, + "eval_logps/rejected": -147.7251875, + "eval_loss": 0.4932064116001129, + "eval_rewards/chosen": 0.2795771179199219, + "eval_rewards/margins": 0.055706237792968766, + "eval_rewards/rejected": 0.22387088012695314, + "eval_runtime": 901.9357, + "eval_samples_per_second": 1.109, + "eval_steps_per_second": 0.554, + "step": 1000 + }, + { + "epoch": 1.616, + "grad_norm": 2.4502618312835693, + "kl": 3.438178539276123, + "learning_rate": 1.0711111111111112e-06, + "logits/chosen": 23637744.0, + "logits/rejected": 21885137.6, + "logps/chosen": -154.96070556640626, + "logps/rejected": -135.8352783203125, + "loss": 0.4609940528869629, + "rewards/chosen": 0.33989131450653076, + "rewards/margins": 0.2776340961456299, + "rewards/rejected": 0.06225721836090088, + "step": 1010 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 2.483098268508911, + "kl": 4.026124000549316, + "learning_rate": 1.0266666666666669e-06, + "logits/chosen": 33672102.4, + "logits/rejected": 33149174.4, + "logps/chosen": -172.960546875, + "logps/rejected": -169.11124267578126, + "loss": 0.4497981548309326, + "rewards/chosen": 0.3961763620376587, + "rewards/margins": 0.43121243715286256, + "rewards/rejected": -0.03503607511520386, + "step": 1020 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 1.9396111965179443, + "kl": 3.11645770072937, + "learning_rate": 9.822222222222223e-07, + "logits/chosen": 33916867.2, + "logits/rejected": 29841721.6, + "logps/chosen": -149.1454345703125, + "logps/rejected": -127.8354248046875, + "loss": 0.4286343574523926, + "rewards/chosen": 0.36662404537200927, + "rewards/margins": 0.662821626663208, + "rewards/rejected": -0.2961975812911987, + "step": 1030 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 2.473919153213501, + "kl": 3.834186553955078, + "learning_rate": 9.377777777777778e-07, + "logits/chosen": 34663897.6, + "logits/rejected": 32536246.4, + "logps/chosen": -142.38626708984376, + "logps/rejected": -151.28388671875, + "loss": 0.4545116901397705, + "rewards/chosen": 0.403075122833252, + "rewards/margins": 0.4584430515766144, + "rewards/rejected": -0.055367928743362424, + "step": 1040 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 2.401204824447632, + "kl": 2.8307433128356934, + "learning_rate": 8.933333333333334e-07, + "logits/chosen": 25615622.4, + "logits/rejected": 24212544.0, + "logps/chosen": -194.39169921875, + "logps/rejected": -139.34288330078124, + "loss": 0.48009257316589354, + "rewards/chosen": -0.09855471849441529, + "rewards/margins": 0.04281153678894044, + "rewards/rejected": -0.14136625528335572, + "step": 1050 + }, + { + "epoch": 1.6800000000000002, + "eval_kl": 3.395029306411743, + "eval_logits/chosen": 30017314.816, + "eval_logits/rejected": 29781239.808, + "eval_logps/chosen": -155.0224375, + "eval_logps/rejected": -149.753125, + "eval_loss": 0.4927977919578552, + "eval_rewards/chosen": 0.07676624298095704, + "eval_rewards/margins": 0.0556891098022461, + "eval_rewards/rejected": 0.021077133178710936, + "eval_runtime": 211.7163, + "eval_samples_per_second": 4.723, + "eval_steps_per_second": 2.362, + "step": 1050 + }, + { + "epoch": 1.696, + "grad_norm": 2.213663101196289, + "kl": 2.5274927616119385, + "learning_rate": 8.488888888888889e-07, + "logits/chosen": 21050780.8, + "logits/rejected": 22174214.4, + "logps/chosen": -143.71790771484376, + "logps/rejected": -136.63013916015626, + "loss": 0.4820300579071045, + "rewards/chosen": 0.13502249717712403, + "rewards/margins": 0.1883419156074524, + "rewards/rejected": -0.05331941843032837, + "step": 1060 + }, + { + "epoch": 1.712, + "grad_norm": 2.839602470397949, + "kl": 2.8527681827545166, + "learning_rate": 8.044444444444445e-07, + "logits/chosen": 31860320.0, + "logits/rejected": 34545088.0, + "logps/chosen": -123.39615478515626, + "logps/rejected": -144.3958740234375, + "loss": 0.48537321090698243, + "rewards/chosen": 0.06347188949584961, + "rewards/margins": 0.1278951048851013, + "rewards/rejected": -0.0644232153892517, + "step": 1070 + }, + { + "epoch": 1.728, + "grad_norm": 2.238354206085205, + "kl": 3.4803032875061035, + "learning_rate": 7.6e-07, + "logits/chosen": 30298761.6, + "logits/rejected": 28377660.8, + "logps/chosen": -127.83525390625, + "logps/rejected": -187.0820556640625, + "loss": 0.46401171684265136, + "rewards/chosen": 0.19893896579742432, + "rewards/margins": 0.3267621874809265, + "rewards/rejected": -0.1278232216835022, + "step": 1080 + }, + { + "epoch": 1.744, + "grad_norm": 2.201462507247925, + "kl": 3.8757426738739014, + "learning_rate": 7.155555555555556e-07, + "logits/chosen": 45421788.8, + "logits/rejected": 42151324.8, + "logps/chosen": -175.1874755859375, + "logps/rejected": -169.148291015625, + "loss": 0.48148083686828613, + "rewards/chosen": 0.16172538995742797, + "rewards/margins": 0.2750619053840637, + "rewards/rejected": -0.11333651542663574, + "step": 1090 + }, + { + "epoch": 1.76, + "grad_norm": 1.8805228471755981, + "kl": 3.8151164054870605, + "learning_rate": 6.711111111111111e-07, + "logits/chosen": 31197808.0, + "logits/rejected": 27353356.8, + "logps/chosen": -152.6833740234375, + "logps/rejected": -185.17493896484376, + "loss": 0.4390877723693848, + "rewards/chosen": 0.15401217937469483, + "rewards/margins": 0.5842344522476196, + "rewards/rejected": -0.4302222728729248, + "step": 1100 + }, + { + "epoch": 1.76, + "eval_kl": 3.337947368621826, + "eval_logits/chosen": 29766119.424, + "eval_logits/rejected": 29534177.28, + "eval_logps/chosen": -155.123390625, + "eval_logps/rejected": -149.8595625, + "eval_loss": 0.4927149713039398, + "eval_rewards/chosen": 0.06667286682128906, + "eval_rewards/margins": 0.056239251136779786, + "eval_rewards/rejected": 0.010433615684509278, + "eval_runtime": 211.2573, + "eval_samples_per_second": 4.734, + "eval_steps_per_second": 2.367, + "step": 1100 + }, + { + "epoch": 1.776, + "grad_norm": 2.611490249633789, + "kl": 4.028485298156738, + "learning_rate": 6.266666666666667e-07, + "logits/chosen": 30264723.2, + "logits/rejected": 30910204.8, + "logps/chosen": -174.413818359375, + "logps/rejected": -188.7349853515625, + "loss": 0.4568845272064209, + "rewards/chosen": 0.035149258375167844, + "rewards/margins": 0.44202625155448916, + "rewards/rejected": -0.4068769931793213, + "step": 1110 + }, + { + "epoch": 1.792, + "grad_norm": 2.5337791442871094, + "kl": 3.7211251258850098, + "learning_rate": 5.822222222222223e-07, + "logits/chosen": 26241504.0, + "logits/rejected": 23940459.2, + "logps/chosen": -151.94140625, + "logps/rejected": -115.9620849609375, + "loss": 0.4610316276550293, + "rewards/chosen": 0.3789072036743164, + "rewards/margins": 0.3300951421260834, + "rewards/rejected": 0.04881206154823303, + "step": 1120 + }, + { + "epoch": 1.808, + "grad_norm": 1.5708776712417603, + "kl": 3.1767425537109375, + "learning_rate": 5.377777777777779e-07, + "logits/chosen": 42052073.6, + "logits/rejected": 39844899.2, + "logps/chosen": -169.72109375, + "logps/rejected": -151.2173583984375, + "loss": 0.4554294109344482, + "rewards/chosen": 0.2683689832687378, + "rewards/margins": 0.47904453277587894, + "rewards/rejected": -0.21067554950714112, + "step": 1130 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 2.6482839584350586, + "kl": 2.7335541248321533, + "learning_rate": 4.933333333333334e-07, + "logits/chosen": 37147670.4, + "logits/rejected": 37500460.8, + "logps/chosen": -143.32291259765626, + "logps/rejected": -160.43739013671876, + "loss": 0.46424403190612795, + "rewards/chosen": 0.15881721973419188, + "rewards/margins": 0.3237978339195251, + "rewards/rejected": -0.16498061418533325, + "step": 1140 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 2.6286239624023438, + "kl": 2.9139907360076904, + "learning_rate": 4.488888888888889e-07, + "logits/chosen": 22594232.0, + "logits/rejected": 20993777.6, + "logps/chosen": -151.5849365234375, + "logps/rejected": -194.24169921875, + "loss": 0.4465163230895996, + "rewards/chosen": 0.19300849437713624, + "rewards/margins": 0.5745944738388062, + "rewards/rejected": -0.38158597946166994, + "step": 1150 + }, + { + "epoch": 1.8399999999999999, + "eval_kl": 3.375143051147461, + "eval_logits/chosen": 29445828.608, + "eval_logits/rejected": 29209438.208, + "eval_logps/chosen": -155.187921875, + "eval_logps/rejected": -149.93753125, + "eval_loss": 0.4925803244113922, + "eval_rewards/chosen": 0.06021894836425781, + "eval_rewards/margins": 0.057583449840545656, + "eval_rewards/rejected": 0.0026354985237121583, + "eval_runtime": 211.2149, + "eval_samples_per_second": 4.735, + "eval_steps_per_second": 2.367, + "step": 1150 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 2.6833443641662598, + "kl": 3.5917434692382812, + "learning_rate": 4.0444444444444445e-07, + "logits/chosen": 33232332.8, + "logits/rejected": 32122198.4, + "logps/chosen": -146.055859375, + "logps/rejected": -152.858740234375, + "loss": 0.46697273254394533, + "rewards/chosen": 0.24598314762115478, + "rewards/margins": 0.3101099610328674, + "rewards/rejected": -0.06412681341171264, + "step": 1160 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 2.8079652786254883, + "kl": 5.262423992156982, + "learning_rate": 3.6e-07, + "logits/chosen": 26948995.2, + "logits/rejected": 25488443.2, + "logps/chosen": -135.2403564453125, + "logps/rejected": -163.37462158203124, + "loss": 0.4623889923095703, + "rewards/chosen": 0.43314542770385744, + "rewards/margins": 0.40463062524795534, + "rewards/rejected": 0.0285148024559021, + "step": 1170 + }, + { + "epoch": 1.888, + "grad_norm": 2.987678289413452, + "kl": 2.790511131286621, + "learning_rate": 3.155555555555556e-07, + "logits/chosen": 29730611.2, + "logits/rejected": 29972553.6, + "logps/chosen": -167.71156005859376, + "logps/rejected": -171.79833984375, + "loss": 0.45536341667175295, + "rewards/chosen": 0.05922438502311707, + "rewards/margins": 0.4953587710857391, + "rewards/rejected": -0.43613438606262206, + "step": 1180 + }, + { + "epoch": 1.904, + "grad_norm": 2.937406063079834, + "kl": 3.834505796432495, + "learning_rate": 2.7111111111111114e-07, + "logits/chosen": 30097795.2, + "logits/rejected": 28979769.6, + "logps/chosen": -167.03443603515626, + "logps/rejected": -143.852685546875, + "loss": 0.473237133026123, + "rewards/chosen": 0.20266783237457275, + "rewards/margins": 0.254498028755188, + "rewards/rejected": -0.051830196380615236, + "step": 1190 + }, + { + "epoch": 1.92, + "grad_norm": 2.744769334793091, + "kl": 3.5745787620544434, + "learning_rate": 2.266666666666667e-07, + "logits/chosen": 36340227.2, + "logits/rejected": 34575104.0, + "logps/chosen": -149.65133056640624, + "logps/rejected": -160.3142333984375, + "loss": 0.472868013381958, + "rewards/chosen": 0.06828058958053589, + "rewards/margins": 0.3730572104454041, + "rewards/rejected": -0.3047766208648682, + "step": 1200 + }, + { + "epoch": 1.92, + "eval_kl": 3.3557679653167725, + "eval_logits/chosen": 29278171.136, + "eval_logits/rejected": 29042731.008, + "eval_logps/chosen": -155.309109375, + "eval_logps/rejected": -150.058, + "eval_loss": 0.4926023483276367, + "eval_rewards/chosen": 0.048100093841552734, + "eval_rewards/margins": 0.05750944900512695, + "eval_rewards/rejected": -0.00940935516357422, + "eval_runtime": 211.1245, + "eval_samples_per_second": 4.737, + "eval_steps_per_second": 2.368, + "step": 1200 + }, + { + "epoch": 1.936, + "grad_norm": 2.271873712539673, + "kl": 5.1952033042907715, + "learning_rate": 1.8222222222222226e-07, + "logits/chosen": 23779177.6, + "logits/rejected": 23033209.6, + "logps/chosen": -133.706884765625, + "logps/rejected": -144.84560546875, + "loss": 0.45861096382141114, + "rewards/chosen": 0.4111456871032715, + "rewards/margins": 0.41906877756118777, + "rewards/rejected": -0.00792309045791626, + "step": 1210 + }, + { + "epoch": 1.952, + "grad_norm": 2.38917875289917, + "kl": 3.805723190307617, + "learning_rate": 1.3777777777777778e-07, + "logits/chosen": 23066518.4, + "logits/rejected": 22046275.2, + "logps/chosen": -138.892236328125, + "logps/rejected": -143.47259521484375, + "loss": 0.45859084129333494, + "rewards/chosen": 0.3317830562591553, + "rewards/margins": 0.39049922823905947, + "rewards/rejected": -0.058716171979904176, + "step": 1220 + }, + { + "epoch": 1.968, + "grad_norm": 2.1651015281677246, + "kl": 4.553244113922119, + "learning_rate": 9.333333333333335e-08, + "logits/chosen": 24301694.4, + "logits/rejected": 25114072.0, + "logps/chosen": -190.6584228515625, + "logps/rejected": -167.1634521484375, + "loss": 0.4986537456512451, + "rewards/chosen": 0.06035754680633545, + "rewards/margins": 0.06962958574295045, + "rewards/rejected": -0.00927203893661499, + "step": 1230 + }, + { + "epoch": 1.984, + "grad_norm": 2.175057888031006, + "kl": 3.3482718467712402, + "learning_rate": 4.8888888888888894e-08, + "logits/chosen": 33222752.0, + "logits/rejected": 30686976.0, + "logps/chosen": -152.40816650390624, + "logps/rejected": -147.18714599609376, + "loss": 0.4648073673248291, + "rewards/chosen": 0.17594602108001708, + "rewards/margins": 0.41535995006561277, + "rewards/rejected": -0.23941392898559571, + "step": 1240 + }, + { + "epoch": 2.0, + "grad_norm": 2.4534552097320557, + "kl": 3.8523781299591064, + "learning_rate": 4.444444444444445e-09, + "logits/chosen": 19626976.0, + "logits/rejected": 20057414.4, + "logps/chosen": -150.80908203125, + "logps/rejected": -170.111376953125, + "loss": 0.46182851791381835, + "rewards/chosen": 0.2004603147506714, + "rewards/margins": 0.42479350566864016, + "rewards/rejected": -0.22433319091796874, + "step": 1250 + }, + { + "epoch": 2.0, + "eval_kl": 3.394835948944092, + "eval_logits/chosen": 29359652.864, + "eval_logits/rejected": 29117517.824, + "eval_logps/chosen": -155.215265625, + "eval_logps/rejected": -149.97090625, + "eval_loss": 0.49256065487861633, + "eval_rewards/chosen": 0.05748405456542969, + "eval_rewards/margins": 0.05818451833724976, + "eval_rewards/rejected": -0.0007004637718200684, + "eval_runtime": 212.6145, + "eval_samples_per_second": 4.703, + "eval_steps_per_second": 2.352, + "step": 1250 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/KTO/KTO_5k/lora/checkpoint-1250/training_args.bin b/v5/KTO/KTO_5k/lora/checkpoint-1250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..a4b0593b1fb99fd0ef500fd051a7332500d83f31 --- /dev/null +++ b/v5/KTO/KTO_5k/lora/checkpoint-1250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb38612e474b2b75329a529c7bd7e818140a323dc202e6e5201e7c6648635d30 +size 5649 diff --git a/v5/KTO/gen-output/KTO_10k/data-00000-of-00001.arrow b/v5/KTO/gen-output/KTO_10k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..07aa576523e1bc84b990ae849d892d5b30c91726 --- /dev/null +++ b/v5/KTO/gen-output/KTO_10k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2653f81f65fe34a19776d6f3edddc2fe184bffa58970ca8496b90b51d64f8d9 +size 931120 diff --git a/v5/KTO/gen-output/KTO_10k/dataset_info.json b/v5/KTO/gen-output/KTO_10k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/KTO/gen-output/KTO_10k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_10k/state.json b/v5/KTO/gen-output/KTO_10k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..44d4820654e8e6cd42d893bdf5685cddd5e03e0d --- /dev/null +++ b/v5/KTO/gen-output/KTO_10k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "2237472a3506a64d", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_1k/data-00000-of-00001.arrow b/v5/KTO/gen-output/KTO_1k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..d03f1234932a26ac7b57a7fe58514886a6a2d243 --- /dev/null +++ b/v5/KTO/gen-output/KTO_1k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f902afb1e58f8d3c09155c94b7f4e59fe78aed8675033dd7a2362456aa3969f3 +size 1001544 diff --git a/v5/KTO/gen-output/KTO_1k/dataset_info.json b/v5/KTO/gen-output/KTO_1k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/KTO/gen-output/KTO_1k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_1k/state.json b/v5/KTO/gen-output/KTO_1k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..8d8f92898c4a28fe04c0162bb0f1b62d6fa2417c --- /dev/null +++ b/v5/KTO/gen-output/KTO_1k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "393d42ed5c5cfb7e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_20k/data-00000-of-00001.arrow b/v5/KTO/gen-output/KTO_20k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..6c93e32202aa79a57029e61b1013c9f4bbd282da --- /dev/null +++ b/v5/KTO/gen-output/KTO_20k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09d159f3feffc21647a88b36f7c8fed32a761a52b8e37508f020dcc8b24a4a5d +size 838696 diff --git a/v5/KTO/gen-output/KTO_20k/dataset_info.json b/v5/KTO/gen-output/KTO_20k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/KTO/gen-output/KTO_20k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_20k/state.json b/v5/KTO/gen-output/KTO_20k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..019fdec5fff8f771115d74ffc759b3fa30c5c38f --- /dev/null +++ b/v5/KTO/gen-output/KTO_20k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e0650c3c5ea18967", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_2k/data-00000-of-00001.arrow b/v5/KTO/gen-output/KTO_2k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c04f5564746fa98838ff6967e16ed86f364a55fe --- /dev/null +++ b/v5/KTO/gen-output/KTO_2k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88373ffa51ba4a4c856a465d79bb95ae8e92ea0a74fe5f98e59d5f0f216d3e25 +size 990744 diff --git a/v5/KTO/gen-output/KTO_2k/dataset_info.json b/v5/KTO/gen-output/KTO_2k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/KTO/gen-output/KTO_2k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_2k/state.json b/v5/KTO/gen-output/KTO_2k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..aad9d02f0797bfdae853de222273aa7679aafea6 --- /dev/null +++ b/v5/KTO/gen-output/KTO_2k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "371fe76d2de305a7", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_5k/data-00000-of-00001.arrow b/v5/KTO/gen-output/KTO_5k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..991a1c7d988db728a0a3985adb5f4439315263d8 --- /dev/null +++ b/v5/KTO/gen-output/KTO_5k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9021fb3e317c6b93ac70e160eb6bc595530ac7973aa323ad37a2a05b4af9eed7 +size 953560 diff --git a/v5/KTO/gen-output/KTO_5k/dataset_info.json b/v5/KTO/gen-output/KTO_5k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/KTO/gen-output/KTO_5k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/KTO/gen-output/KTO_5k/state.json b/v5/KTO/gen-output/KTO_5k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..7e9c5dfc4fba6cae188ee450e32d487c941e06d8 --- /dev/null +++ b/v5/KTO/gen-output/KTO_5k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "e3aea3936c578a9e", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/MORPO_10k/chat_template.jinja b/v5/ORPO/ORPO_10k/MORPO_10k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_10k/MORPO_10k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_10k/MORPO_10k/config.json b/v5/ORPO/ORPO_10k/MORPO_10k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ec18bc1ae18922052a57480e28401f3b9c6b84 --- /dev/null +++ b/v5/ORPO/ORPO_10k/MORPO_10k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/ORPO/ORPO_10k/MORPO_10k/generation_config.json b/v5/ORPO/ORPO_10k/MORPO_10k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..898a2e748a41a4b7a931b754f8abbdac02039fae --- /dev/null +++ b/v5/ORPO/ORPO_10k/MORPO_10k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.2.0" +} diff --git a/v5/ORPO/ORPO_10k/MORPO_10k/model.safetensors b/v5/ORPO/ORPO_10k/MORPO_10k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..dfaa7caabc7d362b11289364fb8ac38b1a7b7b15 --- /dev/null +++ b/v5/ORPO/ORPO_10k/MORPO_10k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47ade04fa3489fb427c18e2f7b4e3067ba8b5c0c1deb0a638ecdd361653a69f6 +size 2471645464 diff --git a/v5/ORPO/ORPO_10k/MORPO_10k/tokenizer.json b/v5/ORPO/ORPO_10k/MORPO_10k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/MORPO_10k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_10k/MORPO_10k/tokenizer_config.json b/v5/ORPO/ORPO_10k/MORPO_10k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_10k/MORPO_10k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_10k/ORPO_10k/README.md b/v5/ORPO/ORPO_10k/ORPO_10k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_10k/ORPO_10k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/ORPO_10k/adapter_config.json b/v5/ORPO/ORPO_10k/ORPO_10k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28550dad7e9abe3072f5d3e51e504f7143e8a5f2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/ORPO_10k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "down_proj", + "v_proj", + "up_proj", + "gate_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/ORPO_10k/adapter_model.safetensors b/v5/ORPO/ORPO_10k/ORPO_10k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2266599e2fb3e628639f3fdfc8f095a7ba17f6d1 --- /dev/null +++ b/v5/ORPO/ORPO_10k/ORPO_10k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07697849ccda29cd9e5be02bdfa0c906b06cf423be53b1c4675b2db3aa281e74 +size 180385008 diff --git a/v5/ORPO/ORPO_10k/lora/README.md b/v5/ORPO/ORPO_10k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1b84a29c8fdbb2614e0828436e4ff35f4c04e91c --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/README.md @@ -0,0 +1,66 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- trl +- orpo +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/fccg208q) + + +This model was trained with ORPO, a method introduced in [ORPO: Monolithic Preference Optimization without Reference Model](https://huggingface.co/papers/2403.07691). + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 5.2.0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite ORPO as: + +```bibtex +@article{hong2024orpo, + title = {{ORPO: Monolithic Preference Optimization without Reference Model}}, + author = {Jiwoo Hong and Noah Lee and James Thorne}, + year = 2024, + eprint = {arXiv:2403.07691} +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/README.md b/v5/ORPO/ORPO_10k/lora/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/adapter_config.json b/v5/ORPO/ORPO_10k/lora/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28550dad7e9abe3072f5d3e51e504f7143e8a5f2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "down_proj", + "v_proj", + "up_proj", + "gate_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/adapter_model.safetensors b/v5/ORPO/ORPO_10k/lora/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2266599e2fb3e628639f3fdfc8f095a7ba17f6d1 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07697849ccda29cd9e5be02bdfa0c906b06cf423be53b1c4675b2db3aa281e74 +size 180385008 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/chat_template.jinja b/v5/ORPO/ORPO_10k/lora/checkpoint-100/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/optimizer.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d54365963a2eb04216898603e3d74e6015cc0dc5 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d7429a3e947311273cfbc308fa44ed4bcdb27d991f62de08c2b9a9e3884ed7 +size 360902475 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/rng_state.pth b/v5/ORPO/ORPO_10k/lora/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/scaler.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-100/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..215c5d2069bd81cb35727ebca07a510ac59c9d94 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4393a84a3109995aa1202073b039b12062e3189ed89aa0b94ef0510ba843009 +size 1383 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/scheduler.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..658acf8a3135afcfcc2da1bdcf4d4cdc77f49eaf --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1688c06d47a884bf6f9f7ca5b0c04c17fe30d34ac38e756841b6ec602d5f5b8b +size 1465 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/tokenizer.json b/v5/ORPO/ORPO_10k/lora/checkpoint-100/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/tokenizer_config.json b/v5/ORPO/ORPO_10k/lora/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/trainer_state.json b/v5/ORPO/ORPO_10k/lora/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a5ad8c6e3914add312f2eabd979652eedccffeab --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/trainer_state.json @@ -0,0 +1,233 @@ +{ + "best_global_step": 100, + "best_metric": 0.550000011920929, + "best_model_checkpoint": "output/lora/checkpoint-100", + "epoch": 0.08, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8949731588363647, + "learning_rate": 2.88e-07, + "log_odds_chosen": -0.13458022475242615, + "log_odds_ratio": -0.8810430765151978, + "logits/chosen": 1.0847688913345337, + "logits/rejected": 1.0370358228683472, + "logps/chosen": -3.011305332183838, + "logps/rejected": -2.8771233558654785, + "loss": 3.5686809539794924, + "nll_loss": 3.480576992034912, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3011305630207062, + "rewards/margins": -0.013418207876384258, + "rewards/rejected": -0.28771233558654785, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 0.7958198189735413, + "learning_rate": 6.079999999999999e-07, + "log_odds_chosen": 0.10219261795282364, + "log_odds_ratio": -0.8167620897293091, + "logits/chosen": 1.068807601928711, + "logits/rejected": 1.0232031345367432, + "logps/chosen": -3.0397582054138184, + "logps/rejected": -3.1323461532592773, + "loss": 3.346195602416992, + "nll_loss": 3.264519453048706, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30397582054138184, + "rewards/margins": 0.009258817881345749, + "rewards/rejected": -0.3132346272468567, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 0.9128683805465698, + "learning_rate": 9.28e-07, + "log_odds_chosen": 0.12453228235244751, + "log_odds_ratio": -0.8030093312263489, + "logits/chosen": 1.2273896932601929, + "logits/rejected": 1.1259081363677979, + "logps/chosen": -2.7777199745178223, + "logps/rejected": -2.892120838165283, + "loss": 3.2673892974853516, + "nll_loss": 3.1870882511138916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2777720093727112, + "rewards/margins": 0.011440068483352661, + "rewards/rejected": -0.28921204805374146, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 0.6320874691009521, + "learning_rate": 1.248e-06, + "log_odds_chosen": 0.029524624347686768, + "log_odds_ratio": -0.8554368019104004, + "logits/chosen": 1.1522005796432495, + "logits/rejected": 1.1450908184051514, + "logps/chosen": -3.080202341079712, + "logps/rejected": -3.107326030731201, + "loss": 3.296055221557617, + "nll_loss": 3.2105109691619873, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.3080202639102936, + "rewards/margins": 0.0027123407926410437, + "rewards/rejected": -0.31073254346847534, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.645889163017273, + "learning_rate": 1.568e-06, + "log_odds_chosen": 0.014121174812316895, + "log_odds_ratio": -0.9388389587402344, + "logits/chosen": 1.0003323554992676, + "logits/rejected": 0.9480252265930176, + "logps/chosen": -3.0624964237213135, + "logps/rejected": -3.0750041007995605, + "loss": 3.3580265045166016, + "nll_loss": 3.2641425132751465, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3062496483325958, + "rewards/margins": 0.001250785542652011, + "rewards/rejected": -0.307500422000885, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 0.6963360905647278, + "learning_rate": 1.8879999999999998e-06, + "log_odds_chosen": 0.08005297183990479, + "log_odds_ratio": -0.7886329293251038, + "logits/chosen": 1.0161622762680054, + "logits/rejected": 1.0301268100738525, + "logps/chosen": -2.7399508953094482, + "logps/rejected": -2.8282437324523926, + "loss": 3.1988595962524413, + "nll_loss": 3.1199958324432373, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2739951014518738, + "rewards/margins": 0.008829282596707344, + "rewards/rejected": -0.28282439708709717, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 0.6785285472869873, + "learning_rate": 2.2080000000000003e-06, + "log_odds_chosen": 0.015721607953310013, + "log_odds_ratio": -0.8530643582344055, + "logits/chosen": 1.0702764987945557, + "logits/rejected": 1.093421459197998, + "logps/chosen": -2.7683191299438477, + "logps/rejected": -2.7858364582061768, + "loss": 3.0648569107055663, + "nll_loss": 2.97955060005188, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.2768319249153137, + "rewards/margins": 0.0017517365049570799, + "rewards/rejected": -0.2785836458206177, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 0.6778960227966309, + "learning_rate": 2.5279999999999998e-06, + "log_odds_chosen": -0.05771768093109131, + "log_odds_ratio": -0.9478788375854492, + "logits/chosen": 1.0575181245803833, + "logits/rejected": 1.058960199356079, + "logps/chosen": -3.0624351501464844, + "logps/rejected": -2.998605966567993, + "loss": 3.0089736938476563, + "nll_loss": 2.9141860008239746, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.30624353885650635, + "rewards/margins": -0.006382950581610203, + "rewards/rejected": -0.29986056685447693, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 0.6029064655303955, + "learning_rate": 2.8479999999999997e-06, + "log_odds_chosen": 0.03830999881029129, + "log_odds_ratio": -0.8156368136405945, + "logits/chosen": 1.0552040338516235, + "logits/rejected": 1.037821888923645, + "logps/chosen": -2.7874300479888916, + "logps/rejected": -2.83976674079895, + "loss": 2.9470733642578124, + "nll_loss": 2.865509510040283, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2787429690361023, + "rewards/margins": 0.005233690608292818, + "rewards/rejected": -0.28397664427757263, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 0.31492361426353455, + "learning_rate": 3.168e-06, + "log_odds_chosen": 0.041199591010808945, + "log_odds_ratio": -0.9007355570793152, + "logits/chosen": 1.0530426502227783, + "logits/rejected": 1.087805151939392, + "logps/chosen": -2.8392837047576904, + "logps/rejected": -2.8784425258636475, + "loss": 2.8420888900756838, + "nll_loss": 2.7520148754119873, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.28392836451530457, + "rewards/margins": 0.003915875218808651, + "rewards/rejected": -0.2878442704677582, + "step": 100 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.09687195718288422, + "eval_log_odds_ratio": -0.7778716087341309, + "eval_logits/chosen": 1.1028081178665161, + "eval_logits/rejected": 1.0724682807922363, + "eval_logps/chosen": -2.6042141914367676, + "eval_logps/rejected": -2.7047533988952637, + "eval_loss": 2.7646737098693848, + "eval_nll_loss": 2.686886787414551, + "eval_rewards/accuracies": 0.550000011920929, + "eval_rewards/chosen": -0.2604214549064636, + "eval_rewards/margins": 0.010053902864456177, + "eval_rewards/rejected": -0.27047526836395264, + "eval_runtime": 53.3988, + "eval_samples_per_second": 9.364, + "eval_steps_per_second": 4.682, + "step": 100 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-100/training_args.bin b/v5/ORPO/ORPO_10k/lora/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b47bcdc3c44dda631da7f475aa87c7bb2c782bec --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1615b59ec50a8a8f298af41b0a88c5959219b5898139e8f88d7ad75a43a2c3b +size 5521 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/README.md b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/adapter_config.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28550dad7e9abe3072f5d3e51e504f7143e8a5f2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "down_proj", + "v_proj", + "up_proj", + "gate_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/adapter_model.safetensors b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..07535ca8a20aab171dde5b49c54ac120db110621 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b286db9b003bd908c31909fb8e7b666907e575d4060c379876f01a3707c29a8b +size 180385008 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/chat_template.jinja b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/optimizer.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c60af2f22e1366e748cd8879ab135d1c01c08ad0 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c97d55f7ff4dd5b1095893daf0d812baca1f998ca3ffb3a929b3e599e2dc515 +size 360902475 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/rng_state.pth b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1062af8f13c012194172bd08b0cf5acef1661de7 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe17a5fe328cdb084ec10afd49d2cb7bb9035b6736206385c2c19bf9f66c41 +size 14645 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/scaler.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c83c039e83183ab3a0678557983f51465fbdff40 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7da7c5085795b13d2bf0030671cbddb9f62ae43221bf1424a3830d4cf8c19012 +size 1383 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/scheduler.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0590ae07d3cfe72afec956a18d09cb7af30c381 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97d90edf49b3a951dec98d770bdfbd57b47097b22a5268022a3d388298283eac +size 1465 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/tokenizer.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/tokenizer_config.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/trainer_state.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7d081de5cceada0289b02c7709bc57903dc00a39 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/trainer_state.json @@ -0,0 +1,4810 @@ +{ + "best_global_step": 100, + "best_metric": 0.550000011920929, + "best_model_checkpoint": "output/lora/checkpoint-100", + "epoch": 1.92, + "eval_steps": 100, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8949731588363647, + "learning_rate": 2.88e-07, + "log_odds_chosen": -0.13458022475242615, + "log_odds_ratio": -0.8810430765151978, + "logits/chosen": 1.0847688913345337, + "logits/rejected": 1.0370358228683472, + "logps/chosen": -3.011305332183838, + "logps/rejected": -2.8771233558654785, + "loss": 3.5686809539794924, + "nll_loss": 3.480576992034912, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3011305630207062, + "rewards/margins": -0.013418207876384258, + "rewards/rejected": -0.28771233558654785, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 0.7958198189735413, + "learning_rate": 6.079999999999999e-07, + "log_odds_chosen": 0.10219261795282364, + "log_odds_ratio": -0.8167620897293091, + "logits/chosen": 1.068807601928711, + "logits/rejected": 1.0232031345367432, + "logps/chosen": -3.0397582054138184, + "logps/rejected": -3.1323461532592773, + "loss": 3.346195602416992, + "nll_loss": 3.264519453048706, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30397582054138184, + "rewards/margins": 0.009258817881345749, + "rewards/rejected": -0.3132346272468567, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 0.9128683805465698, + "learning_rate": 9.28e-07, + "log_odds_chosen": 0.12453228235244751, + "log_odds_ratio": -0.8030093312263489, + "logits/chosen": 1.2273896932601929, + "logits/rejected": 1.1259081363677979, + "logps/chosen": -2.7777199745178223, + "logps/rejected": -2.892120838165283, + "loss": 3.2673892974853516, + "nll_loss": 3.1870882511138916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2777720093727112, + "rewards/margins": 0.011440068483352661, + "rewards/rejected": -0.28921204805374146, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 0.6320874691009521, + "learning_rate": 1.248e-06, + "log_odds_chosen": 0.029524624347686768, + "log_odds_ratio": -0.8554368019104004, + "logits/chosen": 1.1522005796432495, + "logits/rejected": 1.1450908184051514, + "logps/chosen": -3.080202341079712, + "logps/rejected": -3.107326030731201, + "loss": 3.296055221557617, + "nll_loss": 3.2105109691619873, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.3080202639102936, + "rewards/margins": 0.0027123407926410437, + "rewards/rejected": -0.31073254346847534, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.645889163017273, + "learning_rate": 1.568e-06, + "log_odds_chosen": 0.014121174812316895, + "log_odds_ratio": -0.9388389587402344, + "logits/chosen": 1.0003323554992676, + "logits/rejected": 0.9480252265930176, + "logps/chosen": -3.0624964237213135, + "logps/rejected": -3.0750041007995605, + "loss": 3.3580265045166016, + "nll_loss": 3.2641425132751465, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3062496483325958, + "rewards/margins": 0.001250785542652011, + "rewards/rejected": -0.307500422000885, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 0.6963360905647278, + "learning_rate": 1.8879999999999998e-06, + "log_odds_chosen": 0.08005297183990479, + "log_odds_ratio": -0.7886329293251038, + "logits/chosen": 1.0161622762680054, + "logits/rejected": 1.0301268100738525, + "logps/chosen": -2.7399508953094482, + "logps/rejected": -2.8282437324523926, + "loss": 3.1988595962524413, + "nll_loss": 3.1199958324432373, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2739951014518738, + "rewards/margins": 0.008829282596707344, + "rewards/rejected": -0.28282439708709717, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 0.6785285472869873, + "learning_rate": 2.2080000000000003e-06, + "log_odds_chosen": 0.015721607953310013, + "log_odds_ratio": -0.8530643582344055, + "logits/chosen": 1.0702764987945557, + "logits/rejected": 1.093421459197998, + "logps/chosen": -2.7683191299438477, + "logps/rejected": -2.7858364582061768, + "loss": 3.0648569107055663, + "nll_loss": 2.97955060005188, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.2768319249153137, + "rewards/margins": 0.0017517365049570799, + "rewards/rejected": -0.2785836458206177, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 0.6778960227966309, + "learning_rate": 2.5279999999999998e-06, + "log_odds_chosen": -0.05771768093109131, + "log_odds_ratio": -0.9478788375854492, + "logits/chosen": 1.0575181245803833, + "logits/rejected": 1.058960199356079, + "logps/chosen": -3.0624351501464844, + "logps/rejected": -2.998605966567993, + "loss": 3.0089736938476563, + "nll_loss": 2.9141860008239746, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.30624353885650635, + "rewards/margins": -0.006382950581610203, + "rewards/rejected": -0.29986056685447693, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 0.6029064655303955, + "learning_rate": 2.8479999999999997e-06, + "log_odds_chosen": 0.03830999881029129, + "log_odds_ratio": -0.8156368136405945, + "logits/chosen": 1.0552040338516235, + "logits/rejected": 1.037821888923645, + "logps/chosen": -2.7874300479888916, + "logps/rejected": -2.83976674079895, + "loss": 2.9470733642578124, + "nll_loss": 2.865509510040283, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2787429690361023, + "rewards/margins": 0.005233690608292818, + "rewards/rejected": -0.28397664427757263, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 0.31492361426353455, + "learning_rate": 3.168e-06, + "log_odds_chosen": 0.041199591010808945, + "log_odds_ratio": -0.9007355570793152, + "logits/chosen": 1.0530426502227783, + "logits/rejected": 1.087805151939392, + "logps/chosen": -2.8392837047576904, + "logps/rejected": -2.8784425258636475, + "loss": 2.8420888900756838, + "nll_loss": 2.7520148754119873, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.28392836451530457, + "rewards/margins": 0.003915875218808651, + "rewards/rejected": -0.2878442704677582, + "step": 100 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.09687195718288422, + "eval_log_odds_ratio": -0.7778716087341309, + "eval_logits/chosen": 1.1028081178665161, + "eval_logits/rejected": 1.0724682807922363, + "eval_logps/chosen": -2.6042141914367676, + "eval_logps/rejected": -2.7047533988952637, + "eval_loss": 2.7646737098693848, + "eval_nll_loss": 2.686886787414551, + "eval_rewards/accuracies": 0.550000011920929, + "eval_rewards/chosen": -0.2604214549064636, + "eval_rewards/margins": 0.010053902864456177, + "eval_rewards/rejected": -0.27047526836395264, + "eval_runtime": 53.3988, + "eval_samples_per_second": 9.364, + "eval_steps_per_second": 4.682, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 0.34352463483810425, + "learning_rate": 3.488e-06, + "log_odds_chosen": -0.03931659460067749, + "log_odds_ratio": -0.820832371711731, + "logits/chosen": 1.133837342262268, + "logits/rejected": 1.0994086265563965, + "logps/chosen": -2.6451282501220703, + "logps/rejected": -2.60078763961792, + "loss": 2.6902618408203125, + "nll_loss": 2.6081786155700684, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.264512836933136, + "rewards/margins": -0.004434076603502035, + "rewards/rejected": -0.2600787580013275, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 0.33482232689857483, + "learning_rate": 3.808e-06, + "log_odds_chosen": 0.06360156834125519, + "log_odds_ratio": -0.7800716757774353, + "logits/chosen": 1.1236859560012817, + "logits/rejected": 1.0961066484451294, + "logps/chosen": -2.4337990283966064, + "logps/rejected": -2.49751615524292, + "loss": 2.489605522155762, + "nll_loss": 2.4115982055664062, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.24337990581989288, + "rewards/margins": 0.006371702998876572, + "rewards/rejected": -0.24975161254405975, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 0.2967042028903961, + "learning_rate": 4.128e-06, + "log_odds_chosen": 0.09330085664987564, + "log_odds_ratio": -0.7330855131149292, + "logits/chosen": 1.014111876487732, + "logits/rejected": 0.9797853231430054, + "logps/chosen": -2.3611137866973877, + "logps/rejected": -2.441413164138794, + "loss": 2.3510934829711916, + "nll_loss": 2.277784824371338, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2361113727092743, + "rewards/margins": 0.008029930293560028, + "rewards/rejected": -0.24414131045341492, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 0.24006928503513336, + "learning_rate": 4.4480000000000004e-06, + "log_odds_chosen": -0.04334372282028198, + "log_odds_ratio": -0.7658584713935852, + "logits/chosen": 1.1320513486862183, + "logits/rejected": 1.0817039012908936, + "logps/chosen": -2.3058180809020996, + "logps/rejected": -2.2727291584014893, + "loss": 2.2678853988647463, + "nll_loss": 2.1912999153137207, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.23058180510997772, + "rewards/margins": -0.003308868035674095, + "rewards/rejected": -0.22727294266223907, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 0.2205500453710556, + "learning_rate": 4.768e-06, + "log_odds_chosen": 0.08752859383821487, + "log_odds_ratio": -0.6995586156845093, + "logits/chosen": 0.9395301938056946, + "logits/rejected": 0.8814845085144043, + "logps/chosen": -2.261115074157715, + "logps/rejected": -2.3379273414611816, + "loss": 2.204380226135254, + "nll_loss": 2.1344239711761475, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2261115312576294, + "rewards/margins": 0.007681201212108135, + "rewards/rejected": -0.23379270732402802, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 0.19835017621517181, + "learning_rate": 5.088e-06, + "log_odds_chosen": 0.3893406391143799, + "log_odds_ratio": -0.6617119908332825, + "logits/chosen": 1.1273963451385498, + "logits/rejected": 0.9621152877807617, + "logps/chosen": -2.1851847171783447, + "logps/rejected": -2.5612263679504395, + "loss": 2.104723358154297, + "nll_loss": 2.0385525226593018, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21851846575737, + "rewards/margins": 0.0376041904091835, + "rewards/rejected": -0.2561226487159729, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 0.20731030404567719, + "learning_rate": 5.408e-06, + "log_odds_chosen": 0.12455078214406967, + "log_odds_ratio": -0.7328735589981079, + "logits/chosen": 1.046876311302185, + "logits/rejected": 1.0955148935317993, + "logps/chosen": -2.223609447479248, + "logps/rejected": -2.3480441570281982, + "loss": 2.0970306396484375, + "nll_loss": 2.0237433910369873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22236093878746033, + "rewards/margins": 0.01244346983730793, + "rewards/rejected": -0.2348044365644455, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 0.28243404626846313, + "learning_rate": 5.727999999999999e-06, + "log_odds_chosen": 0.09875164180994034, + "log_odds_ratio": -0.7095295190811157, + "logits/chosen": 1.2436240911483765, + "logits/rejected": 1.275618553161621, + "logps/chosen": -2.1978044509887695, + "logps/rejected": -2.272761821746826, + "loss": 2.0345205307006835, + "nll_loss": 1.9635677337646484, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.21978041529655457, + "rewards/margins": 0.0074957506731152534, + "rewards/rejected": -0.22727617621421814, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 0.18770183622837067, + "learning_rate": 6.0479999999999995e-06, + "log_odds_chosen": 0.2480204850435257, + "log_odds_ratio": -0.7025401592254639, + "logits/chosen": 1.1238905191421509, + "logits/rejected": 1.1091030836105347, + "logps/chosen": -2.0656919479370117, + "logps/rejected": -2.3097949028015137, + "loss": 1.9743515014648438, + "nll_loss": 1.904097318649292, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20656922459602356, + "rewards/margins": 0.024410294368863106, + "rewards/rejected": -0.23097951710224152, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 0.18696419894695282, + "learning_rate": 6.368e-06, + "log_odds_chosen": 0.08414062857627869, + "log_odds_ratio": -0.7133861184120178, + "logits/chosen": 1.1996185779571533, + "logits/rejected": 1.0700123310089111, + "logps/chosen": -2.1116840839385986, + "logps/rejected": -2.192984104156494, + "loss": 1.9692060470581054, + "nll_loss": 1.897867202758789, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.21116837859153748, + "rewards/margins": 0.008130033500492573, + "rewards/rejected": -0.21929840743541718, + "step": 200 + }, + { + "epoch": 0.16, + "eval_log_odds_chosen": 0.03449448570609093, + "eval_log_odds_ratio": -0.7650534510612488, + "eval_logits/chosen": 1.1852835416793823, + "eval_logits/rejected": 1.1406316757202148, + "eval_logps/chosen": -2.1768786907196045, + "eval_logps/rejected": -2.2172398567199707, + "eval_loss": 2.0106678009033203, + "eval_nll_loss": 1.9341623783111572, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.21768784523010254, + "eval_rewards/margins": 0.00403614854440093, + "eval_rewards/rejected": -0.2217240035533905, + "eval_runtime": 53.0621, + "eval_samples_per_second": 9.423, + "eval_steps_per_second": 4.711, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 0.17617128789424896, + "learning_rate": 6.687999999999999e-06, + "log_odds_chosen": 0.05374947935342789, + "log_odds_ratio": -0.716423511505127, + "logits/chosen": 1.1509783267974854, + "logits/rejected": 1.1752405166625977, + "logps/chosen": -2.1427035331726074, + "logps/rejected": -2.19557523727417, + "loss": 1.9653192520141602, + "nll_loss": 1.893676996231079, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21427035331726074, + "rewards/margins": 0.0052871680818498135, + "rewards/rejected": -0.2195574939250946, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 0.17626060545444489, + "learning_rate": 7.008e-06, + "log_odds_chosen": 0.1770932376384735, + "log_odds_ratio": -0.7325607538223267, + "logits/chosen": 1.3423850536346436, + "logits/rejected": 1.2371774911880493, + "logps/chosen": -2.0640311241149902, + "logps/rejected": -2.250347375869751, + "loss": 1.9260797500610352, + "nll_loss": 1.8528236150741577, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20640310645103455, + "rewards/margins": 0.018631622195243835, + "rewards/rejected": -0.2250347137451172, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 0.1989600658416748, + "learning_rate": 7.328e-06, + "log_odds_chosen": 0.18904821574687958, + "log_odds_ratio": -0.6782074570655823, + "logits/chosen": 1.280461311340332, + "logits/rejected": 1.1252264976501465, + "logps/chosen": -2.0997283458709717, + "logps/rejected": -2.2417054176330566, + "loss": 1.870577049255371, + "nll_loss": 1.8027557134628296, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20997285842895508, + "rewards/margins": 0.014197695069015026, + "rewards/rejected": -0.2241705358028412, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 0.13494263589382172, + "learning_rate": 7.647999999999999e-06, + "log_odds_chosen": -0.036032918840646744, + "log_odds_ratio": -0.7879734039306641, + "logits/chosen": 1.448754072189331, + "logits/rejected": 1.3340137004852295, + "logps/chosen": -2.011657238006592, + "logps/rejected": -1.9867651462554932, + "loss": 1.9344427108764648, + "nll_loss": 1.8556454181671143, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20116575062274933, + "rewards/margins": -0.0024892189539968967, + "rewards/rejected": -0.19867651164531708, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 0.22943158447742462, + "learning_rate": 7.967999999999999e-06, + "log_odds_chosen": -0.0008321896311827004, + "log_odds_ratio": -0.7597036957740784, + "logits/chosen": 1.290684461593628, + "logits/rejected": 1.2675565481185913, + "logps/chosen": -2.0894367694854736, + "logps/rejected": -2.0819172859191895, + "loss": 1.90467529296875, + "nll_loss": 1.828704833984375, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2089436799287796, + "rewards/margins": -0.0007519676582887769, + "rewards/rejected": -0.20819172263145447, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 0.14598031342029572, + "learning_rate": 7.967999999999999e-06, + "log_odds_chosen": -0.06827996671199799, + "log_odds_ratio": -0.8000016212463379, + "logits/chosen": 1.2406280040740967, + "logits/rejected": 1.3126946687698364, + "logps/chosen": -2.0982561111450195, + "logps/rejected": -2.0350162982940674, + "loss": 1.8742866516113281, + "nll_loss": 1.7942863702774048, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.20982563495635986, + "rewards/margins": -0.006323990412056446, + "rewards/rejected": -0.2035016119480133, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 0.2379223257303238, + "learning_rate": 7.932444444444444e-06, + "log_odds_chosen": -0.09409850090742111, + "log_odds_ratio": -0.8200883865356445, + "logits/chosen": 1.3649566173553467, + "logits/rejected": 1.3874846696853638, + "logps/chosen": -2.1385960578918457, + "logps/rejected": -2.0628838539123535, + "loss": 1.8866275787353515, + "nll_loss": 1.80461847782135, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.21385960280895233, + "rewards/margins": -0.00757119944319129, + "rewards/rejected": -0.2062883824110031, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 0.22652894258499146, + "learning_rate": 7.896888888888888e-06, + "log_odds_chosen": 0.3813454508781433, + "log_odds_ratio": -0.6462765336036682, + "logits/chosen": 1.2896631956100464, + "logits/rejected": 1.1385802030563354, + "logps/chosen": -2.1021246910095215, + "logps/rejected": -2.4392008781433105, + "loss": 1.9009965896606444, + "nll_loss": 1.8363691568374634, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.21021243929862976, + "rewards/margins": 0.0337076410651207, + "rewards/rejected": -0.24392008781433105, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 0.19373102486133575, + "learning_rate": 7.861333333333334e-06, + "log_odds_chosen": 0.059820324182510376, + "log_odds_ratio": -0.7212327122688293, + "logits/chosen": 1.215315580368042, + "logits/rejected": 1.2433207035064697, + "logps/chosen": -2.052096366882324, + "logps/rejected": -2.108736515045166, + "loss": 1.8485704421997071, + "nll_loss": 1.7764475345611572, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2052096426486969, + "rewards/margins": 0.0056640272960066795, + "rewards/rejected": -0.21087364852428436, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 0.12395530939102173, + "learning_rate": 7.825777777777778e-06, + "log_odds_chosen": 0.12978777289390564, + "log_odds_ratio": -0.7189403772354126, + "logits/chosen": 1.3819777965545654, + "logits/rejected": 1.4145673513412476, + "logps/chosen": -2.0398616790771484, + "logps/rejected": -2.170393228530884, + "loss": 1.814511489868164, + "nll_loss": 1.7426178455352783, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20398616790771484, + "rewards/margins": 0.013053147122263908, + "rewards/rejected": -0.2170393466949463, + "step": 300 + }, + { + "epoch": 0.24, + "eval_log_odds_chosen": 0.03967902436852455, + "eval_log_odds_ratio": -0.76247638463974, + "eval_logits/chosen": 1.3745256662368774, + "eval_logits/rejected": 1.3320753574371338, + "eval_logps/chosen": -2.0912294387817383, + "eval_logps/rejected": -2.1344449520111084, + "eval_loss": 1.911361575126648, + "eval_nll_loss": 1.8351138830184937, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.20912295579910278, + "eval_rewards/margins": 0.004321571905165911, + "eval_rewards/rejected": -0.21344450116157532, + "eval_runtime": 53.0491, + "eval_samples_per_second": 9.425, + "eval_steps_per_second": 4.713, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 0.20187616348266602, + "learning_rate": 7.790222222222222e-06, + "log_odds_chosen": -0.023714840412139893, + "log_odds_ratio": -0.7719189524650574, + "logits/chosen": 1.5215256214141846, + "logits/rejected": 1.5573723316192627, + "logps/chosen": -2.0848724842071533, + "logps/rejected": -2.065119981765747, + "loss": 1.8499135971069336, + "nll_loss": 1.772721529006958, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20848727226257324, + "rewards/margins": -0.0019752695225179195, + "rewards/rejected": -0.2065119743347168, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 0.19256918132305145, + "learning_rate": 7.754666666666667e-06, + "log_odds_chosen": 0.16798502206802368, + "log_odds_ratio": -0.6922372579574585, + "logits/chosen": 1.3285846710205078, + "logits/rejected": 1.3583507537841797, + "logps/chosen": -2.07856822013855, + "logps/rejected": -2.1995127201080322, + "loss": 1.8253305435180665, + "nll_loss": 1.7561067342758179, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20785681903362274, + "rewards/margins": 0.012094443663954735, + "rewards/rejected": -0.21995127201080322, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 0.16783681511878967, + "learning_rate": 7.719111111111111e-06, + "log_odds_chosen": 0.16193893551826477, + "log_odds_ratio": -0.6671011447906494, + "logits/chosen": 1.4124246835708618, + "logits/rejected": 1.460701823234558, + "logps/chosen": -2.0305984020233154, + "logps/rejected": -2.159663677215576, + "loss": 1.8951801300048827, + "nll_loss": 1.82846999168396, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2030598372220993, + "rewards/margins": 0.012906527146697044, + "rewards/rejected": -0.2159663736820221, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 0.17299160361289978, + "learning_rate": 7.683555555555555e-06, + "log_odds_chosen": 0.09661159664392471, + "log_odds_ratio": -0.7051470875740051, + "logits/chosen": 1.4825594425201416, + "logits/rejected": 1.3175979852676392, + "logps/chosen": -1.8920962810516357, + "logps/rejected": -1.9876978397369385, + "loss": 1.7514026641845704, + "nll_loss": 1.6808878183364868, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18920964002609253, + "rewards/margins": 0.009560128673911095, + "rewards/rejected": -0.19876977801322937, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 0.1780914068222046, + "learning_rate": 7.647999999999999e-06, + "log_odds_chosen": 0.05790011212229729, + "log_odds_ratio": -0.7372554540634155, + "logits/chosen": 1.3375146389007568, + "logits/rejected": 1.4315671920776367, + "logps/chosen": -2.0841262340545654, + "logps/rejected": -2.132392644882202, + "loss": 1.8903158187866211, + "nll_loss": 1.8165900707244873, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.20841261744499207, + "rewards/margins": 0.004826628603041172, + "rewards/rejected": -0.21323923766613007, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 0.1749999225139618, + "learning_rate": 7.612444444444444e-06, + "log_odds_chosen": 0.12323548644781113, + "log_odds_ratio": -0.689288318157196, + "logits/chosen": 1.4562640190124512, + "logits/rejected": 1.3619139194488525, + "logps/chosen": -1.990504264831543, + "logps/rejected": -2.096740484237671, + "loss": 1.829003143310547, + "nll_loss": 1.7600743770599365, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1990504264831543, + "rewards/margins": 0.010623643174767494, + "rewards/rejected": -0.20967407524585724, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 0.17985334992408752, + "learning_rate": 7.576888888888889e-06, + "log_odds_chosen": 0.09662418812513351, + "log_odds_ratio": -0.70073401927948, + "logits/chosen": 1.5395666360855103, + "logits/rejected": 1.412684679031372, + "logps/chosen": -1.976719856262207, + "logps/rejected": -2.0691561698913574, + "loss": 1.8579627990722656, + "nll_loss": 1.7878894805908203, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19767197966575623, + "rewards/margins": 0.009243631735444069, + "rewards/rejected": -0.20691561698913574, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 0.2149394452571869, + "learning_rate": 7.541333333333333e-06, + "log_odds_chosen": 0.1822991818189621, + "log_odds_ratio": -0.6692709922790527, + "logits/chosen": 1.3819072246551514, + "logits/rejected": 1.4091228246688843, + "logps/chosen": -1.9516513347625732, + "logps/rejected": -2.107210159301758, + "loss": 1.8095186233520508, + "nll_loss": 1.7425915002822876, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19516517221927643, + "rewards/margins": 0.015555836260318756, + "rewards/rejected": -0.2107209861278534, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 0.20533578097820282, + "learning_rate": 7.505777777777777e-06, + "log_odds_chosen": 0.13032521307468414, + "log_odds_ratio": -0.7293068170547485, + "logits/chosen": 1.4975590705871582, + "logits/rejected": 1.3673573732376099, + "logps/chosen": -1.9267303943634033, + "logps/rejected": -2.037688970565796, + "loss": 1.7859189987182618, + "nll_loss": 1.7129881381988525, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19267304241657257, + "rewards/margins": 0.011095861904323101, + "rewards/rejected": -0.20376892387866974, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 0.18664704263210297, + "learning_rate": 7.470222222222222e-06, + "log_odds_chosen": 0.10940120369195938, + "log_odds_ratio": -0.7268368005752563, + "logits/chosen": 1.4196456670761108, + "logits/rejected": 1.3341939449310303, + "logps/chosen": -1.920606017112732, + "logps/rejected": -2.0050148963928223, + "loss": 1.799521255493164, + "nll_loss": 1.7268375158309937, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19206061959266663, + "rewards/margins": 0.008440867997705936, + "rewards/rejected": -0.20050148665905, + "step": 400 + }, + { + "epoch": 0.32, + "eval_log_odds_chosen": 0.033501941710710526, + "eval_log_odds_ratio": -0.7624432444572449, + "eval_logits/chosen": 1.455781102180481, + "eval_logits/rejected": 1.41294264793396, + "eval_logps/chosen": -2.0634007453918457, + "eval_logps/rejected": -2.0990681648254395, + "eval_loss": 1.8849780559539795, + "eval_nll_loss": 1.8087337017059326, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": -0.20634005963802338, + "eval_rewards/margins": 0.0035667610354721546, + "eval_rewards/rejected": -0.20990681648254395, + "eval_runtime": 53.0804, + "eval_samples_per_second": 9.42, + "eval_steps_per_second": 4.71, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 0.2279583215713501, + "learning_rate": 7.434666666666667e-06, + "log_odds_chosen": 0.24681270122528076, + "log_odds_ratio": -0.6675797700881958, + "logits/chosen": 1.3346173763275146, + "logits/rejected": 1.3772237300872803, + "logps/chosen": -1.9703868627548218, + "logps/rejected": -2.1775240898132324, + "loss": 1.819239616394043, + "nll_loss": 1.7524816989898682, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1970386803150177, + "rewards/margins": 0.02071371115744114, + "rewards/rejected": -0.21775241196155548, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 0.18673349916934967, + "learning_rate": 7.399111111111111e-06, + "log_odds_chosen": 0.0723341852426529, + "log_odds_ratio": -0.7402567863464355, + "logits/chosen": 1.352061152458191, + "logits/rejected": 1.3581398725509644, + "logps/chosen": -1.9535837173461914, + "logps/rejected": -2.0189168453216553, + "loss": 1.790372085571289, + "nll_loss": 1.7163463830947876, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19535836577415466, + "rewards/margins": 0.0065332986414432526, + "rewards/rejected": -0.20189166069030762, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 0.13824671506881714, + "learning_rate": 7.3635555555555544e-06, + "log_odds_chosen": 0.07097329199314117, + "log_odds_ratio": -0.723812460899353, + "logits/chosen": 1.500058889389038, + "logits/rejected": 1.4235128164291382, + "logps/chosen": -2.0450332164764404, + "logps/rejected": -2.1189401149749756, + "loss": 1.8183156967163085, + "nll_loss": 1.7459347248077393, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20450334250926971, + "rewards/margins": 0.007390675134956837, + "rewards/rejected": -0.21189400553703308, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 0.19959300756454468, + "learning_rate": 7.328e-06, + "log_odds_chosen": -0.05561716482043266, + "log_odds_ratio": -0.8284331560134888, + "logits/chosen": 1.319535493850708, + "logits/rejected": 1.291486144065857, + "logps/chosen": -2.0064926147460938, + "logps/rejected": -1.936273217201233, + "loss": 1.8077302932739259, + "nll_loss": 1.7248871326446533, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.20064929127693176, + "rewards/margins": -0.0070219277404248714, + "rewards/rejected": -0.19362732768058777, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 0.11968322098255157, + "learning_rate": 7.292444444444444e-06, + "log_odds_chosen": 0.15821342170238495, + "log_odds_ratio": -0.7299971580505371, + "logits/chosen": 1.4834121465682983, + "logits/rejected": 1.437281847000122, + "logps/chosen": -1.9749513864517212, + "logps/rejected": -2.1457695960998535, + "loss": 1.8616512298583985, + "nll_loss": 1.788651704788208, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19749514758586884, + "rewards/margins": 0.017081793397665024, + "rewards/rejected": -0.21457692980766296, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 0.2767660915851593, + "learning_rate": 7.2568888888888885e-06, + "log_odds_chosen": 0.05323999002575874, + "log_odds_ratio": -0.7374966740608215, + "logits/chosen": 1.5227715969085693, + "logits/rejected": 1.5175096988677979, + "logps/chosen": -1.996843695640564, + "logps/rejected": -2.0283195972442627, + "loss": 1.8424455642700195, + "nll_loss": 1.7686958312988281, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19968439638614655, + "rewards/margins": 0.003147574607282877, + "rewards/rejected": -0.20283198356628418, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 0.2534579038619995, + "learning_rate": 7.221333333333332e-06, + "log_odds_chosen": -0.01329396665096283, + "log_odds_ratio": -0.7843751907348633, + "logits/chosen": 1.5107393264770508, + "logits/rejected": 1.4327045679092407, + "logps/chosen": -2.023428440093994, + "logps/rejected": -2.012712001800537, + "loss": 1.762538528442383, + "nll_loss": 1.684100866317749, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20234286785125732, + "rewards/margins": -0.001071644015610218, + "rewards/rejected": -0.20127122104167938, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 0.22043545544147491, + "learning_rate": 7.185777777777778e-06, + "log_odds_chosen": 0.1587541103363037, + "log_odds_ratio": -0.709527850151062, + "logits/chosen": 1.4747645854949951, + "logits/rejected": 1.4961092472076416, + "logps/chosen": -1.9713242053985596, + "logps/rejected": -2.11765193939209, + "loss": 1.8060014724731446, + "nll_loss": 1.7350486516952515, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1971324235200882, + "rewards/margins": 0.014632781967520714, + "rewards/rejected": -0.21176521480083466, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 0.2087666392326355, + "learning_rate": 7.150222222222222e-06, + "log_odds_chosen": 0.26932230591773987, + "log_odds_ratio": -0.6544117331504822, + "logits/chosen": 1.504432201385498, + "logits/rejected": 1.5163673162460327, + "logps/chosen": -1.8658252954483032, + "logps/rejected": -2.088016986846924, + "loss": 1.816385269165039, + "nll_loss": 1.7509441375732422, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1865825355052948, + "rewards/margins": 0.022219162434339523, + "rewards/rejected": -0.20880170166492462, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 0.14658813178539276, + "learning_rate": 7.1146666666666664e-06, + "log_odds_chosen": -0.0933292880654335, + "log_odds_ratio": -0.8082043528556824, + "logits/chosen": 1.5245555639266968, + "logits/rejected": 1.505550742149353, + "logps/chosen": -2.04571533203125, + "logps/rejected": -1.9844484329223633, + "loss": 1.8789659500122071, + "nll_loss": 1.7981455326080322, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.20457151532173157, + "rewards/margins": -0.006126692984253168, + "rewards/rejected": -0.19844482839107513, + "step": 500 + }, + { + "epoch": 0.4, + "eval_log_odds_chosen": 0.02602977305650711, + "eval_log_odds_ratio": -0.7631290555000305, + "eval_logits/chosen": 1.5728307962417603, + "eval_logits/rejected": 1.5338318347930908, + "eval_logps/chosen": -2.0366148948669434, + "eval_logps/rejected": -2.065042018890381, + "eval_loss": 1.8675161600112915, + "eval_nll_loss": 1.7912031412124634, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.20366153120994568, + "eval_rewards/margins": 0.002842681249603629, + "eval_rewards/rejected": -0.20650418102741241, + "eval_runtime": 53.0222, + "eval_samples_per_second": 9.43, + "eval_steps_per_second": 4.715, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 0.16157585382461548, + "learning_rate": 7.07911111111111e-06, + "log_odds_chosen": 0.19593098759651184, + "log_odds_ratio": -0.6945115923881531, + "logits/chosen": 1.6804075241088867, + "logits/rejected": 1.6551501750946045, + "logps/chosen": -1.8801660537719727, + "logps/rejected": -2.044741153717041, + "loss": 1.720193862915039, + "nll_loss": 1.650742769241333, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1880166232585907, + "rewards/margins": 0.016457516700029373, + "rewards/rejected": -0.20447412133216858, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 0.21952685713768005, + "learning_rate": 7.043555555555556e-06, + "log_odds_chosen": 0.11960093677043915, + "log_odds_ratio": -0.6947656869888306, + "logits/chosen": 1.6775137186050415, + "logits/rejected": 1.6482852697372437, + "logps/chosen": -1.9966636896133423, + "logps/rejected": -2.1046407222747803, + "loss": 1.7815994262695312, + "nll_loss": 1.7121226787567139, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19966639578342438, + "rewards/margins": 0.010797705501317978, + "rewards/rejected": -0.21046409010887146, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 0.23222282528877258, + "learning_rate": 7.008e-06, + "log_odds_chosen": -0.033172450959682465, + "log_odds_ratio": -0.8052657842636108, + "logits/chosen": 1.6025762557983398, + "logits/rejected": 1.6583106517791748, + "logps/chosen": -2.0893313884735107, + "logps/rejected": -2.0725255012512207, + "loss": 1.803743553161621, + "nll_loss": 1.7232167720794678, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20893315970897675, + "rewards/margins": -0.0016806062776595354, + "rewards/rejected": -0.20725254714488983, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 0.15105725824832916, + "learning_rate": 6.9724444444444435e-06, + "log_odds_chosen": 0.08743356913328171, + "log_odds_ratio": -0.701758861541748, + "logits/chosen": 1.590755820274353, + "logits/rejected": 1.4823769330978394, + "logps/chosen": -1.9318288564682007, + "logps/rejected": -2.0069031715393066, + "loss": 1.8346128463745117, + "nll_loss": 1.7644370794296265, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19318291544914246, + "rewards/margins": 0.0075074234046041965, + "rewards/rejected": -0.20069031417369843, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 0.16001789271831512, + "learning_rate": 6.936888888888889e-06, + "log_odds_chosen": 0.18001510202884674, + "log_odds_ratio": -0.7216249704360962, + "logits/chosen": 1.5185306072235107, + "logits/rejected": 1.5836330652236938, + "logps/chosen": -1.9629827737808228, + "logps/rejected": -2.099684476852417, + "loss": 1.7317916870117187, + "nll_loss": 1.6596291065216064, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19629831612110138, + "rewards/margins": 0.013670151121914387, + "rewards/rejected": -0.2099684774875641, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 0.2220832109451294, + "learning_rate": 6.901333333333333e-06, + "log_odds_chosen": -0.046985138207674026, + "log_odds_ratio": -0.7678507566452026, + "logits/chosen": 1.5970782041549683, + "logits/rejected": 1.5387827157974243, + "logps/chosen": -2.008047580718994, + "logps/rejected": -1.964775800704956, + "loss": 1.787788200378418, + "nll_loss": 1.711003065109253, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.20080475509166718, + "rewards/margins": -0.004327182658016682, + "rewards/rejected": -0.19647757709026337, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 0.1927526891231537, + "learning_rate": 6.8657777777777776e-06, + "log_odds_chosen": 0.08939726650714874, + "log_odds_ratio": -0.7270767688751221, + "logits/chosen": 1.51679265499115, + "logits/rejected": 1.4389561414718628, + "logps/chosen": -1.9325392246246338, + "logps/rejected": -1.9886747598648071, + "loss": 1.7688678741455077, + "nll_loss": 1.6961603164672852, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19325393438339233, + "rewards/margins": 0.0056135449558496475, + "rewards/rejected": -0.19886748492717743, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 0.2728411853313446, + "learning_rate": 6.830222222222221e-06, + "log_odds_chosen": 0.06151670217514038, + "log_odds_ratio": -0.7394816875457764, + "logits/chosen": 1.6327412128448486, + "logits/rejected": 1.7010612487792969, + "logps/chosen": -2.0379953384399414, + "logps/rejected": -2.0829684734344482, + "loss": 1.7836915969848632, + "nll_loss": 1.7097432613372803, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2037995308637619, + "rewards/margins": 0.0044972943142056465, + "rewards/rejected": -0.20829685032367706, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 0.13671930134296417, + "learning_rate": 6.794666666666667e-06, + "log_odds_chosen": 0.17000290751457214, + "log_odds_ratio": -0.6512231826782227, + "logits/chosen": 1.6878210306167603, + "logits/rejected": 1.6108119487762451, + "logps/chosen": -1.8216642141342163, + "logps/rejected": -1.9648876190185547, + "loss": 1.8262306213378907, + "nll_loss": 1.7611083984375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1821664273738861, + "rewards/margins": 0.01432233490049839, + "rewards/rejected": -0.19648873805999756, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 0.1815636157989502, + "learning_rate": 6.759111111111111e-06, + "log_odds_chosen": 0.08520406484603882, + "log_odds_ratio": -0.7176100611686707, + "logits/chosen": 1.4824538230895996, + "logits/rejected": 1.5591896772384644, + "logps/chosen": -1.965994119644165, + "logps/rejected": -2.0540990829467773, + "loss": 1.8055044174194337, + "nll_loss": 1.73374342918396, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19659940898418427, + "rewards/margins": 0.008810499683022499, + "rewards/rejected": -0.2054099142551422, + "step": 600 + }, + { + "epoch": 0.48, + "eval_log_odds_chosen": 0.030262866988778114, + "eval_log_odds_ratio": -0.7621479034423828, + "eval_logits/chosen": 1.5845204591751099, + "eval_logits/rejected": 1.5468944311141968, + "eval_logps/chosen": -2.024677038192749, + "eval_logps/rejected": -2.0564591884613037, + "eval_loss": 1.8558011054992676, + "eval_nll_loss": 1.7795861959457397, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.20246769487857819, + "eval_rewards/margins": 0.0031782032456249, + "eval_rewards/rejected": -0.20564593374729156, + "eval_runtime": 53.1235, + "eval_samples_per_second": 9.412, + "eval_steps_per_second": 4.706, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 0.17874790728092194, + "learning_rate": 6.7235555555555555e-06, + "log_odds_chosen": -0.14907710254192352, + "log_odds_ratio": -0.8681455850601196, + "logits/chosen": 1.6063718795776367, + "logits/rejected": 1.6573474407196045, + "logps/chosen": -2.107225179672241, + "logps/rejected": -1.97052001953125, + "loss": 1.8059173583984376, + "nll_loss": 1.7191026210784912, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21072253584861755, + "rewards/margins": -0.013670533895492554, + "rewards/rejected": -0.197052001953125, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 0.1660294085741043, + "learning_rate": 6.687999999999999e-06, + "log_odds_chosen": -0.017593836411833763, + "log_odds_ratio": -0.7714040875434875, + "logits/chosen": 1.6416600942611694, + "logits/rejected": 1.6900558471679688, + "logps/chosen": -1.9674263000488281, + "logps/rejected": -1.943830132484436, + "loss": 1.7882635116577148, + "nll_loss": 1.7111231088638306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19674262404441833, + "rewards/margins": -0.0023596244864165783, + "rewards/rejected": -0.19438298046588898, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 0.17608729004859924, + "learning_rate": 6.652444444444445e-06, + "log_odds_chosen": 0.2501833438873291, + "log_odds_ratio": -0.6537446975708008, + "logits/chosen": 1.6887744665145874, + "logits/rejected": 1.5986034870147705, + "logps/chosen": -1.9225715398788452, + "logps/rejected": -2.143808364868164, + "loss": 1.8512474060058595, + "nll_loss": 1.7858734130859375, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19225716590881348, + "rewards/margins": 0.02212369069457054, + "rewards/rejected": -0.21438086032867432, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 0.3151007294654846, + "learning_rate": 6.616888888888889e-06, + "log_odds_chosen": 0.04988854005932808, + "log_odds_ratio": -0.7158768177032471, + "logits/chosen": 1.569506049156189, + "logits/rejected": 1.4855579137802124, + "logps/chosen": -2.0310044288635254, + "logps/rejected": -2.07084059715271, + "loss": 1.8164968490600586, + "nll_loss": 1.744909644126892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20310044288635254, + "rewards/margins": 0.003983622882515192, + "rewards/rejected": -0.20708408951759338, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 0.2057182490825653, + "learning_rate": 6.5813333333333325e-06, + "log_odds_chosen": 0.44220876693725586, + "log_odds_ratio": -0.6088204979896545, + "logits/chosen": 1.682287573814392, + "logits/rejected": 1.5269925594329834, + "logps/chosen": -1.8356335163116455, + "logps/rejected": -2.236959934234619, + "loss": 1.7419025421142578, + "nll_loss": 1.6810203790664673, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18356335163116455, + "rewards/margins": 0.04013265669345856, + "rewards/rejected": -0.22369599342346191, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 0.16435767710208893, + "learning_rate": 6.545777777777777e-06, + "log_odds_chosen": 0.07594867050647736, + "log_odds_ratio": -0.742243766784668, + "logits/chosen": 1.66165030002594, + "logits/rejected": 1.5617786645889282, + "logps/chosen": -1.9473623037338257, + "logps/rejected": -2.006049871444702, + "loss": 1.8304746627807618, + "nll_loss": 1.7562503814697266, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19473622739315033, + "rewards/margins": 0.005868755746632814, + "rewards/rejected": -0.20060500502586365, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 0.15003077685832977, + "learning_rate": 6.510222222222222e-06, + "log_odds_chosen": 0.21500691771507263, + "log_odds_ratio": -0.6412914991378784, + "logits/chosen": 1.6037086248397827, + "logits/rejected": 1.438218355178833, + "logps/chosen": -1.9100940227508545, + "logps/rejected": -2.087085485458374, + "loss": 1.8060630798339843, + "nll_loss": 1.741934061050415, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19100941717624664, + "rewards/margins": 0.017699118703603745, + "rewards/rejected": -0.2087085247039795, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 0.1473378688097, + "learning_rate": 6.474666666666667e-06, + "log_odds_chosen": 0.0917447879910469, + "log_odds_ratio": -0.7169826626777649, + "logits/chosen": 1.63227117061615, + "logits/rejected": 1.5642629861831665, + "logps/chosen": -1.9436975717544556, + "logps/rejected": -2.0165576934814453, + "loss": 1.7880081176757812, + "nll_loss": 1.7163095474243164, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19436973333358765, + "rewards/margins": 0.0072860405780375, + "rewards/rejected": -0.201655775308609, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 0.156095951795578, + "learning_rate": 6.4391111111111105e-06, + "log_odds_chosen": 0.03318742290139198, + "log_odds_ratio": -0.7350637912750244, + "logits/chosen": 1.6490083932876587, + "logits/rejected": 1.5582869052886963, + "logps/chosen": -1.9969427585601807, + "logps/rejected": -2.019963264465332, + "loss": 1.8322141647338868, + "nll_loss": 1.7587080001831055, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19969427585601807, + "rewards/margins": 0.002302053850144148, + "rewards/rejected": -0.20199629664421082, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 0.19251035153865814, + "learning_rate": 6.403555555555555e-06, + "log_odds_chosen": 0.1400238573551178, + "log_odds_ratio": -0.6950441598892212, + "logits/chosen": 1.6086819171905518, + "logits/rejected": 1.5480411052703857, + "logps/chosen": -1.970768690109253, + "logps/rejected": -2.083794116973877, + "loss": 1.7771568298339844, + "nll_loss": 1.7076523303985596, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19707687199115753, + "rewards/margins": 0.011302560567855835, + "rewards/rejected": -0.20837941765785217, + "step": 700 + }, + { + "epoch": 0.56, + "eval_log_odds_chosen": 0.03070848062634468, + "eval_log_odds_ratio": -0.7618256211280823, + "eval_logits/chosen": 1.6476703882217407, + "eval_logits/rejected": 1.6128474473953247, + "eval_logps/chosen": -2.0164527893066406, + "eval_logps/rejected": -2.0491254329681396, + "eval_loss": 1.8477715253829956, + "eval_nll_loss": 1.771588921546936, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.20164531469345093, + "eval_rewards/margins": 0.003267248161137104, + "eval_rewards/rejected": -0.20491254329681396, + "eval_runtime": 53.1062, + "eval_samples_per_second": 9.415, + "eval_steps_per_second": 4.708, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 0.18433217704296112, + "learning_rate": 6.368e-06, + "log_odds_chosen": 0.17153123021125793, + "log_odds_ratio": -0.6689559817314148, + "logits/chosen": 1.6745821237564087, + "logits/rejected": 1.6360340118408203, + "logps/chosen": -1.9238487482070923, + "logps/rejected": -2.0793042182922363, + "loss": 1.7662607192993165, + "nll_loss": 1.6993646621704102, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19238488376140594, + "rewards/margins": 0.015545527450740337, + "rewards/rejected": -0.20793041586875916, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 0.18722620606422424, + "learning_rate": 6.3324444444444445e-06, + "log_odds_chosen": -0.07723536342382431, + "log_odds_ratio": -0.8040523529052734, + "logits/chosen": 1.611930251121521, + "logits/rejected": 1.6221723556518555, + "logps/chosen": -2.042644500732422, + "logps/rejected": -1.9738566875457764, + "loss": 1.8212066650390626, + "nll_loss": 1.740801215171814, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20426444709300995, + "rewards/margins": -0.006878760643303394, + "rewards/rejected": -0.19738569855690002, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 0.20682425796985626, + "learning_rate": 6.296888888888888e-06, + "log_odds_chosen": 0.12106932699680328, + "log_odds_ratio": -0.7056177258491516, + "logits/chosen": 1.5577377080917358, + "logits/rejected": 1.5362805128097534, + "logps/chosen": -2.016268014907837, + "logps/rejected": -2.120490550994873, + "loss": 1.7989938735961915, + "nll_loss": 1.7284319400787354, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20162677764892578, + "rewards/margins": 0.010422252118587494, + "rewards/rejected": -0.21204905211925507, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 0.19108694791793823, + "learning_rate": 6.261333333333333e-06, + "log_odds_chosen": 0.0587974414229393, + "log_odds_ratio": -0.7309185862541199, + "logits/chosen": 1.6171478033065796, + "logits/rejected": 1.4924392700195312, + "logps/chosen": -1.9424350261688232, + "logps/rejected": -1.9815622568130493, + "loss": 1.7689041137695312, + "nll_loss": 1.6958122253417969, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19424352049827576, + "rewards/margins": 0.003912704065442085, + "rewards/rejected": -0.1981562376022339, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 0.19131523370742798, + "learning_rate": 6.225777777777778e-06, + "log_odds_chosen": 0.023378366604447365, + "log_odds_ratio": -0.7382779717445374, + "logits/chosen": 1.5973578691482544, + "logits/rejected": 1.6551589965820312, + "logps/chosen": -1.959924340248108, + "logps/rejected": -1.9721896648406982, + "loss": 1.7737180709838867, + "nll_loss": 1.699890375137329, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19599245488643646, + "rewards/margins": 0.001226510270498693, + "rewards/rejected": -0.19721895456314087, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 0.15221014618873596, + "learning_rate": 6.190222222222222e-06, + "log_odds_chosen": 0.23301279544830322, + "log_odds_ratio": -0.6567850112915039, + "logits/chosen": 1.7112754583358765, + "logits/rejected": 1.6009547710418701, + "logps/chosen": -1.8999290466308594, + "logps/rejected": -2.1016502380371094, + "loss": 1.7749822616577149, + "nll_loss": 1.709303617477417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18999293446540833, + "rewards/margins": 0.020172089338302612, + "rewards/rejected": -0.21016499400138855, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 0.17331072688102722, + "learning_rate": 6.154666666666666e-06, + "log_odds_chosen": 0.08666707575321198, + "log_odds_ratio": -0.721922755241394, + "logits/chosen": 1.629499077796936, + "logits/rejected": 1.6233304738998413, + "logps/chosen": -1.966850996017456, + "logps/rejected": -2.0316665172576904, + "loss": 1.750493621826172, + "nll_loss": 1.678301215171814, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19668510556221008, + "rewards/margins": 0.006481558084487915, + "rewards/rejected": -0.2031666487455368, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 0.27533528208732605, + "learning_rate": 6.11911111111111e-06, + "log_odds_chosen": 0.2137812376022339, + "log_odds_ratio": -0.6508430242538452, + "logits/chosen": 1.6023156642913818, + "logits/rejected": 1.5528204441070557, + "logps/chosen": -1.9073442220687866, + "logps/rejected": -2.0768206119537354, + "loss": 1.7602737426757813, + "nll_loss": 1.6951894760131836, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19073444604873657, + "rewards/margins": 0.016947634518146515, + "rewards/rejected": -0.2076820582151413, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 0.16427750885486603, + "learning_rate": 6.083555555555556e-06, + "log_odds_chosen": -0.015147974714636803, + "log_odds_ratio": -0.7511974573135376, + "logits/chosen": 1.529996633529663, + "logits/rejected": 1.5390806198120117, + "logps/chosen": -2.041637420654297, + "logps/rejected": -2.0313525199890137, + "loss": 1.8287666320800782, + "nll_loss": 1.7536464929580688, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.20416374504566193, + "rewards/margins": -0.0010284921154379845, + "rewards/rejected": -0.20313525199890137, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 0.18231312930583954, + "learning_rate": 6.0479999999999995e-06, + "log_odds_chosen": 0.15317580103874207, + "log_odds_ratio": -0.6632872223854065, + "logits/chosen": 1.6685699224472046, + "logits/rejected": 1.7259252071380615, + "logps/chosen": -2.016608476638794, + "logps/rejected": -2.1375017166137695, + "loss": 1.8088930130004883, + "nll_loss": 1.7425638437271118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20166082680225372, + "rewards/margins": 0.012089352123439312, + "rewards/rejected": -0.2137501984834671, + "step": 800 + }, + { + "epoch": 0.64, + "eval_log_odds_chosen": 0.03274427726864815, + "eval_log_odds_ratio": -0.7604559063911438, + "eval_logits/chosen": 1.6779166460037231, + "eval_logits/rejected": 1.6468324661254883, + "eval_logps/chosen": -2.0128262042999268, + "eval_logps/rejected": -2.0465006828308105, + "eval_loss": 1.842376947402954, + "eval_nll_loss": 1.766331434249878, + "eval_rewards/accuracies": 0.5419999957084656, + "eval_rewards/chosen": -0.20128265023231506, + "eval_rewards/margins": 0.003367435419932008, + "eval_rewards/rejected": -0.20465007424354553, + "eval_runtime": 53.2521, + "eval_samples_per_second": 9.389, + "eval_steps_per_second": 4.695, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 0.2131498008966446, + "learning_rate": 6.012444444444444e-06, + "log_odds_chosen": 0.025272076949477196, + "log_odds_ratio": -0.7379814386367798, + "logits/chosen": 1.5435346364974976, + "logits/rejected": 1.6608002185821533, + "logps/chosen": -1.8906033039093018, + "logps/rejected": -1.9051824808120728, + "loss": 1.725562858581543, + "nll_loss": 1.6517642736434937, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.18906034529209137, + "rewards/margins": 0.0014579046983271837, + "rewards/rejected": -0.19051823019981384, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 0.21816125512123108, + "learning_rate": 5.976888888888888e-06, + "log_odds_chosen": 0.10699774324893951, + "log_odds_ratio": -0.7373054027557373, + "logits/chosen": 1.756774663925171, + "logits/rejected": 1.7355453968048096, + "logps/chosen": -1.9655370712280273, + "logps/rejected": -2.060356378555298, + "loss": 1.8129741668701171, + "nll_loss": 1.7392438650131226, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19655370712280273, + "rewards/margins": 0.009481914341449738, + "rewards/rejected": -0.20603564381599426, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 0.14879928529262543, + "learning_rate": 5.941333333333334e-06, + "log_odds_chosen": 0.07696692645549774, + "log_odds_ratio": -0.7127344012260437, + "logits/chosen": 1.6554100513458252, + "logits/rejected": 1.6326297521591187, + "logps/chosen": -1.9419372081756592, + "logps/rejected": -2.013288974761963, + "loss": 1.7566993713378907, + "nll_loss": 1.6854259967803955, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19419369101524353, + "rewards/margins": 0.007135221268981695, + "rewards/rejected": -0.20132891833782196, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 0.221551775932312, + "learning_rate": 5.9057777777777774e-06, + "log_odds_chosen": 0.06290511786937714, + "log_odds_ratio": -0.7516866326332092, + "logits/chosen": 1.575165867805481, + "logits/rejected": 1.5511913299560547, + "logps/chosen": -1.9852020740509033, + "logps/rejected": -2.0387279987335205, + "loss": 1.745162010192871, + "nll_loss": 1.6699934005737305, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1985202133655548, + "rewards/margins": 0.005352598614990711, + "rewards/rejected": -0.20387279987335205, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 0.15309089422225952, + "learning_rate": 5.870222222222222e-06, + "log_odds_chosen": 0.1161905974149704, + "log_odds_ratio": -0.689677894115448, + "logits/chosen": 1.7609052658081055, + "logits/rejected": 1.7412450313568115, + "logps/chosen": -1.9071937799453735, + "logps/rejected": -2.0045714378356934, + "loss": 1.7892265319824219, + "nll_loss": 1.7202587127685547, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1907193958759308, + "rewards/margins": 0.00973774679005146, + "rewards/rejected": -0.2004571408033371, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 0.14245319366455078, + "learning_rate": 5.834666666666666e-06, + "log_odds_chosen": 0.1561126410961151, + "log_odds_ratio": -0.6676959991455078, + "logits/chosen": 1.6795034408569336, + "logits/rejected": 1.6025750637054443, + "logps/chosen": -1.942917823791504, + "logps/rejected": -2.074822187423706, + "loss": 1.7381248474121094, + "nll_loss": 1.6713546514511108, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19429180026054382, + "rewards/margins": 0.01319044642150402, + "rewards/rejected": -0.207482248544693, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 0.2671545743942261, + "learning_rate": 5.799111111111111e-06, + "log_odds_chosen": 0.20992258191108704, + "log_odds_ratio": -0.6939498782157898, + "logits/chosen": 1.7179876565933228, + "logits/rejected": 1.6866029500961304, + "logps/chosen": -1.9251248836517334, + "logps/rejected": -2.1142725944519043, + "loss": 1.770625686645508, + "nll_loss": 1.7012306451797485, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19251248240470886, + "rewards/margins": 0.01891477219760418, + "rewards/rejected": -0.211427241563797, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 0.17794589698314667, + "learning_rate": 5.763555555555555e-06, + "log_odds_chosen": 0.04583617299795151, + "log_odds_ratio": -0.7294620871543884, + "logits/chosen": 1.7066138982772827, + "logits/rejected": 1.7307049036026, + "logps/chosen": -1.933084487915039, + "logps/rejected": -1.966208815574646, + "loss": 1.7707733154296874, + "nll_loss": 1.6978269815444946, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19330844283103943, + "rewards/margins": 0.0033124610781669617, + "rewards/rejected": -0.1966208964586258, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 0.1971806436777115, + "learning_rate": 5.727999999999999e-06, + "log_odds_chosen": 0.0007457077736034989, + "log_odds_ratio": -0.7630642056465149, + "logits/chosen": 1.7499278783798218, + "logits/rejected": 1.7503166198730469, + "logps/chosen": -1.918731689453125, + "logps/rejected": -1.9218223094940186, + "loss": 1.7608676910400392, + "nll_loss": 1.6845613718032837, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19187316298484802, + "rewards/margins": 0.0003090621903538704, + "rewards/rejected": -0.19218222796916962, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 0.17486584186553955, + "learning_rate": 5.692444444444445e-06, + "log_odds_chosen": 0.21501651406288147, + "log_odds_ratio": -0.6557433009147644, + "logits/chosen": 1.6884733438491821, + "logits/rejected": 1.6427476406097412, + "logps/chosen": -1.9072158336639404, + "logps/rejected": -2.0945980548858643, + "loss": 1.7564472198486327, + "nll_loss": 1.6908729076385498, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19072160124778748, + "rewards/margins": 0.018738189712166786, + "rewards/rejected": -0.2094598114490509, + "step": 900 + }, + { + "epoch": 0.72, + "eval_log_odds_chosen": 0.03506353497505188, + "eval_log_odds_ratio": -0.7600105404853821, + "eval_logits/chosen": 1.699849009513855, + "eval_logits/rejected": 1.6695457696914673, + "eval_logps/chosen": -2.0012686252593994, + "eval_logps/rejected": -2.03646183013916, + "eval_loss": 1.8359203338623047, + "eval_nll_loss": 1.7599191665649414, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.20012688636779785, + "eval_rewards/margins": 0.003519318765029311, + "eval_rewards/rejected": -0.2036461979150772, + "eval_runtime": 53.0131, + "eval_samples_per_second": 9.432, + "eval_steps_per_second": 4.716, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 0.2480229139328003, + "learning_rate": 5.656888888888889e-06, + "log_odds_chosen": 0.0064504086039960384, + "log_odds_ratio": -0.7494773268699646, + "logits/chosen": 1.6877663135528564, + "logits/rejected": 1.649176836013794, + "logps/chosen": -2.018514633178711, + "logps/rejected": -2.013549566268921, + "loss": 1.8033092498779297, + "nll_loss": 1.7283611297607422, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20185145735740662, + "rewards/margins": -0.0004965037223882973, + "rewards/rejected": -0.2013549506664276, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 0.19374053180217743, + "learning_rate": 5.621333333333333e-06, + "log_odds_chosen": 0.011787503957748413, + "log_odds_ratio": -0.7448769211769104, + "logits/chosen": 1.674197793006897, + "logits/rejected": 1.7002149820327759, + "logps/chosen": -1.8899329900741577, + "logps/rejected": -1.898186445236206, + "loss": 1.7617456436157226, + "nll_loss": 1.6872574090957642, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18899329006671906, + "rewards/margins": 0.0008253513951785862, + "rewards/rejected": -0.18981865048408508, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 0.156111478805542, + "learning_rate": 5.585777777777777e-06, + "log_odds_chosen": 0.05632457882165909, + "log_odds_ratio": -0.7555148005485535, + "logits/chosen": 1.805132269859314, + "logits/rejected": 1.8177416324615479, + "logps/chosen": -1.856993317604065, + "logps/rejected": -1.9158560037612915, + "loss": 1.8122926712036134, + "nll_loss": 1.736741304397583, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18569931387901306, + "rewards/margins": 0.0058862874284386635, + "rewards/rejected": -0.19158563017845154, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 0.19871090352535248, + "learning_rate": 5.550222222222223e-06, + "log_odds_chosen": 0.07815317809581757, + "log_odds_ratio": -0.7037801742553711, + "logits/chosen": 1.7272205352783203, + "logits/rejected": 1.8103294372558594, + "logps/chosen": -1.9780700206756592, + "logps/rejected": -2.0482537746429443, + "loss": 1.7779150009155273, + "nll_loss": 1.707537055015564, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19780699908733368, + "rewards/margins": 0.007018385920673609, + "rewards/rejected": -0.20482537150382996, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 0.22105462849140167, + "learning_rate": 5.5146666666666665e-06, + "log_odds_chosen": 0.1774008721113205, + "log_odds_ratio": -0.6982907056808472, + "logits/chosen": 1.6242185831069946, + "logits/rejected": 1.6261165142059326, + "logps/chosen": -1.941339135169983, + "logps/rejected": -2.0926012992858887, + "loss": 1.7724433898925782, + "nll_loss": 1.702614426612854, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1941339373588562, + "rewards/margins": 0.015126201324164867, + "rewards/rejected": -0.20926015079021454, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 0.2018124759197235, + "learning_rate": 5.479111111111111e-06, + "log_odds_chosen": 0.11117073148488998, + "log_odds_ratio": -0.7105401754379272, + "logits/chosen": 1.7710603475570679, + "logits/rejected": 1.6767441034317017, + "logps/chosen": -1.932356834411621, + "logps/rejected": -2.0336527824401855, + "loss": 1.7901832580566406, + "nll_loss": 1.719129204750061, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19323569536209106, + "rewards/margins": 0.010129592381417751, + "rewards/rejected": -0.2033652812242508, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 0.21761752665042877, + "learning_rate": 5.443555555555555e-06, + "log_odds_chosen": -0.05387473106384277, + "log_odds_ratio": -0.8190320134162903, + "logits/chosen": 1.6793544292449951, + "logits/rejected": 1.675851821899414, + "logps/chosen": -1.9604196548461914, + "logps/rejected": -1.9354708194732666, + "loss": 1.749907875061035, + "nll_loss": 1.6680047512054443, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19604197144508362, + "rewards/margins": -0.0024948944337666035, + "rewards/rejected": -0.1935470849275589, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 0.1489887237548828, + "learning_rate": 5.408e-06, + "log_odds_chosen": 0.10288698971271515, + "log_odds_ratio": -0.6924680471420288, + "logits/chosen": 1.7274665832519531, + "logits/rejected": 1.5743342638015747, + "logps/chosen": -1.913150429725647, + "logps/rejected": -2.003439426422119, + "loss": 1.816815185546875, + "nll_loss": 1.7475683689117432, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19131508469581604, + "rewards/margins": 0.009028871543705463, + "rewards/rejected": -0.20034393668174744, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 0.22919175028800964, + "learning_rate": 5.372444444444444e-06, + "log_odds_chosen": 0.26656976342201233, + "log_odds_ratio": -0.611583948135376, + "logits/chosen": 1.6766068935394287, + "logits/rejected": 1.680253028869629, + "logps/chosen": -1.9001144170761108, + "logps/rejected": -2.118840217590332, + "loss": 1.7628129959106444, + "nll_loss": 1.7016544342041016, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19001144170761108, + "rewards/margins": 0.021872568875551224, + "rewards/rejected": -0.2118840217590332, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 0.20565934479236603, + "learning_rate": 5.336888888888888e-06, + "log_odds_chosen": -0.10769150406122208, + "log_odds_ratio": -0.8185423612594604, + "logits/chosen": 1.7847192287445068, + "logits/rejected": 1.7344152927398682, + "logps/chosen": -1.9659755229949951, + "logps/rejected": -1.8742831945419312, + "loss": 1.791708755493164, + "nll_loss": 1.7098544836044312, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -0.19659757614135742, + "rewards/margins": -0.009169241413474083, + "rewards/rejected": -0.1874283254146576, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_log_odds_chosen": 0.03596750646829605, + "eval_log_odds_ratio": -0.7587533593177795, + "eval_logits/chosen": 1.7567447423934937, + "eval_logits/rejected": 1.7300385236740112, + "eval_logps/chosen": -1.9963513612747192, + "eval_logps/rejected": -2.0327131748199463, + "eval_loss": 1.83016037940979, + "eval_nll_loss": 1.7542850971221924, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": -0.19963513314723969, + "eval_rewards/margins": 0.0036361950915306807, + "eval_rewards/rejected": -0.20327134430408478, + "eval_runtime": 52.9272, + "eval_samples_per_second": 9.447, + "eval_steps_per_second": 4.723, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 0.17682485282421112, + "learning_rate": 5.301333333333333e-06, + "log_odds_chosen": 0.2020380049943924, + "log_odds_ratio": -0.664644181728363, + "logits/chosen": 1.8102779388427734, + "logits/rejected": 1.7338097095489502, + "logps/chosen": -1.8925039768218994, + "logps/rejected": -2.061493396759033, + "loss": 1.7180700302124023, + "nll_loss": 1.6516058444976807, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1892503798007965, + "rewards/margins": 0.016898952424526215, + "rewards/rejected": -0.20614933967590332, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 0.1735365241765976, + "learning_rate": 5.265777777777778e-06, + "log_odds_chosen": 0.25237327814102173, + "log_odds_ratio": -0.6863128542900085, + "logits/chosen": 1.7943061590194702, + "logits/rejected": 1.7719309329986572, + "logps/chosen": -1.8711779117584229, + "logps/rejected": -2.1002743244171143, + "loss": 1.7531299591064453, + "nll_loss": 1.6844985485076904, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.187117800116539, + "rewards/margins": 0.022909630089998245, + "rewards/rejected": -0.21002741158008575, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 0.16449587047100067, + "learning_rate": 5.230222222222222e-06, + "log_odds_chosen": 0.03473002091050148, + "log_odds_ratio": -0.7347549200057983, + "logits/chosen": 1.7814887762069702, + "logits/rejected": 1.641998291015625, + "logps/chosen": -1.9032049179077148, + "logps/rejected": -1.9274908304214478, + "loss": 1.7491247177124023, + "nll_loss": 1.6756490468978882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19032049179077148, + "rewards/margins": 0.0024286056868731976, + "rewards/rejected": -0.19274908304214478, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 0.29633238911628723, + "learning_rate": 5.194666666666666e-06, + "log_odds_chosen": 0.04156870022416115, + "log_odds_ratio": -0.7441189289093018, + "logits/chosen": 1.6556901931762695, + "logits/rejected": 1.6617138385772705, + "logps/chosen": -1.9671058654785156, + "logps/rejected": -1.9902048110961914, + "loss": 1.774675178527832, + "nll_loss": 1.700263261795044, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19671057164669037, + "rewards/margins": 0.002309908624738455, + "rewards/rejected": -0.19902050495147705, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 0.3215846121311188, + "learning_rate": 5.159111111111111e-06, + "log_odds_chosen": 0.026115605607628822, + "log_odds_ratio": -0.7675689458847046, + "logits/chosen": 1.7200599908828735, + "logits/rejected": 1.7598450183868408, + "logps/chosen": -1.9010694026947021, + "logps/rejected": -1.948642373085022, + "loss": 1.7812797546386718, + "nll_loss": 1.7045230865478516, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19010695815086365, + "rewards/margins": 0.004757292568683624, + "rewards/rejected": -0.19486424326896667, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 0.1802208572626114, + "learning_rate": 5.1235555555555556e-06, + "log_odds_chosen": 0.24781334400177002, + "log_odds_ratio": -0.6703814268112183, + "logits/chosen": 1.7133815288543701, + "logits/rejected": 1.7213478088378906, + "logps/chosen": -1.7803637981414795, + "logps/rejected": -2.0020906925201416, + "loss": 1.6906932830810546, + "nll_loss": 1.6236553192138672, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1780363768339157, + "rewards/margins": 0.022172680124640465, + "rewards/rejected": -0.20020906627178192, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 0.15786853432655334, + "learning_rate": 5.088e-06, + "log_odds_chosen": 0.1844882071018219, + "log_odds_ratio": -0.6841514110565186, + "logits/chosen": 1.630793809890747, + "logits/rejected": 1.557401418685913, + "logps/chosen": -1.9179973602294922, + "logps/rejected": -2.0797696113586426, + "loss": 1.7438488006591797, + "nll_loss": 1.6754337549209595, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19179973006248474, + "rewards/margins": 0.016177207231521606, + "rewards/rejected": -0.20797693729400635, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 0.14527210593223572, + "learning_rate": 5.052444444444444e-06, + "log_odds_chosen": 0.01497338991612196, + "log_odds_ratio": -0.7727933526039124, + "logits/chosen": 1.7897268533706665, + "logits/rejected": 1.6854931116104126, + "logps/chosen": -1.8946952819824219, + "logps/rejected": -1.9192304611206055, + "loss": 1.8023780822753905, + "nll_loss": 1.7250983715057373, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.189469575881958, + "rewards/margins": 0.002453479217365384, + "rewards/rejected": -0.19192305207252502, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 0.17399252951145172, + "learning_rate": 5.016888888888888e-06, + "log_odds_chosen": 0.14793001115322113, + "log_odds_ratio": -0.688338041305542, + "logits/chosen": 1.8418161869049072, + "logits/rejected": 1.8267091512680054, + "logps/chosen": -1.9205141067504883, + "logps/rejected": -2.0402419567108154, + "loss": 1.799574851989746, + "nll_loss": 1.7307411432266235, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19205141067504883, + "rewards/margins": 0.011972772888839245, + "rewards/rejected": -0.20402422547340393, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 0.1753067970275879, + "learning_rate": 4.9813333333333335e-06, + "log_odds_chosen": 0.11683394014835358, + "log_odds_ratio": -0.7051060795783997, + "logits/chosen": 1.799207091331482, + "logits/rejected": 1.817365288734436, + "logps/chosen": -1.9414863586425781, + "logps/rejected": -2.025501251220703, + "loss": 1.7512321472167969, + "nll_loss": 1.6807218790054321, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19414862990379333, + "rewards/margins": 0.008401499129831791, + "rewards/rejected": -0.20255012810230255, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_log_odds_chosen": 0.0326814129948616, + "eval_log_odds_ratio": -0.7593976855278015, + "eval_logits/chosen": 1.8343305587768555, + "eval_logits/rejected": 1.8080686330795288, + "eval_logps/chosen": -1.9903963804244995, + "eval_logps/rejected": -2.0238075256347656, + "eval_loss": 1.82623291015625, + "eval_nll_loss": 1.750293254852295, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.19903963804244995, + "eval_rewards/margins": 0.0033411220647394657, + "eval_rewards/rejected": -0.20238077640533447, + "eval_runtime": 53.2827, + "eval_samples_per_second": 9.384, + "eval_steps_per_second": 4.692, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 0.16481465101242065, + "learning_rate": 4.945777777777777e-06, + "log_odds_chosen": 0.2010401487350464, + "log_odds_ratio": -0.7306350469589233, + "logits/chosen": 1.8544563055038452, + "logits/rejected": 1.8101685047149658, + "logps/chosen": -2.000797986984253, + "logps/rejected": -2.1975042819976807, + "loss": 1.840962791442871, + "nll_loss": 1.7678991556167603, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.20007982850074768, + "rewards/margins": 0.019670633599162102, + "rewards/rejected": -0.21975044906139374, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 0.23172008991241455, + "learning_rate": 4.910222222222222e-06, + "log_odds_chosen": -0.013389323838055134, + "log_odds_ratio": -0.7784253358840942, + "logits/chosen": 1.8498008251190186, + "logits/rejected": 1.8121620416641235, + "logps/chosen": -1.996519684791565, + "logps/rejected": -1.9884834289550781, + "loss": 1.797800636291504, + "nll_loss": 1.7199580669403076, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19965195655822754, + "rewards/margins": -0.0008036093786358833, + "rewards/rejected": -0.19884835183620453, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 0.1659555435180664, + "learning_rate": 4.874666666666666e-06, + "log_odds_chosen": 0.061679303646087646, + "log_odds_ratio": -0.7274680137634277, + "logits/chosen": 1.8414134979248047, + "logits/rejected": 1.728663444519043, + "logps/chosen": -1.9462897777557373, + "logps/rejected": -1.996514916419983, + "loss": 1.7753076553344727, + "nll_loss": 1.7025604248046875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1946289837360382, + "rewards/margins": 0.005022515542805195, + "rewards/rejected": -0.19965150952339172, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 0.23450258374214172, + "learning_rate": 4.839111111111111e-06, + "log_odds_chosen": 0.013548937626183033, + "log_odds_ratio": -0.7774235606193542, + "logits/chosen": 1.762843370437622, + "logits/rejected": 1.7083097696304321, + "logps/chosen": -2.0285325050354004, + "logps/rejected": -2.0414459705352783, + "loss": 1.8112398147583009, + "nll_loss": 1.7334976196289062, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.2028532326221466, + "rewards/margins": 0.0012913575628772378, + "rewards/rejected": -0.20414459705352783, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 0.24944870173931122, + "learning_rate": 4.803555555555555e-06, + "log_odds_chosen": 0.079915352165699, + "log_odds_ratio": -0.736060380935669, + "logits/chosen": 1.7173516750335693, + "logits/rejected": 1.64029860496521, + "logps/chosen": -1.9370155334472656, + "logps/rejected": -2.007744312286377, + "loss": 1.7322145462036134, + "nll_loss": 1.6586081981658936, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19370155036449432, + "rewards/margins": 0.00707287946715951, + "rewards/rejected": -0.2007744312286377, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 0.20937688648700714, + "learning_rate": 4.768e-06, + "log_odds_chosen": 0.15667779743671417, + "log_odds_ratio": -0.678199827671051, + "logits/chosen": 1.8389065265655518, + "logits/rejected": 1.811120629310608, + "logps/chosen": -1.8186038732528687, + "logps/rejected": -1.948980689048767, + "loss": 1.71126708984375, + "nll_loss": 1.643446922302246, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18186041712760925, + "rewards/margins": 0.013037679716944695, + "rewards/rejected": -0.1948980987071991, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 0.15026956796646118, + "learning_rate": 4.732444444444444e-06, + "log_odds_chosen": 0.10751942545175552, + "log_odds_ratio": -0.7100101709365845, + "logits/chosen": 1.8801990747451782, + "logits/rejected": 1.8757660388946533, + "logps/chosen": -1.9047510623931885, + "logps/rejected": -1.9837467670440674, + "loss": 1.7867193222045898, + "nll_loss": 1.7157180309295654, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19047510623931885, + "rewards/margins": 0.007899556308984756, + "rewards/rejected": -0.1983746588230133, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 0.20250067114830017, + "learning_rate": 4.696888888888889e-06, + "log_odds_chosen": 0.10309334099292755, + "log_odds_ratio": -0.7117387056350708, + "logits/chosen": 1.816712737083435, + "logits/rejected": 1.689218282699585, + "logps/chosen": -1.9982006549835205, + "logps/rejected": -2.075469493865967, + "loss": 1.802886390686035, + "nll_loss": 1.7317125797271729, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19982007145881653, + "rewards/margins": 0.007726915180683136, + "rewards/rejected": -0.20754699409008026, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 0.1596970111131668, + "learning_rate": 4.661333333333333e-06, + "log_odds_chosen": 0.105086550116539, + "log_odds_ratio": -0.6774314641952515, + "logits/chosen": 1.7848875522613525, + "logits/rejected": 1.8055425882339478, + "logps/chosen": -1.9509786367416382, + "logps/rejected": -2.0330584049224854, + "loss": 1.8003599166870117, + "nll_loss": 1.7326167821884155, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.195097878575325, + "rewards/margins": 0.008207983337342739, + "rewards/rejected": -0.20330584049224854, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 0.18611599504947662, + "learning_rate": 4.625777777777777e-06, + "log_odds_chosen": 0.2091858834028244, + "log_odds_ratio": -0.6977974772453308, + "logits/chosen": 1.768100380897522, + "logits/rejected": 1.745809555053711, + "logps/chosen": -1.8548533916473389, + "logps/rejected": -2.024160861968994, + "loss": 1.7493671417236327, + "nll_loss": 1.6795871257781982, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1854853630065918, + "rewards/margins": 0.016930732876062393, + "rewards/rejected": -0.2024160921573639, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_log_odds_chosen": 0.04138939082622528, + "eval_log_odds_ratio": -0.7565131783485413, + "eval_logits/chosen": 1.7945410013198853, + "eval_logits/rejected": 1.7709347009658813, + "eval_logps/chosen": -1.9902427196502686, + "eval_logps/rejected": -2.0308339595794678, + "eval_loss": 1.821491003036499, + "eval_nll_loss": 1.7458395957946777, + "eval_rewards/accuracies": 0.5419999957084656, + "eval_rewards/chosen": -0.1990242600440979, + "eval_rewards/margins": 0.00405914057046175, + "eval_rewards/rejected": -0.20308341085910797, + "eval_runtime": 53.0681, + "eval_samples_per_second": 9.422, + "eval_steps_per_second": 4.711, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 0.19560930132865906, + "learning_rate": 4.5902222222222225e-06, + "log_odds_chosen": 0.1029076799750328, + "log_odds_ratio": -0.7135358452796936, + "logits/chosen": 1.7437855005264282, + "logits/rejected": 1.7035853862762451, + "logps/chosen": -1.9591327905654907, + "logps/rejected": -2.0517547130584717, + "loss": 1.7739175796508788, + "nll_loss": 1.7025638818740845, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19591325521469116, + "rewards/margins": 0.009262214414775372, + "rewards/rejected": -0.2051754742860794, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 0.161069855093956, + "learning_rate": 4.554666666666666e-06, + "log_odds_chosen": -0.12622077763080597, + "log_odds_ratio": -0.8286466598510742, + "logits/chosen": 1.9193840026855469, + "logits/rejected": 1.8464590311050415, + "logps/chosen": -1.9970428943634033, + "logps/rejected": -1.9042726755142212, + "loss": 1.7340478897094727, + "nll_loss": 1.6511831283569336, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.19970427453517914, + "rewards/margins": -0.009276997298002243, + "rewards/rejected": -0.1904272735118866, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 0.27507203817367554, + "learning_rate": 4.519111111111111e-06, + "log_odds_chosen": 0.03895152360200882, + "log_odds_ratio": -0.7696425318717957, + "logits/chosen": 1.8306211233139038, + "logits/rejected": 1.8841800689697266, + "logps/chosen": -1.9324079751968384, + "logps/rejected": -1.969473123550415, + "loss": 1.74224910736084, + "nll_loss": 1.6652848720550537, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19324080646038055, + "rewards/margins": 0.003706505987793207, + "rewards/rejected": -0.19694730639457703, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 0.34326133131980896, + "learning_rate": 4.483555555555555e-06, + "log_odds_chosen": 0.09721614420413971, + "log_odds_ratio": -0.746010959148407, + "logits/chosen": 1.8264877796173096, + "logits/rejected": 1.8560062646865845, + "logps/chosen": -1.8512614965438843, + "logps/rejected": -1.9252874851226807, + "loss": 1.749200439453125, + "nll_loss": 1.6745994091033936, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1851261556148529, + "rewards/margins": 0.007402592804282904, + "rewards/rejected": -0.19252872467041016, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 0.21193479001522064, + "learning_rate": 4.4480000000000004e-06, + "log_odds_chosen": 0.15323859453201294, + "log_odds_ratio": -0.6695608496665955, + "logits/chosen": 1.8079290390014648, + "logits/rejected": 1.8241838216781616, + "logps/chosen": -1.9059537649154663, + "logps/rejected": -2.032207489013672, + "loss": 1.7593469619750977, + "nll_loss": 1.6923907995224, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19059538841247559, + "rewards/margins": 0.012625358998775482, + "rewards/rejected": -0.20322072505950928, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 0.20177887380123138, + "learning_rate": 4.412444444444444e-06, + "log_odds_chosen": 0.18453414738178253, + "log_odds_ratio": -0.6602174639701843, + "logits/chosen": 1.7973260879516602, + "logits/rejected": 1.800276517868042, + "logps/chosen": -1.9282119274139404, + "logps/rejected": -2.0611374378204346, + "loss": 1.7722354888916017, + "nll_loss": 1.7062139511108398, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1928211897611618, + "rewards/margins": 0.013292545452713966, + "rewards/rejected": -0.20611374080181122, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 0.19174514710903168, + "learning_rate": 4.376888888888889e-06, + "log_odds_chosen": 0.17968787252902985, + "log_odds_ratio": -0.6729904413223267, + "logits/chosen": 1.825566053390503, + "logits/rejected": 1.7393420934677124, + "logps/chosen": -1.8471599817276, + "logps/rejected": -1.9890083074569702, + "loss": 1.722864532470703, + "nll_loss": 1.6555652618408203, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.18471598625183105, + "rewards/margins": 0.01418487448245287, + "rewards/rejected": -0.19890084862709045, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 0.21415722370147705, + "learning_rate": 4.341333333333333e-06, + "log_odds_chosen": 0.1800077110528946, + "log_odds_ratio": -0.7383973598480225, + "logits/chosen": 1.849203109741211, + "logits/rejected": 1.7909198999404907, + "logps/chosen": -1.999140977859497, + "logps/rejected": -2.184372901916504, + "loss": 1.7739013671875, + "nll_loss": 1.7000617980957031, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1999140828847885, + "rewards/margins": 0.018523216247558594, + "rewards/rejected": -0.2184373140335083, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 0.21206778287887573, + "learning_rate": 4.305777777777778e-06, + "log_odds_chosen": -0.07741276919841766, + "log_odds_ratio": -0.817004382610321, + "logits/chosen": 1.7782243490219116, + "logits/rejected": 1.7478315830230713, + "logps/chosen": -1.9559736251831055, + "logps/rejected": -1.9229551553726196, + "loss": 1.7454774856567383, + "nll_loss": 1.6637769937515259, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19559738039970398, + "rewards/margins": -0.003301867749541998, + "rewards/rejected": -0.19229550659656525, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 0.2803303599357605, + "learning_rate": 4.270222222222222e-06, + "log_odds_chosen": 0.37417811155319214, + "log_odds_ratio": -0.6541550755500793, + "logits/chosen": 1.9080060720443726, + "logits/rejected": 1.781346082687378, + "logps/chosen": -1.8062732219696045, + "logps/rejected": -2.1362411975860596, + "loss": 1.7089204788208008, + "nll_loss": 1.6435050964355469, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18062731623649597, + "rewards/margins": 0.032996825873851776, + "rewards/rejected": -0.21362414956092834, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_log_odds_chosen": 0.03616252541542053, + "eval_log_odds_ratio": -0.7587297558784485, + "eval_logits/chosen": 1.8630521297454834, + "eval_logits/rejected": 1.8428224325180054, + "eval_logps/chosen": -1.9820479154586792, + "eval_logps/rejected": -2.0179696083068848, + "eval_loss": 1.816968560218811, + "eval_nll_loss": 1.7410955429077148, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.19820478558540344, + "eval_rewards/margins": 0.0035921703092753887, + "eval_rewards/rejected": -0.20179696381092072, + "eval_runtime": 53.1139, + "eval_samples_per_second": 9.414, + "eval_steps_per_second": 4.707, + "step": 1300 + }, + { + "epoch": 1.048, + "grad_norm": 0.142170250415802, + "learning_rate": 4.234666666666666e-06, + "log_odds_chosen": 0.08604643493890762, + "log_odds_ratio": -0.7244693636894226, + "logits/chosen": 1.8486318588256836, + "logits/rejected": 1.8644578456878662, + "logps/chosen": -1.8851855993270874, + "logps/rejected": -1.9627344608306885, + "loss": 1.7843137741088868, + "nll_loss": 1.7118666172027588, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18851855397224426, + "rewards/margins": 0.007754878140985966, + "rewards/rejected": -0.19627343118190765, + "step": 1310 + }, + { + "epoch": 1.056, + "grad_norm": 0.19082266092300415, + "learning_rate": 4.199111111111111e-06, + "log_odds_chosen": 0.10004905611276627, + "log_odds_ratio": -0.7227860689163208, + "logits/chosen": 1.7743819952011108, + "logits/rejected": 1.8303782939910889, + "logps/chosen": -1.8624858856201172, + "logps/rejected": -1.9419944286346436, + "loss": 1.7142595291137694, + "nll_loss": 1.6419808864593506, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.18624861538410187, + "rewards/margins": 0.00795083586126566, + "rewards/rejected": -0.19419944286346436, + "step": 1320 + }, + { + "epoch": 1.064, + "grad_norm": 0.15585362911224365, + "learning_rate": 4.1635555555555554e-06, + "log_odds_chosen": 0.13298745453357697, + "log_odds_ratio": -0.7184717655181885, + "logits/chosen": 1.8939129114151, + "logits/rejected": 1.8457763195037842, + "logps/chosen": -1.7758653163909912, + "logps/rejected": -1.9016306400299072, + "loss": 1.758875274658203, + "nll_loss": 1.687028169631958, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.17758652567863464, + "rewards/margins": 0.012576532550156116, + "rewards/rejected": -0.19016307592391968, + "step": 1330 + }, + { + "epoch": 1.072, + "grad_norm": 0.2081124633550644, + "learning_rate": 4.128e-06, + "log_odds_chosen": 0.04596617445349693, + "log_odds_ratio": -0.787051260471344, + "logits/chosen": 1.866758942604065, + "logits/rejected": 1.8375155925750732, + "logps/chosen": -2.0039141178131104, + "logps/rejected": -2.0341384410858154, + "loss": 1.7708185195922852, + "nll_loss": 1.6921135187149048, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20039141178131104, + "rewards/margins": 0.0030224404763430357, + "rewards/rejected": -0.20341384410858154, + "step": 1340 + }, + { + "epoch": 1.08, + "grad_norm": 0.20339496433734894, + "learning_rate": 4.092444444444444e-06, + "log_odds_chosen": 0.06565378606319427, + "log_odds_ratio": -0.7353135943412781, + "logits/chosen": 1.8310129642486572, + "logits/rejected": 1.7633676528930664, + "logps/chosen": -1.9292491674423218, + "logps/rejected": -1.9627234935760498, + "loss": 1.7951900482177734, + "nll_loss": 1.72165846824646, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19292493164539337, + "rewards/margins": 0.0033474296797066927, + "rewards/rejected": -0.1962723433971405, + "step": 1350 + }, + { + "epoch": 1.088, + "grad_norm": 0.1730116605758667, + "learning_rate": 4.056888888888889e-06, + "log_odds_chosen": 0.07272230833768845, + "log_odds_ratio": -0.7393074035644531, + "logits/chosen": 1.8491909503936768, + "logits/rejected": 1.849678635597229, + "logps/chosen": -2.0217833518981934, + "logps/rejected": -2.0789294242858887, + "loss": 1.7909440994262695, + "nll_loss": 1.7170133590698242, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.20217831432819366, + "rewards/margins": 0.005714614875614643, + "rewards/rejected": -0.20789292454719543, + "step": 1360 + }, + { + "epoch": 1.096, + "grad_norm": 0.20503339171409607, + "learning_rate": 4.021333333333333e-06, + "log_odds_chosen": 0.3173345625400543, + "log_odds_ratio": -0.6747878789901733, + "logits/chosen": 1.777547836303711, + "logits/rejected": 1.7704051733016968, + "logps/chosen": -1.83544921875, + "logps/rejected": -2.1259076595306396, + "loss": 1.6998504638671874, + "nll_loss": 1.6323719024658203, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18354493379592896, + "rewards/margins": 0.02904583141207695, + "rewards/rejected": -0.2125907689332962, + "step": 1370 + }, + { + "epoch": 1.104, + "grad_norm": 0.20165199041366577, + "learning_rate": 3.985777777777778e-06, + "log_odds_chosen": 0.0595523826777935, + "log_odds_ratio": -0.7062498927116394, + "logits/chosen": 1.8233400583267212, + "logits/rejected": 1.8652807474136353, + "logps/chosen": -1.9540764093399048, + "logps/rejected": -2.0037131309509277, + "loss": 1.7615316390991211, + "nll_loss": 1.6909065246582031, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1954076588153839, + "rewards/margins": 0.0049636876210570335, + "rewards/rejected": -0.20037131011486053, + "step": 1380 + }, + { + "epoch": 1.112, + "grad_norm": 0.15956999361515045, + "learning_rate": 3.950222222222222e-06, + "log_odds_chosen": 0.050791315734386444, + "log_odds_ratio": -0.730897843837738, + "logits/chosen": 1.8976904153823853, + "logits/rejected": 1.8682187795639038, + "logps/chosen": -1.9485187530517578, + "logps/rejected": -1.9784114360809326, + "loss": 1.8059589385986328, + "nll_loss": 1.7328689098358154, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.19485187530517578, + "rewards/margins": 0.0029892651364207268, + "rewards/rejected": -0.1978411227464676, + "step": 1390 + }, + { + "epoch": 1.12, + "grad_norm": 0.16580845415592194, + "learning_rate": 3.9146666666666666e-06, + "log_odds_chosen": 0.13212139904499054, + "log_odds_ratio": -0.7306901216506958, + "logits/chosen": 1.9302421808242798, + "logits/rejected": 1.9482580423355103, + "logps/chosen": -1.8804752826690674, + "logps/rejected": -1.9942712783813477, + "loss": 1.7713314056396485, + "nll_loss": 1.6982624530792236, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18804752826690674, + "rewards/margins": 0.01137961633503437, + "rewards/rejected": -0.19942712783813477, + "step": 1400 + }, + { + "epoch": 1.12, + "eval_log_odds_chosen": 0.03508565574884415, + "eval_log_odds_ratio": -0.7589212656021118, + "eval_logits/chosen": 1.8507174253463745, + "eval_logits/rejected": 1.8313792943954468, + "eval_logps/chosen": -1.9807339906692505, + "eval_logps/rejected": -2.016141653060913, + "eval_loss": 1.8125864267349243, + "eval_nll_loss": 1.7366943359375, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.19807341694831848, + "eval_rewards/margins": 0.0035407766699790955, + "eval_rewards/rejected": -0.20161418616771698, + "eval_runtime": 53.4841, + "eval_samples_per_second": 9.349, + "eval_steps_per_second": 4.674, + "step": 1400 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.21889333426952362, + "learning_rate": 3.87911111111111e-06, + "log_odds_chosen": 0.035620056092739105, + "log_odds_ratio": -0.7623010873794556, + "logits/chosen": 1.81222665309906, + "logits/rejected": 1.8632707595825195, + "logps/chosen": -1.960524320602417, + "logps/rejected": -1.9879045486450195, + "loss": 1.8127471923828125, + "nll_loss": 1.7365171909332275, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1960524320602417, + "rewards/margins": 0.002738040406256914, + "rewards/rejected": -0.19879046082496643, + "step": 1410 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.22390495240688324, + "learning_rate": 3.843555555555555e-06, + "log_odds_chosen": 0.0008658409351482987, + "log_odds_ratio": -0.7709900736808777, + "logits/chosen": 1.7942880392074585, + "logits/rejected": 1.8574626445770264, + "logps/chosen": -1.954573392868042, + "logps/rejected": -1.9510538578033447, + "loss": 1.7888818740844727, + "nll_loss": 1.7117828130722046, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1954573690891266, + "rewards/margins": -0.000351964496076107, + "rewards/rejected": -0.1951053887605667, + "step": 1420 + }, + { + "epoch": 1.144, + "grad_norm": 0.2423250824213028, + "learning_rate": 3.808e-06, + "log_odds_chosen": 0.20300379395484924, + "log_odds_ratio": -0.7344587445259094, + "logits/chosen": 1.8965059518814087, + "logits/rejected": 1.865016222000122, + "logps/chosen": -1.830789566040039, + "logps/rejected": -2.030541181564331, + "loss": 1.7464214324951173, + "nll_loss": 1.672975778579712, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18307895958423615, + "rewards/margins": 0.01997516117990017, + "rewards/rejected": -0.20305411517620087, + "step": 1430 + }, + { + "epoch": 1.152, + "grad_norm": 0.15725889801979065, + "learning_rate": 3.7724444444444445e-06, + "log_odds_chosen": -0.013941275887191296, + "log_odds_ratio": -0.7538126707077026, + "logits/chosen": 1.8744310140609741, + "logits/rejected": 1.8045070171356201, + "logps/chosen": -1.9909547567367554, + "logps/rejected": -1.9801406860351562, + "loss": 1.7410266876220704, + "nll_loss": 1.6656453609466553, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1990954577922821, + "rewards/margins": -0.0010814003180712461, + "rewards/rejected": -0.1980140656232834, + "step": 1440 + }, + { + "epoch": 1.16, + "grad_norm": 0.17255175113677979, + "learning_rate": 3.7368888888888883e-06, + "log_odds_chosen": 0.010356083512306213, + "log_odds_ratio": -0.7647982239723206, + "logits/chosen": 1.9456676244735718, + "logits/rejected": 1.8851861953735352, + "logps/chosen": -1.9676564931869507, + "logps/rejected": -1.9645713567733765, + "loss": 1.7504322052001953, + "nll_loss": 1.6739521026611328, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19676563143730164, + "rewards/margins": -0.0003085043281316757, + "rewards/rejected": -0.1964571326971054, + "step": 1450 + }, + { + "epoch": 1.168, + "grad_norm": 0.18616439402103424, + "learning_rate": 3.701333333333333e-06, + "log_odds_chosen": 0.06657058745622635, + "log_odds_ratio": -0.7217355370521545, + "logits/chosen": 1.8338171243667603, + "logits/rejected": 1.8438618183135986, + "logps/chosen": -1.98470139503479, + "logps/rejected": -2.043152332305908, + "loss": 1.7677242279052734, + "nll_loss": 1.6955506801605225, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1984701305627823, + "rewards/margins": 0.005845111794769764, + "rewards/rejected": -0.20431523025035858, + "step": 1460 + }, + { + "epoch": 1.176, + "grad_norm": 0.22820734977722168, + "learning_rate": 3.6657777777777773e-06, + "log_odds_chosen": 0.10153277218341827, + "log_odds_ratio": -0.6983687281608582, + "logits/chosen": 1.915000319480896, + "logits/rejected": 1.9120140075683594, + "logps/chosen": -1.927222490310669, + "logps/rejected": -2.0119576454162598, + "loss": 1.7736129760742188, + "nll_loss": 1.7037763595581055, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19272224605083466, + "rewards/margins": 0.008473522961139679, + "rewards/rejected": -0.20119579136371613, + "step": 1470 + }, + { + "epoch": 1.184, + "grad_norm": 0.17705458402633667, + "learning_rate": 3.630222222222222e-06, + "log_odds_chosen": 0.15250150859355927, + "log_odds_ratio": -0.7717846035957336, + "logits/chosen": 1.814196228981018, + "logits/rejected": 1.7537425756454468, + "logps/chosen": -1.879522681236267, + "logps/rejected": -2.0431935787200928, + "loss": 1.7213356018066406, + "nll_loss": 1.6441571712493896, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18795228004455566, + "rewards/margins": 0.016367079690098763, + "rewards/rejected": -0.20431935787200928, + "step": 1480 + }, + { + "epoch": 1.192, + "grad_norm": 0.18402282893657684, + "learning_rate": 3.5946666666666662e-06, + "log_odds_chosen": 0.019868457689881325, + "log_odds_ratio": -0.7824932336807251, + "logits/chosen": 1.932429552078247, + "logits/rejected": 1.8701515197753906, + "logps/chosen": -2.009019136428833, + "logps/rejected": -2.0110716819763184, + "loss": 1.7473308563232421, + "nll_loss": 1.6690819263458252, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20090194046497345, + "rewards/margins": 0.00020524598949123174, + "rewards/rejected": -0.2011071741580963, + "step": 1490 + }, + { + "epoch": 1.2, + "grad_norm": 0.2257532775402069, + "learning_rate": 3.559111111111111e-06, + "log_odds_chosen": 0.17915096879005432, + "log_odds_ratio": -0.6777058839797974, + "logits/chosen": 1.8818708658218384, + "logits/rejected": 1.8758437633514404, + "logps/chosen": -1.7928282022476196, + "logps/rejected": -1.9602582454681396, + "loss": 1.6830949783325195, + "nll_loss": 1.6153247356414795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17928281426429749, + "rewards/margins": 0.016743017360568047, + "rewards/rejected": -0.1960258185863495, + "step": 1500 + }, + { + "epoch": 1.2, + "eval_log_odds_chosen": 0.035420093685388565, + "eval_log_odds_ratio": -0.7593028545379639, + "eval_logits/chosen": 1.8614498376846313, + "eval_logits/rejected": 1.8436392545700073, + "eval_logps/chosen": -1.975562572479248, + "eval_logps/rejected": -2.0109596252441406, + "eval_loss": 1.8058576583862305, + "eval_nll_loss": 1.72992742061615, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": -0.1975562423467636, + "eval_rewards/margins": 0.00353970006108284, + "eval_rewards/rejected": -0.20109596848487854, + "eval_runtime": 53.0644, + "eval_samples_per_second": 9.423, + "eval_steps_per_second": 4.711, + "step": 1500 + }, + { + "epoch": 1.208, + "grad_norm": 0.2579845190048218, + "learning_rate": 3.5235555555555556e-06, + "log_odds_chosen": 0.10818381607532501, + "log_odds_ratio": -0.705440878868103, + "logits/chosen": 1.868035912513733, + "logits/rejected": 1.867733359336853, + "logps/chosen": -1.9699623584747314, + "logps/rejected": -2.055683135986328, + "loss": 1.7757377624511719, + "nll_loss": 1.7051931619644165, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19699624180793762, + "rewards/margins": 0.00857207365334034, + "rewards/rejected": -0.2055683135986328, + "step": 1510 + }, + { + "epoch": 1.216, + "grad_norm": 0.1755809634923935, + "learning_rate": 3.488e-06, + "log_odds_chosen": 0.2761983275413513, + "log_odds_ratio": -0.6301968693733215, + "logits/chosen": 1.9086204767227173, + "logits/rejected": 1.905368447303772, + "logps/chosen": -1.8556480407714844, + "logps/rejected": -2.0749871730804443, + "loss": 1.7898880004882813, + "nll_loss": 1.7268680334091187, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1855648010969162, + "rewards/margins": 0.02193392440676689, + "rewards/rejected": -0.2074987143278122, + "step": 1520 + }, + { + "epoch": 1.224, + "grad_norm": 0.17956162989139557, + "learning_rate": 3.4524444444444446e-06, + "log_odds_chosen": -0.14030766487121582, + "log_odds_ratio": -0.8471817970275879, + "logits/chosen": 1.790841817855835, + "logits/rejected": 1.7734079360961914, + "logps/chosen": -2.167576551437378, + "logps/rejected": -2.0355286598205566, + "loss": 1.8703905105590821, + "nll_loss": 1.7856724262237549, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.2167576551437378, + "rewards/margins": -0.013204795308411121, + "rewards/rejected": -0.20355287194252014, + "step": 1530 + }, + { + "epoch": 1.232, + "grad_norm": 0.23301228880882263, + "learning_rate": 3.416888888888889e-06, + "log_odds_chosen": 0.19442808628082275, + "log_odds_ratio": -0.6882377862930298, + "logits/chosen": 1.8302663564682007, + "logits/rejected": 1.734037160873413, + "logps/chosen": -1.9154956340789795, + "logps/rejected": -2.073822259902954, + "loss": 1.788707160949707, + "nll_loss": 1.7198832035064697, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19154959917068481, + "rewards/margins": 0.015832625329494476, + "rewards/rejected": -0.2073822021484375, + "step": 1540 + }, + { + "epoch": 1.24, + "grad_norm": 0.26810088753700256, + "learning_rate": 3.3813333333333335e-06, + "log_odds_chosen": 0.1409793198108673, + "log_odds_ratio": -0.6711575388908386, + "logits/chosen": 1.708510398864746, + "logits/rejected": 1.7307822704315186, + "logps/chosen": -1.9060001373291016, + "logps/rejected": -2.020759105682373, + "loss": 1.6427623748779296, + "nll_loss": 1.5756465196609497, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19060000777244568, + "rewards/margins": 0.011475888080894947, + "rewards/rejected": -0.20207588374614716, + "step": 1550 + }, + { + "epoch": 1.248, + "grad_norm": 0.2061609923839569, + "learning_rate": 3.3457777777777774e-06, + "log_odds_chosen": -0.006243853364139795, + "log_odds_ratio": -0.751319169998169, + "logits/chosen": 1.8289045095443726, + "logits/rejected": 1.8576431274414062, + "logps/chosen": -1.9457448720932007, + "logps/rejected": -1.9425971508026123, + "loss": 1.7755237579345704, + "nll_loss": 1.7003915309906006, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1945744901895523, + "rewards/margins": -0.000314765318762511, + "rewards/rejected": -0.19425971806049347, + "step": 1560 + }, + { + "epoch": 1.256, + "grad_norm": 0.22716669738292694, + "learning_rate": 3.310222222222222e-06, + "log_odds_chosen": -0.006850877311080694, + "log_odds_ratio": -0.7553213834762573, + "logits/chosen": 1.8095941543579102, + "logits/rejected": 1.8769876956939697, + "logps/chosen": -1.980285882949829, + "logps/rejected": -1.961301565170288, + "loss": 1.7145851135253907, + "nll_loss": 1.6390529870986938, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19802860915660858, + "rewards/margins": -0.0018984429771080613, + "rewards/rejected": -0.19613012671470642, + "step": 1570 + }, + { + "epoch": 1.264, + "grad_norm": 0.2657179534435272, + "learning_rate": 3.2746666666666663e-06, + "log_odds_chosen": 0.2205655872821808, + "log_odds_ratio": -0.6461672782897949, + "logits/chosen": 1.753933310508728, + "logits/rejected": 1.771116852760315, + "logps/chosen": -1.8913685083389282, + "logps/rejected": -2.0808615684509277, + "loss": 1.682852554321289, + "nll_loss": 1.6182358264923096, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1891368329524994, + "rewards/margins": 0.01894933171570301, + "rewards/rejected": -0.20808616280555725, + "step": 1580 + }, + { + "epoch": 1.272, + "grad_norm": 0.22542321681976318, + "learning_rate": 3.239111111111111e-06, + "log_odds_chosen": 0.10629250854253769, + "log_odds_ratio": -0.720824658870697, + "logits/chosen": 1.9775307178497314, + "logits/rejected": 1.9780542850494385, + "logps/chosen": -1.898146629333496, + "logps/rejected": -1.9933639764785767, + "loss": 1.7453948974609375, + "nll_loss": 1.6733121871948242, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.18981468677520752, + "rewards/margins": 0.00952172465622425, + "rewards/rejected": -0.19933640956878662, + "step": 1590 + }, + { + "epoch": 1.28, + "grad_norm": 0.23766738176345825, + "learning_rate": 3.2035555555555553e-06, + "log_odds_chosen": 0.2950454652309418, + "log_odds_ratio": -0.646562397480011, + "logits/chosen": 1.8443620204925537, + "logits/rejected": 1.86007559299469, + "logps/chosen": -1.8024790287017822, + "logps/rejected": -2.0292277336120605, + "loss": 1.6824573516845702, + "nll_loss": 1.617801308631897, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18024791777133942, + "rewards/margins": 0.022674862295389175, + "rewards/rejected": -0.2029227763414383, + "step": 1600 + }, + { + "epoch": 1.28, + "eval_log_odds_chosen": 0.03487193211913109, + "eval_log_odds_ratio": -0.7592138648033142, + "eval_logits/chosen": 1.8726606369018555, + "eval_logits/rejected": 1.8562979698181152, + "eval_logps/chosen": -1.975594162940979, + "eval_logps/rejected": -2.0101206302642822, + "eval_loss": 1.7973711490631104, + "eval_nll_loss": 1.721449851989746, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.1975594162940979, + "eval_rewards/margins": 0.0034526519011706114, + "eval_rewards/rejected": -0.20101207494735718, + "eval_runtime": 53.2399, + "eval_samples_per_second": 9.391, + "eval_steps_per_second": 4.696, + "step": 1600 + }, + { + "epoch": 1.288, + "grad_norm": 0.18830521404743195, + "learning_rate": 3.168e-06, + "log_odds_chosen": 0.030118698254227638, + "log_odds_ratio": -0.7624102830886841, + "logits/chosen": 1.911879301071167, + "logits/rejected": 1.9447228908538818, + "logps/chosen": -2.0052685737609863, + "logps/rejected": -2.0304019451141357, + "loss": 1.7501512527465821, + "nll_loss": 1.6739099025726318, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2005268633365631, + "rewards/margins": 0.002513363491743803, + "rewards/rejected": -0.2030402421951294, + "step": 1610 + }, + { + "epoch": 1.296, + "grad_norm": 0.21792171895503998, + "learning_rate": 3.1324444444444443e-06, + "log_odds_chosen": 0.2323339432477951, + "log_odds_ratio": -0.6613295078277588, + "logits/chosen": 1.8960516452789307, + "logits/rejected": 1.8715391159057617, + "logps/chosen": -1.7550809383392334, + "logps/rejected": -1.9603891372680664, + "loss": 1.681437873840332, + "nll_loss": 1.6153051853179932, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17550811171531677, + "rewards/margins": 0.020530786365270615, + "rewards/rejected": -0.1960388869047165, + "step": 1620 + }, + { + "epoch": 1.304, + "grad_norm": 0.2356945127248764, + "learning_rate": 3.096888888888889e-06, + "log_odds_chosen": 0.1009933203458786, + "log_odds_ratio": -0.7096881866455078, + "logits/chosen": 1.835608720779419, + "logits/rejected": 1.8802179098129272, + "logps/chosen": -1.9452965259552002, + "logps/rejected": -2.0278029441833496, + "loss": 1.7155517578125, + "nll_loss": 1.644582986831665, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19452962279319763, + "rewards/margins": 0.008250661194324493, + "rewards/rejected": -0.20278029143810272, + "step": 1630 + }, + { + "epoch": 1.312, + "grad_norm": 0.22595520317554474, + "learning_rate": 3.0613333333333332e-06, + "log_odds_chosen": 0.1242746114730835, + "log_odds_ratio": -0.7135905027389526, + "logits/chosen": 1.8529307842254639, + "logits/rejected": 1.7763475179672241, + "logps/chosen": -1.9397623538970947, + "logps/rejected": -2.0596587657928467, + "loss": 1.7964458465576172, + "nll_loss": 1.7250868082046509, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1939762532711029, + "rewards/margins": 0.011989672668278217, + "rewards/rejected": -0.20596590638160706, + "step": 1640 + }, + { + "epoch": 1.32, + "grad_norm": 0.24827434122562408, + "learning_rate": 3.025777777777778e-06, + "log_odds_chosen": -0.030877679586410522, + "log_odds_ratio": -0.7905367612838745, + "logits/chosen": 1.881206750869751, + "logits/rejected": 1.9151794910430908, + "logps/chosen": -1.8896806240081787, + "logps/rejected": -1.8718645572662354, + "loss": 1.757571029663086, + "nll_loss": 1.6785169839859009, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.18896806240081787, + "rewards/margins": -0.0017816193867474794, + "rewards/rejected": -0.18718644976615906, + "step": 1650 + }, + { + "epoch": 1.328, + "grad_norm": 0.18985870480537415, + "learning_rate": 2.990222222222222e-06, + "log_odds_chosen": 0.20515501499176025, + "log_odds_ratio": -0.6580603718757629, + "logits/chosen": 2.0630805492401123, + "logits/rejected": 1.9575140476226807, + "logps/chosen": -1.8301007747650146, + "logps/rejected": -1.9865009784698486, + "loss": 1.735894012451172, + "nll_loss": 1.6700878143310547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18301010131835938, + "rewards/margins": 0.01563999056816101, + "rewards/rejected": -0.19865009188652039, + "step": 1660 + }, + { + "epoch": 1.336, + "grad_norm": 0.26883554458618164, + "learning_rate": 2.9546666666666664e-06, + "log_odds_chosen": 0.261068195104599, + "log_odds_ratio": -0.6631112694740295, + "logits/chosen": 1.9392192363739014, + "logits/rejected": 1.8922193050384521, + "logps/chosen": -1.8296535015106201, + "logps/rejected": -2.047602891921997, + "loss": 1.6895122528076172, + "nll_loss": 1.6232010126113892, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18296536803245544, + "rewards/margins": 0.021794941276311874, + "rewards/rejected": -0.20476031303405762, + "step": 1670 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.2076137214899063, + "learning_rate": 2.9191111111111107e-06, + "log_odds_chosen": 0.08022954314947128, + "log_odds_ratio": -0.7145732045173645, + "logits/chosen": 1.903964638710022, + "logits/rejected": 1.8929418325424194, + "logps/chosen": -2.0156121253967285, + "logps/rejected": -2.0851552486419678, + "loss": 1.7568914413452148, + "nll_loss": 1.685434103012085, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20156121253967285, + "rewards/margins": 0.0069543360732495785, + "rewards/rejected": -0.20851555466651917, + "step": 1680 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.27581116557121277, + "learning_rate": 2.8835555555555554e-06, + "log_odds_chosen": 0.22984762489795685, + "log_odds_ratio": -0.6419785618782043, + "logits/chosen": 1.9384571313858032, + "logits/rejected": 1.9112564325332642, + "logps/chosen": -1.8010374307632446, + "logps/rejected": -1.9852313995361328, + "loss": 1.6407814025878906, + "nll_loss": 1.5765835046768188, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18010374903678894, + "rewards/margins": 0.018419397994875908, + "rewards/rejected": -0.1985231339931488, + "step": 1690 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.25675809383392334, + "learning_rate": 2.8479999999999997e-06, + "log_odds_chosen": 0.04376112297177315, + "log_odds_ratio": -0.753434956073761, + "logits/chosen": 1.9162076711654663, + "logits/rejected": 1.8986154794692993, + "logps/chosen": -1.8424959182739258, + "logps/rejected": -1.8933817148208618, + "loss": 1.7376501083374023, + "nll_loss": 1.662306547164917, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18424959480762482, + "rewards/margins": 0.0050885798409581184, + "rewards/rejected": -0.18933814764022827, + "step": 1700 + }, + { + "epoch": 1.3599999999999999, + "eval_log_odds_chosen": 0.03438640385866165, + "eval_log_odds_ratio": -0.7598773241043091, + "eval_logits/chosen": 1.9225552082061768, + "eval_logits/rejected": 1.9088143110275269, + "eval_logps/chosen": -1.9715032577514648, + "eval_logps/rejected": -2.0058045387268066, + "eval_loss": 1.7793264389038086, + "eval_nll_loss": 1.703338623046875, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.1971503496170044, + "eval_rewards/margins": 0.0034301180858165026, + "eval_rewards/rejected": -0.20058046281337738, + "eval_runtime": 53.2144, + "eval_samples_per_second": 9.396, + "eval_steps_per_second": 4.698, + "step": 1700 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.30271151661872864, + "learning_rate": 2.8124444444444444e-06, + "log_odds_chosen": 0.033895134925842285, + "log_odds_ratio": -0.729619562625885, + "logits/chosen": 1.8796659708023071, + "logits/rejected": 1.9074052572250366, + "logps/chosen": -2.0234713554382324, + "logps/rejected": -2.050968647003174, + "loss": 1.7294719696044922, + "nll_loss": 1.6565099954605103, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20234712958335876, + "rewards/margins": 0.0027496994007378817, + "rewards/rejected": -0.20509684085845947, + "step": 1710 + }, + { + "epoch": 1.376, + "grad_norm": 0.3309627175331116, + "learning_rate": 2.7768888888888886e-06, + "log_odds_chosen": -0.0035074115730822086, + "log_odds_ratio": -0.781356930732727, + "logits/chosen": 1.9202197790145874, + "logits/rejected": 1.8943029642105103, + "logps/chosen": -2.0067138671875, + "logps/rejected": -1.9892246723175049, + "loss": 1.7083927154541017, + "nll_loss": 1.630257248878479, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.20067138969898224, + "rewards/margins": -0.0017489356687292457, + "rewards/rejected": -0.19892247021198273, + "step": 1720 + }, + { + "epoch": 1.384, + "grad_norm": 0.3594350516796112, + "learning_rate": 2.7413333333333333e-06, + "log_odds_chosen": 0.11887629330158234, + "log_odds_ratio": -0.6932175159454346, + "logits/chosen": 1.9436604976654053, + "logits/rejected": 1.9098408222198486, + "logps/chosen": -1.9862394332885742, + "logps/rejected": -2.096081495285034, + "loss": 1.758765411376953, + "nll_loss": 1.689443588256836, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19862394034862518, + "rewards/margins": 0.010984222404658794, + "rewards/rejected": -0.20960816740989685, + "step": 1730 + }, + { + "epoch": 1.392, + "grad_norm": 0.36076149344444275, + "learning_rate": 2.7057777777777776e-06, + "log_odds_chosen": 0.21053990721702576, + "log_odds_ratio": -0.6388333439826965, + "logits/chosen": 1.8976023197174072, + "logits/rejected": 1.9002765417099, + "logps/chosen": -1.854945182800293, + "logps/rejected": -2.0353338718414307, + "loss": 1.7823978424072267, + "nll_loss": 1.7185142040252686, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18549451231956482, + "rewards/margins": 0.018038874492049217, + "rewards/rejected": -0.2035333663225174, + "step": 1740 + }, + { + "epoch": 1.4, + "grad_norm": 0.32271715998649597, + "learning_rate": 2.6702222222222223e-06, + "log_odds_chosen": 0.1977493166923523, + "log_odds_ratio": -0.6741037368774414, + "logits/chosen": 1.8466112613677979, + "logits/rejected": 1.8674328327178955, + "logps/chosen": -1.9169870615005493, + "logps/rejected": -2.091139793395996, + "loss": 1.6603471755981445, + "nll_loss": 1.5929368734359741, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19169871509075165, + "rewards/margins": 0.01741526648402214, + "rewards/rejected": -0.2091139853000641, + "step": 1750 + }, + { + "epoch": 1.408, + "grad_norm": 0.2843083441257477, + "learning_rate": 2.6346666666666665e-06, + "log_odds_chosen": 0.17168815433979034, + "log_odds_ratio": -0.6685199737548828, + "logits/chosen": 1.9677289724349976, + "logits/rejected": 1.964511513710022, + "logps/chosen": -1.8186986446380615, + "logps/rejected": -1.9506231546401978, + "loss": 1.723248291015625, + "nll_loss": 1.656396508216858, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.18186986446380615, + "rewards/margins": 0.013192457146942616, + "rewards/rejected": -0.1950623244047165, + "step": 1760 + }, + { + "epoch": 1.416, + "grad_norm": 0.2275550216436386, + "learning_rate": 2.5991111111111112e-06, + "log_odds_chosen": 0.16259385645389557, + "log_odds_ratio": -0.6815955638885498, + "logits/chosen": 2.034264326095581, + "logits/rejected": 1.9869372844696045, + "logps/chosen": -1.8783130645751953, + "logps/rejected": -2.0175511837005615, + "loss": 1.6774738311767579, + "nll_loss": 1.6093145608901978, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18783126771450043, + "rewards/margins": 0.013923834078013897, + "rewards/rejected": -0.2017551213502884, + "step": 1770 + }, + { + "epoch": 1.424, + "grad_norm": 0.2949213683605194, + "learning_rate": 2.563555555555555e-06, + "log_odds_chosen": 0.14697907865047455, + "log_odds_ratio": -0.6774402856826782, + "logits/chosen": 1.9659030437469482, + "logits/rejected": 1.9208488464355469, + "logps/chosen": -1.8670680522918701, + "logps/rejected": -1.9739803075790405, + "loss": 1.7409107208251953, + "nll_loss": 1.6731666326522827, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1867068111896515, + "rewards/margins": 0.010691216215491295, + "rewards/rejected": -0.19739803671836853, + "step": 1780 + }, + { + "epoch": 1.432, + "grad_norm": 0.2911042869091034, + "learning_rate": 2.5279999999999998e-06, + "log_odds_chosen": 0.03021297976374626, + "log_odds_ratio": -0.7644273042678833, + "logits/chosen": 1.9712092876434326, + "logits/rejected": 1.9553823471069336, + "logps/chosen": -1.9173786640167236, + "logps/rejected": -1.9468326568603516, + "loss": 1.6529146194458009, + "nll_loss": 1.5764720439910889, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.19173786044120789, + "rewards/margins": 0.0029454149771481752, + "rewards/rejected": -0.1946832686662674, + "step": 1790 + }, + { + "epoch": 1.44, + "grad_norm": 0.19435548782348633, + "learning_rate": 2.492444444444444e-06, + "log_odds_chosen": 0.015966754406690598, + "log_odds_ratio": -0.7808157205581665, + "logits/chosen": 1.9463411569595337, + "logits/rejected": 1.9502232074737549, + "logps/chosen": -1.9179836511611938, + "logps/rejected": -1.9580169916152954, + "loss": 1.7467466354370118, + "nll_loss": 1.6686649322509766, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1917983591556549, + "rewards/margins": 0.004003344569355249, + "rewards/rejected": -0.19580169022083282, + "step": 1800 + }, + { + "epoch": 1.44, + "eval_log_odds_chosen": 0.03914888948202133, + "eval_log_odds_ratio": -0.7578481435775757, + "eval_logits/chosen": 1.948970913887024, + "eval_logits/rejected": 1.9370065927505493, + "eval_logps/chosen": -1.975156545639038, + "eval_logps/rejected": -2.0133554935455322, + "eval_loss": 1.742074966430664, + "eval_nll_loss": 1.666290044784546, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.19751565158367157, + "eval_rewards/margins": 0.0038199129048734903, + "eval_rewards/rejected": -0.20133554935455322, + "eval_runtime": 53.2113, + "eval_samples_per_second": 9.396, + "eval_steps_per_second": 4.698, + "step": 1800 + }, + { + "epoch": 1.448, + "grad_norm": 0.2554956078529358, + "learning_rate": 2.4568888888888887e-06, + "log_odds_chosen": 0.07928521186113358, + "log_odds_ratio": -0.7100598812103271, + "logits/chosen": 1.9834693670272827, + "logits/rejected": 1.9129583835601807, + "logps/chosen": -1.9404462575912476, + "logps/rejected": -2.0127291679382324, + "loss": 1.6566553115844727, + "nll_loss": 1.5856493711471558, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19404461979866028, + "rewards/margins": 0.007228270173072815, + "rewards/rejected": -0.20127291977405548, + "step": 1810 + }, + { + "epoch": 1.456, + "grad_norm": 0.23811131715774536, + "learning_rate": 2.421333333333333e-06, + "log_odds_chosen": 0.35999929904937744, + "log_odds_ratio": -0.6554363965988159, + "logits/chosen": 1.9690263271331787, + "logits/rejected": 1.9498993158340454, + "logps/chosen": -1.8379662036895752, + "logps/rejected": -2.1561756134033203, + "loss": 1.6408489227294922, + "nll_loss": 1.5753052234649658, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18379661440849304, + "rewards/margins": 0.03182096406817436, + "rewards/rejected": -0.2156175673007965, + "step": 1820 + }, + { + "epoch": 1.464, + "grad_norm": 0.16817817091941833, + "learning_rate": 2.3857777777777777e-06, + "log_odds_chosen": 0.09530925005674362, + "log_odds_ratio": -0.7019798159599304, + "logits/chosen": 2.0015780925750732, + "logits/rejected": 1.9477115869522095, + "logps/chosen": -1.8819535970687866, + "logps/rejected": -1.9748623371124268, + "loss": 1.68689022064209, + "nll_loss": 1.6166921854019165, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1881953626871109, + "rewards/margins": 0.009290854446589947, + "rewards/rejected": -0.19748620688915253, + "step": 1830 + }, + { + "epoch": 1.472, + "grad_norm": 0.25315138697624207, + "learning_rate": 2.3502222222222224e-06, + "log_odds_chosen": 0.019454699009656906, + "log_odds_ratio": -0.7317359447479248, + "logits/chosen": 1.866839051246643, + "logits/rejected": 1.8470100164413452, + "logps/chosen": -1.9150846004486084, + "logps/rejected": -1.9340898990631104, + "loss": 1.6823833465576172, + "nll_loss": 1.609209656715393, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1915084570646286, + "rewards/margins": 0.0019005045760422945, + "rewards/rejected": -0.19340898096561432, + "step": 1840 + }, + { + "epoch": 1.48, + "grad_norm": 0.20591425895690918, + "learning_rate": 2.3146666666666666e-06, + "log_odds_chosen": 0.18473069369792938, + "log_odds_ratio": -0.6696128845214844, + "logits/chosen": 2.0007896423339844, + "logits/rejected": 1.9895092248916626, + "logps/chosen": -1.774839162826538, + "logps/rejected": -1.9293769598007202, + "loss": 1.626854133605957, + "nll_loss": 1.5598928928375244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1774839460849762, + "rewards/margins": 0.015453780069947243, + "rewards/rejected": -0.1929377168416977, + "step": 1850 + }, + { + "epoch": 1.488, + "grad_norm": 0.19470006227493286, + "learning_rate": 2.2791111111111113e-06, + "log_odds_chosen": 0.08205848187208176, + "log_odds_ratio": -0.7239702343940735, + "logits/chosen": 1.9986556768417358, + "logits/rejected": 1.9231617450714111, + "logps/chosen": -1.8390166759490967, + "logps/rejected": -1.9078738689422607, + "loss": 1.6770751953125, + "nll_loss": 1.6046781539916992, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18390165269374847, + "rewards/margins": 0.0068857138976454735, + "rewards/rejected": -0.1907874047756195, + "step": 1860 + }, + { + "epoch": 1.496, + "grad_norm": 0.1899077147245407, + "learning_rate": 2.2435555555555556e-06, + "log_odds_chosen": 0.08952027559280396, + "log_odds_ratio": -0.7206470370292664, + "logits/chosen": 1.937567114830017, + "logits/rejected": 1.8602908849716187, + "logps/chosen": -1.9591258764266968, + "logps/rejected": -2.041567802429199, + "loss": 1.7079832077026367, + "nll_loss": 1.6359182596206665, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19591261446475983, + "rewards/margins": 0.008244190365076065, + "rewards/rejected": -0.2041568011045456, + "step": 1870 + }, + { + "epoch": 1.504, + "grad_norm": 0.21359078586101532, + "learning_rate": 2.2080000000000003e-06, + "log_odds_chosen": 0.12578465044498444, + "log_odds_ratio": -0.6887935400009155, + "logits/chosen": 1.9823474884033203, + "logits/rejected": 1.9967457056045532, + "logps/chosen": -1.842449426651001, + "logps/rejected": -1.9473011493682861, + "loss": 1.7107582092285156, + "nll_loss": 1.6418790817260742, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18424490094184875, + "rewards/margins": 0.010485194623470306, + "rewards/rejected": -0.19473011791706085, + "step": 1880 + }, + { + "epoch": 1.512, + "grad_norm": 0.20844241976737976, + "learning_rate": 2.172444444444444e-06, + "log_odds_chosen": 0.07038307189941406, + "log_odds_ratio": -0.7394507527351379, + "logits/chosen": 1.9976749420166016, + "logits/rejected": 1.9397321939468384, + "logps/chosen": -1.891405701637268, + "logps/rejected": -1.9321295022964478, + "loss": 1.6750732421875, + "nll_loss": 1.6011279821395874, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.18914058804512024, + "rewards/margins": 0.0040723890997469425, + "rewards/rejected": -0.19321295619010925, + "step": 1890 + }, + { + "epoch": 1.52, + "grad_norm": 0.19770869612693787, + "learning_rate": 2.136888888888889e-06, + "log_odds_chosen": 0.15921947360038757, + "log_odds_ratio": -0.6770394444465637, + "logits/chosen": 2.00447154045105, + "logits/rejected": 1.9829330444335938, + "logps/chosen": -1.8935775756835938, + "logps/rejected": -2.022252321243286, + "loss": 1.694720458984375, + "nll_loss": 1.6270164251327515, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18935778737068176, + "rewards/margins": 0.012867463752627373, + "rewards/rejected": -0.20222525298595428, + "step": 1900 + }, + { + "epoch": 1.52, + "eval_log_odds_chosen": 0.03703959658741951, + "eval_log_odds_ratio": -0.7581658959388733, + "eval_logits/chosen": 1.9618418216705322, + "eval_logits/rejected": 1.949436902999878, + "eval_logps/chosen": -1.9719505310058594, + "eval_logps/rejected": -2.0083794593811035, + "eval_loss": 1.7386270761489868, + "eval_nll_loss": 1.6628105640411377, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.19719506800174713, + "eval_rewards/margins": 0.003642885247245431, + "eval_rewards/rejected": -0.20083795487880707, + "eval_runtime": 53.1784, + "eval_samples_per_second": 9.402, + "eval_steps_per_second": 4.701, + "step": 1900 + }, + { + "epoch": 1.528, + "grad_norm": 0.20963209867477417, + "learning_rate": 2.101333333333333e-06, + "log_odds_chosen": 0.029030675068497658, + "log_odds_ratio": -0.7270594835281372, + "logits/chosen": 1.9133045673370361, + "logits/rejected": 1.9300514459609985, + "logps/chosen": -1.94406259059906, + "logps/rejected": -1.9692909717559814, + "loss": 1.6782550811767578, + "nll_loss": 1.6055490970611572, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19440627098083496, + "rewards/margins": 0.002522836672142148, + "rewards/rejected": -0.19692911207675934, + "step": 1910 + }, + { + "epoch": 1.536, + "grad_norm": 0.2001485675573349, + "learning_rate": 2.0657777777777778e-06, + "log_odds_chosen": 0.2066613882780075, + "log_odds_ratio": -0.6765174865722656, + "logits/chosen": 2.0294079780578613, + "logits/rejected": 1.97390878200531, + "logps/chosen": -1.9244718551635742, + "logps/rejected": -2.0865769386291504, + "loss": 1.739645767211914, + "nll_loss": 1.6719939708709717, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19244717061519623, + "rewards/margins": 0.01621050387620926, + "rewards/rejected": -0.2086576670408249, + "step": 1920 + }, + { + "epoch": 1.544, + "grad_norm": 0.159030944108963, + "learning_rate": 2.030222222222222e-06, + "log_odds_chosen": 0.14436787366867065, + "log_odds_ratio": -0.6960445046424866, + "logits/chosen": 2.0115323066711426, + "logits/rejected": 2.0005786418914795, + "logps/chosen": -1.8971054553985596, + "logps/rejected": -2.019014835357666, + "loss": 1.7051689147949218, + "nll_loss": 1.6355644464492798, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18971054255962372, + "rewards/margins": 0.012190921232104301, + "rewards/rejected": -0.20190146565437317, + "step": 1930 + }, + { + "epoch": 1.552, + "grad_norm": 0.20508001744747162, + "learning_rate": 1.9946666666666663e-06, + "log_odds_chosen": 0.012411686591804028, + "log_odds_ratio": -0.7656394243240356, + "logits/chosen": 1.9236491918563843, + "logits/rejected": 1.928847312927246, + "logps/chosen": -1.9116830825805664, + "logps/rejected": -1.9297691583633423, + "loss": 1.6666793823242188, + "nll_loss": 1.5901156663894653, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19116830825805664, + "rewards/margins": 0.0018086109776049852, + "rewards/rejected": -0.1929769217967987, + "step": 1940 + }, + { + "epoch": 1.56, + "grad_norm": 0.17614884674549103, + "learning_rate": 1.959111111111111e-06, + "log_odds_chosen": -0.00976226944476366, + "log_odds_ratio": -0.7704049348831177, + "logits/chosen": 1.9787086248397827, + "logits/rejected": 1.9729989767074585, + "logps/chosen": -1.9898617267608643, + "logps/rejected": -1.9705969095230103, + "loss": 1.6890064239501954, + "nll_loss": 1.6119661331176758, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19898617267608643, + "rewards/margins": -0.0019264683360233903, + "rewards/rejected": -0.19705967605113983, + "step": 1950 + }, + { + "epoch": 1.568, + "grad_norm": 0.22970254719257355, + "learning_rate": 1.9235555555555553e-06, + "log_odds_chosen": 0.11413507163524628, + "log_odds_ratio": -0.7118924856185913, + "logits/chosen": 1.9141197204589844, + "logits/rejected": 1.927093744277954, + "logps/chosen": -1.9108161926269531, + "logps/rejected": -2.0101070404052734, + "loss": 1.6295789718627929, + "nll_loss": 1.558389663696289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19108164310455322, + "rewards/margins": 0.009929090738296509, + "rewards/rejected": -0.20101073384284973, + "step": 1960 + }, + { + "epoch": 1.576, + "grad_norm": 0.3554319739341736, + "learning_rate": 1.8879999999999998e-06, + "log_odds_chosen": 0.139635369181633, + "log_odds_ratio": -0.7322131395339966, + "logits/chosen": 1.9641954898834229, + "logits/rejected": 1.9771820306777954, + "logps/chosen": -1.919316053390503, + "logps/rejected": -2.048070192337036, + "loss": 1.6570716857910157, + "nll_loss": 1.5838501453399658, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1919316053390503, + "rewards/margins": 0.012875407934188843, + "rewards/rejected": -0.20480699837207794, + "step": 1970 + }, + { + "epoch": 1.584, + "grad_norm": 0.2408200353384018, + "learning_rate": 1.8524444444444442e-06, + "log_odds_chosen": 0.09701336920261383, + "log_odds_ratio": -0.7181066274642944, + "logits/chosen": 1.8994344472885132, + "logits/rejected": 1.8975048065185547, + "logps/chosen": -1.858982801437378, + "logps/rejected": -1.9471409320831299, + "loss": 1.618194580078125, + "nll_loss": 1.5463839769363403, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18589827418327332, + "rewards/margins": 0.008815804496407509, + "rewards/rejected": -0.19471409916877747, + "step": 1980 + }, + { + "epoch": 1.592, + "grad_norm": 0.23069331049919128, + "learning_rate": 1.816888888888889e-06, + "log_odds_chosen": 0.1935921609401703, + "log_odds_ratio": -0.6571868062019348, + "logits/chosen": 1.9328809976577759, + "logits/rejected": 1.9121859073638916, + "logps/chosen": -1.773535132408142, + "logps/rejected": -1.9285399913787842, + "loss": 1.6073417663574219, + "nll_loss": 1.5416228771209717, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17735353112220764, + "rewards/margins": 0.015500485897064209, + "rewards/rejected": -0.19285401701927185, + "step": 1990 + }, + { + "epoch": 1.6, + "grad_norm": 0.2220882922410965, + "learning_rate": 1.7813333333333334e-06, + "log_odds_chosen": 0.18786410987377167, + "log_odds_ratio": -0.6733223795890808, + "logits/chosen": 1.9360544681549072, + "logits/rejected": 1.9118951559066772, + "logps/chosen": -1.8457376956939697, + "logps/rejected": -2.006767988204956, + "loss": 1.6750938415527343, + "nll_loss": 1.6077613830566406, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18457373976707458, + "rewards/margins": 0.01610305719077587, + "rewards/rejected": -0.2006767988204956, + "step": 2000 + }, + { + "epoch": 1.6, + "eval_log_odds_chosen": 0.037848811596632004, + "eval_log_odds_ratio": -0.7588346004486084, + "eval_logits/chosen": 1.9487451314926147, + "eval_logits/rejected": 1.937835693359375, + "eval_logps/chosen": -1.9709864854812622, + "eval_logps/rejected": -2.008183479309082, + "eval_loss": 1.7392076253890991, + "eval_nll_loss": 1.663324236869812, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.19709864258766174, + "eval_rewards/margins": 0.0037197000347077847, + "eval_rewards/rejected": -0.20081835985183716, + "eval_runtime": 53.1547, + "eval_samples_per_second": 9.406, + "eval_steps_per_second": 4.703, + "step": 2000 + }, + { + "epoch": 1.608, + "grad_norm": 0.17311710119247437, + "learning_rate": 1.7457777777777779e-06, + "log_odds_chosen": 0.027943268418312073, + "log_odds_ratio": -0.7397163510322571, + "logits/chosen": 1.8568236827850342, + "logits/rejected": 1.851446509361267, + "logps/chosen": -1.9068208932876587, + "logps/rejected": -1.9251091480255127, + "loss": 1.649449348449707, + "nll_loss": 1.5754777193069458, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19068209826946259, + "rewards/margins": 0.0018288027495145798, + "rewards/rejected": -0.1925109177827835, + "step": 2010 + }, + { + "epoch": 1.616, + "grad_norm": 0.22997044026851654, + "learning_rate": 1.7102222222222221e-06, + "log_odds_chosen": 0.1097046285867691, + "log_odds_ratio": -0.7040611505508423, + "logits/chosen": 1.9243297576904297, + "logits/rejected": 1.963524580001831, + "logps/chosen": -1.8955605030059814, + "logps/rejected": -1.9760913848876953, + "loss": 1.6310876846313476, + "nll_loss": 1.5606815814971924, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1895560324192047, + "rewards/margins": 0.00805308111011982, + "rewards/rejected": -0.19760914146900177, + "step": 2020 + }, + { + "epoch": 1.624, + "grad_norm": 0.1864926517009735, + "learning_rate": 1.6746666666666666e-06, + "log_odds_chosen": 0.16068391501903534, + "log_odds_ratio": -0.6889876127243042, + "logits/chosen": 2.0080339908599854, + "logits/rejected": 1.954077959060669, + "logps/chosen": -1.8840856552124023, + "logps/rejected": -2.02380108833313, + "loss": 1.6726764678955077, + "nll_loss": 1.6037778854370117, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18840858340263367, + "rewards/margins": 0.013971516862511635, + "rewards/rejected": -0.20238009095191956, + "step": 2030 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.20873874425888062, + "learning_rate": 1.6391111111111111e-06, + "log_odds_chosen": 0.07395173609256744, + "log_odds_ratio": -0.7397626042366028, + "logits/chosen": 1.9810717105865479, + "logits/rejected": 1.9605712890625, + "logps/chosen": -1.887573003768921, + "logps/rejected": -1.9506385326385498, + "loss": 1.698026466369629, + "nll_loss": 1.6240499019622803, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18875731527805328, + "rewards/margins": 0.006306570954620838, + "rewards/rejected": -0.19506387412548065, + "step": 2040 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.19733679294586182, + "learning_rate": 1.6035555555555556e-06, + "log_odds_chosen": 0.044853754341602325, + "log_odds_ratio": -0.7555680274963379, + "logits/chosen": 1.9993703365325928, + "logits/rejected": 1.9761343002319336, + "logps/chosen": -1.9512548446655273, + "logps/rejected": -1.9669195413589478, + "loss": 1.6620542526245117, + "nll_loss": 1.5864975452423096, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1951254904270172, + "rewards/margins": 0.0015664601232856512, + "rewards/rejected": -0.19669198989868164, + "step": 2050 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.21827088296413422, + "learning_rate": 1.568e-06, + "log_odds_chosen": -0.04072676971554756, + "log_odds_ratio": -0.8474240303039551, + "logits/chosen": 1.9101537466049194, + "logits/rejected": 1.9583427906036377, + "logps/chosen": -1.9866435527801514, + "logps/rejected": -1.9379768371582031, + "loss": 1.6317344665527345, + "nll_loss": 1.5469920635223389, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1986643671989441, + "rewards/margins": -0.00486668711528182, + "rewards/rejected": -0.19379767775535583, + "step": 2060 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.17806994915008545, + "learning_rate": 1.5324444444444443e-06, + "log_odds_chosen": -0.01947469636797905, + "log_odds_ratio": -0.7560760378837585, + "logits/chosen": 1.9823522567749023, + "logits/rejected": 1.9231178760528564, + "logps/chosen": -1.9635775089263916, + "logps/rejected": -1.949378252029419, + "loss": 1.6949882507324219, + "nll_loss": 1.6193805932998657, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.19635775685310364, + "rewards/margins": -0.0014199145371094346, + "rewards/rejected": -0.1949378401041031, + "step": 2070 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.18484769761562347, + "learning_rate": 1.4968888888888888e-06, + "log_odds_chosen": -0.011743311770260334, + "log_odds_ratio": -0.7704417109489441, + "logits/chosen": 1.9684324264526367, + "logits/rejected": 1.92658269405365, + "logps/chosen": -1.9983644485473633, + "logps/rejected": -1.9799554347991943, + "loss": 1.764794158935547, + "nll_loss": 1.687750220298767, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19983646273612976, + "rewards/margins": -0.0018409093609079719, + "rewards/rejected": -0.19799552857875824, + "step": 2080 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.19327755272388458, + "learning_rate": 1.4613333333333333e-06, + "log_odds_chosen": -0.010004746727645397, + "log_odds_ratio": -0.7772399187088013, + "logits/chosen": 1.9115307331085205, + "logits/rejected": 1.8927816152572632, + "logps/chosen": -1.9729045629501343, + "logps/rejected": -1.9708572626113892, + "loss": 1.6671720504760743, + "nll_loss": 1.5894482135772705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19729046523571014, + "rewards/margins": -0.00020474250777624547, + "rewards/rejected": -0.19708572328090668, + "step": 2090 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.18267710506916046, + "learning_rate": 1.4257777777777778e-06, + "log_odds_chosen": 0.03775619715452194, + "log_odds_ratio": -0.7274637222290039, + "logits/chosen": 1.965767502784729, + "logits/rejected": 1.9332348108291626, + "logps/chosen": -1.958343744277954, + "logps/rejected": -1.9895089864730835, + "loss": 1.6861879348754882, + "nll_loss": 1.6134417057037354, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19583436846733093, + "rewards/margins": 0.003116548527032137, + "rewards/rejected": -0.1989509016275406, + "step": 2100 + }, + { + "epoch": 1.6800000000000002, + "eval_log_odds_chosen": 0.035907384008169174, + "eval_log_odds_ratio": -0.7580814361572266, + "eval_logits/chosen": 1.9721317291259766, + "eval_logits/rejected": 1.9608474969863892, + "eval_logps/chosen": -1.9707766771316528, + "eval_logps/rejected": -2.0064241886138916, + "eval_loss": 1.7385348081588745, + "eval_nll_loss": 1.662726640701294, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.1970776617527008, + "eval_rewards/margins": 0.003564756363630295, + "eval_rewards/rejected": -0.2006424516439438, + "eval_runtime": 53.1332, + "eval_samples_per_second": 9.41, + "eval_steps_per_second": 4.705, + "step": 2100 + }, + { + "epoch": 1.688, + "grad_norm": 0.2182595431804657, + "learning_rate": 1.3902222222222222e-06, + "log_odds_chosen": 0.07436065375804901, + "log_odds_ratio": -0.7376033067703247, + "logits/chosen": 2.033583641052246, + "logits/rejected": 1.9371395111083984, + "logps/chosen": -1.9411036968231201, + "logps/rejected": -2.0054962635040283, + "loss": 1.6851945877075196, + "nll_loss": 1.6114343404769897, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19411036372184753, + "rewards/margins": 0.006439276039600372, + "rewards/rejected": -0.2005496472120285, + "step": 2110 + }, + { + "epoch": 1.696, + "grad_norm": 0.2041226029396057, + "learning_rate": 1.3546666666666667e-06, + "log_odds_chosen": 0.08603398501873016, + "log_odds_ratio": -0.7840120196342468, + "logits/chosen": 2.013866662979126, + "logits/rejected": 2.004415988922119, + "logps/chosen": -1.996930480003357, + "logps/rejected": -2.057905912399292, + "loss": 1.6655797958374023, + "nll_loss": 1.5871785879135132, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19969305396080017, + "rewards/margins": 0.006097549106925726, + "rewards/rejected": -0.20579060912132263, + "step": 2120 + }, + { + "epoch": 1.704, + "grad_norm": 0.19871436059474945, + "learning_rate": 1.319111111111111e-06, + "log_odds_chosen": 0.2410924881696701, + "log_odds_ratio": -0.6583682298660278, + "logits/chosen": 1.9730018377304077, + "logits/rejected": 1.934480905532837, + "logps/chosen": -1.8520793914794922, + "logps/rejected": -2.0455081462860107, + "loss": 1.6601404190063476, + "nll_loss": 1.5943034887313843, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18520793318748474, + "rewards/margins": 0.019342893734574318, + "rewards/rejected": -0.2045508176088333, + "step": 2130 + }, + { + "epoch": 1.712, + "grad_norm": 0.13996213674545288, + "learning_rate": 1.2835555555555555e-06, + "log_odds_chosen": 0.059573762118816376, + "log_odds_ratio": -0.7112005352973938, + "logits/chosen": 1.897156000137329, + "logits/rejected": 1.918378472328186, + "logps/chosen": -1.9309139251708984, + "logps/rejected": -1.9782907962799072, + "loss": 1.6874992370605468, + "nll_loss": 1.6163790225982666, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19309139251708984, + "rewards/margins": 0.004737673792988062, + "rewards/rejected": -0.19782906770706177, + "step": 2140 + }, + { + "epoch": 1.72, + "grad_norm": 0.21009710431098938, + "learning_rate": 1.248e-06, + "log_odds_chosen": 0.09428197890520096, + "log_odds_ratio": -0.7023419141769409, + "logits/chosen": 2.001282215118408, + "logits/rejected": 2.0065598487854004, + "logps/chosen": -1.8621037006378174, + "logps/rejected": -1.9308359622955322, + "loss": 1.670359992980957, + "nll_loss": 1.6001259088516235, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18621034920215607, + "rewards/margins": 0.006873233709484339, + "rewards/rejected": -0.19308359920978546, + "step": 2150 + }, + { + "epoch": 1.728, + "grad_norm": 0.18963180482387543, + "learning_rate": 1.2124444444444444e-06, + "log_odds_chosen": 0.1198032945394516, + "log_odds_ratio": -0.786547839641571, + "logits/chosen": 1.8857593536376953, + "logits/rejected": 1.8772966861724854, + "logps/chosen": -1.964369535446167, + "logps/rejected": -2.0974040031433105, + "loss": 1.6600170135498047, + "nll_loss": 1.581362009048462, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19643697142601013, + "rewards/margins": 0.013303431682288647, + "rewards/rejected": -0.20974040031433105, + "step": 2160 + }, + { + "epoch": 1.736, + "grad_norm": 0.18647311627864838, + "learning_rate": 1.176888888888889e-06, + "log_odds_chosen": 0.28815513849258423, + "log_odds_ratio": -0.6971661448478699, + "logits/chosen": 1.994585394859314, + "logits/rejected": 1.9948902130126953, + "logps/chosen": -1.9381263256072998, + "logps/rejected": -2.205533027648926, + "loss": 1.6973844528198243, + "nll_loss": 1.627667784690857, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19381265342235565, + "rewards/margins": 0.02674066089093685, + "rewards/rejected": -0.22055332362651825, + "step": 2170 + }, + { + "epoch": 1.744, + "grad_norm": 0.2129177749156952, + "learning_rate": 1.1413333333333332e-06, + "log_odds_chosen": 0.17926549911499023, + "log_odds_ratio": -0.6743106245994568, + "logits/chosen": 1.8731542825698853, + "logits/rejected": 1.8746894598007202, + "logps/chosen": -1.849504828453064, + "logps/rejected": -2.0065646171569824, + "loss": 1.6047693252563477, + "nll_loss": 1.5373382568359375, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.18495047092437744, + "rewards/margins": 0.015705987811088562, + "rewards/rejected": -0.200656458735466, + "step": 2180 + }, + { + "epoch": 1.752, + "grad_norm": 0.1805354654788971, + "learning_rate": 1.1057777777777777e-06, + "log_odds_chosen": 0.010342921130359173, + "log_odds_ratio": -0.779272735118866, + "logits/chosen": 2.0200014114379883, + "logits/rejected": 2.009512424468994, + "logps/chosen": -2.044440984725952, + "logps/rejected": -2.0541317462921143, + "loss": 1.7150918960571289, + "nll_loss": 1.6371647119522095, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.20444409549236298, + "rewards/margins": 0.0009690720471553504, + "rewards/rejected": -0.20541317760944366, + "step": 2190 + }, + { + "epoch": 1.76, + "grad_norm": 0.2175653576850891, + "learning_rate": 1.0702222222222221e-06, + "log_odds_chosen": 0.032114505767822266, + "log_odds_ratio": -0.7545596361160278, + "logits/chosen": 1.8662312030792236, + "logits/rejected": 1.8274072408676147, + "logps/chosen": -1.9221569299697876, + "logps/rejected": -1.9536468982696533, + "loss": 1.6281578063964843, + "nll_loss": 1.5527019500732422, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1922157108783722, + "rewards/margins": 0.003149004653096199, + "rewards/rejected": -0.19536468386650085, + "step": 2200 + }, + { + "epoch": 1.76, + "eval_log_odds_chosen": 0.030956413596868515, + "eval_log_odds_ratio": -0.7591837644577026, + "eval_logits/chosen": 1.965054988861084, + "eval_logits/rejected": 1.9538328647613525, + "eval_logps/chosen": -1.9695806503295898, + "eval_logps/rejected": -2.000894784927368, + "eval_loss": 1.7384891510009766, + "eval_nll_loss": 1.662570834159851, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.19695807993412018, + "eval_rewards/margins": 0.0031314240768551826, + "eval_rewards/rejected": -0.2000894993543625, + "eval_runtime": 52.991, + "eval_samples_per_second": 9.436, + "eval_steps_per_second": 4.718, + "step": 2200 + }, + { + "epoch": 1.768, + "grad_norm": 0.18804052472114563, + "learning_rate": 1.0346666666666666e-06, + "log_odds_chosen": 0.047188155353069305, + "log_odds_ratio": -0.7515336275100708, + "logits/chosen": 1.929109811782837, + "logits/rejected": 1.8681122064590454, + "logps/chosen": -1.8629602193832397, + "logps/rejected": -1.8899915218353271, + "loss": 1.6528600692749023, + "nll_loss": 1.5777066946029663, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1862960159778595, + "rewards/margins": 0.0027031381614506245, + "rewards/rejected": -0.18899916112422943, + "step": 2210 + }, + { + "epoch": 1.776, + "grad_norm": 0.15381857752799988, + "learning_rate": 9.99111111111111e-07, + "log_odds_chosen": 0.11675859987735748, + "log_odds_ratio": -0.7371183037757874, + "logits/chosen": 1.8915207386016846, + "logits/rejected": 1.9328769445419312, + "logps/chosen": -1.8506667613983154, + "logps/rejected": -1.961627721786499, + "loss": 1.6869701385498046, + "nll_loss": 1.6132583618164062, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.18506669998168945, + "rewards/margins": 0.011096075177192688, + "rewards/rejected": -0.19616279006004333, + "step": 2220 + }, + { + "epoch": 1.784, + "grad_norm": 0.19392751157283783, + "learning_rate": 9.635555555555556e-07, + "log_odds_chosen": 0.06520286947488785, + "log_odds_ratio": -0.7269853353500366, + "logits/chosen": 1.945910096168518, + "logits/rejected": 1.9712088108062744, + "logps/chosen": -1.8130731582641602, + "logps/rejected": -1.8833789825439453, + "loss": 1.6304901123046875, + "nll_loss": 1.5577917098999023, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.18130730092525482, + "rewards/margins": 0.0070305936969816685, + "rewards/rejected": -0.18833789229393005, + "step": 2230 + }, + { + "epoch": 1.792, + "grad_norm": 0.19160452485084534, + "learning_rate": 9.28e-07, + "log_odds_chosen": 0.11965823173522949, + "log_odds_ratio": -0.7161513566970825, + "logits/chosen": 1.8624318838119507, + "logits/rejected": 1.8819353580474854, + "logps/chosen": -1.8883718252182007, + "logps/rejected": -2.0014452934265137, + "loss": 1.6923982620239257, + "nll_loss": 1.6207830905914307, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1888371855020523, + "rewards/margins": 0.011307370848953724, + "rewards/rejected": -0.20014457404613495, + "step": 2240 + }, + { + "epoch": 1.8, + "grad_norm": 0.21190473437309265, + "learning_rate": 8.924444444444444e-07, + "log_odds_chosen": -0.1211489662528038, + "log_odds_ratio": -0.8296471834182739, + "logits/chosen": 2.0248475074768066, + "logits/rejected": 2.0085835456848145, + "logps/chosen": -1.9709552526474, + "logps/rejected": -1.8767433166503906, + "loss": 1.7127277374267578, + "nll_loss": 1.6297632455825806, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19709551334381104, + "rewards/margins": -0.00942118652164936, + "rewards/rejected": -0.18767431378364563, + "step": 2250 + }, + { + "epoch": 1.808, + "grad_norm": 0.18952655792236328, + "learning_rate": 8.568888888888889e-07, + "log_odds_chosen": 0.07776130735874176, + "log_odds_ratio": -0.7154445648193359, + "logits/chosen": 1.886913537979126, + "logits/rejected": 1.9137483835220337, + "logps/chosen": -1.8462440967559814, + "logps/rejected": -1.9090255498886108, + "loss": 1.6406494140625, + "nll_loss": 1.5691049098968506, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18462440371513367, + "rewards/margins": 0.0062781586311757565, + "rewards/rejected": -0.19090254604816437, + "step": 2260 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.2062900811433792, + "learning_rate": 8.213333333333333e-07, + "log_odds_chosen": 0.1700354367494583, + "log_odds_ratio": -0.6777452230453491, + "logits/chosen": 1.9052644968032837, + "logits/rejected": 1.906673789024353, + "logps/chosen": -1.8593641519546509, + "logps/rejected": -2.0129494667053223, + "loss": 1.5919419288635255, + "nll_loss": 1.5241668224334717, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18593639135360718, + "rewards/margins": 0.015358559787273407, + "rewards/rejected": -0.20129497349262238, + "step": 2270 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.2041376829147339, + "learning_rate": 7.857777777777778e-07, + "log_odds_chosen": -0.033038415014743805, + "log_odds_ratio": -0.7522535920143127, + "logits/chosen": 1.958950400352478, + "logits/rejected": 1.9319322109222412, + "logps/chosen": -1.9543571472167969, + "logps/rejected": -1.9238964319229126, + "loss": 1.641855239868164, + "nll_loss": 1.5666298866271973, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19543573260307312, + "rewards/margins": -0.0030460860580205917, + "rewards/rejected": -0.19238965213298798, + "step": 2280 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.21807628870010376, + "learning_rate": 7.502222222222222e-07, + "log_odds_chosen": 0.1660190224647522, + "log_odds_ratio": -0.6850719451904297, + "logits/chosen": 1.9368999004364014, + "logits/rejected": 1.8725366592407227, + "logps/chosen": -1.899930715560913, + "logps/rejected": -2.044861316680908, + "loss": 1.653396987915039, + "nll_loss": 1.5848896503448486, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18999308347702026, + "rewards/margins": 0.014493053779006004, + "rewards/rejected": -0.20448613166809082, + "step": 2290 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.20989151298999786, + "learning_rate": 7.146666666666666e-07, + "log_odds_chosen": 0.1224382147192955, + "log_odds_ratio": -0.7186975479125977, + "logits/chosen": 1.9427839517593384, + "logits/rejected": 1.912305474281311, + "logps/chosen": -1.8933311700820923, + "logps/rejected": -1.9752848148345947, + "loss": 1.7118989944458007, + "nll_loss": 1.6400293111801147, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18933311104774475, + "rewards/margins": 0.008195372298359871, + "rewards/rejected": -0.19752849638462067, + "step": 2300 + }, + { + "epoch": 1.8399999999999999, + "eval_log_odds_chosen": 0.03177111968398094, + "eval_log_odds_ratio": -0.7596560120582581, + "eval_logits/chosen": 1.9661781787872314, + "eval_logits/rejected": 1.9550096988677979, + "eval_logps/chosen": -1.967788815498352, + "eval_logps/rejected": -1.9995970726013184, + "eval_loss": 1.7376213073730469, + "eval_nll_loss": 1.6616557836532593, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.19677886366844177, + "eval_rewards/margins": 0.0031808456405997276, + "eval_rewards/rejected": -0.19995971024036407, + "eval_runtime": 52.9345, + "eval_samples_per_second": 9.446, + "eval_steps_per_second": 4.723, + "step": 2300 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.1686755120754242, + "learning_rate": 6.791111111111111e-07, + "log_odds_chosen": 0.2830939292907715, + "log_odds_ratio": -0.6540330648422241, + "logits/chosen": 2.0916292667388916, + "logits/rejected": 2.016447067260742, + "logps/chosen": -1.888832449913025, + "logps/rejected": -2.147658109664917, + "loss": 1.6795072555541992, + "nll_loss": 1.61410391330719, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18888327479362488, + "rewards/margins": 0.025882547721266747, + "rewards/rejected": -0.21476581692695618, + "step": 2310 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.23096708953380585, + "learning_rate": 6.435555555555556e-07, + "log_odds_chosen": -0.06739415228366852, + "log_odds_ratio": -0.7905227541923523, + "logits/chosen": 1.9669262170791626, + "logits/rejected": 1.9470195770263672, + "logps/chosen": -1.936089277267456, + "logps/rejected": -1.8776248693466187, + "loss": 1.6421504974365235, + "nll_loss": 1.5630981922149658, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19360892474651337, + "rewards/margins": -0.005846431478857994, + "rewards/rejected": -0.18776246905326843, + "step": 2320 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.19392915070056915, + "learning_rate": 6.079999999999999e-07, + "log_odds_chosen": 0.09846463054418564, + "log_odds_ratio": -0.7082722783088684, + "logits/chosen": 1.8751684427261353, + "logits/rejected": 1.8314625024795532, + "logps/chosen": -1.9364410638809204, + "logps/rejected": -2.011606454849243, + "loss": 1.597140407562256, + "nll_loss": 1.5263129472732544, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19364410638809204, + "rewards/margins": 0.007516547106206417, + "rewards/rejected": -0.20116063952445984, + "step": 2330 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.18473073840141296, + "learning_rate": 5.724444444444444e-07, + "log_odds_chosen": 0.015096393413841724, + "log_odds_ratio": -0.7561847567558289, + "logits/chosen": 1.7878471612930298, + "logits/rejected": 1.7647182941436768, + "logps/chosen": -2.0790276527404785, + "logps/rejected": -2.0806405544281006, + "loss": 1.6734830856323242, + "nll_loss": 1.5978648662567139, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20790274441242218, + "rewards/margins": 0.00016129556752275676, + "rewards/rejected": -0.20806407928466797, + "step": 2340 + }, + { + "epoch": 1.88, + "grad_norm": 0.17758043110370636, + "learning_rate": 5.368888888888888e-07, + "log_odds_chosen": 0.020775090903043747, + "log_odds_ratio": -0.7736788392066956, + "logits/chosen": 1.8850494623184204, + "logits/rejected": 1.9317066669464111, + "logps/chosen": -1.9396501779556274, + "logps/rejected": -1.95382559299469, + "loss": 1.6969659805297852, + "nll_loss": 1.6195980310440063, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19396504759788513, + "rewards/margins": 0.0014175325632095337, + "rewards/rejected": -0.19538256525993347, + "step": 2350 + }, + { + "epoch": 1.888, + "grad_norm": 0.24013279378414154, + "learning_rate": 5.013333333333333e-07, + "log_odds_chosen": -0.049433931708335876, + "log_odds_ratio": -0.7668389081954956, + "logits/chosen": 1.91973078250885, + "logits/rejected": 1.8892580270767212, + "logps/chosen": -1.9745763540267944, + "logps/rejected": -1.9311202764511108, + "loss": 1.6504856109619142, + "nll_loss": 1.5738017559051514, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1974576711654663, + "rewards/margins": -0.004345631692558527, + "rewards/rejected": -0.19311201572418213, + "step": 2360 + }, + { + "epoch": 1.896, + "grad_norm": 0.24050985276699066, + "learning_rate": 4.6577777777777775e-07, + "log_odds_chosen": 0.027837049216032028, + "log_odds_ratio": -0.7923186421394348, + "logits/chosen": 2.0242013931274414, + "logits/rejected": 1.9961684942245483, + "logps/chosen": -1.9506433010101318, + "logps/rejected": -1.9801626205444336, + "loss": 1.7079919815063476, + "nll_loss": 1.6287603378295898, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19506433606147766, + "rewards/margins": 0.0029519214294850826, + "rewards/rejected": -0.19801625609397888, + "step": 2370 + }, + { + "epoch": 1.904, + "grad_norm": 0.20600086450576782, + "learning_rate": 4.3022222222222223e-07, + "log_odds_chosen": 0.0014284685021266341, + "log_odds_ratio": -0.7495579719543457, + "logits/chosen": 1.9058793783187866, + "logits/rejected": 1.8871889114379883, + "logps/chosen": -1.951804757118225, + "logps/rejected": -1.9474728107452393, + "loss": 1.6623008728027344, + "nll_loss": 1.5873453617095947, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1951804906129837, + "rewards/margins": -0.0004331955569796264, + "rewards/rejected": -0.19474726915359497, + "step": 2380 + }, + { + "epoch": 1.912, + "grad_norm": 0.1933748424053192, + "learning_rate": 3.9466666666666665e-07, + "log_odds_chosen": 0.08518671989440918, + "log_odds_ratio": -0.7107952833175659, + "logits/chosen": 1.983252763748169, + "logits/rejected": 1.9398502111434937, + "logps/chosen": -1.845097541809082, + "logps/rejected": -1.916144609451294, + "loss": 1.7064382553100585, + "nll_loss": 1.6353585720062256, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1845097690820694, + "rewards/margins": 0.007104730699211359, + "rewards/rejected": -0.19161446392536163, + "step": 2390 + }, + { + "epoch": 1.92, + "grad_norm": 0.16034555435180664, + "learning_rate": 3.591111111111111e-07, + "log_odds_chosen": 0.20363900065422058, + "log_odds_ratio": -0.6528972387313843, + "logits/chosen": 1.9256999492645264, + "logits/rejected": 1.9299843311309814, + "logps/chosen": -1.7845226526260376, + "logps/rejected": -1.9445335865020752, + "loss": 1.6401901245117188, + "nll_loss": 1.5749002695083618, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.178452268242836, + "rewards/margins": 0.016001086682081223, + "rewards/rejected": -0.19445334374904633, + "step": 2400 + }, + { + "epoch": 1.92, + "eval_log_odds_chosen": 0.03292645514011383, + "eval_log_odds_ratio": -0.7589800953865051, + "eval_logits/chosen": 1.9538424015045166, + "eval_logits/rejected": 1.9426116943359375, + "eval_logps/chosen": -1.967284917831421, + "eval_logps/rejected": -2.0000522136688232, + "eval_loss": 1.7372877597808838, + "eval_nll_loss": 1.6613895893096924, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.19672849774360657, + "eval_rewards/margins": 0.00327673670835793, + "eval_rewards/rejected": -0.20000524818897247, + "eval_runtime": 52.9539, + "eval_samples_per_second": 9.442, + "eval_steps_per_second": 4.721, + "step": 2400 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2400/training_args.bin b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b47bcdc3c44dda631da7f475aa87c7bb2c782bec --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1615b59ec50a8a8f298af41b0a88c5959219b5898139e8f88d7ad75a43a2c3b +size 5521 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/README.md b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/adapter_config.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..28550dad7e9abe3072f5d3e51e504f7143e8a5f2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "down_proj", + "v_proj", + "up_proj", + "gate_proj", + "q_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/adapter_model.safetensors b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..531055db50f6a8e31bc8d080461b47f5f6d06c1b --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c6bb251acf1617cc822293489716582952e193692706b4c9d8f87609cea1a5d +size 180385008 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/chat_template.jinja b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/optimizer.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..46a63b1ad32f9926fada12b8588201de3cf862cd --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fef986be3ae8ee733ca1cf7ed6331c963997557dba9cee2558b90184b42d61f +size 360902475 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/rng_state.pth b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2275456c138024633720ad98c6c63acbe9777008 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a95215f64b02d62fb58ace326ad670f1d16eb1761f7fa3b3478d43d2b8d6108 +size 14645 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/scaler.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6bccf2b99239cf26ef4ea2b6a5f9f897042b61f --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861ce13e6ca091acee9a68ebfc5ca38479baf4b537c37b3949f071f77b81e9f0 +size 1383 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/scheduler.pt b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8973e42f869a8ae35fa1685babe10aeefeb119c1 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5384fd513fd07cd33e547381e3aa4a59c683ce14eee650e78b909b6ea4b9c19b +size 1465 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/tokenizer.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/tokenizer_config.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/trainer_state.json b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..34860c5a9ee377ee8554b305d38b61d8972966b3 --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/trainer_state.json @@ -0,0 +1,5009 @@ +{ + "best_global_step": 100, + "best_metric": 0.550000011920929, + "best_model_checkpoint": "output/lora/checkpoint-100", + "epoch": 2.0, + "eval_steps": 100, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8949731588363647, + "learning_rate": 2.88e-07, + "log_odds_chosen": -0.13458022475242615, + "log_odds_ratio": -0.8810430765151978, + "logits/chosen": 1.0847688913345337, + "logits/rejected": 1.0370358228683472, + "logps/chosen": -3.011305332183838, + "logps/rejected": -2.8771233558654785, + "loss": 3.5686809539794924, + "nll_loss": 3.480576992034912, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3011305630207062, + "rewards/margins": -0.013418207876384258, + "rewards/rejected": -0.28771233558654785, + "step": 10 + }, + { + "epoch": 0.016, + "grad_norm": 0.7958198189735413, + "learning_rate": 6.079999999999999e-07, + "log_odds_chosen": 0.10219261795282364, + "log_odds_ratio": -0.8167620897293091, + "logits/chosen": 1.068807601928711, + "logits/rejected": 1.0232031345367432, + "logps/chosen": -3.0397582054138184, + "logps/rejected": -3.1323461532592773, + "loss": 3.346195602416992, + "nll_loss": 3.264519453048706, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30397582054138184, + "rewards/margins": 0.009258817881345749, + "rewards/rejected": -0.3132346272468567, + "step": 20 + }, + { + "epoch": 0.024, + "grad_norm": 0.9128683805465698, + "learning_rate": 9.28e-07, + "log_odds_chosen": 0.12453228235244751, + "log_odds_ratio": -0.8030093312263489, + "logits/chosen": 1.2273896932601929, + "logits/rejected": 1.1259081363677979, + "logps/chosen": -2.7777199745178223, + "logps/rejected": -2.892120838165283, + "loss": 3.2673892974853516, + "nll_loss": 3.1870882511138916, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2777720093727112, + "rewards/margins": 0.011440068483352661, + "rewards/rejected": -0.28921204805374146, + "step": 30 + }, + { + "epoch": 0.032, + "grad_norm": 0.6320874691009521, + "learning_rate": 1.248e-06, + "log_odds_chosen": 0.029524624347686768, + "log_odds_ratio": -0.8554368019104004, + "logits/chosen": 1.1522005796432495, + "logits/rejected": 1.1450908184051514, + "logps/chosen": -3.080202341079712, + "logps/rejected": -3.107326030731201, + "loss": 3.296055221557617, + "nll_loss": 3.2105109691619873, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.3080202639102936, + "rewards/margins": 0.0027123407926410437, + "rewards/rejected": -0.31073254346847534, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.645889163017273, + "learning_rate": 1.568e-06, + "log_odds_chosen": 0.014121174812316895, + "log_odds_ratio": -0.9388389587402344, + "logits/chosen": 1.0003323554992676, + "logits/rejected": 0.9480252265930176, + "logps/chosen": -3.0624964237213135, + "logps/rejected": -3.0750041007995605, + "loss": 3.3580265045166016, + "nll_loss": 3.2641425132751465, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3062496483325958, + "rewards/margins": 0.001250785542652011, + "rewards/rejected": -0.307500422000885, + "step": 50 + }, + { + "epoch": 0.048, + "grad_norm": 0.6963360905647278, + "learning_rate": 1.8879999999999998e-06, + "log_odds_chosen": 0.08005297183990479, + "log_odds_ratio": -0.7886329293251038, + "logits/chosen": 1.0161622762680054, + "logits/rejected": 1.0301268100738525, + "logps/chosen": -2.7399508953094482, + "logps/rejected": -2.8282437324523926, + "loss": 3.1988595962524413, + "nll_loss": 3.1199958324432373, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2739951014518738, + "rewards/margins": 0.008829282596707344, + "rewards/rejected": -0.28282439708709717, + "step": 60 + }, + { + "epoch": 0.056, + "grad_norm": 0.6785285472869873, + "learning_rate": 2.2080000000000003e-06, + "log_odds_chosen": 0.015721607953310013, + "log_odds_ratio": -0.8530643582344055, + "logits/chosen": 1.0702764987945557, + "logits/rejected": 1.093421459197998, + "logps/chosen": -2.7683191299438477, + "logps/rejected": -2.7858364582061768, + "loss": 3.0648569107055663, + "nll_loss": 2.97955060005188, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.2768319249153137, + "rewards/margins": 0.0017517365049570799, + "rewards/rejected": -0.2785836458206177, + "step": 70 + }, + { + "epoch": 0.064, + "grad_norm": 0.6778960227966309, + "learning_rate": 2.5279999999999998e-06, + "log_odds_chosen": -0.05771768093109131, + "log_odds_ratio": -0.9478788375854492, + "logits/chosen": 1.0575181245803833, + "logits/rejected": 1.058960199356079, + "logps/chosen": -3.0624351501464844, + "logps/rejected": -2.998605966567993, + "loss": 3.0089736938476563, + "nll_loss": 2.9141860008239746, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.30624353885650635, + "rewards/margins": -0.006382950581610203, + "rewards/rejected": -0.29986056685447693, + "step": 80 + }, + { + "epoch": 0.072, + "grad_norm": 0.6029064655303955, + "learning_rate": 2.8479999999999997e-06, + "log_odds_chosen": 0.03830999881029129, + "log_odds_ratio": -0.8156368136405945, + "logits/chosen": 1.0552040338516235, + "logits/rejected": 1.037821888923645, + "logps/chosen": -2.7874300479888916, + "logps/rejected": -2.83976674079895, + "loss": 2.9470733642578124, + "nll_loss": 2.865509510040283, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2787429690361023, + "rewards/margins": 0.005233690608292818, + "rewards/rejected": -0.28397664427757263, + "step": 90 + }, + { + "epoch": 0.08, + "grad_norm": 0.31492361426353455, + "learning_rate": 3.168e-06, + "log_odds_chosen": 0.041199591010808945, + "log_odds_ratio": -0.9007355570793152, + "logits/chosen": 1.0530426502227783, + "logits/rejected": 1.087805151939392, + "logps/chosen": -2.8392837047576904, + "logps/rejected": -2.8784425258636475, + "loss": 2.8420888900756838, + "nll_loss": 2.7520148754119873, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.28392836451530457, + "rewards/margins": 0.003915875218808651, + "rewards/rejected": -0.2878442704677582, + "step": 100 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.09687195718288422, + "eval_log_odds_ratio": -0.7778716087341309, + "eval_logits/chosen": 1.1028081178665161, + "eval_logits/rejected": 1.0724682807922363, + "eval_logps/chosen": -2.6042141914367676, + "eval_logps/rejected": -2.7047533988952637, + "eval_loss": 2.7646737098693848, + "eval_nll_loss": 2.686886787414551, + "eval_rewards/accuracies": 0.550000011920929, + "eval_rewards/chosen": -0.2604214549064636, + "eval_rewards/margins": 0.010053902864456177, + "eval_rewards/rejected": -0.27047526836395264, + "eval_runtime": 53.3988, + "eval_samples_per_second": 9.364, + "eval_steps_per_second": 4.682, + "step": 100 + }, + { + "epoch": 0.088, + "grad_norm": 0.34352463483810425, + "learning_rate": 3.488e-06, + "log_odds_chosen": -0.03931659460067749, + "log_odds_ratio": -0.820832371711731, + "logits/chosen": 1.133837342262268, + "logits/rejected": 1.0994086265563965, + "logps/chosen": -2.6451282501220703, + "logps/rejected": -2.60078763961792, + "loss": 2.6902618408203125, + "nll_loss": 2.6081786155700684, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.264512836933136, + "rewards/margins": -0.004434076603502035, + "rewards/rejected": -0.2600787580013275, + "step": 110 + }, + { + "epoch": 0.096, + "grad_norm": 0.33482232689857483, + "learning_rate": 3.808e-06, + "log_odds_chosen": 0.06360156834125519, + "log_odds_ratio": -0.7800716757774353, + "logits/chosen": 1.1236859560012817, + "logits/rejected": 1.0961066484451294, + "logps/chosen": -2.4337990283966064, + "logps/rejected": -2.49751615524292, + "loss": 2.489605522155762, + "nll_loss": 2.4115982055664062, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.24337990581989288, + "rewards/margins": 0.006371702998876572, + "rewards/rejected": -0.24975161254405975, + "step": 120 + }, + { + "epoch": 0.104, + "grad_norm": 0.2967042028903961, + "learning_rate": 4.128e-06, + "log_odds_chosen": 0.09330085664987564, + "log_odds_ratio": -0.7330855131149292, + "logits/chosen": 1.014111876487732, + "logits/rejected": 0.9797853231430054, + "logps/chosen": -2.3611137866973877, + "logps/rejected": -2.441413164138794, + "loss": 2.3510934829711916, + "nll_loss": 2.277784824371338, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2361113727092743, + "rewards/margins": 0.008029930293560028, + "rewards/rejected": -0.24414131045341492, + "step": 130 + }, + { + "epoch": 0.112, + "grad_norm": 0.24006928503513336, + "learning_rate": 4.4480000000000004e-06, + "log_odds_chosen": -0.04334372282028198, + "log_odds_ratio": -0.7658584713935852, + "logits/chosen": 1.1320513486862183, + "logits/rejected": 1.0817039012908936, + "logps/chosen": -2.3058180809020996, + "logps/rejected": -2.2727291584014893, + "loss": 2.2678853988647463, + "nll_loss": 2.1912999153137207, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.23058180510997772, + "rewards/margins": -0.003308868035674095, + "rewards/rejected": -0.22727294266223907, + "step": 140 + }, + { + "epoch": 0.12, + "grad_norm": 0.2205500453710556, + "learning_rate": 4.768e-06, + "log_odds_chosen": 0.08752859383821487, + "log_odds_ratio": -0.6995586156845093, + "logits/chosen": 0.9395301938056946, + "logits/rejected": 0.8814845085144043, + "logps/chosen": -2.261115074157715, + "logps/rejected": -2.3379273414611816, + "loss": 2.204380226135254, + "nll_loss": 2.1344239711761475, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2261115312576294, + "rewards/margins": 0.007681201212108135, + "rewards/rejected": -0.23379270732402802, + "step": 150 + }, + { + "epoch": 0.128, + "grad_norm": 0.19835017621517181, + "learning_rate": 5.088e-06, + "log_odds_chosen": 0.3893406391143799, + "log_odds_ratio": -0.6617119908332825, + "logits/chosen": 1.1273963451385498, + "logits/rejected": 0.9621152877807617, + "logps/chosen": -2.1851847171783447, + "logps/rejected": -2.5612263679504395, + "loss": 2.104723358154297, + "nll_loss": 2.0385525226593018, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21851846575737, + "rewards/margins": 0.0376041904091835, + "rewards/rejected": -0.2561226487159729, + "step": 160 + }, + { + "epoch": 0.136, + "grad_norm": 0.20731030404567719, + "learning_rate": 5.408e-06, + "log_odds_chosen": 0.12455078214406967, + "log_odds_ratio": -0.7328735589981079, + "logits/chosen": 1.046876311302185, + "logits/rejected": 1.0955148935317993, + "logps/chosen": -2.223609447479248, + "logps/rejected": -2.3480441570281982, + "loss": 2.0970306396484375, + "nll_loss": 2.0237433910369873, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22236093878746033, + "rewards/margins": 0.01244346983730793, + "rewards/rejected": -0.2348044365644455, + "step": 170 + }, + { + "epoch": 0.144, + "grad_norm": 0.28243404626846313, + "learning_rate": 5.727999999999999e-06, + "log_odds_chosen": 0.09875164180994034, + "log_odds_ratio": -0.7095295190811157, + "logits/chosen": 1.2436240911483765, + "logits/rejected": 1.275618553161621, + "logps/chosen": -2.1978044509887695, + "logps/rejected": -2.272761821746826, + "loss": 2.0345205307006835, + "nll_loss": 1.9635677337646484, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.21978041529655457, + "rewards/margins": 0.0074957506731152534, + "rewards/rejected": -0.22727617621421814, + "step": 180 + }, + { + "epoch": 0.152, + "grad_norm": 0.18770183622837067, + "learning_rate": 6.0479999999999995e-06, + "log_odds_chosen": 0.2480204850435257, + "log_odds_ratio": -0.7025401592254639, + "logits/chosen": 1.1238905191421509, + "logits/rejected": 1.1091030836105347, + "logps/chosen": -2.0656919479370117, + "logps/rejected": -2.3097949028015137, + "loss": 1.9743515014648438, + "nll_loss": 1.904097318649292, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20656922459602356, + "rewards/margins": 0.024410294368863106, + "rewards/rejected": -0.23097951710224152, + "step": 190 + }, + { + "epoch": 0.16, + "grad_norm": 0.18696419894695282, + "learning_rate": 6.368e-06, + "log_odds_chosen": 0.08414062857627869, + "log_odds_ratio": -0.7133861184120178, + "logits/chosen": 1.1996185779571533, + "logits/rejected": 1.0700123310089111, + "logps/chosen": -2.1116840839385986, + "logps/rejected": -2.192984104156494, + "loss": 1.9692060470581054, + "nll_loss": 1.897867202758789, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.21116837859153748, + "rewards/margins": 0.008130033500492573, + "rewards/rejected": -0.21929840743541718, + "step": 200 + }, + { + "epoch": 0.16, + "eval_log_odds_chosen": 0.03449448570609093, + "eval_log_odds_ratio": -0.7650534510612488, + "eval_logits/chosen": 1.1852835416793823, + "eval_logits/rejected": 1.1406316757202148, + "eval_logps/chosen": -2.1768786907196045, + "eval_logps/rejected": -2.2172398567199707, + "eval_loss": 2.0106678009033203, + "eval_nll_loss": 1.9341623783111572, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.21768784523010254, + "eval_rewards/margins": 0.00403614854440093, + "eval_rewards/rejected": -0.2217240035533905, + "eval_runtime": 53.0621, + "eval_samples_per_second": 9.423, + "eval_steps_per_second": 4.711, + "step": 200 + }, + { + "epoch": 0.168, + "grad_norm": 0.17617128789424896, + "learning_rate": 6.687999999999999e-06, + "log_odds_chosen": 0.05374947935342789, + "log_odds_ratio": -0.716423511505127, + "logits/chosen": 1.1509783267974854, + "logits/rejected": 1.1752405166625977, + "logps/chosen": -2.1427035331726074, + "logps/rejected": -2.19557523727417, + "loss": 1.9653192520141602, + "nll_loss": 1.893676996231079, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21427035331726074, + "rewards/margins": 0.0052871680818498135, + "rewards/rejected": -0.2195574939250946, + "step": 210 + }, + { + "epoch": 0.176, + "grad_norm": 0.17626060545444489, + "learning_rate": 7.008e-06, + "log_odds_chosen": 0.1770932376384735, + "log_odds_ratio": -0.7325607538223267, + "logits/chosen": 1.3423850536346436, + "logits/rejected": 1.2371774911880493, + "logps/chosen": -2.0640311241149902, + "logps/rejected": -2.250347375869751, + "loss": 1.9260797500610352, + "nll_loss": 1.8528236150741577, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20640310645103455, + "rewards/margins": 0.018631622195243835, + "rewards/rejected": -0.2250347137451172, + "step": 220 + }, + { + "epoch": 0.184, + "grad_norm": 0.1989600658416748, + "learning_rate": 7.328e-06, + "log_odds_chosen": 0.18904821574687958, + "log_odds_ratio": -0.6782074570655823, + "logits/chosen": 1.280461311340332, + "logits/rejected": 1.1252264976501465, + "logps/chosen": -2.0997283458709717, + "logps/rejected": -2.2417054176330566, + "loss": 1.870577049255371, + "nll_loss": 1.8027557134628296, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20997285842895508, + "rewards/margins": 0.014197695069015026, + "rewards/rejected": -0.2241705358028412, + "step": 230 + }, + { + "epoch": 0.192, + "grad_norm": 0.13494263589382172, + "learning_rate": 7.647999999999999e-06, + "log_odds_chosen": -0.036032918840646744, + "log_odds_ratio": -0.7879734039306641, + "logits/chosen": 1.448754072189331, + "logits/rejected": 1.3340137004852295, + "logps/chosen": -2.011657238006592, + "logps/rejected": -1.9867651462554932, + "loss": 1.9344427108764648, + "nll_loss": 1.8556454181671143, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20116575062274933, + "rewards/margins": -0.0024892189539968967, + "rewards/rejected": -0.19867651164531708, + "step": 240 + }, + { + "epoch": 0.2, + "grad_norm": 0.22943158447742462, + "learning_rate": 7.967999999999999e-06, + "log_odds_chosen": -0.0008321896311827004, + "log_odds_ratio": -0.7597036957740784, + "logits/chosen": 1.290684461593628, + "logits/rejected": 1.2675565481185913, + "logps/chosen": -2.0894367694854736, + "logps/rejected": -2.0819172859191895, + "loss": 1.90467529296875, + "nll_loss": 1.828704833984375, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2089436799287796, + "rewards/margins": -0.0007519676582887769, + "rewards/rejected": -0.20819172263145447, + "step": 250 + }, + { + "epoch": 0.208, + "grad_norm": 0.14598031342029572, + "learning_rate": 7.967999999999999e-06, + "log_odds_chosen": -0.06827996671199799, + "log_odds_ratio": -0.8000016212463379, + "logits/chosen": 1.2406280040740967, + "logits/rejected": 1.3126946687698364, + "logps/chosen": -2.0982561111450195, + "logps/rejected": -2.0350162982940674, + "loss": 1.8742866516113281, + "nll_loss": 1.7942863702774048, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.20982563495635986, + "rewards/margins": -0.006323990412056446, + "rewards/rejected": -0.2035016119480133, + "step": 260 + }, + { + "epoch": 0.216, + "grad_norm": 0.2379223257303238, + "learning_rate": 7.932444444444444e-06, + "log_odds_chosen": -0.09409850090742111, + "log_odds_ratio": -0.8200883865356445, + "logits/chosen": 1.3649566173553467, + "logits/rejected": 1.3874846696853638, + "logps/chosen": -2.1385960578918457, + "logps/rejected": -2.0628838539123535, + "loss": 1.8866275787353515, + "nll_loss": 1.80461847782135, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.21385960280895233, + "rewards/margins": -0.00757119944319129, + "rewards/rejected": -0.2062883824110031, + "step": 270 + }, + { + "epoch": 0.224, + "grad_norm": 0.22652894258499146, + "learning_rate": 7.896888888888888e-06, + "log_odds_chosen": 0.3813454508781433, + "log_odds_ratio": -0.6462765336036682, + "logits/chosen": 1.2896631956100464, + "logits/rejected": 1.1385802030563354, + "logps/chosen": -2.1021246910095215, + "logps/rejected": -2.4392008781433105, + "loss": 1.9009965896606444, + "nll_loss": 1.8363691568374634, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.21021243929862976, + "rewards/margins": 0.0337076410651207, + "rewards/rejected": -0.24392008781433105, + "step": 280 + }, + { + "epoch": 0.232, + "grad_norm": 0.19373102486133575, + "learning_rate": 7.861333333333334e-06, + "log_odds_chosen": 0.059820324182510376, + "log_odds_ratio": -0.7212327122688293, + "logits/chosen": 1.215315580368042, + "logits/rejected": 1.2433207035064697, + "logps/chosen": -2.052096366882324, + "logps/rejected": -2.108736515045166, + "loss": 1.8485704421997071, + "nll_loss": 1.7764475345611572, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2052096426486969, + "rewards/margins": 0.0056640272960066795, + "rewards/rejected": -0.21087364852428436, + "step": 290 + }, + { + "epoch": 0.24, + "grad_norm": 0.12395530939102173, + "learning_rate": 7.825777777777778e-06, + "log_odds_chosen": 0.12978777289390564, + "log_odds_ratio": -0.7189403772354126, + "logits/chosen": 1.3819777965545654, + "logits/rejected": 1.4145673513412476, + "logps/chosen": -2.0398616790771484, + "logps/rejected": -2.170393228530884, + "loss": 1.814511489868164, + "nll_loss": 1.7426178455352783, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20398616790771484, + "rewards/margins": 0.013053147122263908, + "rewards/rejected": -0.2170393466949463, + "step": 300 + }, + { + "epoch": 0.24, + "eval_log_odds_chosen": 0.03967902436852455, + "eval_log_odds_ratio": -0.76247638463974, + "eval_logits/chosen": 1.3745256662368774, + "eval_logits/rejected": 1.3320753574371338, + "eval_logps/chosen": -2.0912294387817383, + "eval_logps/rejected": -2.1344449520111084, + "eval_loss": 1.911361575126648, + "eval_nll_loss": 1.8351138830184937, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.20912295579910278, + "eval_rewards/margins": 0.004321571905165911, + "eval_rewards/rejected": -0.21344450116157532, + "eval_runtime": 53.0491, + "eval_samples_per_second": 9.425, + "eval_steps_per_second": 4.713, + "step": 300 + }, + { + "epoch": 0.248, + "grad_norm": 0.20187616348266602, + "learning_rate": 7.790222222222222e-06, + "log_odds_chosen": -0.023714840412139893, + "log_odds_ratio": -0.7719189524650574, + "logits/chosen": 1.5215256214141846, + "logits/rejected": 1.5573723316192627, + "logps/chosen": -2.0848724842071533, + "logps/rejected": -2.065119981765747, + "loss": 1.8499135971069336, + "nll_loss": 1.772721529006958, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20848727226257324, + "rewards/margins": -0.0019752695225179195, + "rewards/rejected": -0.2065119743347168, + "step": 310 + }, + { + "epoch": 0.256, + "grad_norm": 0.19256918132305145, + "learning_rate": 7.754666666666667e-06, + "log_odds_chosen": 0.16798502206802368, + "log_odds_ratio": -0.6922372579574585, + "logits/chosen": 1.3285846710205078, + "logits/rejected": 1.3583507537841797, + "logps/chosen": -2.07856822013855, + "logps/rejected": -2.1995127201080322, + "loss": 1.8253305435180665, + "nll_loss": 1.7561067342758179, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20785681903362274, + "rewards/margins": 0.012094443663954735, + "rewards/rejected": -0.21995127201080322, + "step": 320 + }, + { + "epoch": 0.264, + "grad_norm": 0.16783681511878967, + "learning_rate": 7.719111111111111e-06, + "log_odds_chosen": 0.16193893551826477, + "log_odds_ratio": -0.6671011447906494, + "logits/chosen": 1.4124246835708618, + "logits/rejected": 1.460701823234558, + "logps/chosen": -2.0305984020233154, + "logps/rejected": -2.159663677215576, + "loss": 1.8951801300048827, + "nll_loss": 1.82846999168396, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2030598372220993, + "rewards/margins": 0.012906527146697044, + "rewards/rejected": -0.2159663736820221, + "step": 330 + }, + { + "epoch": 0.272, + "grad_norm": 0.17299160361289978, + "learning_rate": 7.683555555555555e-06, + "log_odds_chosen": 0.09661159664392471, + "log_odds_ratio": -0.7051470875740051, + "logits/chosen": 1.4825594425201416, + "logits/rejected": 1.3175979852676392, + "logps/chosen": -1.8920962810516357, + "logps/rejected": -1.9876978397369385, + "loss": 1.7514026641845704, + "nll_loss": 1.6808878183364868, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18920964002609253, + "rewards/margins": 0.009560128673911095, + "rewards/rejected": -0.19876977801322937, + "step": 340 + }, + { + "epoch": 0.28, + "grad_norm": 0.1780914068222046, + "learning_rate": 7.647999999999999e-06, + "log_odds_chosen": 0.05790011212229729, + "log_odds_ratio": -0.7372554540634155, + "logits/chosen": 1.3375146389007568, + "logits/rejected": 1.4315671920776367, + "logps/chosen": -2.0841262340545654, + "logps/rejected": -2.132392644882202, + "loss": 1.8903158187866211, + "nll_loss": 1.8165900707244873, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.20841261744499207, + "rewards/margins": 0.004826628603041172, + "rewards/rejected": -0.21323923766613007, + "step": 350 + }, + { + "epoch": 0.288, + "grad_norm": 0.1749999225139618, + "learning_rate": 7.612444444444444e-06, + "log_odds_chosen": 0.12323548644781113, + "log_odds_ratio": -0.689288318157196, + "logits/chosen": 1.4562640190124512, + "logits/rejected": 1.3619139194488525, + "logps/chosen": -1.990504264831543, + "logps/rejected": -2.096740484237671, + "loss": 1.829003143310547, + "nll_loss": 1.7600743770599365, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1990504264831543, + "rewards/margins": 0.010623643174767494, + "rewards/rejected": -0.20967407524585724, + "step": 360 + }, + { + "epoch": 0.296, + "grad_norm": 0.17985334992408752, + "learning_rate": 7.576888888888889e-06, + "log_odds_chosen": 0.09662418812513351, + "log_odds_ratio": -0.70073401927948, + "logits/chosen": 1.5395666360855103, + "logits/rejected": 1.412684679031372, + "logps/chosen": -1.976719856262207, + "logps/rejected": -2.0691561698913574, + "loss": 1.8579627990722656, + "nll_loss": 1.7878894805908203, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19767197966575623, + "rewards/margins": 0.009243631735444069, + "rewards/rejected": -0.20691561698913574, + "step": 370 + }, + { + "epoch": 0.304, + "grad_norm": 0.2149394452571869, + "learning_rate": 7.541333333333333e-06, + "log_odds_chosen": 0.1822991818189621, + "log_odds_ratio": -0.6692709922790527, + "logits/chosen": 1.3819072246551514, + "logits/rejected": 1.4091228246688843, + "logps/chosen": -1.9516513347625732, + "logps/rejected": -2.107210159301758, + "loss": 1.8095186233520508, + "nll_loss": 1.7425915002822876, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19516517221927643, + "rewards/margins": 0.015555836260318756, + "rewards/rejected": -0.2107209861278534, + "step": 380 + }, + { + "epoch": 0.312, + "grad_norm": 0.20533578097820282, + "learning_rate": 7.505777777777777e-06, + "log_odds_chosen": 0.13032521307468414, + "log_odds_ratio": -0.7293068170547485, + "logits/chosen": 1.4975590705871582, + "logits/rejected": 1.3673573732376099, + "logps/chosen": -1.9267303943634033, + "logps/rejected": -2.037688970565796, + "loss": 1.7859189987182618, + "nll_loss": 1.7129881381988525, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19267304241657257, + "rewards/margins": 0.011095861904323101, + "rewards/rejected": -0.20376892387866974, + "step": 390 + }, + { + "epoch": 0.32, + "grad_norm": 0.18664704263210297, + "learning_rate": 7.470222222222222e-06, + "log_odds_chosen": 0.10940120369195938, + "log_odds_ratio": -0.7268368005752563, + "logits/chosen": 1.4196456670761108, + "logits/rejected": 1.3341939449310303, + "logps/chosen": -1.920606017112732, + "logps/rejected": -2.0050148963928223, + "loss": 1.799521255493164, + "nll_loss": 1.7268375158309937, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19206061959266663, + "rewards/margins": 0.008440867997705936, + "rewards/rejected": -0.20050148665905, + "step": 400 + }, + { + "epoch": 0.32, + "eval_log_odds_chosen": 0.033501941710710526, + "eval_log_odds_ratio": -0.7624432444572449, + "eval_logits/chosen": 1.455781102180481, + "eval_logits/rejected": 1.41294264793396, + "eval_logps/chosen": -2.0634007453918457, + "eval_logps/rejected": -2.0990681648254395, + "eval_loss": 1.8849780559539795, + "eval_nll_loss": 1.8087337017059326, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": -0.20634005963802338, + "eval_rewards/margins": 0.0035667610354721546, + "eval_rewards/rejected": -0.20990681648254395, + "eval_runtime": 53.0804, + "eval_samples_per_second": 9.42, + "eval_steps_per_second": 4.71, + "step": 400 + }, + { + "epoch": 0.328, + "grad_norm": 0.2279583215713501, + "learning_rate": 7.434666666666667e-06, + "log_odds_chosen": 0.24681270122528076, + "log_odds_ratio": -0.6675797700881958, + "logits/chosen": 1.3346173763275146, + "logits/rejected": 1.3772237300872803, + "logps/chosen": -1.9703868627548218, + "logps/rejected": -2.1775240898132324, + "loss": 1.819239616394043, + "nll_loss": 1.7524816989898682, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1970386803150177, + "rewards/margins": 0.02071371115744114, + "rewards/rejected": -0.21775241196155548, + "step": 410 + }, + { + "epoch": 0.336, + "grad_norm": 0.18673349916934967, + "learning_rate": 7.399111111111111e-06, + "log_odds_chosen": 0.0723341852426529, + "log_odds_ratio": -0.7402567863464355, + "logits/chosen": 1.352061152458191, + "logits/rejected": 1.3581398725509644, + "logps/chosen": -1.9535837173461914, + "logps/rejected": -2.0189168453216553, + "loss": 1.790372085571289, + "nll_loss": 1.7163463830947876, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19535836577415466, + "rewards/margins": 0.0065332986414432526, + "rewards/rejected": -0.20189166069030762, + "step": 420 + }, + { + "epoch": 0.344, + "grad_norm": 0.13824671506881714, + "learning_rate": 7.3635555555555544e-06, + "log_odds_chosen": 0.07097329199314117, + "log_odds_ratio": -0.723812460899353, + "logits/chosen": 1.500058889389038, + "logits/rejected": 1.4235128164291382, + "logps/chosen": -2.0450332164764404, + "logps/rejected": -2.1189401149749756, + "loss": 1.8183156967163085, + "nll_loss": 1.7459347248077393, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20450334250926971, + "rewards/margins": 0.007390675134956837, + "rewards/rejected": -0.21189400553703308, + "step": 430 + }, + { + "epoch": 0.352, + "grad_norm": 0.19959300756454468, + "learning_rate": 7.328e-06, + "log_odds_chosen": -0.05561716482043266, + "log_odds_ratio": -0.8284331560134888, + "logits/chosen": 1.319535493850708, + "logits/rejected": 1.291486144065857, + "logps/chosen": -2.0064926147460938, + "logps/rejected": -1.936273217201233, + "loss": 1.8077302932739259, + "nll_loss": 1.7248871326446533, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.20064929127693176, + "rewards/margins": -0.0070219277404248714, + "rewards/rejected": -0.19362732768058777, + "step": 440 + }, + { + "epoch": 0.36, + "grad_norm": 0.11968322098255157, + "learning_rate": 7.292444444444444e-06, + "log_odds_chosen": 0.15821342170238495, + "log_odds_ratio": -0.7299971580505371, + "logits/chosen": 1.4834121465682983, + "logits/rejected": 1.437281847000122, + "logps/chosen": -1.9749513864517212, + "logps/rejected": -2.1457695960998535, + "loss": 1.8616512298583985, + "nll_loss": 1.788651704788208, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19749514758586884, + "rewards/margins": 0.017081793397665024, + "rewards/rejected": -0.21457692980766296, + "step": 450 + }, + { + "epoch": 0.368, + "grad_norm": 0.2767660915851593, + "learning_rate": 7.2568888888888885e-06, + "log_odds_chosen": 0.05323999002575874, + "log_odds_ratio": -0.7374966740608215, + "logits/chosen": 1.5227715969085693, + "logits/rejected": 1.5175096988677979, + "logps/chosen": -1.996843695640564, + "logps/rejected": -2.0283195972442627, + "loss": 1.8424455642700195, + "nll_loss": 1.7686958312988281, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19968439638614655, + "rewards/margins": 0.003147574607282877, + "rewards/rejected": -0.20283198356628418, + "step": 460 + }, + { + "epoch": 0.376, + "grad_norm": 0.2534579038619995, + "learning_rate": 7.221333333333332e-06, + "log_odds_chosen": -0.01329396665096283, + "log_odds_ratio": -0.7843751907348633, + "logits/chosen": 1.5107393264770508, + "logits/rejected": 1.4327045679092407, + "logps/chosen": -2.023428440093994, + "logps/rejected": -2.012712001800537, + "loss": 1.762538528442383, + "nll_loss": 1.684100866317749, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20234286785125732, + "rewards/margins": -0.001071644015610218, + "rewards/rejected": -0.20127122104167938, + "step": 470 + }, + { + "epoch": 0.384, + "grad_norm": 0.22043545544147491, + "learning_rate": 7.185777777777778e-06, + "log_odds_chosen": 0.1587541103363037, + "log_odds_ratio": -0.709527850151062, + "logits/chosen": 1.4747645854949951, + "logits/rejected": 1.4961092472076416, + "logps/chosen": -1.9713242053985596, + "logps/rejected": -2.11765193939209, + "loss": 1.8060014724731446, + "nll_loss": 1.7350486516952515, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1971324235200882, + "rewards/margins": 0.014632781967520714, + "rewards/rejected": -0.21176521480083466, + "step": 480 + }, + { + "epoch": 0.392, + "grad_norm": 0.2087666392326355, + "learning_rate": 7.150222222222222e-06, + "log_odds_chosen": 0.26932230591773987, + "log_odds_ratio": -0.6544117331504822, + "logits/chosen": 1.504432201385498, + "logits/rejected": 1.5163673162460327, + "logps/chosen": -1.8658252954483032, + "logps/rejected": -2.088016986846924, + "loss": 1.816385269165039, + "nll_loss": 1.7509441375732422, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1865825355052948, + "rewards/margins": 0.022219162434339523, + "rewards/rejected": -0.20880170166492462, + "step": 490 + }, + { + "epoch": 0.4, + "grad_norm": 0.14658813178539276, + "learning_rate": 7.1146666666666664e-06, + "log_odds_chosen": -0.0933292880654335, + "log_odds_ratio": -0.8082043528556824, + "logits/chosen": 1.5245555639266968, + "logits/rejected": 1.505550742149353, + "logps/chosen": -2.04571533203125, + "logps/rejected": -1.9844484329223633, + "loss": 1.8789659500122071, + "nll_loss": 1.7981455326080322, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.20457151532173157, + "rewards/margins": -0.006126692984253168, + "rewards/rejected": -0.19844482839107513, + "step": 500 + }, + { + "epoch": 0.4, + "eval_log_odds_chosen": 0.02602977305650711, + "eval_log_odds_ratio": -0.7631290555000305, + "eval_logits/chosen": 1.5728307962417603, + "eval_logits/rejected": 1.5338318347930908, + "eval_logps/chosen": -2.0366148948669434, + "eval_logps/rejected": -2.065042018890381, + "eval_loss": 1.8675161600112915, + "eval_nll_loss": 1.7912031412124634, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.20366153120994568, + "eval_rewards/margins": 0.002842681249603629, + "eval_rewards/rejected": -0.20650418102741241, + "eval_runtime": 53.0222, + "eval_samples_per_second": 9.43, + "eval_steps_per_second": 4.715, + "step": 500 + }, + { + "epoch": 0.408, + "grad_norm": 0.16157585382461548, + "learning_rate": 7.07911111111111e-06, + "log_odds_chosen": 0.19593098759651184, + "log_odds_ratio": -0.6945115923881531, + "logits/chosen": 1.6804075241088867, + "logits/rejected": 1.6551501750946045, + "logps/chosen": -1.8801660537719727, + "logps/rejected": -2.044741153717041, + "loss": 1.720193862915039, + "nll_loss": 1.650742769241333, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1880166232585907, + "rewards/margins": 0.016457516700029373, + "rewards/rejected": -0.20447412133216858, + "step": 510 + }, + { + "epoch": 0.416, + "grad_norm": 0.21952685713768005, + "learning_rate": 7.043555555555556e-06, + "log_odds_chosen": 0.11960093677043915, + "log_odds_ratio": -0.6947656869888306, + "logits/chosen": 1.6775137186050415, + "logits/rejected": 1.6482852697372437, + "logps/chosen": -1.9966636896133423, + "logps/rejected": -2.1046407222747803, + "loss": 1.7815994262695312, + "nll_loss": 1.7121226787567139, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19966639578342438, + "rewards/margins": 0.010797705501317978, + "rewards/rejected": -0.21046409010887146, + "step": 520 + }, + { + "epoch": 0.424, + "grad_norm": 0.23222282528877258, + "learning_rate": 7.008e-06, + "log_odds_chosen": -0.033172450959682465, + "log_odds_ratio": -0.8052657842636108, + "logits/chosen": 1.6025762557983398, + "logits/rejected": 1.6583106517791748, + "logps/chosen": -2.0893313884735107, + "logps/rejected": -2.0725255012512207, + "loss": 1.803743553161621, + "nll_loss": 1.7232167720794678, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20893315970897675, + "rewards/margins": -0.0016806062776595354, + "rewards/rejected": -0.20725254714488983, + "step": 530 + }, + { + "epoch": 0.432, + "grad_norm": 0.15105725824832916, + "learning_rate": 6.9724444444444435e-06, + "log_odds_chosen": 0.08743356913328171, + "log_odds_ratio": -0.701758861541748, + "logits/chosen": 1.590755820274353, + "logits/rejected": 1.4823769330978394, + "logps/chosen": -1.9318288564682007, + "logps/rejected": -2.0069031715393066, + "loss": 1.8346128463745117, + "nll_loss": 1.7644370794296265, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19318291544914246, + "rewards/margins": 0.0075074234046041965, + "rewards/rejected": -0.20069031417369843, + "step": 540 + }, + { + "epoch": 0.44, + "grad_norm": 0.16001789271831512, + "learning_rate": 6.936888888888889e-06, + "log_odds_chosen": 0.18001510202884674, + "log_odds_ratio": -0.7216249704360962, + "logits/chosen": 1.5185306072235107, + "logits/rejected": 1.5836330652236938, + "logps/chosen": -1.9629827737808228, + "logps/rejected": -2.099684476852417, + "loss": 1.7317916870117187, + "nll_loss": 1.6596291065216064, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19629831612110138, + "rewards/margins": 0.013670151121914387, + "rewards/rejected": -0.2099684774875641, + "step": 550 + }, + { + "epoch": 0.448, + "grad_norm": 0.2220832109451294, + "learning_rate": 6.901333333333333e-06, + "log_odds_chosen": -0.046985138207674026, + "log_odds_ratio": -0.7678507566452026, + "logits/chosen": 1.5970782041549683, + "logits/rejected": 1.5387827157974243, + "logps/chosen": -2.008047580718994, + "logps/rejected": -1.964775800704956, + "loss": 1.787788200378418, + "nll_loss": 1.711003065109253, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.20080475509166718, + "rewards/margins": -0.004327182658016682, + "rewards/rejected": -0.19647757709026337, + "step": 560 + }, + { + "epoch": 0.456, + "grad_norm": 0.1927526891231537, + "learning_rate": 6.8657777777777776e-06, + "log_odds_chosen": 0.08939726650714874, + "log_odds_ratio": -0.7270767688751221, + "logits/chosen": 1.51679265499115, + "logits/rejected": 1.4389561414718628, + "logps/chosen": -1.9325392246246338, + "logps/rejected": -1.9886747598648071, + "loss": 1.7688678741455077, + "nll_loss": 1.6961603164672852, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19325393438339233, + "rewards/margins": 0.0056135449558496475, + "rewards/rejected": -0.19886748492717743, + "step": 570 + }, + { + "epoch": 0.464, + "grad_norm": 0.2728411853313446, + "learning_rate": 6.830222222222221e-06, + "log_odds_chosen": 0.06151670217514038, + "log_odds_ratio": -0.7394816875457764, + "logits/chosen": 1.6327412128448486, + "logits/rejected": 1.7010612487792969, + "logps/chosen": -2.0379953384399414, + "logps/rejected": -2.0829684734344482, + "loss": 1.7836915969848632, + "nll_loss": 1.7097432613372803, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2037995308637619, + "rewards/margins": 0.0044972943142056465, + "rewards/rejected": -0.20829685032367706, + "step": 580 + }, + { + "epoch": 0.472, + "grad_norm": 0.13671930134296417, + "learning_rate": 6.794666666666667e-06, + "log_odds_chosen": 0.17000290751457214, + "log_odds_ratio": -0.6512231826782227, + "logits/chosen": 1.6878210306167603, + "logits/rejected": 1.6108119487762451, + "logps/chosen": -1.8216642141342163, + "logps/rejected": -1.9648876190185547, + "loss": 1.8262306213378907, + "nll_loss": 1.7611083984375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1821664273738861, + "rewards/margins": 0.01432233490049839, + "rewards/rejected": -0.19648873805999756, + "step": 590 + }, + { + "epoch": 0.48, + "grad_norm": 0.1815636157989502, + "learning_rate": 6.759111111111111e-06, + "log_odds_chosen": 0.08520406484603882, + "log_odds_ratio": -0.7176100611686707, + "logits/chosen": 1.4824538230895996, + "logits/rejected": 1.5591896772384644, + "logps/chosen": -1.965994119644165, + "logps/rejected": -2.0540990829467773, + "loss": 1.8055044174194337, + "nll_loss": 1.73374342918396, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19659940898418427, + "rewards/margins": 0.008810499683022499, + "rewards/rejected": -0.2054099142551422, + "step": 600 + }, + { + "epoch": 0.48, + "eval_log_odds_chosen": 0.030262866988778114, + "eval_log_odds_ratio": -0.7621479034423828, + "eval_logits/chosen": 1.5845204591751099, + "eval_logits/rejected": 1.5468944311141968, + "eval_logps/chosen": -2.024677038192749, + "eval_logps/rejected": -2.0564591884613037, + "eval_loss": 1.8558011054992676, + "eval_nll_loss": 1.7795861959457397, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.20246769487857819, + "eval_rewards/margins": 0.0031782032456249, + "eval_rewards/rejected": -0.20564593374729156, + "eval_runtime": 53.1235, + "eval_samples_per_second": 9.412, + "eval_steps_per_second": 4.706, + "step": 600 + }, + { + "epoch": 0.488, + "grad_norm": 0.17874790728092194, + "learning_rate": 6.7235555555555555e-06, + "log_odds_chosen": -0.14907710254192352, + "log_odds_ratio": -0.8681455850601196, + "logits/chosen": 1.6063718795776367, + "logits/rejected": 1.6573474407196045, + "logps/chosen": -2.107225179672241, + "logps/rejected": -1.97052001953125, + "loss": 1.8059173583984376, + "nll_loss": 1.7191026210784912, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21072253584861755, + "rewards/margins": -0.013670533895492554, + "rewards/rejected": -0.197052001953125, + "step": 610 + }, + { + "epoch": 0.496, + "grad_norm": 0.1660294085741043, + "learning_rate": 6.687999999999999e-06, + "log_odds_chosen": -0.017593836411833763, + "log_odds_ratio": -0.7714040875434875, + "logits/chosen": 1.6416600942611694, + "logits/rejected": 1.6900558471679688, + "logps/chosen": -1.9674263000488281, + "logps/rejected": -1.943830132484436, + "loss": 1.7882635116577148, + "nll_loss": 1.7111231088638306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19674262404441833, + "rewards/margins": -0.0023596244864165783, + "rewards/rejected": -0.19438298046588898, + "step": 620 + }, + { + "epoch": 0.504, + "grad_norm": 0.17608729004859924, + "learning_rate": 6.652444444444445e-06, + "log_odds_chosen": 0.2501833438873291, + "log_odds_ratio": -0.6537446975708008, + "logits/chosen": 1.6887744665145874, + "logits/rejected": 1.5986034870147705, + "logps/chosen": -1.9225715398788452, + "logps/rejected": -2.143808364868164, + "loss": 1.8512474060058595, + "nll_loss": 1.7858734130859375, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19225716590881348, + "rewards/margins": 0.02212369069457054, + "rewards/rejected": -0.21438086032867432, + "step": 630 + }, + { + "epoch": 0.512, + "grad_norm": 0.3151007294654846, + "learning_rate": 6.616888888888889e-06, + "log_odds_chosen": 0.04988854005932808, + "log_odds_ratio": -0.7158768177032471, + "logits/chosen": 1.569506049156189, + "logits/rejected": 1.4855579137802124, + "logps/chosen": -2.0310044288635254, + "logps/rejected": -2.07084059715271, + "loss": 1.8164968490600586, + "nll_loss": 1.744909644126892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20310044288635254, + "rewards/margins": 0.003983622882515192, + "rewards/rejected": -0.20708408951759338, + "step": 640 + }, + { + "epoch": 0.52, + "grad_norm": 0.2057182490825653, + "learning_rate": 6.5813333333333325e-06, + "log_odds_chosen": 0.44220876693725586, + "log_odds_ratio": -0.6088204979896545, + "logits/chosen": 1.682287573814392, + "logits/rejected": 1.5269925594329834, + "logps/chosen": -1.8356335163116455, + "logps/rejected": -2.236959934234619, + "loss": 1.7419025421142578, + "nll_loss": 1.6810203790664673, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18356335163116455, + "rewards/margins": 0.04013265669345856, + "rewards/rejected": -0.22369599342346191, + "step": 650 + }, + { + "epoch": 0.528, + "grad_norm": 0.16435767710208893, + "learning_rate": 6.545777777777777e-06, + "log_odds_chosen": 0.07594867050647736, + "log_odds_ratio": -0.742243766784668, + "logits/chosen": 1.66165030002594, + "logits/rejected": 1.5617786645889282, + "logps/chosen": -1.9473623037338257, + "logps/rejected": -2.006049871444702, + "loss": 1.8304746627807618, + "nll_loss": 1.7562503814697266, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19473622739315033, + "rewards/margins": 0.005868755746632814, + "rewards/rejected": -0.20060500502586365, + "step": 660 + }, + { + "epoch": 0.536, + "grad_norm": 0.15003077685832977, + "learning_rate": 6.510222222222222e-06, + "log_odds_chosen": 0.21500691771507263, + "log_odds_ratio": -0.6412914991378784, + "logits/chosen": 1.6037086248397827, + "logits/rejected": 1.438218355178833, + "logps/chosen": -1.9100940227508545, + "logps/rejected": -2.087085485458374, + "loss": 1.8060630798339843, + "nll_loss": 1.741934061050415, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19100941717624664, + "rewards/margins": 0.017699118703603745, + "rewards/rejected": -0.2087085247039795, + "step": 670 + }, + { + "epoch": 0.544, + "grad_norm": 0.1473378688097, + "learning_rate": 6.474666666666667e-06, + "log_odds_chosen": 0.0917447879910469, + "log_odds_ratio": -0.7169826626777649, + "logits/chosen": 1.63227117061615, + "logits/rejected": 1.5642629861831665, + "logps/chosen": -1.9436975717544556, + "logps/rejected": -2.0165576934814453, + "loss": 1.7880081176757812, + "nll_loss": 1.7163095474243164, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19436973333358765, + "rewards/margins": 0.0072860405780375, + "rewards/rejected": -0.201655775308609, + "step": 680 + }, + { + "epoch": 0.552, + "grad_norm": 0.156095951795578, + "learning_rate": 6.4391111111111105e-06, + "log_odds_chosen": 0.03318742290139198, + "log_odds_ratio": -0.7350637912750244, + "logits/chosen": 1.6490083932876587, + "logits/rejected": 1.5582869052886963, + "logps/chosen": -1.9969427585601807, + "logps/rejected": -2.019963264465332, + "loss": 1.8322141647338868, + "nll_loss": 1.7587080001831055, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19969427585601807, + "rewards/margins": 0.002302053850144148, + "rewards/rejected": -0.20199629664421082, + "step": 690 + }, + { + "epoch": 0.56, + "grad_norm": 0.19251035153865814, + "learning_rate": 6.403555555555555e-06, + "log_odds_chosen": 0.1400238573551178, + "log_odds_ratio": -0.6950441598892212, + "logits/chosen": 1.6086819171905518, + "logits/rejected": 1.5480411052703857, + "logps/chosen": -1.970768690109253, + "logps/rejected": -2.083794116973877, + "loss": 1.7771568298339844, + "nll_loss": 1.7076523303985596, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19707687199115753, + "rewards/margins": 0.011302560567855835, + "rewards/rejected": -0.20837941765785217, + "step": 700 + }, + { + "epoch": 0.56, + "eval_log_odds_chosen": 0.03070848062634468, + "eval_log_odds_ratio": -0.7618256211280823, + "eval_logits/chosen": 1.6476703882217407, + "eval_logits/rejected": 1.6128474473953247, + "eval_logps/chosen": -2.0164527893066406, + "eval_logps/rejected": -2.0491254329681396, + "eval_loss": 1.8477715253829956, + "eval_nll_loss": 1.771588921546936, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.20164531469345093, + "eval_rewards/margins": 0.003267248161137104, + "eval_rewards/rejected": -0.20491254329681396, + "eval_runtime": 53.1062, + "eval_samples_per_second": 9.415, + "eval_steps_per_second": 4.708, + "step": 700 + }, + { + "epoch": 0.568, + "grad_norm": 0.18433217704296112, + "learning_rate": 6.368e-06, + "log_odds_chosen": 0.17153123021125793, + "log_odds_ratio": -0.6689559817314148, + "logits/chosen": 1.6745821237564087, + "logits/rejected": 1.6360340118408203, + "logps/chosen": -1.9238487482070923, + "logps/rejected": -2.0793042182922363, + "loss": 1.7662607192993165, + "nll_loss": 1.6993646621704102, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19238488376140594, + "rewards/margins": 0.015545527450740337, + "rewards/rejected": -0.20793041586875916, + "step": 710 + }, + { + "epoch": 0.576, + "grad_norm": 0.18722620606422424, + "learning_rate": 6.3324444444444445e-06, + "log_odds_chosen": -0.07723536342382431, + "log_odds_ratio": -0.8040523529052734, + "logits/chosen": 1.611930251121521, + "logits/rejected": 1.6221723556518555, + "logps/chosen": -2.042644500732422, + "logps/rejected": -1.9738566875457764, + "loss": 1.8212066650390626, + "nll_loss": 1.740801215171814, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20426444709300995, + "rewards/margins": -0.006878760643303394, + "rewards/rejected": -0.19738569855690002, + "step": 720 + }, + { + "epoch": 0.584, + "grad_norm": 0.20682425796985626, + "learning_rate": 6.296888888888888e-06, + "log_odds_chosen": 0.12106932699680328, + "log_odds_ratio": -0.7056177258491516, + "logits/chosen": 1.5577377080917358, + "logits/rejected": 1.5362805128097534, + "logps/chosen": -2.016268014907837, + "logps/rejected": -2.120490550994873, + "loss": 1.7989938735961915, + "nll_loss": 1.7284319400787354, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20162677764892578, + "rewards/margins": 0.010422252118587494, + "rewards/rejected": -0.21204905211925507, + "step": 730 + }, + { + "epoch": 0.592, + "grad_norm": 0.19108694791793823, + "learning_rate": 6.261333333333333e-06, + "log_odds_chosen": 0.0587974414229393, + "log_odds_ratio": -0.7309185862541199, + "logits/chosen": 1.6171478033065796, + "logits/rejected": 1.4924392700195312, + "logps/chosen": -1.9424350261688232, + "logps/rejected": -1.9815622568130493, + "loss": 1.7689041137695312, + "nll_loss": 1.6958122253417969, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19424352049827576, + "rewards/margins": 0.003912704065442085, + "rewards/rejected": -0.1981562376022339, + "step": 740 + }, + { + "epoch": 0.6, + "grad_norm": 0.19131523370742798, + "learning_rate": 6.225777777777778e-06, + "log_odds_chosen": 0.023378366604447365, + "log_odds_ratio": -0.7382779717445374, + "logits/chosen": 1.5973578691482544, + "logits/rejected": 1.6551589965820312, + "logps/chosen": -1.959924340248108, + "logps/rejected": -1.9721896648406982, + "loss": 1.7737180709838867, + "nll_loss": 1.699890375137329, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19599245488643646, + "rewards/margins": 0.001226510270498693, + "rewards/rejected": -0.19721895456314087, + "step": 750 + }, + { + "epoch": 0.608, + "grad_norm": 0.15221014618873596, + "learning_rate": 6.190222222222222e-06, + "log_odds_chosen": 0.23301279544830322, + "log_odds_ratio": -0.6567850112915039, + "logits/chosen": 1.7112754583358765, + "logits/rejected": 1.6009547710418701, + "logps/chosen": -1.8999290466308594, + "logps/rejected": -2.1016502380371094, + "loss": 1.7749822616577149, + "nll_loss": 1.709303617477417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18999293446540833, + "rewards/margins": 0.020172089338302612, + "rewards/rejected": -0.21016499400138855, + "step": 760 + }, + { + "epoch": 0.616, + "grad_norm": 0.17331072688102722, + "learning_rate": 6.154666666666666e-06, + "log_odds_chosen": 0.08666707575321198, + "log_odds_ratio": -0.721922755241394, + "logits/chosen": 1.629499077796936, + "logits/rejected": 1.6233304738998413, + "logps/chosen": -1.966850996017456, + "logps/rejected": -2.0316665172576904, + "loss": 1.750493621826172, + "nll_loss": 1.678301215171814, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19668510556221008, + "rewards/margins": 0.006481558084487915, + "rewards/rejected": -0.2031666487455368, + "step": 770 + }, + { + "epoch": 0.624, + "grad_norm": 0.27533528208732605, + "learning_rate": 6.11911111111111e-06, + "log_odds_chosen": 0.2137812376022339, + "log_odds_ratio": -0.6508430242538452, + "logits/chosen": 1.6023156642913818, + "logits/rejected": 1.5528204441070557, + "logps/chosen": -1.9073442220687866, + "logps/rejected": -2.0768206119537354, + "loss": 1.7602737426757813, + "nll_loss": 1.6951894760131836, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19073444604873657, + "rewards/margins": 0.016947634518146515, + "rewards/rejected": -0.2076820582151413, + "step": 780 + }, + { + "epoch": 0.632, + "grad_norm": 0.16427750885486603, + "learning_rate": 6.083555555555556e-06, + "log_odds_chosen": -0.015147974714636803, + "log_odds_ratio": -0.7511974573135376, + "logits/chosen": 1.529996633529663, + "logits/rejected": 1.5390806198120117, + "logps/chosen": -2.041637420654297, + "logps/rejected": -2.0313525199890137, + "loss": 1.8287666320800782, + "nll_loss": 1.7536464929580688, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.20416374504566193, + "rewards/margins": -0.0010284921154379845, + "rewards/rejected": -0.20313525199890137, + "step": 790 + }, + { + "epoch": 0.64, + "grad_norm": 0.18231312930583954, + "learning_rate": 6.0479999999999995e-06, + "log_odds_chosen": 0.15317580103874207, + "log_odds_ratio": -0.6632872223854065, + "logits/chosen": 1.6685699224472046, + "logits/rejected": 1.7259252071380615, + "logps/chosen": -2.016608476638794, + "logps/rejected": -2.1375017166137695, + "loss": 1.8088930130004883, + "nll_loss": 1.7425638437271118, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20166082680225372, + "rewards/margins": 0.012089352123439312, + "rewards/rejected": -0.2137501984834671, + "step": 800 + }, + { + "epoch": 0.64, + "eval_log_odds_chosen": 0.03274427726864815, + "eval_log_odds_ratio": -0.7604559063911438, + "eval_logits/chosen": 1.6779166460037231, + "eval_logits/rejected": 1.6468324661254883, + "eval_logps/chosen": -2.0128262042999268, + "eval_logps/rejected": -2.0465006828308105, + "eval_loss": 1.842376947402954, + "eval_nll_loss": 1.766331434249878, + "eval_rewards/accuracies": 0.5419999957084656, + "eval_rewards/chosen": -0.20128265023231506, + "eval_rewards/margins": 0.003367435419932008, + "eval_rewards/rejected": -0.20465007424354553, + "eval_runtime": 53.2521, + "eval_samples_per_second": 9.389, + "eval_steps_per_second": 4.695, + "step": 800 + }, + { + "epoch": 0.648, + "grad_norm": 0.2131498008966446, + "learning_rate": 6.012444444444444e-06, + "log_odds_chosen": 0.025272076949477196, + "log_odds_ratio": -0.7379814386367798, + "logits/chosen": 1.5435346364974976, + "logits/rejected": 1.6608002185821533, + "logps/chosen": -1.8906033039093018, + "logps/rejected": -1.9051824808120728, + "loss": 1.725562858581543, + "nll_loss": 1.6517642736434937, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.18906034529209137, + "rewards/margins": 0.0014579046983271837, + "rewards/rejected": -0.19051823019981384, + "step": 810 + }, + { + "epoch": 0.656, + "grad_norm": 0.21816125512123108, + "learning_rate": 5.976888888888888e-06, + "log_odds_chosen": 0.10699774324893951, + "log_odds_ratio": -0.7373054027557373, + "logits/chosen": 1.756774663925171, + "logits/rejected": 1.7355453968048096, + "logps/chosen": -1.9655370712280273, + "logps/rejected": -2.060356378555298, + "loss": 1.8129741668701171, + "nll_loss": 1.7392438650131226, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19655370712280273, + "rewards/margins": 0.009481914341449738, + "rewards/rejected": -0.20603564381599426, + "step": 820 + }, + { + "epoch": 0.664, + "grad_norm": 0.14879928529262543, + "learning_rate": 5.941333333333334e-06, + "log_odds_chosen": 0.07696692645549774, + "log_odds_ratio": -0.7127344012260437, + "logits/chosen": 1.6554100513458252, + "logits/rejected": 1.6326297521591187, + "logps/chosen": -1.9419372081756592, + "logps/rejected": -2.013288974761963, + "loss": 1.7566993713378907, + "nll_loss": 1.6854259967803955, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19419369101524353, + "rewards/margins": 0.007135221268981695, + "rewards/rejected": -0.20132891833782196, + "step": 830 + }, + { + "epoch": 0.672, + "grad_norm": 0.221551775932312, + "learning_rate": 5.9057777777777774e-06, + "log_odds_chosen": 0.06290511786937714, + "log_odds_ratio": -0.7516866326332092, + "logits/chosen": 1.575165867805481, + "logits/rejected": 1.5511913299560547, + "logps/chosen": -1.9852020740509033, + "logps/rejected": -2.0387279987335205, + "loss": 1.745162010192871, + "nll_loss": 1.6699934005737305, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1985202133655548, + "rewards/margins": 0.005352598614990711, + "rewards/rejected": -0.20387279987335205, + "step": 840 + }, + { + "epoch": 0.68, + "grad_norm": 0.15309089422225952, + "learning_rate": 5.870222222222222e-06, + "log_odds_chosen": 0.1161905974149704, + "log_odds_ratio": -0.689677894115448, + "logits/chosen": 1.7609052658081055, + "logits/rejected": 1.7412450313568115, + "logps/chosen": -1.9071937799453735, + "logps/rejected": -2.0045714378356934, + "loss": 1.7892265319824219, + "nll_loss": 1.7202587127685547, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1907193958759308, + "rewards/margins": 0.00973774679005146, + "rewards/rejected": -0.2004571408033371, + "step": 850 + }, + { + "epoch": 0.688, + "grad_norm": 0.14245319366455078, + "learning_rate": 5.834666666666666e-06, + "log_odds_chosen": 0.1561126410961151, + "log_odds_ratio": -0.6676959991455078, + "logits/chosen": 1.6795034408569336, + "logits/rejected": 1.6025750637054443, + "logps/chosen": -1.942917823791504, + "logps/rejected": -2.074822187423706, + "loss": 1.7381248474121094, + "nll_loss": 1.6713546514511108, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19429180026054382, + "rewards/margins": 0.01319044642150402, + "rewards/rejected": -0.207482248544693, + "step": 860 + }, + { + "epoch": 0.696, + "grad_norm": 0.2671545743942261, + "learning_rate": 5.799111111111111e-06, + "log_odds_chosen": 0.20992258191108704, + "log_odds_ratio": -0.6939498782157898, + "logits/chosen": 1.7179876565933228, + "logits/rejected": 1.6866029500961304, + "logps/chosen": -1.9251248836517334, + "logps/rejected": -2.1142725944519043, + "loss": 1.770625686645508, + "nll_loss": 1.7012306451797485, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19251248240470886, + "rewards/margins": 0.01891477219760418, + "rewards/rejected": -0.211427241563797, + "step": 870 + }, + { + "epoch": 0.704, + "grad_norm": 0.17794589698314667, + "learning_rate": 5.763555555555555e-06, + "log_odds_chosen": 0.04583617299795151, + "log_odds_ratio": -0.7294620871543884, + "logits/chosen": 1.7066138982772827, + "logits/rejected": 1.7307049036026, + "logps/chosen": -1.933084487915039, + "logps/rejected": -1.966208815574646, + "loss": 1.7707733154296874, + "nll_loss": 1.6978269815444946, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19330844283103943, + "rewards/margins": 0.0033124610781669617, + "rewards/rejected": -0.1966208964586258, + "step": 880 + }, + { + "epoch": 0.712, + "grad_norm": 0.1971806436777115, + "learning_rate": 5.727999999999999e-06, + "log_odds_chosen": 0.0007457077736034989, + "log_odds_ratio": -0.7630642056465149, + "logits/chosen": 1.7499278783798218, + "logits/rejected": 1.7503166198730469, + "logps/chosen": -1.918731689453125, + "logps/rejected": -1.9218223094940186, + "loss": 1.7608676910400392, + "nll_loss": 1.6845613718032837, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19187316298484802, + "rewards/margins": 0.0003090621903538704, + "rewards/rejected": -0.19218222796916962, + "step": 890 + }, + { + "epoch": 0.72, + "grad_norm": 0.17486584186553955, + "learning_rate": 5.692444444444445e-06, + "log_odds_chosen": 0.21501651406288147, + "log_odds_ratio": -0.6557433009147644, + "logits/chosen": 1.6884733438491821, + "logits/rejected": 1.6427476406097412, + "logps/chosen": -1.9072158336639404, + "logps/rejected": -2.0945980548858643, + "loss": 1.7564472198486327, + "nll_loss": 1.6908729076385498, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19072160124778748, + "rewards/margins": 0.018738189712166786, + "rewards/rejected": -0.2094598114490509, + "step": 900 + }, + { + "epoch": 0.72, + "eval_log_odds_chosen": 0.03506353497505188, + "eval_log_odds_ratio": -0.7600105404853821, + "eval_logits/chosen": 1.699849009513855, + "eval_logits/rejected": 1.6695457696914673, + "eval_logps/chosen": -2.0012686252593994, + "eval_logps/rejected": -2.03646183013916, + "eval_loss": 1.8359203338623047, + "eval_nll_loss": 1.7599191665649414, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.20012688636779785, + "eval_rewards/margins": 0.003519318765029311, + "eval_rewards/rejected": -0.2036461979150772, + "eval_runtime": 53.0131, + "eval_samples_per_second": 9.432, + "eval_steps_per_second": 4.716, + "step": 900 + }, + { + "epoch": 0.728, + "grad_norm": 0.2480229139328003, + "learning_rate": 5.656888888888889e-06, + "log_odds_chosen": 0.0064504086039960384, + "log_odds_ratio": -0.7494773268699646, + "logits/chosen": 1.6877663135528564, + "logits/rejected": 1.649176836013794, + "logps/chosen": -2.018514633178711, + "logps/rejected": -2.013549566268921, + "loss": 1.8033092498779297, + "nll_loss": 1.7283611297607422, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20185145735740662, + "rewards/margins": -0.0004965037223882973, + "rewards/rejected": -0.2013549506664276, + "step": 910 + }, + { + "epoch": 0.736, + "grad_norm": 0.19374053180217743, + "learning_rate": 5.621333333333333e-06, + "log_odds_chosen": 0.011787503957748413, + "log_odds_ratio": -0.7448769211769104, + "logits/chosen": 1.674197793006897, + "logits/rejected": 1.7002149820327759, + "logps/chosen": -1.8899329900741577, + "logps/rejected": -1.898186445236206, + "loss": 1.7617456436157226, + "nll_loss": 1.6872574090957642, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18899329006671906, + "rewards/margins": 0.0008253513951785862, + "rewards/rejected": -0.18981865048408508, + "step": 920 + }, + { + "epoch": 0.744, + "grad_norm": 0.156111478805542, + "learning_rate": 5.585777777777777e-06, + "log_odds_chosen": 0.05632457882165909, + "log_odds_ratio": -0.7555148005485535, + "logits/chosen": 1.805132269859314, + "logits/rejected": 1.8177416324615479, + "logps/chosen": -1.856993317604065, + "logps/rejected": -1.9158560037612915, + "loss": 1.8122926712036134, + "nll_loss": 1.736741304397583, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18569931387901306, + "rewards/margins": 0.0058862874284386635, + "rewards/rejected": -0.19158563017845154, + "step": 930 + }, + { + "epoch": 0.752, + "grad_norm": 0.19871090352535248, + "learning_rate": 5.550222222222223e-06, + "log_odds_chosen": 0.07815317809581757, + "log_odds_ratio": -0.7037801742553711, + "logits/chosen": 1.7272205352783203, + "logits/rejected": 1.8103294372558594, + "logps/chosen": -1.9780700206756592, + "logps/rejected": -2.0482537746429443, + "loss": 1.7779150009155273, + "nll_loss": 1.707537055015564, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19780699908733368, + "rewards/margins": 0.007018385920673609, + "rewards/rejected": -0.20482537150382996, + "step": 940 + }, + { + "epoch": 0.76, + "grad_norm": 0.22105462849140167, + "learning_rate": 5.5146666666666665e-06, + "log_odds_chosen": 0.1774008721113205, + "log_odds_ratio": -0.6982907056808472, + "logits/chosen": 1.6242185831069946, + "logits/rejected": 1.6261165142059326, + "logps/chosen": -1.941339135169983, + "logps/rejected": -2.0926012992858887, + "loss": 1.7724433898925782, + "nll_loss": 1.702614426612854, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1941339373588562, + "rewards/margins": 0.015126201324164867, + "rewards/rejected": -0.20926015079021454, + "step": 950 + }, + { + "epoch": 0.768, + "grad_norm": 0.2018124759197235, + "learning_rate": 5.479111111111111e-06, + "log_odds_chosen": 0.11117073148488998, + "log_odds_ratio": -0.7105401754379272, + "logits/chosen": 1.7710603475570679, + "logits/rejected": 1.6767441034317017, + "logps/chosen": -1.932356834411621, + "logps/rejected": -2.0336527824401855, + "loss": 1.7901832580566406, + "nll_loss": 1.719129204750061, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19323569536209106, + "rewards/margins": 0.010129592381417751, + "rewards/rejected": -0.2033652812242508, + "step": 960 + }, + { + "epoch": 0.776, + "grad_norm": 0.21761752665042877, + "learning_rate": 5.443555555555555e-06, + "log_odds_chosen": -0.05387473106384277, + "log_odds_ratio": -0.8190320134162903, + "logits/chosen": 1.6793544292449951, + "logits/rejected": 1.675851821899414, + "logps/chosen": -1.9604196548461914, + "logps/rejected": -1.9354708194732666, + "loss": 1.749907875061035, + "nll_loss": 1.6680047512054443, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19604197144508362, + "rewards/margins": -0.0024948944337666035, + "rewards/rejected": -0.1935470849275589, + "step": 970 + }, + { + "epoch": 0.784, + "grad_norm": 0.1489887237548828, + "learning_rate": 5.408e-06, + "log_odds_chosen": 0.10288698971271515, + "log_odds_ratio": -0.6924680471420288, + "logits/chosen": 1.7274665832519531, + "logits/rejected": 1.5743342638015747, + "logps/chosen": -1.913150429725647, + "logps/rejected": -2.003439426422119, + "loss": 1.816815185546875, + "nll_loss": 1.7475683689117432, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19131508469581604, + "rewards/margins": 0.009028871543705463, + "rewards/rejected": -0.20034393668174744, + "step": 980 + }, + { + "epoch": 0.792, + "grad_norm": 0.22919175028800964, + "learning_rate": 5.372444444444444e-06, + "log_odds_chosen": 0.26656976342201233, + "log_odds_ratio": -0.611583948135376, + "logits/chosen": 1.6766068935394287, + "logits/rejected": 1.680253028869629, + "logps/chosen": -1.9001144170761108, + "logps/rejected": -2.118840217590332, + "loss": 1.7628129959106444, + "nll_loss": 1.7016544342041016, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19001144170761108, + "rewards/margins": 0.021872568875551224, + "rewards/rejected": -0.2118840217590332, + "step": 990 + }, + { + "epoch": 0.8, + "grad_norm": 0.20565934479236603, + "learning_rate": 5.336888888888888e-06, + "log_odds_chosen": -0.10769150406122208, + "log_odds_ratio": -0.8185423612594604, + "logits/chosen": 1.7847192287445068, + "logits/rejected": 1.7344152927398682, + "logps/chosen": -1.9659755229949951, + "logps/rejected": -1.8742831945419312, + "loss": 1.791708755493164, + "nll_loss": 1.7098544836044312, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": -0.19659757614135742, + "rewards/margins": -0.009169241413474083, + "rewards/rejected": -0.1874283254146576, + "step": 1000 + }, + { + "epoch": 0.8, + "eval_log_odds_chosen": 0.03596750646829605, + "eval_log_odds_ratio": -0.7587533593177795, + "eval_logits/chosen": 1.7567447423934937, + "eval_logits/rejected": 1.7300385236740112, + "eval_logps/chosen": -1.9963513612747192, + "eval_logps/rejected": -2.0327131748199463, + "eval_loss": 1.83016037940979, + "eval_nll_loss": 1.7542850971221924, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": -0.19963513314723969, + "eval_rewards/margins": 0.0036361950915306807, + "eval_rewards/rejected": -0.20327134430408478, + "eval_runtime": 52.9272, + "eval_samples_per_second": 9.447, + "eval_steps_per_second": 4.723, + "step": 1000 + }, + { + "epoch": 0.808, + "grad_norm": 0.17682485282421112, + "learning_rate": 5.301333333333333e-06, + "log_odds_chosen": 0.2020380049943924, + "log_odds_ratio": -0.664644181728363, + "logits/chosen": 1.8102779388427734, + "logits/rejected": 1.7338097095489502, + "logps/chosen": -1.8925039768218994, + "logps/rejected": -2.061493396759033, + "loss": 1.7180700302124023, + "nll_loss": 1.6516058444976807, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1892503798007965, + "rewards/margins": 0.016898952424526215, + "rewards/rejected": -0.20614933967590332, + "step": 1010 + }, + { + "epoch": 0.816, + "grad_norm": 0.1735365241765976, + "learning_rate": 5.265777777777778e-06, + "log_odds_chosen": 0.25237327814102173, + "log_odds_ratio": -0.6863128542900085, + "logits/chosen": 1.7943061590194702, + "logits/rejected": 1.7719309329986572, + "logps/chosen": -1.8711779117584229, + "logps/rejected": -2.1002743244171143, + "loss": 1.7531299591064453, + "nll_loss": 1.6844985485076904, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.187117800116539, + "rewards/margins": 0.022909630089998245, + "rewards/rejected": -0.21002741158008575, + "step": 1020 + }, + { + "epoch": 0.824, + "grad_norm": 0.16449587047100067, + "learning_rate": 5.230222222222222e-06, + "log_odds_chosen": 0.03473002091050148, + "log_odds_ratio": -0.7347549200057983, + "logits/chosen": 1.7814887762069702, + "logits/rejected": 1.641998291015625, + "logps/chosen": -1.9032049179077148, + "logps/rejected": -1.9274908304214478, + "loss": 1.7491247177124023, + "nll_loss": 1.6756490468978882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19032049179077148, + "rewards/margins": 0.0024286056868731976, + "rewards/rejected": -0.19274908304214478, + "step": 1030 + }, + { + "epoch": 0.832, + "grad_norm": 0.29633238911628723, + "learning_rate": 5.194666666666666e-06, + "log_odds_chosen": 0.04156870022416115, + "log_odds_ratio": -0.7441189289093018, + "logits/chosen": 1.6556901931762695, + "logits/rejected": 1.6617138385772705, + "logps/chosen": -1.9671058654785156, + "logps/rejected": -1.9902048110961914, + "loss": 1.774675178527832, + "nll_loss": 1.700263261795044, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19671057164669037, + "rewards/margins": 0.002309908624738455, + "rewards/rejected": -0.19902050495147705, + "step": 1040 + }, + { + "epoch": 0.84, + "grad_norm": 0.3215846121311188, + "learning_rate": 5.159111111111111e-06, + "log_odds_chosen": 0.026115605607628822, + "log_odds_ratio": -0.7675689458847046, + "logits/chosen": 1.7200599908828735, + "logits/rejected": 1.7598450183868408, + "logps/chosen": -1.9010694026947021, + "logps/rejected": -1.948642373085022, + "loss": 1.7812797546386718, + "nll_loss": 1.7045230865478516, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19010695815086365, + "rewards/margins": 0.004757292568683624, + "rewards/rejected": -0.19486424326896667, + "step": 1050 + }, + { + "epoch": 0.848, + "grad_norm": 0.1802208572626114, + "learning_rate": 5.1235555555555556e-06, + "log_odds_chosen": 0.24781334400177002, + "log_odds_ratio": -0.6703814268112183, + "logits/chosen": 1.7133815288543701, + "logits/rejected": 1.7213478088378906, + "logps/chosen": -1.7803637981414795, + "logps/rejected": -2.0020906925201416, + "loss": 1.6906932830810546, + "nll_loss": 1.6236553192138672, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1780363768339157, + "rewards/margins": 0.022172680124640465, + "rewards/rejected": -0.20020906627178192, + "step": 1060 + }, + { + "epoch": 0.856, + "grad_norm": 0.15786853432655334, + "learning_rate": 5.088e-06, + "log_odds_chosen": 0.1844882071018219, + "log_odds_ratio": -0.6841514110565186, + "logits/chosen": 1.630793809890747, + "logits/rejected": 1.557401418685913, + "logps/chosen": -1.9179973602294922, + "logps/rejected": -2.0797696113586426, + "loss": 1.7438488006591797, + "nll_loss": 1.6754337549209595, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19179973006248474, + "rewards/margins": 0.016177207231521606, + "rewards/rejected": -0.20797693729400635, + "step": 1070 + }, + { + "epoch": 0.864, + "grad_norm": 0.14527210593223572, + "learning_rate": 5.052444444444444e-06, + "log_odds_chosen": 0.01497338991612196, + "log_odds_ratio": -0.7727933526039124, + "logits/chosen": 1.7897268533706665, + "logits/rejected": 1.6854931116104126, + "logps/chosen": -1.8946952819824219, + "logps/rejected": -1.9192304611206055, + "loss": 1.8023780822753905, + "nll_loss": 1.7250983715057373, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.189469575881958, + "rewards/margins": 0.002453479217365384, + "rewards/rejected": -0.19192305207252502, + "step": 1080 + }, + { + "epoch": 0.872, + "grad_norm": 0.17399252951145172, + "learning_rate": 5.016888888888888e-06, + "log_odds_chosen": 0.14793001115322113, + "log_odds_ratio": -0.688338041305542, + "logits/chosen": 1.8418161869049072, + "logits/rejected": 1.8267091512680054, + "logps/chosen": -1.9205141067504883, + "logps/rejected": -2.0402419567108154, + "loss": 1.799574851989746, + "nll_loss": 1.7307411432266235, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19205141067504883, + "rewards/margins": 0.011972772888839245, + "rewards/rejected": -0.20402422547340393, + "step": 1090 + }, + { + "epoch": 0.88, + "grad_norm": 0.1753067970275879, + "learning_rate": 4.9813333333333335e-06, + "log_odds_chosen": 0.11683394014835358, + "log_odds_ratio": -0.7051060795783997, + "logits/chosen": 1.799207091331482, + "logits/rejected": 1.817365288734436, + "logps/chosen": -1.9414863586425781, + "logps/rejected": -2.025501251220703, + "loss": 1.7512321472167969, + "nll_loss": 1.6807218790054321, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19414862990379333, + "rewards/margins": 0.008401499129831791, + "rewards/rejected": -0.20255012810230255, + "step": 1100 + }, + { + "epoch": 0.88, + "eval_log_odds_chosen": 0.0326814129948616, + "eval_log_odds_ratio": -0.7593976855278015, + "eval_logits/chosen": 1.8343305587768555, + "eval_logits/rejected": 1.8080686330795288, + "eval_logps/chosen": -1.9903963804244995, + "eval_logps/rejected": -2.0238075256347656, + "eval_loss": 1.82623291015625, + "eval_nll_loss": 1.750293254852295, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.19903963804244995, + "eval_rewards/margins": 0.0033411220647394657, + "eval_rewards/rejected": -0.20238077640533447, + "eval_runtime": 53.2827, + "eval_samples_per_second": 9.384, + "eval_steps_per_second": 4.692, + "step": 1100 + }, + { + "epoch": 0.888, + "grad_norm": 0.16481465101242065, + "learning_rate": 4.945777777777777e-06, + "log_odds_chosen": 0.2010401487350464, + "log_odds_ratio": -0.7306350469589233, + "logits/chosen": 1.8544563055038452, + "logits/rejected": 1.8101685047149658, + "logps/chosen": -2.000797986984253, + "logps/rejected": -2.1975042819976807, + "loss": 1.840962791442871, + "nll_loss": 1.7678991556167603, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.20007982850074768, + "rewards/margins": 0.019670633599162102, + "rewards/rejected": -0.21975044906139374, + "step": 1110 + }, + { + "epoch": 0.896, + "grad_norm": 0.23172008991241455, + "learning_rate": 4.910222222222222e-06, + "log_odds_chosen": -0.013389323838055134, + "log_odds_ratio": -0.7784253358840942, + "logits/chosen": 1.8498008251190186, + "logits/rejected": 1.8121620416641235, + "logps/chosen": -1.996519684791565, + "logps/rejected": -1.9884834289550781, + "loss": 1.797800636291504, + "nll_loss": 1.7199580669403076, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19965195655822754, + "rewards/margins": -0.0008036093786358833, + "rewards/rejected": -0.19884835183620453, + "step": 1120 + }, + { + "epoch": 0.904, + "grad_norm": 0.1659555435180664, + "learning_rate": 4.874666666666666e-06, + "log_odds_chosen": 0.061679303646087646, + "log_odds_ratio": -0.7274680137634277, + "logits/chosen": 1.8414134979248047, + "logits/rejected": 1.728663444519043, + "logps/chosen": -1.9462897777557373, + "logps/rejected": -1.996514916419983, + "loss": 1.7753076553344727, + "nll_loss": 1.7025604248046875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1946289837360382, + "rewards/margins": 0.005022515542805195, + "rewards/rejected": -0.19965150952339172, + "step": 1130 + }, + { + "epoch": 0.912, + "grad_norm": 0.23450258374214172, + "learning_rate": 4.839111111111111e-06, + "log_odds_chosen": 0.013548937626183033, + "log_odds_ratio": -0.7774235606193542, + "logits/chosen": 1.762843370437622, + "logits/rejected": 1.7083097696304321, + "logps/chosen": -2.0285325050354004, + "logps/rejected": -2.0414459705352783, + "loss": 1.8112398147583009, + "nll_loss": 1.7334976196289062, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.2028532326221466, + "rewards/margins": 0.0012913575628772378, + "rewards/rejected": -0.20414459705352783, + "step": 1140 + }, + { + "epoch": 0.92, + "grad_norm": 0.24944870173931122, + "learning_rate": 4.803555555555555e-06, + "log_odds_chosen": 0.079915352165699, + "log_odds_ratio": -0.736060380935669, + "logits/chosen": 1.7173516750335693, + "logits/rejected": 1.64029860496521, + "logps/chosen": -1.9370155334472656, + "logps/rejected": -2.007744312286377, + "loss": 1.7322145462036134, + "nll_loss": 1.6586081981658936, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19370155036449432, + "rewards/margins": 0.00707287946715951, + "rewards/rejected": -0.2007744312286377, + "step": 1150 + }, + { + "epoch": 0.928, + "grad_norm": 0.20937688648700714, + "learning_rate": 4.768e-06, + "log_odds_chosen": 0.15667779743671417, + "log_odds_ratio": -0.678199827671051, + "logits/chosen": 1.8389065265655518, + "logits/rejected": 1.811120629310608, + "logps/chosen": -1.8186038732528687, + "logps/rejected": -1.948980689048767, + "loss": 1.71126708984375, + "nll_loss": 1.643446922302246, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18186041712760925, + "rewards/margins": 0.013037679716944695, + "rewards/rejected": -0.1948980987071991, + "step": 1160 + }, + { + "epoch": 0.936, + "grad_norm": 0.15026956796646118, + "learning_rate": 4.732444444444444e-06, + "log_odds_chosen": 0.10751942545175552, + "log_odds_ratio": -0.7100101709365845, + "logits/chosen": 1.8801990747451782, + "logits/rejected": 1.8757660388946533, + "logps/chosen": -1.9047510623931885, + "logps/rejected": -1.9837467670440674, + "loss": 1.7867193222045898, + "nll_loss": 1.7157180309295654, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19047510623931885, + "rewards/margins": 0.007899556308984756, + "rewards/rejected": -0.1983746588230133, + "step": 1170 + }, + { + "epoch": 0.944, + "grad_norm": 0.20250067114830017, + "learning_rate": 4.696888888888889e-06, + "log_odds_chosen": 0.10309334099292755, + "log_odds_ratio": -0.7117387056350708, + "logits/chosen": 1.816712737083435, + "logits/rejected": 1.689218282699585, + "logps/chosen": -1.9982006549835205, + "logps/rejected": -2.075469493865967, + "loss": 1.802886390686035, + "nll_loss": 1.7317125797271729, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19982007145881653, + "rewards/margins": 0.007726915180683136, + "rewards/rejected": -0.20754699409008026, + "step": 1180 + }, + { + "epoch": 0.952, + "grad_norm": 0.1596970111131668, + "learning_rate": 4.661333333333333e-06, + "log_odds_chosen": 0.105086550116539, + "log_odds_ratio": -0.6774314641952515, + "logits/chosen": 1.7848875522613525, + "logits/rejected": 1.8055425882339478, + "logps/chosen": -1.9509786367416382, + "logps/rejected": -2.0330584049224854, + "loss": 1.8003599166870117, + "nll_loss": 1.7326167821884155, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.195097878575325, + "rewards/margins": 0.008207983337342739, + "rewards/rejected": -0.20330584049224854, + "step": 1190 + }, + { + "epoch": 0.96, + "grad_norm": 0.18611599504947662, + "learning_rate": 4.625777777777777e-06, + "log_odds_chosen": 0.2091858834028244, + "log_odds_ratio": -0.6977974772453308, + "logits/chosen": 1.768100380897522, + "logits/rejected": 1.745809555053711, + "logps/chosen": -1.8548533916473389, + "logps/rejected": -2.024160861968994, + "loss": 1.7493671417236327, + "nll_loss": 1.6795871257781982, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1854853630065918, + "rewards/margins": 0.016930732876062393, + "rewards/rejected": -0.2024160921573639, + "step": 1200 + }, + { + "epoch": 0.96, + "eval_log_odds_chosen": 0.04138939082622528, + "eval_log_odds_ratio": -0.7565131783485413, + "eval_logits/chosen": 1.7945410013198853, + "eval_logits/rejected": 1.7709347009658813, + "eval_logps/chosen": -1.9902427196502686, + "eval_logps/rejected": -2.0308339595794678, + "eval_loss": 1.821491003036499, + "eval_nll_loss": 1.7458395957946777, + "eval_rewards/accuracies": 0.5419999957084656, + "eval_rewards/chosen": -0.1990242600440979, + "eval_rewards/margins": 0.00405914057046175, + "eval_rewards/rejected": -0.20308341085910797, + "eval_runtime": 53.0681, + "eval_samples_per_second": 9.422, + "eval_steps_per_second": 4.711, + "step": 1200 + }, + { + "epoch": 0.968, + "grad_norm": 0.19560930132865906, + "learning_rate": 4.5902222222222225e-06, + "log_odds_chosen": 0.1029076799750328, + "log_odds_ratio": -0.7135358452796936, + "logits/chosen": 1.7437855005264282, + "logits/rejected": 1.7035853862762451, + "logps/chosen": -1.9591327905654907, + "logps/rejected": -2.0517547130584717, + "loss": 1.7739175796508788, + "nll_loss": 1.7025638818740845, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19591325521469116, + "rewards/margins": 0.009262214414775372, + "rewards/rejected": -0.2051754742860794, + "step": 1210 + }, + { + "epoch": 0.976, + "grad_norm": 0.161069855093956, + "learning_rate": 4.554666666666666e-06, + "log_odds_chosen": -0.12622077763080597, + "log_odds_ratio": -0.8286466598510742, + "logits/chosen": 1.9193840026855469, + "logits/rejected": 1.8464590311050415, + "logps/chosen": -1.9970428943634033, + "logps/rejected": -1.9042726755142212, + "loss": 1.7340478897094727, + "nll_loss": 1.6511831283569336, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.19970427453517914, + "rewards/margins": -0.009276997298002243, + "rewards/rejected": -0.1904272735118866, + "step": 1220 + }, + { + "epoch": 0.984, + "grad_norm": 0.27507203817367554, + "learning_rate": 4.519111111111111e-06, + "log_odds_chosen": 0.03895152360200882, + "log_odds_ratio": -0.7696425318717957, + "logits/chosen": 1.8306211233139038, + "logits/rejected": 1.8841800689697266, + "logps/chosen": -1.9324079751968384, + "logps/rejected": -1.969473123550415, + "loss": 1.74224910736084, + "nll_loss": 1.6652848720550537, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19324080646038055, + "rewards/margins": 0.003706505987793207, + "rewards/rejected": -0.19694730639457703, + "step": 1230 + }, + { + "epoch": 0.992, + "grad_norm": 0.34326133131980896, + "learning_rate": 4.483555555555555e-06, + "log_odds_chosen": 0.09721614420413971, + "log_odds_ratio": -0.746010959148407, + "logits/chosen": 1.8264877796173096, + "logits/rejected": 1.8560062646865845, + "logps/chosen": -1.8512614965438843, + "logps/rejected": -1.9252874851226807, + "loss": 1.749200439453125, + "nll_loss": 1.6745994091033936, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1851261556148529, + "rewards/margins": 0.007402592804282904, + "rewards/rejected": -0.19252872467041016, + "step": 1240 + }, + { + "epoch": 1.0, + "grad_norm": 0.21193479001522064, + "learning_rate": 4.4480000000000004e-06, + "log_odds_chosen": 0.15323859453201294, + "log_odds_ratio": -0.6695608496665955, + "logits/chosen": 1.8079290390014648, + "logits/rejected": 1.8241838216781616, + "logps/chosen": -1.9059537649154663, + "logps/rejected": -2.032207489013672, + "loss": 1.7593469619750977, + "nll_loss": 1.6923907995224, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19059538841247559, + "rewards/margins": 0.012625358998775482, + "rewards/rejected": -0.20322072505950928, + "step": 1250 + }, + { + "epoch": 1.008, + "grad_norm": 0.20177887380123138, + "learning_rate": 4.412444444444444e-06, + "log_odds_chosen": 0.18453414738178253, + "log_odds_ratio": -0.6602174639701843, + "logits/chosen": 1.7973260879516602, + "logits/rejected": 1.800276517868042, + "logps/chosen": -1.9282119274139404, + "logps/rejected": -2.0611374378204346, + "loss": 1.7722354888916017, + "nll_loss": 1.7062139511108398, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1928211897611618, + "rewards/margins": 0.013292545452713966, + "rewards/rejected": -0.20611374080181122, + "step": 1260 + }, + { + "epoch": 1.016, + "grad_norm": 0.19174514710903168, + "learning_rate": 4.376888888888889e-06, + "log_odds_chosen": 0.17968787252902985, + "log_odds_ratio": -0.6729904413223267, + "logits/chosen": 1.825566053390503, + "logits/rejected": 1.7393420934677124, + "logps/chosen": -1.8471599817276, + "logps/rejected": -1.9890083074569702, + "loss": 1.722864532470703, + "nll_loss": 1.6555652618408203, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.18471598625183105, + "rewards/margins": 0.01418487448245287, + "rewards/rejected": -0.19890084862709045, + "step": 1270 + }, + { + "epoch": 1.024, + "grad_norm": 0.21415722370147705, + "learning_rate": 4.341333333333333e-06, + "log_odds_chosen": 0.1800077110528946, + "log_odds_ratio": -0.7383973598480225, + "logits/chosen": 1.849203109741211, + "logits/rejected": 1.7909198999404907, + "logps/chosen": -1.999140977859497, + "logps/rejected": -2.184372901916504, + "loss": 1.7739013671875, + "nll_loss": 1.7000617980957031, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1999140828847885, + "rewards/margins": 0.018523216247558594, + "rewards/rejected": -0.2184373140335083, + "step": 1280 + }, + { + "epoch": 1.032, + "grad_norm": 0.21206778287887573, + "learning_rate": 4.305777777777778e-06, + "log_odds_chosen": -0.07741276919841766, + "log_odds_ratio": -0.817004382610321, + "logits/chosen": 1.7782243490219116, + "logits/rejected": 1.7478315830230713, + "logps/chosen": -1.9559736251831055, + "logps/rejected": -1.9229551553726196, + "loss": 1.7454774856567383, + "nll_loss": 1.6637769937515259, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19559738039970398, + "rewards/margins": -0.003301867749541998, + "rewards/rejected": -0.19229550659656525, + "step": 1290 + }, + { + "epoch": 1.04, + "grad_norm": 0.2803303599357605, + "learning_rate": 4.270222222222222e-06, + "log_odds_chosen": 0.37417811155319214, + "log_odds_ratio": -0.6541550755500793, + "logits/chosen": 1.9080060720443726, + "logits/rejected": 1.781346082687378, + "logps/chosen": -1.8062732219696045, + "logps/rejected": -2.1362411975860596, + "loss": 1.7089204788208008, + "nll_loss": 1.6435050964355469, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18062731623649597, + "rewards/margins": 0.032996825873851776, + "rewards/rejected": -0.21362414956092834, + "step": 1300 + }, + { + "epoch": 1.04, + "eval_log_odds_chosen": 0.03616252541542053, + "eval_log_odds_ratio": -0.7587297558784485, + "eval_logits/chosen": 1.8630521297454834, + "eval_logits/rejected": 1.8428224325180054, + "eval_logps/chosen": -1.9820479154586792, + "eval_logps/rejected": -2.0179696083068848, + "eval_loss": 1.816968560218811, + "eval_nll_loss": 1.7410955429077148, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.19820478558540344, + "eval_rewards/margins": 0.0035921703092753887, + "eval_rewards/rejected": -0.20179696381092072, + "eval_runtime": 53.1139, + "eval_samples_per_second": 9.414, + "eval_steps_per_second": 4.707, + "step": 1300 + }, + { + "epoch": 1.048, + "grad_norm": 0.142170250415802, + "learning_rate": 4.234666666666666e-06, + "log_odds_chosen": 0.08604643493890762, + "log_odds_ratio": -0.7244693636894226, + "logits/chosen": 1.8486318588256836, + "logits/rejected": 1.8644578456878662, + "logps/chosen": -1.8851855993270874, + "logps/rejected": -1.9627344608306885, + "loss": 1.7843137741088868, + "nll_loss": 1.7118666172027588, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18851855397224426, + "rewards/margins": 0.007754878140985966, + "rewards/rejected": -0.19627343118190765, + "step": 1310 + }, + { + "epoch": 1.056, + "grad_norm": 0.19082266092300415, + "learning_rate": 4.199111111111111e-06, + "log_odds_chosen": 0.10004905611276627, + "log_odds_ratio": -0.7227860689163208, + "logits/chosen": 1.7743819952011108, + "logits/rejected": 1.8303782939910889, + "logps/chosen": -1.8624858856201172, + "logps/rejected": -1.9419944286346436, + "loss": 1.7142595291137694, + "nll_loss": 1.6419808864593506, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.18624861538410187, + "rewards/margins": 0.00795083586126566, + "rewards/rejected": -0.19419944286346436, + "step": 1320 + }, + { + "epoch": 1.064, + "grad_norm": 0.15585362911224365, + "learning_rate": 4.1635555555555554e-06, + "log_odds_chosen": 0.13298745453357697, + "log_odds_ratio": -0.7184717655181885, + "logits/chosen": 1.8939129114151, + "logits/rejected": 1.8457763195037842, + "logps/chosen": -1.7758653163909912, + "logps/rejected": -1.9016306400299072, + "loss": 1.758875274658203, + "nll_loss": 1.687028169631958, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.17758652567863464, + "rewards/margins": 0.012576532550156116, + "rewards/rejected": -0.19016307592391968, + "step": 1330 + }, + { + "epoch": 1.072, + "grad_norm": 0.2081124633550644, + "learning_rate": 4.128e-06, + "log_odds_chosen": 0.04596617445349693, + "log_odds_ratio": -0.787051260471344, + "logits/chosen": 1.866758942604065, + "logits/rejected": 1.8375155925750732, + "logps/chosen": -2.0039141178131104, + "logps/rejected": -2.0341384410858154, + "loss": 1.7708185195922852, + "nll_loss": 1.6921135187149048, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20039141178131104, + "rewards/margins": 0.0030224404763430357, + "rewards/rejected": -0.20341384410858154, + "step": 1340 + }, + { + "epoch": 1.08, + "grad_norm": 0.20339496433734894, + "learning_rate": 4.092444444444444e-06, + "log_odds_chosen": 0.06565378606319427, + "log_odds_ratio": -0.7353135943412781, + "logits/chosen": 1.8310129642486572, + "logits/rejected": 1.7633676528930664, + "logps/chosen": -1.9292491674423218, + "logps/rejected": -1.9627234935760498, + "loss": 1.7951900482177734, + "nll_loss": 1.72165846824646, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19292493164539337, + "rewards/margins": 0.0033474296797066927, + "rewards/rejected": -0.1962723433971405, + "step": 1350 + }, + { + "epoch": 1.088, + "grad_norm": 0.1730116605758667, + "learning_rate": 4.056888888888889e-06, + "log_odds_chosen": 0.07272230833768845, + "log_odds_ratio": -0.7393074035644531, + "logits/chosen": 1.8491909503936768, + "logits/rejected": 1.849678635597229, + "logps/chosen": -2.0217833518981934, + "logps/rejected": -2.0789294242858887, + "loss": 1.7909440994262695, + "nll_loss": 1.7170133590698242, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.20217831432819366, + "rewards/margins": 0.005714614875614643, + "rewards/rejected": -0.20789292454719543, + "step": 1360 + }, + { + "epoch": 1.096, + "grad_norm": 0.20503339171409607, + "learning_rate": 4.021333333333333e-06, + "log_odds_chosen": 0.3173345625400543, + "log_odds_ratio": -0.6747878789901733, + "logits/chosen": 1.777547836303711, + "logits/rejected": 1.7704051733016968, + "logps/chosen": -1.83544921875, + "logps/rejected": -2.1259076595306396, + "loss": 1.6998504638671874, + "nll_loss": 1.6323719024658203, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18354493379592896, + "rewards/margins": 0.02904583141207695, + "rewards/rejected": -0.2125907689332962, + "step": 1370 + }, + { + "epoch": 1.104, + "grad_norm": 0.20165199041366577, + "learning_rate": 3.985777777777778e-06, + "log_odds_chosen": 0.0595523826777935, + "log_odds_ratio": -0.7062498927116394, + "logits/chosen": 1.8233400583267212, + "logits/rejected": 1.8652807474136353, + "logps/chosen": -1.9540764093399048, + "logps/rejected": -2.0037131309509277, + "loss": 1.7615316390991211, + "nll_loss": 1.6909065246582031, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1954076588153839, + "rewards/margins": 0.0049636876210570335, + "rewards/rejected": -0.20037131011486053, + "step": 1380 + }, + { + "epoch": 1.112, + "grad_norm": 0.15956999361515045, + "learning_rate": 3.950222222222222e-06, + "log_odds_chosen": 0.050791315734386444, + "log_odds_ratio": -0.730897843837738, + "logits/chosen": 1.8976904153823853, + "logits/rejected": 1.8682187795639038, + "logps/chosen": -1.9485187530517578, + "logps/rejected": -1.9784114360809326, + "loss": 1.8059589385986328, + "nll_loss": 1.7328689098358154, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.19485187530517578, + "rewards/margins": 0.0029892651364207268, + "rewards/rejected": -0.1978411227464676, + "step": 1390 + }, + { + "epoch": 1.12, + "grad_norm": 0.16580845415592194, + "learning_rate": 3.9146666666666666e-06, + "log_odds_chosen": 0.13212139904499054, + "log_odds_ratio": -0.7306901216506958, + "logits/chosen": 1.9302421808242798, + "logits/rejected": 1.9482580423355103, + "logps/chosen": -1.8804752826690674, + "logps/rejected": -1.9942712783813477, + "loss": 1.7713314056396485, + "nll_loss": 1.6982624530792236, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18804752826690674, + "rewards/margins": 0.01137961633503437, + "rewards/rejected": -0.19942712783813477, + "step": 1400 + }, + { + "epoch": 1.12, + "eval_log_odds_chosen": 0.03508565574884415, + "eval_log_odds_ratio": -0.7589212656021118, + "eval_logits/chosen": 1.8507174253463745, + "eval_logits/rejected": 1.8313792943954468, + "eval_logps/chosen": -1.9807339906692505, + "eval_logps/rejected": -2.016141653060913, + "eval_loss": 1.8125864267349243, + "eval_nll_loss": 1.7366943359375, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.19807341694831848, + "eval_rewards/margins": 0.0035407766699790955, + "eval_rewards/rejected": -0.20161418616771698, + "eval_runtime": 53.4841, + "eval_samples_per_second": 9.349, + "eval_steps_per_second": 4.674, + "step": 1400 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.21889333426952362, + "learning_rate": 3.87911111111111e-06, + "log_odds_chosen": 0.035620056092739105, + "log_odds_ratio": -0.7623010873794556, + "logits/chosen": 1.81222665309906, + "logits/rejected": 1.8632707595825195, + "logps/chosen": -1.960524320602417, + "logps/rejected": -1.9879045486450195, + "loss": 1.8127471923828125, + "nll_loss": 1.7365171909332275, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1960524320602417, + "rewards/margins": 0.002738040406256914, + "rewards/rejected": -0.19879046082496643, + "step": 1410 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.22390495240688324, + "learning_rate": 3.843555555555555e-06, + "log_odds_chosen": 0.0008658409351482987, + "log_odds_ratio": -0.7709900736808777, + "logits/chosen": 1.7942880392074585, + "logits/rejected": 1.8574626445770264, + "logps/chosen": -1.954573392868042, + "logps/rejected": -1.9510538578033447, + "loss": 1.7888818740844727, + "nll_loss": 1.7117828130722046, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1954573690891266, + "rewards/margins": -0.000351964496076107, + "rewards/rejected": -0.1951053887605667, + "step": 1420 + }, + { + "epoch": 1.144, + "grad_norm": 0.2423250824213028, + "learning_rate": 3.808e-06, + "log_odds_chosen": 0.20300379395484924, + "log_odds_ratio": -0.7344587445259094, + "logits/chosen": 1.8965059518814087, + "logits/rejected": 1.865016222000122, + "logps/chosen": -1.830789566040039, + "logps/rejected": -2.030541181564331, + "loss": 1.7464214324951173, + "nll_loss": 1.672975778579712, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18307895958423615, + "rewards/margins": 0.01997516117990017, + "rewards/rejected": -0.20305411517620087, + "step": 1430 + }, + { + "epoch": 1.152, + "grad_norm": 0.15725889801979065, + "learning_rate": 3.7724444444444445e-06, + "log_odds_chosen": -0.013941275887191296, + "log_odds_ratio": -0.7538126707077026, + "logits/chosen": 1.8744310140609741, + "logits/rejected": 1.8045070171356201, + "logps/chosen": -1.9909547567367554, + "logps/rejected": -1.9801406860351562, + "loss": 1.7410266876220704, + "nll_loss": 1.6656453609466553, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.1990954577922821, + "rewards/margins": -0.0010814003180712461, + "rewards/rejected": -0.1980140656232834, + "step": 1440 + }, + { + "epoch": 1.16, + "grad_norm": 0.17255175113677979, + "learning_rate": 3.7368888888888883e-06, + "log_odds_chosen": 0.010356083512306213, + "log_odds_ratio": -0.7647982239723206, + "logits/chosen": 1.9456676244735718, + "logits/rejected": 1.8851861953735352, + "logps/chosen": -1.9676564931869507, + "logps/rejected": -1.9645713567733765, + "loss": 1.7504322052001953, + "nll_loss": 1.6739521026611328, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19676563143730164, + "rewards/margins": -0.0003085043281316757, + "rewards/rejected": -0.1964571326971054, + "step": 1450 + }, + { + "epoch": 1.168, + "grad_norm": 0.18616439402103424, + "learning_rate": 3.701333333333333e-06, + "log_odds_chosen": 0.06657058745622635, + "log_odds_ratio": -0.7217355370521545, + "logits/chosen": 1.8338171243667603, + "logits/rejected": 1.8438618183135986, + "logps/chosen": -1.98470139503479, + "logps/rejected": -2.043152332305908, + "loss": 1.7677242279052734, + "nll_loss": 1.6955506801605225, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1984701305627823, + "rewards/margins": 0.005845111794769764, + "rewards/rejected": -0.20431523025035858, + "step": 1460 + }, + { + "epoch": 1.176, + "grad_norm": 0.22820734977722168, + "learning_rate": 3.6657777777777773e-06, + "log_odds_chosen": 0.10153277218341827, + "log_odds_ratio": -0.6983687281608582, + "logits/chosen": 1.915000319480896, + "logits/rejected": 1.9120140075683594, + "logps/chosen": -1.927222490310669, + "logps/rejected": -2.0119576454162598, + "loss": 1.7736129760742188, + "nll_loss": 1.7037763595581055, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19272224605083466, + "rewards/margins": 0.008473522961139679, + "rewards/rejected": -0.20119579136371613, + "step": 1470 + }, + { + "epoch": 1.184, + "grad_norm": 0.17705458402633667, + "learning_rate": 3.630222222222222e-06, + "log_odds_chosen": 0.15250150859355927, + "log_odds_ratio": -0.7717846035957336, + "logits/chosen": 1.814196228981018, + "logits/rejected": 1.7537425756454468, + "logps/chosen": -1.879522681236267, + "logps/rejected": -2.0431935787200928, + "loss": 1.7213356018066406, + "nll_loss": 1.6441571712493896, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18795228004455566, + "rewards/margins": 0.016367079690098763, + "rewards/rejected": -0.20431935787200928, + "step": 1480 + }, + { + "epoch": 1.192, + "grad_norm": 0.18402282893657684, + "learning_rate": 3.5946666666666662e-06, + "log_odds_chosen": 0.019868457689881325, + "log_odds_ratio": -0.7824932336807251, + "logits/chosen": 1.932429552078247, + "logits/rejected": 1.8701515197753906, + "logps/chosen": -2.009019136428833, + "logps/rejected": -2.0110716819763184, + "loss": 1.7473308563232421, + "nll_loss": 1.6690819263458252, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20090194046497345, + "rewards/margins": 0.00020524598949123174, + "rewards/rejected": -0.2011071741580963, + "step": 1490 + }, + { + "epoch": 1.2, + "grad_norm": 0.2257532775402069, + "learning_rate": 3.559111111111111e-06, + "log_odds_chosen": 0.17915096879005432, + "log_odds_ratio": -0.6777058839797974, + "logits/chosen": 1.8818708658218384, + "logits/rejected": 1.8758437633514404, + "logps/chosen": -1.7928282022476196, + "logps/rejected": -1.9602582454681396, + "loss": 1.6830949783325195, + "nll_loss": 1.6153247356414795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17928281426429749, + "rewards/margins": 0.016743017360568047, + "rewards/rejected": -0.1960258185863495, + "step": 1500 + }, + { + "epoch": 1.2, + "eval_log_odds_chosen": 0.035420093685388565, + "eval_log_odds_ratio": -0.7593028545379639, + "eval_logits/chosen": 1.8614498376846313, + "eval_logits/rejected": 1.8436392545700073, + "eval_logps/chosen": -1.975562572479248, + "eval_logps/rejected": -2.0109596252441406, + "eval_loss": 1.8058576583862305, + "eval_nll_loss": 1.72992742061615, + "eval_rewards/accuracies": 0.5400000214576721, + "eval_rewards/chosen": -0.1975562423467636, + "eval_rewards/margins": 0.00353970006108284, + "eval_rewards/rejected": -0.20109596848487854, + "eval_runtime": 53.0644, + "eval_samples_per_second": 9.423, + "eval_steps_per_second": 4.711, + "step": 1500 + }, + { + "epoch": 1.208, + "grad_norm": 0.2579845190048218, + "learning_rate": 3.5235555555555556e-06, + "log_odds_chosen": 0.10818381607532501, + "log_odds_ratio": -0.705440878868103, + "logits/chosen": 1.868035912513733, + "logits/rejected": 1.867733359336853, + "logps/chosen": -1.9699623584747314, + "logps/rejected": -2.055683135986328, + "loss": 1.7757377624511719, + "nll_loss": 1.7051931619644165, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19699624180793762, + "rewards/margins": 0.00857207365334034, + "rewards/rejected": -0.2055683135986328, + "step": 1510 + }, + { + "epoch": 1.216, + "grad_norm": 0.1755809634923935, + "learning_rate": 3.488e-06, + "log_odds_chosen": 0.2761983275413513, + "log_odds_ratio": -0.6301968693733215, + "logits/chosen": 1.9086204767227173, + "logits/rejected": 1.905368447303772, + "logps/chosen": -1.8556480407714844, + "logps/rejected": -2.0749871730804443, + "loss": 1.7898880004882813, + "nll_loss": 1.7268680334091187, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1855648010969162, + "rewards/margins": 0.02193392440676689, + "rewards/rejected": -0.2074987143278122, + "step": 1520 + }, + { + "epoch": 1.224, + "grad_norm": 0.17956162989139557, + "learning_rate": 3.4524444444444446e-06, + "log_odds_chosen": -0.14030766487121582, + "log_odds_ratio": -0.8471817970275879, + "logits/chosen": 1.790841817855835, + "logits/rejected": 1.7734079360961914, + "logps/chosen": -2.167576551437378, + "logps/rejected": -2.0355286598205566, + "loss": 1.8703905105590821, + "nll_loss": 1.7856724262237549, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.2167576551437378, + "rewards/margins": -0.013204795308411121, + "rewards/rejected": -0.20355287194252014, + "step": 1530 + }, + { + "epoch": 1.232, + "grad_norm": 0.23301228880882263, + "learning_rate": 3.416888888888889e-06, + "log_odds_chosen": 0.19442808628082275, + "log_odds_ratio": -0.6882377862930298, + "logits/chosen": 1.8302663564682007, + "logits/rejected": 1.734037160873413, + "logps/chosen": -1.9154956340789795, + "logps/rejected": -2.073822259902954, + "loss": 1.788707160949707, + "nll_loss": 1.7198832035064697, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19154959917068481, + "rewards/margins": 0.015832625329494476, + "rewards/rejected": -0.2073822021484375, + "step": 1540 + }, + { + "epoch": 1.24, + "grad_norm": 0.26810088753700256, + "learning_rate": 3.3813333333333335e-06, + "log_odds_chosen": 0.1409793198108673, + "log_odds_ratio": -0.6711575388908386, + "logits/chosen": 1.708510398864746, + "logits/rejected": 1.7307822704315186, + "logps/chosen": -1.9060001373291016, + "logps/rejected": -2.020759105682373, + "loss": 1.6427623748779296, + "nll_loss": 1.5756465196609497, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19060000777244568, + "rewards/margins": 0.011475888080894947, + "rewards/rejected": -0.20207588374614716, + "step": 1550 + }, + { + "epoch": 1.248, + "grad_norm": 0.2061609923839569, + "learning_rate": 3.3457777777777774e-06, + "log_odds_chosen": -0.006243853364139795, + "log_odds_ratio": -0.751319169998169, + "logits/chosen": 1.8289045095443726, + "logits/rejected": 1.8576431274414062, + "logps/chosen": -1.9457448720932007, + "logps/rejected": -1.9425971508026123, + "loss": 1.7755237579345704, + "nll_loss": 1.7003915309906006, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1945744901895523, + "rewards/margins": -0.000314765318762511, + "rewards/rejected": -0.19425971806049347, + "step": 1560 + }, + { + "epoch": 1.256, + "grad_norm": 0.22716669738292694, + "learning_rate": 3.310222222222222e-06, + "log_odds_chosen": -0.006850877311080694, + "log_odds_ratio": -0.7553213834762573, + "logits/chosen": 1.8095941543579102, + "logits/rejected": 1.8769876956939697, + "logps/chosen": -1.980285882949829, + "logps/rejected": -1.961301565170288, + "loss": 1.7145851135253907, + "nll_loss": 1.6390529870986938, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19802860915660858, + "rewards/margins": -0.0018984429771080613, + "rewards/rejected": -0.19613012671470642, + "step": 1570 + }, + { + "epoch": 1.264, + "grad_norm": 0.2657179534435272, + "learning_rate": 3.2746666666666663e-06, + "log_odds_chosen": 0.2205655872821808, + "log_odds_ratio": -0.6461672782897949, + "logits/chosen": 1.753933310508728, + "logits/rejected": 1.771116852760315, + "logps/chosen": -1.8913685083389282, + "logps/rejected": -2.0808615684509277, + "loss": 1.682852554321289, + "nll_loss": 1.6182358264923096, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.1891368329524994, + "rewards/margins": 0.01894933171570301, + "rewards/rejected": -0.20808616280555725, + "step": 1580 + }, + { + "epoch": 1.272, + "grad_norm": 0.22542321681976318, + "learning_rate": 3.239111111111111e-06, + "log_odds_chosen": 0.10629250854253769, + "log_odds_ratio": -0.720824658870697, + "logits/chosen": 1.9775307178497314, + "logits/rejected": 1.9780542850494385, + "logps/chosen": -1.898146629333496, + "logps/rejected": -1.9933639764785767, + "loss": 1.7453948974609375, + "nll_loss": 1.6733121871948242, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.18981468677520752, + "rewards/margins": 0.00952172465622425, + "rewards/rejected": -0.19933640956878662, + "step": 1590 + }, + { + "epoch": 1.28, + "grad_norm": 0.23766738176345825, + "learning_rate": 3.2035555555555553e-06, + "log_odds_chosen": 0.2950454652309418, + "log_odds_ratio": -0.646562397480011, + "logits/chosen": 1.8443620204925537, + "logits/rejected": 1.86007559299469, + "logps/chosen": -1.8024790287017822, + "logps/rejected": -2.0292277336120605, + "loss": 1.6824573516845702, + "nll_loss": 1.617801308631897, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18024791777133942, + "rewards/margins": 0.022674862295389175, + "rewards/rejected": -0.2029227763414383, + "step": 1600 + }, + { + "epoch": 1.28, + "eval_log_odds_chosen": 0.03487193211913109, + "eval_log_odds_ratio": -0.7592138648033142, + "eval_logits/chosen": 1.8726606369018555, + "eval_logits/rejected": 1.8562979698181152, + "eval_logps/chosen": -1.975594162940979, + "eval_logps/rejected": -2.0101206302642822, + "eval_loss": 1.7973711490631104, + "eval_nll_loss": 1.721449851989746, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.1975594162940979, + "eval_rewards/margins": 0.0034526519011706114, + "eval_rewards/rejected": -0.20101207494735718, + "eval_runtime": 53.2399, + "eval_samples_per_second": 9.391, + "eval_steps_per_second": 4.696, + "step": 1600 + }, + { + "epoch": 1.288, + "grad_norm": 0.18830521404743195, + "learning_rate": 3.168e-06, + "log_odds_chosen": 0.030118698254227638, + "log_odds_ratio": -0.7624102830886841, + "logits/chosen": 1.911879301071167, + "logits/rejected": 1.9447228908538818, + "logps/chosen": -2.0052685737609863, + "logps/rejected": -2.0304019451141357, + "loss": 1.7501512527465821, + "nll_loss": 1.6739099025726318, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2005268633365631, + "rewards/margins": 0.002513363491743803, + "rewards/rejected": -0.2030402421951294, + "step": 1610 + }, + { + "epoch": 1.296, + "grad_norm": 0.21792171895503998, + "learning_rate": 3.1324444444444443e-06, + "log_odds_chosen": 0.2323339432477951, + "log_odds_ratio": -0.6613295078277588, + "logits/chosen": 1.8960516452789307, + "logits/rejected": 1.8715391159057617, + "logps/chosen": -1.7550809383392334, + "logps/rejected": -1.9603891372680664, + "loss": 1.681437873840332, + "nll_loss": 1.6153051853179932, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17550811171531677, + "rewards/margins": 0.020530786365270615, + "rewards/rejected": -0.1960388869047165, + "step": 1620 + }, + { + "epoch": 1.304, + "grad_norm": 0.2356945127248764, + "learning_rate": 3.096888888888889e-06, + "log_odds_chosen": 0.1009933203458786, + "log_odds_ratio": -0.7096881866455078, + "logits/chosen": 1.835608720779419, + "logits/rejected": 1.8802179098129272, + "logps/chosen": -1.9452965259552002, + "logps/rejected": -2.0278029441833496, + "loss": 1.7155517578125, + "nll_loss": 1.644582986831665, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19452962279319763, + "rewards/margins": 0.008250661194324493, + "rewards/rejected": -0.20278029143810272, + "step": 1630 + }, + { + "epoch": 1.312, + "grad_norm": 0.22595520317554474, + "learning_rate": 3.0613333333333332e-06, + "log_odds_chosen": 0.1242746114730835, + "log_odds_ratio": -0.7135905027389526, + "logits/chosen": 1.8529307842254639, + "logits/rejected": 1.7763475179672241, + "logps/chosen": -1.9397623538970947, + "logps/rejected": -2.0596587657928467, + "loss": 1.7964458465576172, + "nll_loss": 1.7250868082046509, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1939762532711029, + "rewards/margins": 0.011989672668278217, + "rewards/rejected": -0.20596590638160706, + "step": 1640 + }, + { + "epoch": 1.32, + "grad_norm": 0.24827434122562408, + "learning_rate": 3.025777777777778e-06, + "log_odds_chosen": -0.030877679586410522, + "log_odds_ratio": -0.7905367612838745, + "logits/chosen": 1.881206750869751, + "logits/rejected": 1.9151794910430908, + "logps/chosen": -1.8896806240081787, + "logps/rejected": -1.8718645572662354, + "loss": 1.757571029663086, + "nll_loss": 1.6785169839859009, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.18896806240081787, + "rewards/margins": -0.0017816193867474794, + "rewards/rejected": -0.18718644976615906, + "step": 1650 + }, + { + "epoch": 1.328, + "grad_norm": 0.18985870480537415, + "learning_rate": 2.990222222222222e-06, + "log_odds_chosen": 0.20515501499176025, + "log_odds_ratio": -0.6580603718757629, + "logits/chosen": 2.0630805492401123, + "logits/rejected": 1.9575140476226807, + "logps/chosen": -1.8301007747650146, + "logps/rejected": -1.9865009784698486, + "loss": 1.735894012451172, + "nll_loss": 1.6700878143310547, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18301010131835938, + "rewards/margins": 0.01563999056816101, + "rewards/rejected": -0.19865009188652039, + "step": 1660 + }, + { + "epoch": 1.336, + "grad_norm": 0.26883554458618164, + "learning_rate": 2.9546666666666664e-06, + "log_odds_chosen": 0.261068195104599, + "log_odds_ratio": -0.6631112694740295, + "logits/chosen": 1.9392192363739014, + "logits/rejected": 1.8922193050384521, + "logps/chosen": -1.8296535015106201, + "logps/rejected": -2.047602891921997, + "loss": 1.6895122528076172, + "nll_loss": 1.6232010126113892, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18296536803245544, + "rewards/margins": 0.021794941276311874, + "rewards/rejected": -0.20476031303405762, + "step": 1670 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.2076137214899063, + "learning_rate": 2.9191111111111107e-06, + "log_odds_chosen": 0.08022954314947128, + "log_odds_ratio": -0.7145732045173645, + "logits/chosen": 1.903964638710022, + "logits/rejected": 1.8929418325424194, + "logps/chosen": -2.0156121253967285, + "logps/rejected": -2.0851552486419678, + "loss": 1.7568914413452148, + "nll_loss": 1.685434103012085, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20156121253967285, + "rewards/margins": 0.0069543360732495785, + "rewards/rejected": -0.20851555466651917, + "step": 1680 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.27581116557121277, + "learning_rate": 2.8835555555555554e-06, + "log_odds_chosen": 0.22984762489795685, + "log_odds_ratio": -0.6419785618782043, + "logits/chosen": 1.9384571313858032, + "logits/rejected": 1.9112564325332642, + "logps/chosen": -1.8010374307632446, + "logps/rejected": -1.9852313995361328, + "loss": 1.6407814025878906, + "nll_loss": 1.5765835046768188, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18010374903678894, + "rewards/margins": 0.018419397994875908, + "rewards/rejected": -0.1985231339931488, + "step": 1690 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.25675809383392334, + "learning_rate": 2.8479999999999997e-06, + "log_odds_chosen": 0.04376112297177315, + "log_odds_ratio": -0.753434956073761, + "logits/chosen": 1.9162076711654663, + "logits/rejected": 1.8986154794692993, + "logps/chosen": -1.8424959182739258, + "logps/rejected": -1.8933817148208618, + "loss": 1.7376501083374023, + "nll_loss": 1.662306547164917, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18424959480762482, + "rewards/margins": 0.0050885798409581184, + "rewards/rejected": -0.18933814764022827, + "step": 1700 + }, + { + "epoch": 1.3599999999999999, + "eval_log_odds_chosen": 0.03438640385866165, + "eval_log_odds_ratio": -0.7598773241043091, + "eval_logits/chosen": 1.9225552082061768, + "eval_logits/rejected": 1.9088143110275269, + "eval_logps/chosen": -1.9715032577514648, + "eval_logps/rejected": -2.0058045387268066, + "eval_loss": 1.7793264389038086, + "eval_nll_loss": 1.703338623046875, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.1971503496170044, + "eval_rewards/margins": 0.0034301180858165026, + "eval_rewards/rejected": -0.20058046281337738, + "eval_runtime": 53.2144, + "eval_samples_per_second": 9.396, + "eval_steps_per_second": 4.698, + "step": 1700 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.30271151661872864, + "learning_rate": 2.8124444444444444e-06, + "log_odds_chosen": 0.033895134925842285, + "log_odds_ratio": -0.729619562625885, + "logits/chosen": 1.8796659708023071, + "logits/rejected": 1.9074052572250366, + "logps/chosen": -2.0234713554382324, + "logps/rejected": -2.050968647003174, + "loss": 1.7294719696044922, + "nll_loss": 1.6565099954605103, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20234712958335876, + "rewards/margins": 0.0027496994007378817, + "rewards/rejected": -0.20509684085845947, + "step": 1710 + }, + { + "epoch": 1.376, + "grad_norm": 0.3309627175331116, + "learning_rate": 2.7768888888888886e-06, + "log_odds_chosen": -0.0035074115730822086, + "log_odds_ratio": -0.781356930732727, + "logits/chosen": 1.9202197790145874, + "logits/rejected": 1.8943029642105103, + "logps/chosen": -2.0067138671875, + "logps/rejected": -1.9892246723175049, + "loss": 1.7083927154541017, + "nll_loss": 1.630257248878479, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.20067138969898224, + "rewards/margins": -0.0017489356687292457, + "rewards/rejected": -0.19892247021198273, + "step": 1720 + }, + { + "epoch": 1.384, + "grad_norm": 0.3594350516796112, + "learning_rate": 2.7413333333333333e-06, + "log_odds_chosen": 0.11887629330158234, + "log_odds_ratio": -0.6932175159454346, + "logits/chosen": 1.9436604976654053, + "logits/rejected": 1.9098408222198486, + "logps/chosen": -1.9862394332885742, + "logps/rejected": -2.096081495285034, + "loss": 1.758765411376953, + "nll_loss": 1.689443588256836, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19862394034862518, + "rewards/margins": 0.010984222404658794, + "rewards/rejected": -0.20960816740989685, + "step": 1730 + }, + { + "epoch": 1.392, + "grad_norm": 0.36076149344444275, + "learning_rate": 2.7057777777777776e-06, + "log_odds_chosen": 0.21053990721702576, + "log_odds_ratio": -0.6388333439826965, + "logits/chosen": 1.8976023197174072, + "logits/rejected": 1.9002765417099, + "logps/chosen": -1.854945182800293, + "logps/rejected": -2.0353338718414307, + "loss": 1.7823978424072267, + "nll_loss": 1.7185142040252686, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18549451231956482, + "rewards/margins": 0.018038874492049217, + "rewards/rejected": -0.2035333663225174, + "step": 1740 + }, + { + "epoch": 1.4, + "grad_norm": 0.32271715998649597, + "learning_rate": 2.6702222222222223e-06, + "log_odds_chosen": 0.1977493166923523, + "log_odds_ratio": -0.6741037368774414, + "logits/chosen": 1.8466112613677979, + "logits/rejected": 1.8674328327178955, + "logps/chosen": -1.9169870615005493, + "logps/rejected": -2.091139793395996, + "loss": 1.6603471755981445, + "nll_loss": 1.5929368734359741, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19169871509075165, + "rewards/margins": 0.01741526648402214, + "rewards/rejected": -0.2091139853000641, + "step": 1750 + }, + { + "epoch": 1.408, + "grad_norm": 0.2843083441257477, + "learning_rate": 2.6346666666666665e-06, + "log_odds_chosen": 0.17168815433979034, + "log_odds_ratio": -0.6685199737548828, + "logits/chosen": 1.9677289724349976, + "logits/rejected": 1.964511513710022, + "logps/chosen": -1.8186986446380615, + "logps/rejected": -1.9506231546401978, + "loss": 1.723248291015625, + "nll_loss": 1.656396508216858, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.18186986446380615, + "rewards/margins": 0.013192457146942616, + "rewards/rejected": -0.1950623244047165, + "step": 1760 + }, + { + "epoch": 1.416, + "grad_norm": 0.2275550216436386, + "learning_rate": 2.5991111111111112e-06, + "log_odds_chosen": 0.16259385645389557, + "log_odds_ratio": -0.6815955638885498, + "logits/chosen": 2.034264326095581, + "logits/rejected": 1.9869372844696045, + "logps/chosen": -1.8783130645751953, + "logps/rejected": -2.0175511837005615, + "loss": 1.6774738311767579, + "nll_loss": 1.6093145608901978, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18783126771450043, + "rewards/margins": 0.013923834078013897, + "rewards/rejected": -0.2017551213502884, + "step": 1770 + }, + { + "epoch": 1.424, + "grad_norm": 0.2949213683605194, + "learning_rate": 2.563555555555555e-06, + "log_odds_chosen": 0.14697907865047455, + "log_odds_ratio": -0.6774402856826782, + "logits/chosen": 1.9659030437469482, + "logits/rejected": 1.9208488464355469, + "logps/chosen": -1.8670680522918701, + "logps/rejected": -1.9739803075790405, + "loss": 1.7409107208251953, + "nll_loss": 1.6731666326522827, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1867068111896515, + "rewards/margins": 0.010691216215491295, + "rewards/rejected": -0.19739803671836853, + "step": 1780 + }, + { + "epoch": 1.432, + "grad_norm": 0.2911042869091034, + "learning_rate": 2.5279999999999998e-06, + "log_odds_chosen": 0.03021297976374626, + "log_odds_ratio": -0.7644273042678833, + "logits/chosen": 1.9712092876434326, + "logits/rejected": 1.9553823471069336, + "logps/chosen": -1.9173786640167236, + "logps/rejected": -1.9468326568603516, + "loss": 1.6529146194458009, + "nll_loss": 1.5764720439910889, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.19173786044120789, + "rewards/margins": 0.0029454149771481752, + "rewards/rejected": -0.1946832686662674, + "step": 1790 + }, + { + "epoch": 1.44, + "grad_norm": 0.19435548782348633, + "learning_rate": 2.492444444444444e-06, + "log_odds_chosen": 0.015966754406690598, + "log_odds_ratio": -0.7808157205581665, + "logits/chosen": 1.9463411569595337, + "logits/rejected": 1.9502232074737549, + "logps/chosen": -1.9179836511611938, + "logps/rejected": -1.9580169916152954, + "loss": 1.7467466354370118, + "nll_loss": 1.6686649322509766, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1917983591556549, + "rewards/margins": 0.004003344569355249, + "rewards/rejected": -0.19580169022083282, + "step": 1800 + }, + { + "epoch": 1.44, + "eval_log_odds_chosen": 0.03914888948202133, + "eval_log_odds_ratio": -0.7578481435775757, + "eval_logits/chosen": 1.948970913887024, + "eval_logits/rejected": 1.9370065927505493, + "eval_logps/chosen": -1.975156545639038, + "eval_logps/rejected": -2.0133554935455322, + "eval_loss": 1.742074966430664, + "eval_nll_loss": 1.666290044784546, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.19751565158367157, + "eval_rewards/margins": 0.0038199129048734903, + "eval_rewards/rejected": -0.20133554935455322, + "eval_runtime": 53.2113, + "eval_samples_per_second": 9.396, + "eval_steps_per_second": 4.698, + "step": 1800 + }, + { + "epoch": 1.448, + "grad_norm": 0.2554956078529358, + "learning_rate": 2.4568888888888887e-06, + "log_odds_chosen": 0.07928521186113358, + "log_odds_ratio": -0.7100598812103271, + "logits/chosen": 1.9834693670272827, + "logits/rejected": 1.9129583835601807, + "logps/chosen": -1.9404462575912476, + "logps/rejected": -2.0127291679382324, + "loss": 1.6566553115844727, + "nll_loss": 1.5856493711471558, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19404461979866028, + "rewards/margins": 0.007228270173072815, + "rewards/rejected": -0.20127291977405548, + "step": 1810 + }, + { + "epoch": 1.456, + "grad_norm": 0.23811131715774536, + "learning_rate": 2.421333333333333e-06, + "log_odds_chosen": 0.35999929904937744, + "log_odds_ratio": -0.6554363965988159, + "logits/chosen": 1.9690263271331787, + "logits/rejected": 1.9498993158340454, + "logps/chosen": -1.8379662036895752, + "logps/rejected": -2.1561756134033203, + "loss": 1.6408489227294922, + "nll_loss": 1.5753052234649658, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18379661440849304, + "rewards/margins": 0.03182096406817436, + "rewards/rejected": -0.2156175673007965, + "step": 1820 + }, + { + "epoch": 1.464, + "grad_norm": 0.16817817091941833, + "learning_rate": 2.3857777777777777e-06, + "log_odds_chosen": 0.09530925005674362, + "log_odds_ratio": -0.7019798159599304, + "logits/chosen": 2.0015780925750732, + "logits/rejected": 1.9477115869522095, + "logps/chosen": -1.8819535970687866, + "logps/rejected": -1.9748623371124268, + "loss": 1.68689022064209, + "nll_loss": 1.6166921854019165, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.1881953626871109, + "rewards/margins": 0.009290854446589947, + "rewards/rejected": -0.19748620688915253, + "step": 1830 + }, + { + "epoch": 1.472, + "grad_norm": 0.25315138697624207, + "learning_rate": 2.3502222222222224e-06, + "log_odds_chosen": 0.019454699009656906, + "log_odds_ratio": -0.7317359447479248, + "logits/chosen": 1.866839051246643, + "logits/rejected": 1.8470100164413452, + "logps/chosen": -1.9150846004486084, + "logps/rejected": -1.9340898990631104, + "loss": 1.6823833465576172, + "nll_loss": 1.609209656715393, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1915084570646286, + "rewards/margins": 0.0019005045760422945, + "rewards/rejected": -0.19340898096561432, + "step": 1840 + }, + { + "epoch": 1.48, + "grad_norm": 0.20591425895690918, + "learning_rate": 2.3146666666666666e-06, + "log_odds_chosen": 0.18473069369792938, + "log_odds_ratio": -0.6696128845214844, + "logits/chosen": 2.0007896423339844, + "logits/rejected": 1.9895092248916626, + "logps/chosen": -1.774839162826538, + "logps/rejected": -1.9293769598007202, + "loss": 1.626854133605957, + "nll_loss": 1.5598928928375244, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1774839460849762, + "rewards/margins": 0.015453780069947243, + "rewards/rejected": -0.1929377168416977, + "step": 1850 + }, + { + "epoch": 1.488, + "grad_norm": 0.19470006227493286, + "learning_rate": 2.2791111111111113e-06, + "log_odds_chosen": 0.08205848187208176, + "log_odds_ratio": -0.7239702343940735, + "logits/chosen": 1.9986556768417358, + "logits/rejected": 1.9231617450714111, + "logps/chosen": -1.8390166759490967, + "logps/rejected": -1.9078738689422607, + "loss": 1.6770751953125, + "nll_loss": 1.6046781539916992, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18390165269374847, + "rewards/margins": 0.0068857138976454735, + "rewards/rejected": -0.1907874047756195, + "step": 1860 + }, + { + "epoch": 1.496, + "grad_norm": 0.1899077147245407, + "learning_rate": 2.2435555555555556e-06, + "log_odds_chosen": 0.08952027559280396, + "log_odds_ratio": -0.7206470370292664, + "logits/chosen": 1.937567114830017, + "logits/rejected": 1.8602908849716187, + "logps/chosen": -1.9591258764266968, + "logps/rejected": -2.041567802429199, + "loss": 1.7079832077026367, + "nll_loss": 1.6359182596206665, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19591261446475983, + "rewards/margins": 0.008244190365076065, + "rewards/rejected": -0.2041568011045456, + "step": 1870 + }, + { + "epoch": 1.504, + "grad_norm": 0.21359078586101532, + "learning_rate": 2.2080000000000003e-06, + "log_odds_chosen": 0.12578465044498444, + "log_odds_ratio": -0.6887935400009155, + "logits/chosen": 1.9823474884033203, + "logits/rejected": 1.9967457056045532, + "logps/chosen": -1.842449426651001, + "logps/rejected": -1.9473011493682861, + "loss": 1.7107582092285156, + "nll_loss": 1.6418790817260742, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18424490094184875, + "rewards/margins": 0.010485194623470306, + "rewards/rejected": -0.19473011791706085, + "step": 1880 + }, + { + "epoch": 1.512, + "grad_norm": 0.20844241976737976, + "learning_rate": 2.172444444444444e-06, + "log_odds_chosen": 0.07038307189941406, + "log_odds_ratio": -0.7394507527351379, + "logits/chosen": 1.9976749420166016, + "logits/rejected": 1.9397321939468384, + "logps/chosen": -1.891405701637268, + "logps/rejected": -1.9321295022964478, + "loss": 1.6750732421875, + "nll_loss": 1.6011279821395874, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.18914058804512024, + "rewards/margins": 0.0040723890997469425, + "rewards/rejected": -0.19321295619010925, + "step": 1890 + }, + { + "epoch": 1.52, + "grad_norm": 0.19770869612693787, + "learning_rate": 2.136888888888889e-06, + "log_odds_chosen": 0.15921947360038757, + "log_odds_ratio": -0.6770394444465637, + "logits/chosen": 2.00447154045105, + "logits/rejected": 1.9829330444335938, + "logps/chosen": -1.8935775756835938, + "logps/rejected": -2.022252321243286, + "loss": 1.694720458984375, + "nll_loss": 1.6270164251327515, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18935778737068176, + "rewards/margins": 0.012867463752627373, + "rewards/rejected": -0.20222525298595428, + "step": 1900 + }, + { + "epoch": 1.52, + "eval_log_odds_chosen": 0.03703959658741951, + "eval_log_odds_ratio": -0.7581658959388733, + "eval_logits/chosen": 1.9618418216705322, + "eval_logits/rejected": 1.949436902999878, + "eval_logps/chosen": -1.9719505310058594, + "eval_logps/rejected": -2.0083794593811035, + "eval_loss": 1.7386270761489868, + "eval_nll_loss": 1.6628105640411377, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.19719506800174713, + "eval_rewards/margins": 0.003642885247245431, + "eval_rewards/rejected": -0.20083795487880707, + "eval_runtime": 53.1784, + "eval_samples_per_second": 9.402, + "eval_steps_per_second": 4.701, + "step": 1900 + }, + { + "epoch": 1.528, + "grad_norm": 0.20963209867477417, + "learning_rate": 2.101333333333333e-06, + "log_odds_chosen": 0.029030675068497658, + "log_odds_ratio": -0.7270594835281372, + "logits/chosen": 1.9133045673370361, + "logits/rejected": 1.9300514459609985, + "logps/chosen": -1.94406259059906, + "logps/rejected": -1.9692909717559814, + "loss": 1.6782550811767578, + "nll_loss": 1.6055490970611572, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19440627098083496, + "rewards/margins": 0.002522836672142148, + "rewards/rejected": -0.19692911207675934, + "step": 1910 + }, + { + "epoch": 1.536, + "grad_norm": 0.2001485675573349, + "learning_rate": 2.0657777777777778e-06, + "log_odds_chosen": 0.2066613882780075, + "log_odds_ratio": -0.6765174865722656, + "logits/chosen": 2.0294079780578613, + "logits/rejected": 1.97390878200531, + "logps/chosen": -1.9244718551635742, + "logps/rejected": -2.0865769386291504, + "loss": 1.739645767211914, + "nll_loss": 1.6719939708709717, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19244717061519623, + "rewards/margins": 0.01621050387620926, + "rewards/rejected": -0.2086576670408249, + "step": 1920 + }, + { + "epoch": 1.544, + "grad_norm": 0.159030944108963, + "learning_rate": 2.030222222222222e-06, + "log_odds_chosen": 0.14436787366867065, + "log_odds_ratio": -0.6960445046424866, + "logits/chosen": 2.0115323066711426, + "logits/rejected": 2.0005786418914795, + "logps/chosen": -1.8971054553985596, + "logps/rejected": -2.019014835357666, + "loss": 1.7051689147949218, + "nll_loss": 1.6355644464492798, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18971054255962372, + "rewards/margins": 0.012190921232104301, + "rewards/rejected": -0.20190146565437317, + "step": 1930 + }, + { + "epoch": 1.552, + "grad_norm": 0.20508001744747162, + "learning_rate": 1.9946666666666663e-06, + "log_odds_chosen": 0.012411686591804028, + "log_odds_ratio": -0.7656394243240356, + "logits/chosen": 1.9236491918563843, + "logits/rejected": 1.928847312927246, + "logps/chosen": -1.9116830825805664, + "logps/rejected": -1.9297691583633423, + "loss": 1.6666793823242188, + "nll_loss": 1.5901156663894653, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19116830825805664, + "rewards/margins": 0.0018086109776049852, + "rewards/rejected": -0.1929769217967987, + "step": 1940 + }, + { + "epoch": 1.56, + "grad_norm": 0.17614884674549103, + "learning_rate": 1.959111111111111e-06, + "log_odds_chosen": -0.00976226944476366, + "log_odds_ratio": -0.7704049348831177, + "logits/chosen": 1.9787086248397827, + "logits/rejected": 1.9729989767074585, + "logps/chosen": -1.9898617267608643, + "logps/rejected": -1.9705969095230103, + "loss": 1.6890064239501954, + "nll_loss": 1.6119661331176758, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19898617267608643, + "rewards/margins": -0.0019264683360233903, + "rewards/rejected": -0.19705967605113983, + "step": 1950 + }, + { + "epoch": 1.568, + "grad_norm": 0.22970254719257355, + "learning_rate": 1.9235555555555553e-06, + "log_odds_chosen": 0.11413507163524628, + "log_odds_ratio": -0.7118924856185913, + "logits/chosen": 1.9141197204589844, + "logits/rejected": 1.927093744277954, + "logps/chosen": -1.9108161926269531, + "logps/rejected": -2.0101070404052734, + "loss": 1.6295789718627929, + "nll_loss": 1.558389663696289, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19108164310455322, + "rewards/margins": 0.009929090738296509, + "rewards/rejected": -0.20101073384284973, + "step": 1960 + }, + { + "epoch": 1.576, + "grad_norm": 0.3554319739341736, + "learning_rate": 1.8879999999999998e-06, + "log_odds_chosen": 0.139635369181633, + "log_odds_ratio": -0.7322131395339966, + "logits/chosen": 1.9641954898834229, + "logits/rejected": 1.9771820306777954, + "logps/chosen": -1.919316053390503, + "logps/rejected": -2.048070192337036, + "loss": 1.6570716857910157, + "nll_loss": 1.5838501453399658, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1919316053390503, + "rewards/margins": 0.012875407934188843, + "rewards/rejected": -0.20480699837207794, + "step": 1970 + }, + { + "epoch": 1.584, + "grad_norm": 0.2408200353384018, + "learning_rate": 1.8524444444444442e-06, + "log_odds_chosen": 0.09701336920261383, + "log_odds_ratio": -0.7181066274642944, + "logits/chosen": 1.8994344472885132, + "logits/rejected": 1.8975048065185547, + "logps/chosen": -1.858982801437378, + "logps/rejected": -1.9471409320831299, + "loss": 1.618194580078125, + "nll_loss": 1.5463839769363403, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18589827418327332, + "rewards/margins": 0.008815804496407509, + "rewards/rejected": -0.19471409916877747, + "step": 1980 + }, + { + "epoch": 1.592, + "grad_norm": 0.23069331049919128, + "learning_rate": 1.816888888888889e-06, + "log_odds_chosen": 0.1935921609401703, + "log_odds_ratio": -0.6571868062019348, + "logits/chosen": 1.9328809976577759, + "logits/rejected": 1.9121859073638916, + "logps/chosen": -1.773535132408142, + "logps/rejected": -1.9285399913787842, + "loss": 1.6073417663574219, + "nll_loss": 1.5416228771209717, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.17735353112220764, + "rewards/margins": 0.015500485897064209, + "rewards/rejected": -0.19285401701927185, + "step": 1990 + }, + { + "epoch": 1.6, + "grad_norm": 0.2220882922410965, + "learning_rate": 1.7813333333333334e-06, + "log_odds_chosen": 0.18786410987377167, + "log_odds_ratio": -0.6733223795890808, + "logits/chosen": 1.9360544681549072, + "logits/rejected": 1.9118951559066772, + "logps/chosen": -1.8457376956939697, + "logps/rejected": -2.006767988204956, + "loss": 1.6750938415527343, + "nll_loss": 1.6077613830566406, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18457373976707458, + "rewards/margins": 0.01610305719077587, + "rewards/rejected": -0.2006767988204956, + "step": 2000 + }, + { + "epoch": 1.6, + "eval_log_odds_chosen": 0.037848811596632004, + "eval_log_odds_ratio": -0.7588346004486084, + "eval_logits/chosen": 1.9487451314926147, + "eval_logits/rejected": 1.937835693359375, + "eval_logps/chosen": -1.9709864854812622, + "eval_logps/rejected": -2.008183479309082, + "eval_loss": 1.7392076253890991, + "eval_nll_loss": 1.663324236869812, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.19709864258766174, + "eval_rewards/margins": 0.0037197000347077847, + "eval_rewards/rejected": -0.20081835985183716, + "eval_runtime": 53.1547, + "eval_samples_per_second": 9.406, + "eval_steps_per_second": 4.703, + "step": 2000 + }, + { + "epoch": 1.608, + "grad_norm": 0.17311710119247437, + "learning_rate": 1.7457777777777779e-06, + "log_odds_chosen": 0.027943268418312073, + "log_odds_ratio": -0.7397163510322571, + "logits/chosen": 1.8568236827850342, + "logits/rejected": 1.851446509361267, + "logps/chosen": -1.9068208932876587, + "logps/rejected": -1.9251091480255127, + "loss": 1.649449348449707, + "nll_loss": 1.5754777193069458, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19068209826946259, + "rewards/margins": 0.0018288027495145798, + "rewards/rejected": -0.1925109177827835, + "step": 2010 + }, + { + "epoch": 1.616, + "grad_norm": 0.22997044026851654, + "learning_rate": 1.7102222222222221e-06, + "log_odds_chosen": 0.1097046285867691, + "log_odds_ratio": -0.7040611505508423, + "logits/chosen": 1.9243297576904297, + "logits/rejected": 1.963524580001831, + "logps/chosen": -1.8955605030059814, + "logps/rejected": -1.9760913848876953, + "loss": 1.6310876846313476, + "nll_loss": 1.5606815814971924, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1895560324192047, + "rewards/margins": 0.00805308111011982, + "rewards/rejected": -0.19760914146900177, + "step": 2020 + }, + { + "epoch": 1.624, + "grad_norm": 0.1864926517009735, + "learning_rate": 1.6746666666666666e-06, + "log_odds_chosen": 0.16068391501903534, + "log_odds_ratio": -0.6889876127243042, + "logits/chosen": 2.0080339908599854, + "logits/rejected": 1.954077959060669, + "logps/chosen": -1.8840856552124023, + "logps/rejected": -2.02380108833313, + "loss": 1.6726764678955077, + "nll_loss": 1.6037778854370117, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18840858340263367, + "rewards/margins": 0.013971516862511635, + "rewards/rejected": -0.20238009095191956, + "step": 2030 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.20873874425888062, + "learning_rate": 1.6391111111111111e-06, + "log_odds_chosen": 0.07395173609256744, + "log_odds_ratio": -0.7397626042366028, + "logits/chosen": 1.9810717105865479, + "logits/rejected": 1.9605712890625, + "logps/chosen": -1.887573003768921, + "logps/rejected": -1.9506385326385498, + "loss": 1.698026466369629, + "nll_loss": 1.6240499019622803, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18875731527805328, + "rewards/margins": 0.006306570954620838, + "rewards/rejected": -0.19506387412548065, + "step": 2040 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.19733679294586182, + "learning_rate": 1.6035555555555556e-06, + "log_odds_chosen": 0.044853754341602325, + "log_odds_ratio": -0.7555680274963379, + "logits/chosen": 1.9993703365325928, + "logits/rejected": 1.9761343002319336, + "logps/chosen": -1.9512548446655273, + "logps/rejected": -1.9669195413589478, + "loss": 1.6620542526245117, + "nll_loss": 1.5864975452423096, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1951254904270172, + "rewards/margins": 0.0015664601232856512, + "rewards/rejected": -0.19669198989868164, + "step": 2050 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.21827088296413422, + "learning_rate": 1.568e-06, + "log_odds_chosen": -0.04072676971554756, + "log_odds_ratio": -0.8474240303039551, + "logits/chosen": 1.9101537466049194, + "logits/rejected": 1.9583427906036377, + "logps/chosen": -1.9866435527801514, + "logps/rejected": -1.9379768371582031, + "loss": 1.6317344665527345, + "nll_loss": 1.5469920635223389, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1986643671989441, + "rewards/margins": -0.00486668711528182, + "rewards/rejected": -0.19379767775535583, + "step": 2060 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.17806994915008545, + "learning_rate": 1.5324444444444443e-06, + "log_odds_chosen": -0.01947469636797905, + "log_odds_ratio": -0.7560760378837585, + "logits/chosen": 1.9823522567749023, + "logits/rejected": 1.9231178760528564, + "logps/chosen": -1.9635775089263916, + "logps/rejected": -1.949378252029419, + "loss": 1.6949882507324219, + "nll_loss": 1.6193805932998657, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.19635775685310364, + "rewards/margins": -0.0014199145371094346, + "rewards/rejected": -0.1949378401041031, + "step": 2070 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.18484769761562347, + "learning_rate": 1.4968888888888888e-06, + "log_odds_chosen": -0.011743311770260334, + "log_odds_ratio": -0.7704417109489441, + "logits/chosen": 1.9684324264526367, + "logits/rejected": 1.92658269405365, + "logps/chosen": -1.9983644485473633, + "logps/rejected": -1.9799554347991943, + "loss": 1.764794158935547, + "nll_loss": 1.687750220298767, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19983646273612976, + "rewards/margins": -0.0018409093609079719, + "rewards/rejected": -0.19799552857875824, + "step": 2080 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.19327755272388458, + "learning_rate": 1.4613333333333333e-06, + "log_odds_chosen": -0.010004746727645397, + "log_odds_ratio": -0.7772399187088013, + "logits/chosen": 1.9115307331085205, + "logits/rejected": 1.8927816152572632, + "logps/chosen": -1.9729045629501343, + "logps/rejected": -1.9708572626113892, + "loss": 1.6671720504760743, + "nll_loss": 1.5894482135772705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19729046523571014, + "rewards/margins": -0.00020474250777624547, + "rewards/rejected": -0.19708572328090668, + "step": 2090 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.18267710506916046, + "learning_rate": 1.4257777777777778e-06, + "log_odds_chosen": 0.03775619715452194, + "log_odds_ratio": -0.7274637222290039, + "logits/chosen": 1.965767502784729, + "logits/rejected": 1.9332348108291626, + "logps/chosen": -1.958343744277954, + "logps/rejected": -1.9895089864730835, + "loss": 1.6861879348754882, + "nll_loss": 1.6134417057037354, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19583436846733093, + "rewards/margins": 0.003116548527032137, + "rewards/rejected": -0.1989509016275406, + "step": 2100 + }, + { + "epoch": 1.6800000000000002, + "eval_log_odds_chosen": 0.035907384008169174, + "eval_log_odds_ratio": -0.7580814361572266, + "eval_logits/chosen": 1.9721317291259766, + "eval_logits/rejected": 1.9608474969863892, + "eval_logps/chosen": -1.9707766771316528, + "eval_logps/rejected": -2.0064241886138916, + "eval_loss": 1.7385348081588745, + "eval_nll_loss": 1.662726640701294, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.1970776617527008, + "eval_rewards/margins": 0.003564756363630295, + "eval_rewards/rejected": -0.2006424516439438, + "eval_runtime": 53.1332, + "eval_samples_per_second": 9.41, + "eval_steps_per_second": 4.705, + "step": 2100 + }, + { + "epoch": 1.688, + "grad_norm": 0.2182595431804657, + "learning_rate": 1.3902222222222222e-06, + "log_odds_chosen": 0.07436065375804901, + "log_odds_ratio": -0.7376033067703247, + "logits/chosen": 2.033583641052246, + "logits/rejected": 1.9371395111083984, + "logps/chosen": -1.9411036968231201, + "logps/rejected": -2.0054962635040283, + "loss": 1.6851945877075196, + "nll_loss": 1.6114343404769897, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19411036372184753, + "rewards/margins": 0.006439276039600372, + "rewards/rejected": -0.2005496472120285, + "step": 2110 + }, + { + "epoch": 1.696, + "grad_norm": 0.2041226029396057, + "learning_rate": 1.3546666666666667e-06, + "log_odds_chosen": 0.08603398501873016, + "log_odds_ratio": -0.7840120196342468, + "logits/chosen": 2.013866662979126, + "logits/rejected": 2.004415988922119, + "logps/chosen": -1.996930480003357, + "logps/rejected": -2.057905912399292, + "loss": 1.6655797958374023, + "nll_loss": 1.5871785879135132, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19969305396080017, + "rewards/margins": 0.006097549106925726, + "rewards/rejected": -0.20579060912132263, + "step": 2120 + }, + { + "epoch": 1.704, + "grad_norm": 0.19871436059474945, + "learning_rate": 1.319111111111111e-06, + "log_odds_chosen": 0.2410924881696701, + "log_odds_ratio": -0.6583682298660278, + "logits/chosen": 1.9730018377304077, + "logits/rejected": 1.934480905532837, + "logps/chosen": -1.8520793914794922, + "logps/rejected": -2.0455081462860107, + "loss": 1.6601404190063476, + "nll_loss": 1.5943034887313843, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18520793318748474, + "rewards/margins": 0.019342893734574318, + "rewards/rejected": -0.2045508176088333, + "step": 2130 + }, + { + "epoch": 1.712, + "grad_norm": 0.13996213674545288, + "learning_rate": 1.2835555555555555e-06, + "log_odds_chosen": 0.059573762118816376, + "log_odds_ratio": -0.7112005352973938, + "logits/chosen": 1.897156000137329, + "logits/rejected": 1.918378472328186, + "logps/chosen": -1.9309139251708984, + "logps/rejected": -1.9782907962799072, + "loss": 1.6874992370605468, + "nll_loss": 1.6163790225982666, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19309139251708984, + "rewards/margins": 0.004737673792988062, + "rewards/rejected": -0.19782906770706177, + "step": 2140 + }, + { + "epoch": 1.72, + "grad_norm": 0.21009710431098938, + "learning_rate": 1.248e-06, + "log_odds_chosen": 0.09428197890520096, + "log_odds_ratio": -0.7023419141769409, + "logits/chosen": 2.001282215118408, + "logits/rejected": 2.0065598487854004, + "logps/chosen": -1.8621037006378174, + "logps/rejected": -1.9308359622955322, + "loss": 1.670359992980957, + "nll_loss": 1.6001259088516235, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18621034920215607, + "rewards/margins": 0.006873233709484339, + "rewards/rejected": -0.19308359920978546, + "step": 2150 + }, + { + "epoch": 1.728, + "grad_norm": 0.18963180482387543, + "learning_rate": 1.2124444444444444e-06, + "log_odds_chosen": 0.1198032945394516, + "log_odds_ratio": -0.786547839641571, + "logits/chosen": 1.8857593536376953, + "logits/rejected": 1.8772966861724854, + "logps/chosen": -1.964369535446167, + "logps/rejected": -2.0974040031433105, + "loss": 1.6600170135498047, + "nll_loss": 1.581362009048462, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19643697142601013, + "rewards/margins": 0.013303431682288647, + "rewards/rejected": -0.20974040031433105, + "step": 2160 + }, + { + "epoch": 1.736, + "grad_norm": 0.18647311627864838, + "learning_rate": 1.176888888888889e-06, + "log_odds_chosen": 0.28815513849258423, + "log_odds_ratio": -0.6971661448478699, + "logits/chosen": 1.994585394859314, + "logits/rejected": 1.9948902130126953, + "logps/chosen": -1.9381263256072998, + "logps/rejected": -2.205533027648926, + "loss": 1.6973844528198243, + "nll_loss": 1.627667784690857, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19381265342235565, + "rewards/margins": 0.02674066089093685, + "rewards/rejected": -0.22055332362651825, + "step": 2170 + }, + { + "epoch": 1.744, + "grad_norm": 0.2129177749156952, + "learning_rate": 1.1413333333333332e-06, + "log_odds_chosen": 0.17926549911499023, + "log_odds_ratio": -0.6743106245994568, + "logits/chosen": 1.8731542825698853, + "logits/rejected": 1.8746894598007202, + "logps/chosen": -1.849504828453064, + "logps/rejected": -2.0065646171569824, + "loss": 1.6047693252563477, + "nll_loss": 1.5373382568359375, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.18495047092437744, + "rewards/margins": 0.015705987811088562, + "rewards/rejected": -0.200656458735466, + "step": 2180 + }, + { + "epoch": 1.752, + "grad_norm": 0.1805354654788971, + "learning_rate": 1.1057777777777777e-06, + "log_odds_chosen": 0.010342921130359173, + "log_odds_ratio": -0.779272735118866, + "logits/chosen": 2.0200014114379883, + "logits/rejected": 2.009512424468994, + "logps/chosen": -2.044440984725952, + "logps/rejected": -2.0541317462921143, + "loss": 1.7150918960571289, + "nll_loss": 1.6371647119522095, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -0.20444409549236298, + "rewards/margins": 0.0009690720471553504, + "rewards/rejected": -0.20541317760944366, + "step": 2190 + }, + { + "epoch": 1.76, + "grad_norm": 0.2175653576850891, + "learning_rate": 1.0702222222222221e-06, + "log_odds_chosen": 0.032114505767822266, + "log_odds_ratio": -0.7545596361160278, + "logits/chosen": 1.8662312030792236, + "logits/rejected": 1.8274072408676147, + "logps/chosen": -1.9221569299697876, + "logps/rejected": -1.9536468982696533, + "loss": 1.6281578063964843, + "nll_loss": 1.5527019500732422, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1922157108783722, + "rewards/margins": 0.003149004653096199, + "rewards/rejected": -0.19536468386650085, + "step": 2200 + }, + { + "epoch": 1.76, + "eval_log_odds_chosen": 0.030956413596868515, + "eval_log_odds_ratio": -0.7591837644577026, + "eval_logits/chosen": 1.965054988861084, + "eval_logits/rejected": 1.9538328647613525, + "eval_logps/chosen": -1.9695806503295898, + "eval_logps/rejected": -2.000894784927368, + "eval_loss": 1.7384891510009766, + "eval_nll_loss": 1.662570834159851, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.19695807993412018, + "eval_rewards/margins": 0.0031314240768551826, + "eval_rewards/rejected": -0.2000894993543625, + "eval_runtime": 52.991, + "eval_samples_per_second": 9.436, + "eval_steps_per_second": 4.718, + "step": 2200 + }, + { + "epoch": 1.768, + "grad_norm": 0.18804052472114563, + "learning_rate": 1.0346666666666666e-06, + "log_odds_chosen": 0.047188155353069305, + "log_odds_ratio": -0.7515336275100708, + "logits/chosen": 1.929109811782837, + "logits/rejected": 1.8681122064590454, + "logps/chosen": -1.8629602193832397, + "logps/rejected": -1.8899915218353271, + "loss": 1.6528600692749023, + "nll_loss": 1.5777066946029663, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1862960159778595, + "rewards/margins": 0.0027031381614506245, + "rewards/rejected": -0.18899916112422943, + "step": 2210 + }, + { + "epoch": 1.776, + "grad_norm": 0.15381857752799988, + "learning_rate": 9.99111111111111e-07, + "log_odds_chosen": 0.11675859987735748, + "log_odds_ratio": -0.7371183037757874, + "logits/chosen": 1.8915207386016846, + "logits/rejected": 1.9328769445419312, + "logps/chosen": -1.8506667613983154, + "logps/rejected": -1.961627721786499, + "loss": 1.6869701385498046, + "nll_loss": 1.6132583618164062, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.18506669998168945, + "rewards/margins": 0.011096075177192688, + "rewards/rejected": -0.19616279006004333, + "step": 2220 + }, + { + "epoch": 1.784, + "grad_norm": 0.19392751157283783, + "learning_rate": 9.635555555555556e-07, + "log_odds_chosen": 0.06520286947488785, + "log_odds_ratio": -0.7269853353500366, + "logits/chosen": 1.945910096168518, + "logits/rejected": 1.9712088108062744, + "logps/chosen": -1.8130731582641602, + "logps/rejected": -1.8833789825439453, + "loss": 1.6304901123046875, + "nll_loss": 1.5577917098999023, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.18130730092525482, + "rewards/margins": 0.0070305936969816685, + "rewards/rejected": -0.18833789229393005, + "step": 2230 + }, + { + "epoch": 1.792, + "grad_norm": 0.19160452485084534, + "learning_rate": 9.28e-07, + "log_odds_chosen": 0.11965823173522949, + "log_odds_ratio": -0.7161513566970825, + "logits/chosen": 1.8624318838119507, + "logits/rejected": 1.8819353580474854, + "logps/chosen": -1.8883718252182007, + "logps/rejected": -2.0014452934265137, + "loss": 1.6923982620239257, + "nll_loss": 1.6207830905914307, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1888371855020523, + "rewards/margins": 0.011307370848953724, + "rewards/rejected": -0.20014457404613495, + "step": 2240 + }, + { + "epoch": 1.8, + "grad_norm": 0.21190473437309265, + "learning_rate": 8.924444444444444e-07, + "log_odds_chosen": -0.1211489662528038, + "log_odds_ratio": -0.8296471834182739, + "logits/chosen": 2.0248475074768066, + "logits/rejected": 2.0085835456848145, + "logps/chosen": -1.9709552526474, + "logps/rejected": -1.8767433166503906, + "loss": 1.7127277374267578, + "nll_loss": 1.6297632455825806, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19709551334381104, + "rewards/margins": -0.00942118652164936, + "rewards/rejected": -0.18767431378364563, + "step": 2250 + }, + { + "epoch": 1.808, + "grad_norm": 0.18952655792236328, + "learning_rate": 8.568888888888889e-07, + "log_odds_chosen": 0.07776130735874176, + "log_odds_ratio": -0.7154445648193359, + "logits/chosen": 1.886913537979126, + "logits/rejected": 1.9137483835220337, + "logps/chosen": -1.8462440967559814, + "logps/rejected": -1.9090255498886108, + "loss": 1.6406494140625, + "nll_loss": 1.5691049098968506, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18462440371513367, + "rewards/margins": 0.0062781586311757565, + "rewards/rejected": -0.19090254604816437, + "step": 2260 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.2062900811433792, + "learning_rate": 8.213333333333333e-07, + "log_odds_chosen": 0.1700354367494583, + "log_odds_ratio": -0.6777452230453491, + "logits/chosen": 1.9052644968032837, + "logits/rejected": 1.906673789024353, + "logps/chosen": -1.8593641519546509, + "logps/rejected": -2.0129494667053223, + "loss": 1.5919419288635255, + "nll_loss": 1.5241668224334717, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18593639135360718, + "rewards/margins": 0.015358559787273407, + "rewards/rejected": -0.20129497349262238, + "step": 2270 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.2041376829147339, + "learning_rate": 7.857777777777778e-07, + "log_odds_chosen": -0.033038415014743805, + "log_odds_ratio": -0.7522535920143127, + "logits/chosen": 1.958950400352478, + "logits/rejected": 1.9319322109222412, + "logps/chosen": -1.9543571472167969, + "logps/rejected": -1.9238964319229126, + "loss": 1.641855239868164, + "nll_loss": 1.5666298866271973, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19543573260307312, + "rewards/margins": -0.0030460860580205917, + "rewards/rejected": -0.19238965213298798, + "step": 2280 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.21807628870010376, + "learning_rate": 7.502222222222222e-07, + "log_odds_chosen": 0.1660190224647522, + "log_odds_ratio": -0.6850719451904297, + "logits/chosen": 1.9368999004364014, + "logits/rejected": 1.8725366592407227, + "logps/chosen": -1.899930715560913, + "logps/rejected": -2.044861316680908, + "loss": 1.653396987915039, + "nll_loss": 1.5848896503448486, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18999308347702026, + "rewards/margins": 0.014493053779006004, + "rewards/rejected": -0.20448613166809082, + "step": 2290 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.20989151298999786, + "learning_rate": 7.146666666666666e-07, + "log_odds_chosen": 0.1224382147192955, + "log_odds_ratio": -0.7186975479125977, + "logits/chosen": 1.9427839517593384, + "logits/rejected": 1.912305474281311, + "logps/chosen": -1.8933311700820923, + "logps/rejected": -1.9752848148345947, + "loss": 1.7118989944458007, + "nll_loss": 1.6400293111801147, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18933311104774475, + "rewards/margins": 0.008195372298359871, + "rewards/rejected": -0.19752849638462067, + "step": 2300 + }, + { + "epoch": 1.8399999999999999, + "eval_log_odds_chosen": 0.03177111968398094, + "eval_log_odds_ratio": -0.7596560120582581, + "eval_logits/chosen": 1.9661781787872314, + "eval_logits/rejected": 1.9550096988677979, + "eval_logps/chosen": -1.967788815498352, + "eval_logps/rejected": -1.9995970726013184, + "eval_loss": 1.7376213073730469, + "eval_nll_loss": 1.6616557836532593, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.19677886366844177, + "eval_rewards/margins": 0.0031808456405997276, + "eval_rewards/rejected": -0.19995971024036407, + "eval_runtime": 52.9345, + "eval_samples_per_second": 9.446, + "eval_steps_per_second": 4.723, + "step": 2300 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.1686755120754242, + "learning_rate": 6.791111111111111e-07, + "log_odds_chosen": 0.2830939292907715, + "log_odds_ratio": -0.6540330648422241, + "logits/chosen": 2.0916292667388916, + "logits/rejected": 2.016447067260742, + "logps/chosen": -1.888832449913025, + "logps/rejected": -2.147658109664917, + "loss": 1.6795072555541992, + "nll_loss": 1.61410391330719, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18888327479362488, + "rewards/margins": 0.025882547721266747, + "rewards/rejected": -0.21476581692695618, + "step": 2310 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.23096708953380585, + "learning_rate": 6.435555555555556e-07, + "log_odds_chosen": -0.06739415228366852, + "log_odds_ratio": -0.7905227541923523, + "logits/chosen": 1.9669262170791626, + "logits/rejected": 1.9470195770263672, + "logps/chosen": -1.936089277267456, + "logps/rejected": -1.8776248693466187, + "loss": 1.6421504974365235, + "nll_loss": 1.5630981922149658, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19360892474651337, + "rewards/margins": -0.005846431478857994, + "rewards/rejected": -0.18776246905326843, + "step": 2320 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.19392915070056915, + "learning_rate": 6.079999999999999e-07, + "log_odds_chosen": 0.09846463054418564, + "log_odds_ratio": -0.7082722783088684, + "logits/chosen": 1.8751684427261353, + "logits/rejected": 1.8314625024795532, + "logps/chosen": -1.9364410638809204, + "logps/rejected": -2.011606454849243, + "loss": 1.597140407562256, + "nll_loss": 1.5263129472732544, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19364410638809204, + "rewards/margins": 0.007516547106206417, + "rewards/rejected": -0.20116063952445984, + "step": 2330 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.18473073840141296, + "learning_rate": 5.724444444444444e-07, + "log_odds_chosen": 0.015096393413841724, + "log_odds_ratio": -0.7561847567558289, + "logits/chosen": 1.7878471612930298, + "logits/rejected": 1.7647182941436768, + "logps/chosen": -2.0790276527404785, + "logps/rejected": -2.0806405544281006, + "loss": 1.6734830856323242, + "nll_loss": 1.5978648662567139, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20790274441242218, + "rewards/margins": 0.00016129556752275676, + "rewards/rejected": -0.20806407928466797, + "step": 2340 + }, + { + "epoch": 1.88, + "grad_norm": 0.17758043110370636, + "learning_rate": 5.368888888888888e-07, + "log_odds_chosen": 0.020775090903043747, + "log_odds_ratio": -0.7736788392066956, + "logits/chosen": 1.8850494623184204, + "logits/rejected": 1.9317066669464111, + "logps/chosen": -1.9396501779556274, + "logps/rejected": -1.95382559299469, + "loss": 1.6969659805297852, + "nll_loss": 1.6195980310440063, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19396504759788513, + "rewards/margins": 0.0014175325632095337, + "rewards/rejected": -0.19538256525993347, + "step": 2350 + }, + { + "epoch": 1.888, + "grad_norm": 0.24013279378414154, + "learning_rate": 5.013333333333333e-07, + "log_odds_chosen": -0.049433931708335876, + "log_odds_ratio": -0.7668389081954956, + "logits/chosen": 1.91973078250885, + "logits/rejected": 1.8892580270767212, + "logps/chosen": -1.9745763540267944, + "logps/rejected": -1.9311202764511108, + "loss": 1.6504856109619142, + "nll_loss": 1.5738017559051514, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.1974576711654663, + "rewards/margins": -0.004345631692558527, + "rewards/rejected": -0.19311201572418213, + "step": 2360 + }, + { + "epoch": 1.896, + "grad_norm": 0.24050985276699066, + "learning_rate": 4.6577777777777775e-07, + "log_odds_chosen": 0.027837049216032028, + "log_odds_ratio": -0.7923186421394348, + "logits/chosen": 2.0242013931274414, + "logits/rejected": 1.9961684942245483, + "logps/chosen": -1.9506433010101318, + "logps/rejected": -1.9801626205444336, + "loss": 1.7079919815063476, + "nll_loss": 1.6287603378295898, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19506433606147766, + "rewards/margins": 0.0029519214294850826, + "rewards/rejected": -0.19801625609397888, + "step": 2370 + }, + { + "epoch": 1.904, + "grad_norm": 0.20600086450576782, + "learning_rate": 4.3022222222222223e-07, + "log_odds_chosen": 0.0014284685021266341, + "log_odds_ratio": -0.7495579719543457, + "logits/chosen": 1.9058793783187866, + "logits/rejected": 1.8871889114379883, + "logps/chosen": -1.951804757118225, + "logps/rejected": -1.9474728107452393, + "loss": 1.6623008728027344, + "nll_loss": 1.5873453617095947, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.1951804906129837, + "rewards/margins": -0.0004331955569796264, + "rewards/rejected": -0.19474726915359497, + "step": 2380 + }, + { + "epoch": 1.912, + "grad_norm": 0.1933748424053192, + "learning_rate": 3.9466666666666665e-07, + "log_odds_chosen": 0.08518671989440918, + "log_odds_ratio": -0.7107952833175659, + "logits/chosen": 1.983252763748169, + "logits/rejected": 1.9398502111434937, + "logps/chosen": -1.845097541809082, + "logps/rejected": -1.916144609451294, + "loss": 1.7064382553100585, + "nll_loss": 1.6353585720062256, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1845097690820694, + "rewards/margins": 0.007104730699211359, + "rewards/rejected": -0.19161446392536163, + "step": 2390 + }, + { + "epoch": 1.92, + "grad_norm": 0.16034555435180664, + "learning_rate": 3.591111111111111e-07, + "log_odds_chosen": 0.20363900065422058, + "log_odds_ratio": -0.6528972387313843, + "logits/chosen": 1.9256999492645264, + "logits/rejected": 1.9299843311309814, + "logps/chosen": -1.7845226526260376, + "logps/rejected": -1.9445335865020752, + "loss": 1.6401901245117188, + "nll_loss": 1.5749002695083618, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.178452268242836, + "rewards/margins": 0.016001086682081223, + "rewards/rejected": -0.19445334374904633, + "step": 2400 + }, + { + "epoch": 1.92, + "eval_log_odds_chosen": 0.03292645514011383, + "eval_log_odds_ratio": -0.7589800953865051, + "eval_logits/chosen": 1.9538424015045166, + "eval_logits/rejected": 1.9426116943359375, + "eval_logps/chosen": -1.967284917831421, + "eval_logps/rejected": -2.0000522136688232, + "eval_loss": 1.7372877597808838, + "eval_nll_loss": 1.6613895893096924, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.19672849774360657, + "eval_rewards/margins": 0.00327673670835793, + "eval_rewards/rejected": -0.20000524818897247, + "eval_runtime": 52.9539, + "eval_samples_per_second": 9.442, + "eval_steps_per_second": 4.721, + "step": 2400 + }, + { + "epoch": 1.928, + "grad_norm": 0.2564340829849243, + "learning_rate": 3.235555555555555e-07, + "log_odds_chosen": 0.09919662773609161, + "log_odds_ratio": -0.6995732188224792, + "logits/chosen": 1.87993586063385, + "logits/rejected": 1.9208587408065796, + "logps/chosen": -1.9117933511734009, + "logps/rejected": -1.9977281093597412, + "loss": 1.7287012100219727, + "nll_loss": 1.658744215965271, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1911793202161789, + "rewards/margins": 0.008593486621975899, + "rewards/rejected": -0.19977280497550964, + "step": 2410 + }, + { + "epoch": 1.936, + "grad_norm": 0.17753444612026215, + "learning_rate": 2.88e-07, + "log_odds_chosen": 0.20836353302001953, + "log_odds_ratio": -0.6859520077705383, + "logits/chosen": 1.9739856719970703, + "logits/rejected": 1.889457106590271, + "logps/chosen": -1.85233473777771, + "logps/rejected": -2.0436553955078125, + "loss": 1.637980842590332, + "nll_loss": 1.5693857669830322, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1852334886789322, + "rewards/margins": 0.019132055342197418, + "rewards/rejected": -0.204365536570549, + "step": 2420 + }, + { + "epoch": 1.944, + "grad_norm": 0.22065120935440063, + "learning_rate": 2.5244444444444446e-07, + "log_odds_chosen": 0.2602199912071228, + "log_odds_ratio": -0.6660795211791992, + "logits/chosen": 1.9584786891937256, + "logits/rejected": 1.9201990365982056, + "logps/chosen": -1.8290560245513916, + "logps/rejected": -2.0524916648864746, + "loss": 1.5971319198608398, + "nll_loss": 1.5305240154266357, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18290559947490692, + "rewards/margins": 0.022343561053276062, + "rewards/rejected": -0.20524916052818298, + "step": 2430 + }, + { + "epoch": 1.952, + "grad_norm": 0.2034170925617218, + "learning_rate": 2.1688888888888886e-07, + "log_odds_chosen": 0.01949225924909115, + "log_odds_ratio": -0.7587012648582458, + "logits/chosen": 2.0220160484313965, + "logits/rejected": 1.9902786016464233, + "logps/chosen": -1.8832216262817383, + "logps/rejected": -1.8881313800811768, + "loss": 1.6478155136108399, + "nll_loss": 1.5719454288482666, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.18832215666770935, + "rewards/margins": 0.0004909929120913148, + "rewards/rejected": -0.18881314992904663, + "step": 2440 + }, + { + "epoch": 1.96, + "grad_norm": 0.20689211785793304, + "learning_rate": 1.8133333333333334e-07, + "log_odds_chosen": 0.03279653191566467, + "log_odds_ratio": -0.7331979274749756, + "logits/chosen": 1.8934736251831055, + "logits/rejected": 1.889977216720581, + "logps/chosen": -1.9526269435882568, + "logps/rejected": -1.9988235235214233, + "loss": 1.6669736862182618, + "nll_loss": 1.5936537981033325, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19526270031929016, + "rewards/margins": 0.004619671497493982, + "rewards/rejected": -0.1998823583126068, + "step": 2450 + }, + { + "epoch": 1.968, + "grad_norm": 0.18625201284885406, + "learning_rate": 1.4577777777777777e-07, + "log_odds_chosen": 0.16753992438316345, + "log_odds_ratio": -0.6985082626342773, + "logits/chosen": 1.9892451763153076, + "logits/rejected": 2.0005500316619873, + "logps/chosen": -1.9420229196548462, + "logps/rejected": -2.101797580718994, + "loss": 1.7038938522338867, + "nll_loss": 1.634042739868164, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1942022740840912, + "rewards/margins": 0.01597747765481472, + "rewards/rejected": -0.21017976105213165, + "step": 2460 + }, + { + "epoch": 1.976, + "grad_norm": 0.18154846131801605, + "learning_rate": 1.1022222222222222e-07, + "log_odds_chosen": 0.09283455461263657, + "log_odds_ratio": -0.7068299055099487, + "logits/chosen": 2.008981227874756, + "logits/rejected": 1.9530563354492188, + "logps/chosen": -1.9214563369750977, + "logps/rejected": -1.9895169734954834, + "loss": 1.6707321166992188, + "nll_loss": 1.6000492572784424, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19214564561843872, + "rewards/margins": 0.006806069053709507, + "rewards/rejected": -0.19895172119140625, + "step": 2470 + }, + { + "epoch": 1.984, + "grad_norm": 0.19886931777000427, + "learning_rate": 7.466666666666667e-08, + "log_odds_chosen": 0.1716727912425995, + "log_odds_ratio": -0.6732484102249146, + "logits/chosen": 2.009502649307251, + "logits/rejected": 1.9832922220230103, + "logps/chosen": -1.802361249923706, + "logps/rejected": -1.9303758144378662, + "loss": 1.63879451751709, + "nll_loss": 1.5714696645736694, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18023613095283508, + "rewards/margins": 0.012801451608538628, + "rewards/rejected": -0.19303759932518005, + "step": 2480 + }, + { + "epoch": 1.992, + "grad_norm": 0.1930459439754486, + "learning_rate": 3.9111111111111106e-08, + "log_odds_chosen": 0.11817089468240738, + "log_odds_ratio": -0.6851096749305725, + "logits/chosen": 1.9785648584365845, + "logits/rejected": 1.9935262203216553, + "logps/chosen": -1.8648144006729126, + "logps/rejected": -1.9673572778701782, + "loss": 1.6846115112304687, + "nll_loss": 1.616100549697876, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18648144602775574, + "rewards/margins": 0.010254275985062122, + "rewards/rejected": -0.1967357099056244, + "step": 2490 + }, + { + "epoch": 2.0, + "grad_norm": 0.15556873381137848, + "learning_rate": 3.5555555555555554e-09, + "log_odds_chosen": 0.052455950528383255, + "log_odds_ratio": -0.7234463691711426, + "logits/chosen": 2.058159351348877, + "logits/rejected": 2.0007574558258057, + "logps/chosen": -1.9284133911132812, + "logps/rejected": -1.976228952407837, + "loss": 1.7280191421508788, + "nll_loss": 1.655674695968628, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.1928413212299347, + "rewards/margins": 0.004781573079526424, + "rewards/rejected": -0.19762291014194489, + "step": 2500 + }, + { + "epoch": 2.0, + "eval_log_odds_chosen": 0.03403009846806526, + "eval_log_odds_ratio": -0.7589399218559265, + "eval_logits/chosen": 1.9558050632476807, + "eval_logits/rejected": 1.9448707103729248, + "eval_logps/chosen": -1.9683054685592651, + "eval_logps/rejected": -2.002143144607544, + "eval_loss": 1.73760986328125, + "eval_nll_loss": 1.6617158651351929, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.19683052599430084, + "eval_rewards/margins": 0.0033837889786809683, + "eval_rewards/rejected": -0.20021431148052216, + "eval_runtime": 53.0725, + "eval_samples_per_second": 9.421, + "eval_steps_per_second": 4.711, + "step": 2500 + } + ], + "logging_steps": 10, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_10k/lora/checkpoint-2500/training_args.bin b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..b47bcdc3c44dda631da7f475aa87c7bb2c782bec --- /dev/null +++ b/v5/ORPO/ORPO_10k/lora/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1615b59ec50a8a8f298af41b0a88c5959219b5898139e8f88d7ad75a43a2c3b +size 5521 diff --git a/v5/ORPO/ORPO_1k/MORPO_1k/chat_template.jinja b/v5/ORPO/ORPO_1k/MORPO_1k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_1k/MORPO_1k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_1k/MORPO_1k/config.json b/v5/ORPO/ORPO_1k/MORPO_1k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ec18bc1ae18922052a57480e28401f3b9c6b84 --- /dev/null +++ b/v5/ORPO/ORPO_1k/MORPO_1k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/ORPO/ORPO_1k/MORPO_1k/generation_config.json b/v5/ORPO/ORPO_1k/MORPO_1k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..898a2e748a41a4b7a931b754f8abbdac02039fae --- /dev/null +++ b/v5/ORPO/ORPO_1k/MORPO_1k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.2.0" +} diff --git a/v5/ORPO/ORPO_1k/MORPO_1k/model.safetensors b/v5/ORPO/ORPO_1k/MORPO_1k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8cd1ff82c3481f97611285d204088529280bb9cc --- /dev/null +++ b/v5/ORPO/ORPO_1k/MORPO_1k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b53996771e146b6ce13ae0f7e6bf9346fd1f9e432d6dccfa11224055a23d8982 +size 2471645464 diff --git a/v5/ORPO/ORPO_1k/MORPO_1k/tokenizer.json b/v5/ORPO/ORPO_1k/MORPO_1k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_1k/MORPO_1k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_1k/MORPO_1k/tokenizer_config.json b/v5/ORPO/ORPO_1k/MORPO_1k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_1k/MORPO_1k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_1k/ORPO_1k/README.md b/v5/ORPO/ORPO_1k/ORPO_1k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_1k/ORPO_1k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/ORPO_1k/adapter_config.json b/v5/ORPO/ORPO_1k/ORPO_1k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38a3e195fc5044bb8b51d3f0386d896bb63d9faa --- /dev/null +++ b/v5/ORPO/ORPO_1k/ORPO_1k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "down_proj", + "o_proj", + "k_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/ORPO_1k/adapter_model.safetensors b/v5/ORPO/ORPO_1k/ORPO_1k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..579ab87f394a29a86eaabb28801efdb8b7c00ddc --- /dev/null +++ b/v5/ORPO/ORPO_1k/ORPO_1k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7df792f8ba8f6c34e83f9250fadc524a27d9c5cdeb952cd1dee6f468da5606 +size 180385008 diff --git a/v5/ORPO/ORPO_1k/lora/README.md b/v5/ORPO/ORPO_1k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1691cf1b8f4d4f75aa2fa4c339204301e77c9b58 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/README.md @@ -0,0 +1,66 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- trl +- orpo +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/axqxmxpa) + + +This model was trained with ORPO, a method introduced in [ORPO: Monolithic Preference Optimization without Reference Model](https://huggingface.co/papers/2403.07691). + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 5.2.0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite ORPO as: + +```bibtex +@article{hong2024orpo, + title = {{ORPO: Monolithic Preference Optimization without Reference Model}}, + author = {Jiwoo Hong and Noah Lee and James Thorne}, + year = 2024, + eprint = {arXiv:2403.07691} +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/README.md b/v5/ORPO/ORPO_1k/lora/checkpoint-240/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/adapter_config.json b/v5/ORPO/ORPO_1k/lora/checkpoint-240/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38a3e195fc5044bb8b51d3f0386d896bb63d9faa --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "down_proj", + "o_proj", + "k_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/adapter_model.safetensors b/v5/ORPO/ORPO_1k/lora/checkpoint-240/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f0ffaf863a4bfc0b15c9711e2286acc5cb38a41b --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3166a8c2fb31896f59db768c5d327b4a8618371dab31c4413d7c931bb0b9af50 +size 180385008 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/chat_template.jinja b/v5/ORPO/ORPO_1k/lora/checkpoint-240/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/optimizer.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-240/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3df372984fb5187c079cb5dd778b0bee0e23946f --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d85a507608e071fa665bf34a710745c473bad600cdbc1e774cb12fd2c86786 +size 360902475 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/rng_state.pth b/v5/ORPO/ORPO_1k/lora/checkpoint-240/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1062af8f13c012194172bd08b0cf5acef1661de7 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe17a5fe328cdb084ec10afd49d2cb7bb9035b6736206385c2c19bf9f66c41 +size 14645 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/scaler.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-240/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c9041f3b9dc92c4c71cfe27f1badefa3341d514 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:358036b71a9ab45fd32e9d2566e050fa5ce750795c3889b5da2b5cc1df201fc2 +size 1383 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/scheduler.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-240/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d03d4162174312f07d1edf6513c5b843ed75a89 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c34e0ed91a7b47e1380e11b166cfe5121623bd13ca92f14ba0c10f9611cded7 +size 1465 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/tokenizer.json b/v5/ORPO/ORPO_1k/lora/checkpoint-240/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/tokenizer_config.json b/v5/ORPO/ORPO_1k/lora/checkpoint-240/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/trainer_state.json b/v5/ORPO/ORPO_1k/lora/checkpoint-240/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a840fa7fdfc2e5877c95b99dc9ebac21f73a7c9d --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/trainer_state.json @@ -0,0 +1,922 @@ +{ + "best_global_step": 30, + "best_metric": 0.5540000200271606, + "best_model_checkpoint": "output/lora/checkpoint-30", + "epoch": 1.92, + "eval_steps": 10, + "global_step": 240, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.9159289002418518, + "learning_rate": 2.88e-06, + "log_odds_chosen": 0.19562272727489471, + "log_odds_ratio": -0.8590701222419739, + "logits/chosen": 1.1002824306488037, + "logits/rejected": 1.0790246725082397, + "logps/chosen": -3.0543551445007324, + "logps/rejected": -3.247206211090088, + "loss": 3.5340042114257812, + "nll_loss": 3.4480972290039062, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.30543553829193115, + "rewards/margins": 0.01928507350385189, + "rewards/rejected": -0.3247205913066864, + "step": 10 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.14816458523273468, + "eval_log_odds_ratio": -0.8236475586891174, + "eval_logits/chosen": 1.075466513633728, + "eval_logits/rejected": 1.069645881652832, + "eval_logps/chosen": -3.0302510261535645, + "eval_logps/rejected": -3.178657293319702, + "eval_loss": 3.465601921081543, + "eval_nll_loss": 3.383236885070801, + "eval_rewards/accuracies": 0.5440000295639038, + "eval_rewards/chosen": -0.30302515625953674, + "eval_rewards/margins": 0.014840577729046345, + "eval_rewards/rejected": -0.3178657293319702, + "eval_runtime": 52.3971, + "eval_samples_per_second": 9.543, + "eval_steps_per_second": 4.771, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 0.49432215094566345, + "learning_rate": 6.079999999999999e-06, + "log_odds_chosen": 0.051810234785079956, + "log_odds_ratio": -0.8136274218559265, + "logits/chosen": 1.0922297239303589, + "logits/rejected": 1.1531397104263306, + "logps/chosen": -3.053081750869751, + "logps/rejected": -3.1046817302703857, + "loss": 3.3105998992919923, + "nll_loss": 3.229236602783203, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.30530816316604614, + "rewards/margins": 0.005159988068044186, + "rewards/rejected": -0.3104681670665741, + "step": 20 + }, + { + "epoch": 0.16, + "eval_log_odds_chosen": 0.136577308177948, + "eval_log_odds_ratio": -0.810167133808136, + "eval_logits/chosen": 1.0986508131027222, + "eval_logits/rejected": 1.084808588027954, + "eval_logps/chosen": -2.914625644683838, + "eval_logps/rejected": -3.05191969871521, + "eval_loss": 3.170542001724243, + "eval_nll_loss": 3.0895254611968994, + "eval_rewards/accuracies": 0.5479999780654907, + "eval_rewards/chosen": -0.2914625406265259, + "eval_rewards/margins": 0.013729416765272617, + "eval_rewards/rejected": -0.3051919937133789, + "eval_runtime": 53.3238, + "eval_samples_per_second": 9.377, + "eval_steps_per_second": 4.688, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 0.4138321876525879, + "learning_rate": 7.857777777777777e-06, + "log_odds_chosen": 0.08315258473157883, + "log_odds_ratio": -0.7477900981903076, + "logits/chosen": 1.192463755607605, + "logits/rejected": 1.1664505004882812, + "logps/chosen": -2.7590999603271484, + "logps/rejected": -2.8307957649230957, + "loss": 2.969002532958984, + "nll_loss": 2.89422345161438, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.275909960269928, + "rewards/margins": 0.0071696205995976925, + "rewards/rejected": -0.2830796241760254, + "step": 30 + }, + { + "epoch": 0.24, + "eval_log_odds_chosen": 0.11084984242916107, + "eval_log_odds_ratio": -0.78859543800354, + "eval_logits/chosen": 1.1216861009597778, + "eval_logits/rejected": 1.0951279401779175, + "eval_logps/chosen": -2.7105276584625244, + "eval_logps/rejected": -2.823791265487671, + "eval_loss": 2.8847193717956543, + "eval_nll_loss": 2.8058602809906006, + "eval_rewards/accuracies": 0.5540000200271606, + "eval_rewards/chosen": -0.2710527777671814, + "eval_rewards/margins": 0.011326361447572708, + "eval_rewards/rejected": -0.2823791205883026, + "eval_runtime": 53.2199, + "eval_samples_per_second": 9.395, + "eval_steps_per_second": 4.697, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 0.34679412841796875, + "learning_rate": 7.502222222222222e-06, + "log_odds_chosen": 0.18765760958194733, + "log_odds_ratio": -0.7577568292617798, + "logits/chosen": 1.1061517000198364, + "logits/rejected": 1.107339859008789, + "logps/chosen": -2.500312566757202, + "logps/rejected": -2.6544313430786133, + "loss": 2.7145227432250976, + "nll_loss": 2.638747453689575, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2500312924385071, + "rewards/margins": 0.01541186310350895, + "rewards/rejected": -0.2654431462287903, + "step": 40 + }, + { + "epoch": 0.32, + "eval_log_odds_chosen": 0.08204474300146103, + "eval_log_odds_ratio": -0.7724552750587463, + "eval_logits/chosen": 1.092315912246704, + "eval_logits/rejected": 1.05488121509552, + "eval_logps/chosen": -2.5186259746551514, + "eval_logps/rejected": -2.6051390171051025, + "eval_loss": 2.6419382095336914, + "eval_nll_loss": 2.564692974090576, + "eval_rewards/accuracies": 0.5519999861717224, + "eval_rewards/chosen": -0.25186261534690857, + "eval_rewards/margins": 0.008651292882859707, + "eval_rewards/rejected": -0.26051390171051025, + "eval_runtime": 53.3316, + "eval_samples_per_second": 9.375, + "eval_steps_per_second": 4.688, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 0.3510463535785675, + "learning_rate": 7.146666666666666e-06, + "log_odds_chosen": 0.05859034135937691, + "log_odds_ratio": -0.7385914325714111, + "logits/chosen": 1.1375240087509155, + "logits/rejected": 1.0450371503829956, + "logps/chosen": -2.42551326751709, + "logps/rejected": -2.4860575199127197, + "loss": 2.46743106842041, + "nll_loss": 2.393571376800537, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.24255135655403137, + "rewards/margins": 0.006054399069398642, + "rewards/rejected": -0.24860575795173645, + "step": 50 + }, + { + "epoch": 0.4, + "eval_log_odds_chosen": 0.05682923272252083, + "eval_log_odds_ratio": -0.7677140235900879, + "eval_logits/chosen": 1.0331004858016968, + "eval_logits/rejected": 0.9877541065216064, + "eval_logps/chosen": -2.3998496532440186, + "eval_logps/rejected": -2.462172746658325, + "eval_loss": 2.429464817047119, + "eval_nll_loss": 2.352693557739258, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.23998498916625977, + "eval_rewards/margins": 0.0062323203310370445, + "eval_rewards/rejected": -0.24621731042861938, + "eval_runtime": 53.2401, + "eval_samples_per_second": 9.391, + "eval_steps_per_second": 4.696, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 0.20609980821609497, + "learning_rate": 6.791111111111111e-06, + "log_odds_chosen": 0.07380016148090363, + "log_odds_ratio": -0.7359625101089478, + "logits/chosen": 0.889764130115509, + "logits/rejected": 1.017002820968628, + "logps/chosen": -2.2573037147521973, + "logps/rejected": -2.305345058441162, + "loss": 2.3112926483154297, + "nll_loss": 2.2376961708068848, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22573037445545197, + "rewards/margins": 0.004804140888154507, + "rewards/rejected": -0.23053452372550964, + "step": 60 + }, + { + "epoch": 0.48, + "eval_log_odds_chosen": 0.043189436197280884, + "eval_log_odds_ratio": -0.7669212222099304, + "eval_logits/chosen": 0.9866231679916382, + "eval_logits/rejected": 0.9370818734169006, + "eval_logps/chosen": -2.3348076343536377, + "eval_logps/rejected": -2.3840110301971436, + "eval_loss": 2.255201578140259, + "eval_nll_loss": 2.17850923538208, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.23348075151443481, + "eval_rewards/margins": 0.0049203126691281796, + "eval_rewards/rejected": -0.23840108513832092, + "eval_runtime": 53.4269, + "eval_samples_per_second": 9.359, + "eval_steps_per_second": 4.679, + "step": 60 + }, + { + "epoch": 0.56, + "grad_norm": 0.1984829604625702, + "learning_rate": 6.435555555555555e-06, + "log_odds_chosen": 0.00933628249913454, + "log_odds_ratio": -0.7400209903717041, + "logits/chosen": 1.046584129333496, + "logits/rejected": 1.0750269889831543, + "logps/chosen": -2.242738723754883, + "logps/rejected": -2.247896194458008, + "loss": 2.179552459716797, + "nll_loss": 2.1055500507354736, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22427387535572052, + "rewards/margins": 0.0005157862906344235, + "rewards/rejected": -0.22478966414928436, + "step": 70 + }, + { + "epoch": 0.56, + "eval_log_odds_chosen": 0.037273552268743515, + "eval_log_odds_ratio": -0.7664892673492432, + "eval_logits/chosen": 0.9825413823127747, + "eval_logits/rejected": 0.9323441982269287, + "eval_logps/chosen": -2.2950439453125, + "eval_logps/rejected": -2.338361978530884, + "eval_loss": 2.161572217941284, + "eval_nll_loss": 2.084923267364502, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.22950439155101776, + "eval_rewards/margins": 0.004331790842115879, + "eval_rewards/rejected": -0.23383621871471405, + "eval_runtime": 53.2286, + "eval_samples_per_second": 9.393, + "eval_steps_per_second": 4.697, + "step": 70 + }, + { + "epoch": 0.64, + "grad_norm": 0.18926046788692474, + "learning_rate": 6.079999999999999e-06, + "log_odds_chosen": 0.16370263695716858, + "log_odds_ratio": -0.7065578699111938, + "logits/chosen": 0.9741449356079102, + "logits/rejected": 0.9439865350723267, + "logps/chosen": -2.226841926574707, + "logps/rejected": -2.386024236679077, + "loss": 2.133669853210449, + "nll_loss": 2.063014030456543, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22268421947956085, + "rewards/margins": 0.015918215736746788, + "rewards/rejected": -0.2386024296283722, + "step": 80 + }, + { + "epoch": 0.64, + "eval_log_odds_chosen": 0.03516349196434021, + "eval_log_odds_ratio": -0.7663520574569702, + "eval_logits/chosen": 1.022588849067688, + "eval_logits/rejected": 0.9730709791183472, + "eval_logps/chosen": -2.2640902996063232, + "eval_logps/rejected": -2.305067300796509, + "eval_loss": 2.0895278453826904, + "eval_nll_loss": 2.012892246246338, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.22640903294086456, + "eval_rewards/margins": 0.004097723867744207, + "eval_rewards/rejected": -0.2305067777633667, + "eval_runtime": 53.2228, + "eval_samples_per_second": 9.394, + "eval_steps_per_second": 4.697, + "step": 80 + }, + { + "epoch": 0.72, + "grad_norm": 0.15444700419902802, + "learning_rate": 5.724444444444444e-06, + "log_odds_chosen": 0.049940235912799835, + "log_odds_ratio": -0.7443691492080688, + "logits/chosen": 1.0501600503921509, + "logits/rejected": 1.016621708869934, + "logps/chosen": -2.2212884426116943, + "logps/rejected": -2.2772510051727295, + "loss": 2.045332908630371, + "nll_loss": 1.9708961248397827, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.22212886810302734, + "rewards/margins": 0.005596236325800419, + "rewards/rejected": -0.2277251034975052, + "step": 90 + }, + { + "epoch": 0.72, + "eval_log_odds_chosen": 0.033585965633392334, + "eval_log_odds_ratio": -0.766018271446228, + "eval_logits/chosen": 1.0759873390197754, + "eval_logits/rejected": 1.0270211696624756, + "eval_logps/chosen": -2.2369883060455322, + "eval_logps/rejected": -2.2760024070739746, + "eval_loss": 2.0355796813964844, + "eval_nll_loss": 1.9589776992797852, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.22369883954524994, + "eval_rewards/margins": 0.0039014029316604137, + "eval_rewards/rejected": -0.22760024666786194, + "eval_runtime": 53.2478, + "eval_samples_per_second": 9.39, + "eval_steps_per_second": 4.695, + "step": 90 + }, + { + "epoch": 0.8, + "grad_norm": 0.2406257539987564, + "learning_rate": 5.368888888888889e-06, + "log_odds_chosen": 0.040743522346019745, + "log_odds_ratio": -0.7394427061080933, + "logits/chosen": 0.9953991770744324, + "logits/rejected": 1.0465402603149414, + "logps/chosen": -2.2148966789245605, + "logps/rejected": -2.239614486694336, + "loss": 2.024346923828125, + "nll_loss": 1.950402855873108, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.22148966789245605, + "rewards/margins": 0.002471799496561289, + "rewards/rejected": -0.2239614725112915, + "step": 100 + }, + { + "epoch": 0.8, + "eval_log_odds_chosen": 0.03151147812604904, + "eval_log_odds_ratio": -0.7650076746940613, + "eval_logits/chosen": 1.090487003326416, + "eval_logits/rejected": 1.0410875082015991, + "eval_logps/chosen": -2.2083094120025635, + "eval_logps/rejected": -2.2448291778564453, + "eval_loss": 1.9928127527236938, + "eval_nll_loss": 1.9163117408752441, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.22083096206188202, + "eval_rewards/margins": 0.003651980310678482, + "eval_rewards/rejected": -0.2244829386472702, + "eval_runtime": 53.2544, + "eval_samples_per_second": 9.389, + "eval_steps_per_second": 4.694, + "step": 100 + }, + { + "epoch": 0.88, + "grad_norm": 0.14265325665473938, + "learning_rate": 5.013333333333333e-06, + "log_odds_chosen": -0.10137738287448883, + "log_odds_ratio": -0.8407853245735168, + "logits/chosen": 1.1193276643753052, + "logits/rejected": 1.0871385335922241, + "logps/chosen": -2.144883632659912, + "logps/rejected": -2.0626511573791504, + "loss": 1.977994155883789, + "nll_loss": 1.8939154148101807, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.21448838710784912, + "rewards/margins": -0.008223267272114754, + "rewards/rejected": -0.20626512169837952, + "step": 110 + }, + { + "epoch": 0.88, + "eval_log_odds_chosen": 0.0301420409232378, + "eval_log_odds_ratio": -0.764117419719696, + "eval_logits/chosen": 1.1084222793579102, + "eval_logits/rejected": 1.059097170829773, + "eval_logps/chosen": -2.1800594329833984, + "eval_logps/rejected": -2.214940071105957, + "eval_loss": 1.9601762294769287, + "eval_nll_loss": 1.883764386177063, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.2180059254169464, + "eval_rewards/margins": 0.0034881029278039932, + "eval_rewards/rejected": -0.22149403393268585, + "eval_runtime": 53.2746, + "eval_samples_per_second": 9.385, + "eval_steps_per_second": 4.693, + "step": 110 + }, + { + "epoch": 0.96, + "grad_norm": 0.17654533684253693, + "learning_rate": 4.657777777777778e-06, + "log_odds_chosen": 0.07624180614948273, + "log_odds_ratio": -0.7496079802513123, + "logits/chosen": 1.138927698135376, + "logits/rejected": 1.1692471504211426, + "logps/chosen": -2.1000325679779053, + "logps/rejected": -2.1689352989196777, + "loss": 1.944967269897461, + "nll_loss": 1.8700063228607178, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21000322699546814, + "rewards/margins": 0.0068902671337127686, + "rewards/rejected": -0.2168935239315033, + "step": 120 + }, + { + "epoch": 0.96, + "eval_log_odds_chosen": 0.029821552336215973, + "eval_log_odds_ratio": -0.763667643070221, + "eval_logits/chosen": 1.1056278944015503, + "eval_logits/rejected": 1.056398630142212, + "eval_logps/chosen": -2.158470630645752, + "eval_logps/rejected": -2.1928257942199707, + "eval_loss": 1.93770432472229, + "eval_nll_loss": 1.8613375425338745, + "eval_rewards/accuracies": 0.5220000147819519, + "eval_rewards/chosen": -0.21584708988666534, + "eval_rewards/margins": 0.0034354773815721273, + "eval_rewards/rejected": -0.21928256750106812, + "eval_runtime": 53.2433, + "eval_samples_per_second": 9.391, + "eval_steps_per_second": 4.695, + "step": 120 + }, + { + "epoch": 1.04, + "grad_norm": 0.1827414184808731, + "learning_rate": 4.302222222222222e-06, + "log_odds_chosen": -0.001232819282449782, + "log_odds_ratio": -0.7414854764938354, + "logits/chosen": 1.2297706604003906, + "logits/rejected": 1.1010440587997437, + "logps/chosen": -1.9639602899551392, + "logps/rejected": -1.962048888206482, + "loss": 1.9267074584960937, + "nll_loss": 1.8525588512420654, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19639602303504944, + "rewards/margins": -0.00019112862355541438, + "rewards/rejected": -0.19620490074157715, + "step": 130 + }, + { + "epoch": 1.04, + "eval_log_odds_chosen": 0.029177702963352203, + "eval_log_odds_ratio": -0.7638773322105408, + "eval_logits/chosen": 1.1144956350326538, + "eval_logits/rejected": 1.0663033723831177, + "eval_logps/chosen": -2.144742965698242, + "eval_logps/rejected": -2.1784965991973877, + "eval_loss": 1.920721173286438, + "eval_nll_loss": 1.844333529472351, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.21447430551052094, + "eval_rewards/margins": 0.0033753456082195044, + "eval_rewards/rejected": -0.21784964203834534, + "eval_runtime": 53.2128, + "eval_samples_per_second": 9.396, + "eval_steps_per_second": 4.698, + "step": 130 + }, + { + "epoch": 1.12, + "grad_norm": 0.16919028759002686, + "learning_rate": 3.9466666666666664e-06, + "log_odds_chosen": -0.11807477474212646, + "log_odds_ratio": -0.8465667963027954, + "logits/chosen": 1.1062188148498535, + "logits/rejected": 1.1064749956130981, + "logps/chosen": -2.1417267322540283, + "logps/rejected": -2.0469985008239746, + "loss": 1.938684844970703, + "nll_loss": 1.854028344154358, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21417267620563507, + "rewards/margins": -0.009472792968153954, + "rewards/rejected": -0.20469987392425537, + "step": 140 + }, + { + "epoch": 1.12, + "eval_log_odds_chosen": 0.028810443356633186, + "eval_log_odds_ratio": -0.7637351155281067, + "eval_logits/chosen": 1.1457773447036743, + "eval_logits/rejected": 1.0984731912612915, + "eval_logps/chosen": -2.1359357833862305, + "eval_logps/rejected": -2.1694324016571045, + "eval_loss": 1.907101035118103, + "eval_nll_loss": 1.8307276964187622, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.21359357237815857, + "eval_rewards/margins": 0.0033496527466923, + "eval_rewards/rejected": -0.21694323420524597, + "eval_runtime": 195.8742, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 1.276, + "step": 140 + }, + { + "epoch": 1.2, + "grad_norm": 0.12811271846294403, + "learning_rate": 3.591111111111111e-06, + "log_odds_chosen": 0.10773544013500214, + "log_odds_ratio": -0.6882486343383789, + "logits/chosen": 1.1967742443084717, + "logits/rejected": 1.2303117513656616, + "logps/chosen": -2.0538058280944824, + "logps/rejected": -2.147270679473877, + "loss": 1.928931427001953, + "nll_loss": 1.8601064682006836, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20538058876991272, + "rewards/margins": 0.009346459992229939, + "rewards/rejected": -0.21472707390785217, + "step": 150 + }, + { + "epoch": 1.2, + "eval_log_odds_chosen": 0.029405459761619568, + "eval_log_odds_ratio": -0.7636828422546387, + "eval_logits/chosen": 1.1795330047607422, + "eval_logits/rejected": 1.1324467658996582, + "eval_logps/chosen": -2.1314361095428467, + "eval_logps/rejected": -2.165649175643921, + "eval_loss": 1.8977786302566528, + "eval_nll_loss": 1.8214104175567627, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.21314364671707153, + "eval_rewards/margins": 0.0034212982282042503, + "eval_rewards/rejected": -0.21656493842601776, + "eval_runtime": 53.2699, + "eval_samples_per_second": 9.386, + "eval_steps_per_second": 4.693, + "step": 150 + }, + { + "epoch": 1.28, + "grad_norm": 0.1704077273607254, + "learning_rate": 3.235555555555555e-06, + "log_odds_chosen": 0.14702661335468292, + "log_odds_ratio": -0.6978052854537964, + "logits/chosen": 1.1281222105026245, + "logits/rejected": 1.2165471315383911, + "logps/chosen": -2.0752506256103516, + "logps/rejected": -2.201073169708252, + "loss": 1.8814886093139649, + "nll_loss": 1.8117080926895142, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2075250893831253, + "rewards/margins": 0.012582269497215748, + "rewards/rejected": -0.22010734677314758, + "step": 160 + }, + { + "epoch": 1.28, + "eval_log_odds_chosen": 0.0292374175041914, + "eval_log_odds_ratio": -0.763826310634613, + "eval_logits/chosen": 1.2024222612380981, + "eval_logits/rejected": 1.1555261611938477, + "eval_logps/chosen": -2.1266942024230957, + "eval_logps/rejected": -2.160792350769043, + "eval_loss": 1.8912572860717773, + "eval_nll_loss": 1.8148746490478516, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21266941726207733, + "eval_rewards/margins": 0.003409823402762413, + "eval_rewards/rejected": -0.2160792201757431, + "eval_runtime": 53.2802, + "eval_samples_per_second": 9.384, + "eval_steps_per_second": 4.692, + "step": 160 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.1710982620716095, + "learning_rate": 2.88e-06, + "log_odds_chosen": 0.1420261412858963, + "log_odds_ratio": -0.6900883913040161, + "logits/chosen": 1.0993479490280151, + "logits/rejected": 1.2283376455307007, + "logps/chosen": -2.066962242126465, + "logps/rejected": -2.177448034286499, + "loss": 1.8723211288452148, + "nll_loss": 1.8033123016357422, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20669622719287872, + "rewards/margins": 0.011048593558371067, + "rewards/rejected": -0.21774479746818542, + "step": 170 + }, + { + "epoch": 1.3599999999999999, + "eval_log_odds_chosen": 0.028749050572514534, + "eval_log_odds_ratio": -0.763710081577301, + "eval_logits/chosen": 1.197767734527588, + "eval_logits/rejected": 1.1507377624511719, + "eval_logps/chosen": -2.1212124824523926, + "eval_logps/rejected": -2.1547811031341553, + "eval_loss": 1.8862611055374146, + "eval_nll_loss": 1.8098900318145752, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21212123334407806, + "eval_rewards/margins": 0.003356893314048648, + "eval_rewards/rejected": -0.21547812223434448, + "eval_runtime": 53.2604, + "eval_samples_per_second": 9.388, + "eval_steps_per_second": 4.694, + "step": 170 + }, + { + "epoch": 1.44, + "grad_norm": 0.14521150290966034, + "learning_rate": 2.5244444444444443e-06, + "log_odds_chosen": -0.005218303296715021, + "log_odds_ratio": -0.8199856877326965, + "logits/chosen": 1.2382605075836182, + "logits/rejected": 1.1947174072265625, + "logps/chosen": -2.0978918075561523, + "logps/rejected": -2.1037724018096924, + "loss": 1.8984029769897461, + "nll_loss": 1.8164045810699463, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2097891867160797, + "rewards/margins": 0.0005880504613742232, + "rewards/rejected": -0.21037724614143372, + "step": 180 + }, + { + "epoch": 1.44, + "eval_log_odds_chosen": 0.02963738888502121, + "eval_log_odds_ratio": -0.7632617950439453, + "eval_logits/chosen": 1.2076901197433472, + "eval_logits/rejected": 1.1605749130249023, + "eval_logps/chosen": -2.117640733718872, + "eval_logps/rejected": -2.1519222259521484, + "eval_loss": 1.8823505640029907, + "eval_nll_loss": 1.806024432182312, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.21176405251026154, + "eval_rewards/margins": 0.003428164403885603, + "eval_rewards/rejected": -0.21519219875335693, + "eval_runtime": 53.2338, + "eval_samples_per_second": 9.393, + "eval_steps_per_second": 4.696, + "step": 180 + }, + { + "epoch": 1.52, + "grad_norm": 0.1563858985900879, + "learning_rate": 2.168888888888889e-06, + "log_odds_chosen": 0.08264724910259247, + "log_odds_ratio": -0.6953542232513428, + "logits/chosen": 1.2215197086334229, + "logits/rejected": 1.1691173315048218, + "logps/chosen": -2.031200885772705, + "logps/rejected": -2.0946972370147705, + "loss": 1.912965965270996, + "nll_loss": 1.843430519104004, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.2031201422214508, + "rewards/margins": 0.006349604576826096, + "rewards/rejected": -0.20946970582008362, + "step": 190 + }, + { + "epoch": 1.52, + "eval_log_odds_chosen": 0.030445439741015434, + "eval_log_odds_ratio": -0.7631585597991943, + "eval_logits/chosen": 1.2250813245773315, + "eval_logits/rejected": 1.1781257390975952, + "eval_logps/chosen": -2.1141271591186523, + "eval_logps/rejected": -2.149076223373413, + "eval_loss": 1.879238247871399, + "eval_nll_loss": 1.802922248840332, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.211412712931633, + "eval_rewards/margins": 0.0034949015825986862, + "eval_rewards/rejected": -0.21490761637687683, + "eval_runtime": 53.2657, + "eval_samples_per_second": 9.387, + "eval_steps_per_second": 4.693, + "step": 190 + }, + { + "epoch": 1.6, + "grad_norm": 0.14611385762691498, + "learning_rate": 1.813333333333333e-06, + "log_odds_chosen": -0.10572721809148788, + "log_odds_ratio": -0.8197598457336426, + "logits/chosen": 1.202487826347351, + "logits/rejected": 1.1040087938308716, + "logps/chosen": -2.1150729656219482, + "logps/rejected": -2.0351948738098145, + "loss": 1.9483623504638672, + "nll_loss": 1.86638605594635, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.21150732040405273, + "rewards/margins": -0.007987814024090767, + "rewards/rejected": -0.20351949334144592, + "step": 200 + }, + { + "epoch": 1.6, + "eval_log_odds_chosen": 0.0310534480959177, + "eval_log_odds_ratio": -0.7631565928459167, + "eval_logits/chosen": 1.2298425436019897, + "eval_logits/rejected": 1.1832287311553955, + "eval_logps/chosen": -2.11133074760437, + "eval_logps/rejected": -2.1467576026916504, + "eval_loss": 1.8768765926361084, + "eval_nll_loss": 1.8005608320236206, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.21113307774066925, + "eval_rewards/margins": 0.0035426774993538857, + "eval_rewards/rejected": -0.21467576920986176, + "eval_runtime": 53.2607, + "eval_samples_per_second": 9.388, + "eval_steps_per_second": 4.694, + "step": 200 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.17115949094295502, + "learning_rate": 1.4577777777777778e-06, + "log_odds_chosen": 0.15168903768062592, + "log_odds_ratio": -0.6794174909591675, + "logits/chosen": 1.2208305597305298, + "logits/rejected": 1.1591063737869263, + "logps/chosen": -2.031393527984619, + "logps/rejected": -2.164114475250244, + "loss": 1.833169937133789, + "nll_loss": 1.7652279138565063, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20313934981822968, + "rewards/margins": 0.01327207125723362, + "rewards/rejected": -0.21641144156455994, + "step": 210 + }, + { + "epoch": 1.6800000000000002, + "eval_log_odds_chosen": 0.03077588975429535, + "eval_log_odds_ratio": -0.7630413770675659, + "eval_logits/chosen": 1.2368669509887695, + "eval_logits/rejected": 1.1905410289764404, + "eval_logps/chosen": -2.1100478172302246, + "eval_logps/rejected": -2.1452219486236572, + "eval_loss": 1.8748760223388672, + "eval_nll_loss": 1.7985717058181763, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21100479364395142, + "eval_rewards/margins": 0.003517415374517441, + "eval_rewards/rejected": -0.21452219784259796, + "eval_runtime": 53.2709, + "eval_samples_per_second": 9.386, + "eval_steps_per_second": 4.693, + "step": 210 + }, + { + "epoch": 1.76, + "grad_norm": 0.136911541223526, + "learning_rate": 1.1022222222222222e-06, + "log_odds_chosen": -0.004690551199018955, + "log_odds_ratio": -0.7404999732971191, + "logits/chosen": 1.3654903173446655, + "logits/rejected": 1.3405206203460693, + "logps/chosen": -2.1515979766845703, + "logps/rejected": -2.1524949073791504, + "loss": 1.935328483581543, + "nll_loss": 1.8612781763076782, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2151598036289215, + "rewards/margins": 8.968896872829646e-05, + "rewards/rejected": -0.21524949371814728, + "step": 220 + }, + { + "epoch": 1.76, + "eval_log_odds_chosen": 0.03143003210425377, + "eval_log_odds_ratio": -0.7629221081733704, + "eval_logits/chosen": 1.2453513145446777, + "eval_logits/rejected": 1.1991850137710571, + "eval_logps/chosen": -2.108854055404663, + "eval_logps/rejected": -2.14459490776062, + "eval_loss": 1.8734816312789917, + "eval_nll_loss": 1.797189474105835, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21088536083698273, + "eval_rewards/margins": 0.00357412570156157, + "eval_rewards/rejected": -0.21445949375629425, + "eval_runtime": 53.3291, + "eval_samples_per_second": 9.376, + "eval_steps_per_second": 4.688, + "step": 220 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.14719471335411072, + "learning_rate": 7.466666666666667e-07, + "log_odds_chosen": 0.06841103732585907, + "log_odds_ratio": -0.7776535749435425, + "logits/chosen": 1.1754335165023804, + "logits/rejected": 1.1915993690490723, + "logps/chosen": -2.027627468109131, + "logps/rejected": -2.0764873027801514, + "loss": 1.8516361236572265, + "nll_loss": 1.7738704681396484, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20276275277137756, + "rewards/margins": 0.004885983653366566, + "rewards/rejected": -0.20764870941638947, + "step": 230 + }, + { + "epoch": 1.8399999999999999, + "eval_log_odds_chosen": 0.03179427608847618, + "eval_log_odds_ratio": -0.7629269957542419, + "eval_logits/chosen": 1.2505085468292236, + "eval_logits/rejected": 1.2043695449829102, + "eval_logps/chosen": -2.1080172061920166, + "eval_logps/rejected": -2.1440720558166504, + "eval_loss": 1.8725281953811646, + "eval_nll_loss": 1.79623544216156, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21080172061920166, + "eval_rewards/margins": 0.0036054973024874926, + "eval_rewards/rejected": -0.21440719068050385, + "eval_runtime": 53.2692, + "eval_samples_per_second": 9.386, + "eval_steps_per_second": 4.693, + "step": 230 + }, + { + "epoch": 1.92, + "grad_norm": 0.21970191597938538, + "learning_rate": 3.911111111111111e-07, + "log_odds_chosen": 0.1418900340795517, + "log_odds_ratio": -0.7076598405838013, + "logits/chosen": 1.2581008672714233, + "logits/rejected": 1.2762272357940674, + "logps/chosen": -2.008291244506836, + "logps/rejected": -2.139103889465332, + "loss": 1.915215301513672, + "nll_loss": 1.8444492816925049, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2008291482925415, + "rewards/margins": 0.0130812618881464, + "rewards/rejected": -0.21391041576862335, + "step": 240 + }, + { + "epoch": 1.92, + "eval_log_odds_chosen": 0.031775180250406265, + "eval_log_odds_ratio": -0.7629340887069702, + "eval_logits/chosen": 1.254185438156128, + "eval_logits/rejected": 1.208067536354065, + "eval_logps/chosen": -2.1076254844665527, + "eval_logps/rejected": -2.1436824798583984, + "eval_loss": 1.8719455003738403, + "eval_nll_loss": 1.7956523895263672, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21076256036758423, + "eval_rewards/margins": 0.003605667734518647, + "eval_rewards/rejected": -0.21436822414398193, + "eval_runtime": 53.3098, + "eval_samples_per_second": 9.379, + "eval_steps_per_second": 4.69, + "step": 240 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-240/training_args.bin b/v5/ORPO/ORPO_1k/lora/checkpoint-240/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..21f17e0abb46ed5f8c3ca052b462718090535629 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-240/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6f6a67bdcce98c9a6e36d43a0e7aee42aa33a0edf20e2bc530a89f6cddc45c +size 5457 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/README.md b/v5/ORPO/ORPO_1k/lora/checkpoint-250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/adapter_config.json b/v5/ORPO/ORPO_1k/lora/checkpoint-250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38a3e195fc5044bb8b51d3f0386d896bb63d9faa --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "down_proj", + "o_proj", + "k_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/adapter_model.safetensors b/v5/ORPO/ORPO_1k/lora/checkpoint-250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0161d06bde372b747390cae1c80c8906156e1cf5 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d54c5947adcb7bdc427ff17652d05456300e88ba41abdfac88062582e8372901 +size 180385008 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/chat_template.jinja b/v5/ORPO/ORPO_1k/lora/checkpoint-250/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/optimizer.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b59ccb584e2c2035b8e7480b060e7fe0b9e663c --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45c21ca793628531b1caa921a71d3d44ffc86e393ab12da58864893623c339cd +size 360902475 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/rng_state.pth b/v5/ORPO/ORPO_1k/lora/checkpoint-250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2275456c138024633720ad98c6c63acbe9777008 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a95215f64b02d62fb58ace326ad670f1d16eb1761f7fa3b3478d43d2b8d6108 +size 14645 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/scaler.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-250/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..849c6fa8080705f2a2c4a4f07a89a8e05bf320fa --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d6fca631a6bcdfa2416587314d206a68f40e27a07bc674b76e72a93db4e5058 +size 1383 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/scheduler.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae235752db61e7d4fdee61cf71bf6b012e99ee30 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:794a39cdc187ed608885f3e8a7dcb959e057431259710b9d40064146cd12480f +size 1465 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/tokenizer.json b/v5/ORPO/ORPO_1k/lora/checkpoint-250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/tokenizer_config.json b/v5/ORPO/ORPO_1k/lora/checkpoint-250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/trainer_state.json b/v5/ORPO/ORPO_1k/lora/checkpoint-250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f8e6b385df9c5f3cfc23a9ca6e4f6a5c41a7e95e --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/trainer_state.json @@ -0,0 +1,959 @@ +{ + "best_global_step": 30, + "best_metric": 0.5540000200271606, + "best_model_checkpoint": "output/lora/checkpoint-30", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.9159289002418518, + "learning_rate": 2.88e-06, + "log_odds_chosen": 0.19562272727489471, + "log_odds_ratio": -0.8590701222419739, + "logits/chosen": 1.1002824306488037, + "logits/rejected": 1.0790246725082397, + "logps/chosen": -3.0543551445007324, + "logps/rejected": -3.247206211090088, + "loss": 3.5340042114257812, + "nll_loss": 3.4480972290039062, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.30543553829193115, + "rewards/margins": 0.01928507350385189, + "rewards/rejected": -0.3247205913066864, + "step": 10 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.14816458523273468, + "eval_log_odds_ratio": -0.8236475586891174, + "eval_logits/chosen": 1.075466513633728, + "eval_logits/rejected": 1.069645881652832, + "eval_logps/chosen": -3.0302510261535645, + "eval_logps/rejected": -3.178657293319702, + "eval_loss": 3.465601921081543, + "eval_nll_loss": 3.383236885070801, + "eval_rewards/accuracies": 0.5440000295639038, + "eval_rewards/chosen": -0.30302515625953674, + "eval_rewards/margins": 0.014840577729046345, + "eval_rewards/rejected": -0.3178657293319702, + "eval_runtime": 52.3971, + "eval_samples_per_second": 9.543, + "eval_steps_per_second": 4.771, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 0.49432215094566345, + "learning_rate": 6.079999999999999e-06, + "log_odds_chosen": 0.051810234785079956, + "log_odds_ratio": -0.8136274218559265, + "logits/chosen": 1.0922297239303589, + "logits/rejected": 1.1531397104263306, + "logps/chosen": -3.053081750869751, + "logps/rejected": -3.1046817302703857, + "loss": 3.3105998992919923, + "nll_loss": 3.229236602783203, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.30530816316604614, + "rewards/margins": 0.005159988068044186, + "rewards/rejected": -0.3104681670665741, + "step": 20 + }, + { + "epoch": 0.16, + "eval_log_odds_chosen": 0.136577308177948, + "eval_log_odds_ratio": -0.810167133808136, + "eval_logits/chosen": 1.0986508131027222, + "eval_logits/rejected": 1.084808588027954, + "eval_logps/chosen": -2.914625644683838, + "eval_logps/rejected": -3.05191969871521, + "eval_loss": 3.170542001724243, + "eval_nll_loss": 3.0895254611968994, + "eval_rewards/accuracies": 0.5479999780654907, + "eval_rewards/chosen": -0.2914625406265259, + "eval_rewards/margins": 0.013729416765272617, + "eval_rewards/rejected": -0.3051919937133789, + "eval_runtime": 53.3238, + "eval_samples_per_second": 9.377, + "eval_steps_per_second": 4.688, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 0.4138321876525879, + "learning_rate": 7.857777777777777e-06, + "log_odds_chosen": 0.08315258473157883, + "log_odds_ratio": -0.7477900981903076, + "logits/chosen": 1.192463755607605, + "logits/rejected": 1.1664505004882812, + "logps/chosen": -2.7590999603271484, + "logps/rejected": -2.8307957649230957, + "loss": 2.969002532958984, + "nll_loss": 2.89422345161438, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.275909960269928, + "rewards/margins": 0.0071696205995976925, + "rewards/rejected": -0.2830796241760254, + "step": 30 + }, + { + "epoch": 0.24, + "eval_log_odds_chosen": 0.11084984242916107, + "eval_log_odds_ratio": -0.78859543800354, + "eval_logits/chosen": 1.1216861009597778, + "eval_logits/rejected": 1.0951279401779175, + "eval_logps/chosen": -2.7105276584625244, + "eval_logps/rejected": -2.823791265487671, + "eval_loss": 2.8847193717956543, + "eval_nll_loss": 2.8058602809906006, + "eval_rewards/accuracies": 0.5540000200271606, + "eval_rewards/chosen": -0.2710527777671814, + "eval_rewards/margins": 0.011326361447572708, + "eval_rewards/rejected": -0.2823791205883026, + "eval_runtime": 53.2199, + "eval_samples_per_second": 9.395, + "eval_steps_per_second": 4.697, + "step": 30 + }, + { + "epoch": 0.32, + "grad_norm": 0.34679412841796875, + "learning_rate": 7.502222222222222e-06, + "log_odds_chosen": 0.18765760958194733, + "log_odds_ratio": -0.7577568292617798, + "logits/chosen": 1.1061517000198364, + "logits/rejected": 1.107339859008789, + "logps/chosen": -2.500312566757202, + "logps/rejected": -2.6544313430786133, + "loss": 2.7145227432250976, + "nll_loss": 2.638747453689575, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2500312924385071, + "rewards/margins": 0.01541186310350895, + "rewards/rejected": -0.2654431462287903, + "step": 40 + }, + { + "epoch": 0.32, + "eval_log_odds_chosen": 0.08204474300146103, + "eval_log_odds_ratio": -0.7724552750587463, + "eval_logits/chosen": 1.092315912246704, + "eval_logits/rejected": 1.05488121509552, + "eval_logps/chosen": -2.5186259746551514, + "eval_logps/rejected": -2.6051390171051025, + "eval_loss": 2.6419382095336914, + "eval_nll_loss": 2.564692974090576, + "eval_rewards/accuracies": 0.5519999861717224, + "eval_rewards/chosen": -0.25186261534690857, + "eval_rewards/margins": 0.008651292882859707, + "eval_rewards/rejected": -0.26051390171051025, + "eval_runtime": 53.3316, + "eval_samples_per_second": 9.375, + "eval_steps_per_second": 4.688, + "step": 40 + }, + { + "epoch": 0.4, + "grad_norm": 0.3510463535785675, + "learning_rate": 7.146666666666666e-06, + "log_odds_chosen": 0.05859034135937691, + "log_odds_ratio": -0.7385914325714111, + "logits/chosen": 1.1375240087509155, + "logits/rejected": 1.0450371503829956, + "logps/chosen": -2.42551326751709, + "logps/rejected": -2.4860575199127197, + "loss": 2.46743106842041, + "nll_loss": 2.393571376800537, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.24255135655403137, + "rewards/margins": 0.006054399069398642, + "rewards/rejected": -0.24860575795173645, + "step": 50 + }, + { + "epoch": 0.4, + "eval_log_odds_chosen": 0.05682923272252083, + "eval_log_odds_ratio": -0.7677140235900879, + "eval_logits/chosen": 1.0331004858016968, + "eval_logits/rejected": 0.9877541065216064, + "eval_logps/chosen": -2.3998496532440186, + "eval_logps/rejected": -2.462172746658325, + "eval_loss": 2.429464817047119, + "eval_nll_loss": 2.352693557739258, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.23998498916625977, + "eval_rewards/margins": 0.0062323203310370445, + "eval_rewards/rejected": -0.24621731042861938, + "eval_runtime": 53.2401, + "eval_samples_per_second": 9.391, + "eval_steps_per_second": 4.696, + "step": 50 + }, + { + "epoch": 0.48, + "grad_norm": 0.20609980821609497, + "learning_rate": 6.791111111111111e-06, + "log_odds_chosen": 0.07380016148090363, + "log_odds_ratio": -0.7359625101089478, + "logits/chosen": 0.889764130115509, + "logits/rejected": 1.017002820968628, + "logps/chosen": -2.2573037147521973, + "logps/rejected": -2.305345058441162, + "loss": 2.3112926483154297, + "nll_loss": 2.2376961708068848, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.22573037445545197, + "rewards/margins": 0.004804140888154507, + "rewards/rejected": -0.23053452372550964, + "step": 60 + }, + { + "epoch": 0.48, + "eval_log_odds_chosen": 0.043189436197280884, + "eval_log_odds_ratio": -0.7669212222099304, + "eval_logits/chosen": 0.9866231679916382, + "eval_logits/rejected": 0.9370818734169006, + "eval_logps/chosen": -2.3348076343536377, + "eval_logps/rejected": -2.3840110301971436, + "eval_loss": 2.255201578140259, + "eval_nll_loss": 2.17850923538208, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.23348075151443481, + "eval_rewards/margins": 0.0049203126691281796, + "eval_rewards/rejected": -0.23840108513832092, + "eval_runtime": 53.4269, + "eval_samples_per_second": 9.359, + "eval_steps_per_second": 4.679, + "step": 60 + }, + { + "epoch": 0.56, + "grad_norm": 0.1984829604625702, + "learning_rate": 6.435555555555555e-06, + "log_odds_chosen": 0.00933628249913454, + "log_odds_ratio": -0.7400209903717041, + "logits/chosen": 1.046584129333496, + "logits/rejected": 1.0750269889831543, + "logps/chosen": -2.242738723754883, + "logps/rejected": -2.247896194458008, + "loss": 2.179552459716797, + "nll_loss": 2.1055500507354736, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.22427387535572052, + "rewards/margins": 0.0005157862906344235, + "rewards/rejected": -0.22478966414928436, + "step": 70 + }, + { + "epoch": 0.56, + "eval_log_odds_chosen": 0.037273552268743515, + "eval_log_odds_ratio": -0.7664892673492432, + "eval_logits/chosen": 0.9825413823127747, + "eval_logits/rejected": 0.9323441982269287, + "eval_logps/chosen": -2.2950439453125, + "eval_logps/rejected": -2.338361978530884, + "eval_loss": 2.161572217941284, + "eval_nll_loss": 2.084923267364502, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.22950439155101776, + "eval_rewards/margins": 0.004331790842115879, + "eval_rewards/rejected": -0.23383621871471405, + "eval_runtime": 53.2286, + "eval_samples_per_second": 9.393, + "eval_steps_per_second": 4.697, + "step": 70 + }, + { + "epoch": 0.64, + "grad_norm": 0.18926046788692474, + "learning_rate": 6.079999999999999e-06, + "log_odds_chosen": 0.16370263695716858, + "log_odds_ratio": -0.7065578699111938, + "logits/chosen": 0.9741449356079102, + "logits/rejected": 0.9439865350723267, + "logps/chosen": -2.226841926574707, + "logps/rejected": -2.386024236679077, + "loss": 2.133669853210449, + "nll_loss": 2.063014030456543, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22268421947956085, + "rewards/margins": 0.015918215736746788, + "rewards/rejected": -0.2386024296283722, + "step": 80 + }, + { + "epoch": 0.64, + "eval_log_odds_chosen": 0.03516349196434021, + "eval_log_odds_ratio": -0.7663520574569702, + "eval_logits/chosen": 1.022588849067688, + "eval_logits/rejected": 0.9730709791183472, + "eval_logps/chosen": -2.2640902996063232, + "eval_logps/rejected": -2.305067300796509, + "eval_loss": 2.0895278453826904, + "eval_nll_loss": 2.012892246246338, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.22640903294086456, + "eval_rewards/margins": 0.004097723867744207, + "eval_rewards/rejected": -0.2305067777633667, + "eval_runtime": 53.2228, + "eval_samples_per_second": 9.394, + "eval_steps_per_second": 4.697, + "step": 80 + }, + { + "epoch": 0.72, + "grad_norm": 0.15444700419902802, + "learning_rate": 5.724444444444444e-06, + "log_odds_chosen": 0.049940235912799835, + "log_odds_ratio": -0.7443691492080688, + "logits/chosen": 1.0501600503921509, + "logits/rejected": 1.016621708869934, + "logps/chosen": -2.2212884426116943, + "logps/rejected": -2.2772510051727295, + "loss": 2.045332908630371, + "nll_loss": 1.9708961248397827, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.22212886810302734, + "rewards/margins": 0.005596236325800419, + "rewards/rejected": -0.2277251034975052, + "step": 90 + }, + { + "epoch": 0.72, + "eval_log_odds_chosen": 0.033585965633392334, + "eval_log_odds_ratio": -0.766018271446228, + "eval_logits/chosen": 1.0759873390197754, + "eval_logits/rejected": 1.0270211696624756, + "eval_logps/chosen": -2.2369883060455322, + "eval_logps/rejected": -2.2760024070739746, + "eval_loss": 2.0355796813964844, + "eval_nll_loss": 1.9589776992797852, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.22369883954524994, + "eval_rewards/margins": 0.0039014029316604137, + "eval_rewards/rejected": -0.22760024666786194, + "eval_runtime": 53.2478, + "eval_samples_per_second": 9.39, + "eval_steps_per_second": 4.695, + "step": 90 + }, + { + "epoch": 0.8, + "grad_norm": 0.2406257539987564, + "learning_rate": 5.368888888888889e-06, + "log_odds_chosen": 0.040743522346019745, + "log_odds_ratio": -0.7394427061080933, + "logits/chosen": 0.9953991770744324, + "logits/rejected": 1.0465402603149414, + "logps/chosen": -2.2148966789245605, + "logps/rejected": -2.239614486694336, + "loss": 2.024346923828125, + "nll_loss": 1.950402855873108, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.22148966789245605, + "rewards/margins": 0.002471799496561289, + "rewards/rejected": -0.2239614725112915, + "step": 100 + }, + { + "epoch": 0.8, + "eval_log_odds_chosen": 0.03151147812604904, + "eval_log_odds_ratio": -0.7650076746940613, + "eval_logits/chosen": 1.090487003326416, + "eval_logits/rejected": 1.0410875082015991, + "eval_logps/chosen": -2.2083094120025635, + "eval_logps/rejected": -2.2448291778564453, + "eval_loss": 1.9928127527236938, + "eval_nll_loss": 1.9163117408752441, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.22083096206188202, + "eval_rewards/margins": 0.003651980310678482, + "eval_rewards/rejected": -0.2244829386472702, + "eval_runtime": 53.2544, + "eval_samples_per_second": 9.389, + "eval_steps_per_second": 4.694, + "step": 100 + }, + { + "epoch": 0.88, + "grad_norm": 0.14265325665473938, + "learning_rate": 5.013333333333333e-06, + "log_odds_chosen": -0.10137738287448883, + "log_odds_ratio": -0.8407853245735168, + "logits/chosen": 1.1193276643753052, + "logits/rejected": 1.0871385335922241, + "logps/chosen": -2.144883632659912, + "logps/rejected": -2.0626511573791504, + "loss": 1.977994155883789, + "nll_loss": 1.8939154148101807, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.21448838710784912, + "rewards/margins": -0.008223267272114754, + "rewards/rejected": -0.20626512169837952, + "step": 110 + }, + { + "epoch": 0.88, + "eval_log_odds_chosen": 0.0301420409232378, + "eval_log_odds_ratio": -0.764117419719696, + "eval_logits/chosen": 1.1084222793579102, + "eval_logits/rejected": 1.059097170829773, + "eval_logps/chosen": -2.1800594329833984, + "eval_logps/rejected": -2.214940071105957, + "eval_loss": 1.9601762294769287, + "eval_nll_loss": 1.883764386177063, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.2180059254169464, + "eval_rewards/margins": 0.0034881029278039932, + "eval_rewards/rejected": -0.22149403393268585, + "eval_runtime": 53.2746, + "eval_samples_per_second": 9.385, + "eval_steps_per_second": 4.693, + "step": 110 + }, + { + "epoch": 0.96, + "grad_norm": 0.17654533684253693, + "learning_rate": 4.657777777777778e-06, + "log_odds_chosen": 0.07624180614948273, + "log_odds_ratio": -0.7496079802513123, + "logits/chosen": 1.138927698135376, + "logits/rejected": 1.1692471504211426, + "logps/chosen": -2.1000325679779053, + "logps/rejected": -2.1689352989196777, + "loss": 1.944967269897461, + "nll_loss": 1.8700063228607178, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.21000322699546814, + "rewards/margins": 0.0068902671337127686, + "rewards/rejected": -0.2168935239315033, + "step": 120 + }, + { + "epoch": 0.96, + "eval_log_odds_chosen": 0.029821552336215973, + "eval_log_odds_ratio": -0.763667643070221, + "eval_logits/chosen": 1.1056278944015503, + "eval_logits/rejected": 1.056398630142212, + "eval_logps/chosen": -2.158470630645752, + "eval_logps/rejected": -2.1928257942199707, + "eval_loss": 1.93770432472229, + "eval_nll_loss": 1.8613375425338745, + "eval_rewards/accuracies": 0.5220000147819519, + "eval_rewards/chosen": -0.21584708988666534, + "eval_rewards/margins": 0.0034354773815721273, + "eval_rewards/rejected": -0.21928256750106812, + "eval_runtime": 53.2433, + "eval_samples_per_second": 9.391, + "eval_steps_per_second": 4.695, + "step": 120 + }, + { + "epoch": 1.04, + "grad_norm": 0.1827414184808731, + "learning_rate": 4.302222222222222e-06, + "log_odds_chosen": -0.001232819282449782, + "log_odds_ratio": -0.7414854764938354, + "logits/chosen": 1.2297706604003906, + "logits/rejected": 1.1010440587997437, + "logps/chosen": -1.9639602899551392, + "logps/rejected": -1.962048888206482, + "loss": 1.9267074584960937, + "nll_loss": 1.8525588512420654, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19639602303504944, + "rewards/margins": -0.00019112862355541438, + "rewards/rejected": -0.19620490074157715, + "step": 130 + }, + { + "epoch": 1.04, + "eval_log_odds_chosen": 0.029177702963352203, + "eval_log_odds_ratio": -0.7638773322105408, + "eval_logits/chosen": 1.1144956350326538, + "eval_logits/rejected": 1.0663033723831177, + "eval_logps/chosen": -2.144742965698242, + "eval_logps/rejected": -2.1784965991973877, + "eval_loss": 1.920721173286438, + "eval_nll_loss": 1.844333529472351, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.21447430551052094, + "eval_rewards/margins": 0.0033753456082195044, + "eval_rewards/rejected": -0.21784964203834534, + "eval_runtime": 53.2128, + "eval_samples_per_second": 9.396, + "eval_steps_per_second": 4.698, + "step": 130 + }, + { + "epoch": 1.12, + "grad_norm": 0.16919028759002686, + "learning_rate": 3.9466666666666664e-06, + "log_odds_chosen": -0.11807477474212646, + "log_odds_ratio": -0.8465667963027954, + "logits/chosen": 1.1062188148498535, + "logits/rejected": 1.1064749956130981, + "logps/chosen": -2.1417267322540283, + "logps/rejected": -2.0469985008239746, + "loss": 1.938684844970703, + "nll_loss": 1.854028344154358, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21417267620563507, + "rewards/margins": -0.009472792968153954, + "rewards/rejected": -0.20469987392425537, + "step": 140 + }, + { + "epoch": 1.12, + "eval_log_odds_chosen": 0.028810443356633186, + "eval_log_odds_ratio": -0.7637351155281067, + "eval_logits/chosen": 1.1457773447036743, + "eval_logits/rejected": 1.0984731912612915, + "eval_logps/chosen": -2.1359357833862305, + "eval_logps/rejected": -2.1694324016571045, + "eval_loss": 1.907101035118103, + "eval_nll_loss": 1.8307276964187622, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.21359357237815857, + "eval_rewards/margins": 0.0033496527466923, + "eval_rewards/rejected": -0.21694323420524597, + "eval_runtime": 195.8742, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 1.276, + "step": 140 + }, + { + "epoch": 1.2, + "grad_norm": 0.12811271846294403, + "learning_rate": 3.591111111111111e-06, + "log_odds_chosen": 0.10773544013500214, + "log_odds_ratio": -0.6882486343383789, + "logits/chosen": 1.1967742443084717, + "logits/rejected": 1.2303117513656616, + "logps/chosen": -2.0538058280944824, + "logps/rejected": -2.147270679473877, + "loss": 1.928931427001953, + "nll_loss": 1.8601064682006836, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20538058876991272, + "rewards/margins": 0.009346459992229939, + "rewards/rejected": -0.21472707390785217, + "step": 150 + }, + { + "epoch": 1.2, + "eval_log_odds_chosen": 0.029405459761619568, + "eval_log_odds_ratio": -0.7636828422546387, + "eval_logits/chosen": 1.1795330047607422, + "eval_logits/rejected": 1.1324467658996582, + "eval_logps/chosen": -2.1314361095428467, + "eval_logps/rejected": -2.165649175643921, + "eval_loss": 1.8977786302566528, + "eval_nll_loss": 1.8214104175567627, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.21314364671707153, + "eval_rewards/margins": 0.0034212982282042503, + "eval_rewards/rejected": -0.21656493842601776, + "eval_runtime": 53.2699, + "eval_samples_per_second": 9.386, + "eval_steps_per_second": 4.693, + "step": 150 + }, + { + "epoch": 1.28, + "grad_norm": 0.1704077273607254, + "learning_rate": 3.235555555555555e-06, + "log_odds_chosen": 0.14702661335468292, + "log_odds_ratio": -0.6978052854537964, + "logits/chosen": 1.1281222105026245, + "logits/rejected": 1.2165471315383911, + "logps/chosen": -2.0752506256103516, + "logps/rejected": -2.201073169708252, + "loss": 1.8814886093139649, + "nll_loss": 1.8117080926895142, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2075250893831253, + "rewards/margins": 0.012582269497215748, + "rewards/rejected": -0.22010734677314758, + "step": 160 + }, + { + "epoch": 1.28, + "eval_log_odds_chosen": 0.0292374175041914, + "eval_log_odds_ratio": -0.763826310634613, + "eval_logits/chosen": 1.2024222612380981, + "eval_logits/rejected": 1.1555261611938477, + "eval_logps/chosen": -2.1266942024230957, + "eval_logps/rejected": -2.160792350769043, + "eval_loss": 1.8912572860717773, + "eval_nll_loss": 1.8148746490478516, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21266941726207733, + "eval_rewards/margins": 0.003409823402762413, + "eval_rewards/rejected": -0.2160792201757431, + "eval_runtime": 53.2802, + "eval_samples_per_second": 9.384, + "eval_steps_per_second": 4.692, + "step": 160 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.1710982620716095, + "learning_rate": 2.88e-06, + "log_odds_chosen": 0.1420261412858963, + "log_odds_ratio": -0.6900883913040161, + "logits/chosen": 1.0993479490280151, + "logits/rejected": 1.2283376455307007, + "logps/chosen": -2.066962242126465, + "logps/rejected": -2.177448034286499, + "loss": 1.8723211288452148, + "nll_loss": 1.8033123016357422, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20669622719287872, + "rewards/margins": 0.011048593558371067, + "rewards/rejected": -0.21774479746818542, + "step": 170 + }, + { + "epoch": 1.3599999999999999, + "eval_log_odds_chosen": 0.028749050572514534, + "eval_log_odds_ratio": -0.763710081577301, + "eval_logits/chosen": 1.197767734527588, + "eval_logits/rejected": 1.1507377624511719, + "eval_logps/chosen": -2.1212124824523926, + "eval_logps/rejected": -2.1547811031341553, + "eval_loss": 1.8862611055374146, + "eval_nll_loss": 1.8098900318145752, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21212123334407806, + "eval_rewards/margins": 0.003356893314048648, + "eval_rewards/rejected": -0.21547812223434448, + "eval_runtime": 53.2604, + "eval_samples_per_second": 9.388, + "eval_steps_per_second": 4.694, + "step": 170 + }, + { + "epoch": 1.44, + "grad_norm": 0.14521150290966034, + "learning_rate": 2.5244444444444443e-06, + "log_odds_chosen": -0.005218303296715021, + "log_odds_ratio": -0.8199856877326965, + "logits/chosen": 1.2382605075836182, + "logits/rejected": 1.1947174072265625, + "logps/chosen": -2.0978918075561523, + "logps/rejected": -2.1037724018096924, + "loss": 1.8984029769897461, + "nll_loss": 1.8164045810699463, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2097891867160797, + "rewards/margins": 0.0005880504613742232, + "rewards/rejected": -0.21037724614143372, + "step": 180 + }, + { + "epoch": 1.44, + "eval_log_odds_chosen": 0.02963738888502121, + "eval_log_odds_ratio": -0.7632617950439453, + "eval_logits/chosen": 1.2076901197433472, + "eval_logits/rejected": 1.1605749130249023, + "eval_logps/chosen": -2.117640733718872, + "eval_logps/rejected": -2.1519222259521484, + "eval_loss": 1.8823505640029907, + "eval_nll_loss": 1.806024432182312, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.21176405251026154, + "eval_rewards/margins": 0.003428164403885603, + "eval_rewards/rejected": -0.21519219875335693, + "eval_runtime": 53.2338, + "eval_samples_per_second": 9.393, + "eval_steps_per_second": 4.696, + "step": 180 + }, + { + "epoch": 1.52, + "grad_norm": 0.1563858985900879, + "learning_rate": 2.168888888888889e-06, + "log_odds_chosen": 0.08264724910259247, + "log_odds_ratio": -0.6953542232513428, + "logits/chosen": 1.2215197086334229, + "logits/rejected": 1.1691173315048218, + "logps/chosen": -2.031200885772705, + "logps/rejected": -2.0946972370147705, + "loss": 1.912965965270996, + "nll_loss": 1.843430519104004, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.2031201422214508, + "rewards/margins": 0.006349604576826096, + "rewards/rejected": -0.20946970582008362, + "step": 190 + }, + { + "epoch": 1.52, + "eval_log_odds_chosen": 0.030445439741015434, + "eval_log_odds_ratio": -0.7631585597991943, + "eval_logits/chosen": 1.2250813245773315, + "eval_logits/rejected": 1.1781257390975952, + "eval_logps/chosen": -2.1141271591186523, + "eval_logps/rejected": -2.149076223373413, + "eval_loss": 1.879238247871399, + "eval_nll_loss": 1.802922248840332, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.211412712931633, + "eval_rewards/margins": 0.0034949015825986862, + "eval_rewards/rejected": -0.21490761637687683, + "eval_runtime": 53.2657, + "eval_samples_per_second": 9.387, + "eval_steps_per_second": 4.693, + "step": 190 + }, + { + "epoch": 1.6, + "grad_norm": 0.14611385762691498, + "learning_rate": 1.813333333333333e-06, + "log_odds_chosen": -0.10572721809148788, + "log_odds_ratio": -0.8197598457336426, + "logits/chosen": 1.202487826347351, + "logits/rejected": 1.1040087938308716, + "logps/chosen": -2.1150729656219482, + "logps/rejected": -2.0351948738098145, + "loss": 1.9483623504638672, + "nll_loss": 1.86638605594635, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.21150732040405273, + "rewards/margins": -0.007987814024090767, + "rewards/rejected": -0.20351949334144592, + "step": 200 + }, + { + "epoch": 1.6, + "eval_log_odds_chosen": 0.0310534480959177, + "eval_log_odds_ratio": -0.7631565928459167, + "eval_logits/chosen": 1.2298425436019897, + "eval_logits/rejected": 1.1832287311553955, + "eval_logps/chosen": -2.11133074760437, + "eval_logps/rejected": -2.1467576026916504, + "eval_loss": 1.8768765926361084, + "eval_nll_loss": 1.8005608320236206, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.21113307774066925, + "eval_rewards/margins": 0.0035426774993538857, + "eval_rewards/rejected": -0.21467576920986176, + "eval_runtime": 53.2607, + "eval_samples_per_second": 9.388, + "eval_steps_per_second": 4.694, + "step": 200 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.17115949094295502, + "learning_rate": 1.4577777777777778e-06, + "log_odds_chosen": 0.15168903768062592, + "log_odds_ratio": -0.6794174909591675, + "logits/chosen": 1.2208305597305298, + "logits/rejected": 1.1591063737869263, + "logps/chosen": -2.031393527984619, + "logps/rejected": -2.164114475250244, + "loss": 1.833169937133789, + "nll_loss": 1.7652279138565063, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20313934981822968, + "rewards/margins": 0.01327207125723362, + "rewards/rejected": -0.21641144156455994, + "step": 210 + }, + { + "epoch": 1.6800000000000002, + "eval_log_odds_chosen": 0.03077588975429535, + "eval_log_odds_ratio": -0.7630413770675659, + "eval_logits/chosen": 1.2368669509887695, + "eval_logits/rejected": 1.1905410289764404, + "eval_logps/chosen": -2.1100478172302246, + "eval_logps/rejected": -2.1452219486236572, + "eval_loss": 1.8748760223388672, + "eval_nll_loss": 1.7985717058181763, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21100479364395142, + "eval_rewards/margins": 0.003517415374517441, + "eval_rewards/rejected": -0.21452219784259796, + "eval_runtime": 53.2709, + "eval_samples_per_second": 9.386, + "eval_steps_per_second": 4.693, + "step": 210 + }, + { + "epoch": 1.76, + "grad_norm": 0.136911541223526, + "learning_rate": 1.1022222222222222e-06, + "log_odds_chosen": -0.004690551199018955, + "log_odds_ratio": -0.7404999732971191, + "logits/chosen": 1.3654903173446655, + "logits/rejected": 1.3405206203460693, + "logps/chosen": -2.1515979766845703, + "logps/rejected": -2.1524949073791504, + "loss": 1.935328483581543, + "nll_loss": 1.8612781763076782, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2151598036289215, + "rewards/margins": 8.968896872829646e-05, + "rewards/rejected": -0.21524949371814728, + "step": 220 + }, + { + "epoch": 1.76, + "eval_log_odds_chosen": 0.03143003210425377, + "eval_log_odds_ratio": -0.7629221081733704, + "eval_logits/chosen": 1.2453513145446777, + "eval_logits/rejected": 1.1991850137710571, + "eval_logps/chosen": -2.108854055404663, + "eval_logps/rejected": -2.14459490776062, + "eval_loss": 1.8734816312789917, + "eval_nll_loss": 1.797189474105835, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21088536083698273, + "eval_rewards/margins": 0.00357412570156157, + "eval_rewards/rejected": -0.21445949375629425, + "eval_runtime": 53.3291, + "eval_samples_per_second": 9.376, + "eval_steps_per_second": 4.688, + "step": 220 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.14719471335411072, + "learning_rate": 7.466666666666667e-07, + "log_odds_chosen": 0.06841103732585907, + "log_odds_ratio": -0.7776535749435425, + "logits/chosen": 1.1754335165023804, + "logits/rejected": 1.1915993690490723, + "logps/chosen": -2.027627468109131, + "logps/rejected": -2.0764873027801514, + "loss": 1.8516361236572265, + "nll_loss": 1.7738704681396484, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20276275277137756, + "rewards/margins": 0.004885983653366566, + "rewards/rejected": -0.20764870941638947, + "step": 230 + }, + { + "epoch": 1.8399999999999999, + "eval_log_odds_chosen": 0.03179427608847618, + "eval_log_odds_ratio": -0.7629269957542419, + "eval_logits/chosen": 1.2505085468292236, + "eval_logits/rejected": 1.2043695449829102, + "eval_logps/chosen": -2.1080172061920166, + "eval_logps/rejected": -2.1440720558166504, + "eval_loss": 1.8725281953811646, + "eval_nll_loss": 1.79623544216156, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21080172061920166, + "eval_rewards/margins": 0.0036054973024874926, + "eval_rewards/rejected": -0.21440719068050385, + "eval_runtime": 53.2692, + "eval_samples_per_second": 9.386, + "eval_steps_per_second": 4.693, + "step": 230 + }, + { + "epoch": 1.92, + "grad_norm": 0.21970191597938538, + "learning_rate": 3.911111111111111e-07, + "log_odds_chosen": 0.1418900340795517, + "log_odds_ratio": -0.7076598405838013, + "logits/chosen": 1.2581008672714233, + "logits/rejected": 1.2762272357940674, + "logps/chosen": -2.008291244506836, + "logps/rejected": -2.139103889465332, + "loss": 1.915215301513672, + "nll_loss": 1.8444492816925049, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2008291482925415, + "rewards/margins": 0.0130812618881464, + "rewards/rejected": -0.21391041576862335, + "step": 240 + }, + { + "epoch": 1.92, + "eval_log_odds_chosen": 0.031775180250406265, + "eval_log_odds_ratio": -0.7629340887069702, + "eval_logits/chosen": 1.254185438156128, + "eval_logits/rejected": 1.208067536354065, + "eval_logps/chosen": -2.1076254844665527, + "eval_logps/rejected": -2.1436824798583984, + "eval_loss": 1.8719455003738403, + "eval_nll_loss": 1.7956523895263672, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21076256036758423, + "eval_rewards/margins": 0.003605667734518647, + "eval_rewards/rejected": -0.21436822414398193, + "eval_runtime": 53.3098, + "eval_samples_per_second": 9.379, + "eval_steps_per_second": 4.69, + "step": 240 + }, + { + "epoch": 2.0, + "grad_norm": 0.132158100605011, + "learning_rate": 3.5555555555555554e-08, + "log_odds_chosen": 0.10256216675043106, + "log_odds_ratio": -0.7014639973640442, + "logits/chosen": 1.3629863262176514, + "logits/rejected": 1.4673380851745605, + "logps/chosen": -2.040670156478882, + "logps/rejected": -2.1362738609313965, + "loss": 1.878919792175293, + "nll_loss": 1.8087730407714844, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20406702160835266, + "rewards/margins": 0.009560374543070793, + "rewards/rejected": -0.2136273831129074, + "step": 250 + }, + { + "epoch": 2.0, + "eval_log_odds_chosen": 0.03193635866045952, + "eval_log_odds_ratio": -0.7628265023231506, + "eval_logits/chosen": 1.2564101219177246, + "eval_logits/rejected": 1.2103327512741089, + "eval_logps/chosen": -2.1072919368743896, + "eval_logps/rejected": -2.1434714794158936, + "eval_loss": 1.8716939687728882, + "eval_nll_loss": 1.7954113483428955, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.21072918176651, + "eval_rewards/margins": 0.0036179511807858944, + "eval_rewards/rejected": -0.21434716880321503, + "eval_runtime": 53.2589, + "eval_samples_per_second": 9.388, + "eval_steps_per_second": 4.694, + "step": 250 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-250/training_args.bin b/v5/ORPO/ORPO_1k/lora/checkpoint-250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..21f17e0abb46ed5f8c3ca052b462718090535629 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6f6a67bdcce98c9a6e36d43a0e7aee42aa33a0edf20e2bc530a89f6cddc45c +size 5457 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/README.md b/v5/ORPO/ORPO_1k/lora/checkpoint-30/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/adapter_config.json b/v5/ORPO/ORPO_1k/lora/checkpoint-30/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..38a3e195fc5044bb8b51d3f0386d896bb63d9faa --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "q_proj", + "gate_proj", + "down_proj", + "o_proj", + "k_proj", + "v_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/adapter_model.safetensors b/v5/ORPO/ORPO_1k/lora/checkpoint-30/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..579ab87f394a29a86eaabb28801efdb8b7c00ddc --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7df792f8ba8f6c34e83f9250fadc524a27d9c5cdeb952cd1dee6f468da5606 +size 180385008 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/chat_template.jinja b/v5/ORPO/ORPO_1k/lora/checkpoint-30/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/optimizer.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-30/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb7793cb2ab4e86dc27a95ff75606691b5f65deb --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fd4ebb3cf2f72d13c2b758dc4accdb7b6ebb92cba838b5541d2961e3b16c47a +size 360902475 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/rng_state.pth b/v5/ORPO/ORPO_1k/lora/checkpoint-30/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..435e005883bf4440218c894822b086abf80abfc0 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e2011629d8bed3ef560fa11175cac55684c4e12a72634bb24abf767b6c7399 +size 14645 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/scaler.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-30/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6f821143042de202020205f18b0f074307d3e1cb --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20ab57b9b26fc7cb4418a4e1198e25ebb1da623aea7693e1fc71ff284d45724b +size 1383 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/scheduler.pt b/v5/ORPO/ORPO_1k/lora/checkpoint-30/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3927ac0b2979bf1e8476afc622aac66881d7c4c0 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c855dd7c449ae76f80b32fa6d45a7998d7f47a357685492fa67654b3914e15b +size 1465 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/tokenizer.json b/v5/ORPO/ORPO_1k/lora/checkpoint-30/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/tokenizer_config.json b/v5/ORPO/ORPO_1k/lora/checkpoint-30/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/trainer_state.json b/v5/ORPO/ORPO_1k/lora/checkpoint-30/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1379b4fe98278b666b660385a2c563473db9599f --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/trainer_state.json @@ -0,0 +1,145 @@ +{ + "best_global_step": 30, + "best_metric": 0.5540000200271606, + "best_model_checkpoint": "output/lora/checkpoint-30", + "epoch": 0.24, + "eval_steps": 10, + "global_step": 30, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.08, + "grad_norm": 0.9159289002418518, + "learning_rate": 2.88e-06, + "log_odds_chosen": 0.19562272727489471, + "log_odds_ratio": -0.8590701222419739, + "logits/chosen": 1.1002824306488037, + "logits/rejected": 1.0790246725082397, + "logps/chosen": -3.0543551445007324, + "logps/rejected": -3.247206211090088, + "loss": 3.5340042114257812, + "nll_loss": 3.4480972290039062, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.30543553829193115, + "rewards/margins": 0.01928507350385189, + "rewards/rejected": -0.3247205913066864, + "step": 10 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.14816458523273468, + "eval_log_odds_ratio": -0.8236475586891174, + "eval_logits/chosen": 1.075466513633728, + "eval_logits/rejected": 1.069645881652832, + "eval_logps/chosen": -3.0302510261535645, + "eval_logps/rejected": -3.178657293319702, + "eval_loss": 3.465601921081543, + "eval_nll_loss": 3.383236885070801, + "eval_rewards/accuracies": 0.5440000295639038, + "eval_rewards/chosen": -0.30302515625953674, + "eval_rewards/margins": 0.014840577729046345, + "eval_rewards/rejected": -0.3178657293319702, + "eval_runtime": 52.3971, + "eval_samples_per_second": 9.543, + "eval_steps_per_second": 4.771, + "step": 10 + }, + { + "epoch": 0.16, + "grad_norm": 0.49432215094566345, + "learning_rate": 6.079999999999999e-06, + "log_odds_chosen": 0.051810234785079956, + "log_odds_ratio": -0.8136274218559265, + "logits/chosen": 1.0922297239303589, + "logits/rejected": 1.1531397104263306, + "logps/chosen": -3.053081750869751, + "logps/rejected": -3.1046817302703857, + "loss": 3.3105998992919923, + "nll_loss": 3.229236602783203, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.30530816316604614, + "rewards/margins": 0.005159988068044186, + "rewards/rejected": -0.3104681670665741, + "step": 20 + }, + { + "epoch": 0.16, + "eval_log_odds_chosen": 0.136577308177948, + "eval_log_odds_ratio": -0.810167133808136, + "eval_logits/chosen": 1.0986508131027222, + "eval_logits/rejected": 1.084808588027954, + "eval_logps/chosen": -2.914625644683838, + "eval_logps/rejected": -3.05191969871521, + "eval_loss": 3.170542001724243, + "eval_nll_loss": 3.0895254611968994, + "eval_rewards/accuracies": 0.5479999780654907, + "eval_rewards/chosen": -0.2914625406265259, + "eval_rewards/margins": 0.013729416765272617, + "eval_rewards/rejected": -0.3051919937133789, + "eval_runtime": 53.3238, + "eval_samples_per_second": 9.377, + "eval_steps_per_second": 4.688, + "step": 20 + }, + { + "epoch": 0.24, + "grad_norm": 0.4138321876525879, + "learning_rate": 7.857777777777777e-06, + "log_odds_chosen": 0.08315258473157883, + "log_odds_ratio": -0.7477900981903076, + "logits/chosen": 1.192463755607605, + "logits/rejected": 1.1664505004882812, + "logps/chosen": -2.7590999603271484, + "logps/rejected": -2.8307957649230957, + "loss": 2.969002532958984, + "nll_loss": 2.89422345161438, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.275909960269928, + "rewards/margins": 0.0071696205995976925, + "rewards/rejected": -0.2830796241760254, + "step": 30 + }, + { + "epoch": 0.24, + "eval_log_odds_chosen": 0.11084984242916107, + "eval_log_odds_ratio": -0.78859543800354, + "eval_logits/chosen": 1.1216861009597778, + "eval_logits/rejected": 1.0951279401779175, + "eval_logps/chosen": -2.7105276584625244, + "eval_logps/rejected": -2.823791265487671, + "eval_loss": 2.8847193717956543, + "eval_nll_loss": 2.8058602809906006, + "eval_rewards/accuracies": 0.5540000200271606, + "eval_rewards/chosen": -0.2710527777671814, + "eval_rewards/margins": 0.011326361447572708, + "eval_rewards/rejected": -0.2823791205883026, + "eval_runtime": 53.2199, + "eval_samples_per_second": 9.395, + "eval_steps_per_second": 4.697, + "step": 30 + } + ], + "logging_steps": 10, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_1k/lora/checkpoint-30/training_args.bin b/v5/ORPO/ORPO_1k/lora/checkpoint-30/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..21f17e0abb46ed5f8c3ca052b462718090535629 --- /dev/null +++ b/v5/ORPO/ORPO_1k/lora/checkpoint-30/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d6f6a67bdcce98c9a6e36d43a0e7aee42aa33a0edf20e2bc530a89f6cddc45c +size 5457 diff --git a/v5/ORPO/ORPO_5k/MORPO_5k/chat_template.jinja b/v5/ORPO/ORPO_5k/MORPO_5k/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_5k/MORPO_5k/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_5k/MORPO_5k/config.json b/v5/ORPO/ORPO_5k/MORPO_5k/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ec18bc1ae18922052a57480e28401f3b9c6b84 --- /dev/null +++ b/v5/ORPO/ORPO_5k/MORPO_5k/config.json @@ -0,0 +1,40 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "float16", + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "pad_token_id": null, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_parameters": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_theta": 500000.0, + "rope_type": "llama3" + }, + "tie_word_embeddings": true, + "transformers_version": "5.2.0", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/v5/ORPO/ORPO_5k/MORPO_5k/generation_config.json b/v5/ORPO/ORPO_5k/MORPO_5k/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..898a2e748a41a4b7a931b754f8abbdac02039fae --- /dev/null +++ b/v5/ORPO/ORPO_5k/MORPO_5k/generation_config.json @@ -0,0 +1,12 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "5.2.0" +} diff --git a/v5/ORPO/ORPO_5k/MORPO_5k/model.safetensors b/v5/ORPO/ORPO_5k/MORPO_5k/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b68432659d4f550bbba845527b9f6bfe25578502 --- /dev/null +++ b/v5/ORPO/ORPO_5k/MORPO_5k/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a839f05295ed832c5dc04bf7bb2fb60a25c19f95496a6c86de9ea77e17c0de5f +size 2471645464 diff --git a/v5/ORPO/ORPO_5k/MORPO_5k/tokenizer.json b/v5/ORPO/ORPO_5k/MORPO_5k/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_5k/MORPO_5k/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_5k/MORPO_5k/tokenizer_config.json b/v5/ORPO/ORPO_5k/MORPO_5k/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_5k/MORPO_5k/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_5k/ORPO_5k/README.md b/v5/ORPO/ORPO_5k/ORPO_5k/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_5k/ORPO_5k/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/ORPO_5k/adapter_config.json b/v5/ORPO/ORPO_5k/ORPO_5k/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a69561f79e9919bdd2eb3aaaca6c08223d07d2b5 --- /dev/null +++ b/v5/ORPO/ORPO_5k/ORPO_5k/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/ORPO_5k/adapter_model.safetensors b/v5/ORPO/ORPO_5k/ORPO_5k/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1be825c4bc69dd5833ac59543703bfa19280668f --- /dev/null +++ b/v5/ORPO/ORPO_5k/ORPO_5k/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d7dde76c0a0f56e48f453104e8e0b46430c743c886fca1cff40881606bc2312 +size 180385008 diff --git a/v5/ORPO/ORPO_5k/lora/README.md b/v5/ORPO/ORPO_5k/lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0c73196b15dd0d8de40d4224f947442ac307b8d6 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/README.md @@ -0,0 +1,66 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: transformers +model_name: lora +tags: +- generated_from_trainer +- trl +- orpo +licence: license +--- + +# Model Card for lora + +This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct). +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="None", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/sea-rod/huggingface/runs/eqxngynt) + + +This model was trained with ORPO, a method introduced in [ORPO: Monolithic Preference Optimization without Reference Model](https://huggingface.co/papers/2403.07691). + +### Framework versions + +- TRL: 0.28.0 +- Transformers: 5.2.0 +- Pytorch: 2.10.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + +Cite ORPO as: + +```bibtex +@article{hong2024orpo, + title = {{ORPO: Monolithic Preference Optimization without Reference Model}}, + author = {Jiwoo Hong and Noah Lee and James Thorne}, + year = 2024, + eprint = {arXiv:2403.07691} +} +``` + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/README.md b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/adapter_config.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a69561f79e9919bdd2eb3aaaca6c08223d07d2b5 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/adapter_model.safetensors b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..399a62e328e5f50dc453872006f06c79b4feb1a2 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b5a9a9664055d96ea8cc24775d5fed708ed89d9821ead91e45e0ff15cabc65e +size 180385008 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/chat_template.jinja b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/optimizer.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..08c92e22fccc63b21f999ad23d7db5a137f396ae --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd5d190f3fe840dedd3578ec029df90e35a3110e17998a12420ebe0fd7ae21c6 +size 360902475 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/rng_state.pth b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1062af8f13c012194172bd08b0cf5acef1661de7 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe17a5fe328cdb084ec10afd49d2cb7bb9035b6736206385c2c19bf9f66c41 +size 14645 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/scaler.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3529b9e1021ddc95e3af7b2d72233fab602a2d19 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18783150ac09b6b81cea5af47876a10bfe5f36c3d76aca4ffce5382bdfaf7b28 +size 1383 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/scheduler.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f34f7ce88f7ddab7a609e9204a7a677a46a83fbd --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90967a68c5ed1a2c3456d966b62c9e097a88767bdd9b2401fe25a57273df4f6d +size 1465 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/tokenizer.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/tokenizer_config.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/trainer_state.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8a2705b0db967b80ae88d8134a8a4b591e9ef608 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/trainer_state.json @@ -0,0 +1,2650 @@ +{ + "best_global_step": 50, + "best_metric": 0.5519999861717224, + "best_model_checkpoint": "output/lora/checkpoint-50", + "epoch": 1.92, + "eval_steps": 50, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 1.3608590364456177, + "learning_rate": 5.76e-07, + "log_odds_chosen": 0.0073966654017567635, + "log_odds_ratio": -0.8660133481025696, + "logits/chosen": 1.1517311334609985, + "logits/rejected": 1.1107122898101807, + "logps/chosen": -3.0449740886688232, + "logps/rejected": -3.0518546104431152, + "loss": 3.4909488677978517, + "nll_loss": 3.4043469429016113, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3044974207878113, + "rewards/margins": 0.0006880179280415177, + "rewards/rejected": -0.3051854372024536, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 1.8278084993362427, + "learning_rate": 1.2159999999999999e-06, + "log_odds_chosen": -0.0764567106962204, + "log_odds_ratio": -0.9281005859375, + "logits/chosen": 0.985865592956543, + "logits/rejected": 0.9893043637275696, + "logps/chosen": -3.195783853530884, + "logps/rejected": -3.128960132598877, + "loss": 3.6714431762695314, + "nll_loss": 3.5786330699920654, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.3195783793926239, + "rewards/margins": -0.006682366132736206, + "rewards/rejected": -0.3128960430622101, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 0.7320559024810791, + "learning_rate": 1.856e-06, + "log_odds_chosen": -0.13351905345916748, + "log_odds_ratio": -0.968097984790802, + "logits/chosen": 1.097598910331726, + "logits/rejected": 1.1367751359939575, + "logps/chosen": -3.1909520626068115, + "logps/rejected": -3.0626091957092285, + "loss": 3.345610427856445, + "nll_loss": 3.2488014698028564, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3190951943397522, + "rewards/margins": -0.01283429004251957, + "rewards/rejected": -0.3062609136104584, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 0.6406434178352356, + "learning_rate": 2.496e-06, + "log_odds_chosen": 0.0689389556646347, + "log_odds_ratio": -0.7773251533508301, + "logits/chosen": 1.0645023584365845, + "logits/rejected": 1.0285594463348389, + "logps/chosen": -2.8054141998291016, + "logps/rejected": -2.8708128929138184, + "loss": 3.268035125732422, + "nll_loss": 3.190302848815918, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.28054141998291016, + "rewards/margins": 0.006539878435432911, + "rewards/rejected": -0.2870813012123108, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 0.5944439172744751, + "learning_rate": 3.136e-06, + "log_odds_chosen": -0.14803443849086761, + "log_odds_ratio": -0.9101846814155579, + "logits/chosen": 1.166550874710083, + "logits/rejected": 1.1396485567092896, + "logps/chosen": -2.988274335861206, + "logps/rejected": -2.8451037406921387, + "loss": 3.138271141052246, + "nll_loss": 3.0472521781921387, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.2988274395465851, + "rewards/margins": -0.01431706827133894, + "rewards/rejected": -0.28451037406921387, + "step": 50 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.13235610723495483, + "eval_log_odds_ratio": -0.8047618269920349, + "eval_logits/chosen": 1.097177267074585, + "eval_logits/rejected": 1.080869197845459, + "eval_logps/chosen": -2.87162446975708, + "eval_logps/rejected": -3.0049262046813965, + "eval_loss": 3.0927987098693848, + "eval_nll_loss": 3.012322425842285, + "eval_rewards/accuracies": 0.5519999861717224, + "eval_rewards/chosen": -0.2871624529361725, + "eval_rewards/margins": 0.013330196961760521, + "eval_rewards/rejected": -0.30049264430999756, + "eval_runtime": 53.8284, + "eval_samples_per_second": 9.289, + "eval_steps_per_second": 4.644, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 0.5271857380867004, + "learning_rate": 3.7759999999999995e-06, + "log_odds_chosen": 0.16638590395450592, + "log_odds_ratio": -0.7247543334960938, + "logits/chosen": 1.2056455612182617, + "logits/rejected": 1.1612131595611572, + "logps/chosen": -2.6139063835144043, + "logps/rejected": -2.7844128608703613, + "loss": 2.9017066955566406, + "nll_loss": 2.8292312622070312, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2613906264305115, + "rewards/margins": 0.017050642520189285, + "rewards/rejected": -0.27844128012657166, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 0.387198805809021, + "learning_rate": 4.416000000000001e-06, + "log_odds_chosen": 0.04409245774149895, + "log_odds_ratio": -0.7438842058181763, + "logits/chosen": 1.2054810523986816, + "logits/rejected": 1.1467866897583008, + "logps/chosen": -2.6231472492218018, + "logps/rejected": -2.6642415523529053, + "loss": 2.8387472152709963, + "nll_loss": 2.7643589973449707, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.26231473684310913, + "rewards/margins": 0.004109424538910389, + "rewards/rejected": -0.26642411947250366, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 0.3270696699619293, + "learning_rate": 5.0559999999999995e-06, + "log_odds_chosen": 0.250882089138031, + "log_odds_ratio": -0.7435027956962585, + "logits/chosen": 1.0074714422225952, + "logits/rejected": 0.9184917211532593, + "logps/chosen": -2.6167237758636475, + "logps/rejected": -2.8408350944519043, + "loss": 2.674570655822754, + "nll_loss": 2.600220203399658, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26167237758636475, + "rewards/margins": 0.022411148995161057, + "rewards/rejected": -0.2840835154056549, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 0.30302050709724426, + "learning_rate": 5.695999999999999e-06, + "log_odds_chosen": 0.05548218637704849, + "log_odds_ratio": -0.7798537015914917, + "logits/chosen": 1.0686867237091064, + "logits/rejected": 1.0430196523666382, + "logps/chosen": -2.585648536682129, + "logps/rejected": -2.6337788105010986, + "loss": 2.5465234756469726, + "nll_loss": 2.468538522720337, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.258564829826355, + "rewards/margins": 0.004813040141016245, + "rewards/rejected": -0.2633778750896454, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 0.37143367528915405, + "learning_rate": 6.336e-06, + "log_odds_chosen": 0.2372448742389679, + "log_odds_ratio": -0.6427541971206665, + "logits/chosen": 0.9812337160110474, + "logits/rejected": 1.095284104347229, + "logps/chosen": -2.2601876258850098, + "logps/rejected": -2.4757676124572754, + "loss": 2.297770690917969, + "nll_loss": 2.2334952354431152, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2260187566280365, + "rewards/margins": 0.021558010950684547, + "rewards/rejected": -0.2475767582654953, + "step": 100 + }, + { + "epoch": 0.16, + "eval_log_odds_chosen": 0.0465145967900753, + "eval_log_odds_ratio": -0.7675164341926575, + "eval_logits/chosen": 1.0497050285339355, + "eval_logits/rejected": 1.0021971464157104, + "eval_logps/chosen": -2.3477163314819336, + "eval_logps/rejected": -2.400268077850342, + "eval_loss": 2.2927565574645996, + "eval_nll_loss": 2.2160050868988037, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.23477165400981903, + "eval_rewards/margins": 0.005255142226815224, + "eval_rewards/rejected": -0.2400268018245697, + "eval_runtime": 53.4499, + "eval_samples_per_second": 9.355, + "eval_steps_per_second": 4.677, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 0.19999727606773376, + "learning_rate": 6.976e-06, + "log_odds_chosen": -0.004880452062934637, + "log_odds_ratio": -0.781264066696167, + "logits/chosen": 0.9816803932189941, + "logits/rejected": 0.9683195352554321, + "logps/chosen": -2.3489508628845215, + "logps/rejected": -2.3528401851654053, + "loss": 2.225248908996582, + "nll_loss": 2.1471219062805176, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.23489508032798767, + "rewards/margins": 0.00038894638419151306, + "rewards/rejected": -0.23528404533863068, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 0.23247133195400238, + "learning_rate": 7.616e-06, + "log_odds_chosen": 0.1978728026151657, + "log_odds_ratio": -0.692144513130188, + "logits/chosen": 1.0511195659637451, + "logits/rejected": 1.0562175512313843, + "logps/chosen": -2.1107664108276367, + "logps/rejected": -2.264862060546875, + "loss": 2.0843576431274413, + "nll_loss": 2.015143632888794, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21107664704322815, + "rewards/margins": 0.01540955901145935, + "rewards/rejected": -0.2264862060546875, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 0.18884675204753876, + "learning_rate": 7.971555555555556e-06, + "log_odds_chosen": -0.04111287742853165, + "log_odds_ratio": -0.7880030870437622, + "logits/chosen": 1.189117193222046, + "logits/rejected": 1.155256986618042, + "logps/chosen": -2.2085041999816895, + "logps/rejected": -2.1815085411071777, + "loss": 2.1168283462524413, + "nll_loss": 2.0380282402038574, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.22085042297840118, + "rewards/margins": -0.0026995770167559385, + "rewards/rejected": -0.21815085411071777, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 0.18060511350631714, + "learning_rate": 7.900444444444444e-06, + "log_odds_chosen": -0.0038092255126684904, + "log_odds_ratio": -0.8108028173446655, + "logits/chosen": 1.0828830003738403, + "logits/rejected": 1.1538610458374023, + "logps/chosen": -2.226916790008545, + "logps/rejected": -2.2349116802215576, + "loss": 1.993095588684082, + "nll_loss": 1.9120155572891235, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.22269168496131897, + "rewards/margins": 0.0007995119085535407, + "rewards/rejected": -0.22349116206169128, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 0.17054790258407593, + "learning_rate": 7.829333333333333e-06, + "log_odds_chosen": -0.017614809796214104, + "log_odds_ratio": -0.8039711117744446, + "logits/chosen": 1.1850899457931519, + "logits/rejected": 1.0987098217010498, + "logps/chosen": -2.1193461418151855, + "logps/rejected": -2.0869622230529785, + "loss": 1.9284049987792968, + "nll_loss": 1.8480079174041748, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2119346410036087, + "rewards/margins": -0.003238401608541608, + "rewards/rejected": -0.20869621634483337, + "step": 150 + }, + { + "epoch": 0.24, + "eval_log_odds_chosen": 0.030384650453925133, + "eval_log_odds_ratio": -0.7645502686500549, + "eval_logits/chosen": 1.1388652324676514, + "eval_logits/rejected": 1.0906065702438354, + "eval_logps/chosen": -2.155924081802368, + "eval_logps/rejected": -2.191805362701416, + "eval_loss": 1.9731156826019287, + "eval_nll_loss": 1.8966606855392456, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.2155924290418625, + "eval_rewards/margins": 0.0035881223157048225, + "eval_rewards/rejected": -0.21918053925037384, + "eval_runtime": 53.5115, + "eval_samples_per_second": 9.344, + "eval_steps_per_second": 4.672, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 0.1520330011844635, + "learning_rate": 7.75822222222222e-06, + "log_odds_chosen": 0.019025951623916626, + "log_odds_ratio": -0.7408558130264282, + "logits/chosen": 1.2669531106948853, + "logits/rejected": 1.1829397678375244, + "logps/chosen": -2.155996561050415, + "logps/rejected": -2.181856870651245, + "loss": 1.980598258972168, + "nll_loss": 1.9065126180648804, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.2155996561050415, + "rewards/margins": 0.0025860387831926346, + "rewards/rejected": -0.218185693025589, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 0.18062834441661835, + "learning_rate": 7.68711111111111e-06, + "log_odds_chosen": 0.21886181831359863, + "log_odds_ratio": -0.7065083384513855, + "logits/chosen": 1.2067815065383911, + "logits/rejected": 1.1557743549346924, + "logps/chosen": -2.1274123191833496, + "logps/rejected": -2.323625087738037, + "loss": 1.917841339111328, + "nll_loss": 1.8471901416778564, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.21274125576019287, + "rewards/margins": 0.019621269777417183, + "rewards/rejected": -0.2323625087738037, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 0.1637572944164276, + "learning_rate": 7.616e-06, + "log_odds_chosen": -0.05331949517130852, + "log_odds_ratio": -0.7956011891365051, + "logits/chosen": 1.2504408359527588, + "logits/rejected": 1.2527806758880615, + "logps/chosen": -2.083944082260132, + "logps/rejected": -2.0458476543426514, + "loss": 1.9163087844848632, + "nll_loss": 1.836748719215393, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20839443802833557, + "rewards/margins": -0.0038096606731414795, + "rewards/rejected": -0.2045847624540329, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 0.16114692389965057, + "learning_rate": 7.544888888888889e-06, + "log_odds_chosen": 0.07258275896310806, + "log_odds_ratio": -0.723730206489563, + "logits/chosen": 1.211503505706787, + "logits/rejected": 1.2072746753692627, + "logps/chosen": -1.9964195489883423, + "logps/rejected": -2.0695366859436035, + "loss": 1.851585578918457, + "nll_loss": 1.7792127132415771, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19964194297790527, + "rewards/margins": 0.007311700377613306, + "rewards/rejected": -0.20695367455482483, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 0.1941564530134201, + "learning_rate": 7.473777777777777e-06, + "log_odds_chosen": 0.2467677891254425, + "log_odds_ratio": -0.638620913028717, + "logits/chosen": 1.1916354894638062, + "logits/rejected": 1.1236536502838135, + "logps/chosen": -1.9711834192276, + "logps/rejected": -2.1870553493499756, + "loss": 1.8319341659545898, + "nll_loss": 1.7680721282958984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19711832702159882, + "rewards/margins": 0.02158718928694725, + "rewards/rejected": -0.21870553493499756, + "step": 200 + }, + { + "epoch": 0.32, + "eval_log_odds_chosen": 0.027904914692044258, + "eval_log_odds_ratio": -0.7658352851867676, + "eval_logits/chosen": 1.264652132987976, + "eval_logits/rejected": 1.221064567565918, + "eval_logps/chosen": -2.1049089431762695, + "eval_logps/rejected": -2.1378941535949707, + "eval_loss": 1.9108957052230835, + "eval_nll_loss": 1.8343122005462646, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.210490882396698, + "eval_rewards/margins": 0.0032985424622893333, + "eval_rewards/rejected": -0.2137894183397293, + "eval_runtime": 53.4723, + "eval_samples_per_second": 9.351, + "eval_steps_per_second": 4.675, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 0.21130956709384918, + "learning_rate": 7.402666666666666e-06, + "log_odds_chosen": 0.05619993805885315, + "log_odds_ratio": -0.6978410482406616, + "logits/chosen": 1.3616042137145996, + "logits/rejected": 1.0906412601470947, + "logps/chosen": -2.0596275329589844, + "logps/rejected": -2.1113193035125732, + "loss": 1.8328233718872071, + "nll_loss": 1.763039231300354, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20596274733543396, + "rewards/margins": 0.005169177893549204, + "rewards/rejected": -0.2111319601535797, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 0.1935756653547287, + "learning_rate": 7.3315555555555546e-06, + "log_odds_chosen": -0.14753268659114838, + "log_odds_ratio": -0.8668686151504517, + "logits/chosen": 1.3593966960906982, + "logits/rejected": 1.282545804977417, + "logps/chosen": -2.158689022064209, + "logps/rejected": -2.0243842601776123, + "loss": 1.855682373046875, + "nll_loss": 1.7689956426620483, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21586890518665314, + "rewards/margins": -0.01343047060072422, + "rewards/rejected": -0.20243844389915466, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 0.16679787635803223, + "learning_rate": 7.260444444444444e-06, + "log_odds_chosen": 0.06307810544967651, + "log_odds_ratio": -0.7288961410522461, + "logits/chosen": 1.3023065328598022, + "logits/rejected": 1.3040940761566162, + "logps/chosen": -2.015089511871338, + "logps/rejected": -2.0810322761535645, + "loss": 1.8415803909301758, + "nll_loss": 1.7686907052993774, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20150896906852722, + "rewards/margins": 0.00659425463527441, + "rewards/rejected": -0.2081032246351242, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 0.17210455238819122, + "learning_rate": 7.1893333333333325e-06, + "log_odds_chosen": 0.2240598499774933, + "log_odds_ratio": -0.6325433850288391, + "logits/chosen": 1.310367465019226, + "logits/rejected": 1.2338093519210815, + "logps/chosen": -1.9855142831802368, + "logps/rejected": -2.173218011856079, + "loss": 1.8426109313964845, + "nll_loss": 1.7793567180633545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19855143129825592, + "rewards/margins": 0.018770387396216393, + "rewards/rejected": -0.21732179820537567, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 0.1474185585975647, + "learning_rate": 7.118222222222222e-06, + "log_odds_chosen": -0.09763683378696442, + "log_odds_ratio": -0.811837375164032, + "logits/chosen": 1.3854808807373047, + "logits/rejected": 1.4551901817321777, + "logps/chosen": -2.0398902893066406, + "logps/rejected": -1.9660927057266235, + "loss": 1.808901596069336, + "nll_loss": 1.7277179956436157, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.20398902893066406, + "rewards/margins": -0.007379765156656504, + "rewards/rejected": -0.1966092884540558, + "step": 250 + }, + { + "epoch": 0.4, + "eval_log_odds_chosen": 0.027593065053224564, + "eval_log_odds_ratio": -0.7654376029968262, + "eval_logits/chosen": 1.3605482578277588, + "eval_logits/rejected": 1.3180304765701294, + "eval_logps/chosen": -2.083819627761841, + "eval_logps/rejected": -2.115978479385376, + "eval_loss": 1.891451358795166, + "eval_nll_loss": 1.8149076700210571, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": -0.20838195085525513, + "eval_rewards/margins": 0.003215902717784047, + "eval_rewards/rejected": -0.21159787476062775, + "eval_runtime": 53.4952, + "eval_samples_per_second": 9.347, + "eval_steps_per_second": 4.673, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 0.18132755160331726, + "learning_rate": 7.047111111111111e-06, + "log_odds_chosen": 0.07458068430423737, + "log_odds_ratio": -0.7160965800285339, + "logits/chosen": 1.4305390119552612, + "logits/rejected": 1.3339704275131226, + "logps/chosen": -1.9973361492156982, + "logps/rejected": -2.0638270378112793, + "loss": 1.8488407135009766, + "nll_loss": 1.777231216430664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19973360002040863, + "rewards/margins": 0.006649085786193609, + "rewards/rejected": -0.20638270676136017, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 0.156590074300766, + "learning_rate": 6.976e-06, + "log_odds_chosen": 0.21048691868782043, + "log_odds_ratio": -0.6846021413803101, + "logits/chosen": 1.3391703367233276, + "logits/rejected": 1.3043503761291504, + "logps/chosen": -1.9781110286712646, + "logps/rejected": -2.1710684299468994, + "loss": 1.8046062469482422, + "nll_loss": 1.736146330833435, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19781112670898438, + "rewards/margins": 0.019295744597911835, + "rewards/rejected": -0.21710684895515442, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 0.2862628698348999, + "learning_rate": 6.904888888888889e-06, + "log_odds_chosen": 0.19719335436820984, + "log_odds_ratio": -0.6589730978012085, + "logits/chosen": 1.3844366073608398, + "logits/rejected": 1.4240922927856445, + "logps/chosen": -1.9418405294418335, + "logps/rejected": -2.117654800415039, + "loss": 1.8532024383544923, + "nll_loss": 1.7873048782348633, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1941840648651123, + "rewards/margins": 0.017581436783075333, + "rewards/rejected": -0.21176549792289734, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 0.18839485943317413, + "learning_rate": 6.833777777777778e-06, + "log_odds_chosen": 0.2900911867618561, + "log_odds_ratio": -0.6224602460861206, + "logits/chosen": 1.4391660690307617, + "logits/rejected": 1.3507264852523804, + "logps/chosen": -1.9003779888153076, + "logps/rejected": -2.1583569049835205, + "loss": 1.7574338912963867, + "nll_loss": 1.6951879262924194, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1900378167629242, + "rewards/margins": 0.025797897949814796, + "rewards/rejected": -0.21583569049835205, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 0.19719934463500977, + "learning_rate": 6.762666666666667e-06, + "log_odds_chosen": 0.07987387478351593, + "log_odds_ratio": -0.7276886105537415, + "logits/chosen": 1.3382771015167236, + "logits/rejected": 1.33270263671875, + "logps/chosen": -1.9756524562835693, + "logps/rejected": -2.0477142333984375, + "loss": 1.8089508056640624, + "nll_loss": 1.7361822128295898, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.1975652575492859, + "rewards/margins": 0.007206143345683813, + "rewards/rejected": -0.20477142930030823, + "step": 300 + }, + { + "epoch": 0.48, + "eval_log_odds_chosen": 0.02609633468091488, + "eval_log_odds_ratio": -0.7655816674232483, + "eval_logits/chosen": 1.4215049743652344, + "eval_logits/rejected": 1.3795844316482544, + "eval_logps/chosen": -2.0664799213409424, + "eval_logps/rejected": -2.096339225769043, + "eval_loss": 1.8772507905960083, + "eval_nll_loss": 1.8006926774978638, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.20664800703525543, + "eval_rewards/margins": 0.002985927276313305, + "eval_rewards/rejected": -0.209633931517601, + "eval_runtime": 53.5083, + "eval_samples_per_second": 9.344, + "eval_steps_per_second": 4.672, + "step": 300 + }, + { + "epoch": 0.496, + "grad_norm": 0.1585138589143753, + "learning_rate": 6.691555555555555e-06, + "log_odds_chosen": -0.007139368914067745, + "log_odds_ratio": -0.7591060400009155, + "logits/chosen": 1.4326140880584717, + "logits/rejected": 1.3895976543426514, + "logps/chosen": -1.969129204750061, + "logps/rejected": -1.9710346460342407, + "loss": 1.8410327911376954, + "nll_loss": 1.7651220560073853, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19691291451454163, + "rewards/margins": 0.00019057458848692477, + "rewards/rejected": -0.19710348546504974, + "step": 310 + }, + { + "epoch": 0.512, + "grad_norm": 0.22688154876232147, + "learning_rate": 6.620444444444444e-06, + "log_odds_chosen": 0.06559257209300995, + "log_odds_ratio": -0.7108487486839294, + "logits/chosen": 1.448866367340088, + "logits/rejected": 1.4073840379714966, + "logps/chosen": -2.066251039505005, + "logps/rejected": -2.116321086883545, + "loss": 1.8646768569946288, + "nll_loss": 1.793591856956482, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2066251039505005, + "rewards/margins": 0.0050069959834218025, + "rewards/rejected": -0.21163210272789001, + "step": 320 + }, + { + "epoch": 0.528, + "grad_norm": 0.18054792284965515, + "learning_rate": 6.549333333333333e-06, + "log_odds_chosen": 0.017626959830522537, + "log_odds_ratio": -0.7846770882606506, + "logits/chosen": 1.3628690242767334, + "logits/rejected": 1.4442317485809326, + "logps/chosen": -1.9719655513763428, + "logps/rejected": -1.9827516078948975, + "loss": 1.7763628005981444, + "nll_loss": 1.6978952884674072, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19719655811786652, + "rewards/margins": 0.001078630331903696, + "rewards/rejected": -0.19827519357204437, + "step": 330 + }, + { + "epoch": 0.544, + "grad_norm": 0.20558743178844452, + "learning_rate": 6.478222222222222e-06, + "log_odds_chosen": 0.16140693426132202, + "log_odds_ratio": -0.6672384738922119, + "logits/chosen": 1.4473990201950073, + "logits/rejected": 1.3463249206542969, + "logps/chosen": -2.042762041091919, + "logps/rejected": -2.17926287651062, + "loss": 1.8391897201538085, + "nll_loss": 1.772465705871582, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2042761743068695, + "rewards/margins": 0.013650094158947468, + "rewards/rejected": -0.21792630851268768, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 0.198579341173172, + "learning_rate": 6.407111111111111e-06, + "log_odds_chosen": 0.15977905690670013, + "log_odds_ratio": -0.6974108815193176, + "logits/chosen": 1.4000948667526245, + "logits/rejected": 1.3558040857315063, + "logps/chosen": -2.04255747795105, + "logps/rejected": -2.184256076812744, + "loss": 1.8088817596435547, + "nll_loss": 1.7391407489776611, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20425572991371155, + "rewards/margins": 0.014169883914291859, + "rewards/rejected": -0.21842563152313232, + "step": 350 + }, + { + "epoch": 0.56, + "eval_log_odds_chosen": 0.022830627858638763, + "eval_log_odds_ratio": -0.7658974528312683, + "eval_logits/chosen": 1.4262471199035645, + "eval_logits/rejected": 1.3848557472229004, + "eval_logps/chosen": -2.054938554763794, + "eval_logps/rejected": -2.0817372798919678, + "eval_loss": 1.8663586378097534, + "eval_nll_loss": 1.7897688150405884, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.20549388229846954, + "eval_rewards/margins": 0.0026798879262059927, + "eval_rewards/rejected": -0.2081737518310547, + "eval_runtime": 53.4938, + "eval_samples_per_second": 9.347, + "eval_steps_per_second": 4.673, + "step": 350 + }, + { + "epoch": 0.576, + "grad_norm": 0.2030162364244461, + "learning_rate": 6.336e-06, + "log_odds_chosen": 0.1422426402568817, + "log_odds_ratio": -0.7028877139091492, + "logits/chosen": 1.510601282119751, + "logits/rejected": 1.4965307712554932, + "logps/chosen": -2.0040202140808105, + "logps/rejected": -2.131716251373291, + "loss": 1.8563343048095704, + "nll_loss": 1.7860454320907593, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20040205121040344, + "rewards/margins": 0.01276957057416439, + "rewards/rejected": -0.21317163109779358, + "step": 360 + }, + { + "epoch": 0.592, + "grad_norm": 0.21605312824249268, + "learning_rate": 6.2648888888888885e-06, + "log_odds_chosen": 0.05190245434641838, + "log_odds_ratio": -0.7171397805213928, + "logits/chosen": 1.393733263015747, + "logits/rejected": 1.3512167930603027, + "logps/chosen": -2.0085949897766113, + "logps/rejected": -2.0461935997009277, + "loss": 1.8606908798217774, + "nll_loss": 1.7889766693115234, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.20085950195789337, + "rewards/margins": 0.0037598726339638233, + "rewards/rejected": -0.204619362950325, + "step": 370 + }, + { + "epoch": 0.608, + "grad_norm": 0.16586966812610626, + "learning_rate": 6.193777777777778e-06, + "log_odds_chosen": 0.1487434357404709, + "log_odds_ratio": -0.7054386138916016, + "logits/chosen": 1.4523228406906128, + "logits/rejected": 1.491560697555542, + "logps/chosen": -2.011794090270996, + "logps/rejected": -2.1287624835968018, + "loss": 1.7962881088256837, + "nll_loss": 1.7257442474365234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20117942988872528, + "rewards/margins": 0.01169683039188385, + "rewards/rejected": -0.21287624537944794, + "step": 380 + }, + { + "epoch": 0.624, + "grad_norm": 0.2835615873336792, + "learning_rate": 6.1226666666666664e-06, + "log_odds_chosen": 0.13590653240680695, + "log_odds_ratio": -0.6991716623306274, + "logits/chosen": 1.3472636938095093, + "logits/rejected": 1.4732224941253662, + "logps/chosen": -1.972169280052185, + "logps/rejected": -2.08345365524292, + "loss": 1.8266469955444335, + "nll_loss": 1.7567298412322998, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1972169280052185, + "rewards/margins": 0.01112845353782177, + "rewards/rejected": -0.20834538340568542, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 0.15232031047344208, + "learning_rate": 6.051555555555556e-06, + "log_odds_chosen": 0.1982167661190033, + "log_odds_ratio": -0.6656922698020935, + "logits/chosen": 1.6487038135528564, + "logits/rejected": 1.6309928894042969, + "logps/chosen": -1.919835090637207, + "logps/rejected": -2.1005043983459473, + "loss": 1.8271181106567382, + "nll_loss": 1.7605489492416382, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19198350608348846, + "rewards/margins": 0.018066909164190292, + "rewards/rejected": -0.21005041897296906, + "step": 400 + }, + { + "epoch": 0.64, + "eval_log_odds_chosen": 0.028749216347932816, + "eval_log_odds_ratio": -0.763251543045044, + "eval_logits/chosen": 1.5312973260879517, + "eval_logits/rejected": 1.4910560846328735, + "eval_logps/chosen": -2.0474772453308105, + "eval_logps/rejected": -2.079118013381958, + "eval_loss": 1.85878586769104, + "eval_nll_loss": 1.7824609279632568, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.20474772155284882, + "eval_rewards/margins": 0.0031640806701034307, + "eval_rewards/rejected": -0.20791178941726685, + "eval_runtime": 53.5987, + "eval_samples_per_second": 9.329, + "eval_steps_per_second": 4.664, + "step": 400 + }, + { + "epoch": 0.656, + "grad_norm": 0.19347190856933594, + "learning_rate": 5.980444444444444e-06, + "log_odds_chosen": 0.16494014859199524, + "log_odds_ratio": -0.6926692724227905, + "logits/chosen": 1.5291383266448975, + "logits/rejected": 1.4509787559509277, + "logps/chosen": -2.0051121711730957, + "logps/rejected": -2.141641139984131, + "loss": 1.814227294921875, + "nll_loss": 1.7449604272842407, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20051121711730957, + "rewards/margins": 0.013652893714606762, + "rewards/rejected": -0.2141641080379486, + "step": 410 + }, + { + "epoch": 0.672, + "grad_norm": 0.22619830071926117, + "learning_rate": 5.909333333333333e-06, + "log_odds_chosen": 0.13791924715042114, + "log_odds_ratio": -0.7470442056655884, + "logits/chosen": 1.604962706565857, + "logits/rejected": 1.5795490741729736, + "logps/chosen": -1.9936443567276, + "logps/rejected": -2.138291835784912, + "loss": 1.7467041015625, + "nll_loss": 1.6719995737075806, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19936442375183105, + "rewards/margins": 0.014464760199189186, + "rewards/rejected": -0.21382920444011688, + "step": 420 + }, + { + "epoch": 0.688, + "grad_norm": 0.2198985368013382, + "learning_rate": 5.838222222222221e-06, + "log_odds_chosen": 0.10624992847442627, + "log_odds_ratio": -0.7233898639678955, + "logits/chosen": 1.5435715913772583, + "logits/rejected": 1.4956719875335693, + "logps/chosen": -1.9429054260253906, + "logps/rejected": -2.0160951614379883, + "loss": 1.7684484481811524, + "nll_loss": 1.696109414100647, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19429054856300354, + "rewards/margins": 0.0073189930990338326, + "rewards/rejected": -0.2016095370054245, + "step": 430 + }, + { + "epoch": 0.704, + "grad_norm": 0.18691220879554749, + "learning_rate": 5.767111111111111e-06, + "log_odds_chosen": -0.021660882979631424, + "log_odds_ratio": -0.7595964670181274, + "logits/chosen": 1.6979032754898071, + "logits/rejected": 1.6603370904922485, + "logps/chosen": -1.9889543056488037, + "logps/rejected": -1.965158462524414, + "loss": 1.8263116836547852, + "nll_loss": 1.750352144241333, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19889545440673828, + "rewards/margins": -0.002379600191488862, + "rewards/rejected": -0.19651584327220917, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 0.15867650508880615, + "learning_rate": 5.695999999999999e-06, + "log_odds_chosen": -0.0730157420039177, + "log_odds_ratio": -0.8161319494247437, + "logits/chosen": 1.5580207109451294, + "logits/rejected": 1.5313141345977783, + "logps/chosen": -2.104485034942627, + "logps/rejected": -2.0238442420959473, + "loss": 1.8731468200683594, + "nll_loss": 1.7915337085723877, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.2104485034942627, + "rewards/margins": -0.008064089342951775, + "rewards/rejected": -0.20238442718982697, + "step": 450 + }, + { + "epoch": 0.72, + "eval_log_odds_chosen": 0.02652684412896633, + "eval_log_odds_ratio": -0.7636462450027466, + "eval_logits/chosen": 1.5893203020095825, + "eval_logits/rejected": 1.5504162311553955, + "eval_logps/chosen": -2.0457892417907715, + "eval_logps/rejected": -2.0760769844055176, + "eval_loss": 1.854221224784851, + "eval_nll_loss": 1.7778565883636475, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.2045789510011673, + "eval_rewards/margins": 0.003028758568689227, + "eval_rewards/rejected": -0.2076077163219452, + "eval_runtime": 53.545, + "eval_samples_per_second": 9.338, + "eval_steps_per_second": 4.669, + "step": 450 + }, + { + "epoch": 0.736, + "grad_norm": 0.2154403030872345, + "learning_rate": 5.624888888888889e-06, + "log_odds_chosen": 0.1506679207086563, + "log_odds_ratio": -0.6985970139503479, + "logits/chosen": 1.5237703323364258, + "logits/rejected": 1.5915908813476562, + "logps/chosen": -2.0275137424468994, + "logps/rejected": -2.1536142826080322, + "loss": 1.8502899169921876, + "nll_loss": 1.7804298400878906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20275135338306427, + "rewards/margins": 0.012610049918293953, + "rewards/rejected": -0.21536140143871307, + "step": 460 + }, + { + "epoch": 0.752, + "grad_norm": 0.17461568117141724, + "learning_rate": 5.553777777777777e-06, + "log_odds_chosen": 0.17391428351402283, + "log_odds_ratio": -0.6869101524353027, + "logits/chosen": 1.5581696033477783, + "logits/rejected": 1.4896894693374634, + "logps/chosen": -1.9319404363632202, + "logps/rejected": -2.0868048667907715, + "loss": 1.7752193450927733, + "nll_loss": 1.7065280675888062, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19319406151771545, + "rewards/margins": 0.01548641175031662, + "rewards/rejected": -0.20868046581745148, + "step": 470 + }, + { + "epoch": 0.768, + "grad_norm": 0.2354680746793747, + "learning_rate": 5.482666666666667e-06, + "log_odds_chosen": 0.0810302346944809, + "log_odds_ratio": -0.7345963716506958, + "logits/chosen": 1.6190745830535889, + "logits/rejected": 1.5643622875213623, + "logps/chosen": -1.9312503337860107, + "logps/rejected": -2.0015344619750977, + "loss": 1.8230710983276368, + "nll_loss": 1.7496116161346436, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19312502443790436, + "rewards/margins": 0.007028433494269848, + "rewards/rejected": -0.20015347003936768, + "step": 480 + }, + { + "epoch": 0.784, + "grad_norm": 0.16735392808914185, + "learning_rate": 5.411555555555555e-06, + "log_odds_chosen": -0.01586019992828369, + "log_odds_ratio": -0.7565222382545471, + "logits/chosen": 1.479667067527771, + "logits/rejected": 1.504528522491455, + "logps/chosen": -2.03438138961792, + "logps/rejected": -2.0231566429138184, + "loss": 1.823404884338379, + "nll_loss": 1.7477527856826782, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.20343813300132751, + "rewards/margins": -0.0011224561603739858, + "rewards/rejected": -0.20231568813323975, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 0.1829444319009781, + "learning_rate": 5.3404444444444445e-06, + "log_odds_chosen": 0.11972711235284805, + "log_odds_ratio": -0.7277721166610718, + "logits/chosen": 1.5400944948196411, + "logits/rejected": 1.490321397781372, + "logps/chosen": -2.0169591903686523, + "logps/rejected": -2.1097054481506348, + "loss": 1.7935520172119142, + "nll_loss": 1.7207750082015991, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20169591903686523, + "rewards/margins": 0.009274644777178764, + "rewards/rejected": -0.21097056567668915, + "step": 500 + }, + { + "epoch": 0.8, + "eval_log_odds_chosen": 0.023190366104245186, + "eval_log_odds_ratio": -0.7631560564041138, + "eval_logits/chosen": 1.5937296152114868, + "eval_logits/rejected": 1.5535894632339478, + "eval_logps/chosen": -2.0367894172668457, + "eval_logps/rejected": -2.063542366027832, + "eval_loss": 1.8493844270706177, + "eval_nll_loss": 1.773068904876709, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.2036789506673813, + "eval_rewards/margins": 0.002675286727026105, + "eval_rewards/rejected": -0.20635424554347992, + "eval_runtime": 53.6528, + "eval_samples_per_second": 9.319, + "eval_steps_per_second": 4.66, + "step": 500 + }, + { + "epoch": 0.816, + "grad_norm": 0.18359120190143585, + "learning_rate": 5.269333333333333e-06, + "log_odds_chosen": 0.0819496288895607, + "log_odds_ratio": -0.7168647646903992, + "logits/chosen": 1.5115034580230713, + "logits/rejected": 1.4402801990509033, + "logps/chosen": -2.0256223678588867, + "logps/rejected": -2.0993785858154297, + "loss": 1.8416709899902344, + "nll_loss": 1.7699846029281616, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.20256221294403076, + "rewards/margins": 0.007375650107860565, + "rewards/rejected": -0.20993788540363312, + "step": 510 + }, + { + "epoch": 0.832, + "grad_norm": 0.17817825078964233, + "learning_rate": 5.1982222222222225e-06, + "log_odds_chosen": 0.2734625041484833, + "log_odds_ratio": -0.6966907382011414, + "logits/chosen": 1.5594890117645264, + "logits/rejected": 1.5883136987686157, + "logps/chosen": -1.941902756690979, + "logps/rejected": -2.200758457183838, + "loss": 1.8082841873168944, + "nll_loss": 1.7386150360107422, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19419027864933014, + "rewards/margins": 0.02588556334376335, + "rewards/rejected": -0.2200758457183838, + "step": 520 + }, + { + "epoch": 0.848, + "grad_norm": 0.2906912863254547, + "learning_rate": 5.12711111111111e-06, + "log_odds_chosen": -0.05212322995066643, + "log_odds_ratio": -0.7876973152160645, + "logits/chosen": 1.5465106964111328, + "logits/rejected": 1.5060274600982666, + "logps/chosen": -2.023458480834961, + "logps/rejected": -1.9740943908691406, + "loss": 1.8264562606811523, + "nll_loss": 1.747686743736267, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2023458480834961, + "rewards/margins": -0.004936427343636751, + "rewards/rejected": -0.19740943610668182, + "step": 530 + }, + { + "epoch": 0.864, + "grad_norm": 0.20073647797107697, + "learning_rate": 5.0559999999999995e-06, + "log_odds_chosen": 0.005497487727552652, + "log_odds_ratio": -0.7783851623535156, + "logits/chosen": 1.685173749923706, + "logits/rejected": 1.7713344097137451, + "logps/chosen": -1.9037139415740967, + "logps/rejected": -1.8979514837265015, + "loss": 1.8080127716064454, + "nll_loss": 1.7301738262176514, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19037137925624847, + "rewards/margins": -0.0005762483924627304, + "rewards/rejected": -0.1897951364517212, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 0.1273553967475891, + "learning_rate": 4.984888888888888e-06, + "log_odds_chosen": 0.18364550173282623, + "log_odds_ratio": -0.6565154790878296, + "logits/chosen": 1.587527871131897, + "logits/rejected": 1.5570650100708008, + "logps/chosen": -1.878670334815979, + "logps/rejected": -2.036764621734619, + "loss": 1.8042585372924804, + "nll_loss": 1.738607406616211, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18786704540252686, + "rewards/margins": 0.01580941304564476, + "rewards/rejected": -0.20367643237113953, + "step": 550 + }, + { + "epoch": 0.88, + "eval_log_odds_chosen": 0.025964470580220222, + "eval_log_odds_ratio": -0.7649032473564148, + "eval_logits/chosen": 1.5951756238937378, + "eval_logits/rejected": 1.5590118169784546, + "eval_logps/chosen": -2.032235860824585, + "eval_logps/rejected": -2.061086654663086, + "eval_loss": 1.8456532955169678, + "eval_nll_loss": 1.7691627740859985, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.2032235562801361, + "eval_rewards/margins": 0.00288510974496603, + "eval_rewards/rejected": -0.2061086744070053, + "eval_runtime": 53.5193, + "eval_samples_per_second": 9.342, + "eval_steps_per_second": 4.671, + "step": 550 + }, + { + "epoch": 0.896, + "grad_norm": 0.2574485242366791, + "learning_rate": 4.9137777777777775e-06, + "log_odds_chosen": 0.2498103827238083, + "log_odds_ratio": -0.6482200026512146, + "logits/chosen": 1.4691909551620483, + "logits/rejected": 1.3890944719314575, + "logps/chosen": -1.944493055343628, + "logps/rejected": -2.15970778465271, + "loss": 1.757819938659668, + "nll_loss": 1.6929981708526611, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1944493055343628, + "rewards/margins": 0.021521473303437233, + "rewards/rejected": -0.21597079932689667, + "step": 560 + }, + { + "epoch": 0.912, + "grad_norm": 0.1717718094587326, + "learning_rate": 4.842666666666666e-06, + "log_odds_chosen": 0.06141955778002739, + "log_odds_ratio": -0.7500567436218262, + "logits/chosen": 1.6567814350128174, + "logits/rejected": 1.5995619297027588, + "logps/chosen": -1.9717029333114624, + "logps/rejected": -2.0279083251953125, + "loss": 1.7976764678955077, + "nll_loss": 1.722670555114746, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19717030227184296, + "rewards/margins": 0.0056205070577561855, + "rewards/rejected": -0.20279082655906677, + "step": 570 + }, + { + "epoch": 0.928, + "grad_norm": 0.2214452028274536, + "learning_rate": 4.771555555555555e-06, + "log_odds_chosen": -0.03478344902396202, + "log_odds_ratio": -0.7769675850868225, + "logits/chosen": 1.6345102787017822, + "logits/rejected": 1.5030030012130737, + "logps/chosen": -1.9924647808074951, + "logps/rejected": -1.9646708965301514, + "loss": 1.8113759994506835, + "nll_loss": 1.7336797714233398, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19924648106098175, + "rewards/margins": -0.002779375296086073, + "rewards/rejected": -0.1964671015739441, + "step": 580 + }, + { + "epoch": 0.944, + "grad_norm": 0.17967616021633148, + "learning_rate": 4.700444444444445e-06, + "log_odds_chosen": 0.10119867324829102, + "log_odds_ratio": -0.7037830352783203, + "logits/chosen": 1.5076220035552979, + "logits/rejected": 1.4201761484146118, + "logps/chosen": -1.9879302978515625, + "logps/rejected": -2.082368850708008, + "loss": 1.7988115310668946, + "nll_loss": 1.728433609008789, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19879302382469177, + "rewards/margins": 0.009443843737244606, + "rewards/rejected": -0.20823685824871063, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 0.1719515323638916, + "learning_rate": 4.629333333333333e-06, + "log_odds_chosen": 0.06453205645084381, + "log_odds_ratio": -0.722458004951477, + "logits/chosen": 1.5820884704589844, + "logits/rejected": 1.511212944984436, + "logps/chosen": -1.8608150482177734, + "logps/rejected": -1.913921594619751, + "loss": 1.73321590423584, + "nll_loss": 1.6609699726104736, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18608148396015167, + "rewards/margins": 0.005310675594955683, + "rewards/rejected": -0.19139216840267181, + "step": 600 + }, + { + "epoch": 0.96, + "eval_log_odds_chosen": 0.03060421720147133, + "eval_log_odds_ratio": -0.7616795897483826, + "eval_logits/chosen": 1.6089800596237183, + "eval_logits/rejected": 1.5700582265853882, + "eval_logps/chosen": -2.03167986869812, + "eval_logps/rejected": -2.0645487308502197, + "eval_loss": 1.8425160646438599, + "eval_nll_loss": 1.7663480043411255, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.20316800475120544, + "eval_rewards/margins": 0.0032868909183889627, + "eval_rewards/rejected": -0.20645487308502197, + "eval_runtime": 53.5679, + "eval_samples_per_second": 9.334, + "eval_steps_per_second": 4.667, + "step": 600 + }, + { + "epoch": 0.976, + "grad_norm": 0.1661933809518814, + "learning_rate": 4.558222222222223e-06, + "log_odds_chosen": 0.29323670268058777, + "log_odds_ratio": -0.6596352458000183, + "logits/chosen": 1.619877576828003, + "logits/rejected": 1.5766648054122925, + "logps/chosen": -1.8296048641204834, + "logps/rejected": -2.056410789489746, + "loss": 1.7442157745361329, + "nll_loss": 1.6782522201538086, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18296048045158386, + "rewards/margins": 0.02268058992922306, + "rewards/rejected": -0.20564107596874237, + "step": 610 + }, + { + "epoch": 0.992, + "grad_norm": 0.2402229607105255, + "learning_rate": 4.487111111111111e-06, + "log_odds_chosen": 0.09888540208339691, + "log_odds_ratio": -0.7262139916419983, + "logits/chosen": 1.5223571062088013, + "logits/rejected": 1.5384490489959717, + "logps/chosen": -1.81551194190979, + "logps/rejected": -1.8770872354507446, + "loss": 1.7421588897705078, + "nll_loss": 1.6695371866226196, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.18155118823051453, + "rewards/margins": 0.00615753885358572, + "rewards/rejected": -0.18770872056484222, + "step": 620 + }, + { + "epoch": 1.008, + "grad_norm": 0.20038332045078278, + "learning_rate": 4.416000000000001e-06, + "log_odds_chosen": 0.22199416160583496, + "log_odds_ratio": -0.702614426612854, + "logits/chosen": 1.5671889781951904, + "logits/rejected": 1.4507954120635986, + "logps/chosen": -1.8749492168426514, + "logps/rejected": -2.072239637374878, + "loss": 1.7758310317993165, + "nll_loss": 1.7055692672729492, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18749494850635529, + "rewards/margins": 0.019729027524590492, + "rewards/rejected": -0.20722396671772003, + "step": 630 + }, + { + "epoch": 1.024, + "grad_norm": 0.4241769015789032, + "learning_rate": 4.344888888888888e-06, + "log_odds_chosen": 0.13136598467826843, + "log_odds_ratio": -0.6841301918029785, + "logits/chosen": 1.5495809316635132, + "logits/rejected": 1.551116943359375, + "logps/chosen": -1.9033386707305908, + "logps/rejected": -2.0172486305236816, + "loss": 1.8465911865234375, + "nll_loss": 1.7781782150268555, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19033385813236237, + "rewards/margins": 0.011390982195734978, + "rewards/rejected": -0.2017248421907425, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 0.3452068269252777, + "learning_rate": 4.273777777777778e-06, + "log_odds_chosen": -0.07351900637149811, + "log_odds_ratio": -0.817279040813446, + "logits/chosen": 1.584192156791687, + "logits/rejected": 1.5928871631622314, + "logps/chosen": -2.058300495147705, + "logps/rejected": -1.9885162115097046, + "loss": 1.7821306228637694, + "nll_loss": 1.7004029750823975, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20583002269268036, + "rewards/margins": -0.006978417746722698, + "rewards/rejected": -0.19885161519050598, + "step": 650 + }, + { + "epoch": 1.04, + "eval_log_odds_chosen": 0.025821426883339882, + "eval_log_odds_ratio": -0.7627764344215393, + "eval_logits/chosen": 1.5995525121688843, + "eval_logits/rejected": 1.5644874572753906, + "eval_logps/chosen": -2.020664930343628, + "eval_logps/rejected": -2.0486767292022705, + "eval_loss": 1.8382052183151245, + "eval_nll_loss": 1.761927604675293, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20206648111343384, + "eval_rewards/margins": 0.0028012022376060486, + "eval_rewards/rejected": -0.2048676759004593, + "eval_runtime": 53.5051, + "eval_samples_per_second": 9.345, + "eval_steps_per_second": 4.672, + "step": 650 + }, + { + "epoch": 1.056, + "grad_norm": 0.2219850867986679, + "learning_rate": 4.202666666666666e-06, + "log_odds_chosen": 0.0981612354516983, + "log_odds_ratio": -0.7084988951683044, + "logits/chosen": 1.566150188446045, + "logits/rejected": 1.5409971475601196, + "logps/chosen": -1.9153779745101929, + "logps/rejected": -1.993215799331665, + "loss": 1.7639646530151367, + "nll_loss": 1.6931148767471313, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1915377825498581, + "rewards/margins": 0.007783782668411732, + "rewards/rejected": -0.19932158291339874, + "step": 660 + }, + { + "epoch": 1.072, + "grad_norm": 0.2081521451473236, + "learning_rate": 4.1315555555555556e-06, + "log_odds_chosen": 0.1091976910829544, + "log_odds_ratio": -0.6956084370613098, + "logits/chosen": 1.7070366144180298, + "logits/rejected": 1.643048882484436, + "logps/chosen": -1.9937528371810913, + "logps/rejected": -2.1002330780029297, + "loss": 1.8045967102050782, + "nll_loss": 1.7350358963012695, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19937530159950256, + "rewards/margins": 0.010648000985383987, + "rewards/rejected": -0.21002328395843506, + "step": 670 + }, + { + "epoch": 1.088, + "grad_norm": 0.15036334097385406, + "learning_rate": 4.060444444444444e-06, + "log_odds_chosen": 0.06587956845760345, + "log_odds_ratio": -0.735454261302948, + "logits/chosen": 1.6563133001327515, + "logits/rejected": 1.6276830434799194, + "logps/chosen": -1.9550457000732422, + "logps/rejected": -2.027952194213867, + "loss": 1.7694936752319337, + "nll_loss": 1.6959483623504639, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1955045759677887, + "rewards/margins": 0.007290668785572052, + "rewards/rejected": -0.20279522240161896, + "step": 680 + }, + { + "epoch": 1.104, + "grad_norm": 0.18966805934906006, + "learning_rate": 3.989333333333333e-06, + "log_odds_chosen": 0.10347900539636612, + "log_odds_ratio": -0.7044586539268494, + "logits/chosen": 1.7145198583602905, + "logits/rejected": 1.6799052953720093, + "logps/chosen": -1.9247310161590576, + "logps/rejected": -2.026733875274658, + "loss": 1.7826107025146485, + "nll_loss": 1.712165117263794, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19247311353683472, + "rewards/margins": 0.010200263932347298, + "rewards/rejected": -0.20267336070537567, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 0.27768856287002563, + "learning_rate": 3.918222222222222e-06, + "log_odds_chosen": 0.22244243323802948, + "log_odds_ratio": -0.6758590936660767, + "logits/chosen": 1.5261728763580322, + "logits/rejected": 1.4202882051467896, + "logps/chosen": -1.9367029666900635, + "logps/rejected": -2.1093690395355225, + "loss": 1.7446697235107422, + "nll_loss": 1.677083969116211, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19367030262947083, + "rewards/margins": 0.01726660504937172, + "rewards/rejected": -0.21093690395355225, + "step": 700 + }, + { + "epoch": 1.12, + "eval_log_odds_chosen": 0.028961200267076492, + "eval_log_odds_ratio": -0.7605804800987244, + "eval_logits/chosen": 1.6280906200408936, + "eval_logits/rejected": 1.5934500694274902, + "eval_logps/chosen": -2.018935441970825, + "eval_logps/rejected": -2.049320936203003, + "eval_loss": 1.8348098993301392, + "eval_nll_loss": 1.7587517499923706, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20189355313777924, + "eval_rewards/margins": 0.003038552822545171, + "eval_rewards/rejected": -0.2049321085214615, + "eval_runtime": 53.5567, + "eval_samples_per_second": 9.336, + "eval_steps_per_second": 4.668, + "step": 700 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.2013629972934723, + "learning_rate": 3.8471111111111105e-06, + "log_odds_chosen": -0.03864391893148422, + "log_odds_ratio": -0.7773549556732178, + "logits/chosen": 1.677484154701233, + "logits/rejected": 1.539167881011963, + "logps/chosen": -2.087277889251709, + "logps/rejected": -2.0545527935028076, + "loss": 1.8523176193237305, + "nll_loss": 1.7745821475982666, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20872780680656433, + "rewards/margins": -0.0032725154887884855, + "rewards/rejected": -0.20545530319213867, + "step": 710 + }, + { + "epoch": 1.152, + "grad_norm": 0.20682214200496674, + "learning_rate": 3.7759999999999995e-06, + "log_odds_chosen": 0.03799188882112503, + "log_odds_ratio": -0.7313598394393921, + "logits/chosen": 1.670763373374939, + "logits/rejected": 1.6574833393096924, + "logps/chosen": -1.883700966835022, + "logps/rejected": -1.9155511856079102, + "loss": 1.8039417266845703, + "nll_loss": 1.73080575466156, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18837013840675354, + "rewards/margins": 0.0031849914230406284, + "rewards/rejected": -0.19155511260032654, + "step": 720 + }, + { + "epoch": 1.168, + "grad_norm": 0.23043783009052277, + "learning_rate": 3.7048888888888885e-06, + "log_odds_chosen": -0.0075264484621584415, + "log_odds_ratio": -0.7775508761405945, + "logits/chosen": 1.742677092552185, + "logits/rejected": 1.7868130207061768, + "logps/chosen": -1.9592937231063843, + "logps/rejected": -1.9675123691558838, + "loss": 1.8238216400146485, + "nll_loss": 1.7460663318634033, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1959293633699417, + "rewards/margins": 0.000821849680505693, + "rewards/rejected": -0.196751207113266, + "step": 730 + }, + { + "epoch": 1.184, + "grad_norm": 0.18770243227481842, + "learning_rate": 3.633777777777778e-06, + "log_odds_chosen": 0.1855059713125229, + "log_odds_ratio": -0.6716901063919067, + "logits/chosen": 1.7457072734832764, + "logits/rejected": 1.7863832712173462, + "logps/chosen": -1.9091510772705078, + "logps/rejected": -2.0810976028442383, + "loss": 1.7321632385253907, + "nll_loss": 1.6649940013885498, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19091510772705078, + "rewards/margins": 0.017194656655192375, + "rewards/rejected": -0.2081097662448883, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 0.133562833070755, + "learning_rate": 3.562666666666667e-06, + "log_odds_chosen": 0.3153020143508911, + "log_odds_ratio": -0.6737623810768127, + "logits/chosen": 1.7664339542388916, + "logits/rejected": 1.658424973487854, + "logps/chosen": -1.9272540807724, + "logps/rejected": -2.21852707862854, + "loss": 1.769679069519043, + "nll_loss": 1.7023029327392578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19272543489933014, + "rewards/margins": 0.02912726439535618, + "rewards/rejected": -0.22185268998146057, + "step": 750 + }, + { + "epoch": 1.2, + "eval_log_odds_chosen": 0.02566058374941349, + "eval_log_odds_ratio": -0.7629475593566895, + "eval_logits/chosen": 1.6977092027664185, + "eval_logits/rejected": 1.6649681329727173, + "eval_logps/chosen": -2.014634609222412, + "eval_logps/rejected": -2.042663812637329, + "eval_loss": 1.8331661224365234, + "eval_nll_loss": 1.7568713426589966, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.2014634758234024, + "eval_rewards/margins": 0.002802920062094927, + "eval_rewards/rejected": -0.20426639914512634, + "eval_runtime": 53.583, + "eval_samples_per_second": 9.331, + "eval_steps_per_second": 4.666, + "step": 750 + }, + { + "epoch": 1.216, + "grad_norm": 0.19514085352420807, + "learning_rate": 3.4915555555555558e-06, + "log_odds_chosen": 0.07212933897972107, + "log_odds_ratio": -0.727311909198761, + "logits/chosen": 1.7513700723648071, + "logits/rejected": 1.649224877357483, + "logps/chosen": -1.9654823541641235, + "logps/rejected": -2.0274367332458496, + "loss": 1.7462617874145507, + "nll_loss": 1.6735305786132812, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1965482383966446, + "rewards/margins": 0.006195452995598316, + "rewards/rejected": -0.20274372398853302, + "step": 760 + }, + { + "epoch": 1.232, + "grad_norm": 0.16212475299835205, + "learning_rate": 3.4204444444444443e-06, + "log_odds_chosen": 0.27029961347579956, + "log_odds_ratio": -0.6815747022628784, + "logits/chosen": 1.6886869668960571, + "logits/rejected": 1.5897142887115479, + "logps/chosen": -1.9044355154037476, + "logps/rejected": -2.1495070457458496, + "loss": 1.7639043807983399, + "nll_loss": 1.6957467794418335, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19044354557991028, + "rewards/margins": 0.024507205933332443, + "rewards/rejected": -0.21495072543621063, + "step": 770 + }, + { + "epoch": 1.248, + "grad_norm": 0.1794252097606659, + "learning_rate": 3.3493333333333333e-06, + "log_odds_chosen": 0.1325821578502655, + "log_odds_ratio": -0.6765932440757751, + "logits/chosen": 1.7684406042099, + "logits/rejected": 1.7867968082427979, + "logps/chosen": -1.85635244846344, + "logps/rejected": -1.9697904586791992, + "loss": 1.7592267990112305, + "nll_loss": 1.6915674209594727, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.18563523888587952, + "rewards/margins": 0.011343811638653278, + "rewards/rejected": -0.19697906076908112, + "step": 780 + }, + { + "epoch": 1.264, + "grad_norm": 0.21208560466766357, + "learning_rate": 3.2782222222222222e-06, + "log_odds_chosen": -0.15325720608234406, + "log_odds_ratio": -0.8170074224472046, + "logits/chosen": 1.7933919429779053, + "logits/rejected": 1.777390718460083, + "logps/chosen": -2.049445390701294, + "logps/rejected": -1.9202073812484741, + "loss": 1.826498794555664, + "nll_loss": 1.7447984218597412, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.20494452118873596, + "rewards/margins": -0.012923778966069221, + "rewards/rejected": -0.1920207440853119, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 0.22979456186294556, + "learning_rate": 3.207111111111111e-06, + "log_odds_chosen": 0.17507728934288025, + "log_odds_ratio": -0.6654232740402222, + "logits/chosen": 1.7312673330307007, + "logits/rejected": 1.7077823877334595, + "logps/chosen": -1.9784681797027588, + "logps/rejected": -2.1287527084350586, + "loss": 1.7675779342651368, + "nll_loss": 1.7010358572006226, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19784680008888245, + "rewards/margins": 0.015028467401862144, + "rewards/rejected": -0.21287527680397034, + "step": 800 + }, + { + "epoch": 1.28, + "eval_log_odds_chosen": 0.0278985183686018, + "eval_log_odds_ratio": -0.7607023119926453, + "eval_logits/chosen": 1.7193233966827393, + "eval_logits/rejected": 1.6865739822387695, + "eval_logps/chosen": -2.012319326400757, + "eval_logps/rejected": -2.0422775745391846, + "eval_loss": 1.8310211896896362, + "eval_nll_loss": 1.7549511194229126, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.20123191177845, + "eval_rewards/margins": 0.0029958393424749374, + "eval_rewards/rejected": -0.2042277753353119, + "eval_runtime": 53.4403, + "eval_samples_per_second": 9.356, + "eval_steps_per_second": 4.678, + "step": 800 + }, + { + "epoch": 1.296, + "grad_norm": 0.19002483785152435, + "learning_rate": 3.136e-06, + "log_odds_chosen": -0.021020114421844482, + "log_odds_ratio": -0.7389410138130188, + "logits/chosen": 1.7910667657852173, + "logits/rejected": 1.743281602859497, + "logps/chosen": -2.0423073768615723, + "logps/rejected": -2.030003070831299, + "loss": 1.8281953811645508, + "nll_loss": 1.7543014287948608, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20423074066638947, + "rewards/margins": -0.0012304515112191439, + "rewards/rejected": -0.2030002772808075, + "step": 810 + }, + { + "epoch": 1.312, + "grad_norm": 0.18885421752929688, + "learning_rate": 3.0648888888888887e-06, + "log_odds_chosen": -0.009633201174438, + "log_odds_ratio": -0.8210141062736511, + "logits/chosen": 1.713783621788025, + "logits/rejected": 1.663731575012207, + "logps/chosen": -2.0672500133514404, + "logps/rejected": -2.0374674797058105, + "loss": 1.8184293746948241, + "nll_loss": 1.736328125, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20672500133514404, + "rewards/margins": -0.002978231757879257, + "rewards/rejected": -0.20374679565429688, + "step": 820 + }, + { + "epoch": 1.328, + "grad_norm": 0.19928835332393646, + "learning_rate": 2.9937777777777776e-06, + "log_odds_chosen": -0.09898372739553452, + "log_odds_ratio": -0.8769068717956543, + "logits/chosen": 1.627624273300171, + "logits/rejected": 1.614092469215393, + "logps/chosen": -1.9781357049942017, + "logps/rejected": -1.8761498928070068, + "loss": 1.7701539993286133, + "nll_loss": 1.6824630498886108, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.19781355559825897, + "rewards/margins": -0.010198570787906647, + "rewards/rejected": -0.18761499226093292, + "step": 830 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.19081026315689087, + "learning_rate": 2.9226666666666666e-06, + "log_odds_chosen": -0.04123927652835846, + "log_odds_ratio": -0.7816277742385864, + "logits/chosen": 1.6747424602508545, + "logits/rejected": 1.5965977907180786, + "logps/chosen": -1.9864768981933594, + "logps/rejected": -1.956075668334961, + "loss": 1.797834587097168, + "nll_loss": 1.7196719646453857, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.19864769279956818, + "rewards/margins": -0.0030401155818253756, + "rewards/rejected": -0.19560757279396057, + "step": 840 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.23284944891929626, + "learning_rate": 2.8515555555555555e-06, + "log_odds_chosen": 0.2058713734149933, + "log_odds_ratio": -0.6709384918212891, + "logits/chosen": 1.5520470142364502, + "logits/rejected": 1.5949593782424927, + "logps/chosen": -1.895453691482544, + "logps/rejected": -2.06461763381958, + "loss": 1.7646823883056642, + "nll_loss": 1.697588562965393, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.18954536318778992, + "rewards/margins": 0.01691642962396145, + "rewards/rejected": -0.2064618170261383, + "step": 850 + }, + { + "epoch": 1.3599999999999999, + "eval_log_odds_chosen": 0.026527805253863335, + "eval_log_odds_ratio": -0.7608199119567871, + "eval_logits/chosen": 1.6602368354797363, + "eval_logits/rejected": 1.62636137008667, + "eval_logps/chosen": -2.0066142082214355, + "eval_logps/rejected": -2.034489393234253, + "eval_loss": 1.8279491662979126, + "eval_nll_loss": 1.7518671751022339, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.20066142082214355, + "eval_rewards/margins": 0.002787541365250945, + "eval_rewards/rejected": -0.20344896614551544, + "eval_runtime": 53.4886, + "eval_samples_per_second": 9.348, + "eval_steps_per_second": 4.674, + "step": 850 + }, + { + "epoch": 1.376, + "grad_norm": 0.16045095026493073, + "learning_rate": 2.7804444444444445e-06, + "log_odds_chosen": 0.15138807892799377, + "log_odds_ratio": -0.6764562129974365, + "logits/chosen": 1.6566671133041382, + "logits/rejected": 1.6294691562652588, + "logps/chosen": -1.9152675867080688, + "logps/rejected": -2.047109603881836, + "loss": 1.759444808959961, + "nll_loss": 1.6917991638183594, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19152674078941345, + "rewards/margins": 0.01318420935422182, + "rewards/rejected": -0.2047109305858612, + "step": 860 + }, + { + "epoch": 1.392, + "grad_norm": 0.15155339241027832, + "learning_rate": 2.7093333333333335e-06, + "log_odds_chosen": 0.28874093294143677, + "log_odds_ratio": -0.6331272125244141, + "logits/chosen": 1.696692705154419, + "logits/rejected": 1.656974196434021, + "logps/chosen": -1.8829189538955688, + "logps/rejected": -2.1337971687316895, + "loss": 1.8225980758666993, + "nll_loss": 1.7592853307724, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18829190731048584, + "rewards/margins": 0.02508782222867012, + "rewards/rejected": -0.21337974071502686, + "step": 870 + }, + { + "epoch": 1.408, + "grad_norm": 0.23297204077243805, + "learning_rate": 2.638222222222222e-06, + "log_odds_chosen": 0.08328817784786224, + "log_odds_ratio": -0.717012882232666, + "logits/chosen": 1.7321844100952148, + "logits/rejected": 1.7046935558319092, + "logps/chosen": -1.954520583152771, + "logps/rejected": -2.024279832839966, + "loss": 1.7641315460205078, + "nll_loss": 1.6924302577972412, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19545204937458038, + "rewards/margins": 0.006975927390158176, + "rewards/rejected": -0.20242798328399658, + "step": 880 + }, + { + "epoch": 1.424, + "grad_norm": 0.20234379172325134, + "learning_rate": 2.567111111111111e-06, + "log_odds_chosen": 0.097869873046875, + "log_odds_ratio": -0.7072083353996277, + "logits/chosen": 1.7079193592071533, + "logits/rejected": 1.625478982925415, + "logps/chosen": -1.9695608615875244, + "logps/rejected": -2.0492820739746094, + "loss": 1.783566665649414, + "nll_loss": 1.712845802307129, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1969560831785202, + "rewards/margins": 0.007972110994160175, + "rewards/rejected": -0.2049282044172287, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 0.1814461499452591, + "learning_rate": 2.496e-06, + "log_odds_chosen": 0.026505127549171448, + "log_odds_ratio": -0.7608965635299683, + "logits/chosen": 1.6023356914520264, + "logits/rejected": 1.6222490072250366, + "logps/chosen": -2.0157110691070557, + "logps/rejected": -2.047163486480713, + "loss": 1.7937837600708009, + "nll_loss": 1.7176940441131592, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20157113671302795, + "rewards/margins": 0.0031452514231204987, + "rewards/rejected": -0.20471635460853577, + "step": 900 + }, + { + "epoch": 1.44, + "eval_log_odds_chosen": 0.02784898318350315, + "eval_log_odds_ratio": -0.7608876824378967, + "eval_logits/chosen": 1.6986433267593384, + "eval_logits/rejected": 1.666439175605774, + "eval_logps/chosen": -2.00809645652771, + "eval_logps/rejected": -2.0373218059539795, + "eval_loss": 1.8273944854736328, + "eval_nll_loss": 1.7513054609298706, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.20080965757369995, + "eval_rewards/margins": 0.00292251817882061, + "eval_rewards/rejected": -0.2037321925163269, + "eval_runtime": 53.4308, + "eval_samples_per_second": 9.358, + "eval_steps_per_second": 4.679, + "step": 900 + }, + { + "epoch": 1.456, + "grad_norm": 0.2085212618112564, + "learning_rate": 2.424888888888889e-06, + "log_odds_chosen": 0.18774743378162384, + "log_odds_ratio": -0.6731225848197937, + "logits/chosen": 1.6927309036254883, + "logits/rejected": 1.6234180927276611, + "logps/chosen": -1.8558231592178345, + "logps/rejected": -2.020806312561035, + "loss": 1.7055980682373046, + "nll_loss": 1.6382856369018555, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18558230996131897, + "rewards/margins": 0.016498321667313576, + "rewards/rejected": -0.202080637216568, + "step": 910 + }, + { + "epoch": 1.472, + "grad_norm": 0.1574070155620575, + "learning_rate": 2.353777777777778e-06, + "log_odds_chosen": 0.09441863000392914, + "log_odds_ratio": -0.6969857215881348, + "logits/chosen": 1.7141573429107666, + "logits/rejected": 1.7830305099487305, + "logps/chosen": -1.9033101797103882, + "logps/rejected": -1.989233374595642, + "loss": 1.7647336959838866, + "nll_loss": 1.6950347423553467, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19033101201057434, + "rewards/margins": 0.008592324331402779, + "rewards/rejected": -0.19892333447933197, + "step": 920 + }, + { + "epoch": 1.488, + "grad_norm": 0.14556527137756348, + "learning_rate": 2.2826666666666664e-06, + "log_odds_chosen": 0.10065688192844391, + "log_odds_ratio": -0.6970812082290649, + "logits/chosen": 1.7773923873901367, + "logits/rejected": 1.7257808446884155, + "logps/chosen": -1.8344297409057617, + "logps/rejected": -1.910177230834961, + "loss": 1.795981216430664, + "nll_loss": 1.7262731790542603, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18344298005104065, + "rewards/margins": 0.007574764080345631, + "rewards/rejected": -0.19101771712303162, + "step": 930 + }, + { + "epoch": 1.504, + "grad_norm": 0.13724461197853088, + "learning_rate": 2.2115555555555553e-06, + "log_odds_chosen": 0.07881642878055573, + "log_odds_ratio": -0.722042977809906, + "logits/chosen": 1.7508437633514404, + "logits/rejected": 1.724735975265503, + "logps/chosen": -1.9031156301498413, + "logps/rejected": -1.9599933624267578, + "loss": 1.7547365188598634, + "nll_loss": 1.6825320720672607, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19031158089637756, + "rewards/margins": 0.005687765311449766, + "rewards/rejected": -0.1959993541240692, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 0.17632044851779938, + "learning_rate": 2.1404444444444443e-06, + "log_odds_chosen": -0.024805480614304543, + "log_odds_ratio": -0.7657346129417419, + "logits/chosen": 1.7735048532485962, + "logits/rejected": 1.7464863061904907, + "logps/chosen": -1.9814984798431396, + "logps/rejected": -1.952270269393921, + "loss": 1.776956558227539, + "nll_loss": 1.700383186340332, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19814984500408173, + "rewards/margins": -0.0029228185303509235, + "rewards/rejected": -0.19522707164287567, + "step": 950 + }, + { + "epoch": 1.52, + "eval_log_odds_chosen": 0.028202777728438377, + "eval_log_odds_ratio": -0.7601404786109924, + "eval_logits/chosen": 1.724393606185913, + "eval_logits/rejected": 1.691964864730835, + "eval_logps/chosen": -2.0063939094543457, + "eval_logps/rejected": -2.036207675933838, + "eval_loss": 1.8266409635543823, + "eval_nll_loss": 1.7506269216537476, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20063939690589905, + "eval_rewards/margins": 0.002981391968205571, + "eval_rewards/rejected": -0.2036207914352417, + "eval_runtime": 53.4898, + "eval_samples_per_second": 9.348, + "eval_steps_per_second": 4.674, + "step": 950 + }, + { + "epoch": 1.536, + "grad_norm": 0.16243591904640198, + "learning_rate": 2.0693333333333332e-06, + "log_odds_chosen": 0.10965070873498917, + "log_odds_ratio": -0.6967736482620239, + "logits/chosen": 1.8899224996566772, + "logits/rejected": 1.8401854038238525, + "logps/chosen": -1.945563554763794, + "logps/rejected": -2.0543692111968994, + "loss": 1.8785415649414063, + "nll_loss": 1.8088642358779907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1945563554763794, + "rewards/margins": 0.010880568996071815, + "rewards/rejected": -0.20543691515922546, + "step": 960 + }, + { + "epoch": 1.552, + "grad_norm": 0.21627004444599152, + "learning_rate": 1.998222222222222e-06, + "log_odds_chosen": 0.0991244837641716, + "log_odds_ratio": -0.7134609222412109, + "logits/chosen": 1.6730775833129883, + "logits/rejected": 1.6934372186660767, + "logps/chosen": -1.9332492351531982, + "logps/rejected": -2.012324333190918, + "loss": 1.7858917236328125, + "nll_loss": 1.7145456075668335, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1933249533176422, + "rewards/margins": 0.007907481864094734, + "rewards/rejected": -0.2012324333190918, + "step": 970 + }, + { + "epoch": 1.568, + "grad_norm": 0.1899784356355667, + "learning_rate": 1.927111111111111e-06, + "log_odds_chosen": 0.22156307101249695, + "log_odds_ratio": -0.6330583095550537, + "logits/chosen": 1.6686766147613525, + "logits/rejected": 1.634007453918457, + "logps/chosen": -1.867730736732483, + "logps/rejected": -2.0626068115234375, + "loss": 1.7414642333984376, + "nll_loss": 1.6781585216522217, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18677309155464172, + "rewards/margins": 0.019487615674734116, + "rewards/rejected": -0.20626071095466614, + "step": 980 + }, + { + "epoch": 1.584, + "grad_norm": 0.17067763209342957, + "learning_rate": 1.856e-06, + "log_odds_chosen": 0.18612933158874512, + "log_odds_ratio": -0.6796912550926208, + "logits/chosen": 1.6610866785049438, + "logits/rejected": 1.5915647745132446, + "logps/chosen": -1.931335687637329, + "logps/rejected": -2.088463306427002, + "loss": 1.752705192565918, + "nll_loss": 1.684735655784607, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19313354790210724, + "rewards/margins": 0.01571280136704445, + "rewards/rejected": -0.20884636044502258, + "step": 990 + }, + { + "epoch": 1.6, + "grad_norm": 0.2076931893825531, + "learning_rate": 1.7848888888888888e-06, + "log_odds_chosen": 0.1702508181333542, + "log_odds_ratio": -0.6891772747039795, + "logits/chosen": 1.7464933395385742, + "logits/rejected": 1.6518672704696655, + "logps/chosen": -1.9614388942718506, + "logps/rejected": -2.1067023277282715, + "loss": 1.8084089279174804, + "nll_loss": 1.7394912242889404, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19614391028881073, + "rewards/margins": 0.014526346698403358, + "rewards/rejected": -0.21067026257514954, + "step": 1000 + }, + { + "epoch": 1.6, + "eval_log_odds_chosen": 0.02751515619456768, + "eval_log_odds_ratio": -0.7607263326644897, + "eval_logits/chosen": 1.7029526233673096, + "eval_logits/rejected": 1.67202889919281, + "eval_logps/chosen": -2.0067930221557617, + "eval_logps/rejected": -2.0359673500061035, + "eval_loss": 1.8261206150054932, + "eval_nll_loss": 1.7500479221343994, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20067930221557617, + "eval_rewards/margins": 0.002917409408837557, + "eval_rewards/rejected": -0.20359672605991364, + "eval_runtime": 53.4833, + "eval_samples_per_second": 9.349, + "eval_steps_per_second": 4.674, + "step": 1000 + }, + { + "epoch": 1.616, + "grad_norm": 0.24526309967041016, + "learning_rate": 1.7137777777777778e-06, + "log_odds_chosen": 0.18274818360805511, + "log_odds_ratio": -0.6588706374168396, + "logits/chosen": 1.748281717300415, + "logits/rejected": 1.8165216445922852, + "logps/chosen": -1.9252090454101562, + "logps/rejected": -2.0696587562561035, + "loss": 1.8064483642578124, + "nll_loss": 1.7405614852905273, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19252091646194458, + "rewards/margins": 0.014444932341575623, + "rewards/rejected": -0.2069658488035202, + "step": 1010 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.21430929005146027, + "learning_rate": 1.6426666666666666e-06, + "log_odds_chosen": 0.20202656090259552, + "log_odds_ratio": -0.6916796565055847, + "logits/chosen": 1.726967215538025, + "logits/rejected": 1.7503808736801147, + "logps/chosen": -1.885287880897522, + "logps/rejected": -2.0325818061828613, + "loss": 1.7769804000854492, + "nll_loss": 1.7078125476837158, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18852879106998444, + "rewards/margins": 0.014729383401572704, + "rewards/rejected": -0.2032581865787506, + "step": 1020 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.14660713076591492, + "learning_rate": 1.5715555555555555e-06, + "log_odds_chosen": 0.08981350809335709, + "log_odds_ratio": -0.7998946309089661, + "logits/chosen": 1.7373206615447998, + "logits/rejected": 1.685080885887146, + "logps/chosen": -1.9951941967010498, + "logps/rejected": -2.0966899394989014, + "loss": 1.7756771087646483, + "nll_loss": 1.6956878900527954, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.19951942563056946, + "rewards/margins": 0.010149596258997917, + "rewards/rejected": -0.20966899394989014, + "step": 1030 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.20548580586910248, + "learning_rate": 1.5004444444444445e-06, + "log_odds_chosen": 0.2052316665649414, + "log_odds_ratio": -0.6860161423683167, + "logits/chosen": 1.640928030014038, + "logits/rejected": 1.6122572422027588, + "logps/chosen": -1.8723407983779907, + "logps/rejected": -2.0312299728393555, + "loss": 1.6772642135620117, + "nll_loss": 1.6086626052856445, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1872340738773346, + "rewards/margins": 0.01588893122971058, + "rewards/rejected": -0.20312300324440002, + "step": 1040 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.16696485877037048, + "learning_rate": 1.4293333333333332e-06, + "log_odds_chosen": 0.14438050985336304, + "log_odds_ratio": -0.7038607597351074, + "logits/chosen": 1.71317458152771, + "logits/rejected": 1.705130934715271, + "logps/chosen": -1.912302017211914, + "logps/rejected": -2.036870002746582, + "loss": 1.7682226181030274, + "nll_loss": 1.6978362798690796, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19123020768165588, + "rewards/margins": 0.012456776574254036, + "rewards/rejected": -0.20368699729442596, + "step": 1050 + }, + { + "epoch": 1.6800000000000002, + "eval_log_odds_chosen": 0.02581177093088627, + "eval_log_odds_ratio": -0.7616178393363953, + "eval_logits/chosen": 1.7283238172531128, + "eval_logits/rejected": 1.6983083486557007, + "eval_logps/chosen": -2.000814914703369, + "eval_logps/rejected": -2.028439998626709, + "eval_loss": 1.8245528936386108, + "eval_nll_loss": 1.7483911514282227, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.2000814825296402, + "eval_rewards/margins": 0.0027625402435660362, + "eval_rewards/rejected": -0.20284400880336761, + "eval_runtime": 53.4143, + "eval_samples_per_second": 9.361, + "eval_steps_per_second": 4.68, + "step": 1050 + }, + { + "epoch": 1.696, + "grad_norm": 0.2045900672674179, + "learning_rate": 1.3582222222222222e-06, + "log_odds_chosen": -0.061510004103183746, + "log_odds_ratio": -0.8348624110221863, + "logits/chosen": 1.605478286743164, + "logits/rejected": 1.642643690109253, + "logps/chosen": -2.066878080368042, + "logps/rejected": -2.0267105102539062, + "loss": 1.7784732818603515, + "nll_loss": 1.6949872970581055, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2066878080368042, + "rewards/margins": -0.004016753286123276, + "rewards/rejected": -0.20267105102539062, + "step": 1060 + }, + { + "epoch": 1.712, + "grad_norm": 0.20866218209266663, + "learning_rate": 1.2871111111111111e-06, + "log_odds_chosen": 0.20968547463417053, + "log_odds_ratio": -0.6722984313964844, + "logits/chosen": 1.7972224950790405, + "logits/rejected": 1.7789695262908936, + "logps/chosen": -1.88360595703125, + "logps/rejected": -2.0617847442626953, + "loss": 1.757400894165039, + "nll_loss": 1.690171241760254, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18836061656475067, + "rewards/margins": 0.017817873507738113, + "rewards/rejected": -0.2061784714460373, + "step": 1070 + }, + { + "epoch": 1.728, + "grad_norm": 0.1793377697467804, + "learning_rate": 1.2159999999999999e-06, + "log_odds_chosen": -0.09240353852510452, + "log_odds_ratio": -0.8129439353942871, + "logits/chosen": 1.7852309942245483, + "logits/rejected": 1.7333061695098877, + "logps/chosen": -1.9978444576263428, + "logps/rejected": -1.9168767929077148, + "loss": 1.7519147872924805, + "nll_loss": 1.670620322227478, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19978444278240204, + "rewards/margins": -0.008096768520772457, + "rewards/rejected": -0.191687673330307, + "step": 1080 + }, + { + "epoch": 1.744, + "grad_norm": 0.11997362971305847, + "learning_rate": 1.1448888888888888e-06, + "log_odds_chosen": -0.11117003858089447, + "log_odds_ratio": -0.8359676599502563, + "logits/chosen": 1.8102128505706787, + "logits/rejected": 1.8499844074249268, + "logps/chosen": -1.976915717124939, + "logps/rejected": -1.890819787979126, + "loss": 1.8335563659667968, + "nll_loss": 1.7499593496322632, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19769158959388733, + "rewards/margins": -0.008609614335000515, + "rewards/rejected": -0.18908196687698364, + "step": 1090 + }, + { + "epoch": 1.76, + "grad_norm": 0.1679936945438385, + "learning_rate": 1.0737777777777776e-06, + "log_odds_chosen": 0.17219075560569763, + "log_odds_ratio": -0.7893794775009155, + "logits/chosen": 1.6944379806518555, + "logits/rejected": 1.5971474647521973, + "logps/chosen": -1.9780410528182983, + "logps/rejected": -2.1673245429992676, + "loss": 1.7718217849731446, + "nll_loss": 1.692883849143982, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19780410826206207, + "rewards/margins": 0.01892835833132267, + "rewards/rejected": -0.216732457280159, + "step": 1100 + }, + { + "epoch": 1.76, + "eval_log_odds_chosen": 0.029876820743083954, + "eval_log_odds_ratio": -0.7605326175689697, + "eval_logits/chosen": 1.7276082038879395, + "eval_logits/rejected": 1.6985164880752563, + "eval_logps/chosen": -2.0030901432037354, + "eval_logps/rejected": -2.0341217517852783, + "eval_loss": 1.8238236904144287, + "eval_nll_loss": 1.7477705478668213, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.20030902326107025, + "eval_rewards/margins": 0.003103181254118681, + "eval_rewards/rejected": -0.20341220498085022, + "eval_runtime": 53.5841, + "eval_samples_per_second": 9.331, + "eval_steps_per_second": 4.666, + "step": 1100 + }, + { + "epoch": 1.776, + "grad_norm": 0.18926319479942322, + "learning_rate": 1.0026666666666665e-06, + "log_odds_chosen": 0.14324593544006348, + "log_odds_ratio": -0.6872311234474182, + "logits/chosen": 1.7348664999008179, + "logits/rejected": 1.749889612197876, + "logps/chosen": -1.9070713520050049, + "logps/rejected": -2.018203020095825, + "loss": 1.7610118865966797, + "nll_loss": 1.6922893524169922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19070713222026825, + "rewards/margins": 0.011113164946436882, + "rewards/rejected": -0.20182029902935028, + "step": 1110 + }, + { + "epoch": 1.792, + "grad_norm": 0.1704036146402359, + "learning_rate": 9.315555555555555e-07, + "log_odds_chosen": 0.12752032279968262, + "log_odds_ratio": -0.7044534683227539, + "logits/chosen": 1.6711995601654053, + "logits/rejected": 1.711930513381958, + "logps/chosen": -2.0130393505096436, + "logps/rejected": -2.13267183303833, + "loss": 1.7703908920288085, + "nll_loss": 1.6999456882476807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20130392909049988, + "rewards/margins": 0.011963268741965294, + "rewards/rejected": -0.21326720714569092, + "step": 1120 + }, + { + "epoch": 1.808, + "grad_norm": 0.25536438822746277, + "learning_rate": 8.604444444444445e-07, + "log_odds_chosen": 0.051788054406642914, + "log_odds_ratio": -0.7237650156021118, + "logits/chosen": 1.7273956537246704, + "logits/rejected": 1.7318992614746094, + "logps/chosen": -1.9477020502090454, + "logps/rejected": -1.9869747161865234, + "loss": 1.742520523071289, + "nll_loss": 1.6701440811157227, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1947702020406723, + "rewards/margins": 0.003927274607121944, + "rewards/rejected": -0.19869747757911682, + "step": 1130 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.1945340931415558, + "learning_rate": 7.893333333333333e-07, + "log_odds_chosen": 0.07333675771951675, + "log_odds_ratio": -0.7535021901130676, + "logits/chosen": 1.6395387649536133, + "logits/rejected": 1.667943000793457, + "logps/chosen": -1.9120795726776123, + "logps/rejected": -1.9539562463760376, + "loss": 1.7532955169677735, + "nll_loss": 1.6779453754425049, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19120794534683228, + "rewards/margins": 0.004187657497823238, + "rewards/rejected": -0.19539561867713928, + "step": 1140 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.1398656815290451, + "learning_rate": 7.182222222222222e-07, + "log_odds_chosen": 0.16766589879989624, + "log_odds_ratio": -0.6838506460189819, + "logits/chosen": 1.7418153285980225, + "logits/rejected": 1.7515296936035156, + "logps/chosen": -1.9456104040145874, + "logps/rejected": -2.071044445037842, + "loss": 1.7342472076416016, + "nll_loss": 1.6658618450164795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19456104934215546, + "rewards/margins": 0.01254339050501585, + "rewards/rejected": -0.20710444450378418, + "step": 1150 + }, + { + "epoch": 1.8399999999999999, + "eval_log_odds_chosen": 0.02730737067759037, + "eval_log_odds_ratio": -0.7609456181526184, + "eval_logits/chosen": 1.732469916343689, + "eval_logits/rejected": 1.703104853630066, + "eval_logps/chosen": -2.002647638320923, + "eval_logps/rejected": -2.0316905975341797, + "eval_loss": 1.8241521120071411, + "eval_nll_loss": 1.74805748462677, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20026475191116333, + "eval_rewards/margins": 0.002904308494180441, + "eval_rewards/rejected": -0.2031690627336502, + "eval_runtime": 53.4648, + "eval_samples_per_second": 9.352, + "eval_steps_per_second": 4.676, + "step": 1150 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.22400490939617157, + "learning_rate": 6.47111111111111e-07, + "log_odds_chosen": 0.1743636131286621, + "log_odds_ratio": -0.67372065782547, + "logits/chosen": 1.6396913528442383, + "logits/rejected": 1.667543649673462, + "logps/chosen": -1.782273292541504, + "logps/rejected": -1.9327503442764282, + "loss": 1.6943111419677734, + "nll_loss": 1.6269391775131226, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.17822733521461487, + "rewards/margins": 0.015047693625092506, + "rewards/rejected": -0.19327504932880402, + "step": 1160 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.18468748033046722, + "learning_rate": 5.76e-07, + "log_odds_chosen": 0.03543982282280922, + "log_odds_ratio": -0.7380915284156799, + "logits/chosen": 1.768194556236267, + "logits/rejected": 1.7483928203582764, + "logps/chosen": -1.959222435951233, + "logps/rejected": -1.9920673370361328, + "loss": 1.8366264343261718, + "nll_loss": 1.762817621231079, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19592224061489105, + "rewards/margins": 0.0032845207024365664, + "rewards/rejected": -0.19920675456523895, + "step": 1170 + }, + { + "epoch": 1.888, + "grad_norm": 0.22529029846191406, + "learning_rate": 5.048888888888889e-07, + "log_odds_chosen": 0.05032297968864441, + "log_odds_ratio": -0.7138996720314026, + "logits/chosen": 1.744073510169983, + "logits/rejected": 1.708142638206482, + "logps/chosen": -1.8569657802581787, + "logps/rejected": -1.9047315120697021, + "loss": 1.6976106643676758, + "nll_loss": 1.626220941543579, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1856965720653534, + "rewards/margins": 0.0047765769995749, + "rewards/rejected": -0.19047315418720245, + "step": 1180 + }, + { + "epoch": 1.904, + "grad_norm": 0.15872281789779663, + "learning_rate": 4.3377777777777773e-07, + "log_odds_chosen": 0.11936229467391968, + "log_odds_ratio": -0.6942839622497559, + "logits/chosen": 1.8646312952041626, + "logits/rejected": 1.788290023803711, + "logps/chosen": -1.9186222553253174, + "logps/rejected": -2.0216126441955566, + "loss": 1.7527915954589843, + "nll_loss": 1.6833631992340088, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19186219573020935, + "rewards/margins": 0.01029905118048191, + "rewards/rejected": -0.2021612673997879, + "step": 1190 + }, + { + "epoch": 1.92, + "grad_norm": 0.21331432461738586, + "learning_rate": 3.626666666666667e-07, + "log_odds_chosen": 0.15667389333248138, + "log_odds_ratio": -0.7080708146095276, + "logits/chosen": 1.7474346160888672, + "logits/rejected": 1.706602692604065, + "logps/chosen": -1.8474972248077393, + "logps/rejected": -1.9875065088272095, + "loss": 1.7560319900512695, + "nll_loss": 1.6852247714996338, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18474970757961273, + "rewards/margins": 0.014000937342643738, + "rewards/rejected": -0.19875064492225647, + "step": 1200 + }, + { + "epoch": 1.92, + "eval_log_odds_chosen": 0.028188293799757957, + "eval_log_odds_ratio": -0.7610485553741455, + "eval_logits/chosen": 1.7313594818115234, + "eval_logits/rejected": 1.7020314931869507, + "eval_logps/chosen": -2.0000693798065186, + "eval_logps/rejected": -2.0295324325561523, + "eval_loss": 1.8229460716247559, + "eval_nll_loss": 1.7468411922454834, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.20000694692134857, + "eval_rewards/margins": 0.002946300432085991, + "eval_rewards/rejected": -0.2029532641172409, + "eval_runtime": 53.5458, + "eval_samples_per_second": 9.338, + "eval_steps_per_second": 4.669, + "step": 1200 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1200/training_args.bin b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a6aabf8274a2e96d42587f441ac5dfa45450316 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e300a4b9ea9cb4eee02d826775897387bbbe1b2eb5ac963e6331fd584f8ee0 +size 5457 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/README.md b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/adapter_config.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a69561f79e9919bdd2eb3aaaca6c08223d07d2b5 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/adapter_model.safetensors b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d4c9d99d861bb7443b14db200a56836c451a7b87 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d225c2e9ce89202661efcce1606db3f986945f0b521e052269692bb9754be1 +size 180385008 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/chat_template.jinja b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/optimizer.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..488943467989a31178a1d88c5dde513339e2232d --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c74d0adeae4399680fbc9631ff4ede8a236d0268118d0c797679d2500a246c82 +size 360902475 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/rng_state.pth b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2275456c138024633720ad98c6c63acbe9777008 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a95215f64b02d62fb58ace326ad670f1d16eb1761f7fa3b3478d43d2b8d6108 +size 14645 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/scaler.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..07ee4b50f44b17bb5b8227eea1d6870fb9256838 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81f7e3f1db34425f7108cfef030ac75cf6192318c9d437aad0ccd8bfd7f16788 +size 1383 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/scheduler.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f275c37686aa1edf0b6a2a0d6990784f9c44b68 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b082124f96195b46786269fec0d33d84cac0ff6a412c8304af759bf69ed3c871 +size 1465 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/tokenizer.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/tokenizer_config.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/trainer_state.json b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4e9570c90c307b93552aeef18fd1da9686372370 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/trainer_state.json @@ -0,0 +1,2759 @@ +{ + "best_global_step": 50, + "best_metric": 0.5519999861717224, + "best_model_checkpoint": "output/lora/checkpoint-50", + "epoch": 2.0, + "eval_steps": 50, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 1.3608590364456177, + "learning_rate": 5.76e-07, + "log_odds_chosen": 0.0073966654017567635, + "log_odds_ratio": -0.8660133481025696, + "logits/chosen": 1.1517311334609985, + "logits/rejected": 1.1107122898101807, + "logps/chosen": -3.0449740886688232, + "logps/rejected": -3.0518546104431152, + "loss": 3.4909488677978517, + "nll_loss": 3.4043469429016113, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3044974207878113, + "rewards/margins": 0.0006880179280415177, + "rewards/rejected": -0.3051854372024536, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 1.8278084993362427, + "learning_rate": 1.2159999999999999e-06, + "log_odds_chosen": -0.0764567106962204, + "log_odds_ratio": -0.9281005859375, + "logits/chosen": 0.985865592956543, + "logits/rejected": 0.9893043637275696, + "logps/chosen": -3.195783853530884, + "logps/rejected": -3.128960132598877, + "loss": 3.6714431762695314, + "nll_loss": 3.5786330699920654, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.3195783793926239, + "rewards/margins": -0.006682366132736206, + "rewards/rejected": -0.3128960430622101, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 0.7320559024810791, + "learning_rate": 1.856e-06, + "log_odds_chosen": -0.13351905345916748, + "log_odds_ratio": -0.968097984790802, + "logits/chosen": 1.097598910331726, + "logits/rejected": 1.1367751359939575, + "logps/chosen": -3.1909520626068115, + "logps/rejected": -3.0626091957092285, + "loss": 3.345610427856445, + "nll_loss": 3.2488014698028564, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3190951943397522, + "rewards/margins": -0.01283429004251957, + "rewards/rejected": -0.3062609136104584, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 0.6406434178352356, + "learning_rate": 2.496e-06, + "log_odds_chosen": 0.0689389556646347, + "log_odds_ratio": -0.7773251533508301, + "logits/chosen": 1.0645023584365845, + "logits/rejected": 1.0285594463348389, + "logps/chosen": -2.8054141998291016, + "logps/rejected": -2.8708128929138184, + "loss": 3.268035125732422, + "nll_loss": 3.190302848815918, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.28054141998291016, + "rewards/margins": 0.006539878435432911, + "rewards/rejected": -0.2870813012123108, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 0.5944439172744751, + "learning_rate": 3.136e-06, + "log_odds_chosen": -0.14803443849086761, + "log_odds_ratio": -0.9101846814155579, + "logits/chosen": 1.166550874710083, + "logits/rejected": 1.1396485567092896, + "logps/chosen": -2.988274335861206, + "logps/rejected": -2.8451037406921387, + "loss": 3.138271141052246, + "nll_loss": 3.0472521781921387, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.2988274395465851, + "rewards/margins": -0.01431706827133894, + "rewards/rejected": -0.28451037406921387, + "step": 50 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.13235610723495483, + "eval_log_odds_ratio": -0.8047618269920349, + "eval_logits/chosen": 1.097177267074585, + "eval_logits/rejected": 1.080869197845459, + "eval_logps/chosen": -2.87162446975708, + "eval_logps/rejected": -3.0049262046813965, + "eval_loss": 3.0927987098693848, + "eval_nll_loss": 3.012322425842285, + "eval_rewards/accuracies": 0.5519999861717224, + "eval_rewards/chosen": -0.2871624529361725, + "eval_rewards/margins": 0.013330196961760521, + "eval_rewards/rejected": -0.30049264430999756, + "eval_runtime": 53.8284, + "eval_samples_per_second": 9.289, + "eval_steps_per_second": 4.644, + "step": 50 + }, + { + "epoch": 0.096, + "grad_norm": 0.5271857380867004, + "learning_rate": 3.7759999999999995e-06, + "log_odds_chosen": 0.16638590395450592, + "log_odds_ratio": -0.7247543334960938, + "logits/chosen": 1.2056455612182617, + "logits/rejected": 1.1612131595611572, + "logps/chosen": -2.6139063835144043, + "logps/rejected": -2.7844128608703613, + "loss": 2.9017066955566406, + "nll_loss": 2.8292312622070312, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.2613906264305115, + "rewards/margins": 0.017050642520189285, + "rewards/rejected": -0.27844128012657166, + "step": 60 + }, + { + "epoch": 0.112, + "grad_norm": 0.387198805809021, + "learning_rate": 4.416000000000001e-06, + "log_odds_chosen": 0.04409245774149895, + "log_odds_ratio": -0.7438842058181763, + "logits/chosen": 1.2054810523986816, + "logits/rejected": 1.1467866897583008, + "logps/chosen": -2.6231472492218018, + "logps/rejected": -2.6642415523529053, + "loss": 2.8387472152709963, + "nll_loss": 2.7643589973449707, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.26231473684310913, + "rewards/margins": 0.004109424538910389, + "rewards/rejected": -0.26642411947250366, + "step": 70 + }, + { + "epoch": 0.128, + "grad_norm": 0.3270696699619293, + "learning_rate": 5.0559999999999995e-06, + "log_odds_chosen": 0.250882089138031, + "log_odds_ratio": -0.7435027956962585, + "logits/chosen": 1.0074714422225952, + "logits/rejected": 0.9184917211532593, + "logps/chosen": -2.6167237758636475, + "logps/rejected": -2.8408350944519043, + "loss": 2.674570655822754, + "nll_loss": 2.600220203399658, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.26167237758636475, + "rewards/margins": 0.022411148995161057, + "rewards/rejected": -0.2840835154056549, + "step": 80 + }, + { + "epoch": 0.144, + "grad_norm": 0.30302050709724426, + "learning_rate": 5.695999999999999e-06, + "log_odds_chosen": 0.05548218637704849, + "log_odds_ratio": -0.7798537015914917, + "logits/chosen": 1.0686867237091064, + "logits/rejected": 1.0430196523666382, + "logps/chosen": -2.585648536682129, + "logps/rejected": -2.6337788105010986, + "loss": 2.5465234756469726, + "nll_loss": 2.468538522720337, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.258564829826355, + "rewards/margins": 0.004813040141016245, + "rewards/rejected": -0.2633778750896454, + "step": 90 + }, + { + "epoch": 0.16, + "grad_norm": 0.37143367528915405, + "learning_rate": 6.336e-06, + "log_odds_chosen": 0.2372448742389679, + "log_odds_ratio": -0.6427541971206665, + "logits/chosen": 0.9812337160110474, + "logits/rejected": 1.095284104347229, + "logps/chosen": -2.2601876258850098, + "logps/rejected": -2.4757676124572754, + "loss": 2.297770690917969, + "nll_loss": 2.2334952354431152, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2260187566280365, + "rewards/margins": 0.021558010950684547, + "rewards/rejected": -0.2475767582654953, + "step": 100 + }, + { + "epoch": 0.16, + "eval_log_odds_chosen": 0.0465145967900753, + "eval_log_odds_ratio": -0.7675164341926575, + "eval_logits/chosen": 1.0497050285339355, + "eval_logits/rejected": 1.0021971464157104, + "eval_logps/chosen": -2.3477163314819336, + "eval_logps/rejected": -2.400268077850342, + "eval_loss": 2.2927565574645996, + "eval_nll_loss": 2.2160050868988037, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.23477165400981903, + "eval_rewards/margins": 0.005255142226815224, + "eval_rewards/rejected": -0.2400268018245697, + "eval_runtime": 53.4499, + "eval_samples_per_second": 9.355, + "eval_steps_per_second": 4.677, + "step": 100 + }, + { + "epoch": 0.176, + "grad_norm": 0.19999727606773376, + "learning_rate": 6.976e-06, + "log_odds_chosen": -0.004880452062934637, + "log_odds_ratio": -0.781264066696167, + "logits/chosen": 0.9816803932189941, + "logits/rejected": 0.9683195352554321, + "logps/chosen": -2.3489508628845215, + "logps/rejected": -2.3528401851654053, + "loss": 2.225248908996582, + "nll_loss": 2.1471219062805176, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.23489508032798767, + "rewards/margins": 0.00038894638419151306, + "rewards/rejected": -0.23528404533863068, + "step": 110 + }, + { + "epoch": 0.192, + "grad_norm": 0.23247133195400238, + "learning_rate": 7.616e-06, + "log_odds_chosen": 0.1978728026151657, + "log_odds_ratio": -0.692144513130188, + "logits/chosen": 1.0511195659637451, + "logits/rejected": 1.0562175512313843, + "logps/chosen": -2.1107664108276367, + "logps/rejected": -2.264862060546875, + "loss": 2.0843576431274413, + "nll_loss": 2.015143632888794, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21107664704322815, + "rewards/margins": 0.01540955901145935, + "rewards/rejected": -0.2264862060546875, + "step": 120 + }, + { + "epoch": 0.208, + "grad_norm": 0.18884675204753876, + "learning_rate": 7.971555555555556e-06, + "log_odds_chosen": -0.04111287742853165, + "log_odds_ratio": -0.7880030870437622, + "logits/chosen": 1.189117193222046, + "logits/rejected": 1.155256986618042, + "logps/chosen": -2.2085041999816895, + "logps/rejected": -2.1815085411071777, + "loss": 2.1168283462524413, + "nll_loss": 2.0380282402038574, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.22085042297840118, + "rewards/margins": -0.0026995770167559385, + "rewards/rejected": -0.21815085411071777, + "step": 130 + }, + { + "epoch": 0.224, + "grad_norm": 0.18060511350631714, + "learning_rate": 7.900444444444444e-06, + "log_odds_chosen": -0.0038092255126684904, + "log_odds_ratio": -0.8108028173446655, + "logits/chosen": 1.0828830003738403, + "logits/rejected": 1.1538610458374023, + "logps/chosen": -2.226916790008545, + "logps/rejected": -2.2349116802215576, + "loss": 1.993095588684082, + "nll_loss": 1.9120155572891235, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.22269168496131897, + "rewards/margins": 0.0007995119085535407, + "rewards/rejected": -0.22349116206169128, + "step": 140 + }, + { + "epoch": 0.24, + "grad_norm": 0.17054790258407593, + "learning_rate": 7.829333333333333e-06, + "log_odds_chosen": -0.017614809796214104, + "log_odds_ratio": -0.8039711117744446, + "logits/chosen": 1.1850899457931519, + "logits/rejected": 1.0987098217010498, + "logps/chosen": -2.1193461418151855, + "logps/rejected": -2.0869622230529785, + "loss": 1.9284049987792968, + "nll_loss": 1.8480079174041748, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2119346410036087, + "rewards/margins": -0.003238401608541608, + "rewards/rejected": -0.20869621634483337, + "step": 150 + }, + { + "epoch": 0.24, + "eval_log_odds_chosen": 0.030384650453925133, + "eval_log_odds_ratio": -0.7645502686500549, + "eval_logits/chosen": 1.1388652324676514, + "eval_logits/rejected": 1.0906065702438354, + "eval_logps/chosen": -2.155924081802368, + "eval_logps/rejected": -2.191805362701416, + "eval_loss": 1.9731156826019287, + "eval_nll_loss": 1.8966606855392456, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.2155924290418625, + "eval_rewards/margins": 0.0035881223157048225, + "eval_rewards/rejected": -0.21918053925037384, + "eval_runtime": 53.5115, + "eval_samples_per_second": 9.344, + "eval_steps_per_second": 4.672, + "step": 150 + }, + { + "epoch": 0.256, + "grad_norm": 0.1520330011844635, + "learning_rate": 7.75822222222222e-06, + "log_odds_chosen": 0.019025951623916626, + "log_odds_ratio": -0.7408558130264282, + "logits/chosen": 1.2669531106948853, + "logits/rejected": 1.1829397678375244, + "logps/chosen": -2.155996561050415, + "logps/rejected": -2.181856870651245, + "loss": 1.980598258972168, + "nll_loss": 1.9065126180648804, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.2155996561050415, + "rewards/margins": 0.0025860387831926346, + "rewards/rejected": -0.218185693025589, + "step": 160 + }, + { + "epoch": 0.272, + "grad_norm": 0.18062834441661835, + "learning_rate": 7.68711111111111e-06, + "log_odds_chosen": 0.21886181831359863, + "log_odds_ratio": -0.7065083384513855, + "logits/chosen": 1.2067815065383911, + "logits/rejected": 1.1557743549346924, + "logps/chosen": -2.1274123191833496, + "logps/rejected": -2.323625087738037, + "loss": 1.917841339111328, + "nll_loss": 1.8471901416778564, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.21274125576019287, + "rewards/margins": 0.019621269777417183, + "rewards/rejected": -0.2323625087738037, + "step": 170 + }, + { + "epoch": 0.288, + "grad_norm": 0.1637572944164276, + "learning_rate": 7.616e-06, + "log_odds_chosen": -0.05331949517130852, + "log_odds_ratio": -0.7956011891365051, + "logits/chosen": 1.2504408359527588, + "logits/rejected": 1.2527806758880615, + "logps/chosen": -2.083944082260132, + "logps/rejected": -2.0458476543426514, + "loss": 1.9163087844848632, + "nll_loss": 1.836748719215393, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20839443802833557, + "rewards/margins": -0.0038096606731414795, + "rewards/rejected": -0.2045847624540329, + "step": 180 + }, + { + "epoch": 0.304, + "grad_norm": 0.16114692389965057, + "learning_rate": 7.544888888888889e-06, + "log_odds_chosen": 0.07258275896310806, + "log_odds_ratio": -0.723730206489563, + "logits/chosen": 1.211503505706787, + "logits/rejected": 1.2072746753692627, + "logps/chosen": -1.9964195489883423, + "logps/rejected": -2.0695366859436035, + "loss": 1.851585578918457, + "nll_loss": 1.7792127132415771, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19964194297790527, + "rewards/margins": 0.007311700377613306, + "rewards/rejected": -0.20695367455482483, + "step": 190 + }, + { + "epoch": 0.32, + "grad_norm": 0.1941564530134201, + "learning_rate": 7.473777777777777e-06, + "log_odds_chosen": 0.2467677891254425, + "log_odds_ratio": -0.638620913028717, + "logits/chosen": 1.1916354894638062, + "logits/rejected": 1.1236536502838135, + "logps/chosen": -1.9711834192276, + "logps/rejected": -2.1870553493499756, + "loss": 1.8319341659545898, + "nll_loss": 1.7680721282958984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.19711832702159882, + "rewards/margins": 0.02158718928694725, + "rewards/rejected": -0.21870553493499756, + "step": 200 + }, + { + "epoch": 0.32, + "eval_log_odds_chosen": 0.027904914692044258, + "eval_log_odds_ratio": -0.7658352851867676, + "eval_logits/chosen": 1.264652132987976, + "eval_logits/rejected": 1.221064567565918, + "eval_logps/chosen": -2.1049089431762695, + "eval_logps/rejected": -2.1378941535949707, + "eval_loss": 1.9108957052230835, + "eval_nll_loss": 1.8343122005462646, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.210490882396698, + "eval_rewards/margins": 0.0032985424622893333, + "eval_rewards/rejected": -0.2137894183397293, + "eval_runtime": 53.4723, + "eval_samples_per_second": 9.351, + "eval_steps_per_second": 4.675, + "step": 200 + }, + { + "epoch": 0.336, + "grad_norm": 0.21130956709384918, + "learning_rate": 7.402666666666666e-06, + "log_odds_chosen": 0.05619993805885315, + "log_odds_ratio": -0.6978410482406616, + "logits/chosen": 1.3616042137145996, + "logits/rejected": 1.0906412601470947, + "logps/chosen": -2.0596275329589844, + "logps/rejected": -2.1113193035125732, + "loss": 1.8328233718872071, + "nll_loss": 1.763039231300354, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20596274733543396, + "rewards/margins": 0.005169177893549204, + "rewards/rejected": -0.2111319601535797, + "step": 210 + }, + { + "epoch": 0.352, + "grad_norm": 0.1935756653547287, + "learning_rate": 7.3315555555555546e-06, + "log_odds_chosen": -0.14753268659114838, + "log_odds_ratio": -0.8668686151504517, + "logits/chosen": 1.3593966960906982, + "logits/rejected": 1.282545804977417, + "logps/chosen": -2.158689022064209, + "logps/rejected": -2.0243842601776123, + "loss": 1.855682373046875, + "nll_loss": 1.7689956426620483, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.21586890518665314, + "rewards/margins": -0.01343047060072422, + "rewards/rejected": -0.20243844389915466, + "step": 220 + }, + { + "epoch": 0.368, + "grad_norm": 0.16679787635803223, + "learning_rate": 7.260444444444444e-06, + "log_odds_chosen": 0.06307810544967651, + "log_odds_ratio": -0.7288961410522461, + "logits/chosen": 1.3023065328598022, + "logits/rejected": 1.3040940761566162, + "logps/chosen": -2.015089511871338, + "logps/rejected": -2.0810322761535645, + "loss": 1.8415803909301758, + "nll_loss": 1.7686907052993774, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20150896906852722, + "rewards/margins": 0.00659425463527441, + "rewards/rejected": -0.2081032246351242, + "step": 230 + }, + { + "epoch": 0.384, + "grad_norm": 0.17210455238819122, + "learning_rate": 7.1893333333333325e-06, + "log_odds_chosen": 0.2240598499774933, + "log_odds_ratio": -0.6325433850288391, + "logits/chosen": 1.310367465019226, + "logits/rejected": 1.2338093519210815, + "logps/chosen": -1.9855142831802368, + "logps/rejected": -2.173218011856079, + "loss": 1.8426109313964845, + "nll_loss": 1.7793567180633545, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19855143129825592, + "rewards/margins": 0.018770387396216393, + "rewards/rejected": -0.21732179820537567, + "step": 240 + }, + { + "epoch": 0.4, + "grad_norm": 0.1474185585975647, + "learning_rate": 7.118222222222222e-06, + "log_odds_chosen": -0.09763683378696442, + "log_odds_ratio": -0.811837375164032, + "logits/chosen": 1.3854808807373047, + "logits/rejected": 1.4551901817321777, + "logps/chosen": -2.0398902893066406, + "logps/rejected": -1.9660927057266235, + "loss": 1.808901596069336, + "nll_loss": 1.7277179956436157, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.20398902893066406, + "rewards/margins": -0.007379765156656504, + "rewards/rejected": -0.1966092884540558, + "step": 250 + }, + { + "epoch": 0.4, + "eval_log_odds_chosen": 0.027593065053224564, + "eval_log_odds_ratio": -0.7654376029968262, + "eval_logits/chosen": 1.3605482578277588, + "eval_logits/rejected": 1.3180304765701294, + "eval_logps/chosen": -2.083819627761841, + "eval_logps/rejected": -2.115978479385376, + "eval_loss": 1.891451358795166, + "eval_nll_loss": 1.8149076700210571, + "eval_rewards/accuracies": 0.5239999890327454, + "eval_rewards/chosen": -0.20838195085525513, + "eval_rewards/margins": 0.003215902717784047, + "eval_rewards/rejected": -0.21159787476062775, + "eval_runtime": 53.4952, + "eval_samples_per_second": 9.347, + "eval_steps_per_second": 4.673, + "step": 250 + }, + { + "epoch": 0.416, + "grad_norm": 0.18132755160331726, + "learning_rate": 7.047111111111111e-06, + "log_odds_chosen": 0.07458068430423737, + "log_odds_ratio": -0.7160965800285339, + "logits/chosen": 1.4305390119552612, + "logits/rejected": 1.3339704275131226, + "logps/chosen": -1.9973361492156982, + "logps/rejected": -2.0638270378112793, + "loss": 1.8488407135009766, + "nll_loss": 1.777231216430664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19973360002040863, + "rewards/margins": 0.006649085786193609, + "rewards/rejected": -0.20638270676136017, + "step": 260 + }, + { + "epoch": 0.432, + "grad_norm": 0.156590074300766, + "learning_rate": 6.976e-06, + "log_odds_chosen": 0.21048691868782043, + "log_odds_ratio": -0.6846021413803101, + "logits/chosen": 1.3391703367233276, + "logits/rejected": 1.3043503761291504, + "logps/chosen": -1.9781110286712646, + "logps/rejected": -2.1710684299468994, + "loss": 1.8046062469482422, + "nll_loss": 1.736146330833435, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19781112670898438, + "rewards/margins": 0.019295744597911835, + "rewards/rejected": -0.21710684895515442, + "step": 270 + }, + { + "epoch": 0.448, + "grad_norm": 0.2862628698348999, + "learning_rate": 6.904888888888889e-06, + "log_odds_chosen": 0.19719335436820984, + "log_odds_ratio": -0.6589730978012085, + "logits/chosen": 1.3844366073608398, + "logits/rejected": 1.4240922927856445, + "logps/chosen": -1.9418405294418335, + "logps/rejected": -2.117654800415039, + "loss": 1.8532024383544923, + "nll_loss": 1.7873048782348633, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1941840648651123, + "rewards/margins": 0.017581436783075333, + "rewards/rejected": -0.21176549792289734, + "step": 280 + }, + { + "epoch": 0.464, + "grad_norm": 0.18839485943317413, + "learning_rate": 6.833777777777778e-06, + "log_odds_chosen": 0.2900911867618561, + "log_odds_ratio": -0.6224602460861206, + "logits/chosen": 1.4391660690307617, + "logits/rejected": 1.3507264852523804, + "logps/chosen": -1.9003779888153076, + "logps/rejected": -2.1583569049835205, + "loss": 1.7574338912963867, + "nll_loss": 1.6951879262924194, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1900378167629242, + "rewards/margins": 0.025797897949814796, + "rewards/rejected": -0.21583569049835205, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 0.19719934463500977, + "learning_rate": 6.762666666666667e-06, + "log_odds_chosen": 0.07987387478351593, + "log_odds_ratio": -0.7276886105537415, + "logits/chosen": 1.3382771015167236, + "logits/rejected": 1.33270263671875, + "logps/chosen": -1.9756524562835693, + "logps/rejected": -2.0477142333984375, + "loss": 1.8089508056640624, + "nll_loss": 1.7361822128295898, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.1975652575492859, + "rewards/margins": 0.007206143345683813, + "rewards/rejected": -0.20477142930030823, + "step": 300 + }, + { + "epoch": 0.48, + "eval_log_odds_chosen": 0.02609633468091488, + "eval_log_odds_ratio": -0.7655816674232483, + "eval_logits/chosen": 1.4215049743652344, + "eval_logits/rejected": 1.3795844316482544, + "eval_logps/chosen": -2.0664799213409424, + "eval_logps/rejected": -2.096339225769043, + "eval_loss": 1.8772507905960083, + "eval_nll_loss": 1.8006926774978638, + "eval_rewards/accuracies": 0.5260000228881836, + "eval_rewards/chosen": -0.20664800703525543, + "eval_rewards/margins": 0.002985927276313305, + "eval_rewards/rejected": -0.209633931517601, + "eval_runtime": 53.5083, + "eval_samples_per_second": 9.344, + "eval_steps_per_second": 4.672, + "step": 300 + }, + { + "epoch": 0.496, + "grad_norm": 0.1585138589143753, + "learning_rate": 6.691555555555555e-06, + "log_odds_chosen": -0.007139368914067745, + "log_odds_ratio": -0.7591060400009155, + "logits/chosen": 1.4326140880584717, + "logits/rejected": 1.3895976543426514, + "logps/chosen": -1.969129204750061, + "logps/rejected": -1.9710346460342407, + "loss": 1.8410327911376954, + "nll_loss": 1.7651220560073853, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19691291451454163, + "rewards/margins": 0.00019057458848692477, + "rewards/rejected": -0.19710348546504974, + "step": 310 + }, + { + "epoch": 0.512, + "grad_norm": 0.22688154876232147, + "learning_rate": 6.620444444444444e-06, + "log_odds_chosen": 0.06559257209300995, + "log_odds_ratio": -0.7108487486839294, + "logits/chosen": 1.448866367340088, + "logits/rejected": 1.4073840379714966, + "logps/chosen": -2.066251039505005, + "logps/rejected": -2.116321086883545, + "loss": 1.8646768569946288, + "nll_loss": 1.793591856956482, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.2066251039505005, + "rewards/margins": 0.0050069959834218025, + "rewards/rejected": -0.21163210272789001, + "step": 320 + }, + { + "epoch": 0.528, + "grad_norm": 0.18054792284965515, + "learning_rate": 6.549333333333333e-06, + "log_odds_chosen": 0.017626959830522537, + "log_odds_ratio": -0.7846770882606506, + "logits/chosen": 1.3628690242767334, + "logits/rejected": 1.4442317485809326, + "logps/chosen": -1.9719655513763428, + "logps/rejected": -1.9827516078948975, + "loss": 1.7763628005981444, + "nll_loss": 1.6978952884674072, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19719655811786652, + "rewards/margins": 0.001078630331903696, + "rewards/rejected": -0.19827519357204437, + "step": 330 + }, + { + "epoch": 0.544, + "grad_norm": 0.20558743178844452, + "learning_rate": 6.478222222222222e-06, + "log_odds_chosen": 0.16140693426132202, + "log_odds_ratio": -0.6672384738922119, + "logits/chosen": 1.4473990201950073, + "logits/rejected": 1.3463249206542969, + "logps/chosen": -2.042762041091919, + "logps/rejected": -2.17926287651062, + "loss": 1.8391897201538085, + "nll_loss": 1.772465705871582, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2042761743068695, + "rewards/margins": 0.013650094158947468, + "rewards/rejected": -0.21792630851268768, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 0.198579341173172, + "learning_rate": 6.407111111111111e-06, + "log_odds_chosen": 0.15977905690670013, + "log_odds_ratio": -0.6974108815193176, + "logits/chosen": 1.4000948667526245, + "logits/rejected": 1.3558040857315063, + "logps/chosen": -2.04255747795105, + "logps/rejected": -2.184256076812744, + "loss": 1.8088817596435547, + "nll_loss": 1.7391407489776611, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20425572991371155, + "rewards/margins": 0.014169883914291859, + "rewards/rejected": -0.21842563152313232, + "step": 350 + }, + { + "epoch": 0.56, + "eval_log_odds_chosen": 0.022830627858638763, + "eval_log_odds_ratio": -0.7658974528312683, + "eval_logits/chosen": 1.4262471199035645, + "eval_logits/rejected": 1.3848557472229004, + "eval_logps/chosen": -2.054938554763794, + "eval_logps/rejected": -2.0817372798919678, + "eval_loss": 1.8663586378097534, + "eval_nll_loss": 1.7897688150405884, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.20549388229846954, + "eval_rewards/margins": 0.0026798879262059927, + "eval_rewards/rejected": -0.2081737518310547, + "eval_runtime": 53.4938, + "eval_samples_per_second": 9.347, + "eval_steps_per_second": 4.673, + "step": 350 + }, + { + "epoch": 0.576, + "grad_norm": 0.2030162364244461, + "learning_rate": 6.336e-06, + "log_odds_chosen": 0.1422426402568817, + "log_odds_ratio": -0.7028877139091492, + "logits/chosen": 1.510601282119751, + "logits/rejected": 1.4965307712554932, + "logps/chosen": -2.0040202140808105, + "logps/rejected": -2.131716251373291, + "loss": 1.8563343048095704, + "nll_loss": 1.7860454320907593, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.20040205121040344, + "rewards/margins": 0.01276957057416439, + "rewards/rejected": -0.21317163109779358, + "step": 360 + }, + { + "epoch": 0.592, + "grad_norm": 0.21605312824249268, + "learning_rate": 6.2648888888888885e-06, + "log_odds_chosen": 0.05190245434641838, + "log_odds_ratio": -0.7171397805213928, + "logits/chosen": 1.393733263015747, + "logits/rejected": 1.3512167930603027, + "logps/chosen": -2.0085949897766113, + "logps/rejected": -2.0461935997009277, + "loss": 1.8606908798217774, + "nll_loss": 1.7889766693115234, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.20085950195789337, + "rewards/margins": 0.0037598726339638233, + "rewards/rejected": -0.204619362950325, + "step": 370 + }, + { + "epoch": 0.608, + "grad_norm": 0.16586966812610626, + "learning_rate": 6.193777777777778e-06, + "log_odds_chosen": 0.1487434357404709, + "log_odds_ratio": -0.7054386138916016, + "logits/chosen": 1.4523228406906128, + "logits/rejected": 1.491560697555542, + "logps/chosen": -2.011794090270996, + "logps/rejected": -2.1287624835968018, + "loss": 1.7962881088256837, + "nll_loss": 1.7257442474365234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20117942988872528, + "rewards/margins": 0.01169683039188385, + "rewards/rejected": -0.21287624537944794, + "step": 380 + }, + { + "epoch": 0.624, + "grad_norm": 0.2835615873336792, + "learning_rate": 6.1226666666666664e-06, + "log_odds_chosen": 0.13590653240680695, + "log_odds_ratio": -0.6991716623306274, + "logits/chosen": 1.3472636938095093, + "logits/rejected": 1.4732224941253662, + "logps/chosen": -1.972169280052185, + "logps/rejected": -2.08345365524292, + "loss": 1.8266469955444335, + "nll_loss": 1.7567298412322998, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1972169280052185, + "rewards/margins": 0.01112845353782177, + "rewards/rejected": -0.20834538340568542, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 0.15232031047344208, + "learning_rate": 6.051555555555556e-06, + "log_odds_chosen": 0.1982167661190033, + "log_odds_ratio": -0.6656922698020935, + "logits/chosen": 1.6487038135528564, + "logits/rejected": 1.6309928894042969, + "logps/chosen": -1.919835090637207, + "logps/rejected": -2.1005043983459473, + "loss": 1.8271181106567382, + "nll_loss": 1.7605489492416382, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19198350608348846, + "rewards/margins": 0.018066909164190292, + "rewards/rejected": -0.21005041897296906, + "step": 400 + }, + { + "epoch": 0.64, + "eval_log_odds_chosen": 0.028749216347932816, + "eval_log_odds_ratio": -0.763251543045044, + "eval_logits/chosen": 1.5312973260879517, + "eval_logits/rejected": 1.4910560846328735, + "eval_logps/chosen": -2.0474772453308105, + "eval_logps/rejected": -2.079118013381958, + "eval_loss": 1.85878586769104, + "eval_nll_loss": 1.7824609279632568, + "eval_rewards/accuracies": 0.527999997138977, + "eval_rewards/chosen": -0.20474772155284882, + "eval_rewards/margins": 0.0031640806701034307, + "eval_rewards/rejected": -0.20791178941726685, + "eval_runtime": 53.5987, + "eval_samples_per_second": 9.329, + "eval_steps_per_second": 4.664, + "step": 400 + }, + { + "epoch": 0.656, + "grad_norm": 0.19347190856933594, + "learning_rate": 5.980444444444444e-06, + "log_odds_chosen": 0.16494014859199524, + "log_odds_ratio": -0.6926692724227905, + "logits/chosen": 1.5291383266448975, + "logits/rejected": 1.4509787559509277, + "logps/chosen": -2.0051121711730957, + "logps/rejected": -2.141641139984131, + "loss": 1.814227294921875, + "nll_loss": 1.7449604272842407, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.20051121711730957, + "rewards/margins": 0.013652893714606762, + "rewards/rejected": -0.2141641080379486, + "step": 410 + }, + { + "epoch": 0.672, + "grad_norm": 0.22619830071926117, + "learning_rate": 5.909333333333333e-06, + "log_odds_chosen": 0.13791924715042114, + "log_odds_ratio": -0.7470442056655884, + "logits/chosen": 1.604962706565857, + "logits/rejected": 1.5795490741729736, + "logps/chosen": -1.9936443567276, + "logps/rejected": -2.138291835784912, + "loss": 1.7467041015625, + "nll_loss": 1.6719995737075806, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19936442375183105, + "rewards/margins": 0.014464760199189186, + "rewards/rejected": -0.21382920444011688, + "step": 420 + }, + { + "epoch": 0.688, + "grad_norm": 0.2198985368013382, + "learning_rate": 5.838222222222221e-06, + "log_odds_chosen": 0.10624992847442627, + "log_odds_ratio": -0.7233898639678955, + "logits/chosen": 1.5435715913772583, + "logits/rejected": 1.4956719875335693, + "logps/chosen": -1.9429054260253906, + "logps/rejected": -2.0160951614379883, + "loss": 1.7684484481811524, + "nll_loss": 1.696109414100647, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19429054856300354, + "rewards/margins": 0.0073189930990338326, + "rewards/rejected": -0.2016095370054245, + "step": 430 + }, + { + "epoch": 0.704, + "grad_norm": 0.18691220879554749, + "learning_rate": 5.767111111111111e-06, + "log_odds_chosen": -0.021660882979631424, + "log_odds_ratio": -0.7595964670181274, + "logits/chosen": 1.6979032754898071, + "logits/rejected": 1.6603370904922485, + "logps/chosen": -1.9889543056488037, + "logps/rejected": -1.965158462524414, + "loss": 1.8263116836547852, + "nll_loss": 1.750352144241333, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19889545440673828, + "rewards/margins": -0.002379600191488862, + "rewards/rejected": -0.19651584327220917, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 0.15867650508880615, + "learning_rate": 5.695999999999999e-06, + "log_odds_chosen": -0.0730157420039177, + "log_odds_ratio": -0.8161319494247437, + "logits/chosen": 1.5580207109451294, + "logits/rejected": 1.5313141345977783, + "logps/chosen": -2.104485034942627, + "logps/rejected": -2.0238442420959473, + "loss": 1.8731468200683594, + "nll_loss": 1.7915337085723877, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.2104485034942627, + "rewards/margins": -0.008064089342951775, + "rewards/rejected": -0.20238442718982697, + "step": 450 + }, + { + "epoch": 0.72, + "eval_log_odds_chosen": 0.02652684412896633, + "eval_log_odds_ratio": -0.7636462450027466, + "eval_logits/chosen": 1.5893203020095825, + "eval_logits/rejected": 1.5504162311553955, + "eval_logps/chosen": -2.0457892417907715, + "eval_logps/rejected": -2.0760769844055176, + "eval_loss": 1.854221224784851, + "eval_nll_loss": 1.7778565883636475, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.2045789510011673, + "eval_rewards/margins": 0.003028758568689227, + "eval_rewards/rejected": -0.2076077163219452, + "eval_runtime": 53.545, + "eval_samples_per_second": 9.338, + "eval_steps_per_second": 4.669, + "step": 450 + }, + { + "epoch": 0.736, + "grad_norm": 0.2154403030872345, + "learning_rate": 5.624888888888889e-06, + "log_odds_chosen": 0.1506679207086563, + "log_odds_ratio": -0.6985970139503479, + "logits/chosen": 1.5237703323364258, + "logits/rejected": 1.5915908813476562, + "logps/chosen": -2.0275137424468994, + "logps/rejected": -2.1536142826080322, + "loss": 1.8502899169921876, + "nll_loss": 1.7804298400878906, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.20275135338306427, + "rewards/margins": 0.012610049918293953, + "rewards/rejected": -0.21536140143871307, + "step": 460 + }, + { + "epoch": 0.752, + "grad_norm": 0.17461568117141724, + "learning_rate": 5.553777777777777e-06, + "log_odds_chosen": 0.17391428351402283, + "log_odds_ratio": -0.6869101524353027, + "logits/chosen": 1.5581696033477783, + "logits/rejected": 1.4896894693374634, + "logps/chosen": -1.9319404363632202, + "logps/rejected": -2.0868048667907715, + "loss": 1.7752193450927733, + "nll_loss": 1.7065280675888062, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19319406151771545, + "rewards/margins": 0.01548641175031662, + "rewards/rejected": -0.20868046581745148, + "step": 470 + }, + { + "epoch": 0.768, + "grad_norm": 0.2354680746793747, + "learning_rate": 5.482666666666667e-06, + "log_odds_chosen": 0.0810302346944809, + "log_odds_ratio": -0.7345963716506958, + "logits/chosen": 1.6190745830535889, + "logits/rejected": 1.5643622875213623, + "logps/chosen": -1.9312503337860107, + "logps/rejected": -2.0015344619750977, + "loss": 1.8230710983276368, + "nll_loss": 1.7496116161346436, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19312502443790436, + "rewards/margins": 0.007028433494269848, + "rewards/rejected": -0.20015347003936768, + "step": 480 + }, + { + "epoch": 0.784, + "grad_norm": 0.16735392808914185, + "learning_rate": 5.411555555555555e-06, + "log_odds_chosen": -0.01586019992828369, + "log_odds_ratio": -0.7565222382545471, + "logits/chosen": 1.479667067527771, + "logits/rejected": 1.504528522491455, + "logps/chosen": -2.03438138961792, + "logps/rejected": -2.0231566429138184, + "loss": 1.823404884338379, + "nll_loss": 1.7477527856826782, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.20343813300132751, + "rewards/margins": -0.0011224561603739858, + "rewards/rejected": -0.20231568813323975, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 0.1829444319009781, + "learning_rate": 5.3404444444444445e-06, + "log_odds_chosen": 0.11972711235284805, + "log_odds_ratio": -0.7277721166610718, + "logits/chosen": 1.5400944948196411, + "logits/rejected": 1.490321397781372, + "logps/chosen": -2.0169591903686523, + "logps/rejected": -2.1097054481506348, + "loss": 1.7935520172119142, + "nll_loss": 1.7207750082015991, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.20169591903686523, + "rewards/margins": 0.009274644777178764, + "rewards/rejected": -0.21097056567668915, + "step": 500 + }, + { + "epoch": 0.8, + "eval_log_odds_chosen": 0.023190366104245186, + "eval_log_odds_ratio": -0.7631560564041138, + "eval_logits/chosen": 1.5937296152114868, + "eval_logits/rejected": 1.5535894632339478, + "eval_logps/chosen": -2.0367894172668457, + "eval_logps/rejected": -2.063542366027832, + "eval_loss": 1.8493844270706177, + "eval_nll_loss": 1.773068904876709, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.2036789506673813, + "eval_rewards/margins": 0.002675286727026105, + "eval_rewards/rejected": -0.20635424554347992, + "eval_runtime": 53.6528, + "eval_samples_per_second": 9.319, + "eval_steps_per_second": 4.66, + "step": 500 + }, + { + "epoch": 0.816, + "grad_norm": 0.18359120190143585, + "learning_rate": 5.269333333333333e-06, + "log_odds_chosen": 0.0819496288895607, + "log_odds_ratio": -0.7168647646903992, + "logits/chosen": 1.5115034580230713, + "logits/rejected": 1.4402801990509033, + "logps/chosen": -2.0256223678588867, + "logps/rejected": -2.0993785858154297, + "loss": 1.8416709899902344, + "nll_loss": 1.7699846029281616, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.20256221294403076, + "rewards/margins": 0.007375650107860565, + "rewards/rejected": -0.20993788540363312, + "step": 510 + }, + { + "epoch": 0.832, + "grad_norm": 0.17817825078964233, + "learning_rate": 5.1982222222222225e-06, + "log_odds_chosen": 0.2734625041484833, + "log_odds_ratio": -0.6966907382011414, + "logits/chosen": 1.5594890117645264, + "logits/rejected": 1.5883136987686157, + "logps/chosen": -1.941902756690979, + "logps/rejected": -2.200758457183838, + "loss": 1.8082841873168944, + "nll_loss": 1.7386150360107422, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19419027864933014, + "rewards/margins": 0.02588556334376335, + "rewards/rejected": -0.2200758457183838, + "step": 520 + }, + { + "epoch": 0.848, + "grad_norm": 0.2906912863254547, + "learning_rate": 5.12711111111111e-06, + "log_odds_chosen": -0.05212322995066643, + "log_odds_ratio": -0.7876973152160645, + "logits/chosen": 1.5465106964111328, + "logits/rejected": 1.5060274600982666, + "logps/chosen": -2.023458480834961, + "logps/rejected": -1.9740943908691406, + "loss": 1.8264562606811523, + "nll_loss": 1.747686743736267, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2023458480834961, + "rewards/margins": -0.004936427343636751, + "rewards/rejected": -0.19740943610668182, + "step": 530 + }, + { + "epoch": 0.864, + "grad_norm": 0.20073647797107697, + "learning_rate": 5.0559999999999995e-06, + "log_odds_chosen": 0.005497487727552652, + "log_odds_ratio": -0.7783851623535156, + "logits/chosen": 1.685173749923706, + "logits/rejected": 1.7713344097137451, + "logps/chosen": -1.9037139415740967, + "logps/rejected": -1.8979514837265015, + "loss": 1.8080127716064454, + "nll_loss": 1.7301738262176514, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19037137925624847, + "rewards/margins": -0.0005762483924627304, + "rewards/rejected": -0.1897951364517212, + "step": 540 + }, + { + "epoch": 0.88, + "grad_norm": 0.1273553967475891, + "learning_rate": 4.984888888888888e-06, + "log_odds_chosen": 0.18364550173282623, + "log_odds_ratio": -0.6565154790878296, + "logits/chosen": 1.587527871131897, + "logits/rejected": 1.5570650100708008, + "logps/chosen": -1.878670334815979, + "logps/rejected": -2.036764621734619, + "loss": 1.8042585372924804, + "nll_loss": 1.738607406616211, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18786704540252686, + "rewards/margins": 0.01580941304564476, + "rewards/rejected": -0.20367643237113953, + "step": 550 + }, + { + "epoch": 0.88, + "eval_log_odds_chosen": 0.025964470580220222, + "eval_log_odds_ratio": -0.7649032473564148, + "eval_logits/chosen": 1.5951756238937378, + "eval_logits/rejected": 1.5590118169784546, + "eval_logps/chosen": -2.032235860824585, + "eval_logps/rejected": -2.061086654663086, + "eval_loss": 1.8456532955169678, + "eval_nll_loss": 1.7691627740859985, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.2032235562801361, + "eval_rewards/margins": 0.00288510974496603, + "eval_rewards/rejected": -0.2061086744070053, + "eval_runtime": 53.5193, + "eval_samples_per_second": 9.342, + "eval_steps_per_second": 4.671, + "step": 550 + }, + { + "epoch": 0.896, + "grad_norm": 0.2574485242366791, + "learning_rate": 4.9137777777777775e-06, + "log_odds_chosen": 0.2498103827238083, + "log_odds_ratio": -0.6482200026512146, + "logits/chosen": 1.4691909551620483, + "logits/rejected": 1.3890944719314575, + "logps/chosen": -1.944493055343628, + "logps/rejected": -2.15970778465271, + "loss": 1.757819938659668, + "nll_loss": 1.6929981708526611, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.1944493055343628, + "rewards/margins": 0.021521473303437233, + "rewards/rejected": -0.21597079932689667, + "step": 560 + }, + { + "epoch": 0.912, + "grad_norm": 0.1717718094587326, + "learning_rate": 4.842666666666666e-06, + "log_odds_chosen": 0.06141955778002739, + "log_odds_ratio": -0.7500567436218262, + "logits/chosen": 1.6567814350128174, + "logits/rejected": 1.5995619297027588, + "logps/chosen": -1.9717029333114624, + "logps/rejected": -2.0279083251953125, + "loss": 1.7976764678955077, + "nll_loss": 1.722670555114746, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19717030227184296, + "rewards/margins": 0.0056205070577561855, + "rewards/rejected": -0.20279082655906677, + "step": 570 + }, + { + "epoch": 0.928, + "grad_norm": 0.2214452028274536, + "learning_rate": 4.771555555555555e-06, + "log_odds_chosen": -0.03478344902396202, + "log_odds_ratio": -0.7769675850868225, + "logits/chosen": 1.6345102787017822, + "logits/rejected": 1.5030030012130737, + "logps/chosen": -1.9924647808074951, + "logps/rejected": -1.9646708965301514, + "loss": 1.8113759994506835, + "nll_loss": 1.7336797714233398, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.19924648106098175, + "rewards/margins": -0.002779375296086073, + "rewards/rejected": -0.1964671015739441, + "step": 580 + }, + { + "epoch": 0.944, + "grad_norm": 0.17967616021633148, + "learning_rate": 4.700444444444445e-06, + "log_odds_chosen": 0.10119867324829102, + "log_odds_ratio": -0.7037830352783203, + "logits/chosen": 1.5076220035552979, + "logits/rejected": 1.4201761484146118, + "logps/chosen": -1.9879302978515625, + "logps/rejected": -2.082368850708008, + "loss": 1.7988115310668946, + "nll_loss": 1.728433609008789, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19879302382469177, + "rewards/margins": 0.009443843737244606, + "rewards/rejected": -0.20823685824871063, + "step": 590 + }, + { + "epoch": 0.96, + "grad_norm": 0.1719515323638916, + "learning_rate": 4.629333333333333e-06, + "log_odds_chosen": 0.06453205645084381, + "log_odds_ratio": -0.722458004951477, + "logits/chosen": 1.5820884704589844, + "logits/rejected": 1.511212944984436, + "logps/chosen": -1.8608150482177734, + "logps/rejected": -1.913921594619751, + "loss": 1.73321590423584, + "nll_loss": 1.6609699726104736, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.18608148396015167, + "rewards/margins": 0.005310675594955683, + "rewards/rejected": -0.19139216840267181, + "step": 600 + }, + { + "epoch": 0.96, + "eval_log_odds_chosen": 0.03060421720147133, + "eval_log_odds_ratio": -0.7616795897483826, + "eval_logits/chosen": 1.6089800596237183, + "eval_logits/rejected": 1.5700582265853882, + "eval_logps/chosen": -2.03167986869812, + "eval_logps/rejected": -2.0645487308502197, + "eval_loss": 1.8425160646438599, + "eval_nll_loss": 1.7663480043411255, + "eval_rewards/accuracies": 0.5299999713897705, + "eval_rewards/chosen": -0.20316800475120544, + "eval_rewards/margins": 0.0032868909183889627, + "eval_rewards/rejected": -0.20645487308502197, + "eval_runtime": 53.5679, + "eval_samples_per_second": 9.334, + "eval_steps_per_second": 4.667, + "step": 600 + }, + { + "epoch": 0.976, + "grad_norm": 0.1661933809518814, + "learning_rate": 4.558222222222223e-06, + "log_odds_chosen": 0.29323670268058777, + "log_odds_ratio": -0.6596352458000183, + "logits/chosen": 1.619877576828003, + "logits/rejected": 1.5766648054122925, + "logps/chosen": -1.8296048641204834, + "logps/rejected": -2.056410789489746, + "loss": 1.7442157745361329, + "nll_loss": 1.6782522201538086, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.18296048045158386, + "rewards/margins": 0.02268058992922306, + "rewards/rejected": -0.20564107596874237, + "step": 610 + }, + { + "epoch": 0.992, + "grad_norm": 0.2402229607105255, + "learning_rate": 4.487111111111111e-06, + "log_odds_chosen": 0.09888540208339691, + "log_odds_ratio": -0.7262139916419983, + "logits/chosen": 1.5223571062088013, + "logits/rejected": 1.5384490489959717, + "logps/chosen": -1.81551194190979, + "logps/rejected": -1.8770872354507446, + "loss": 1.7421588897705078, + "nll_loss": 1.6695371866226196, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.18155118823051453, + "rewards/margins": 0.00615753885358572, + "rewards/rejected": -0.18770872056484222, + "step": 620 + }, + { + "epoch": 1.008, + "grad_norm": 0.20038332045078278, + "learning_rate": 4.416000000000001e-06, + "log_odds_chosen": 0.22199416160583496, + "log_odds_ratio": -0.702614426612854, + "logits/chosen": 1.5671889781951904, + "logits/rejected": 1.4507954120635986, + "logps/chosen": -1.8749492168426514, + "logps/rejected": -2.072239637374878, + "loss": 1.7758310317993165, + "nll_loss": 1.7055692672729492, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18749494850635529, + "rewards/margins": 0.019729027524590492, + "rewards/rejected": -0.20722396671772003, + "step": 630 + }, + { + "epoch": 1.024, + "grad_norm": 0.4241769015789032, + "learning_rate": 4.344888888888888e-06, + "log_odds_chosen": 0.13136598467826843, + "log_odds_ratio": -0.6841301918029785, + "logits/chosen": 1.5495809316635132, + "logits/rejected": 1.551116943359375, + "logps/chosen": -1.9033386707305908, + "logps/rejected": -2.0172486305236816, + "loss": 1.8465911865234375, + "nll_loss": 1.7781782150268555, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19033385813236237, + "rewards/margins": 0.011390982195734978, + "rewards/rejected": -0.2017248421907425, + "step": 640 + }, + { + "epoch": 1.04, + "grad_norm": 0.3452068269252777, + "learning_rate": 4.273777777777778e-06, + "log_odds_chosen": -0.07351900637149811, + "log_odds_ratio": -0.817279040813446, + "logits/chosen": 1.584192156791687, + "logits/rejected": 1.5928871631622314, + "logps/chosen": -2.058300495147705, + "logps/rejected": -1.9885162115097046, + "loss": 1.7821306228637694, + "nll_loss": 1.7004029750823975, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20583002269268036, + "rewards/margins": -0.006978417746722698, + "rewards/rejected": -0.19885161519050598, + "step": 650 + }, + { + "epoch": 1.04, + "eval_log_odds_chosen": 0.025821426883339882, + "eval_log_odds_ratio": -0.7627764344215393, + "eval_logits/chosen": 1.5995525121688843, + "eval_logits/rejected": 1.5644874572753906, + "eval_logps/chosen": -2.020664930343628, + "eval_logps/rejected": -2.0486767292022705, + "eval_loss": 1.8382052183151245, + "eval_nll_loss": 1.761927604675293, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20206648111343384, + "eval_rewards/margins": 0.0028012022376060486, + "eval_rewards/rejected": -0.2048676759004593, + "eval_runtime": 53.5051, + "eval_samples_per_second": 9.345, + "eval_steps_per_second": 4.672, + "step": 650 + }, + { + "epoch": 1.056, + "grad_norm": 0.2219850867986679, + "learning_rate": 4.202666666666666e-06, + "log_odds_chosen": 0.0981612354516983, + "log_odds_ratio": -0.7084988951683044, + "logits/chosen": 1.566150188446045, + "logits/rejected": 1.5409971475601196, + "logps/chosen": -1.9153779745101929, + "logps/rejected": -1.993215799331665, + "loss": 1.7639646530151367, + "nll_loss": 1.6931148767471313, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1915377825498581, + "rewards/margins": 0.007783782668411732, + "rewards/rejected": -0.19932158291339874, + "step": 660 + }, + { + "epoch": 1.072, + "grad_norm": 0.2081521451473236, + "learning_rate": 4.1315555555555556e-06, + "log_odds_chosen": 0.1091976910829544, + "log_odds_ratio": -0.6956084370613098, + "logits/chosen": 1.7070366144180298, + "logits/rejected": 1.643048882484436, + "logps/chosen": -1.9937528371810913, + "logps/rejected": -2.1002330780029297, + "loss": 1.8045967102050782, + "nll_loss": 1.7350358963012695, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19937530159950256, + "rewards/margins": 0.010648000985383987, + "rewards/rejected": -0.21002328395843506, + "step": 670 + }, + { + "epoch": 1.088, + "grad_norm": 0.15036334097385406, + "learning_rate": 4.060444444444444e-06, + "log_odds_chosen": 0.06587956845760345, + "log_odds_ratio": -0.735454261302948, + "logits/chosen": 1.6563133001327515, + "logits/rejected": 1.6276830434799194, + "logps/chosen": -1.9550457000732422, + "logps/rejected": -2.027952194213867, + "loss": 1.7694936752319337, + "nll_loss": 1.6959483623504639, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1955045759677887, + "rewards/margins": 0.007290668785572052, + "rewards/rejected": -0.20279522240161896, + "step": 680 + }, + { + "epoch": 1.104, + "grad_norm": 0.18966805934906006, + "learning_rate": 3.989333333333333e-06, + "log_odds_chosen": 0.10347900539636612, + "log_odds_ratio": -0.7044586539268494, + "logits/chosen": 1.7145198583602905, + "logits/rejected": 1.6799052953720093, + "logps/chosen": -1.9247310161590576, + "logps/rejected": -2.026733875274658, + "loss": 1.7826107025146485, + "nll_loss": 1.712165117263794, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.19247311353683472, + "rewards/margins": 0.010200263932347298, + "rewards/rejected": -0.20267336070537567, + "step": 690 + }, + { + "epoch": 1.12, + "grad_norm": 0.27768856287002563, + "learning_rate": 3.918222222222222e-06, + "log_odds_chosen": 0.22244243323802948, + "log_odds_ratio": -0.6758590936660767, + "logits/chosen": 1.5261728763580322, + "logits/rejected": 1.4202882051467896, + "logps/chosen": -1.9367029666900635, + "logps/rejected": -2.1093690395355225, + "loss": 1.7446697235107422, + "nll_loss": 1.677083969116211, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19367030262947083, + "rewards/margins": 0.01726660504937172, + "rewards/rejected": -0.21093690395355225, + "step": 700 + }, + { + "epoch": 1.12, + "eval_log_odds_chosen": 0.028961200267076492, + "eval_log_odds_ratio": -0.7605804800987244, + "eval_logits/chosen": 1.6280906200408936, + "eval_logits/rejected": 1.5934500694274902, + "eval_logps/chosen": -2.018935441970825, + "eval_logps/rejected": -2.049320936203003, + "eval_loss": 1.8348098993301392, + "eval_nll_loss": 1.7587517499923706, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20189355313777924, + "eval_rewards/margins": 0.003038552822545171, + "eval_rewards/rejected": -0.2049321085214615, + "eval_runtime": 53.5567, + "eval_samples_per_second": 9.336, + "eval_steps_per_second": 4.668, + "step": 700 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.2013629972934723, + "learning_rate": 3.8471111111111105e-06, + "log_odds_chosen": -0.03864391893148422, + "log_odds_ratio": -0.7773549556732178, + "logits/chosen": 1.677484154701233, + "logits/rejected": 1.539167881011963, + "logps/chosen": -2.087277889251709, + "logps/rejected": -2.0545527935028076, + "loss": 1.8523176193237305, + "nll_loss": 1.7745821475982666, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20872780680656433, + "rewards/margins": -0.0032725154887884855, + "rewards/rejected": -0.20545530319213867, + "step": 710 + }, + { + "epoch": 1.152, + "grad_norm": 0.20682214200496674, + "learning_rate": 3.7759999999999995e-06, + "log_odds_chosen": 0.03799188882112503, + "log_odds_ratio": -0.7313598394393921, + "logits/chosen": 1.670763373374939, + "logits/rejected": 1.6574833393096924, + "logps/chosen": -1.883700966835022, + "logps/rejected": -1.9155511856079102, + "loss": 1.8039417266845703, + "nll_loss": 1.73080575466156, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.18837013840675354, + "rewards/margins": 0.0031849914230406284, + "rewards/rejected": -0.19155511260032654, + "step": 720 + }, + { + "epoch": 1.168, + "grad_norm": 0.23043783009052277, + "learning_rate": 3.7048888888888885e-06, + "log_odds_chosen": -0.0075264484621584415, + "log_odds_ratio": -0.7775508761405945, + "logits/chosen": 1.742677092552185, + "logits/rejected": 1.7868130207061768, + "logps/chosen": -1.9592937231063843, + "logps/rejected": -1.9675123691558838, + "loss": 1.8238216400146485, + "nll_loss": 1.7460663318634033, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1959293633699417, + "rewards/margins": 0.000821849680505693, + "rewards/rejected": -0.196751207113266, + "step": 730 + }, + { + "epoch": 1.184, + "grad_norm": 0.18770243227481842, + "learning_rate": 3.633777777777778e-06, + "log_odds_chosen": 0.1855059713125229, + "log_odds_ratio": -0.6716901063919067, + "logits/chosen": 1.7457072734832764, + "logits/rejected": 1.7863832712173462, + "logps/chosen": -1.9091510772705078, + "logps/rejected": -2.0810976028442383, + "loss": 1.7321632385253907, + "nll_loss": 1.6649940013885498, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19091510772705078, + "rewards/margins": 0.017194656655192375, + "rewards/rejected": -0.2081097662448883, + "step": 740 + }, + { + "epoch": 1.2, + "grad_norm": 0.133562833070755, + "learning_rate": 3.562666666666667e-06, + "log_odds_chosen": 0.3153020143508911, + "log_odds_ratio": -0.6737623810768127, + "logits/chosen": 1.7664339542388916, + "logits/rejected": 1.658424973487854, + "logps/chosen": -1.9272540807724, + "logps/rejected": -2.21852707862854, + "loss": 1.769679069519043, + "nll_loss": 1.7023029327392578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19272543489933014, + "rewards/margins": 0.02912726439535618, + "rewards/rejected": -0.22185268998146057, + "step": 750 + }, + { + "epoch": 1.2, + "eval_log_odds_chosen": 0.02566058374941349, + "eval_log_odds_ratio": -0.7629475593566895, + "eval_logits/chosen": 1.6977092027664185, + "eval_logits/rejected": 1.6649681329727173, + "eval_logps/chosen": -2.014634609222412, + "eval_logps/rejected": -2.042663812637329, + "eval_loss": 1.8331661224365234, + "eval_nll_loss": 1.7568713426589966, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.2014634758234024, + "eval_rewards/margins": 0.002802920062094927, + "eval_rewards/rejected": -0.20426639914512634, + "eval_runtime": 53.583, + "eval_samples_per_second": 9.331, + "eval_steps_per_second": 4.666, + "step": 750 + }, + { + "epoch": 1.216, + "grad_norm": 0.19514085352420807, + "learning_rate": 3.4915555555555558e-06, + "log_odds_chosen": 0.07212933897972107, + "log_odds_ratio": -0.727311909198761, + "logits/chosen": 1.7513700723648071, + "logits/rejected": 1.649224877357483, + "logps/chosen": -1.9654823541641235, + "logps/rejected": -2.0274367332458496, + "loss": 1.7462617874145507, + "nll_loss": 1.6735305786132812, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1965482383966446, + "rewards/margins": 0.006195452995598316, + "rewards/rejected": -0.20274372398853302, + "step": 760 + }, + { + "epoch": 1.232, + "grad_norm": 0.16212475299835205, + "learning_rate": 3.4204444444444443e-06, + "log_odds_chosen": 0.27029961347579956, + "log_odds_ratio": -0.6815747022628784, + "logits/chosen": 1.6886869668960571, + "logits/rejected": 1.5897142887115479, + "logps/chosen": -1.9044355154037476, + "logps/rejected": -2.1495070457458496, + "loss": 1.7639043807983399, + "nll_loss": 1.6957467794418335, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19044354557991028, + "rewards/margins": 0.024507205933332443, + "rewards/rejected": -0.21495072543621063, + "step": 770 + }, + { + "epoch": 1.248, + "grad_norm": 0.1794252097606659, + "learning_rate": 3.3493333333333333e-06, + "log_odds_chosen": 0.1325821578502655, + "log_odds_ratio": -0.6765932440757751, + "logits/chosen": 1.7684406042099, + "logits/rejected": 1.7867968082427979, + "logps/chosen": -1.85635244846344, + "logps/rejected": -1.9697904586791992, + "loss": 1.7592267990112305, + "nll_loss": 1.6915674209594727, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.18563523888587952, + "rewards/margins": 0.011343811638653278, + "rewards/rejected": -0.19697906076908112, + "step": 780 + }, + { + "epoch": 1.264, + "grad_norm": 0.21208560466766357, + "learning_rate": 3.2782222222222222e-06, + "log_odds_chosen": -0.15325720608234406, + "log_odds_ratio": -0.8170074224472046, + "logits/chosen": 1.7933919429779053, + "logits/rejected": 1.777390718460083, + "logps/chosen": -2.049445390701294, + "logps/rejected": -1.9202073812484741, + "loss": 1.826498794555664, + "nll_loss": 1.7447984218597412, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.20494452118873596, + "rewards/margins": -0.012923778966069221, + "rewards/rejected": -0.1920207440853119, + "step": 790 + }, + { + "epoch": 1.28, + "grad_norm": 0.22979456186294556, + "learning_rate": 3.207111111111111e-06, + "log_odds_chosen": 0.17507728934288025, + "log_odds_ratio": -0.6654232740402222, + "logits/chosen": 1.7312673330307007, + "logits/rejected": 1.7077823877334595, + "logps/chosen": -1.9784681797027588, + "logps/rejected": -2.1287527084350586, + "loss": 1.7675779342651368, + "nll_loss": 1.7010358572006226, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19784680008888245, + "rewards/margins": 0.015028467401862144, + "rewards/rejected": -0.21287527680397034, + "step": 800 + }, + { + "epoch": 1.28, + "eval_log_odds_chosen": 0.0278985183686018, + "eval_log_odds_ratio": -0.7607023119926453, + "eval_logits/chosen": 1.7193233966827393, + "eval_logits/rejected": 1.6865739822387695, + "eval_logps/chosen": -2.012319326400757, + "eval_logps/rejected": -2.0422775745391846, + "eval_loss": 1.8310211896896362, + "eval_nll_loss": 1.7549511194229126, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.20123191177845, + "eval_rewards/margins": 0.0029958393424749374, + "eval_rewards/rejected": -0.2042277753353119, + "eval_runtime": 53.4403, + "eval_samples_per_second": 9.356, + "eval_steps_per_second": 4.678, + "step": 800 + }, + { + "epoch": 1.296, + "grad_norm": 0.19002483785152435, + "learning_rate": 3.136e-06, + "log_odds_chosen": -0.021020114421844482, + "log_odds_ratio": -0.7389410138130188, + "logits/chosen": 1.7910667657852173, + "logits/rejected": 1.743281602859497, + "logps/chosen": -2.0423073768615723, + "logps/rejected": -2.030003070831299, + "loss": 1.8281953811645508, + "nll_loss": 1.7543014287948608, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.20423074066638947, + "rewards/margins": -0.0012304515112191439, + "rewards/rejected": -0.2030002772808075, + "step": 810 + }, + { + "epoch": 1.312, + "grad_norm": 0.18885421752929688, + "learning_rate": 3.0648888888888887e-06, + "log_odds_chosen": -0.009633201174438, + "log_odds_ratio": -0.8210141062736511, + "logits/chosen": 1.713783621788025, + "logits/rejected": 1.663731575012207, + "logps/chosen": -2.0672500133514404, + "logps/rejected": -2.0374674797058105, + "loss": 1.8184293746948241, + "nll_loss": 1.736328125, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.20672500133514404, + "rewards/margins": -0.002978231757879257, + "rewards/rejected": -0.20374679565429688, + "step": 820 + }, + { + "epoch": 1.328, + "grad_norm": 0.19928835332393646, + "learning_rate": 2.9937777777777776e-06, + "log_odds_chosen": -0.09898372739553452, + "log_odds_ratio": -0.8769068717956543, + "logits/chosen": 1.627624273300171, + "logits/rejected": 1.614092469215393, + "logps/chosen": -1.9781357049942017, + "logps/rejected": -1.8761498928070068, + "loss": 1.7701539993286133, + "nll_loss": 1.6824630498886108, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.19781355559825897, + "rewards/margins": -0.010198570787906647, + "rewards/rejected": -0.18761499226093292, + "step": 830 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.19081026315689087, + "learning_rate": 2.9226666666666666e-06, + "log_odds_chosen": -0.04123927652835846, + "log_odds_ratio": -0.7816277742385864, + "logits/chosen": 1.6747424602508545, + "logits/rejected": 1.5965977907180786, + "logps/chosen": -1.9864768981933594, + "logps/rejected": -1.956075668334961, + "loss": 1.797834587097168, + "nll_loss": 1.7196719646453857, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -0.19864769279956818, + "rewards/margins": -0.0030401155818253756, + "rewards/rejected": -0.19560757279396057, + "step": 840 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.23284944891929626, + "learning_rate": 2.8515555555555555e-06, + "log_odds_chosen": 0.2058713734149933, + "log_odds_ratio": -0.6709384918212891, + "logits/chosen": 1.5520470142364502, + "logits/rejected": 1.5949593782424927, + "logps/chosen": -1.895453691482544, + "logps/rejected": -2.06461763381958, + "loss": 1.7646823883056642, + "nll_loss": 1.697588562965393, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.18954536318778992, + "rewards/margins": 0.01691642962396145, + "rewards/rejected": -0.2064618170261383, + "step": 850 + }, + { + "epoch": 1.3599999999999999, + "eval_log_odds_chosen": 0.026527805253863335, + "eval_log_odds_ratio": -0.7608199119567871, + "eval_logits/chosen": 1.6602368354797363, + "eval_logits/rejected": 1.62636137008667, + "eval_logps/chosen": -2.0066142082214355, + "eval_logps/rejected": -2.034489393234253, + "eval_loss": 1.8279491662979126, + "eval_nll_loss": 1.7518671751022339, + "eval_rewards/accuracies": 0.5379999876022339, + "eval_rewards/chosen": -0.20066142082214355, + "eval_rewards/margins": 0.002787541365250945, + "eval_rewards/rejected": -0.20344896614551544, + "eval_runtime": 53.4886, + "eval_samples_per_second": 9.348, + "eval_steps_per_second": 4.674, + "step": 850 + }, + { + "epoch": 1.376, + "grad_norm": 0.16045095026493073, + "learning_rate": 2.7804444444444445e-06, + "log_odds_chosen": 0.15138807892799377, + "log_odds_ratio": -0.6764562129974365, + "logits/chosen": 1.6566671133041382, + "logits/rejected": 1.6294691562652588, + "logps/chosen": -1.9152675867080688, + "logps/rejected": -2.047109603881836, + "loss": 1.759444808959961, + "nll_loss": 1.6917991638183594, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19152674078941345, + "rewards/margins": 0.01318420935422182, + "rewards/rejected": -0.2047109305858612, + "step": 860 + }, + { + "epoch": 1.392, + "grad_norm": 0.15155339241027832, + "learning_rate": 2.7093333333333335e-06, + "log_odds_chosen": 0.28874093294143677, + "log_odds_ratio": -0.6331272125244141, + "logits/chosen": 1.696692705154419, + "logits/rejected": 1.656974196434021, + "logps/chosen": -1.8829189538955688, + "logps/rejected": -2.1337971687316895, + "loss": 1.8225980758666993, + "nll_loss": 1.7592853307724, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18829190731048584, + "rewards/margins": 0.02508782222867012, + "rewards/rejected": -0.21337974071502686, + "step": 870 + }, + { + "epoch": 1.408, + "grad_norm": 0.23297204077243805, + "learning_rate": 2.638222222222222e-06, + "log_odds_chosen": 0.08328817784786224, + "log_odds_ratio": -0.717012882232666, + "logits/chosen": 1.7321844100952148, + "logits/rejected": 1.7046935558319092, + "logps/chosen": -1.954520583152771, + "logps/rejected": -2.024279832839966, + "loss": 1.7641315460205078, + "nll_loss": 1.6924302577972412, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.19545204937458038, + "rewards/margins": 0.006975927390158176, + "rewards/rejected": -0.20242798328399658, + "step": 880 + }, + { + "epoch": 1.424, + "grad_norm": 0.20234379172325134, + "learning_rate": 2.567111111111111e-06, + "log_odds_chosen": 0.097869873046875, + "log_odds_ratio": -0.7072083353996277, + "logits/chosen": 1.7079193592071533, + "logits/rejected": 1.625478982925415, + "logps/chosen": -1.9695608615875244, + "logps/rejected": -2.0492820739746094, + "loss": 1.783566665649414, + "nll_loss": 1.712845802307129, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1969560831785202, + "rewards/margins": 0.007972110994160175, + "rewards/rejected": -0.2049282044172287, + "step": 890 + }, + { + "epoch": 1.44, + "grad_norm": 0.1814461499452591, + "learning_rate": 2.496e-06, + "log_odds_chosen": 0.026505127549171448, + "log_odds_ratio": -0.7608965635299683, + "logits/chosen": 1.6023356914520264, + "logits/rejected": 1.6222490072250366, + "logps/chosen": -2.0157110691070557, + "logps/rejected": -2.047163486480713, + "loss": 1.7937837600708009, + "nll_loss": 1.7176940441131592, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20157113671302795, + "rewards/margins": 0.0031452514231204987, + "rewards/rejected": -0.20471635460853577, + "step": 900 + }, + { + "epoch": 1.44, + "eval_log_odds_chosen": 0.02784898318350315, + "eval_log_odds_ratio": -0.7608876824378967, + "eval_logits/chosen": 1.6986433267593384, + "eval_logits/rejected": 1.666439175605774, + "eval_logps/chosen": -2.00809645652771, + "eval_logps/rejected": -2.0373218059539795, + "eval_loss": 1.8273944854736328, + "eval_nll_loss": 1.7513054609298706, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.20080965757369995, + "eval_rewards/margins": 0.00292251817882061, + "eval_rewards/rejected": -0.2037321925163269, + "eval_runtime": 53.4308, + "eval_samples_per_second": 9.358, + "eval_steps_per_second": 4.679, + "step": 900 + }, + { + "epoch": 1.456, + "grad_norm": 0.2085212618112564, + "learning_rate": 2.424888888888889e-06, + "log_odds_chosen": 0.18774743378162384, + "log_odds_ratio": -0.6731225848197937, + "logits/chosen": 1.6927309036254883, + "logits/rejected": 1.6234180927276611, + "logps/chosen": -1.8558231592178345, + "logps/rejected": -2.020806312561035, + "loss": 1.7055980682373046, + "nll_loss": 1.6382856369018555, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18558230996131897, + "rewards/margins": 0.016498321667313576, + "rewards/rejected": -0.202080637216568, + "step": 910 + }, + { + "epoch": 1.472, + "grad_norm": 0.1574070155620575, + "learning_rate": 2.353777777777778e-06, + "log_odds_chosen": 0.09441863000392914, + "log_odds_ratio": -0.6969857215881348, + "logits/chosen": 1.7141573429107666, + "logits/rejected": 1.7830305099487305, + "logps/chosen": -1.9033101797103882, + "logps/rejected": -1.989233374595642, + "loss": 1.7647336959838866, + "nll_loss": 1.6950347423553467, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.19033101201057434, + "rewards/margins": 0.008592324331402779, + "rewards/rejected": -0.19892333447933197, + "step": 920 + }, + { + "epoch": 1.488, + "grad_norm": 0.14556527137756348, + "learning_rate": 2.2826666666666664e-06, + "log_odds_chosen": 0.10065688192844391, + "log_odds_ratio": -0.6970812082290649, + "logits/chosen": 1.7773923873901367, + "logits/rejected": 1.7257808446884155, + "logps/chosen": -1.8344297409057617, + "logps/rejected": -1.910177230834961, + "loss": 1.795981216430664, + "nll_loss": 1.7262731790542603, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18344298005104065, + "rewards/margins": 0.007574764080345631, + "rewards/rejected": -0.19101771712303162, + "step": 930 + }, + { + "epoch": 1.504, + "grad_norm": 0.13724461197853088, + "learning_rate": 2.2115555555555553e-06, + "log_odds_chosen": 0.07881642878055573, + "log_odds_ratio": -0.722042977809906, + "logits/chosen": 1.7508437633514404, + "logits/rejected": 1.724735975265503, + "logps/chosen": -1.9031156301498413, + "logps/rejected": -1.9599933624267578, + "loss": 1.7547365188598634, + "nll_loss": 1.6825320720672607, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19031158089637756, + "rewards/margins": 0.005687765311449766, + "rewards/rejected": -0.1959993541240692, + "step": 940 + }, + { + "epoch": 1.52, + "grad_norm": 0.17632044851779938, + "learning_rate": 2.1404444444444443e-06, + "log_odds_chosen": -0.024805480614304543, + "log_odds_ratio": -0.7657346129417419, + "logits/chosen": 1.7735048532485962, + "logits/rejected": 1.7464863061904907, + "logps/chosen": -1.9814984798431396, + "logps/rejected": -1.952270269393921, + "loss": 1.776956558227539, + "nll_loss": 1.700383186340332, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19814984500408173, + "rewards/margins": -0.0029228185303509235, + "rewards/rejected": -0.19522707164287567, + "step": 950 + }, + { + "epoch": 1.52, + "eval_log_odds_chosen": 0.028202777728438377, + "eval_log_odds_ratio": -0.7601404786109924, + "eval_logits/chosen": 1.724393606185913, + "eval_logits/rejected": 1.691964864730835, + "eval_logps/chosen": -2.0063939094543457, + "eval_logps/rejected": -2.036207675933838, + "eval_loss": 1.8266409635543823, + "eval_nll_loss": 1.7506269216537476, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20063939690589905, + "eval_rewards/margins": 0.002981391968205571, + "eval_rewards/rejected": -0.2036207914352417, + "eval_runtime": 53.4898, + "eval_samples_per_second": 9.348, + "eval_steps_per_second": 4.674, + "step": 950 + }, + { + "epoch": 1.536, + "grad_norm": 0.16243591904640198, + "learning_rate": 2.0693333333333332e-06, + "log_odds_chosen": 0.10965070873498917, + "log_odds_ratio": -0.6967736482620239, + "logits/chosen": 1.8899224996566772, + "logits/rejected": 1.8401854038238525, + "logps/chosen": -1.945563554763794, + "logps/rejected": -2.0543692111968994, + "loss": 1.8785415649414063, + "nll_loss": 1.8088642358779907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1945563554763794, + "rewards/margins": 0.010880568996071815, + "rewards/rejected": -0.20543691515922546, + "step": 960 + }, + { + "epoch": 1.552, + "grad_norm": 0.21627004444599152, + "learning_rate": 1.998222222222222e-06, + "log_odds_chosen": 0.0991244837641716, + "log_odds_ratio": -0.7134609222412109, + "logits/chosen": 1.6730775833129883, + "logits/rejected": 1.6934372186660767, + "logps/chosen": -1.9332492351531982, + "logps/rejected": -2.012324333190918, + "loss": 1.7858917236328125, + "nll_loss": 1.7145456075668335, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.1933249533176422, + "rewards/margins": 0.007907481864094734, + "rewards/rejected": -0.2012324333190918, + "step": 970 + }, + { + "epoch": 1.568, + "grad_norm": 0.1899784356355667, + "learning_rate": 1.927111111111111e-06, + "log_odds_chosen": 0.22156307101249695, + "log_odds_ratio": -0.6330583095550537, + "logits/chosen": 1.6686766147613525, + "logits/rejected": 1.634007453918457, + "logps/chosen": -1.867730736732483, + "logps/rejected": -2.0626068115234375, + "loss": 1.7414642333984376, + "nll_loss": 1.6781585216522217, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.18677309155464172, + "rewards/margins": 0.019487615674734116, + "rewards/rejected": -0.20626071095466614, + "step": 980 + }, + { + "epoch": 1.584, + "grad_norm": 0.17067763209342957, + "learning_rate": 1.856e-06, + "log_odds_chosen": 0.18612933158874512, + "log_odds_ratio": -0.6796912550926208, + "logits/chosen": 1.6610866785049438, + "logits/rejected": 1.5915647745132446, + "logps/chosen": -1.931335687637329, + "logps/rejected": -2.088463306427002, + "loss": 1.752705192565918, + "nll_loss": 1.684735655784607, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19313354790210724, + "rewards/margins": 0.01571280136704445, + "rewards/rejected": -0.20884636044502258, + "step": 990 + }, + { + "epoch": 1.6, + "grad_norm": 0.2076931893825531, + "learning_rate": 1.7848888888888888e-06, + "log_odds_chosen": 0.1702508181333542, + "log_odds_ratio": -0.6891772747039795, + "logits/chosen": 1.7464933395385742, + "logits/rejected": 1.6518672704696655, + "logps/chosen": -1.9614388942718506, + "logps/rejected": -2.1067023277282715, + "loss": 1.8084089279174804, + "nll_loss": 1.7394912242889404, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.19614391028881073, + "rewards/margins": 0.014526346698403358, + "rewards/rejected": -0.21067026257514954, + "step": 1000 + }, + { + "epoch": 1.6, + "eval_log_odds_chosen": 0.02751515619456768, + "eval_log_odds_ratio": -0.7607263326644897, + "eval_logits/chosen": 1.7029526233673096, + "eval_logits/rejected": 1.67202889919281, + "eval_logps/chosen": -2.0067930221557617, + "eval_logps/rejected": -2.0359673500061035, + "eval_loss": 1.8261206150054932, + "eval_nll_loss": 1.7500479221343994, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20067930221557617, + "eval_rewards/margins": 0.002917409408837557, + "eval_rewards/rejected": -0.20359672605991364, + "eval_runtime": 53.4833, + "eval_samples_per_second": 9.349, + "eval_steps_per_second": 4.674, + "step": 1000 + }, + { + "epoch": 1.616, + "grad_norm": 0.24526309967041016, + "learning_rate": 1.7137777777777778e-06, + "log_odds_chosen": 0.18274818360805511, + "log_odds_ratio": -0.6588706374168396, + "logits/chosen": 1.748281717300415, + "logits/rejected": 1.8165216445922852, + "logps/chosen": -1.9252090454101562, + "logps/rejected": -2.0696587562561035, + "loss": 1.8064483642578124, + "nll_loss": 1.7405614852905273, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19252091646194458, + "rewards/margins": 0.014444932341575623, + "rewards/rejected": -0.2069658488035202, + "step": 1010 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.21430929005146027, + "learning_rate": 1.6426666666666666e-06, + "log_odds_chosen": 0.20202656090259552, + "log_odds_ratio": -0.6916796565055847, + "logits/chosen": 1.726967215538025, + "logits/rejected": 1.7503808736801147, + "logps/chosen": -1.885287880897522, + "logps/rejected": -2.0325818061828613, + "loss": 1.7769804000854492, + "nll_loss": 1.7078125476837158, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18852879106998444, + "rewards/margins": 0.014729383401572704, + "rewards/rejected": -0.2032581865787506, + "step": 1020 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.14660713076591492, + "learning_rate": 1.5715555555555555e-06, + "log_odds_chosen": 0.08981350809335709, + "log_odds_ratio": -0.7998946309089661, + "logits/chosen": 1.7373206615447998, + "logits/rejected": 1.685080885887146, + "logps/chosen": -1.9951941967010498, + "logps/rejected": -2.0966899394989014, + "loss": 1.7756771087646483, + "nll_loss": 1.6956878900527954, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.19951942563056946, + "rewards/margins": 0.010149596258997917, + "rewards/rejected": -0.20966899394989014, + "step": 1030 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.20548580586910248, + "learning_rate": 1.5004444444444445e-06, + "log_odds_chosen": 0.2052316665649414, + "log_odds_ratio": -0.6860161423683167, + "logits/chosen": 1.640928030014038, + "logits/rejected": 1.6122572422027588, + "logps/chosen": -1.8723407983779907, + "logps/rejected": -2.0312299728393555, + "loss": 1.6772642135620117, + "nll_loss": 1.6086626052856445, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1872340738773346, + "rewards/margins": 0.01588893122971058, + "rewards/rejected": -0.20312300324440002, + "step": 1040 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.16696485877037048, + "learning_rate": 1.4293333333333332e-06, + "log_odds_chosen": 0.14438050985336304, + "log_odds_ratio": -0.7038607597351074, + "logits/chosen": 1.71317458152771, + "logits/rejected": 1.705130934715271, + "logps/chosen": -1.912302017211914, + "logps/rejected": -2.036870002746582, + "loss": 1.7682226181030274, + "nll_loss": 1.6978362798690796, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.19123020768165588, + "rewards/margins": 0.012456776574254036, + "rewards/rejected": -0.20368699729442596, + "step": 1050 + }, + { + "epoch": 1.6800000000000002, + "eval_log_odds_chosen": 0.02581177093088627, + "eval_log_odds_ratio": -0.7616178393363953, + "eval_logits/chosen": 1.7283238172531128, + "eval_logits/rejected": 1.6983083486557007, + "eval_logps/chosen": -2.000814914703369, + "eval_logps/rejected": -2.028439998626709, + "eval_loss": 1.8245528936386108, + "eval_nll_loss": 1.7483911514282227, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.2000814825296402, + "eval_rewards/margins": 0.0027625402435660362, + "eval_rewards/rejected": -0.20284400880336761, + "eval_runtime": 53.4143, + "eval_samples_per_second": 9.361, + "eval_steps_per_second": 4.68, + "step": 1050 + }, + { + "epoch": 1.696, + "grad_norm": 0.2045900672674179, + "learning_rate": 1.3582222222222222e-06, + "log_odds_chosen": -0.061510004103183746, + "log_odds_ratio": -0.8348624110221863, + "logits/chosen": 1.605478286743164, + "logits/rejected": 1.642643690109253, + "logps/chosen": -2.066878080368042, + "logps/rejected": -2.0267105102539062, + "loss": 1.7784732818603515, + "nll_loss": 1.6949872970581055, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.2066878080368042, + "rewards/margins": -0.004016753286123276, + "rewards/rejected": -0.20267105102539062, + "step": 1060 + }, + { + "epoch": 1.712, + "grad_norm": 0.20866218209266663, + "learning_rate": 1.2871111111111111e-06, + "log_odds_chosen": 0.20968547463417053, + "log_odds_ratio": -0.6722984313964844, + "logits/chosen": 1.7972224950790405, + "logits/rejected": 1.7789695262908936, + "logps/chosen": -1.88360595703125, + "logps/rejected": -2.0617847442626953, + "loss": 1.757400894165039, + "nll_loss": 1.690171241760254, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.18836061656475067, + "rewards/margins": 0.017817873507738113, + "rewards/rejected": -0.2061784714460373, + "step": 1070 + }, + { + "epoch": 1.728, + "grad_norm": 0.1793377697467804, + "learning_rate": 1.2159999999999999e-06, + "log_odds_chosen": -0.09240353852510452, + "log_odds_ratio": -0.8129439353942871, + "logits/chosen": 1.7852309942245483, + "logits/rejected": 1.7333061695098877, + "logps/chosen": -1.9978444576263428, + "logps/rejected": -1.9168767929077148, + "loss": 1.7519147872924805, + "nll_loss": 1.670620322227478, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.19978444278240204, + "rewards/margins": -0.008096768520772457, + "rewards/rejected": -0.191687673330307, + "step": 1080 + }, + { + "epoch": 1.744, + "grad_norm": 0.11997362971305847, + "learning_rate": 1.1448888888888888e-06, + "log_odds_chosen": -0.11117003858089447, + "log_odds_ratio": -0.8359676599502563, + "logits/chosen": 1.8102128505706787, + "logits/rejected": 1.8499844074249268, + "logps/chosen": -1.976915717124939, + "logps/rejected": -1.890819787979126, + "loss": 1.8335563659667968, + "nll_loss": 1.7499593496322632, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19769158959388733, + "rewards/margins": -0.008609614335000515, + "rewards/rejected": -0.18908196687698364, + "step": 1090 + }, + { + "epoch": 1.76, + "grad_norm": 0.1679936945438385, + "learning_rate": 1.0737777777777776e-06, + "log_odds_chosen": 0.17219075560569763, + "log_odds_ratio": -0.7893794775009155, + "logits/chosen": 1.6944379806518555, + "logits/rejected": 1.5971474647521973, + "logps/chosen": -1.9780410528182983, + "logps/rejected": -2.1673245429992676, + "loss": 1.7718217849731446, + "nll_loss": 1.692883849143982, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19780410826206207, + "rewards/margins": 0.01892835833132267, + "rewards/rejected": -0.216732457280159, + "step": 1100 + }, + { + "epoch": 1.76, + "eval_log_odds_chosen": 0.029876820743083954, + "eval_log_odds_ratio": -0.7605326175689697, + "eval_logits/chosen": 1.7276082038879395, + "eval_logits/rejected": 1.6985164880752563, + "eval_logps/chosen": -2.0030901432037354, + "eval_logps/rejected": -2.0341217517852783, + "eval_loss": 1.8238236904144287, + "eval_nll_loss": 1.7477705478668213, + "eval_rewards/accuracies": 0.5360000133514404, + "eval_rewards/chosen": -0.20030902326107025, + "eval_rewards/margins": 0.003103181254118681, + "eval_rewards/rejected": -0.20341220498085022, + "eval_runtime": 53.5841, + "eval_samples_per_second": 9.331, + "eval_steps_per_second": 4.666, + "step": 1100 + }, + { + "epoch": 1.776, + "grad_norm": 0.18926319479942322, + "learning_rate": 1.0026666666666665e-06, + "log_odds_chosen": 0.14324593544006348, + "log_odds_ratio": -0.6872311234474182, + "logits/chosen": 1.7348664999008179, + "logits/rejected": 1.749889612197876, + "logps/chosen": -1.9070713520050049, + "logps/rejected": -2.018203020095825, + "loss": 1.7610118865966797, + "nll_loss": 1.6922893524169922, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.19070713222026825, + "rewards/margins": 0.011113164946436882, + "rewards/rejected": -0.20182029902935028, + "step": 1110 + }, + { + "epoch": 1.792, + "grad_norm": 0.1704036146402359, + "learning_rate": 9.315555555555555e-07, + "log_odds_chosen": 0.12752032279968262, + "log_odds_ratio": -0.7044534683227539, + "logits/chosen": 1.6711995601654053, + "logits/rejected": 1.711930513381958, + "logps/chosen": -2.0130393505096436, + "logps/rejected": -2.13267183303833, + "loss": 1.7703908920288085, + "nll_loss": 1.6999456882476807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20130392909049988, + "rewards/margins": 0.011963268741965294, + "rewards/rejected": -0.21326720714569092, + "step": 1120 + }, + { + "epoch": 1.808, + "grad_norm": 0.25536438822746277, + "learning_rate": 8.604444444444445e-07, + "log_odds_chosen": 0.051788054406642914, + "log_odds_ratio": -0.7237650156021118, + "logits/chosen": 1.7273956537246704, + "logits/rejected": 1.7318992614746094, + "logps/chosen": -1.9477020502090454, + "logps/rejected": -1.9869747161865234, + "loss": 1.742520523071289, + "nll_loss": 1.6701440811157227, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.1947702020406723, + "rewards/margins": 0.003927274607121944, + "rewards/rejected": -0.19869747757911682, + "step": 1130 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.1945340931415558, + "learning_rate": 7.893333333333333e-07, + "log_odds_chosen": 0.07333675771951675, + "log_odds_ratio": -0.7535021901130676, + "logits/chosen": 1.6395387649536133, + "logits/rejected": 1.667943000793457, + "logps/chosen": -1.9120795726776123, + "logps/rejected": -1.9539562463760376, + "loss": 1.7532955169677735, + "nll_loss": 1.6779453754425049, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.19120794534683228, + "rewards/margins": 0.004187657497823238, + "rewards/rejected": -0.19539561867713928, + "step": 1140 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.1398656815290451, + "learning_rate": 7.182222222222222e-07, + "log_odds_chosen": 0.16766589879989624, + "log_odds_ratio": -0.6838506460189819, + "logits/chosen": 1.7418153285980225, + "logits/rejected": 1.7515296936035156, + "logps/chosen": -1.9456104040145874, + "logps/rejected": -2.071044445037842, + "loss": 1.7342472076416016, + "nll_loss": 1.6658618450164795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19456104934215546, + "rewards/margins": 0.01254339050501585, + "rewards/rejected": -0.20710444450378418, + "step": 1150 + }, + { + "epoch": 1.8399999999999999, + "eval_log_odds_chosen": 0.02730737067759037, + "eval_log_odds_ratio": -0.7609456181526184, + "eval_logits/chosen": 1.732469916343689, + "eval_logits/rejected": 1.703104853630066, + "eval_logps/chosen": -2.002647638320923, + "eval_logps/rejected": -2.0316905975341797, + "eval_loss": 1.8241521120071411, + "eval_nll_loss": 1.74805748462677, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20026475191116333, + "eval_rewards/margins": 0.002904308494180441, + "eval_rewards/rejected": -0.2031690627336502, + "eval_runtime": 53.4648, + "eval_samples_per_second": 9.352, + "eval_steps_per_second": 4.676, + "step": 1150 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.22400490939617157, + "learning_rate": 6.47111111111111e-07, + "log_odds_chosen": 0.1743636131286621, + "log_odds_ratio": -0.67372065782547, + "logits/chosen": 1.6396913528442383, + "logits/rejected": 1.667543649673462, + "logps/chosen": -1.782273292541504, + "logps/rejected": -1.9327503442764282, + "loss": 1.6943111419677734, + "nll_loss": 1.6269391775131226, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.17822733521461487, + "rewards/margins": 0.015047693625092506, + "rewards/rejected": -0.19327504932880402, + "step": 1160 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.18468748033046722, + "learning_rate": 5.76e-07, + "log_odds_chosen": 0.03543982282280922, + "log_odds_ratio": -0.7380915284156799, + "logits/chosen": 1.768194556236267, + "logits/rejected": 1.7483928203582764, + "logps/chosen": -1.959222435951233, + "logps/rejected": -1.9920673370361328, + "loss": 1.8366264343261718, + "nll_loss": 1.762817621231079, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.19592224061489105, + "rewards/margins": 0.0032845207024365664, + "rewards/rejected": -0.19920675456523895, + "step": 1170 + }, + { + "epoch": 1.888, + "grad_norm": 0.22529029846191406, + "learning_rate": 5.048888888888889e-07, + "log_odds_chosen": 0.05032297968864441, + "log_odds_ratio": -0.7138996720314026, + "logits/chosen": 1.744073510169983, + "logits/rejected": 1.708142638206482, + "logps/chosen": -1.8569657802581787, + "logps/rejected": -1.9047315120697021, + "loss": 1.6976106643676758, + "nll_loss": 1.626220941543579, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1856965720653534, + "rewards/margins": 0.0047765769995749, + "rewards/rejected": -0.19047315418720245, + "step": 1180 + }, + { + "epoch": 1.904, + "grad_norm": 0.15872281789779663, + "learning_rate": 4.3377777777777773e-07, + "log_odds_chosen": 0.11936229467391968, + "log_odds_ratio": -0.6942839622497559, + "logits/chosen": 1.8646312952041626, + "logits/rejected": 1.788290023803711, + "logps/chosen": -1.9186222553253174, + "logps/rejected": -2.0216126441955566, + "loss": 1.7527915954589843, + "nll_loss": 1.6833631992340088, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.19186219573020935, + "rewards/margins": 0.01029905118048191, + "rewards/rejected": -0.2021612673997879, + "step": 1190 + }, + { + "epoch": 1.92, + "grad_norm": 0.21331432461738586, + "learning_rate": 3.626666666666667e-07, + "log_odds_chosen": 0.15667389333248138, + "log_odds_ratio": -0.7080708146095276, + "logits/chosen": 1.7474346160888672, + "logits/rejected": 1.706602692604065, + "logps/chosen": -1.8474972248077393, + "logps/rejected": -1.9875065088272095, + "loss": 1.7560319900512695, + "nll_loss": 1.6852247714996338, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.18474970757961273, + "rewards/margins": 0.014000937342643738, + "rewards/rejected": -0.19875064492225647, + "step": 1200 + }, + { + "epoch": 1.92, + "eval_log_odds_chosen": 0.028188293799757957, + "eval_log_odds_ratio": -0.7610485553741455, + "eval_logits/chosen": 1.7313594818115234, + "eval_logits/rejected": 1.7020314931869507, + "eval_logps/chosen": -2.0000693798065186, + "eval_logps/rejected": -2.0295324325561523, + "eval_loss": 1.8229460716247559, + "eval_nll_loss": 1.7468411922454834, + "eval_rewards/accuracies": 0.5320000052452087, + "eval_rewards/chosen": -0.20000694692134857, + "eval_rewards/margins": 0.002946300432085991, + "eval_rewards/rejected": -0.2029532641172409, + "eval_runtime": 53.5458, + "eval_samples_per_second": 9.338, + "eval_steps_per_second": 4.669, + "step": 1200 + }, + { + "epoch": 1.936, + "grad_norm": 0.17187529802322388, + "learning_rate": 2.9155555555555554e-07, + "log_odds_chosen": 0.08315232396125793, + "log_odds_ratio": -0.7257999777793884, + "logits/chosen": 1.6482616662979126, + "logits/rejected": 1.6659479141235352, + "logps/chosen": -1.9298980236053467, + "logps/rejected": -2.002552032470703, + "loss": 1.7416587829589845, + "nll_loss": 1.669079065322876, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.192989781498909, + "rewards/margins": 0.007265416439622641, + "rewards/rejected": -0.20025520026683807, + "step": 1210 + }, + { + "epoch": 1.952, + "grad_norm": 0.20439206063747406, + "learning_rate": 2.2044444444444444e-07, + "log_odds_chosen": -0.06532023102045059, + "log_odds_ratio": -0.7724722623825073, + "logits/chosen": 1.6747251749038696, + "logits/rejected": 1.7505438327789307, + "logps/chosen": -2.0724754333496094, + "logps/rejected": -2.0182735919952393, + "loss": 1.8410331726074218, + "nll_loss": 1.7637859582901, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2072475254535675, + "rewards/margins": -0.005420188885182142, + "rewards/rejected": -0.20182733237743378, + "step": 1220 + }, + { + "epoch": 1.968, + "grad_norm": 0.19014698266983032, + "learning_rate": 1.4933333333333335e-07, + "log_odds_chosen": 0.13234707713127136, + "log_odds_ratio": -0.69910728931427, + "logits/chosen": 1.811748743057251, + "logits/rejected": 1.7067095041275024, + "logps/chosen": -1.9825388193130493, + "logps/rejected": -2.091418743133545, + "loss": 1.7848602294921876, + "nll_loss": 1.7149492502212524, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.19825391471385956, + "rewards/margins": 0.010887959972023964, + "rewards/rejected": -0.20914188027381897, + "step": 1230 + }, + { + "epoch": 1.984, + "grad_norm": 0.18438519537448883, + "learning_rate": 7.822222222222221e-08, + "log_odds_chosen": 0.07198077440261841, + "log_odds_ratio": -0.7279488444328308, + "logits/chosen": 1.6942729949951172, + "logits/rejected": 1.5974804162979126, + "logps/chosen": -1.8989187479019165, + "logps/rejected": -1.9649550914764404, + "loss": 1.7285377502441406, + "nll_loss": 1.6557426452636719, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.18989187479019165, + "rewards/margins": 0.006603637244552374, + "rewards/rejected": -0.19649553298950195, + "step": 1240 + }, + { + "epoch": 2.0, + "grad_norm": 0.21022868156433105, + "learning_rate": 7.111111111111111e-09, + "log_odds_chosen": 0.09833584725856781, + "log_odds_ratio": -0.7360068559646606, + "logits/chosen": 1.8026822805404663, + "logits/rejected": 1.7808778285980225, + "logps/chosen": -2.0214970111846924, + "logps/rejected": -2.094386339187622, + "loss": 1.8390890121459962, + "nll_loss": 1.7654882669448853, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.20214971899986267, + "rewards/margins": 0.00728892395272851, + "rewards/rejected": -0.20943865180015564, + "step": 1250 + }, + { + "epoch": 2.0, + "eval_log_odds_chosen": 0.02770400233566761, + "eval_log_odds_ratio": -0.7608067989349365, + "eval_logits/chosen": 1.7315573692321777, + "eval_logits/rejected": 1.7019984722137451, + "eval_logps/chosen": -2.0005197525024414, + "eval_logps/rejected": -2.029689311981201, + "eval_loss": 1.8231130838394165, + "eval_nll_loss": 1.7470324039459229, + "eval_rewards/accuracies": 0.5339999794960022, + "eval_rewards/chosen": -0.20005199313163757, + "eval_rewards/margins": 0.002916935132816434, + "eval_rewards/rejected": -0.20296889543533325, + "eval_runtime": 53.499, + "eval_samples_per_second": 9.346, + "eval_steps_per_second": 4.673, + "step": 1250 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-1250/training_args.bin b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a6aabf8274a2e96d42587f441ac5dfa45450316 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-1250/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e300a4b9ea9cb4eee02d826775897387bbbe1b2eb5ac963e6331fd584f8ee0 +size 5457 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/README.md b/v5/ORPO/ORPO_5k/lora/checkpoint-50/README.md new file mode 100644 index 0000000000000000000000000000000000000000..df73bb4ed6d6a8e69c79ba9761e4cdd4272cc1ab --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/README.md @@ -0,0 +1,209 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +pipeline_tag: text-generation +tags: +- base_model:adapter:meta-llama/Llama-3.2-1B-Instruct +- lora +- orpo +- transformers +- trl +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.18.1 \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/adapter_config.json b/v5/ORPO/ORPO_5k/lora/checkpoint-50/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a69561f79e9919bdd2eb3aaaca6c08223d07d2b5 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/adapter_config.json @@ -0,0 +1,46 @@ +{ + "alora_invocation_tokens": null, + "alpha_pattern": {}, + "arrow_config": null, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "corda_config": null, + "ensure_weight_tying": false, + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "peft_version": "0.18.1", + "qalora_group_size": 16, + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "down_proj", + "k_proj", + "q_proj", + "gate_proj", + "o_proj" + ], + "target_parameters": null, + "task_type": "CAUSAL_LM", + "trainable_token_indices": null, + "use_dora": false, + "use_qalora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/adapter_model.safetensors b/v5/ORPO/ORPO_5k/lora/checkpoint-50/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1be825c4bc69dd5833ac59543703bfa19280668f --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d7dde76c0a0f56e48f453104e8e0b46430c743c886fca1cff40881606bc2312 +size 180385008 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/chat_template.jinja b/v5/ORPO/ORPO_5k/lora/checkpoint-50/chat_template.jinja new file mode 100644 index 0000000000000000000000000000000000000000..1bad6a0f648dccdbec523ca79ba90fbcfc806af0 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/chat_template.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} +{%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/optimizer.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-50/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9829c7e08c96d78c9b7e1fd02bb2192e4c519f4 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a5d8d7077d3e516674c1015c3d64aec8bbe267cc0a95459111fbb7e6f2975ec +size 360902475 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/rng_state.pth b/v5/ORPO/ORPO_5k/lora/checkpoint-50/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1feba1a6538e93b94696d3773853dbc8947b0cad --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:098b29492211804ab324a36f37466821d948280bb74fce4ba895c03f13ecd878 +size 14645 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/scaler.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-50/scaler.pt new file mode 100644 index 0000000000000000000000000000000000000000..48ade11e4a25ce357344c0aca97f5e25df3bddc4 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/scaler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cd0e9d505fbc3f97feb166d29026132bdf14eb3e5c7ff77beebc303ee666f96 +size 1383 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/scheduler.pt b/v5/ORPO/ORPO_5k/lora/checkpoint-50/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..21668bf5b097d73094b0c4e54323740614440040 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6b59ce75a4e22439322f33b9671be26f1b1bc66e57f83e53d170b181c5713b +size 1465 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/tokenizer.json b/v5/ORPO/ORPO_5k/lora/checkpoint-50/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/tokenizer_config.json b/v5/ORPO/ORPO_5k/lora/checkpoint-50/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c736827e07464a3cdd67acebb3699bfd0a38d1 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/tokenizer_config.json @@ -0,0 +1,14 @@ +{ + "backend": "tokenizers", + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "is_local": false, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "TokenizersBackend" +} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/trainer_state.json b/v5/ORPO/ORPO_5k/lora/checkpoint-50/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..def09cf3ef46b0dd6b8947deb75157709fbb49b4 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/trainer_state.json @@ -0,0 +1,143 @@ +{ + "best_global_step": 50, + "best_metric": 0.5519999861717224, + "best_model_checkpoint": "output/lora/checkpoint-50", + "epoch": 0.08, + "eval_steps": 50, + "global_step": 50, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.016, + "grad_norm": 1.3608590364456177, + "learning_rate": 5.76e-07, + "log_odds_chosen": 0.0073966654017567635, + "log_odds_ratio": -0.8660133481025696, + "logits/chosen": 1.1517311334609985, + "logits/rejected": 1.1107122898101807, + "logps/chosen": -3.0449740886688232, + "logps/rejected": -3.0518546104431152, + "loss": 3.4909488677978517, + "nll_loss": 3.4043469429016113, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3044974207878113, + "rewards/margins": 0.0006880179280415177, + "rewards/rejected": -0.3051854372024536, + "step": 10 + }, + { + "epoch": 0.032, + "grad_norm": 1.8278084993362427, + "learning_rate": 1.2159999999999999e-06, + "log_odds_chosen": -0.0764567106962204, + "log_odds_ratio": -0.9281005859375, + "logits/chosen": 0.985865592956543, + "logits/rejected": 0.9893043637275696, + "logps/chosen": -3.195783853530884, + "logps/rejected": -3.128960132598877, + "loss": 3.6714431762695314, + "nll_loss": 3.5786330699920654, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.3195783793926239, + "rewards/margins": -0.006682366132736206, + "rewards/rejected": -0.3128960430622101, + "step": 20 + }, + { + "epoch": 0.048, + "grad_norm": 0.7320559024810791, + "learning_rate": 1.856e-06, + "log_odds_chosen": -0.13351905345916748, + "log_odds_ratio": -0.968097984790802, + "logits/chosen": 1.097598910331726, + "logits/rejected": 1.1367751359939575, + "logps/chosen": -3.1909520626068115, + "logps/rejected": -3.0626091957092285, + "loss": 3.345610427856445, + "nll_loss": 3.2488014698028564, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.3190951943397522, + "rewards/margins": -0.01283429004251957, + "rewards/rejected": -0.3062609136104584, + "step": 30 + }, + { + "epoch": 0.064, + "grad_norm": 0.6406434178352356, + "learning_rate": 2.496e-06, + "log_odds_chosen": 0.0689389556646347, + "log_odds_ratio": -0.7773251533508301, + "logits/chosen": 1.0645023584365845, + "logits/rejected": 1.0285594463348389, + "logps/chosen": -2.8054141998291016, + "logps/rejected": -2.8708128929138184, + "loss": 3.268035125732422, + "nll_loss": 3.190302848815918, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.28054141998291016, + "rewards/margins": 0.006539878435432911, + "rewards/rejected": -0.2870813012123108, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 0.5944439172744751, + "learning_rate": 3.136e-06, + "log_odds_chosen": -0.14803443849086761, + "log_odds_ratio": -0.9101846814155579, + "logits/chosen": 1.166550874710083, + "logits/rejected": 1.1396485567092896, + "logps/chosen": -2.988274335861206, + "logps/rejected": -2.8451037406921387, + "loss": 3.138271141052246, + "nll_loss": 3.0472521781921387, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.2988274395465851, + "rewards/margins": -0.01431706827133894, + "rewards/rejected": -0.28451037406921387, + "step": 50 + }, + { + "epoch": 0.08, + "eval_log_odds_chosen": 0.13235610723495483, + "eval_log_odds_ratio": -0.8047618269920349, + "eval_logits/chosen": 1.097177267074585, + "eval_logits/rejected": 1.080869197845459, + "eval_logps/chosen": -2.87162446975708, + "eval_logps/rejected": -3.0049262046813965, + "eval_loss": 3.0927987098693848, + "eval_nll_loss": 3.012322425842285, + "eval_rewards/accuracies": 0.5519999861717224, + "eval_rewards/chosen": -0.2871624529361725, + "eval_rewards/margins": 0.013330196961760521, + "eval_rewards/rejected": -0.30049264430999756, + "eval_runtime": 53.8284, + "eval_samples_per_second": 9.289, + "eval_steps_per_second": 4.644, + "step": 50 + } + ], + "logging_steps": 10, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/v5/ORPO/ORPO_5k/lora/checkpoint-50/training_args.bin b/v5/ORPO/ORPO_5k/lora/checkpoint-50/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6a6aabf8274a2e96d42587f441ac5dfa45450316 --- /dev/null +++ b/v5/ORPO/ORPO_5k/lora/checkpoint-50/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e300a4b9ea9cb4eee02d826775897387bbbe1b2eb5ac963e6331fd584f8ee0 +size 5457 diff --git a/v5/ORPO/gen-output/ORPO_10k/data-00000-of-00001.arrow b/v5/ORPO/gen-output/ORPO_10k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..9ba59c0de9d603f3066291a814f945a13615ca0c --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_10k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb8543b98d8e1ec611797437862d9cfce0f0d4a0304a68a90f6dae699bdd5890 +size 969248 diff --git a/v5/ORPO/gen-output/ORPO_10k/dataset_info.json b/v5/ORPO/gen-output/ORPO_10k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_10k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/ORPO/gen-output/ORPO_10k/state.json b/v5/ORPO/gen-output/ORPO_10k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d6a907dcb4f9586f5d1025ff0b36d6c9a8d937df --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_10k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "3a0e71fa44945cb1", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/ORPO/gen-output/ORPO_1k/data-00000-of-00001.arrow b/v5/ORPO/gen-output/ORPO_1k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..c00abc69330fe25d1d2af835e5db8e7dfcf8b66c --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_1k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a64f666cdce3aeae621c94dfcf516befd82a852358ff1840b64a6aa7562f9213 +size 982888 diff --git a/v5/ORPO/gen-output/ORPO_1k/dataset_info.json b/v5/ORPO/gen-output/ORPO_1k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_1k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/ORPO/gen-output/ORPO_1k/state.json b/v5/ORPO/gen-output/ORPO_1k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..c2f63d1413c408bfa78efa648e907b81f29698ab --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_1k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "47377b6135aa367a", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/v5/ORPO/gen-output/ORPO_5k/data-00000-of-00001.arrow b/v5/ORPO/gen-output/ORPO_5k/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..fd5dbba78d58f0de1f9cf92bc36fb3d61b5d5232 --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_5k/data-00000-of-00001.arrow @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8f441abfeff8a0e5d6c64e23212bcf276e86153f136a06ceb9b2b20315ad41c +size 991040 diff --git a/v5/ORPO/gen-output/ORPO_5k/dataset_info.json b/v5/ORPO/gen-output/ORPO_5k/dataset_info.json new file mode 100644 index 0000000000000000000000000000000000000000..523b881cd388b7ceb2401ad4759de06c0653cbef --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_5k/dataset_info.json @@ -0,0 +1,34 @@ +{ + "citation": "", + "description": "", + "features": { + "prompt": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + }, + "generated_text": { + "feature": { + "content": { + "dtype": "string", + "_type": "Value" + }, + "role": { + "dtype": "string", + "_type": "Value" + } + }, + "_type": "List" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/v5/ORPO/gen-output/ORPO_5k/state.json b/v5/ORPO/gen-output/ORPO_5k/state.json new file mode 100644 index 0000000000000000000000000000000000000000..d785a3e8f1761c75c46f73dc8a17502a0e58cb59 --- /dev/null +++ b/v5/ORPO/gen-output/ORPO_5k/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "8ea0921b0b03228f", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file