Raiff1982 committed 10 days ago

Commit

bb1e8ce

verified ·

1 Parent(s): 5dd00cf

Upload folder using huggingface_hub

Browse files

Files changed (40) hide show

.gitattributes +4 -0
newton/README.md +62 -0
newton/adapter_config.json +43 -0
newton/adapter_model.safetensors +3 -0
newton/chat_template.jinja +109 -0
newton/checkpoint-1000/README.md +209 -0
newton/checkpoint-1000/adapter_config.json +43 -0
newton/checkpoint-1000/adapter_model.safetensors +3 -0
newton/checkpoint-1000/chat_template.jinja +109 -0
newton/checkpoint-1000/optimizer.pt +3 -0
newton/checkpoint-1000/rng_state.pth +3 -0
newton/checkpoint-1000/scheduler.pt +3 -0
newton/checkpoint-1000/tokenizer.json +3 -0
newton/checkpoint-1000/tokenizer_config.json +14 -0
newton/checkpoint-1000/trainer_state.json +1034 -0
newton/checkpoint-1000/training_args.bin +3 -0
newton/checkpoint-1125/README.md +209 -0
newton/checkpoint-1125/adapter_config.json +43 -0
newton/checkpoint-1125/adapter_model.safetensors +3 -0
newton/checkpoint-1125/chat_template.jinja +109 -0
newton/checkpoint-1125/optimizer.pt +3 -0
newton/checkpoint-1125/rng_state.pth +3 -0
newton/checkpoint-1125/scheduler.pt +3 -0
newton/checkpoint-1125/tokenizer.json +3 -0
newton/checkpoint-1125/tokenizer_config.json +14 -0
newton/checkpoint-1125/trainer_state.json +1154 -0
newton/checkpoint-1125/training_args.bin +3 -0
newton/checkpoint-500/README.md +209 -0
newton/checkpoint-500/adapter_config.json +43 -0
newton/checkpoint-500/adapter_model.safetensors +3 -0
newton/checkpoint-500/chat_template.jinja +109 -0
newton/checkpoint-500/optimizer.pt +3 -0
newton/checkpoint-500/rng_state.pth +3 -0
newton/checkpoint-500/scheduler.pt +3 -0
newton/checkpoint-500/tokenizer.json +3 -0
newton/checkpoint-500/tokenizer_config.json +14 -0
newton/checkpoint-500/trainer_state.json +534 -0
newton/checkpoint-500/training_args.bin +3 -0
newton/tokenizer.json +3 -0
newton/tokenizer_config.json +14 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+newton/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+newton/checkpoint-1125/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+newton/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+newton/tokenizer.json filter=lfs diff=lfs merge=lfs -text

newton/README.md ADDED Viewed

	@@ -0,0 +1,62 @@

+---
+base_model: meta-llama/Llama-3.1-8B-Instruct
+library_name: peft
+model_name: newton
+tags:
+- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for newton
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="newton", device="cuda")  # local path to this adapter folder; use the Hub repo id instead when loading remotely
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.18.1
+- TRL: 0.29.0
+- Transformers: 5.3.0
+- Pytorch: 2.10.0
+- Datasets: 4.6.1
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

newton/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

newton/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:323635297b5e0c773a26c4451697f85a4ff3020e8864a138ba799a14da2627a2
+size 27297544

newton/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

newton/checkpoint-1000/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: meta-llama/Llama-3.1-8B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

newton/checkpoint-1000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

newton/checkpoint-1000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f41c14f1336f835fccc7fe9f0c53b2a0966f2388840ee6241fffd86a6a65108a
+size 27297544

newton/checkpoint-1000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

newton/checkpoint-1000/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44e023b856408604b2dac8f46a59a2c413f9c5171d8a8dd0bcb2e1266e8a17e0
+size 54745547

newton/checkpoint-1000/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58a9efb6a8371c0aa0c7c1f1395d8817f98251d4ccd6b17cd77847cecdf56a0b
+size 14645

newton/checkpoint-1000/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ede3d2a514005ed80690b07770eb75aab9fd0b335517babd631dfbc1716d09fd
+size 1465

newton/checkpoint-1000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

newton/checkpoint-1000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}

newton/checkpoint-1000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1034 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.6666666666666665,
+  "eval_steps": 500,
+  "global_step": 1000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 2.6570239067077637,
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 5.294117647058824e-05,
+      "loss": 2.800247573852539,
+      "mean_token_accuracy": 0.4749053567647934,
+      "num_tokens": 56906.0,
+      "step": 10
+    },
+    {
+      "entropy": 2.2495410323143004,
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00011176470588235294,
+      "loss": 2.4327199935913084,
+      "mean_token_accuracy": 0.5111239477992058,
+      "num_tokens": 113827.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.8682004392147065,
+      "epoch": 0.08,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00017058823529411766,
+      "loss": 1.789840316772461,
+      "mean_token_accuracy": 0.599884121119976,
+      "num_tokens": 170403.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2546741724014283,
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00019908340971585702,
+      "loss": 1.2151795387268067,
+      "mean_token_accuracy": 0.7106126025319099,
+      "num_tokens": 227456.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.8836664661765099,
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00019725022914757106,
+      "loss": 0.8311976432800293,
+      "mean_token_accuracy": 0.7977700293064117,
+      "num_tokens": 284368.0,
+      "step": 50
+    },
+    {
+      "entropy": 0.6855858579277992,
+      "epoch": 0.16,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00019541704857928507,
+      "loss": 0.6242359638214111,
+      "mean_token_accuracy": 0.847702169418335,
+      "num_tokens": 341357.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.4690785683691502,
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00019358386801099912,
+      "loss": 0.40251870155334474,
+      "mean_token_accuracy": 0.9024116918444633,
+      "num_tokens": 398280.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.34345744624733926,
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001917506874427131,
+      "loss": 0.28333656787872313,
+      "mean_token_accuracy": 0.9320006996393204,
+      "num_tokens": 455232.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.25451925955712795,
+      "epoch": 0.24,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.00018991750687442712,
+      "loss": 0.21085577011108397,
+      "mean_token_accuracy": 0.949009683728218,
+      "num_tokens": 511782.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.19814539551734925,
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00018808432630614116,
+      "loss": 0.1717105984687805,
+      "mean_token_accuracy": 0.9577329605817795,
+      "num_tokens": 568641.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.18550167009234428,
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00018625114573785518,
+      "loss": 0.15982584953308104,
+      "mean_token_accuracy": 0.9591923207044601,
+      "num_tokens": 626038.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.16009770445525645,
+      "epoch": 0.32,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00018441796516956922,
+      "loss": 0.12815338373184204,
+      "mean_token_accuracy": 0.9657398357987403,
+      "num_tokens": 682880.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.14740683771669866,
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00018258478460128323,
+      "loss": 0.1188442587852478,
+      "mean_token_accuracy": 0.9664651393890381,
+      "num_tokens": 739719.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.13307180535048246,
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.00018075160403299728,
+      "loss": 0.11054203510284424,
+      "mean_token_accuracy": 0.9669812738895416,
+      "num_tokens": 795894.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.12216594349592924,
+      "epoch": 0.4,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001789184234647113,
+      "loss": 0.10401068925857544,
+      "mean_token_accuracy": 0.9683825269341468,
+      "num_tokens": 852124.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.11619068495929241,
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001770852428964253,
+      "loss": 0.0976063370704651,
+      "mean_token_accuracy": 0.9695558726787568,
+      "num_tokens": 909328.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.10669020470231771,
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.00017525206232813932,
+      "loss": 0.09338906407356262,
+      "mean_token_accuracy": 0.970247569680214,
+      "num_tokens": 966577.0,
+      "step": 170
+    },
+    {
+      "entropy": 0.10276608634740114,
+      "epoch": 0.48,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.00017341888175985334,
+      "loss": 0.09135337471961975,
+      "mean_token_accuracy": 0.9711026951670647,
+      "num_tokens": 1022961.0,
+      "step": 180
+    },
+    {
+      "entropy": 0.10297673251479864,
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.00017158570119156738,
+      "loss": 0.08887208104133607,
+      "mean_token_accuracy": 0.9709939315915108,
+      "num_tokens": 1079479.0,
+      "step": 190
+    },
+    {
+      "entropy": 0.09722564350813627,
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001697525206232814,
+      "loss": 0.08848196864128113,
+      "mean_token_accuracy": 0.9712936446070671,
+      "num_tokens": 1135784.0,
+      "step": 200
+    },
+    {
+      "entropy": 0.09498227294534445,
+      "epoch": 0.56,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00016791934005499544,
+      "loss": 0.08531092405319214,
+      "mean_token_accuracy": 0.9717509031295777,
+      "num_tokens": 1192723.0,
+      "step": 210
+    },
+    {
+      "entropy": 0.09660841915756464,
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.00016608615948670945,
+      "loss": 0.08432384729385375,
+      "mean_token_accuracy": 0.9723995119333267,
+      "num_tokens": 1248974.0,
+      "step": 220
+    },
+    {
+      "entropy": 0.09139632768929004,
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001642529789184235,
+      "loss": 0.08340675234794617,
+      "mean_token_accuracy": 0.9725200146436691,
+      "num_tokens": 1306125.0,
+      "step": 230
+    },
+    {
+      "entropy": 0.09041857812553644,
+      "epoch": 0.64,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001624197983501375,
+      "loss": 0.08240053057670593,
+      "mean_token_accuracy": 0.9727400034666062,
+      "num_tokens": 1362509.0,
+      "step": 240
+    },
+    {
+      "entropy": 0.08917351886630058,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.00016058661778185152,
+      "loss": 0.08038315176963806,
+      "mean_token_accuracy": 0.9722966447472572,
+      "num_tokens": 1419155.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.08846015091985464,
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.00015875343721356554,
+      "loss": 0.08111950755119324,
+      "mean_token_accuracy": 0.9725704893469811,
+      "num_tokens": 1475233.0,
+      "step": 260
+    },
+    {
+      "entropy": 0.08615751322358847,
+      "epoch": 0.72,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.00015692025664527955,
+      "loss": 0.07856618165969849,
+      "mean_token_accuracy": 0.9734801158308983,
+      "num_tokens": 1531666.0,
+      "step": 270
+    },
+    {
+      "entropy": 0.08350808713585138,
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001550870760769936,
+      "loss": 0.07699183821678161,
+      "mean_token_accuracy": 0.9737285181879998,
+      "num_tokens": 1588686.0,
+      "step": 280
+    },
+    {
+      "entropy": 0.08553262427449226,
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001532538955087076,
+      "loss": 0.07849866151809692,
+      "mean_token_accuracy": 0.9727597609162331,
+      "num_tokens": 1645610.0,
+      "step": 290
+    },
+    {
+      "entropy": 0.08688175324350596,
+      "epoch": 0.8,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.00015142071494042165,
+      "loss": 0.0791881263256073,
+      "mean_token_accuracy": 0.9728336438536644,
+      "num_tokens": 1702234.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.08647099416702986,
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.00014958753437213567,
+      "loss": 0.07916317582130432,
+      "mean_token_accuracy": 0.9720797210931778,
+      "num_tokens": 1758523.0,
+      "step": 310
+    },
+    {
+      "entropy": 0.08278416823595762,
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.00014775435380384968,
+      "loss": 0.07689375281333924,
+      "mean_token_accuracy": 0.9735667318105697,
+      "num_tokens": 1815080.0,
+      "step": 320
+    },
+    {
+      "entropy": 0.08433555215597152,
+      "epoch": 0.88,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.00014592117323556373,
+      "loss": 0.07733245491981507,
+      "mean_token_accuracy": 0.973043854534626,
+      "num_tokens": 1872283.0,
+      "step": 330
+    },
+    {
+      "entropy": 0.0831523710861802,
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.00014408799266727771,
+      "loss": 0.07743646502494812,
+      "mean_token_accuracy": 0.9724773317575455,
+      "num_tokens": 1929120.0,
+      "step": 340
+    },
+    {
+      "entropy": 0.08173599634319544,
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.00014225481209899176,
+      "loss": 0.07464101910591125,
+      "mean_token_accuracy": 0.9732464775443077,
+      "num_tokens": 1986433.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.08154450561851263,
+      "epoch": 0.96,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.00014042163153070577,
+      "loss": 0.07836683988571166,
+      "mean_token_accuracy": 0.9733009964227677,
+      "num_tokens": 2043465.0,
+      "step": 360
+    },
+    {
+      "entropy": 0.08830973766744137,
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001385884509624198,
+      "loss": 0.07805899381637574,
+      "mean_token_accuracy": 0.9734541475772858,
+      "num_tokens": 2100933.0,
+      "step": 370
+    },
+    {
+      "entropy": 0.08108338043093681,
+      "epoch": 1.0133333333333334,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.00013675527039413383,
+      "loss": 0.07582586407661437,
+      "mean_token_accuracy": 0.9734946370124817,
+      "num_tokens": 2157057.0,
+      "step": 380
+    },
+    {
+      "entropy": 0.0781314555555582,
+      "epoch": 1.04,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.00013492208982584784,
+      "loss": 0.0714304804801941,
+      "mean_token_accuracy": 0.975023752450943,
+      "num_tokens": 2214085.0,
+      "step": 390
+    },
+    {
+      "entropy": 0.07955040819942952,
+      "epoch": 1.0666666666666667,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.00013308890925756189,
+      "loss": 0.07331350445747375,
+      "mean_token_accuracy": 0.9737342849373818,
+      "num_tokens": 2270765.0,
+      "step": 400
+    },
+    {
+      "entropy": 0.07677881456911564,
+      "epoch": 1.0933333333333333,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001312557286892759,
+      "loss": 0.07168130278587341,
+      "mean_token_accuracy": 0.9739445611834526,
+      "num_tokens": 2327512.0,
+      "step": 410
+    },
+    {
+      "entropy": 0.07667716387659311,
+      "epoch": 1.12,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.00012942254812098992,
+      "loss": 0.07219807505607605,
+      "mean_token_accuracy": 0.9742562755942344,
+      "num_tokens": 2384423.0,
+      "step": 420
+    },
+    {
+      "entropy": 0.07681187009438872,
+      "epoch": 1.1466666666666667,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.00012758936755270393,
+      "loss": 0.07280588746070862,
+      "mean_token_accuracy": 0.9735747814178467,
+      "num_tokens": 2441102.0,
+      "step": 430
+    },
+    {
+      "entropy": 0.07602620646357536,
+      "epoch": 1.1733333333333333,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.00012575618698441797,
+      "loss": 0.07293958067893982,
+      "mean_token_accuracy": 0.9740705206990242,
+      "num_tokens": 2497642.0,
+      "step": 440
+    },
+    {
+      "entropy": 0.07798876240849495,
+      "epoch": 1.2,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.000123923006416132,
+      "loss": 0.07215467095375061,
+      "mean_token_accuracy": 0.9742186814546585,
+      "num_tokens": 2554273.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.07671927772462368,
+      "epoch": 1.2266666666666666,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00012208982584784603,
+      "loss": 0.07254356741905213,
+      "mean_token_accuracy": 0.9733539551496506,
+      "num_tokens": 2610932.0,
+      "step": 460
+    },
+    {
+      "entropy": 0.07502734698355198,
+      "epoch": 1.2533333333333334,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00012025664527956005,
+      "loss": 0.07076438069343567,
+      "mean_token_accuracy": 0.9745794385671616,
+      "num_tokens": 2668226.0,
+      "step": 470
+    },
+    {
+      "entropy": 0.07516032289713621,
+      "epoch": 1.28,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.00011842346471127406,
+      "loss": 0.0711740493774414,
+      "mean_token_accuracy": 0.9735412746667862,
+      "num_tokens": 2725180.0,
+      "step": 480
+    },
+    {
+      "entropy": 0.07623793687671424,
+      "epoch": 1.3066666666666666,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.00011659028414298809,
+      "loss": 0.07199874520301819,
+      "mean_token_accuracy": 0.9739259093999862,
+      "num_tokens": 2782069.0,
+      "step": 490
+    },
+    {
+      "entropy": 0.07468608934432268,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001147571035747021,
+      "loss": 0.07050397992134094,
+      "mean_token_accuracy": 0.9742979735136033,
+      "num_tokens": 2838772.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.07314184289425611,
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.00011292392300641615,
+      "loss": 0.06992406845092773,
+      "mean_token_accuracy": 0.9748412847518921,
+      "num_tokens": 2896384.0,
+      "step": 510
+    },
+    {
+      "entropy": 0.07735273949801921,
+      "epoch": 1.3866666666666667,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.00011109074243813016,
+      "loss": 0.07089330554008484,
+      "mean_token_accuracy": 0.973857656121254,
+      "num_tokens": 2953074.0,
+      "step": 520
+    },
+    {
+      "entropy": 0.07427110467106104,
+      "epoch": 1.4133333333333333,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.00010925756186984419,
+      "loss": 0.07023302912712097,
+      "mean_token_accuracy": 0.9745061740279197,
+      "num_tokens": 3009599.0,
+      "step": 530
+    },
+    {
+      "entropy": 0.07496015410870313,
+      "epoch": 1.44,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001074243813015582,
+      "loss": 0.07044907808303832,
+      "mean_token_accuracy": 0.97446711063385,
+      "num_tokens": 3065550.0,
+      "step": 540
+    },
+    {
+      "entropy": 0.07237969692796468,
+      "epoch": 1.4666666666666668,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.00010559120073327222,
+      "loss": 0.06903309226036072,
+      "mean_token_accuracy": 0.9751396328210831,
+      "num_tokens": 3122339.0,
+      "step": 550
+    },
+    {
+      "entropy": 0.07292939173057675,
+      "epoch": 1.4933333333333334,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.00010375802016498626,
+      "loss": 0.06951733827590942,
+      "mean_token_accuracy": 0.9748973533511162,
+      "num_tokens": 3179284.0,
+      "step": 560
+    },
+    {
+      "entropy": 0.0735103216022253,
+      "epoch": 1.52,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.00010192483959670028,
+      "loss": 0.06886410713195801,
+      "mean_token_accuracy": 0.9742336764931678,
+      "num_tokens": 3236634.0,
+      "step": 570
+    },
+    {
+      "entropy": 0.07244595270603896,
+      "epoch": 1.5466666666666666,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001000916590284143,
+      "loss": 0.06925945878028869,
+      "mean_token_accuracy": 0.9746079474687577,
+      "num_tokens": 3293217.0,
+      "step": 580
+    },
+    {
+      "entropy": 0.0733188034966588,
+      "epoch": 1.5733333333333333,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 9.825847846012832e-05,
+      "loss": 0.06935187578201293,
+      "mean_token_accuracy": 0.9748518764972687,
+      "num_tokens": 3349872.0,
+      "step": 590
+    },
+    {
+      "entropy": 0.07255212999880314,
+      "epoch": 1.6,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 9.642529789184235e-05,
+      "loss": 0.07008358240127563,
+      "mean_token_accuracy": 0.9742572873830795,
+      "num_tokens": 3406930.0,
+      "step": 600
+    },
+    {
+      "entropy": 0.0732356732711196,
+      "epoch": 1.6266666666666667,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 9.459211732355638e-05,
+      "loss": 0.06836349368095399,
+      "mean_token_accuracy": 0.9751275479793549,
+      "num_tokens": 3464439.0,
+      "step": 610
+    },
+    {
+      "entropy": 0.07225457970052958,
+      "epoch": 1.6533333333333333,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 9.27589367552704e-05,
+      "loss": 0.06948843002319335,
+      "mean_token_accuracy": 0.9739401176571846,
+      "num_tokens": 3521325.0,
+      "step": 620
+    },
+    {
+      "entropy": 0.07250613961368799,
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 9.092575618698442e-05,
+      "loss": 0.06941892504692078,
+      "mean_token_accuracy": 0.9748956650495529,
+      "num_tokens": 3577996.0,
+      "step": 630
+    },
+    {
+      "entropy": 0.0732794025912881,
+      "epoch": 1.7066666666666666,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 8.909257561869845e-05,
+      "loss": 0.06896185874938965,
+      "mean_token_accuracy": 0.9750035509467125,
+      "num_tokens": 3634811.0,
+      "step": 640
+    },
+    {
+      "entropy": 0.07183574195951223,
+      "epoch": 1.7333333333333334,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 8.725939505041248e-05,
+      "loss": 0.0701564073562622,
+      "mean_token_accuracy": 0.9742208927869797,
+      "num_tokens": 3691017.0,
+      "step": 650
+    },
+    {
+      "entropy": 0.07327579502016306,
+      "epoch": 1.76,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 8.54262144821265e-05,
+      "loss": 0.06881371140480042,
+      "mean_token_accuracy": 0.9741959020495414,
+      "num_tokens": 3747546.0,
+      "step": 660
+    },
+    {
+      "entropy": 0.07111402666196227,
+      "epoch": 1.7866666666666666,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 8.359303391384051e-05,
+      "loss": 0.06966341137886048,
+      "mean_token_accuracy": 0.9747162073850631,
+      "num_tokens": 3804126.0,
+      "step": 670
+    },
+    {
+      "entropy": 0.07224018704146147,
+      "epoch": 1.8133333333333335,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 8.175985334555454e-05,
+      "loss": 0.06840948462486267,
+      "mean_token_accuracy": 0.9747431293129921,
+      "num_tokens": 3861006.0,
+      "step": 680
+    },
+    {
+      "entropy": 0.07255861330777406,
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 7.992667277726857e-05,
+      "loss": 0.06987766623497009,
+      "mean_token_accuracy": 0.9739771053195,
+      "num_tokens": 3916797.0,
+      "step": 690
+    },
+    {
+      "entropy": 0.07260533329099417,
+      "epoch": 1.8666666666666667,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 7.809349220898258e-05,
+      "loss": 0.06835905909538269,
+      "mean_token_accuracy": 0.9750322937965393,
+      "num_tokens": 3973197.0,
+      "step": 700
+    },
+    {
+      "entropy": 0.0710109818726778,
+      "epoch": 1.8933333333333333,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 7.626031164069661e-05,
+      "loss": 0.0677144169807434,
+      "mean_token_accuracy": 0.9751162648200988,
+      "num_tokens": 4030212.0,
+      "step": 710
+    },
+    {
+      "entropy": 0.070679662656039,
+      "epoch": 1.92,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 7.442713107241064e-05,
+      "loss": 0.0661697268486023,
+      "mean_token_accuracy": 0.9755514889955521,
+      "num_tokens": 4087699.0,
+      "step": 720
+    },
+    {
+      "entropy": 0.0694987777620554,
+      "epoch": 1.9466666666666668,
+      "grad_norm": 0.115234375,
+      "learning_rate": 7.259395050412467e-05,
+      "loss": 0.06822068691253662,
+      "mean_token_accuracy": 0.97522524446249,
+      "num_tokens": 4144740.0,
+      "step": 730
+    },
+    {
+      "entropy": 0.07208629371598363,
+      "epoch": 1.9733333333333334,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 7.076076993583868e-05,
+      "loss": 0.06933082938194275,
+      "mean_token_accuracy": 0.9743774682283401,
+      "num_tokens": 4201289.0,
+      "step": 740
+    },
+    {
+      "entropy": 0.07209395840764046,
+      "epoch": 2.0,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 6.89275893675527e-05,
+      "loss": 0.06815703511238098,
+      "mean_token_accuracy": 0.974660362303257,
+      "num_tokens": 4257958.0,
+      "step": 750
+    },
+    {
+      "entropy": 0.07068475261330605,
+      "epoch": 2.026666666666667,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 6.709440879926673e-05,
+      "loss": 0.0669311225414276,
+      "mean_token_accuracy": 0.9747605755925178,
+      "num_tokens": 4314723.0,
+      "step": 760
+    },
+    {
+      "entropy": 0.06951902080327273,
+      "epoch": 2.0533333333333332,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 6.526122823098076e-05,
+      "loss": 0.0668017327785492,
+      "mean_token_accuracy": 0.9751198858022689,
+      "num_tokens": 4371457.0,
+      "step": 770
+    },
+    {
+      "entropy": 0.07024376196786761,
+      "epoch": 2.08,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 6.342804766269478e-05,
+      "loss": 0.06699610352516175,
+      "mean_token_accuracy": 0.9748657032847404,
+      "num_tokens": 4427543.0,
+      "step": 780
+    },
+    {
+      "entropy": 0.06954137068241835,
+      "epoch": 2.1066666666666665,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 6.15948670944088e-05,
+      "loss": 0.06581668257713318,
+      "mean_token_accuracy": 0.9755794301629066,
+      "num_tokens": 4484853.0,
+      "step": 790
+    },
+    {
+      "entropy": 0.06969003304839134,
+      "epoch": 2.1333333333333333,
+      "grad_norm": 0.05859375,
+      "learning_rate": 5.976168652612283e-05,
+      "loss": 0.06605738401412964,
+      "mean_token_accuracy": 0.9751082003116608,
+      "num_tokens": 4540895.0,
+      "step": 800
+    },
+    {
+      "entropy": 0.07048749346286058,
+      "epoch": 2.16,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 5.792850595783685e-05,
+      "loss": 0.06759686470031738,
+      "mean_token_accuracy": 0.9748542428016662,
+      "num_tokens": 4597531.0,
+      "step": 810
+    },
+    {
+      "entropy": 0.0699356870725751,
+      "epoch": 2.1866666666666665,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 5.6095325389550866e-05,
+      "loss": 0.06627315282821655,
+      "mean_token_accuracy": 0.9759758025407791,
+      "num_tokens": 4654517.0,
+      "step": 820
+    },
+    {
+      "entropy": 0.06981293484568596,
+      "epoch": 2.2133333333333334,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 5.4262144821264894e-05,
+      "loss": 0.06639997959136963,
+      "mean_token_accuracy": 0.9752195671200752,
+      "num_tokens": 4711508.0,
+      "step": 830
+    },
+    {
+      "entropy": 0.06960875494405627,
+      "epoch": 2.24,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 5.2428964252978916e-05,
+      "loss": 0.06645302176475525,
+      "mean_token_accuracy": 0.9757942840456962,
+      "num_tokens": 4768589.0,
+      "step": 840
+    },
+    {
+      "entropy": 0.06928735189139842,
+      "epoch": 2.2666666666666666,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 5.0595783684692945e-05,
+      "loss": 0.06615262627601623,
+      "mean_token_accuracy": 0.975421866774559,
+      "num_tokens": 4825447.0,
+      "step": 850
+    },
+    {
+      "entropy": 0.0701323315501213,
+      "epoch": 2.2933333333333334,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 4.876260311640697e-05,
+      "loss": 0.06594157218933105,
+      "mean_token_accuracy": 0.9752340018749237,
+      "num_tokens": 4882324.0,
+      "step": 860
+    },
+    {
+      "entropy": 0.06790421595796943,
+      "epoch": 2.32,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 4.6929422548120995e-05,
+      "loss": 0.06551963090896606,
+      "mean_token_accuracy": 0.9751909494400024,
+      "num_tokens": 4939254.0,
+      "step": 870
+    },
+    {
+      "entropy": 0.07054078914225101,
+      "epoch": 2.3466666666666667,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 4.509624197983501e-05,
+      "loss": 0.06690743565559387,
+      "mean_token_accuracy": 0.9751562505960465,
+      "num_tokens": 4995524.0,
+      "step": 880
+    },
+    {
+      "entropy": 0.06957337409257888,
+      "epoch": 2.3733333333333335,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 4.326306141154904e-05,
+      "loss": 0.06609007120132446,
+      "mean_token_accuracy": 0.9754323452711106,
+      "num_tokens": 5052578.0,
+      "step": 890
+    },
+    {
+      "entropy": 0.07044977657496929,
+      "epoch": 2.4,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 4.142988084326306e-05,
+      "loss": 0.06621668338775635,
+      "mean_token_accuracy": 0.9750386416912079,
+      "num_tokens": 5108922.0,
+      "step": 900
+    },
+    {
+      "entropy": 0.06792065436020493,
+      "epoch": 2.4266666666666667,
+      "grad_norm": 0.046875,
+      "learning_rate": 3.959670027497709e-05,
+      "loss": 0.06501899361610412,
+      "mean_token_accuracy": 0.9760412231087685,
+      "num_tokens": 5166394.0,
+      "step": 910
+    },
+    {
+      "entropy": 0.06912549249827862,
+      "epoch": 2.453333333333333,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 3.776351970669111e-05,
+      "loss": 0.06575977206230163,
+      "mean_token_accuracy": 0.975604172050953,
+      "num_tokens": 5223123.0,
+      "step": 920
+    },
+    {
+      "entropy": 0.06817780192941428,
+      "epoch": 2.48,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 3.593033913840513e-05,
+      "loss": 0.06491979956626892,
+      "mean_token_accuracy": 0.9758375898003578,
+      "num_tokens": 5280867.0,
+      "step": 930
+    },
+    {
+      "entropy": 0.06880640015006065,
+      "epoch": 2.506666666666667,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 3.409715857011916e-05,
+      "loss": 0.0658724844455719,
+      "mean_token_accuracy": 0.9759016156196594,
+      "num_tokens": 5337629.0,
+      "step": 940
+    },
+    {
+      "entropy": 0.06923360927030445,
+      "epoch": 2.533333333333333,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 3.2263978001833184e-05,
+      "loss": 0.06607494950294494,
+      "mean_token_accuracy": 0.9753221690654754,
+      "num_tokens": 5394318.0,
+      "step": 950
+    },
+    {
+      "entropy": 0.06904373681172729,
+      "epoch": 2.56,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 3.0430797433547202e-05,
+      "loss": 0.06557352542877197,
+      "mean_token_accuracy": 0.9759575456380845,
+      "num_tokens": 5450413.0,
+      "step": 960
+    },
+    {
+      "entropy": 0.06914114560931921,
+      "epoch": 2.586666666666667,
+      "grad_norm": 0.046875,
+      "learning_rate": 2.8597616865261228e-05,
+      "loss": 0.06594338417053222,
+      "mean_token_accuracy": 0.9751049831509591,
+      "num_tokens": 5507306.0,
+      "step": 970
+    },
+    {
+      "entropy": 0.0688713699579239,
+      "epoch": 2.6133333333333333,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 2.6764436296975253e-05,
+      "loss": 0.06489255428314208,
+      "mean_token_accuracy": 0.9756928265094758,
+      "num_tokens": 5564241.0,
+      "step": 980
+    },
+    {
+      "entropy": 0.0688857214525342,
+      "epoch": 2.64,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 2.4931255728689275e-05,
+      "loss": 0.06557077169418335,
+      "mean_token_accuracy": 0.9758043006062508,
+      "num_tokens": 5620870.0,
+      "step": 990
+    },
+    {
+      "entropy": 0.06913622673600912,
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 2.30980751604033e-05,
+      "loss": 0.06396430134773254,
+      "mean_token_accuracy": 0.9762534514069557,
+      "num_tokens": 5677975.0,
+      "step": 1000
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.647683611123712e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

newton/checkpoint-1000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8755273dccefb3d7fa41448d64a8c28d76451700a997d4cbd5f7ac202a091f77
+size 5585

newton/checkpoint-1125/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: meta-llama/Llama-3.1-8B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for newton (checkpoint-1125)
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for a causal language model
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** meta-llama/Llama-3.1-8B-Instruct
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
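+LoRA adapter (r=16, lora_alpha=32, lora_dropout=0.05, no bias) applied to the `q_proj`, `k_proj`, `v_proj` and `o_proj` modules of meta-llama/Llama-3.1-8B-Instruct; task type `CAUSAL_LM`.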
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

newton/checkpoint-1125/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

newton/checkpoint-1125/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:323635297b5e0c773a26c4451697f85a4ff3020e8864a138ba799a14da2627a2
+size 27297544

newton/checkpoint-1125/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{#- Chat template: renders `messages` (plus optional tool definitions) into one prompt string. #}
+{#- Each turn is written as <|start_header_id|>ROLE<|end_header_id|>, two newlines, the content, then <|eot_id|> or <|eom_id|>. #}
+{#- Variables read: bos_token, messages, add_generation_prompt; optional: custom_tools, tools, builtin_tools, tools_in_user_message, date_string. #}
+{{- bos_token }}
+{#- custom_tools, when supplied, replaces tools. #}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{#- Defaults for optional inputs: tools go into the first user turn, the date is fixed to "26 Jul 2024", and tools is none. #}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{#- A leading system message is trimmed and removed from messages; otherwise system_message is the empty string. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{#- The system header is emitted unconditionally, even when system_message is empty. #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{#- The ipython environment line is written whenever any kind of tool is available. #}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{#- Builtin tools are listed comma-separated, with 'code_interpreter' filtered out of the list. #}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{#- When tools are given but tools_in_user_message is false, the tool definitions are dumped as JSON inside the system turn. #}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {#- NOTE(review): messages[0] is taken without checking its role; presumably callers always put a user turn first here — confirm. #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {#- Emit a user turn holding the tool-calling instructions, each tool as JSON, then the extracted first user message. #}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{#- Render the remaining messages in order; each one falls into exactly one of the three branches below. #}
+{%- for message in messages %}
+    {#- Branch 1: ordinary turn (role is neither ipython nor tool, and no tool_calls key): header, trimmed content, <|eot_id|>. #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {#- Branch 2: a message carrying tool_calls; exactly one call is accepted, anything else raises. #}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {#- Builtin tool: written as <|python_tag|>NAME.call(arg="value", ...) in an assistant turn. #}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {#- NOTE(review): arg_val is concatenated with strings, so non-string argument values would presumably fail to render — confirm. #}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {#- Any other tool: written as a JSON object with "name" and "parameters" in an assistant turn. #}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {#- The tool-call turn ends with <|eom_id|> when builtin_tools is defined, otherwise with <|eot_id|>. #}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {#- Branch 3: tool output (role tool or ipython) is always rendered under the ipython header. #}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {#- NOTE(review): the `iterable` test presumably also matches plain strings, in which case string content is JSON-encoded (quoted) as well — confirm. #}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{#- Optionally open an assistant turn so that generation continues as the assistant. #}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

newton/checkpoint-1125/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0613cd7692608d4f46da8fb85a3e7b638bf5c9637f9cfc5d1454e781d35e5997
+size 54745547

newton/checkpoint-1125/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c207979b56e7df5b5e151c53b37e511fa7122539c6e7e5570ee51af2a0968967
+size 14645

newton/checkpoint-1125/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb2a37c55dd5d7928c5b2c15b7d4f650fade3ddb7af6dc8961ca05874b789488
+size 1465

newton/checkpoint-1125/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

newton/checkpoint-1125/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}

newton/checkpoint-1125/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1154 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 3.0,
+  "eval_steps": 500,
+  "global_step": 1125,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 2.6570239067077637,
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 5.294117647058824e-05,
+      "loss": 2.800247573852539,
+      "mean_token_accuracy": 0.4749053567647934,
+      "num_tokens": 56906.0,
+      "step": 10
+    },
+    {
+      "entropy": 2.2495410323143004,
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00011176470588235294,
+      "loss": 2.4327199935913084,
+      "mean_token_accuracy": 0.5111239477992058,
+      "num_tokens": 113827.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.8682004392147065,
+      "epoch": 0.08,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00017058823529411766,
+      "loss": 1.789840316772461,
+      "mean_token_accuracy": 0.599884121119976,
+      "num_tokens": 170403.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2546741724014283,
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00019908340971585702,
+      "loss": 1.2151795387268067,
+      "mean_token_accuracy": 0.7106126025319099,
+      "num_tokens": 227456.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.8836664661765099,
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00019725022914757106,
+      "loss": 0.8311976432800293,
+      "mean_token_accuracy": 0.7977700293064117,
+      "num_tokens": 284368.0,
+      "step": 50
+    },
+    {
+      "entropy": 0.6855858579277992,
+      "epoch": 0.16,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00019541704857928507,
+      "loss": 0.6242359638214111,
+      "mean_token_accuracy": 0.847702169418335,
+      "num_tokens": 341357.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.4690785683691502,
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00019358386801099912,
+      "loss": 0.40251870155334474,
+      "mean_token_accuracy": 0.9024116918444633,
+      "num_tokens": 398280.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.34345744624733926,
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001917506874427131,
+      "loss": 0.28333656787872313,
+      "mean_token_accuracy": 0.9320006996393204,
+      "num_tokens": 455232.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.25451925955712795,
+      "epoch": 0.24,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.00018991750687442712,
+      "loss": 0.21085577011108397,
+      "mean_token_accuracy": 0.949009683728218,
+      "num_tokens": 511782.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.19814539551734925,
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00018808432630614116,
+      "loss": 0.1717105984687805,
+      "mean_token_accuracy": 0.9577329605817795,
+      "num_tokens": 568641.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.18550167009234428,
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00018625114573785518,
+      "loss": 0.15982584953308104,
+      "mean_token_accuracy": 0.9591923207044601,
+      "num_tokens": 626038.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.16009770445525645,
+      "epoch": 0.32,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00018441796516956922,
+      "loss": 0.12815338373184204,
+      "mean_token_accuracy": 0.9657398357987403,
+      "num_tokens": 682880.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.14740683771669866,
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00018258478460128323,
+      "loss": 0.1188442587852478,
+      "mean_token_accuracy": 0.9664651393890381,
+      "num_tokens": 739719.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.13307180535048246,
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.00018075160403299728,
+      "loss": 0.11054203510284424,
+      "mean_token_accuracy": 0.9669812738895416,
+      "num_tokens": 795894.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.12216594349592924,
+      "epoch": 0.4,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001789184234647113,
+      "loss": 0.10401068925857544,
+      "mean_token_accuracy": 0.9683825269341468,
+      "num_tokens": 852124.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.11619068495929241,
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001770852428964253,
+      "loss": 0.0976063370704651,
+      "mean_token_accuracy": 0.9695558726787568,
+      "num_tokens": 909328.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.10669020470231771,
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.00017525206232813932,
+      "loss": 0.09338906407356262,
+      "mean_token_accuracy": 0.970247569680214,
+      "num_tokens": 966577.0,
+      "step": 170
+    },
+    {
+      "entropy": 0.10276608634740114,
+      "epoch": 0.48,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.00017341888175985334,
+      "loss": 0.09135337471961975,
+      "mean_token_accuracy": 0.9711026951670647,
+      "num_tokens": 1022961.0,
+      "step": 180
+    },
+    {
+      "entropy": 0.10297673251479864,
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.00017158570119156738,
+      "loss": 0.08887208104133607,
+      "mean_token_accuracy": 0.9709939315915108,
+      "num_tokens": 1079479.0,
+      "step": 190
+    },
+    {
+      "entropy": 0.09722564350813627,
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001697525206232814,
+      "loss": 0.08848196864128113,
+      "mean_token_accuracy": 0.9712936446070671,
+      "num_tokens": 1135784.0,
+      "step": 200
+    },
+    {
+      "entropy": 0.09498227294534445,
+      "epoch": 0.56,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00016791934005499544,
+      "loss": 0.08531092405319214,
+      "mean_token_accuracy": 0.9717509031295777,
+      "num_tokens": 1192723.0,
+      "step": 210
+    },
+    {
+      "entropy": 0.09660841915756464,
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.00016608615948670945,
+      "loss": 0.08432384729385375,
+      "mean_token_accuracy": 0.9723995119333267,
+      "num_tokens": 1248974.0,
+      "step": 220
+    },
+    {
+      "entropy": 0.09139632768929004,
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001642529789184235,
+      "loss": 0.08340675234794617,
+      "mean_token_accuracy": 0.9725200146436691,
+      "num_tokens": 1306125.0,
+      "step": 230
+    },
+    {
+      "entropy": 0.09041857812553644,
+      "epoch": 0.64,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001624197983501375,
+      "loss": 0.08240053057670593,
+      "mean_token_accuracy": 0.9727400034666062,
+      "num_tokens": 1362509.0,
+      "step": 240
+    },
+    {
+      "entropy": 0.08917351886630058,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.00016058661778185152,
+      "loss": 0.08038315176963806,
+      "mean_token_accuracy": 0.9722966447472572,
+      "num_tokens": 1419155.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.08846015091985464,
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.00015875343721356554,
+      "loss": 0.08111950755119324,
+      "mean_token_accuracy": 0.9725704893469811,
+      "num_tokens": 1475233.0,
+      "step": 260
+    },
+    {
+      "entropy": 0.08615751322358847,
+      "epoch": 0.72,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.00015692025664527955,
+      "loss": 0.07856618165969849,
+      "mean_token_accuracy": 0.9734801158308983,
+      "num_tokens": 1531666.0,
+      "step": 270
+    },
+    {
+      "entropy": 0.08350808713585138,
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001550870760769936,
+      "loss": 0.07699183821678161,
+      "mean_token_accuracy": 0.9737285181879998,
+      "num_tokens": 1588686.0,
+      "step": 280
+    },
+    {
+      "entropy": 0.08553262427449226,
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001532538955087076,
+      "loss": 0.07849866151809692,
+      "mean_token_accuracy": 0.9727597609162331,
+      "num_tokens": 1645610.0,
+      "step": 290
+    },
+    {
+      "entropy": 0.08688175324350596,
+      "epoch": 0.8,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.00015142071494042165,
+      "loss": 0.0791881263256073,
+      "mean_token_accuracy": 0.9728336438536644,
+      "num_tokens": 1702234.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.08647099416702986,
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.00014958753437213567,
+      "loss": 0.07916317582130432,
+      "mean_token_accuracy": 0.9720797210931778,
+      "num_tokens": 1758523.0,
+      "step": 310
+    },
+    {
+      "entropy": 0.08278416823595762,
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.00014775435380384968,
+      "loss": 0.07689375281333924,
+      "mean_token_accuracy": 0.9735667318105697,
+      "num_tokens": 1815080.0,
+      "step": 320
+    },
+    {
+      "entropy": 0.08433555215597152,
+      "epoch": 0.88,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.00014592117323556373,
+      "loss": 0.07733245491981507,
+      "mean_token_accuracy": 0.973043854534626,
+      "num_tokens": 1872283.0,
+      "step": 330
+    },
+    {
+      "entropy": 0.0831523710861802,
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.00014408799266727771,
+      "loss": 0.07743646502494812,
+      "mean_token_accuracy": 0.9724773317575455,
+      "num_tokens": 1929120.0,
+      "step": 340
+    },
+    {
+      "entropy": 0.08173599634319544,
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.00014225481209899176,
+      "loss": 0.07464101910591125,
+      "mean_token_accuracy": 0.9732464775443077,
+      "num_tokens": 1986433.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.08154450561851263,
+      "epoch": 0.96,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.00014042163153070577,
+      "loss": 0.07836683988571166,
+      "mean_token_accuracy": 0.9733009964227677,
+      "num_tokens": 2043465.0,
+      "step": 360
+    },
+    {
+      "entropy": 0.08830973766744137,
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001385884509624198,
+      "loss": 0.07805899381637574,
+      "mean_token_accuracy": 0.9734541475772858,
+      "num_tokens": 2100933.0,
+      "step": 370
+    },
+    {
+      "entropy": 0.08108338043093681,
+      "epoch": 1.0133333333333334,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.00013675527039413383,
+      "loss": 0.07582586407661437,
+      "mean_token_accuracy": 0.9734946370124817,
+      "num_tokens": 2157057.0,
+      "step": 380
+    },
+    {
+      "entropy": 0.0781314555555582,
+      "epoch": 1.04,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.00013492208982584784,
+      "loss": 0.0714304804801941,
+      "mean_token_accuracy": 0.975023752450943,
+      "num_tokens": 2214085.0,
+      "step": 390
+    },
+    {
+      "entropy": 0.07955040819942952,
+      "epoch": 1.0666666666666667,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.00013308890925756189,
+      "loss": 0.07331350445747375,
+      "mean_token_accuracy": 0.9737342849373818,
+      "num_tokens": 2270765.0,
+      "step": 400
+    },
+    {
+      "entropy": 0.07677881456911564,
+      "epoch": 1.0933333333333333,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001312557286892759,
+      "loss": 0.07168130278587341,
+      "mean_token_accuracy": 0.9739445611834526,
+      "num_tokens": 2327512.0,
+      "step": 410
+    },
+    {
+      "entropy": 0.07667716387659311,
+      "epoch": 1.12,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.00012942254812098992,
+      "loss": 0.07219807505607605,
+      "mean_token_accuracy": 0.9742562755942344,
+      "num_tokens": 2384423.0,
+      "step": 420
+    },
+    {
+      "entropy": 0.07681187009438872,
+      "epoch": 1.1466666666666667,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.00012758936755270393,
+      "loss": 0.07280588746070862,
+      "mean_token_accuracy": 0.9735747814178467,
+      "num_tokens": 2441102.0,
+      "step": 430
+    },
+    {
+      "entropy": 0.07602620646357536,
+      "epoch": 1.1733333333333333,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.00012575618698441797,
+      "loss": 0.07293958067893982,
+      "mean_token_accuracy": 0.9740705206990242,
+      "num_tokens": 2497642.0,
+      "step": 440
+    },
+    {
+      "entropy": 0.07798876240849495,
+      "epoch": 1.2,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.000123923006416132,
+      "loss": 0.07215467095375061,
+      "mean_token_accuracy": 0.9742186814546585,
+      "num_tokens": 2554273.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.07671927772462368,
+      "epoch": 1.2266666666666666,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00012208982584784603,
+      "loss": 0.07254356741905213,
+      "mean_token_accuracy": 0.9733539551496506,
+      "num_tokens": 2610932.0,
+      "step": 460
+    },
+    {
+      "entropy": 0.07502734698355198,
+      "epoch": 1.2533333333333334,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00012025664527956005,
+      "loss": 0.07076438069343567,
+      "mean_token_accuracy": 0.9745794385671616,
+      "num_tokens": 2668226.0,
+      "step": 470
+    },
+    {
+      "entropy": 0.07516032289713621,
+      "epoch": 1.28,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.00011842346471127406,
+      "loss": 0.0711740493774414,
+      "mean_token_accuracy": 0.9735412746667862,
+      "num_tokens": 2725180.0,
+      "step": 480
+    },
+    {
+      "entropy": 0.07623793687671424,
+      "epoch": 1.3066666666666666,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.00011659028414298809,
+      "loss": 0.07199874520301819,
+      "mean_token_accuracy": 0.9739259093999862,
+      "num_tokens": 2782069.0,
+      "step": 490
+    },
+    {
+      "entropy": 0.07468608934432268,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001147571035747021,
+      "loss": 0.07050397992134094,
+      "mean_token_accuracy": 0.9742979735136033,
+      "num_tokens": 2838772.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.07314184289425611,
+      "epoch": 1.3599999999999999,
+      "grad_norm": 0.0732421875,
+      "learning_rate": 0.00011292392300641615,
+      "loss": 0.06992406845092773,
+      "mean_token_accuracy": 0.9748412847518921,
+      "num_tokens": 2896384.0,
+      "step": 510
+    },
+    {
+      "entropy": 0.07735273949801921,
+      "epoch": 1.3866666666666667,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 0.00011109074243813016,
+      "loss": 0.07089330554008484,
+      "mean_token_accuracy": 0.973857656121254,
+      "num_tokens": 2953074.0,
+      "step": 520
+    },
+    {
+      "entropy": 0.07427110467106104,
+      "epoch": 1.4133333333333333,
+      "grad_norm": 0.05615234375,
+      "learning_rate": 0.00010925756186984419,
+      "loss": 0.07023302912712097,
+      "mean_token_accuracy": 0.9745061740279197,
+      "num_tokens": 3009599.0,
+      "step": 530
+    },
+    {
+      "entropy": 0.07496015410870313,
+      "epoch": 1.44,
+      "grad_norm": 0.04150390625,
+      "learning_rate": 0.0001074243813015582,
+      "loss": 0.07044907808303832,
+      "mean_token_accuracy": 0.97446711063385,
+      "num_tokens": 3065550.0,
+      "step": 540
+    },
+    {
+      "entropy": 0.07237969692796468,
+      "epoch": 1.4666666666666668,
+      "grad_norm": 0.0537109375,
+      "learning_rate": 0.00010559120073327222,
+      "loss": 0.06903309226036072,
+      "mean_token_accuracy": 0.9751396328210831,
+      "num_tokens": 3122339.0,
+      "step": 550
+    },
+    {
+      "entropy": 0.07292939173057675,
+      "epoch": 1.4933333333333334,
+      "grad_norm": 0.044921875,
+      "learning_rate": 0.00010375802016498626,
+      "loss": 0.06951733827590942,
+      "mean_token_accuracy": 0.9748973533511162,
+      "num_tokens": 3179284.0,
+      "step": 560
+    },
+    {
+      "entropy": 0.0735103216022253,
+      "epoch": 1.52,
+      "grad_norm": 0.0595703125,
+      "learning_rate": 0.00010192483959670028,
+      "loss": 0.06886410713195801,
+      "mean_token_accuracy": 0.9742336764931678,
+      "num_tokens": 3236634.0,
+      "step": 570
+    },
+    {
+      "entropy": 0.07244595270603896,
+      "epoch": 1.5466666666666666,
+      "grad_norm": 0.049072265625,
+      "learning_rate": 0.0001000916590284143,
+      "loss": 0.06925945878028869,
+      "mean_token_accuracy": 0.9746079474687577,
+      "num_tokens": 3293217.0,
+      "step": 580
+    },
+    {
+      "entropy": 0.0733188034966588,
+      "epoch": 1.5733333333333333,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 9.825847846012832e-05,
+      "loss": 0.06935187578201293,
+      "mean_token_accuracy": 0.9748518764972687,
+      "num_tokens": 3349872.0,
+      "step": 590
+    },
+    {
+      "entropy": 0.07255212999880314,
+      "epoch": 1.6,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 9.642529789184235e-05,
+      "loss": 0.07008358240127563,
+      "mean_token_accuracy": 0.9742572873830795,
+      "num_tokens": 3406930.0,
+      "step": 600
+    },
+    {
+      "entropy": 0.0732356732711196,
+      "epoch": 1.6266666666666667,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 9.459211732355638e-05,
+      "loss": 0.06836349368095399,
+      "mean_token_accuracy": 0.9751275479793549,
+      "num_tokens": 3464439.0,
+      "step": 610
+    },
+    {
+      "entropy": 0.07225457970052958,
+      "epoch": 1.6533333333333333,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 9.27589367552704e-05,
+      "loss": 0.06948843002319335,
+      "mean_token_accuracy": 0.9739401176571846,
+      "num_tokens": 3521325.0,
+      "step": 620
+    },
+    {
+      "entropy": 0.07250613961368799,
+      "epoch": 1.6800000000000002,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 9.092575618698442e-05,
+      "loss": 0.06941892504692078,
+      "mean_token_accuracy": 0.9748956650495529,
+      "num_tokens": 3577996.0,
+      "step": 630
+    },
+    {
+      "entropy": 0.0732794025912881,
+      "epoch": 1.7066666666666666,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 8.909257561869845e-05,
+      "loss": 0.06896185874938965,
+      "mean_token_accuracy": 0.9750035509467125,
+      "num_tokens": 3634811.0,
+      "step": 640
+    },
+    {
+      "entropy": 0.07183574195951223,
+      "epoch": 1.7333333333333334,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 8.725939505041248e-05,
+      "loss": 0.0701564073562622,
+      "mean_token_accuracy": 0.9742208927869797,
+      "num_tokens": 3691017.0,
+      "step": 650
+    },
+    {
+      "entropy": 0.07327579502016306,
+      "epoch": 1.76,
+      "grad_norm": 0.07470703125,
+      "learning_rate": 8.54262144821265e-05,
+      "loss": 0.06881371140480042,
+      "mean_token_accuracy": 0.9741959020495414,
+      "num_tokens": 3747546.0,
+      "step": 660
+    },
+    {
+      "entropy": 0.07111402666196227,
+      "epoch": 1.7866666666666666,
+      "grad_norm": 0.05712890625,
+      "learning_rate": 8.359303391384051e-05,
+      "loss": 0.06966341137886048,
+      "mean_token_accuracy": 0.9747162073850631,
+      "num_tokens": 3804126.0,
+      "step": 670
+    },
+    {
+      "entropy": 0.07224018704146147,
+      "epoch": 1.8133333333333335,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 8.175985334555454e-05,
+      "loss": 0.06840948462486267,
+      "mean_token_accuracy": 0.9747431293129921,
+      "num_tokens": 3861006.0,
+      "step": 680
+    },
+    {
+      "entropy": 0.07255861330777406,
+      "epoch": 1.8399999999999999,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 7.992667277726857e-05,
+      "loss": 0.06987766623497009,
+      "mean_token_accuracy": 0.9739771053195,
+      "num_tokens": 3916797.0,
+      "step": 690
+    },
+    {
+      "entropy": 0.07260533329099417,
+      "epoch": 1.8666666666666667,
+      "grad_norm": 0.048583984375,
+      "learning_rate": 7.809349220898258e-05,
+      "loss": 0.06835905909538269,
+      "mean_token_accuracy": 0.9750322937965393,
+      "num_tokens": 3973197.0,
+      "step": 700
+    },
+    {
+      "entropy": 0.0710109818726778,
+      "epoch": 1.8933333333333333,
+      "grad_norm": 0.041748046875,
+      "learning_rate": 7.626031164069661e-05,
+      "loss": 0.0677144169807434,
+      "mean_token_accuracy": 0.9751162648200988,
+      "num_tokens": 4030212.0,
+      "step": 710
+    },
+    {
+      "entropy": 0.070679662656039,
+      "epoch": 1.92,
+      "grad_norm": 0.0458984375,
+      "learning_rate": 7.442713107241064e-05,
+      "loss": 0.0661697268486023,
+      "mean_token_accuracy": 0.9755514889955521,
+      "num_tokens": 4087699.0,
+      "step": 720
+    },
+    {
+      "entropy": 0.0694987777620554,
+      "epoch": 1.9466666666666668,
+      "grad_norm": 0.115234375,
+      "learning_rate": 7.259395050412467e-05,
+      "loss": 0.06822068691253662,
+      "mean_token_accuracy": 0.97522524446249,
+      "num_tokens": 4144740.0,
+      "step": 730
+    },
+    {
+      "entropy": 0.07208629371598363,
+      "epoch": 1.9733333333333334,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 7.076076993583868e-05,
+      "loss": 0.06933082938194275,
+      "mean_token_accuracy": 0.9743774682283401,
+      "num_tokens": 4201289.0,
+      "step": 740
+    },
+    {
+      "entropy": 0.07209395840764046,
+      "epoch": 2.0,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 6.89275893675527e-05,
+      "loss": 0.06815703511238098,
+      "mean_token_accuracy": 0.974660362303257,
+      "num_tokens": 4257958.0,
+      "step": 750
+    },
+    {
+      "entropy": 0.07068475261330605,
+      "epoch": 2.026666666666667,
+      "grad_norm": 0.042236328125,
+      "learning_rate": 6.709440879926673e-05,
+      "loss": 0.0669311225414276,
+      "mean_token_accuracy": 0.9747605755925178,
+      "num_tokens": 4314723.0,
+      "step": 760
+    },
+    {
+      "entropy": 0.06951902080327273,
+      "epoch": 2.0533333333333332,
+      "grad_norm": 0.0419921875,
+      "learning_rate": 6.526122823098076e-05,
+      "loss": 0.0668017327785492,
+      "mean_token_accuracy": 0.9751198858022689,
+      "num_tokens": 4371457.0,
+      "step": 770
+    },
+    {
+      "entropy": 0.07024376196786761,
+      "epoch": 2.08,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 6.342804766269478e-05,
+      "loss": 0.06699610352516175,
+      "mean_token_accuracy": 0.9748657032847404,
+      "num_tokens": 4427543.0,
+      "step": 780
+    },
+    {
+      "entropy": 0.06954137068241835,
+      "epoch": 2.1066666666666665,
+      "grad_norm": 0.043212890625,
+      "learning_rate": 6.15948670944088e-05,
+      "loss": 0.06581668257713318,
+      "mean_token_accuracy": 0.9755794301629066,
+      "num_tokens": 4484853.0,
+      "step": 790
+    },
+    {
+      "entropy": 0.06969003304839134,
+      "epoch": 2.1333333333333333,
+      "grad_norm": 0.05859375,
+      "learning_rate": 5.976168652612283e-05,
+      "loss": 0.06605738401412964,
+      "mean_token_accuracy": 0.9751082003116608,
+      "num_tokens": 4540895.0,
+      "step": 800
+    },
+    {
+      "entropy": 0.07048749346286058,
+      "epoch": 2.16,
+      "grad_norm": 0.04931640625,
+      "learning_rate": 5.792850595783685e-05,
+      "loss": 0.06759686470031738,
+      "mean_token_accuracy": 0.9748542428016662,
+      "num_tokens": 4597531.0,
+      "step": 810
+    },
+    {
+      "entropy": 0.0699356870725751,
+      "epoch": 2.1866666666666665,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 5.6095325389550866e-05,
+      "loss": 0.06627315282821655,
+      "mean_token_accuracy": 0.9759758025407791,
+      "num_tokens": 4654517.0,
+      "step": 820
+    },
+    {
+      "entropy": 0.06981293484568596,
+      "epoch": 2.2133333333333334,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 5.4262144821264894e-05,
+      "loss": 0.06639997959136963,
+      "mean_token_accuracy": 0.9752195671200752,
+      "num_tokens": 4711508.0,
+      "step": 830
+    },
+    {
+      "entropy": 0.06960875494405627,
+      "epoch": 2.24,
+      "grad_norm": 0.04736328125,
+      "learning_rate": 5.2428964252978916e-05,
+      "loss": 0.06645302176475525,
+      "mean_token_accuracy": 0.9757942840456962,
+      "num_tokens": 4768589.0,
+      "step": 840
+    },
+    {
+      "entropy": 0.06928735189139842,
+      "epoch": 2.2666666666666666,
+      "grad_norm": 0.06005859375,
+      "learning_rate": 5.0595783684692945e-05,
+      "loss": 0.06615262627601623,
+      "mean_token_accuracy": 0.975421866774559,
+      "num_tokens": 4825447.0,
+      "step": 850
+    },
+    {
+      "entropy": 0.0701323315501213,
+      "epoch": 2.2933333333333334,
+      "grad_norm": 0.043701171875,
+      "learning_rate": 4.876260311640697e-05,
+      "loss": 0.06594157218933105,
+      "mean_token_accuracy": 0.9752340018749237,
+      "num_tokens": 4882324.0,
+      "step": 860
+    },
+    {
+      "entropy": 0.06790421595796943,
+      "epoch": 2.32,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 4.6929422548120995e-05,
+      "loss": 0.06551963090896606,
+      "mean_token_accuracy": 0.9751909494400024,
+      "num_tokens": 4939254.0,
+      "step": 870
+    },
+    {
+      "entropy": 0.07054078914225101,
+      "epoch": 2.3466666666666667,
+      "grad_norm": 0.051025390625,
+      "learning_rate": 4.509624197983501e-05,
+      "loss": 0.06690743565559387,
+      "mean_token_accuracy": 0.9751562505960465,
+      "num_tokens": 4995524.0,
+      "step": 880
+    },
+    {
+      "entropy": 0.06957337409257888,
+      "epoch": 2.3733333333333335,
+      "grad_norm": 0.049560546875,
+      "learning_rate": 4.326306141154904e-05,
+      "loss": 0.06609007120132446,
+      "mean_token_accuracy": 0.9754323452711106,
+      "num_tokens": 5052578.0,
+      "step": 890
+    },
+    {
+      "entropy": 0.07044977657496929,
+      "epoch": 2.4,
+      "grad_norm": 0.0517578125,
+      "learning_rate": 4.142988084326306e-05,
+      "loss": 0.06621668338775635,
+      "mean_token_accuracy": 0.9750386416912079,
+      "num_tokens": 5108922.0,
+      "step": 900
+    },
+    {
+      "entropy": 0.06792065436020493,
+      "epoch": 2.4266666666666667,
+      "grad_norm": 0.046875,
+      "learning_rate": 3.959670027497709e-05,
+      "loss": 0.06501899361610412,
+      "mean_token_accuracy": 0.9760412231087685,
+      "num_tokens": 5166394.0,
+      "step": 910
+    },
+    {
+      "entropy": 0.06912549249827862,
+      "epoch": 2.453333333333333,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 3.776351970669111e-05,
+      "loss": 0.06575977206230163,
+      "mean_token_accuracy": 0.975604172050953,
+      "num_tokens": 5223123.0,
+      "step": 920
+    },
+    {
+      "entropy": 0.06817780192941428,
+      "epoch": 2.48,
+      "grad_norm": 0.0439453125,
+      "learning_rate": 3.593033913840513e-05,
+      "loss": 0.06491979956626892,
+      "mean_token_accuracy": 0.9758375898003578,
+      "num_tokens": 5280867.0,
+      "step": 930
+    },
+    {
+      "entropy": 0.06880640015006065,
+      "epoch": 2.506666666666667,
+      "grad_norm": 0.050048828125,
+      "learning_rate": 3.409715857011916e-05,
+      "loss": 0.0658724844455719,
+      "mean_token_accuracy": 0.9759016156196594,
+      "num_tokens": 5337629.0,
+      "step": 940
+    },
+    {
+      "entropy": 0.06923360927030445,
+      "epoch": 2.533333333333333,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 3.2263978001833184e-05,
+      "loss": 0.06607494950294494,
+      "mean_token_accuracy": 0.9753221690654754,
+      "num_tokens": 5394318.0,
+      "step": 950
+    },
+    {
+      "entropy": 0.06904373681172729,
+      "epoch": 2.56,
+      "grad_norm": 0.04541015625,
+      "learning_rate": 3.0430797433547202e-05,
+      "loss": 0.06557352542877197,
+      "mean_token_accuracy": 0.9759575456380845,
+      "num_tokens": 5450413.0,
+      "step": 960
+    },
+    {
+      "entropy": 0.06914114560931921,
+      "epoch": 2.586666666666667,
+      "grad_norm": 0.046875,
+      "learning_rate": 2.8597616865261228e-05,
+      "loss": 0.06594338417053222,
+      "mean_token_accuracy": 0.9751049831509591,
+      "num_tokens": 5507306.0,
+      "step": 970
+    },
+    {
+      "entropy": 0.0688713699579239,
+      "epoch": 2.6133333333333333,
+      "grad_norm": 0.052001953125,
+      "learning_rate": 2.6764436296975253e-05,
+      "loss": 0.06489255428314208,
+      "mean_token_accuracy": 0.9756928265094758,
+      "num_tokens": 5564241.0,
+      "step": 980
+    },
+    {
+      "entropy": 0.0688857214525342,
+      "epoch": 2.64,
+      "grad_norm": 0.053466796875,
+      "learning_rate": 2.4931255728689275e-05,
+      "loss": 0.06557077169418335,
+      "mean_token_accuracy": 0.9758043006062508,
+      "num_tokens": 5620870.0,
+      "step": 990
+    },
+    {
+      "entropy": 0.06913622673600912,
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.060302734375,
+      "learning_rate": 2.30980751604033e-05,
+      "loss": 0.06396430134773254,
+      "mean_token_accuracy": 0.9762534514069557,
+      "num_tokens": 5677975.0,
+      "step": 1000
+    },
+    {
+      "entropy": 0.06967059737071395,
+      "epoch": 2.6933333333333334,
+      "grad_norm": 0.0556640625,
+      "learning_rate": 2.1264894592117325e-05,
+      "loss": 0.0658549726009369,
+      "mean_token_accuracy": 0.9755063205957413,
+      "num_tokens": 5734406.0,
+      "step": 1010
+    },
+    {
+      "entropy": 0.06996878925710917,
+      "epoch": 2.7199999999999998,
+      "grad_norm": 0.047607421875,
+      "learning_rate": 1.943171402383135e-05,
+      "loss": 0.06624419689178467,
+      "mean_token_accuracy": 0.9752198234200478,
+      "num_tokens": 5790588.0,
+      "step": 1020
+    },
+    {
+      "entropy": 0.06913588438183069,
+      "epoch": 2.7466666666666666,
+      "grad_norm": 0.051513671875,
+      "learning_rate": 1.7598533455545372e-05,
+      "loss": 0.06566822528839111,
+      "mean_token_accuracy": 0.975077997148037,
+      "num_tokens": 5846871.0,
+      "step": 1030
+    },
+    {
+      "entropy": 0.07049406385049224,
+      "epoch": 2.7733333333333334,
+      "grad_norm": 0.0498046875,
+      "learning_rate": 1.5765352887259398e-05,
+      "loss": 0.06581954956054688,
+      "mean_token_accuracy": 0.9753255605697632,
+      "num_tokens": 5902888.0,
+      "step": 1040
+    },
+    {
+      "entropy": 0.06881497353315354,
+      "epoch": 2.8,
+      "grad_norm": 0.04443359375,
+      "learning_rate": 1.393217231897342e-05,
+      "loss": 0.06458759903907776,
+      "mean_token_accuracy": 0.9755938291549683,
+      "num_tokens": 5960106.0,
+      "step": 1050
+    },
+    {
+      "entropy": 0.06842826995998622,
+      "epoch": 2.8266666666666667,
+      "grad_norm": 0.046630859375,
+      "learning_rate": 1.2098991750687445e-05,
+      "loss": 0.06443418264389038,
+      "mean_token_accuracy": 0.9758713901042938,
+      "num_tokens": 6016963.0,
+      "step": 1060
+    },
+    {
+      "entropy": 0.06925875274464488,
+      "epoch": 2.8533333333333335,
+      "grad_norm": 0.05078125,
+      "learning_rate": 1.0265811182401468e-05,
+      "loss": 0.06562719345092774,
+      "mean_token_accuracy": 0.9754008457064629,
+      "num_tokens": 6073215.0,
+      "step": 1070
+    },
+    {
+      "entropy": 0.06846961556002498,
+      "epoch": 2.88,
+      "grad_norm": 0.05224609375,
+      "learning_rate": 8.43263061411549e-06,
+      "loss": 0.06463822722434998,
+      "mean_token_accuracy": 0.9759333416819572,
+      "num_tokens": 6130427.0,
+      "step": 1080
+    },
+    {
+      "entropy": 0.06969590932130813,
+      "epoch": 2.9066666666666667,
+      "grad_norm": 0.055908203125,
+      "learning_rate": 6.599450045829514e-06,
+      "loss": 0.06606504321098328,
+      "mean_token_accuracy": 0.9749638319015503,
+      "num_tokens": 6186584.0,
+      "step": 1090
+    },
+    {
+      "entropy": 0.06768293902277947,
+      "epoch": 2.9333333333333336,
+      "grad_norm": 0.0478515625,
+      "learning_rate": 4.766269477543538e-06,
+      "loss": 0.06344886422157288,
+      "mean_token_accuracy": 0.9760955572128296,
+      "num_tokens": 6244713.0,
+      "step": 1100
+    },
+    {
+      "entropy": 0.06839841092005372,
+      "epoch": 2.96,
+      "grad_norm": 0.0546875,
+      "learning_rate": 2.933088909257562e-06,
+      "loss": 0.06508639454841614,
+      "mean_token_accuracy": 0.9756930440664291,
+      "num_tokens": 6301263.0,
+      "step": 1110
+    },
+    {
+      "entropy": 0.06823750771582127,
+      "epoch": 2.986666666666667,
+      "grad_norm": 0.04833984375,
+      "learning_rate": 1.0999083409715858e-06,
+      "loss": 0.06445437669754028,
+      "mean_token_accuracy": 0.9759095475077629,
+      "num_tokens": 6358358.0,
+      "step": 1120
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.9781846035472384e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

newton/checkpoint-1125/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8755273dccefb3d7fa41448d64a8c28d76451700a997d4cbd5f7ac202a091f77
+size 5585

newton/checkpoint-500/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: meta-llama/Llama-3.1-8B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:meta-llama/Llama-3.1-8B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

newton/checkpoint-500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "o_proj",
+    "k_proj",
+    "v_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

newton/checkpoint-500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2e3fa39229c6ec9a3ae3953299dd1633da0fe90c86d1cbd81f4670401ecc4d6
+size 27297544

newton/checkpoint-500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,109 @@

+{{- bos_token }}
+{%- if custom_tools is defined %}
+    {%- set tools = custom_tools %}
+{%- endif %}
+{%- if not tools_in_user_message is defined %}
+    {%- set tools_in_user_message = true %}
+{%- endif %}
+{%- if not date_string is defined %}
+    {%- set date_string = "26 Jul 2024" %}
+{%- endif %}
+{%- if not tools is defined %}
+    {%- set tools = none %}
+{%- endif %}
+{#- This block extracts the system message, so we can slot it into the right place. #}
+{%- if messages[0]['role'] == 'system' %}
+    {%- set system_message = messages[0]['content']|trim %}
+    {%- set messages = messages[1:] %}
+{%- else %}
+    {%- set system_message = "" %}
+{%- endif %}
+{#- System message + builtin tools #}
+{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
+{%- if builtin_tools is defined or tools is not none %}
+    {{- "Environment: ipython\n" }}
+{%- endif %}
+{%- if builtin_tools is defined %}
+    {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
+{%- endif %}
+{{- "Cutting Knowledge Date: December 2023\n" }}
+{{- "Today Date: " + date_string + "\n\n" }}
+{%- if tools is not none and not tools_in_user_message %}
+    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+{%- endif %}
+{{- system_message }}
+{{- "<|eot_id|>" }}
+{#- Custom tools are passed in a user message with some extra guidance #}
+{%- if tools_in_user_message and not tools is none %}
+    {#- Extract the first user message so we can plug it in here #}
+    {%- if messages | length != 0 %}
+        {%- set first_user_message = messages[0]['content']|trim %}
+        {%- set messages = messages[1:] %}
+    {%- else %}
+        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
+{%- endif %}
+    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
+    {{- "Given the following functions, please respond with a JSON for a function call " }}
+    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
+    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
+    {{- "Do not use variables.\n\n" }}
+    {%- for t in tools %}
+        {{- t | tojson(indent=4) }}
+        {{- "\n\n" }}
+    {%- endfor %}
+    {{- first_user_message + "<|eot_id|>"}}
+{%- endif %}
+{%- for message in messages %}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
+        {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
+    {%- elif 'tool_calls' in message %}
+        {%- if not message.tool_calls|length == 1 %}
+            {{- raise_exception("This model only supports single tool-calls at once!") }}
+        {%- endif %}
+        {%- set tool_call = message.tool_calls[0].function %}
+        {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- "<|python_tag|>" + tool_call.name + ".call(" }}
+            {%- for arg_name, arg_val in tool_call.arguments | items %}
+                {{- arg_name + '="' + arg_val + '"' }}
+                {%- if not loop.last %}
+                    {{- ", " }}
+                {%- endif %}
+                {%- endfor %}
+            {{- ")" }}
+        {%- else  %}
+            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
+            {{- '{"name": "' + tool_call.name + '", ' }}
+            {{- '"parameters": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- "}" }}
+        {%- endif %}
+        {%- if builtin_tools is defined %}
+            {#- This means we're in ipython mode #}
+            {{- "<|eom_id|>" }}
+        {%- else %}
+            {{- "<|eot_id|>" }}
+        {%- endif %}
+    {%- elif message.role == "tool" or message.role == "ipython" %}
+        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
+        {%- if message.content is mapping or message.content is iterable %}
+            {{- message.content | tojson }}
+        {%- else %}
+            {{- message.content }}
+        {%- endif %}
+        {{- "<|eot_id|>" }}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
+{%- endif %}

newton/checkpoint-500/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e02efb59f2ff28dda0a4a4530d4c7f101cf666c78420719d9db581857205db1e
+size 54745547

newton/checkpoint-500/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e06ed9055f2879d20734525b54e3185ffbd4df450c6774c39d8caa49df8499ed
+size 14645

newton/checkpoint-500/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c5f2b5fa48c7f09e4487186c3527dd0cb37a3de8892b16ebca696ce3df604cb5
+size 1465

newton/checkpoint-500/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

newton/checkpoint-500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}

newton/checkpoint-500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,534 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.3333333333333333,
+  "eval_steps": 500,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 2.6570239067077637,
+      "epoch": 0.02666666666666667,
+      "grad_norm": 0.287109375,
+      "learning_rate": 5.294117647058824e-05,
+      "loss": 2.800247573852539,
+      "mean_token_accuracy": 0.4749053567647934,
+      "num_tokens": 56906.0,
+      "step": 10
+    },
+    {
+      "entropy": 2.2495410323143004,
+      "epoch": 0.05333333333333334,
+      "grad_norm": 0.265625,
+      "learning_rate": 0.00011176470588235294,
+      "loss": 2.4327199935913084,
+      "mean_token_accuracy": 0.5111239477992058,
+      "num_tokens": 113827.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.8682004392147065,
+      "epoch": 0.08,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00017058823529411766,
+      "loss": 1.789840316772461,
+      "mean_token_accuracy": 0.599884121119976,
+      "num_tokens": 170403.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2546741724014283,
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.306640625,
+      "learning_rate": 0.00019908340971585702,
+      "loss": 1.2151795387268067,
+      "mean_token_accuracy": 0.7106126025319099,
+      "num_tokens": 227456.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.8836664661765099,
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.28515625,
+      "learning_rate": 0.00019725022914757106,
+      "loss": 0.8311976432800293,
+      "mean_token_accuracy": 0.7977700293064117,
+      "num_tokens": 284368.0,
+      "step": 50
+    },
+    {
+      "entropy": 0.6855858579277992,
+      "epoch": 0.16,
+      "grad_norm": 0.314453125,
+      "learning_rate": 0.00019541704857928507,
+      "loss": 0.6242359638214111,
+      "mean_token_accuracy": 0.847702169418335,
+      "num_tokens": 341357.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.4690785683691502,
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.248046875,
+      "learning_rate": 0.00019358386801099912,
+      "loss": 0.40251870155334474,
+      "mean_token_accuracy": 0.9024116918444633,
+      "num_tokens": 398280.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.34345744624733926,
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.27734375,
+      "learning_rate": 0.0001917506874427131,
+      "loss": 0.28333656787872313,
+      "mean_token_accuracy": 0.9320006996393204,
+      "num_tokens": 455232.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.25451925955712795,
+      "epoch": 0.24,
+      "grad_norm": 0.208984375,
+      "learning_rate": 0.00018991750687442712,
+      "loss": 0.21085577011108397,
+      "mean_token_accuracy": 0.949009683728218,
+      "num_tokens": 511782.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.19814539551734925,
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.296875,
+      "learning_rate": 0.00018808432630614116,
+      "loss": 0.1717105984687805,
+      "mean_token_accuracy": 0.9577329605817795,
+      "num_tokens": 568641.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.18550167009234428,
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.21875,
+      "learning_rate": 0.00018625114573785518,
+      "loss": 0.15982584953308104,
+      "mean_token_accuracy": 0.9591923207044601,
+      "num_tokens": 626038.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.16009770445525645,
+      "epoch": 0.32,
+      "grad_norm": 0.2109375,
+      "learning_rate": 0.00018441796516956922,
+      "loss": 0.12815338373184204,
+      "mean_token_accuracy": 0.9657398357987403,
+      "num_tokens": 682880.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.14740683771669866,
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.2431640625,
+      "learning_rate": 0.00018258478460128323,
+      "loss": 0.1188442587852478,
+      "mean_token_accuracy": 0.9664651393890381,
+      "num_tokens": 739719.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.13307180535048246,
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.1474609375,
+      "learning_rate": 0.00018075160403299728,
+      "loss": 0.11054203510284424,
+      "mean_token_accuracy": 0.9669812738895416,
+      "num_tokens": 795894.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.12216594349592924,
+      "epoch": 0.4,
+      "grad_norm": 0.1240234375,
+      "learning_rate": 0.0001789184234647113,
+      "loss": 0.10401068925857544,
+      "mean_token_accuracy": 0.9683825269341468,
+      "num_tokens": 852124.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.11619068495929241,
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.12060546875,
+      "learning_rate": 0.0001770852428964253,
+      "loss": 0.0976063370704651,
+      "mean_token_accuracy": 0.9695558726787568,
+      "num_tokens": 909328.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.10669020470231771,
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.1279296875,
+      "learning_rate": 0.00017525206232813932,
+      "loss": 0.09338906407356262,
+      "mean_token_accuracy": 0.970247569680214,
+      "num_tokens": 966577.0,
+      "step": 170
+    },
+    {
+      "entropy": 0.10276608634740114,
+      "epoch": 0.48,
+      "grad_norm": 0.115234375,
+      "learning_rate": 0.00017341888175985334,
+      "loss": 0.09135337471961975,
+      "mean_token_accuracy": 0.9711026951670647,
+      "num_tokens": 1022961.0,
+      "step": 180
+    },
+    {
+      "entropy": 0.10297673251479864,
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.11474609375,
+      "learning_rate": 0.00017158570119156738,
+      "loss": 0.08887208104133607,
+      "mean_token_accuracy": 0.9709939315915108,
+      "num_tokens": 1079479.0,
+      "step": 190
+    },
+    {
+      "entropy": 0.09722564350813627,
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.1044921875,
+      "learning_rate": 0.0001697525206232814,
+      "loss": 0.08848196864128113,
+      "mean_token_accuracy": 0.9712936446070671,
+      "num_tokens": 1135784.0,
+      "step": 200
+    },
+    {
+      "entropy": 0.09498227294534445,
+      "epoch": 0.56,
+      "grad_norm": 0.2236328125,
+      "learning_rate": 0.00016791934005499544,
+      "loss": 0.08531092405319214,
+      "mean_token_accuracy": 0.9717509031295777,
+      "num_tokens": 1192723.0,
+      "step": 210
+    },
+    {
+      "entropy": 0.09660841915756464,
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.154296875,
+      "learning_rate": 0.00016608615948670945,
+      "loss": 0.08432384729385375,
+      "mean_token_accuracy": 0.9723995119333267,
+      "num_tokens": 1248974.0,
+      "step": 220
+    },
+    {
+      "entropy": 0.09139632768929004,
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.08203125,
+      "learning_rate": 0.0001642529789184235,
+      "loss": 0.08340675234794617,
+      "mean_token_accuracy": 0.9725200146436691,
+      "num_tokens": 1306125.0,
+      "step": 230
+    },
+    {
+      "entropy": 0.09041857812553644,
+      "epoch": 0.64,
+      "grad_norm": 0.0751953125,
+      "learning_rate": 0.0001624197983501375,
+      "loss": 0.08240053057670593,
+      "mean_token_accuracy": 0.9727400034666062,
+      "num_tokens": 1362509.0,
+      "step": 240
+    },
+    {
+      "entropy": 0.08917351886630058,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.11181640625,
+      "learning_rate": 0.00016058661778185152,
+      "loss": 0.08038315176963806,
+      "mean_token_accuracy": 0.9722966447472572,
+      "num_tokens": 1419155.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.08846015091985464,
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.00015875343721356554,
+      "loss": 0.08111950755119324,
+      "mean_token_accuracy": 0.9725704893469811,
+      "num_tokens": 1475233.0,
+      "step": 260
+    },
+    {
+      "entropy": 0.08615751322358847,
+      "epoch": 0.72,
+      "grad_norm": 0.103515625,
+      "learning_rate": 0.00015692025664527955,
+      "loss": 0.07856618165969849,
+      "mean_token_accuracy": 0.9734801158308983,
+      "num_tokens": 1531666.0,
+      "step": 270
+    },
+    {
+      "entropy": 0.08350808713585138,
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.0869140625,
+      "learning_rate": 0.0001550870760769936,
+      "loss": 0.07699183821678161,
+      "mean_token_accuracy": 0.9737285181879998,
+      "num_tokens": 1588686.0,
+      "step": 280
+    },
+    {
+      "entropy": 0.08553262427449226,
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.140625,
+      "learning_rate": 0.0001532538955087076,
+      "loss": 0.07849866151809692,
+      "mean_token_accuracy": 0.9727597609162331,
+      "num_tokens": 1645610.0,
+      "step": 290
+    },
+    {
+      "entropy": 0.08688175324350596,
+      "epoch": 0.8,
+      "grad_norm": 0.1318359375,
+      "learning_rate": 0.00015142071494042165,
+      "loss": 0.0791881263256073,
+      "mean_token_accuracy": 0.9728336438536644,
+      "num_tokens": 1702234.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.08647099416702986,
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.00014958753437213567,
+      "loss": 0.07916317582130432,
+      "mean_token_accuracy": 0.9720797210931778,
+      "num_tokens": 1758523.0,
+      "step": 310
+    },
+    {
+      "entropy": 0.08278416823595762,
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.076171875,
+      "learning_rate": 0.00014775435380384968,
+      "loss": 0.07689375281333924,
+      "mean_token_accuracy": 0.9735667318105697,
+      "num_tokens": 1815080.0,
+      "step": 320
+    },
+    {
+      "entropy": 0.08433555215597152,
+      "epoch": 0.88,
+      "grad_norm": 0.0888671875,
+      "learning_rate": 0.00014592117323556373,
+      "loss": 0.07733245491981507,
+      "mean_token_accuracy": 0.973043854534626,
+      "num_tokens": 1872283.0,
+      "step": 330
+    },
+    {
+      "entropy": 0.0831523710861802,
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.185546875,
+      "learning_rate": 0.00014408799266727771,
+      "loss": 0.07743646502494812,
+      "mean_token_accuracy": 0.9724773317575455,
+      "num_tokens": 1929120.0,
+      "step": 340
+    },
+    {
+      "entropy": 0.08173599634319544,
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.08447265625,
+      "learning_rate": 0.00014225481209899176,
+      "loss": 0.07464101910591125,
+      "mean_token_accuracy": 0.9732464775443077,
+      "num_tokens": 1986433.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.08154450561851263,
+      "epoch": 0.96,
+      "grad_norm": 0.197265625,
+      "learning_rate": 0.00014042163153070577,
+      "loss": 0.07836683988571166,
+      "mean_token_accuracy": 0.9733009964227677,
+      "num_tokens": 2043465.0,
+      "step": 360
+    },
+    {
+      "entropy": 0.08830973766744137,
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.0634765625,
+      "learning_rate": 0.0001385884509624198,
+      "loss": 0.07805899381637574,
+      "mean_token_accuracy": 0.9734541475772858,
+      "num_tokens": 2100933.0,
+      "step": 370
+    },
+    {
+      "entropy": 0.08108338043093681,
+      "epoch": 1.0133333333333334,
+      "grad_norm": 0.05859375,
+      "learning_rate": 0.00013675527039413383,
+      "loss": 0.07582586407661437,
+      "mean_token_accuracy": 0.9734946370124817,
+      "num_tokens": 2157057.0,
+      "step": 380
+    },
+    {
+      "entropy": 0.0781314555555582,
+      "epoch": 1.04,
+      "grad_norm": 0.05078125,
+      "learning_rate": 0.00013492208982584784,
+      "loss": 0.0714304804801941,
+      "mean_token_accuracy": 0.975023752450943,
+      "num_tokens": 2214085.0,
+      "step": 390
+    },
+    {
+      "entropy": 0.07955040819942952,
+      "epoch": 1.0666666666666667,
+      "grad_norm": 0.08984375,
+      "learning_rate": 0.00013308890925756189,
+      "loss": 0.07331350445747375,
+      "mean_token_accuracy": 0.9737342849373818,
+      "num_tokens": 2270765.0,
+      "step": 400
+    },
+    {
+      "entropy": 0.07677881456911564,
+      "epoch": 1.0933333333333333,
+      "grad_norm": 0.07177734375,
+      "learning_rate": 0.0001312557286892759,
+      "loss": 0.07168130278587341,
+      "mean_token_accuracy": 0.9739445611834526,
+      "num_tokens": 2327512.0,
+      "step": 410
+    },
+    {
+      "entropy": 0.07667716387659311,
+      "epoch": 1.12,
+      "grad_norm": 0.0771484375,
+      "learning_rate": 0.00012942254812098992,
+      "loss": 0.07219807505607605,
+      "mean_token_accuracy": 0.9742562755942344,
+      "num_tokens": 2384423.0,
+      "step": 420
+    },
+    {
+      "entropy": 0.07681187009438872,
+      "epoch": 1.1466666666666667,
+      "grad_norm": 0.0615234375,
+      "learning_rate": 0.00012758936755270393,
+      "loss": 0.07280588746070862,
+      "mean_token_accuracy": 0.9735747814178467,
+      "num_tokens": 2441102.0,
+      "step": 430
+    },
+    {
+      "entropy": 0.07602620646357536,
+      "epoch": 1.1733333333333333,
+      "grad_norm": 0.06982421875,
+      "learning_rate": 0.00012575618698441797,
+      "loss": 0.07293958067893982,
+      "mean_token_accuracy": 0.9740705206990242,
+      "num_tokens": 2497642.0,
+      "step": 440
+    },
+    {
+      "entropy": 0.07798876240849495,
+      "epoch": 1.2,
+      "grad_norm": 0.07421875,
+      "learning_rate": 0.000123923006416132,
+      "loss": 0.07215467095375061,
+      "mean_token_accuracy": 0.9742186814546585,
+      "num_tokens": 2554273.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.07671927772462368,
+      "epoch": 1.2266666666666666,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00012208982584784603,
+      "loss": 0.07254356741905213,
+      "mean_token_accuracy": 0.9733539551496506,
+      "num_tokens": 2610932.0,
+      "step": 460
+    },
+    {
+      "entropy": 0.07502734698355198,
+      "epoch": 1.2533333333333334,
+      "grad_norm": 0.05029296875,
+      "learning_rate": 0.00012025664527956005,
+      "loss": 0.07076438069343567,
+      "mean_token_accuracy": 0.9745794385671616,
+      "num_tokens": 2668226.0,
+      "step": 470
+    },
+    {
+      "entropy": 0.07516032289713621,
+      "epoch": 1.28,
+      "grad_norm": 0.045654296875,
+      "learning_rate": 0.00011842346471127406,
+      "loss": 0.0711740493774414,
+      "mean_token_accuracy": 0.9735412746667862,
+      "num_tokens": 2725180.0,
+      "step": 480
+    },
+    {
+      "entropy": 0.07623793687671424,
+      "epoch": 1.3066666666666666,
+      "grad_norm": 0.053955078125,
+      "learning_rate": 0.00011659028414298809,
+      "loss": 0.07199874520301819,
+      "mean_token_accuracy": 0.9739259093999862,
+      "num_tokens": 2782069.0,
+      "step": 490
+    },
+    {
+      "entropy": 0.07468608934432268,
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.046142578125,
+      "learning_rate": 0.0001147571035747021,
+      "loss": 0.07050397992134094,
+      "mean_token_accuracy": 0.9742979735136033,
+      "num_tokens": 2838772.0,
+      "step": 500
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1125,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3243190835068928e+17,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

newton/checkpoint-500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8755273dccefb3d7fa41448d64a8c28d76451700a997d4cbd5f7ac202a091f77
+size 5585

newton/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920

newton/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "backend": "tokenizers",
+  "bos_token": "<|begin_of_text|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|eot_id|>",
+  "is_local": false,
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 131072,
+  "pad_token": "<|eot_id|>",
+  "tokenizer_class": "TokenizersBackend"
+}