Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

.gitattributes +1 -0
README.md +202 -0
adapter_config.json +31 -0
adapter_model.safetensors +3 -0
added_tokens.json +3 -0
chat_template.jinja +143 -0
special_tokens_map.json +33 -0
tokenizer.json +3 -0
tokenizer.model +3 -0
tokenizer_config.json +0 -0
trainer_state.json +2750 -0
training_args.bin +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** google/gemma-3-4b-it
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.15.2

adapter_config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 256,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 256,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": "model.language_model.layers.[\\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj",
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:86601ca88cd2d45e93b6466e1926bc931f3bbfbb85d9af0c149e2ddc82263977
+size 953752328

added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,143 @@

+{#- Begin-of-sequence token to start the model prompt -#}
+{{ bos_token }}
+{#- Extract the system message. It is rendered further down as its own system turn; when the conversation does not start with a system message, a built-in default persona is used instead. -#}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set system_message = messages[0]['content'] -%}
+    {%- else -%}
+        {#- Non-string content: only the 'text' field of the first content item is used -#}
+        {%- set system_message = messages[0]['content'][0]['text'] -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {#- No leading system message: fall back to the default persona and keep every message in the loop -#}
+    {%- set system_message = "You are a helpful assistant Zero-Gemma made by ZeroAgency company from Russia. You must be helpful, harmless, and honest." -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{#- Optional thinking mode: when enable_thinking is passed and true, append an instruction asking for reasoning inside think tags -#}
+{%- if enable_thinking is defined and enable_thinking is true -%}
+    {%- set system_message = system_message + "\nFirst, think through the reasoning internally, then present the reasoning within <think>...</think>. After thinking, clearly state a response that addresses the user's request and aligns with their preferences, not just providing a direct answer." -%}
+{%- endif -%}
+{#- Always terminate the system text with a blank line (it precedes the optional tool section or the end-of-turn marker) -#}
+{%- set system_message = system_message + '\n\n' -%}
+{#- Set tools to none if not defined for this ChatCompletion request (helps avoid errors later) -#}
+{%- if not tools is defined -%}
+    {%- set tools = none -%}
+{%- endif -%}
+{#- Emit the system turn: header, system text, optional tool instructions, then the end-of-turn marker -#}
+{{ '<start_of_turn>system\n' -}}
+{{ system_message }}
+{#- Append system message with tool information if using tools in message request. -#}
+{%- if tools is not none -%}
+    {{- "Tools (functions) are available. If you decide to invoke one or more of the tools, you must respond with a python list of the function calls.\n" -}}
+    {{- "Example Format: [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] \n" -}}
+    {{- "Do not use variables. DO NOT USE MARKDOWN SYNTAX. You SHOULD NOT include any other text in the response if you call a function. If none of the functions can be used, point it out. If you lack the parameters required by the function, also point it out.\n" -}}
+    {{- "Here is a list of functions in JSON format that you can invoke.\n" -}}
+    {#- The tool schemas are dumped verbatim as pretty-printed JSON -#}
+    {{- tools | tojson(indent=4) -}}
+    {{- "\n\n" -}}
+{%- endif -%}
+{{ '<end_of_turn>\n' }}
+{#- Main loop over all messages in the conversation history. Each message becomes one turn: start marker with role, optional tool-call list, optional content, end marker. -#}
+{%- for message in loop_messages if message['role'] != 'system' -%}
+    {#- Normalize roles for model prompt formatting: assistant -> model, tool -> user, anything else unchanged -#}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- elif (message['role'] == 'tool') -%}
+        {%- set role = "user" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {#- Mark the start of a message block with the appropriate role -#}
+    {{ '<start_of_turn>' + role + '\n' -}}
+    {#- Format model tool calls (turns where model indicates they want to call a tool) -#}
+    {%- if 'tool_calls' in message -%}
+        {#- Any string content (e.g. reasoning text) is emitted before the call list -#}
+        {%- if message['content'] is string -%}
+            {%- set content = message['content'] -%}
+            {# {%- if '</think>' in content -%} #}
+            {{- content | trim -}}
+            {# {{- "\n" -}} #}
+            {# {%- endif -%} #}
+        {%- endif -%}
+        {#- Opening bracket for tool call list. -#}
+        {{- '[' -}}
+        {#- For each tool call -#}
+        {%- for tool_call in message.tool_calls -%}
+            {#- Unwrap the OpenAI-style wrapper when the call is nested under a function key -#}
+            {%- if tool_call.function is defined -%}
+                {%- set tool_call = tool_call.function -%}
+            {%- endif -%}
+            {#- Function name & opening parenthesis. -#}
+            {{- tool_call.name + '(' -}}
+            {#-- Handle arguments as list (positional) or dict (named) --#}
+            {#-- Named arguments (dict) --#}
+            {%- if tool_call.arguments is iterable and tool_call.arguments is mapping -%}
+                {#- Separate arguments with loop.first: a flag assigned inside the loop body would not persist to the next iteration, so the comma would never be emitted -#}
+                {%- for key, val in tool_call.arguments.items() -%}
+                    {%- if not loop.first %}, {% endif -%}
+                    {{ key }}={{ val | tojson }}
+                {%- endfor -%}
+            {#-- Positional arguments (list) --#}
+            {%- elif tool_call.arguments is iterable -%}
+                {{- tool_call.arguments | map('tojson') | join(', ') -}}
+            {#-- Fallback: single positional value --#}
+            {%- else -%}
+                {{- tool_call.arguments | tojson -}}
+            {%- endif -%}
+            {#-- Closing parenthesis. --#}
+            {{- ')' -}}
+            {#-- If more than one tool call, place comma and move to formatting next tool call --#}
+            {%- if not loop.last -%}{{- "," -}}{%- endif -%}
+        {%- endfor -%}
+        {#- Closing bracket for tool call list. -#}
+        {{- ']' -}}
+    {%- endif -%}
+    {#- Tool response start tag (for messages from a tool) -#}
+    {%- if (message['role'] == 'tool') -%}
+        {{- '<tool_response>\n' -}}
+    {%- endif -%}
+    {#- Render the message content: handle plain string or multimodal content like image/text. Skipped for tool-call turns, whose content was already emitted above. -#}
+    {%- if not 'tool_calls' in message and message['content'] -%}
+        {%- if message['content'] is string -%}
+            {%- set content = message['content'] -%}
+            {# {%- if '</think>' in content -%}
+                {%- set content = content.split('</think>')[-1] -%}
+            {%- endif -%} #}
+            {{- content | trim -}}
+        {%- elif message['content'] is iterable -%}
+            {%- for item in message['content'] -%}
+                {%- if item['type'] == 'image' -%}
+                    {{ '<start_of_image>' }}
+                {%- elif item['type'] == 'text' -%}
+                    {%- set content = item['text'] -%}
+                    {# {%- if '</think>' in content -%}
+                        {%- set content = content.split('</think>')[-1] -%}
+                    {%- endif -%} #}
+                    {{ content | trim }}
+                {%- endif -%}
+            {%- endfor -%}
+        {%- else -%}
+            {{ raise_exception("Invalid content type:"+ message|tojson) }}
+        {%- endif -%}
+    {%- endif -%}
+    {#- Tool response end tag -#}
+    {%- if (message['role'] == 'tool') -%}
+        {{ '</tool_response>' -}}
+    {%- endif -%}
+    {#- Mark end of a single turn -#}
+    {{ '<end_of_turn>\n' }}
+{%- endfor -%}
+{#- If generation is to be triggered, add model prompt prefix -#}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model\n'}}
+    {#- Thinking mode: the prompt ends with an already opened think tag, so the model's output continues from inside it -#}
+    {%- if enable_thinking is defined and enable_thinking is true -%}
+        {{- '<think>' -}}
+    {%- endif %}
+{%- endif -%}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2750 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 388,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.005154639175257732,
+      "grad_norm": 190.93727111816406,
+      "learning_rate": 0.0,
+      "loss": 6.2913,
+      "step": 1
+    },
+    {
+      "epoch": 0.010309278350515464,
+      "grad_norm": 173.73265075683594,
+      "learning_rate": 5.263157894736842e-06,
+      "loss": 6.2366,
+      "step": 2
+    },
+    {
+      "epoch": 0.015463917525773196,
+      "grad_norm": 58.1086540222168,
+      "learning_rate": 1.0526315789473684e-05,
+      "loss": 5.7487,
+      "step": 3
+    },
+    {
+      "epoch": 0.020618556701030927,
+      "grad_norm": 45.20127868652344,
+      "learning_rate": 1.5789473684210526e-05,
+      "loss": 5.6542,
+      "step": 4
+    },
+    {
+      "epoch": 0.02577319587628866,
+      "grad_norm": 38.75779342651367,
+      "learning_rate": 2.105263157894737e-05,
+      "loss": 5.3452,
+      "step": 5
+    },
+    {
+      "epoch": 0.030927835051546393,
+      "grad_norm": 16.875654220581055,
+      "learning_rate": 2.6315789473684212e-05,
+      "loss": 5.0154,
+      "step": 6
+    },
+    {
+      "epoch": 0.03608247422680412,
+      "grad_norm": 5.762566089630127,
+      "learning_rate": 3.157894736842105e-05,
+      "loss": 4.6468,
+      "step": 7
+    },
+    {
+      "epoch": 0.041237113402061855,
+      "grad_norm": 4.387909889221191,
+      "learning_rate": 3.6842105263157895e-05,
+      "loss": 4.46,
+      "step": 8
+    },
+    {
+      "epoch": 0.04639175257731959,
+      "grad_norm": 3.514525890350342,
+      "learning_rate": 4.210526315789474e-05,
+      "loss": 4.2009,
+      "step": 9
+    },
+    {
+      "epoch": 0.05154639175257732,
+      "grad_norm": 2.510279893875122,
+      "learning_rate": 4.736842105263158e-05,
+      "loss": 3.9583,
+      "step": 10
+    },
+    {
+      "epoch": 0.05670103092783505,
+      "grad_norm": 2.3860387802124023,
+      "learning_rate": 5.2631578947368424e-05,
+      "loss": 3.7731,
+      "step": 11
+    },
+    {
+      "epoch": 0.061855670103092786,
+      "grad_norm": 2.4361982345581055,
+      "learning_rate": 5.789473684210527e-05,
+      "loss": 3.564,
+      "step": 12
+    },
+    {
+      "epoch": 0.06701030927835051,
+      "grad_norm": 1.8688565492630005,
+      "learning_rate": 6.31578947368421e-05,
+      "loss": 3.5308,
+      "step": 13
+    },
+    {
+      "epoch": 0.07216494845360824,
+      "grad_norm": 1.5935324430465698,
+      "learning_rate": 6.842105263157895e-05,
+      "loss": 3.1773,
+      "step": 14
+    },
+    {
+      "epoch": 0.07731958762886598,
+      "grad_norm": 1.6549628973007202,
+      "learning_rate": 7.368421052631579e-05,
+      "loss": 2.9485,
+      "step": 15
+    },
+    {
+      "epoch": 0.08247422680412371,
+      "grad_norm": 1.222391963005066,
+      "learning_rate": 7.894736842105263e-05,
+      "loss": 2.8407,
+      "step": 16
+    },
+    {
+      "epoch": 0.08762886597938144,
+      "grad_norm": 1.015263557434082,
+      "learning_rate": 8.421052631578948e-05,
+      "loss": 2.5357,
+      "step": 17
+    },
+    {
+      "epoch": 0.09278350515463918,
+      "grad_norm": 0.8914713263511658,
+      "learning_rate": 8.947368421052632e-05,
+      "loss": 2.4483,
+      "step": 18
+    },
+    {
+      "epoch": 0.0979381443298969,
+      "grad_norm": 0.7467647790908813,
+      "learning_rate": 9.473684210526316e-05,
+      "loss": 2.2122,
+      "step": 19
+    },
+    {
+      "epoch": 0.10309278350515463,
+      "grad_norm": 0.7337709069252014,
+      "learning_rate": 0.0001,
+      "loss": 1.9914,
+      "step": 20
+    },
+    {
+      "epoch": 0.10824742268041238,
+      "grad_norm": 0.773763120174408,
+      "learning_rate": 0.00010526315789473685,
+      "loss": 1.855,
+      "step": 21
+    },
+    {
+      "epoch": 0.1134020618556701,
+      "grad_norm": 0.8684746026992798,
+      "learning_rate": 0.0001105263157894737,
+      "loss": 1.7021,
+      "step": 22
+    },
+    {
+      "epoch": 0.11855670103092783,
+      "grad_norm": 0.9705705046653748,
+      "learning_rate": 0.00011578947368421053,
+      "loss": 1.4578,
+      "step": 23
+    },
+    {
+      "epoch": 0.12371134020618557,
+      "grad_norm": 0.8127485513687134,
+      "learning_rate": 0.00012105263157894738,
+      "loss": 1.306,
+      "step": 24
+    },
+    {
+      "epoch": 0.12886597938144329,
+      "grad_norm": 0.6740350127220154,
+      "learning_rate": 0.0001263157894736842,
+      "loss": 1.1731,
+      "step": 25
+    },
+    {
+      "epoch": 0.13402061855670103,
+      "grad_norm": 0.5295709371566772,
+      "learning_rate": 0.00013157894736842108,
+      "loss": 1.0209,
+      "step": 26
+    },
+    {
+      "epoch": 0.13917525773195877,
+      "grad_norm": 0.48113131523132324,
+      "learning_rate": 0.0001368421052631579,
+      "loss": 0.9791,
+      "step": 27
+    },
+    {
+      "epoch": 0.14432989690721648,
+      "grad_norm": 0.4143475294113159,
+      "learning_rate": 0.00014210526315789474,
+      "loss": 0.8705,
+      "step": 28
+    },
+    {
+      "epoch": 0.14948453608247422,
+      "grad_norm": 0.4144071340560913,
+      "learning_rate": 0.00014736842105263158,
+      "loss": 0.8208,
+      "step": 29
+    },
+    {
+      "epoch": 0.15463917525773196,
+      "grad_norm": 0.3122115731239319,
+      "learning_rate": 0.00015263157894736845,
+      "loss": 0.746,
+      "step": 30
+    },
+    {
+      "epoch": 0.15979381443298968,
+      "grad_norm": 0.25521549582481384,
+      "learning_rate": 0.00015789473684210527,
+      "loss": 0.7197,
+      "step": 31
+    },
+    {
+      "epoch": 0.16494845360824742,
+      "grad_norm": 0.24646416306495667,
+      "learning_rate": 0.0001631578947368421,
+      "loss": 0.6847,
+      "step": 32
+    },
+    {
+      "epoch": 0.17010309278350516,
+      "grad_norm": 0.20585910975933075,
+      "learning_rate": 0.00016842105263157895,
+      "loss": 0.6784,
+      "step": 33
+    },
+    {
+      "epoch": 0.17525773195876287,
+      "grad_norm": 0.18354211747646332,
+      "learning_rate": 0.0001736842105263158,
+      "loss": 0.6381,
+      "step": 34
+    },
+    {
+      "epoch": 0.18041237113402062,
+      "grad_norm": 0.17553451657295227,
+      "learning_rate": 0.00017894736842105264,
+      "loss": 0.635,
+      "step": 35
+    },
+    {
+      "epoch": 0.18556701030927836,
+      "grad_norm": 0.17485342919826508,
+      "learning_rate": 0.00018421052631578948,
+      "loss": 0.6112,
+      "step": 36
+    },
+    {
+      "epoch": 0.19072164948453607,
+      "grad_norm": 0.15421968698501587,
+      "learning_rate": 0.00018947368421052632,
+      "loss": 0.603,
+      "step": 37
+    },
+    {
+      "epoch": 0.1958762886597938,
+      "grad_norm": 0.12738950550556183,
+      "learning_rate": 0.00019473684210526317,
+      "loss": 0.5835,
+      "step": 38
+    },
+    {
+      "epoch": 0.20103092783505155,
+      "grad_norm": 0.13533693552017212,
+      "learning_rate": 0.0002,
+      "loss": 0.5914,
+      "step": 39
+    },
+    {
+      "epoch": 0.20618556701030927,
+      "grad_norm": 0.1037764698266983,
+      "learning_rate": 0.00019999597161708712,
+      "loss": 0.5766,
+      "step": 40
+    },
+    {
+      "epoch": 0.211340206185567,
+      "grad_norm": 0.10902086645364761,
+      "learning_rate": 0.00019998388679290583,
+      "loss": 0.5688,
+      "step": 41
+    },
+    {
+      "epoch": 0.21649484536082475,
+      "grad_norm": 0.09495263546705246,
+      "learning_rate": 0.0001999637465011021,
+      "loss": 0.5633,
+      "step": 42
+    },
+    {
+      "epoch": 0.22164948453608246,
+      "grad_norm": 0.08923530578613281,
+      "learning_rate": 0.00019993555236433213,
+      "loss": 0.5439,
+      "step": 43
+    },
+    {
+      "epoch": 0.2268041237113402,
+      "grad_norm": 0.08847405016422272,
+      "learning_rate": 0.00019989930665413147,
+      "loss": 0.543,
+      "step": 44
+    },
+    {
+      "epoch": 0.23195876288659795,
+      "grad_norm": 0.08645694702863693,
+      "learning_rate": 0.0001998550122907321,
+      "loss": 0.5584,
+      "step": 45
+    },
+    {
+      "epoch": 0.23711340206185566,
+      "grad_norm": 0.08545728027820587,
+      "learning_rate": 0.00019980267284282717,
+      "loss": 0.5298,
+      "step": 46
+    },
+    {
+      "epoch": 0.2422680412371134,
+      "grad_norm": 0.08298264443874359,
+      "learning_rate": 0.00019974229252728342,
+      "loss": 0.5225,
+      "step": 47
+    },
+    {
+      "epoch": 0.24742268041237114,
+      "grad_norm": 0.07304146885871887,
+      "learning_rate": 0.00019967387620880146,
+      "loss": 0.5096,
+      "step": 48
+    },
+    {
+      "epoch": 0.25257731958762886,
+      "grad_norm": 0.07074376940727234,
+      "learning_rate": 0.00019959742939952392,
+      "loss": 0.5196,
+      "step": 49
+    },
+    {
+      "epoch": 0.25773195876288657,
+      "grad_norm": 0.07222142070531845,
+      "learning_rate": 0.00019951295825859113,
+      "loss": 0.5275,
+      "step": 50
+    },
+    {
+      "epoch": 0.26288659793814434,
+      "grad_norm": 0.0744776651263237,
+      "learning_rate": 0.00019942046959164515,
+      "loss": 0.5164,
+      "step": 51
+    },
+    {
+      "epoch": 0.26804123711340205,
+      "grad_norm": 0.0768490582704544,
+      "learning_rate": 0.00019931997085028132,
+      "loss": 0.5147,
+      "step": 52
+    },
+    {
+      "epoch": 0.27319587628865977,
+      "grad_norm": 0.07787171751260757,
+      "learning_rate": 0.0001992114701314478,
+      "loss": 0.4927,
+      "step": 53
+    },
+    {
+      "epoch": 0.27835051546391754,
+      "grad_norm": 0.07264231145381927,
+      "learning_rate": 0.00019909497617679348,
+      "loss": 0.5044,
+      "step": 54
+    },
+    {
+      "epoch": 0.28350515463917525,
+      "grad_norm": 0.06552039086818695,
+      "learning_rate": 0.0001989704983719635,
+      "loss": 0.5114,
+      "step": 55
+    },
+    {
+      "epoch": 0.28865979381443296,
+      "grad_norm": 0.06201677396893501,
+      "learning_rate": 0.0001988380467458431,
+      "loss": 0.4876,
+      "step": 56
+    },
+    {
+      "epoch": 0.29381443298969073,
+      "grad_norm": 0.06521492451429367,
+      "learning_rate": 0.00019869763196974957,
+      "loss": 0.4849,
+      "step": 57
+    },
+    {
+      "epoch": 0.29896907216494845,
+      "grad_norm": 0.07199273258447647,
+      "learning_rate": 0.00019854926535657267,
+      "loss": 0.4819,
+      "step": 58
+    },
+    {
+      "epoch": 0.30412371134020616,
+      "grad_norm": 0.059282802045345306,
+      "learning_rate": 0.00019839295885986296,
+      "loss": 0.4848,
+      "step": 59
+    },
+    {
+      "epoch": 0.30927835051546393,
+      "grad_norm": 0.06017481908202171,
+      "learning_rate": 0.0001982287250728689,
+      "loss": 0.4724,
+      "step": 60
+    },
+    {
+      "epoch": 0.31443298969072164,
+      "grad_norm": 0.06318192183971405,
+      "learning_rate": 0.00019805657722752202,
+      "loss": 0.477,
+      "step": 61
+    },
+    {
+      "epoch": 0.31958762886597936,
+      "grad_norm": 0.07218744605779648,
+      "learning_rate": 0.00019787652919337116,
+      "loss": 0.4619,
+      "step": 62
+    },
+    {
+      "epoch": 0.3247422680412371,
+      "grad_norm": 0.09625069051980972,
+      "learning_rate": 0.00019768859547646478,
+      "loss": 0.4816,
+      "step": 63
+    },
+    {
+      "epoch": 0.32989690721649484,
+      "grad_norm": 0.10104988515377045,
+      "learning_rate": 0.00019749279121818235,
+      "loss": 0.48,
+      "step": 64
+    },
+    {
+      "epoch": 0.33505154639175255,
+      "grad_norm": 0.09369470179080963,
+      "learning_rate": 0.0001972891321940145,
+      "loss": 0.4684,
+      "step": 65
+    },
+    {
+      "epoch": 0.3402061855670103,
+      "grad_norm": 0.07391127198934555,
+      "learning_rate": 0.00019707763481229183,
+      "loss": 0.4546,
+      "step": 66
+    },
+    {
+      "epoch": 0.34536082474226804,
+      "grad_norm": 0.08418463915586472,
+      "learning_rate": 0.0001968583161128631,
+      "loss": 0.4608,
+      "step": 67
+    },
+    {
+      "epoch": 0.35051546391752575,
+      "grad_norm": 0.07738372683525085,
+      "learning_rate": 0.00019663119376572242,
+      "loss": 0.4682,
+      "step": 68
+    },
+    {
+      "epoch": 0.3556701030927835,
+      "grad_norm": 0.06787073612213135,
+      "learning_rate": 0.00019639628606958533,
+      "loss": 0.4812,
+      "step": 69
+    },
+    {
+      "epoch": 0.36082474226804123,
+      "grad_norm": 0.0691099762916565,
+      "learning_rate": 0.00019615361195041492,
+      "loss": 0.4735,
+      "step": 70
+    },
+    {
+      "epoch": 0.36597938144329895,
+      "grad_norm": 0.0697403997182846,
+      "learning_rate": 0.0001959031909598966,
+      "loss": 0.4579,
+      "step": 71
+    },
+    {
+      "epoch": 0.3711340206185567,
+      "grad_norm": 0.0726587325334549,
+      "learning_rate": 0.00019564504327386316,
+      "loss": 0.4689,
+      "step": 72
+    },
+    {
+      "epoch": 0.37628865979381443,
+      "grad_norm": 0.07147994637489319,
+      "learning_rate": 0.0001953791896906692,
+      "loss": 0.4517,
+      "step": 73
+    },
+    {
+      "epoch": 0.38144329896907214,
+      "grad_norm": 0.06729312986135483,
+      "learning_rate": 0.00019510565162951537,
+      "loss": 0.4501,
+      "step": 74
+    },
+    {
+      "epoch": 0.3865979381443299,
+      "grad_norm": 0.0733562707901001,
+      "learning_rate": 0.00019482445112872264,
+      "loss": 0.4675,
+      "step": 75
+    },
+    {
+      "epoch": 0.3917525773195876,
+      "grad_norm": 0.07467377930879593,
+      "learning_rate": 0.00019453561084395687,
+      "loss": 0.4528,
+      "step": 76
+    },
+    {
+      "epoch": 0.39690721649484534,
+      "grad_norm": 0.07150419056415558,
+      "learning_rate": 0.0001942391540464035,
+      "loss": 0.4479,
+      "step": 77
+    },
+    {
+      "epoch": 0.4020618556701031,
+      "grad_norm": 0.0706859678030014,
+      "learning_rate": 0.00019393510462089236,
+      "loss": 0.4797,
+      "step": 78
+    },
+    {
+      "epoch": 0.4072164948453608,
+      "grad_norm": 0.07195789366960526,
+      "learning_rate": 0.00019362348706397373,
+      "loss": 0.4557,
+      "step": 79
+    },
+    {
+      "epoch": 0.41237113402061853,
+      "grad_norm": 0.07562831044197083,
+      "learning_rate": 0.00019330432648194444,
+      "loss": 0.4586,
+      "step": 80
+    },
+    {
+      "epoch": 0.4175257731958763,
+      "grad_norm": 0.07137080281972885,
+      "learning_rate": 0.00019297764858882514,
+      "loss": 0.4416,
+      "step": 81
+    },
+    {
+      "epoch": 0.422680412371134,
+      "grad_norm": 0.09101378172636032,
+      "learning_rate": 0.00019264347970428876,
+      "loss": 0.4469,
+      "step": 82
+    },
+    {
+      "epoch": 0.42783505154639173,
+      "grad_norm": 0.10783733427524567,
+      "learning_rate": 0.00019230184675153976,
+      "loss": 0.4607,
+      "step": 83
+    },
+    {
+      "epoch": 0.4329896907216495,
+      "grad_norm": 0.143487811088562,
+      "learning_rate": 0.0001919527772551451,
+      "loss": 0.4446,
+      "step": 84
+    },
+    {
+      "epoch": 0.4381443298969072,
+      "grad_norm": 0.12131296098232269,
+      "learning_rate": 0.00019159629933881666,
+      "loss": 0.4496,
+      "step": 85
+    },
+    {
+      "epoch": 0.44329896907216493,
+      "grad_norm": 0.14953602850437164,
+      "learning_rate": 0.00019123244172314545,
+      "loss": 0.44,
+      "step": 86
+    },
+    {
+      "epoch": 0.4484536082474227,
+      "grad_norm": 0.22672319412231445,
+      "learning_rate": 0.00019086123372328746,
+      "loss": 0.4435,
+      "step": 87
+    },
+    {
+      "epoch": 0.4536082474226804,
+      "grad_norm": 0.14028961956501007,
+      "learning_rate": 0.00019048270524660196,
+      "loss": 0.4481,
+      "step": 88
+    },
+    {
+      "epoch": 0.4587628865979381,
+      "grad_norm": 0.16384148597717285,
+      "learning_rate": 0.0001900968867902419,
+      "loss": 0.4456,
+      "step": 89
+    },
+    {
+      "epoch": 0.4639175257731959,
+      "grad_norm": 0.15399976074695587,
+      "learning_rate": 0.00018970380943869687,
+      "loss": 0.44,
+      "step": 90
+    },
+    {
+      "epoch": 0.4690721649484536,
+      "grad_norm": 0.09678421914577484,
+      "learning_rate": 0.00018930350486128856,
+      "loss": 0.4541,
+      "step": 91
+    },
+    {
+      "epoch": 0.4742268041237113,
+      "grad_norm": 0.09682022035121918,
+      "learning_rate": 0.00018889600530961934,
+      "loss": 0.4317,
+      "step": 92
+    },
+    {
+      "epoch": 0.4793814432989691,
+      "grad_norm": 0.09989262372255325,
+      "learning_rate": 0.00018848134361497385,
+      "loss": 0.4519,
+      "step": 93
+    },
+    {
+      "epoch": 0.4845360824742268,
+      "grad_norm": 0.0877247154712677,
+      "learning_rate": 0.0001880595531856738,
+      "loss": 0.4367,
+      "step": 94
+    },
+    {
+      "epoch": 0.4896907216494845,
+      "grad_norm": 0.08405473828315735,
+      "learning_rate": 0.00018763066800438636,
+      "loss": 0.4361,
+      "step": 95
+    },
+    {
+      "epoch": 0.4948453608247423,
+      "grad_norm": 0.08135327696800232,
+      "learning_rate": 0.00018719472262538624,
+      "loss": 0.4454,
+      "step": 96
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.0931510180234909,
+      "learning_rate": 0.00018675175217177175,
+      "loss": 0.4577,
+      "step": 97
+    },
+    {
+      "epoch": 0.5051546391752577,
+      "grad_norm": 0.0886392667889595,
+      "learning_rate": 0.00018630179233263504,
+      "loss": 0.429,
+      "step": 98
+    },
+    {
+      "epoch": 0.5103092783505154,
+      "grad_norm": 0.1057060956954956,
+      "learning_rate": 0.00018584487936018661,
+      "loss": 0.4555,
+      "step": 99
+    },
+    {
+      "epoch": 0.5154639175257731,
+      "grad_norm": 0.1023128554224968,
+      "learning_rate": 0.00018538105006683472,
+      "loss": 0.4507,
+      "step": 100
+    },
+    {
+      "epoch": 0.520618556701031,
+      "grad_norm": 0.16284054517745972,
+      "learning_rate": 0.0001849103418222194,
+      "loss": 0.4464,
+      "step": 101
+    },
+    {
+      "epoch": 0.5257731958762887,
+      "grad_norm": 0.20978468656539917,
+      "learning_rate": 0.00018443279255020152,
+      "loss": 0.4404,
+      "step": 102
+    },
+    {
+      "epoch": 0.5309278350515464,
+      "grad_norm": 0.1973002552986145,
+      "learning_rate": 0.00018394844072580773,
+      "loss": 0.4261,
+      "step": 103
+    },
+    {
+      "epoch": 0.5360824742268041,
+      "grad_norm": 0.12068638205528259,
+      "learning_rate": 0.00018345732537213027,
+      "loss": 0.4434,
+      "step": 104
+    },
+    {
+      "epoch": 0.5412371134020618,
+      "grad_norm": 0.1234351322054863,
+      "learning_rate": 0.00018295948605718314,
+      "loss": 0.4344,
+      "step": 105
+    },
+    {
+      "epoch": 0.5463917525773195,
+      "grad_norm": 0.17490962147712708,
+      "learning_rate": 0.0001824549628907141,
+      "loss": 0.4497,
+      "step": 106
+    },
+    {
+      "epoch": 0.5515463917525774,
+      "grad_norm": 0.13712941110134125,
+      "learning_rate": 0.0001819437965209732,
+      "loss": 0.4424,
+      "step": 107
+    },
+    {
+      "epoch": 0.5567010309278351,
+      "grad_norm": 0.13059262931346893,
+      "learning_rate": 0.00018142602813143785,
+      "loss": 0.4385,
+      "step": 108
+    },
+    {
+      "epoch": 0.5618556701030928,
+      "grad_norm": 0.16579480469226837,
+      "learning_rate": 0.00018090169943749476,
+      "loss": 0.4343,
+      "step": 109
+    },
+    {
+      "epoch": 0.5670103092783505,
+      "grad_norm": 0.15796129405498505,
+      "learning_rate": 0.00018037085268307889,
+      "loss": 0.4472,
+      "step": 110
+    },
+    {
+      "epoch": 0.5721649484536082,
+      "grad_norm": 0.1521310657262802,
+      "learning_rate": 0.00017983353063727016,
+      "loss": 0.4235,
+      "step": 111
+    },
+    {
+      "epoch": 0.5773195876288659,
+      "grad_norm": 0.12863527238368988,
+      "learning_rate": 0.00017928977659084755,
+      "loss": 0.4239,
+      "step": 112
+    },
+    {
+      "epoch": 0.5824742268041238,
+      "grad_norm": 0.12139534950256348,
+      "learning_rate": 0.00017873963435280121,
+      "loss": 0.4249,
+      "step": 113
+    },
+    {
+      "epoch": 0.5876288659793815,
+      "grad_norm": 0.16452330350875854,
+      "learning_rate": 0.000178183148246803,
+      "loss": 0.434,
+      "step": 114
+    },
+    {
+      "epoch": 0.5927835051546392,
+      "grad_norm": 0.15867012739181519,
+      "learning_rate": 0.00017762036310763532,
+      "loss": 0.4409,
+      "step": 115
+    },
+    {
+      "epoch": 0.5979381443298969,
+      "grad_norm": 0.13938477635383606,
+      "learning_rate": 0.00017705132427757895,
+      "loss": 0.4392,
+      "step": 116
+    },
+    {
+      "epoch": 0.6030927835051546,
+      "grad_norm": 0.15676359832286835,
+      "learning_rate": 0.00017647607760275987,
+      "loss": 0.4238,
+      "step": 117
+    },
+    {
+      "epoch": 0.6082474226804123,
+      "grad_norm": 0.1875935196876526,
+      "learning_rate": 0.00017589466942945556,
+      "loss": 0.4364,
+      "step": 118
+    },
+    {
+      "epoch": 0.6134020618556701,
+      "grad_norm": 0.18062135577201843,
+      "learning_rate": 0.00017530714660036112,
+      "loss": 0.4316,
+      "step": 119
+    },
+    {
+      "epoch": 0.6185567010309279,
+      "grad_norm": 0.21620391309261322,
+      "learning_rate": 0.00017471355645081498,
+      "loss": 0.4365,
+      "step": 120
+    },
+    {
+      "epoch": 0.6237113402061856,
+      "grad_norm": 0.22219762206077576,
+      "learning_rate": 0.0001741139468049855,
+      "loss": 0.4515,
+      "step": 121
+    },
+    {
+      "epoch": 0.6288659793814433,
+      "grad_norm": 0.27636048197746277,
+      "learning_rate": 0.00017350836597201767,
+      "loss": 0.4322,
+      "step": 122
+    },
+    {
+      "epoch": 0.634020618556701,
+      "grad_norm": 0.27494513988494873,
+      "learning_rate": 0.00017289686274214118,
+      "loss": 0.4437,
+      "step": 123
+    },
+    {
+      "epoch": 0.6391752577319587,
+      "grad_norm": 0.19892732799053192,
+      "learning_rate": 0.00017227948638273916,
+      "loss": 0.4323,
+      "step": 124
+    },
+    {
+      "epoch": 0.6443298969072165,
+      "grad_norm": 0.22920124232769012,
+      "learning_rate": 0.00017165628663437922,
+      "loss": 0.42,
+      "step": 125
+    },
+    {
+      "epoch": 0.6494845360824743,
+      "grad_norm": 0.2462163269519806,
+      "learning_rate": 0.0001710273137068057,
+      "loss": 0.4367,
+      "step": 126
+    },
+    {
+      "epoch": 0.654639175257732,
+      "grad_norm": 0.25883370637893677,
+      "learning_rate": 0.0001703926182748945,
+      "loss": 0.4338,
+      "step": 127
+    },
+    {
+      "epoch": 0.6597938144329897,
+      "grad_norm": 0.2813364267349243,
+      "learning_rate": 0.00016975225147457026,
+      "loss": 0.425,
+      "step": 128
+    },
+    {
+      "epoch": 0.6649484536082474,
+      "grad_norm": 0.26451367139816284,
+      "learning_rate": 0.00016910626489868649,
+      "loss": 0.444,
+      "step": 129
+    },
+    {
+      "epoch": 0.6701030927835051,
+      "grad_norm": 0.260074257850647,
+      "learning_rate": 0.00016845471059286887,
+      "loss": 0.4401,
+      "step": 130
+    },
+    {
+      "epoch": 0.6752577319587629,
+      "grad_norm": 0.33685773611068726,
+      "learning_rate": 0.0001677976410513221,
+      "loss": 0.4289,
+      "step": 131
+    },
+    {
+      "epoch": 0.6804123711340206,
+      "grad_norm": 0.13846200704574585,
+      "learning_rate": 0.0001671351092126004,
+      "loss": 0.4095,
+      "step": 132
+    },
+    {
+      "epoch": 0.6855670103092784,
+      "grad_norm": 0.24901287257671356,
+      "learning_rate": 0.0001664671684553426,
+      "loss": 0.4212,
+      "step": 133
+    },
+    {
+      "epoch": 0.6907216494845361,
+      "grad_norm": 0.25652503967285156,
+      "learning_rate": 0.00016579387259397127,
+      "loss": 0.4304,
+      "step": 134
+    },
+    {
+      "epoch": 0.6958762886597938,
+      "grad_norm": 0.19353553652763367,
+      "learning_rate": 0.00016511527587435737,
+      "loss": 0.4517,
+      "step": 135
+    },
+    {
+      "epoch": 0.7010309278350515,
+      "grad_norm": 0.21667616069316864,
+      "learning_rate": 0.00016443143296944945,
+      "loss": 0.4397,
+      "step": 136
+    },
+    {
+      "epoch": 0.7061855670103093,
+      "grad_norm": 0.2371380776166916,
+      "learning_rate": 0.000163742398974869,
+      "loss": 0.4394,
+      "step": 137
+    },
+    {
+      "epoch": 0.711340206185567,
+      "grad_norm": 0.26480844616889954,
+      "learning_rate": 0.00016304822940447139,
+      "loss": 0.4281,
+      "step": 138
+    },
+    {
+      "epoch": 0.7164948453608248,
+      "grad_norm": 0.16394782066345215,
+      "learning_rate": 0.00016234898018587337,
+      "loss": 0.4362,
+      "step": 139
+    },
+    {
+      "epoch": 0.7216494845360825,
+      "grad_norm": 0.2286471277475357,
+      "learning_rate": 0.00016164470765594698,
+      "loss": 0.4306,
+      "step": 140
+    },
+    {
+      "epoch": 0.7268041237113402,
+      "grad_norm": 0.20856066048145294,
+      "learning_rate": 0.00016093546855628084,
+      "loss": 0.4306,
+      "step": 141
+    },
+    {
+      "epoch": 0.7319587628865979,
+      "grad_norm": 0.17189185321331024,
+      "learning_rate": 0.00016022132002860824,
+      "loss": 0.4267,
+      "step": 142
+    },
+    {
+      "epoch": 0.7371134020618557,
+      "grad_norm": 0.23078757524490356,
+      "learning_rate": 0.00015950231961020373,
+      "loss": 0.4385,
+      "step": 143
+    },
+    {
+      "epoch": 0.7422680412371134,
+      "grad_norm": 0.24594247341156006,
+      "learning_rate": 0.00015877852522924732,
+      "loss": 0.4375,
+      "step": 144
+    },
+    {
+      "epoch": 0.7474226804123711,
+      "grad_norm": 0.185899019241333,
+      "learning_rate": 0.00015804999520015734,
+      "loss": 0.4112,
+      "step": 145
+    },
+    {
+      "epoch": 0.7525773195876289,
+      "grad_norm": 0.2080090045928955,
+      "learning_rate": 0.00015731678821889224,
+      "loss": 0.4366,
+      "step": 146
+    },
+    {
+      "epoch": 0.7577319587628866,
+      "grad_norm": 0.19917985796928406,
+      "learning_rate": 0.00015657896335822147,
+      "loss": 0.4359,
+      "step": 147
+    },
+    {
+      "epoch": 0.7628865979381443,
+      "grad_norm": 0.1784542351961136,
+      "learning_rate": 0.00015583658006296624,
+      "loss": 0.4145,
+      "step": 148
+    },
+    {
+      "epoch": 0.7680412371134021,
+      "grad_norm": 0.26271045207977295,
+      "learning_rate": 0.00015508969814521025,
+      "loss": 0.4518,
+      "step": 149
+    },
+    {
+      "epoch": 0.7731958762886598,
+      "grad_norm": 0.28644442558288574,
+      "learning_rate": 0.0001543383777794806,
+      "loss": 0.4336,
+      "step": 150
+    },
+    {
+      "epoch": 0.7783505154639175,
+      "grad_norm": 0.32217490673065186,
+      "learning_rate": 0.00015358267949789966,
+      "loss": 0.4335,
+      "step": 151
+    },
+    {
+      "epoch": 0.7835051546391752,
+      "grad_norm": 0.31271830201148987,
+      "learning_rate": 0.00015282266418530847,
+      "loss": 0.4424,
+      "step": 152
+    },
+    {
+      "epoch": 0.788659793814433,
+      "grad_norm": 0.2646642327308655,
+      "learning_rate": 0.00015205839307436088,
+      "loss": 0.4166,
+      "step": 153
+    },
+    {
+      "epoch": 0.7938144329896907,
+      "grad_norm": 0.3375913202762604,
+      "learning_rate": 0.00015128992774059063,
+      "loss": 0.4278,
+      "step": 154
+    },
+    {
+      "epoch": 0.7989690721649485,
+      "grad_norm": 0.3240370750427246,
+      "learning_rate": 0.00015051733009745013,
+      "loss": 0.4414,
+      "step": 155
+    },
+    {
+      "epoch": 0.8041237113402062,
+      "grad_norm": 0.34704843163490295,
+      "learning_rate": 0.0001497406623913222,
+      "loss": 0.4359,
+      "step": 156
+    },
+    {
+      "epoch": 0.8092783505154639,
+      "grad_norm": 0.3096780776977539,
+      "learning_rate": 0.00014895998719650526,
+      "loss": 0.4296,
+      "step": 157
+    },
+    {
+      "epoch": 0.8144329896907216,
+      "grad_norm": 0.32201290130615234,
+      "learning_rate": 0.00014817536741017152,
+      "loss": 0.4163,
+      "step": 158
+    },
+    {
+      "epoch": 0.8195876288659794,
+      "grad_norm": 0.24183842539787292,
+      "learning_rate": 0.00014738686624729986,
+      "loss": 0.4319,
+      "step": 159
+    },
+    {
+      "epoch": 0.8247422680412371,
+      "grad_norm": 0.2772982120513916,
+      "learning_rate": 0.00014659454723558248,
+      "loss": 0.4338,
+      "step": 160
+    },
+    {
+      "epoch": 0.8298969072164949,
+      "grad_norm": 0.2415013611316681,
+      "learning_rate": 0.00014579847421030678,
+      "loss": 0.4268,
+      "step": 161
+    },
+    {
+      "epoch": 0.8350515463917526,
+      "grad_norm": 0.19889673590660095,
+      "learning_rate": 0.00014499871130921213,
+      "loss": 0.4573,
+      "step": 162
+    },
+    {
+      "epoch": 0.8402061855670103,
+      "grad_norm": 0.3569074273109436,
+      "learning_rate": 0.0001441953229673227,
+      "loss": 0.4327,
+      "step": 163
+    },
+    {
+      "epoch": 0.845360824742268,
+      "grad_norm": 0.29068055748939514,
+      "learning_rate": 0.00014338837391175582,
+      "loss": 0.4356,
+      "step": 164
+    },
+    {
+      "epoch": 0.8505154639175257,
+      "grad_norm": 0.2646566927433014,
+      "learning_rate": 0.00014257792915650728,
+      "loss": 0.4381,
+      "step": 165
+    },
+    {
+      "epoch": 0.8556701030927835,
+      "grad_norm": 0.2640703320503235,
+      "learning_rate": 0.00014176405399721312,
+      "loss": 0.4368,
+      "step": 166
+    },
+    {
+      "epoch": 0.8608247422680413,
+      "grad_norm": 0.22350774705410004,
+      "learning_rate": 0.00014094681400588906,
+      "loss": 0.4325,
+      "step": 167
+    },
+    {
+      "epoch": 0.865979381443299,
+      "grad_norm": 0.24790436029434204,
+      "learning_rate": 0.00014012627502564743,
+      "loss": 0.4142,
+      "step": 168
+    },
+    {
+      "epoch": 0.8711340206185567,
+      "grad_norm": 0.4644359350204468,
+      "learning_rate": 0.00013930250316539238,
+      "loss": 0.4258,
+      "step": 169
+    },
+    {
+      "epoch": 0.8762886597938144,
+      "grad_norm": 0.3182356655597687,
+      "learning_rate": 0.0001384755647944936,
+      "loss": 0.4368,
+      "step": 170
+    },
+    {
+      "epoch": 0.8814432989690721,
+      "grad_norm": 0.377511590719223,
+      "learning_rate": 0.0001376455265374392,
+      "loss": 0.4353,
+      "step": 171
+    },
+    {
+      "epoch": 0.8865979381443299,
+      "grad_norm": 0.31862640380859375,
+      "learning_rate": 0.00013681245526846783,
+      "loss": 0.44,
+      "step": 172
+    },
+    {
+      "epoch": 0.8917525773195877,
+      "grad_norm": 0.2726942002773285,
+      "learning_rate": 0.00013597641810618073,
+      "loss": 0.4475,
+      "step": 173
+    },
+    {
+      "epoch": 0.8969072164948454,
+      "grad_norm": 0.2775639295578003,
+      "learning_rate": 0.0001351374824081343,
+      "loss": 0.4327,
+      "step": 174
+    },
+    {
+      "epoch": 0.9020618556701031,
+      "grad_norm": 0.16788411140441895,
+      "learning_rate": 0.00013429571576541315,
+      "loss": 0.4436,
+      "step": 175
+    },
+    {
+      "epoch": 0.9072164948453608,
+      "grad_norm": 0.36008498072624207,
+      "learning_rate": 0.00013345118599718457,
+      "loss": 0.4381,
+      "step": 176
+    },
+    {
+      "epoch": 0.9123711340206185,
+      "grad_norm": 0.3529493510723114,
+      "learning_rate": 0.0001326039611452342,
+      "loss": 0.4311,
+      "step": 177
+    },
+    {
+      "epoch": 0.9175257731958762,
+      "grad_norm": 0.33236363530158997,
+      "learning_rate": 0.00013175410946848445,
+      "loss": 0.4413,
+      "step": 178
+    },
+    {
+      "epoch": 0.9226804123711341,
+      "grad_norm": 0.34233689308166504,
+      "learning_rate": 0.00013090169943749476,
+      "loss": 0.4341,
+      "step": 179
+    },
+    {
+      "epoch": 0.9278350515463918,
+      "grad_norm": 0.23846842348575592,
+      "learning_rate": 0.0001300467997289452,
+      "loss": 0.4284,
+      "step": 180
+    },
+    {
+      "epoch": 0.9329896907216495,
+      "grad_norm": 0.2865229845046997,
+      "learning_rate": 0.00012918947922010336,
+      "loss": 0.4339,
+      "step": 181
+    },
+    {
+      "epoch": 0.9381443298969072,
+      "grad_norm": 0.31014958024024963,
+      "learning_rate": 0.00012832980698327496,
+      "loss": 0.4349,
+      "step": 182
+    },
+    {
+      "epoch": 0.9432989690721649,
+      "grad_norm": 0.2616514563560486,
+      "learning_rate": 0.00012746785228023904,
+      "loss": 0.4226,
+      "step": 183
+    },
+    {
+      "epoch": 0.9484536082474226,
+      "grad_norm": 0.29992541670799255,
+      "learning_rate": 0.00012660368455666752,
+      "loss": 0.4405,
+      "step": 184
+    },
+    {
+      "epoch": 0.9536082474226805,
+      "grad_norm": 0.28125184774398804,
+      "learning_rate": 0.00012573737343653024,
+      "loss": 0.4255,
+      "step": 185
+    },
+    {
+      "epoch": 0.9587628865979382,
+      "grad_norm": 0.30920377373695374,
+      "learning_rate": 0.0001248689887164855,
+      "loss": 0.4254,
+      "step": 186
+    },
+    {
+      "epoch": 0.9639175257731959,
+      "grad_norm": 0.29404139518737793,
+      "learning_rate": 0.0001239986003602566,
+      "loss": 0.4301,
+      "step": 187
+    },
+    {
+      "epoch": 0.9690721649484536,
+      "grad_norm": 0.24007168412208557,
+      "learning_rate": 0.00012312627849299523,
+      "loss": 0.4365,
+      "step": 188
+    },
+    {
+      "epoch": 0.9742268041237113,
+      "grad_norm": 0.1996045708656311,
+      "learning_rate": 0.00012225209339563145,
+      "loss": 0.4094,
+      "step": 189
+    },
+    {
+      "epoch": 0.979381443298969,
+      "grad_norm": 0.21352912485599518,
+      "learning_rate": 0.00012137611549921146,
+      "loss": 0.4372,
+      "step": 190
+    },
+    {
+      "epoch": 0.9845360824742269,
+      "grad_norm": 0.2544335126876831,
+      "learning_rate": 0.00012049841537922307,
+      "loss": 0.4131,
+      "step": 191
+    },
+    {
+      "epoch": 0.9896907216494846,
+      "grad_norm": 0.22331076860427856,
+      "learning_rate": 0.00011961906374990952,
+      "loss": 0.4279,
+      "step": 192
+    },
+    {
+      "epoch": 0.9948453608247423,
+      "grad_norm": 0.23338709771633148,
+      "learning_rate": 0.00011873813145857249,
+      "loss": 0.4172,
+      "step": 193
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.23860973119735718,
+      "learning_rate": 0.00011785568947986367,
+      "loss": 0.4318,
+      "step": 194
+    },
+    {
+      "epoch": 1.0051546391752577,
+      "grad_norm": 0.22747361660003662,
+      "learning_rate": 0.00011697180891006689,
+      "loss": 0.4275,
+      "step": 195
+    },
+    {
+      "epoch": 1.0103092783505154,
+      "grad_norm": 0.24887818098068237,
+      "learning_rate": 0.00011608656096136984,
+      "loss": 0.429,
+      "step": 196
+    },
+    {
+      "epoch": 1.0154639175257731,
+      "grad_norm": 0.2906290292739868,
+      "learning_rate": 0.00011520001695612674,
+      "loss": 0.43,
+      "step": 197
+    },
+    {
+      "epoch": 1.0206185567010309,
+      "grad_norm": 0.3029741644859314,
+      "learning_rate": 0.00011431224832111196,
+      "loss": 0.4311,
+      "step": 198
+    },
+    {
+      "epoch": 1.0257731958762886,
+      "grad_norm": 0.25530320405960083,
+      "learning_rate": 0.00011342332658176555,
+      "loss": 0.4208,
+      "step": 199
+    },
+    {
+      "epoch": 1.0309278350515463,
+      "grad_norm": 0.18977971374988556,
+      "learning_rate": 0.00011253332335643043,
+      "loss": 0.4075,
+      "step": 200
+    },
+    {
+      "epoch": 1.0360824742268042,
+      "grad_norm": 0.24631249904632568,
+      "learning_rate": 0.00011164231035058228,
+      "loss": 0.4222,
+      "step": 201
+    },
+    {
+      "epoch": 1.041237113402062,
+      "grad_norm": 0.2640010714530945,
+      "learning_rate": 0.00011075035935105252,
+      "loss": 0.4304,
+      "step": 202
+    },
+    {
+      "epoch": 1.0463917525773196,
+      "grad_norm": 0.2835286855697632,
+      "learning_rate": 0.00010985754222024436,
+      "loss": 0.4305,
+      "step": 203
+    },
+    {
+      "epoch": 1.0515463917525774,
+      "grad_norm": 0.3086528480052948,
+      "learning_rate": 0.00010896393089034336,
+      "loss": 0.4424,
+      "step": 204
+    },
+    {
+      "epoch": 1.056701030927835,
+      "grad_norm": 0.3864690065383911,
+      "learning_rate": 0.00010806959735752174,
+      "loss": 0.4403,
+      "step": 205
+    },
+    {
+      "epoch": 1.0618556701030928,
+      "grad_norm": 0.4415989816188812,
+      "learning_rate": 0.00010717461367613793,
+      "loss": 0.4165,
+      "step": 206
+    },
+    {
+      "epoch": 1.0670103092783505,
+      "grad_norm": 0.4663720726966858,
+      "learning_rate": 0.00010627905195293135,
+      "loss": 0.4195,
+      "step": 207
+    },
+    {
+      "epoch": 1.0721649484536082,
+      "grad_norm": 0.40034109354019165,
+      "learning_rate": 0.00010538298434121283,
+      "loss": 0.4223,
+      "step": 208
+    },
+    {
+      "epoch": 1.077319587628866,
+      "grad_norm": 0.31532707810401917,
+      "learning_rate": 0.00010448648303505151,
+      "loss": 0.424,
+      "step": 209
+    },
+    {
+      "epoch": 1.0824742268041236,
+      "grad_norm": 0.2678797245025635,
+      "learning_rate": 0.00010358962026345824,
+      "loss": 0.4295,
+      "step": 210
+    },
+    {
+      "epoch": 1.0876288659793814,
+      "grad_norm": 0.2952194809913635,
+      "learning_rate": 0.00010269246828456629,
+      "loss": 0.4246,
+      "step": 211
+    },
+    {
+      "epoch": 1.0927835051546393,
+      "grad_norm": 0.39344316720962524,
+      "learning_rate": 0.00010179509937980973,
+      "loss": 0.434,
+      "step": 212
+    },
+    {
+      "epoch": 1.097938144329897,
+      "grad_norm": 0.36972832679748535,
+      "learning_rate": 0.00010089758584809979,
+      "loss": 0.4249,
+      "step": 213
+    },
+    {
+      "epoch": 1.1030927835051547,
+      "grad_norm": 0.23801574110984802,
+      "learning_rate": 0.0001,
+      "loss": 0.4179,
+      "step": 214
+    },
+    {
+      "epoch": 1.1082474226804124,
+      "grad_norm": 0.35938265919685364,
+      "learning_rate": 9.910241415190021e-05,
+      "loss": 0.4309,
+      "step": 215
+    },
+    {
+      "epoch": 1.1134020618556701,
+      "grad_norm": 0.501671314239502,
+      "learning_rate": 9.820490062019029e-05,
+      "loss": 0.4364,
+      "step": 216
+    },
+    {
+      "epoch": 1.1185567010309279,
+      "grad_norm": 0.4622708559036255,
+      "learning_rate": 9.730753171543374e-05,
+      "loss": 0.4229,
+      "step": 217
+    },
+    {
+      "epoch": 1.1237113402061856,
+      "grad_norm": 0.28288254141807556,
+      "learning_rate": 9.641037973654178e-05,
+      "loss": 0.4248,
+      "step": 218
+    },
+    {
+      "epoch": 1.1288659793814433,
+      "grad_norm": 0.23704715073108673,
+      "learning_rate": 9.551351696494854e-05,
+      "loss": 0.4185,
+      "step": 219
+    },
+    {
+      "epoch": 1.134020618556701,
+      "grad_norm": 0.4235609173774719,
+      "learning_rate": 9.461701565878719e-05,
+      "loss": 0.4171,
+      "step": 220
+    },
+    {
+      "epoch": 1.1391752577319587,
+      "grad_norm": 0.3826042413711548,
+      "learning_rate": 9.372094804706867e-05,
+      "loss": 0.4331,
+      "step": 221
+    },
+    {
+      "epoch": 1.1443298969072164,
+      "grad_norm": 0.2800697088241577,
+      "learning_rate": 9.282538632386207e-05,
+      "loss": 0.4214,
+      "step": 222
+    },
+    {
+      "epoch": 1.1494845360824741,
+      "grad_norm": 0.3822883367538452,
+      "learning_rate": 9.193040264247829e-05,
+      "loss": 0.4345,
+      "step": 223
+    },
+    {
+      "epoch": 1.1546391752577319,
+      "grad_norm": 0.306937038898468,
+      "learning_rate": 9.103606910965666e-05,
+      "loss": 0.4134,
+      "step": 224
+    },
+    {
+      "epoch": 1.1597938144329896,
+      "grad_norm": 0.24911299347877502,
+      "learning_rate": 9.014245777975565e-05,
+      "loss": 0.4196,
+      "step": 225
+    },
+    {
+      "epoch": 1.1649484536082475,
+      "grad_norm": 0.30347758531570435,
+      "learning_rate": 8.924964064894753e-05,
+      "loss": 0.407,
+      "step": 226
+    },
+    {
+      "epoch": 1.1701030927835052,
+      "grad_norm": 0.36097684502601624,
+      "learning_rate": 8.835768964941773e-05,
+      "loss": 0.4317,
+      "step": 227
+    },
+    {
+      "epoch": 1.175257731958763,
+      "grad_norm": 0.2212606817483902,
+      "learning_rate": 8.746667664356956e-05,
+      "loss": 0.4067,
+      "step": 228
+    },
+    {
+      "epoch": 1.1804123711340206,
+      "grad_norm": 0.2710006833076477,
+      "learning_rate": 8.657667341823448e-05,
+      "loss": 0.421,
+      "step": 229
+    },
+    {
+      "epoch": 1.1855670103092784,
+      "grad_norm": 0.2709140479564667,
+      "learning_rate": 8.568775167888806e-05,
+      "loss": 0.4104,
+      "step": 230
+    },
+    {
+      "epoch": 1.190721649484536,
+      "grad_norm": 0.25175172090530396,
+      "learning_rate": 8.479998304387329e-05,
+      "loss": 0.4184,
+      "step": 231
+    },
+    {
+      "epoch": 1.1958762886597938,
+      "grad_norm": 0.2586748003959656,
+      "learning_rate": 8.391343903863018e-05,
+      "loss": 0.4157,
+      "step": 232
+    },
+    {
+      "epoch": 1.2010309278350515,
+      "grad_norm": 0.26554447412490845,
+      "learning_rate": 8.302819108993312e-05,
+      "loss": 0.4263,
+      "step": 233
+    },
+    {
+      "epoch": 1.2061855670103092,
+      "grad_norm": 0.24530813097953796,
+      "learning_rate": 8.214431052013634e-05,
+      "loss": 0.4216,
+      "step": 234
+    },
+    {
+      "epoch": 1.211340206185567,
+      "grad_norm": 0.29449594020843506,
+      "learning_rate": 8.126186854142752e-05,
+      "loss": 0.4241,
+      "step": 235
+    },
+    {
+      "epoch": 1.2164948453608249,
+      "grad_norm": 0.359494686126709,
+      "learning_rate": 8.038093625009052e-05,
+      "loss": 0.4274,
+      "step": 236
+    },
+    {
+      "epoch": 1.2216494845360826,
+      "grad_norm": 0.23757228255271912,
+      "learning_rate": 7.950158462077697e-05,
+      "loss": 0.4126,
+      "step": 237
+    },
+    {
+      "epoch": 1.2268041237113403,
+      "grad_norm": 0.22119931876659393,
+      "learning_rate": 7.862388450078855e-05,
+      "loss": 0.4192,
+      "step": 238
+    },
+    {
+      "epoch": 1.231958762886598,
+      "grad_norm": 0.31067612767219543,
+      "learning_rate": 7.774790660436858e-05,
+      "loss": 0.4392,
+      "step": 239
+    },
+    {
+      "epoch": 1.2371134020618557,
+      "grad_norm": 0.2888616919517517,
+      "learning_rate": 7.68737215070048e-05,
+      "loss": 0.416,
+      "step": 240
+    },
+    {
+      "epoch": 1.2422680412371134,
+      "grad_norm": 0.15679210424423218,
+      "learning_rate": 7.600139963974341e-05,
+      "loss": 0.4157,
+      "step": 241
+    },
+    {
+      "epoch": 1.2474226804123711,
+      "grad_norm": 0.26868337392807007,
+      "learning_rate": 7.513101128351454e-05,
+      "loss": 0.4114,
+      "step": 242
+    },
+    {
+      "epoch": 1.2525773195876289,
+      "grad_norm": 0.2522866725921631,
+      "learning_rate": 7.426262656346978e-05,
+      "loss": 0.4255,
+      "step": 243
+    },
+    {
+      "epoch": 1.2577319587628866,
+      "grad_norm": 0.1718539297580719,
+      "learning_rate": 7.339631544333249e-05,
+      "loss": 0.4312,
+      "step": 244
+    },
+    {
+      "epoch": 1.2628865979381443,
+      "grad_norm": 0.23187138140201569,
+      "learning_rate": 7.2532147719761e-05,
+      "loss": 0.4268,
+      "step": 245
+    },
+    {
+      "epoch": 1.268041237113402,
+      "grad_norm": 0.2766572833061218,
+      "learning_rate": 7.167019301672509e-05,
+      "loss": 0.4302,
+      "step": 246
+    },
+    {
+      "epoch": 1.2731958762886597,
+      "grad_norm": 0.20282387733459473,
+      "learning_rate": 7.081052077989667e-05,
+      "loss": 0.4113,
+      "step": 247
+    },
+    {
+      "epoch": 1.2783505154639174,
+      "grad_norm": 0.17571978271007538,
+      "learning_rate": 6.995320027105481e-05,
+      "loss": 0.4214,
+      "step": 248
+    },
+    {
+      "epoch": 1.2835051546391751,
+      "grad_norm": 0.2154776006937027,
+      "learning_rate": 6.909830056250527e-05,
+      "loss": 0.4321,
+      "step": 249
+    },
+    {
+      "epoch": 1.2886597938144329,
+      "grad_norm": 0.20032523572444916,
+      "learning_rate": 6.824589053151558e-05,
+      "loss": 0.4127,
+      "step": 250
+    },
+    {
+      "epoch": 1.2938144329896908,
+      "grad_norm": 0.16858811676502228,
+      "learning_rate": 6.739603885476582e-05,
+      "loss": 0.4161,
+      "step": 251
+    },
+    {
+      "epoch": 1.2989690721649485,
+      "grad_norm": 0.19868813455104828,
+      "learning_rate": 6.654881400281547e-05,
+      "loss": 0.4125,
+      "step": 252
+    },
+    {
+      "epoch": 1.3041237113402062,
+      "grad_norm": 0.22468076646327972,
+      "learning_rate": 6.570428423458687e-05,
+      "loss": 0.4176,
+      "step": 253
+    },
+    {
+      "epoch": 1.309278350515464,
+      "grad_norm": 0.1965940147638321,
+      "learning_rate": 6.486251759186572e-05,
+      "loss": 0.4077,
+      "step": 254
+    },
+    {
+      "epoch": 1.3144329896907216,
+      "grad_norm": 0.15204209089279175,
+      "learning_rate": 6.402358189381934e-05,
+      "loss": 0.4136,
+      "step": 255
+    },
+    {
+      "epoch": 1.3195876288659794,
+      "grad_norm": 0.1732676774263382,
+      "learning_rate": 6.318754473153221e-05,
+      "loss": 0.4003,
+      "step": 256
+    },
+    {
+      "epoch": 1.324742268041237,
+      "grad_norm": 0.15440921485424042,
+      "learning_rate": 6.23544734625608e-05,
+      "loss": 0.4223,
+      "step": 257
+    },
+    {
+      "epoch": 1.3298969072164948,
+      "grad_norm": 0.16463245451450348,
+      "learning_rate": 6.152443520550641e-05,
+      "loss": 0.4177,
+      "step": 258
+    },
+    {
+      "epoch": 1.3350515463917525,
+      "grad_norm": 0.18284174799919128,
+      "learning_rate": 6.069749683460765e-05,
+      "loss": 0.4114,
+      "step": 259
+    },
+    {
+      "epoch": 1.3402061855670104,
+      "grad_norm": 0.15177594125270844,
+      "learning_rate": 5.9873724974352585e-05,
+      "loss": 0.3997,
+      "step": 260
+    },
+    {
+      "epoch": 1.3453608247422681,
+      "grad_norm": 0.19975629448890686,
+      "learning_rate": 5.9053185994110974e-05,
+      "loss": 0.4059,
+      "step": 261
+    },
+    {
+      "epoch": 1.3505154639175259,
+      "grad_norm": 0.24263139069080353,
+      "learning_rate": 5.82359460027869e-05,
+      "loss": 0.4135,
+      "step": 262
+    },
+    {
+      "epoch": 1.3556701030927836,
+      "grad_norm": 0.16142095625400543,
+      "learning_rate": 5.7422070843492734e-05,
+      "loss": 0.4285,
+      "step": 263
+    },
+    {
+      "epoch": 1.3608247422680413,
+      "grad_norm": 0.21296386420726776,
+      "learning_rate": 5.6611626088244194e-05,
+      "loss": 0.4245,
+      "step": 264
+    },
+    {
+      "epoch": 1.365979381443299,
+      "grad_norm": 0.18045397102832794,
+      "learning_rate": 5.5804677032677354e-05,
+      "loss": 0.4093,
+      "step": 265
+    },
+    {
+      "epoch": 1.3711340206185567,
+      "grad_norm": 0.1832776963710785,
+      "learning_rate": 5.5001288690787886e-05,
+      "loss": 0.4209,
+      "step": 266
+    },
+    {
+      "epoch": 1.3762886597938144,
+      "grad_norm": 0.18741373717784882,
+      "learning_rate": 5.420152578969326e-05,
+      "loss": 0.405,
+      "step": 267
+    },
+    {
+      "epoch": 1.3814432989690721,
+      "grad_norm": 0.17056772112846375,
+      "learning_rate": 5.340545276441755e-05,
+      "loss": 0.4048,
+      "step": 268
+    },
+    {
+      "epoch": 1.3865979381443299,
+      "grad_norm": 0.1502230316400528,
+      "learning_rate": 5.261313375270014e-05,
+      "loss": 0.424,
+      "step": 269
+    },
+    {
+      "epoch": 1.3917525773195876,
+      "grad_norm": 0.19059504568576813,
+      "learning_rate": 5.182463258982846e-05,
+      "loss": 0.4068,
+      "step": 270
+    },
+    {
+      "epoch": 1.3969072164948453,
+      "grad_norm": 0.19260834157466888,
+      "learning_rate": 5.1040012803494795e-05,
+      "loss": 0.4046,
+      "step": 271
+    },
+    {
+      "epoch": 1.402061855670103,
+      "grad_norm": 0.192903533577919,
+      "learning_rate": 5.025933760867781e-05,
+      "loss": 0.4383,
+      "step": 272
+    },
+    {
+      "epoch": 1.4072164948453607,
+      "grad_norm": 0.20904873311519623,
+      "learning_rate": 4.9482669902549894e-05,
+      "loss": 0.4123,
+      "step": 273
+    },
+    {
+      "epoch": 1.4123711340206184,
+      "grad_norm": 0.16577503085136414,
+      "learning_rate": 4.87100722594094e-05,
+      "loss": 0.4153,
+      "step": 274
+    },
+    {
+      "epoch": 1.4175257731958764,
+      "grad_norm": 0.16849367320537567,
+      "learning_rate": 4.794160692563917e-05,
+      "loss": 0.4011,
+      "step": 275
+    },
+    {
+      "epoch": 1.422680412371134,
+      "grad_norm": 0.15250536799430847,
+      "learning_rate": 4.717733581469157e-05,
+      "loss": 0.4058,
+      "step": 276
+    },
+    {
+      "epoch": 1.4278350515463918,
+      "grad_norm": 0.20221127569675446,
+      "learning_rate": 4.6417320502100316e-05,
+      "loss": 0.4217,
+      "step": 277
+    },
+    {
+      "epoch": 1.4329896907216495,
+      "grad_norm": 0.17506621778011322,
+      "learning_rate": 4.566162222051946e-05,
+      "loss": 0.4042,
+      "step": 278
+    },
+    {
+      "epoch": 1.4381443298969072,
+      "grad_norm": 0.15628258883953094,
+      "learning_rate": 4.491030185478976e-05,
+      "loss": 0.41,
+      "step": 279
+    },
+    {
+      "epoch": 1.443298969072165,
+      "grad_norm": 0.166415736079216,
+      "learning_rate": 4.416341993703373e-05,
+      "loss": 0.3997,
+      "step": 280
+    },
+    {
+      "epoch": 1.4484536082474226,
+      "grad_norm": 0.167769655585289,
+      "learning_rate": 4.3421036641778556e-05,
+      "loss": 0.4013,
+      "step": 281
+    },
+    {
+      "epoch": 1.4536082474226804,
+      "grad_norm": 0.15038897097110748,
+      "learning_rate": 4.268321178110779e-05,
+      "loss": 0.4069,
+      "step": 282
+    },
+    {
+      "epoch": 1.458762886597938,
+      "grad_norm": 0.1389695107936859,
+      "learning_rate": 4.195000479984265e-05,
+      "loss": 0.4053,
+      "step": 283
+    },
+    {
+      "epoch": 1.463917525773196,
+      "grad_norm": 0.17076821625232697,
+      "learning_rate": 4.12214747707527e-05,
+      "loss": 0.4032,
+      "step": 284
+    },
+    {
+      "epoch": 1.4690721649484537,
+      "grad_norm": 0.18228599429130554,
+      "learning_rate": 4.049768038979631e-05,
+      "loss": 0.4192,
+      "step": 285
+    },
+    {
+      "epoch": 1.4742268041237114,
+      "grad_norm": 0.1899213343858719,
+      "learning_rate": 3.9778679971391785e-05,
+      "loss": 0.3979,
+      "step": 286
+    },
+    {
+      "epoch": 1.4793814432989691,
+      "grad_norm": 0.16366295516490936,
+      "learning_rate": 3.90645314437192e-05,
+      "loss": 0.4178,
+      "step": 287
+    },
+    {
+      "epoch": 1.4845360824742269,
+      "grad_norm": 0.2040897011756897,
+      "learning_rate": 3.8355292344053026e-05,
+      "loss": 0.4047,
+      "step": 288
+    },
+    {
+      "epoch": 1.4896907216494846,
+      "grad_norm": 0.1330062597990036,
+      "learning_rate": 3.7651019814126654e-05,
+      "loss": 0.4019,
+      "step": 289
+    },
+    {
+      "epoch": 1.4948453608247423,
+      "grad_norm": 0.19464842975139618,
+      "learning_rate": 3.695177059552861e-05,
+      "loss": 0.4129,
+      "step": 290
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.14035341143608093,
+      "learning_rate": 3.6257601025131026e-05,
+      "loss": 0.4247,
+      "step": 291
+    },
+    {
+      "epoch": 1.5051546391752577,
+      "grad_norm": 0.19228717684745789,
+      "learning_rate": 3.556856703055058e-05,
+      "loss": 0.3981,
+      "step": 292
+    },
+    {
+      "epoch": 1.5103092783505154,
+      "grad_norm": 0.15985019505023956,
+      "learning_rate": 3.488472412564264e-05,
+      "loss": 0.4231,
+      "step": 293
+    },
+    {
+      "epoch": 1.5154639175257731,
+      "grad_norm": 0.19275085628032684,
+      "learning_rate": 3.4206127406028745e-05,
+      "loss": 0.4193,
+      "step": 294
+    },
+    {
+      "epoch": 1.5206185567010309,
+      "grad_norm": 0.16484279930591583,
+      "learning_rate": 3.353283154465746e-05,
+      "loss": 0.4158,
+      "step": 295
+    },
+    {
+      "epoch": 1.5257731958762886,
+      "grad_norm": 0.1540604829788208,
+      "learning_rate": 3.28648907873996e-05,
+      "loss": 0.4073,
+      "step": 296
+    },
+    {
+      "epoch": 1.5309278350515463,
+      "grad_norm": 0.1933484673500061,
+      "learning_rate": 3.220235894867794e-05,
+      "loss": 0.3935,
+      "step": 297
+    },
+    {
+      "epoch": 1.536082474226804,
+      "grad_norm": 0.1472434252500534,
+      "learning_rate": 3.154528940713113e-05,
+      "loss": 0.4149,
+      "step": 298
+    },
+    {
+      "epoch": 1.5412371134020617,
+      "grad_norm": 0.1777832806110382,
+      "learning_rate": 3.089373510131354e-05,
+      "loss": 0.4055,
+      "step": 299
+    },
+    {
+      "epoch": 1.5463917525773194,
+      "grad_norm": 0.1485530436038971,
+      "learning_rate": 3.0247748525429787e-05,
+      "loss": 0.4213,
+      "step": 300
+    },
+    {
+      "epoch": 1.5515463917525774,
+      "grad_norm": 0.1698344647884369,
+      "learning_rate": 2.960738172510551e-05,
+      "loss": 0.4146,
+      "step": 301
+    },
+    {
+      "epoch": 1.556701030927835,
+      "grad_norm": 0.15248975157737732,
+      "learning_rate": 2.8972686293194308e-05,
+      "loss": 0.4111,
+      "step": 302
+    },
+    {
+      "epoch": 1.5618556701030928,
+      "grad_norm": 0.1573648601770401,
+      "learning_rate": 2.8343713365620772e-05,
+      "loss": 0.4034,
+      "step": 303
+    },
+    {
+      "epoch": 1.5670103092783505,
+      "grad_norm": 0.14649935066699982,
+      "learning_rate": 2.7720513617260856e-05,
+      "loss": 0.4173,
+      "step": 304
+    },
+    {
+      "epoch": 1.5721649484536082,
+      "grad_norm": 0.1425144374370575,
+      "learning_rate": 2.7103137257858868e-05,
+      "loss": 0.3941,
+      "step": 305
+    },
+    {
+      "epoch": 1.577319587628866,
+      "grad_norm": 0.1503569483757019,
+      "learning_rate": 2.6491634027982325e-05,
+      "loss": 0.3953,
+      "step": 306
+    },
+    {
+      "epoch": 1.5824742268041239,
+      "grad_norm": 0.1541711837053299,
+      "learning_rate": 2.5886053195014538e-05,
+      "loss": 0.3977,
+      "step": 307
+    },
+    {
+      "epoch": 1.5876288659793816,
+      "grad_norm": 0.15536653995513916,
+      "learning_rate": 2.5286443549185036e-05,
+      "loss": 0.4068,
+      "step": 308
+    },
+    {
+      "epoch": 1.5927835051546393,
+      "grad_norm": 0.1571768969297409,
+      "learning_rate": 2.4692853399638917e-05,
+      "loss": 0.4132,
+      "step": 309
+    },
+    {
+      "epoch": 1.597938144329897,
+      "grad_norm": 0.13989152014255524,
+      "learning_rate": 2.410533057054446e-05,
+      "loss": 0.4113,
+      "step": 310
+    },
+    {
+      "epoch": 1.6030927835051547,
+      "grad_norm": 0.15358886122703552,
+      "learning_rate": 2.352392239724016e-05,
+      "loss": 0.3957,
+      "step": 311
+    },
+    {
+      "epoch": 1.6082474226804124,
+      "grad_norm": 0.1443978250026703,
+      "learning_rate": 2.2948675722421086e-05,
+      "loss": 0.4077,
+      "step": 312
+    },
+    {
+      "epoch": 1.6134020618556701,
+      "grad_norm": 0.13177341222763062,
+      "learning_rate": 2.237963689236472e-05,
+      "loss": 0.4037,
+      "step": 313
+    },
+    {
+      "epoch": 1.6185567010309279,
+      "grad_norm": 0.12739062309265137,
+      "learning_rate": 2.181685175319702e-05,
+      "loss": 0.4096,
+      "step": 314
+    },
+    {
+      "epoch": 1.6237113402061856,
+      "grad_norm": 0.14786216616630554,
+      "learning_rate": 2.1260365647198798e-05,
+      "loss": 0.4227,
+      "step": 315
+    },
+    {
+      "epoch": 1.6288659793814433,
+      "grad_norm": 0.1393991857767105,
+      "learning_rate": 2.0710223409152475e-05,
+      "loss": 0.4044,
+      "step": 316
+    },
+    {
+      "epoch": 1.634020618556701,
+      "grad_norm": 0.14007160067558289,
+      "learning_rate": 2.016646936272987e-05,
+      "loss": 0.417,
+      "step": 317
+    },
+    {
+      "epoch": 1.6391752577319587,
+      "grad_norm": 0.13872084021568298,
+      "learning_rate": 1.9629147316921125e-05,
+      "loss": 0.4049,
+      "step": 318
+    },
+    {
+      "epoch": 1.6443298969072164,
+      "grad_norm": 0.12924394011497498,
+      "learning_rate": 1.9098300562505266e-05,
+      "loss": 0.3913,
+      "step": 319
+    },
+    {
+      "epoch": 1.6494845360824741,
+      "grad_norm": 0.14139704406261444,
+      "learning_rate": 1.8573971868562156e-05,
+      "loss": 0.4074,
+      "step": 320
+    },
+    {
+      "epoch": 1.6546391752577319,
+      "grad_norm": 0.12420302629470825,
+      "learning_rate": 1.805620347902681e-05,
+      "loss": 0.4064,
+      "step": 321
+    },
+    {
+      "epoch": 1.6597938144329896,
+      "grad_norm": 0.1246916800737381,
+      "learning_rate": 1.7545037109285946e-05,
+      "loss": 0.3981,
+      "step": 322
+    },
+    {
+      "epoch": 1.6649484536082473,
+      "grad_norm": 0.12461568415164948,
+      "learning_rate": 1.7040513942816906e-05,
+      "loss": 0.4146,
+      "step": 323
+    },
+    {
+      "epoch": 1.670103092783505,
+      "grad_norm": 0.13187824189662933,
+      "learning_rate": 1.6542674627869737e-05,
+      "loss": 0.4129,
+      "step": 324
+    },
+    {
+      "epoch": 1.675257731958763,
+      "grad_norm": 0.1405288428068161,
+      "learning_rate": 1.6051559274192275e-05,
+      "loss": 0.4022,
+      "step": 325
+    },
+    {
+      "epoch": 1.6804123711340206,
+      "grad_norm": 0.12159695476293564,
+      "learning_rate": 1.5567207449798515e-05,
+      "loss": 0.3853,
+      "step": 326
+    },
+    {
+      "epoch": 1.6855670103092784,
+      "grad_norm": 0.14212271571159363,
+      "learning_rate": 1.5089658177780653e-05,
+      "loss": 0.3946,
+      "step": 327
+    },
+    {
+      "epoch": 1.690721649484536,
+      "grad_norm": 0.13337042927742004,
+      "learning_rate": 1.4618949933165272e-05,
+      "loss": 0.4028,
+      "step": 328
+    },
+    {
+      "epoch": 1.6958762886597938,
+      "grad_norm": 0.1317555457353592,
+      "learning_rate": 1.415512063981339e-05,
+      "loss": 0.4213,
+      "step": 329
+    },
+    {
+      "epoch": 1.7010309278350515,
+      "grad_norm": 0.12092739343643188,
+      "learning_rate": 1.3698207667364982e-05,
+      "loss": 0.4084,
+      "step": 330
+    },
+    {
+      "epoch": 1.7061855670103094,
+      "grad_norm": 0.11935717612504959,
+      "learning_rate": 1.3248247828228245e-05,
+      "loss": 0.4098,
+      "step": 331
+    },
+    {
+      "epoch": 1.7113402061855671,
+      "grad_norm": 0.12374074757099152,
+      "learning_rate": 1.2805277374613744e-05,
+      "loss": 0.3972,
+      "step": 332
+    },
+    {
+      "epoch": 1.7164948453608249,
+      "grad_norm": 0.13308891654014587,
+      "learning_rate": 1.2369331995613665e-05,
+      "loss": 0.4101,
+      "step": 333
+    },
+    {
+      "epoch": 1.7216494845360826,
+      "grad_norm": 0.13691888749599457,
+      "learning_rate": 1.19404468143262e-05,
+      "loss": 0.4009,
+      "step": 334
+    },
+    {
+      "epoch": 1.7268041237113403,
+      "grad_norm": 0.12358833104372025,
+      "learning_rate": 1.151865638502615e-05,
+      "loss": 0.4033,
+      "step": 335
+    },
+    {
+      "epoch": 1.731958762886598,
+      "grad_norm": 0.12735094130039215,
+      "learning_rate": 1.1103994690380681e-05,
+      "loss": 0.4003,
+      "step": 336
+    },
+    {
+      "epoch": 1.7371134020618557,
+      "grad_norm": 0.12450878322124481,
+      "learning_rate": 1.069649513871147e-05,
+      "loss": 0.4087,
+      "step": 337
+    },
+    {
+      "epoch": 1.7422680412371134,
+      "grad_norm": 0.11977449804544449,
+      "learning_rate": 1.0296190561303132e-05,
+      "loss": 0.4094,
+      "step": 338
+    },
+    {
+      "epoch": 1.7474226804123711,
+      "grad_norm": 0.11960332095623016,
+      "learning_rate": 9.903113209758096e-06,
+      "loss": 0.385,
+      "step": 339
+    },
+    {
+      "epoch": 1.7525773195876289,
+      "grad_norm": 0.11467370390892029,
+      "learning_rate": 9.517294753398064e-06,
+      "loss": 0.4061,
+      "step": 340
+    },
+    {
+      "epoch": 1.7577319587628866,
+      "grad_norm": 0.11368629336357117,
+      "learning_rate": 9.138766276712552e-06,
+      "loss": 0.4082,
+      "step": 341
+    },
+    {
+      "epoch": 1.7628865979381443,
+      "grad_norm": 0.10669873654842377,
+      "learning_rate": 8.767558276854549e-06,
+      "loss": 0.3872,
+      "step": 342
+    },
+    {
+      "epoch": 1.768041237113402,
+      "grad_norm": 0.13322988152503967,
+      "learning_rate": 8.403700661183355e-06,
+      "loss": 0.4203,
+      "step": 343
+    },
+    {
+      "epoch": 1.7731958762886597,
+      "grad_norm": 0.11445485055446625,
+      "learning_rate": 8.047222744854943e-06,
+      "loss": 0.4063,
+      "step": 344
+    },
+    {
+      "epoch": 1.7783505154639174,
+      "grad_norm": 0.11018280684947968,
+      "learning_rate": 7.698153248460271e-06,
+      "loss": 0.4087,
+      "step": 345
+    },
+    {
+      "epoch": 1.7835051546391751,
+      "grad_norm": 0.12875722348690033,
+      "learning_rate": 7.3565202957112555e-06,
+      "loss": 0.4157,
+      "step": 346
+    },
+    {
+      "epoch": 1.7886597938144329,
+      "grad_norm": 0.10996179282665253,
+      "learning_rate": 7.022351411174866e-06,
+      "loss": 0.3904,
+      "step": 347
+    },
+    {
+      "epoch": 1.7938144329896906,
+      "grad_norm": 0.10373340547084808,
+      "learning_rate": 6.695673518055579e-06,
+      "loss": 0.3978,
+      "step": 348
+    },
+    {
+      "epoch": 1.7989690721649485,
+      "grad_norm": 0.10824684053659439,
+      "learning_rate": 6.37651293602628e-06,
+      "loss": 0.4106,
+      "step": 349
+    },
+    {
+      "epoch": 1.8041237113402062,
+      "grad_norm": 0.11701569706201553,
+      "learning_rate": 6.06489537910766e-06,
+      "loss": 0.4079,
+      "step": 350
+    },
+    {
+      "epoch": 1.809278350515464,
+      "grad_norm": 0.10616633296012878,
+      "learning_rate": 5.760845953596527e-06,
+      "loss": 0.4044,
+      "step": 351
+    },
+    {
+      "epoch": 1.8144329896907216,
+      "grad_norm": 0.11716404557228088,
+      "learning_rate": 5.464389156043115e-06,
+      "loss": 0.3914,
+      "step": 352
+    },
+    {
+      "epoch": 1.8195876288659794,
+      "grad_norm": 0.16754718124866486,
+      "learning_rate": 5.175548871277358e-06,
+      "loss": 0.4058,
+      "step": 353
+    },
+    {
+      "epoch": 1.824742268041237,
+      "grad_norm": 0.1094307005405426,
+      "learning_rate": 4.8943483704846475e-06,
+      "loss": 0.4046,
+      "step": 354
+    },
+    {
+      "epoch": 1.829896907216495,
+      "grad_norm": 0.09927380084991455,
+      "learning_rate": 4.620810309330803e-06,
+      "loss": 0.399,
+      "step": 355
+    },
+    {
+      "epoch": 1.8350515463917527,
+      "grad_norm": 0.11139936745166779,
+      "learning_rate": 4.35495672613685e-06,
+      "loss": 0.432,
+      "step": 356
+    },
+    {
+      "epoch": 1.8402061855670104,
+      "grad_norm": 0.10676106810569763,
+      "learning_rate": 4.096809040103444e-06,
+      "loss": 0.4039,
+      "step": 357
+    },
+    {
+      "epoch": 1.8453608247422681,
+      "grad_norm": 0.1022983193397522,
+      "learning_rate": 3.8463880495851146e-06,
+      "loss": 0.4045,
+      "step": 358
+    },
+    {
+      "epoch": 1.8505154639175259,
+      "grad_norm": 0.11308566480875015,
+      "learning_rate": 3.6037139304146762e-06,
+      "loss": 0.408,
+      "step": 359
+    },
+    {
+      "epoch": 1.8556701030927836,
+      "grad_norm": 0.10805762559175491,
+      "learning_rate": 3.3688062342776106e-06,
+      "loss": 0.409,
+      "step": 360
+    },
+    {
+      "epoch": 1.8608247422680413,
+      "grad_norm": 0.10656285285949707,
+      "learning_rate": 3.1416838871368924e-06,
+      "loss": 0.4061,
+      "step": 361
+    },
+    {
+      "epoch": 1.865979381443299,
+      "grad_norm": 0.10592852532863617,
+      "learning_rate": 2.922365187708187e-06,
+      "loss": 0.3867,
+      "step": 362
+    },
+    {
+      "epoch": 1.8711340206185567,
+      "grad_norm": 0.10599105805158615,
+      "learning_rate": 2.7108678059855065e-06,
+      "loss": 0.3954,
+      "step": 363
+    },
+    {
+      "epoch": 1.8762886597938144,
+      "grad_norm": 0.10687398910522461,
+      "learning_rate": 2.5072087818176382e-06,
+      "loss": 0.4086,
+      "step": 364
+    },
+    {
+      "epoch": 1.8814432989690721,
+      "grad_norm": 0.099769227206707,
+      "learning_rate": 2.311404523535243e-06,
+      "loss": 0.4065,
+      "step": 365
+    },
+    {
+      "epoch": 1.8865979381443299,
+      "grad_norm": 0.10475780069828033,
+      "learning_rate": 2.123470806628858e-06,
+      "loss": 0.4132,
+      "step": 366
+    },
+    {
+      "epoch": 1.8917525773195876,
+      "grad_norm": 0.10736888647079468,
+      "learning_rate": 1.9434227724779984e-06,
+      "loss": 0.4243,
+      "step": 367
+    },
+    {
+      "epoch": 1.8969072164948453,
+      "grad_norm": 0.09772989898920059,
+      "learning_rate": 1.771274927131139e-06,
+      "loss": 0.4091,
+      "step": 368
+    },
+    {
+      "epoch": 1.902061855670103,
+      "grad_norm": 0.10204560309648514,
+      "learning_rate": 1.6070411401370334e-06,
+      "loss": 0.4186,
+      "step": 369
+    },
+    {
+      "epoch": 1.9072164948453607,
+      "grad_norm": 0.10475929826498032,
+      "learning_rate": 1.4507346434273316e-06,
+      "loss": 0.4109,
+      "step": 370
+    },
+    {
+      "epoch": 1.9123711340206184,
+      "grad_norm": 0.09802843630313873,
+      "learning_rate": 1.3023680302504338e-06,
+      "loss": 0.4034,
+      "step": 371
+    },
+    {
+      "epoch": 1.9175257731958761,
+      "grad_norm": 0.10157228261232376,
+      "learning_rate": 1.1619532541569333e-06,
+      "loss": 0.4132,
+      "step": 372
+    },
+    {
+      "epoch": 1.922680412371134,
+      "grad_norm": 0.09992757439613342,
+      "learning_rate": 1.0295016280365112e-06,
+      "loss": 0.4092,
+      "step": 373
+    },
+    {
+      "epoch": 1.9278350515463918,
+      "grad_norm": 0.10680104792118073,
+      "learning_rate": 9.0502382320653e-07,
+      "loss": 0.4052,
+      "step": 374
+    },
+    {
+      "epoch": 1.9329896907216495,
+      "grad_norm": 0.10894012451171875,
+      "learning_rate": 7.885298685522235e-07,
+      "loss": 0.4106,
+      "step": 375
+    },
+    {
+      "epoch": 1.9381443298969072,
+      "grad_norm": 0.09763751178979874,
+      "learning_rate": 6.800291497187083e-07,
+      "loss": 0.4103,
+      "step": 376
+    },
+    {
+      "epoch": 1.943298969072165,
+      "grad_norm": 0.10077156871557236,
+      "learning_rate": 5.795304083548559e-07,
+      "loss": 0.3978,
+      "step": 377
+    },
+    {
+      "epoch": 1.9484536082474226,
+      "grad_norm": 0.09721335768699646,
+      "learning_rate": 4.870417414088779e-07,
+      "loss": 0.4136,
+      "step": 378
+    },
+    {
+      "epoch": 1.9536082474226806,
+      "grad_norm": 0.11184490472078323,
+      "learning_rate": 4.025706004760932e-07,
+      "loss": 0.4002,
+      "step": 379
+    },
+    {
+      "epoch": 1.9587628865979383,
+      "grad_norm": 0.10041658580303192,
+      "learning_rate": 3.261237911985404e-07,
+      "loss": 0.4023,
+      "step": 380
+    },
+    {
+      "epoch": 1.963917525773196,
+      "grad_norm": 0.09845972061157227,
+      "learning_rate": 2.577074727165951e-07,
+      "loss": 0.4063,
+      "step": 381
+    },
+    {
+      "epoch": 1.9690721649484537,
+      "grad_norm": 0.1027616560459137,
+      "learning_rate": 1.973271571728441e-07,
+      "loss": 0.4114,
+      "step": 382
+    },
+    {
+      "epoch": 1.9742268041237114,
+      "grad_norm": 0.0927368700504303,
+      "learning_rate": 1.449877092679075e-07,
+      "loss": 0.3864,
+      "step": 383
+    },
+    {
+      "epoch": 1.9793814432989691,
+      "grad_norm": 0.09774715453386307,
+      "learning_rate": 1.0069334586854107e-07,
+      "loss": 0.4123,
+      "step": 384
+    },
+    {
+      "epoch": 1.9845360824742269,
+      "grad_norm": 0.09442637115716934,
+      "learning_rate": 6.444763566786361e-08,
+      "loss": 0.3879,
+      "step": 385
+    },
+    {
+      "epoch": 1.9896907216494846,
+      "grad_norm": 0.09873173385858536,
+      "learning_rate": 3.6253498897886873e-08,
+      "loss": 0.4049,
+      "step": 386
+    },
+    {
+      "epoch": 1.9948453608247423,
+      "grad_norm": 0.10053720325231552,
+      "learning_rate": 1.6113207094181626e-08,
+      "loss": 0.3931,
+      "step": 387
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.09618926793336868,
+      "learning_rate": 4.028382912890649e-09,
+      "loss": 0.4088,
+      "step": 388
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 388,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.5133457970584617e+19,
+  "train_batch_size": 24,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1a8ff49ec3fd81ebf5bee49cc1ea2236a25c4cb433b8d002f13c1b9fbc7336f9
+size 15032