vishnuOI committed on
Commit 7fc60f6 · verified · 1 Parent(s): 4c3642f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,209 @@
+ ---
+ base_model: Qwen/Qwen3-Coder-30B-A3B-Instruct
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:Qwen/Qwen3-Coder-30B-A3B-Instruct
+ - lora
+ - sft
+ - transformers
+ - trl
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
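The card leaves this section blank. As a placeholder, here is a minimal sketch of loading a PEFT LoRA adapter on top of the stated base model; the adapter repo id is left as a parameter because the card does not name one, and the `load_adapter` helper is purely illustrative:

```python
def load_adapter(adapter_id: str):
    """Load the Qwen3-Coder base model and attach a LoRA adapter.

    Imports are deferred so the sketch can be read without
    transformers/peft installed; `adapter_id` is the Hub repo id of
    this adapter (not stated in the card, so passed in by the caller).
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    base_id = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(base_id)
    base = AutoModelForCausalLM.from_pretrained(base_id, device_map="auto")
    model = PeftModel.from_pretrained(base, adapter_id)
    return model, tokenizer
```

Usage would be `model, tok = load_adapter("<user>/<adapter-repo>")`, after which generation proceeds as with any causal LM.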
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.18.1
adapter_config.json ADDED
@@ -0,0 +1,46 @@
+ {
+   "alora_invocation_tokens": null,
+   "alpha_pattern": {},
+   "arrow_config": null,
+   "auto_mapping": null,
+   "base_model_name_or_path": "Qwen/Qwen3-Coder-30B-A3B-Instruct",
+   "bias": "none",
+   "corda_config": null,
+   "ensure_weight_tying": false,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 32,
+   "lora_bias": false,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "peft_version": "0.18.1",
+   "qalora_group_size": 16,
+   "r": 16,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "o_proj",
+     "q_proj",
+     "gate_proj",
+     "v_proj",
+     "k_proj",
+     "up_proj",
+     "down_proj"
+   ],
+   "target_parameters": null,
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_qalora": false,
+   "use_rslora": false
+ }
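The config above specifies `r: 16` and `lora_alpha: 32`, i.e. a scaling factor of alpha/r = 2 applied to the low-rank update on every listed projection. A tiny pure-Python sketch of the LoRA update rule this describes (the 2×2 matrices here are toy values, not the adapter's real weights):

```python
# LoRA merges the adapter into a frozen weight as:
#   W_eff = W + (lora_alpha / r) * (B @ A)
# where B is (out, r) and A is (r, in). Values below are toys.
r, lora_alpha = 16, 32          # from adapter_config.json
scaling = lora_alpha / r        # = 2.0

def matmul(X, Y):
    """Plain-Python matrix multiply, enough for the toy example."""
    return [[sum(X[i][k] * Y[k][j] for k in range(len(Y)))
             for j in range(len(Y[0]))] for i in range(len(X))]

W = [[1.0, 0.0], [0.0, 1.0]]    # frozen base weight (toy identity)
B = [[0.5], [0.0]]              # low-rank factor, rank 1 for the toy
A = [[0.0, 1.0]]

delta = matmul(B, A)            # [[0.0, 0.5], [0.0, 0.0]]
W_eff = [[W[i][j] + scaling * delta[i][j] for j in range(2)]
         for i in range(2)]     # [[1.0, 1.0], [0.0, 1.0]]
```

With `use_rslora: false`, the scaling is exactly alpha/r; rsLoRA would use alpha/sqrt(r) instead.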
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ec62876de622d14dedb4da963455955477c39c9268502906c582db292796dfc
+ size 53528920
chat_template.jinja ADDED
@@ -0,0 +1,117 @@
+ {% macro render_extra_keys(json_dict, handled_keys) %}
+ {%- if json_dict is mapping %}
+ {%- for json_key in json_dict if json_key not in handled_keys %}
+ {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
+ {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
+ {%- else %}
+ {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
+ {%- endif %}
+ {%- endfor %}
+ {%- endif %}
+ {% endmacro %}
+
+ {%- if messages[0]["role"] == "system" %}
+ {%- set system_message = messages[0]["content"] %}
+ {%- set loop_messages = messages[1:] %}
+ {%- else %}
+ {%- set loop_messages = messages %}
+ {%- endif %}
+
+ {%- if not tools is defined %}
+ {%- set tools = [] %}
+ {%- endif %}
+
+ {%- if system_message is defined %}
+ {{- "<|im_start|>system\n" + system_message }}
+ {%- else %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }}
+ {%- endif %}
+ {%- endif %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }}
+ {{- "<tools>" }}
+ {%- for tool in tools %}
+ {%- if tool.function is defined %}
+ {%- set tool = tool.function %}
+ {%- endif %}
+ {{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
+ {%- if tool.description is defined %}
+ {{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
+ {%- endif %}
+ {{- '\n<parameters>' }}
+ {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
+ {%- for param_name, param_fields in tool.parameters.properties|items %}
+ {{- '\n<parameter>' }}
+ {{- '\n<name>' ~ param_name ~ '</name>' }}
+ {%- if param_fields.type is defined %}
+ {{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
+ {%- endif %}
+ {%- if param_fields.description is defined %}
+ {{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
+ {%- endif %}
+ {%- set handled_keys = ['name', 'type', 'description'] %}
+ {{- render_extra_keys(param_fields, handled_keys) }}
+ {{- '\n</parameter>' }}
+ {%- endfor %}
+ {%- endif %}
+ {% set handled_keys = ['type', 'properties'] %}
+ {{- render_extra_keys(tool.parameters, handled_keys) }}
+ {{- '\n</parameters>' }}
+ {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
+ {{- render_extra_keys(tool, handled_keys) }}
+ {{- '\n</function>' }}
+ {%- endfor %}
+ {{- "\n</tools>" }}
+ {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
+ {%- endif %}
+ {%- if system_message is defined %}
+ {{- '<|im_end|>\n' }}
+ {%- else %}
+ {%- if tools is iterable and tools | length > 0 %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+ {%- for message in loop_messages %}
+ {%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
+ {{- '<|im_start|>' + message.role }}
+ {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
+ {{- '\n' + message.content | trim + '\n' }}
+ {%- endif %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if tool_call.function is defined %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
+ {%- if tool_call.arguments is defined %}
+ {%- for args_name, args_value in tool_call.arguments|items %}
+ {{- '<parameter=' + args_name + '>\n' }}
+ {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
+ {{- args_value }}
+ {{- '\n</parameter>\n' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '</function>\n</tool_call>' }}
+ {%- endfor %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.previtem and loop.previtem.role != "tool" %}
+ {{- '<|im_start|>user\n' }}
+ {%- endif %}
+ {{- '<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>\n' }}
+ {%- if not loop.last and loop.nextitem.role != "tool" %}
+ {{- '<|im_end|>\n' }}
+ {%- elif loop.last %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endfor %}
+ {%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- endif %}
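For plain system/user/assistant turns with no tools, the template above reduces to the familiar ChatML-style `<|im_start|>…<|im_end|>` layout. A minimal pure-Python sketch of that reduced case (an approximation for illustration, not a substitute for rendering the actual Jinja template, which also handles tools and tool responses):

```python
def render_plain(messages, add_generation_prompt=True):
    """Approximate the template's output for tool-free chats:
    each message becomes '<|im_start|>{role}\\n{content}<|im_end|>\\n',
    optionally followed by an open assistant turn."""
    out = []
    for m in messages:
        out.append(f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>\n")
    if add_generation_prompt:
        out.append("<|im_start|>assistant\n")
    return "".join(out)

prompt = render_plain([
    {"role": "system", "content": "You are Qwen."},
    {"role": "user", "content": "Hi"},
])
```

In practice one would call `tokenizer.apply_chat_template(messages, add_generation_prompt=True)` and let this Jinja file do the work.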
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:66efe6d162462dccc7ba946f706e7453625246b79b78e9f8b890eb06e8ce8379
+ size 27614166
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bbe6c9dc1ffcec3fabe9e60a4178bdb8df2583340910df5d99436b9812079d05
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b215a27550f7e6212a372d3c82b0189b243e058c2f2aefe6a4194883d88ccdb6
+ size 1064
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+ size 11422650
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "add_prefix_space": false,
+   "backend": "tokenizers",
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "is_local": false,
+   "model_max_length": 2048,
+   "pad_token": "<|im_end|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
trainer_state.json ADDED
@@ -0,0 +1,1075 @@
+ {
+   "best_global_step": 1038,
+   "best_metric": 1.032158374786377,
+   "best_model_checkpoint": "C:\\unity_train\\output\\unity-coder-adapter\\checkpoint-1038",
+   "epoch": 1.0,
+   "eval_steps": 500,
+   "global_step": 1038,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [ ... ]
+ }

The `log_history` entries visible in this diff (one per 10 steps; the excerpt cuts off mid-entry after step 480) are tabulated below:

| step | epoch | learning_rate | loss | entropy | grad_norm | mean_token_accuracy | num_tokens |
|---:|---:|---:|---:|---:|---:|---:|---:|
| 10 | 0.0096 | 1.1538e-05 | 2.4208 | 1.0812 | 0.3574 | 0.5672 | 59463 |
| 20 | 0.0193 | 2.4359e-05 | 2.3048 | 1.0597 | 0.2625 | 0.5808 | 122441 |
| 30 | 0.0289 | 3.7179e-05 | 2.1501 | 1.1489 | 0.2173 | 0.5763 | 178300 |
| 40 | 0.0385 | 5.0000e-05 | 1.9430 | 1.3246 | 0.0924 | 0.5952 | 240279 |
| 50 | 0.0482 | 6.2821e-05 | 1.8167 | 1.5482 | 0.0766 | 0.6045 | 302914 |
| 60 | 0.0578 | 7.5641e-05 | 1.7056 | 1.6716 | 0.0951 | 0.6348 | 351888 |
| 70 | 0.0675 | 8.8462e-05 | 1.5318 | 1.5196 | 0.0705 | 0.6589 | 414923 |
| 80 | 0.0771 | 1.0128e-04 | 1.4824 | 1.5119 | 0.0717 | 0.6631 | 468593 |
| 90 | 0.0867 | 1.1410e-04 | 1.4135 | 1.3805 | 0.1078 | 0.6830 | 525914 |
| 100 | 0.0964 | 1.2692e-04 | 1.5127 | 1.4891 | 0.0563 | 0.6583 | 589835 |
| 110 | 0.1060 | 1.3974e-04 | 1.4460 | 1.4281 | 0.0555 | 0.6683 | 648138 |
| 120 | 0.1156 | 1.5256e-04 | 1.4434 | 1.3978 | 0.0547 | 0.6743 | 702219 |
| 130 | 0.1253 | 1.6538e-04 | 1.4284 | 1.4055 | 0.0613 | 0.6733 | 759042 |
| 140 | 0.1349 | 1.7821e-04 | 1.4834 | 1.4527 | 0.0566 | 0.6670 | 822515 |
| 150 | 0.1445 | 1.9103e-04 | 1.2549 | 1.2200 | 0.0600 | 0.7205 | 876506 |
| 160 | 0.1542 | 1.9980e-04 | 1.3840 | 1.3374 | 0.0777 | 0.6906 | 934413 |
| 170 | 0.1638 | 1.9912e-04 | 1.2861 | 1.2752 | 0.0637 | 0.7052 | 992594 |
| 180 | 0.1735 | 1.9844e-04 | 1.3741 | 1.3504 | 0.0728 | 0.6862 | 1048524 |
| 190 | 0.1831 | 1.9777e-04 | 1.2996 | 1.2644 | 0.0714 | 0.7060 | 1109157 |
| 200 | 0.1927 | 1.9709e-04 | 1.3253 | 1.3248 | 0.0821 | 0.7001 | 1170070 |
| 210 | 0.2024 | 1.9642e-04 | 1.3353 | 1.2683 | 0.0552 | 0.7003 | 1229436 |
| 220 | 0.2120 | 1.9574e-04 | 1.3542 | 1.3478 | 0.0649 | 0.6923 | 1287532 |
| 230 | 0.2216 | 1.9506e-04 | 1.2282 | 1.2182 | 0.0641 | 0.7190 | 1344357 |
| 240 | 0.2313 | 1.9439e-04 | 1.3229 | 1.3015 | 0.0572 | 0.6986 | 1405209 |
| 250 | 0.2409 | 1.9371e-04 | 1.2808 | 1.2271 | 0.0576 | 0.7116 | 1457023 |
| 260 | 0.2505 | 1.9304e-04 | 1.3038 | 1.2857 | 0.0627 | 0.7055 | 1513975 |
| 270 | 0.2602 | 1.9236e-04 | 1.2614 | 1.2287 | 0.0609 | 0.7095 | 1579252 |
| 280 | 0.2698 | 1.9168e-04 | 1.2714 | 1.2551 | 0.0687 | 0.7122 | 1637995 |
| 290 | 0.2795 | 1.9101e-04 | 1.3028 | 1.2518 | 0.0706 | 0.7063 | 1703008 |
| 300 | 0.2891 | 1.9033e-04 | 1.3015 | 1.2447 | 0.0518 | 0.7073 | 1766072 |
| 310 | 0.2987 | 1.8966e-04 | 1.3117 | 1.3016 | 0.0565 | 0.7029 | 1820923 |
| 320 | 0.3084 | 1.8898e-04 | 1.2251 | 1.1577 | 0.0547 | 0.7274 | 1877116 |
| 330 | 0.3180 | 1.8830e-04 | 1.2446 | 1.2059 | 0.0575 | 0.7220 | 1935606 |
| 340 | 0.3276 | 1.8763e-04 | 1.3071 | 1.2587 | 0.0580 | 0.7035 | 1997199 |
| 350 | 0.3373 | 1.8695e-04 | 1.2296 | 1.2236 | 0.0999 | 0.7147 | 2048172 |
| 360 | 0.3469 | 1.8627e-04 | 1.2939 | 1.3036 | 0.0732 | 0.6969 | 2109948 |
| 370 | 0.3565 | 1.8560e-04 | 1.2521 | 1.2124 | 0.0632 | 0.7174 | 2171174 |
| 380 | 0.3662 | 1.8492e-04 | 1.2615 | 1.2408 | 0.0709 | 0.7065 | 2228430 |
| 390 | 0.3758 | 1.8425e-04 | 1.2185 | 1.2121 | 0.0513 | 0.7187 | 2285248 |
| 400 | 0.3854 | 1.8357e-04 | 1.3041 | 1.2625 | 0.0569 | 0.7015 | 2345584 |
| 410 | 0.3951 | 1.8289e-04 | 1.2783 | 1.2300 | 0.0673 | 0.7130 | 2402073 |
| 420 | 0.4047 | 1.8222e-04 | 1.2828 | 1.2533 | 0.1018 | 0.7031 | 2461269 |
| 430 | 0.4144 | 1.8154e-04 | 1.1980 | 1.1999 | 0.0675 | 0.7193 | 2520272 |
| 440 | 0.4240 | 1.8087e-04 | 1.1968 | 1.1967 | 0.0478 | 0.7202 | 2574916 |
| 450 | 0.4336 | 1.8019e-04 | 1.2781 | 1.2822 | 0.0556 | 0.6992 | 2630803 |
| 460 | 0.4433 | 1.7951e-04 | 1.1543 | 1.1061 | 0.0857 | 0.7370 | 2686122 |
| 470 | 0.4529 | 1.7884e-04 | 1.2778 | 1.3041 | 0.0528 | 0.7003 | 2751119 |
| 480 | 0.4625 | 1.7816e-04 | 1.2784 | 1.2362 | 0.0593 | 0.7007 | 2809393 |
| (truncated) | 0.4722 | 1.7748e-04 | 1.2346 | 1.2329 | 0.0571 | 0.7168 | 2870246 |
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 1.197921334207058,
504
+ "epoch": 0.481811611659841,
505
+ "grad_norm": 0.051337700337171555,
506
+ "learning_rate": 0.0001768086544962813,
507
+ "loss": 1.2222982406616212,
508
+ "mean_token_accuracy": 0.7162339583039283,
509
+ "num_tokens": 2934671.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 1.2013787925243378,
514
+ "epoch": 0.4914478438930378,
515
+ "grad_norm": 0.06907735019922256,
516
+ "learning_rate": 0.00017613252197430697,
517
+ "loss": 1.2295709609985352,
518
+ "mean_token_accuracy": 0.7151955872774124,
519
+ "num_tokens": 2989014.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 1.1978842347860337,
524
+ "epoch": 0.5010840761262346,
525
+ "grad_norm": 0.06152976304292679,
526
+ "learning_rate": 0.00017545638945233268,
527
+ "loss": 1.2030101776123048,
528
+ "mean_token_accuracy": 0.7203953012824058,
529
+ "num_tokens": 3045999.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 1.3108120203018188,
534
+ "epoch": 0.5107203083594315,
535
+ "grad_norm": 0.05314662307500839,
536
+ "learning_rate": 0.00017478025693035837,
537
+ "loss": 1.3376919746398925,
538
+ "mean_token_accuracy": 0.6901254534721375,
539
+ "num_tokens": 3106021.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 1.269498337060213,
544
+ "epoch": 0.5203565405926283,
545
+ "grad_norm": 0.052769869565963745,
546
+ "learning_rate": 0.00017410412440838405,
547
+ "loss": 1.3347968101501464,
548
+ "mean_token_accuracy": 0.7055287733674049,
549
+ "num_tokens": 3167001.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 1.174779912084341,
554
+ "epoch": 0.5299927728258251,
555
+ "grad_norm": 0.057087887078523636,
556
+ "learning_rate": 0.00017342799188640974,
557
+ "loss": 1.204134464263916,
558
+ "mean_token_accuracy": 0.7242644309997559,
559
+ "num_tokens": 3222794.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 1.1798104658722877,
564
+ "epoch": 0.5396290050590219,
565
+ "grad_norm": 0.06867323815822601,
566
+ "learning_rate": 0.00017275185936443542,
567
+ "loss": 1.218485164642334,
568
+ "mean_token_accuracy": 0.7220648050308227,
569
+ "num_tokens": 3282856.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 1.2600811369717122,
574
+ "epoch": 0.5492652372922188,
575
+ "grad_norm": 0.06002269685268402,
576
+ "learning_rate": 0.00017207572684246113,
577
+ "loss": 1.2951341629028321,
578
+ "mean_token_accuracy": 0.7052147269248963,
579
+ "num_tokens": 3343572.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 1.2493580430746078,
584
+ "epoch": 0.5589014695254155,
585
+ "grad_norm": 0.06532762199640274,
586
+ "learning_rate": 0.00017139959432048682,
587
+ "loss": 1.2813149452209474,
588
+ "mean_token_accuracy": 0.7067271783947945,
589
+ "num_tokens": 3402401.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 1.174679161608219,
594
+ "epoch": 0.5685377017586124,
595
+ "grad_norm": 0.0596516914665699,
596
+ "learning_rate": 0.0001707234617985125,
597
+ "loss": 1.2225926399230957,
598
+ "mean_token_accuracy": 0.7233842894434929,
599
+ "num_tokens": 3459042.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 1.2576898023486138,
604
+ "epoch": 0.5781739339918092,
605
+ "grad_norm": 0.05735331028699875,
606
+ "learning_rate": 0.0001700473292765382,
607
+ "loss": 1.2835253715515136,
608
+ "mean_token_accuracy": 0.7045948460698128,
609
+ "num_tokens": 3527637.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 1.2473611667752267,
614
+ "epoch": 0.587810166225006,
615
+ "grad_norm": 0.06076724827289581,
616
+ "learning_rate": 0.0001693711967545639,
617
+ "loss": 1.2636917114257813,
618
+ "mean_token_accuracy": 0.7045381426811218,
619
+ "num_tokens": 3589404.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 1.2829708829522133,
624
+ "epoch": 0.5974463984582028,
625
+ "grad_norm": 0.05943462252616882,
626
+ "learning_rate": 0.0001686950642325896,
627
+ "loss": 1.2889452934265138,
628
+ "mean_token_accuracy": 0.7001010566949845,
629
+ "num_tokens": 3651124.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 1.2208770334720611,
634
+ "epoch": 0.6070826306913997,
635
+ "grad_norm": 0.04695171117782593,
636
+ "learning_rate": 0.0001680189317106153,
637
+ "loss": 1.259203052520752,
638
+ "mean_token_accuracy": 0.7110297352075576,
639
+ "num_tokens": 3705293.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 1.301590697467327,
644
+ "epoch": 0.6167188629245964,
645
+ "grad_norm": 0.047615595161914825,
646
+ "learning_rate": 0.00016734279918864098,
647
+ "loss": 1.322781467437744,
648
+ "mean_token_accuracy": 0.6952215626835823,
649
+ "num_tokens": 3765429.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 1.2000589437782765,
654
+ "epoch": 0.6263550951577933,
655
+ "grad_norm": 0.057772569358348846,
656
+ "learning_rate": 0.0001666666666666667,
657
+ "loss": 1.2448016166687013,
658
+ "mean_token_accuracy": 0.7192464172840118,
659
+ "num_tokens": 3826736.0,
660
+ "step": 650
661
+ },
662
+ {
663
+ "entropy": 1.187142127752304,
664
+ "epoch": 0.6359913273909901,
665
+ "grad_norm": 0.06310118734836578,
666
+ "learning_rate": 0.00016599053414469237,
667
+ "loss": 1.1989784240722656,
668
+ "mean_token_accuracy": 0.7233487412333488,
669
+ "num_tokens": 3886196.0,
670
+ "step": 660
671
+ },
672
+ {
673
+ "entropy": 1.2190307170152663,
674
+ "epoch": 0.6456275596241869,
675
+ "grad_norm": 0.049476709216833115,
676
+ "learning_rate": 0.00016531440162271806,
677
+ "loss": 1.2834860801696777,
678
+ "mean_token_accuracy": 0.713593752682209,
679
+ "num_tokens": 3941611.0,
680
+ "step": 670
681
+ },
682
+ {
683
+ "entropy": 1.1955443389713765,
684
+ "epoch": 0.6552637918573838,
685
+ "grad_norm": 0.06877182424068451,
686
+ "learning_rate": 0.00016463826910074377,
687
+ "loss": 1.2530131340026855,
688
+ "mean_token_accuracy": 0.7211957752704621,
689
+ "num_tokens": 3998745.0,
690
+ "step": 680
691
+ },
692
+ {
693
+ "entropy": 1.255328443646431,
694
+ "epoch": 0.6649000240905806,
695
+ "grad_norm": 0.06499036401510239,
696
+ "learning_rate": 0.00016396213657876945,
697
+ "loss": 1.2712255477905274,
698
+ "mean_token_accuracy": 0.7075859263539315,
699
+ "num_tokens": 4057693.0,
700
+ "step": 690
701
+ },
702
+ {
703
+ "entropy": 1.269506113231182,
704
+ "epoch": 0.6745362563237775,
705
+ "grad_norm": 0.07255974411964417,
706
+ "learning_rate": 0.00016328600405679514,
707
+ "loss": 1.294202709197998,
708
+ "mean_token_accuracy": 0.7018555745482444,
709
+ "num_tokens": 4113943.0,
710
+ "step": 700
711
+ },
712
+ {
713
+ "entropy": 1.24193774163723,
714
+ "epoch": 0.6841724885569742,
715
+ "grad_norm": 0.07056763768196106,
716
+ "learning_rate": 0.00016260987153482082,
717
+ "loss": 1.2850048065185546,
718
+ "mean_token_accuracy": 0.714261619746685,
719
+ "num_tokens": 4171377.0,
720
+ "step": 710
721
+ },
722
+ {
723
+ "entropy": 1.2012858629226684,
724
+ "epoch": 0.6938087207901711,
725
+ "grad_norm": 0.06263507157564163,
726
+ "learning_rate": 0.0001619337390128465,
727
+ "loss": 1.2244320869445802,
728
+ "mean_token_accuracy": 0.7127483233809471,
729
+ "num_tokens": 4224788.0,
730
+ "step": 720
731
+ },
732
+ {
733
+ "entropy": 1.118230439722538,
734
+ "epoch": 0.7034449530233678,
735
+ "grad_norm": 0.0841294601559639,
736
+ "learning_rate": 0.00016125760649087222,
737
+ "loss": 1.1727646827697753,
738
+ "mean_token_accuracy": 0.7389020159840584,
739
+ "num_tokens": 4273260.0,
740
+ "step": 730
741
+ },
742
+ {
743
+ "entropy": 1.1662761889398099,
744
+ "epoch": 0.7130811852565647,
745
+ "grad_norm": 0.05674638971686363,
746
+ "learning_rate": 0.0001605814739688979,
747
+ "loss": 1.2040279388427735,
748
+ "mean_token_accuracy": 0.7247484371066093,
749
+ "num_tokens": 4330262.0,
750
+ "step": 740
751
+ },
752
+ {
753
+ "entropy": 1.1847384825348855,
754
+ "epoch": 0.7227174174897615,
755
+ "grad_norm": 0.0654911920428276,
756
+ "learning_rate": 0.00015990534144692359,
757
+ "loss": 1.245723056793213,
758
+ "mean_token_accuracy": 0.7201158210635186,
759
+ "num_tokens": 4387940.0,
760
+ "step": 750
761
+ },
762
+ {
763
+ "entropy": 1.2574817538261414,
764
+ "epoch": 0.7323536497229584,
765
+ "grad_norm": 0.05644860863685608,
766
+ "learning_rate": 0.0001592292089249493,
767
+ "loss": 1.2513961791992188,
768
+ "mean_token_accuracy": 0.7070513799786567,
769
+ "num_tokens": 4451167.0,
770
+ "step": 760
771
+ },
772
+ {
773
+ "entropy": 1.1798933163285255,
774
+ "epoch": 0.7419898819561551,
775
+ "grad_norm": 0.04677167534828186,
776
+ "learning_rate": 0.00015855307640297498,
777
+ "loss": 1.2556601524353028,
778
+ "mean_token_accuracy": 0.7209865570068359,
779
+ "num_tokens": 4514662.0,
780
+ "step": 770
781
+ },
782
+ {
783
+ "entropy": 1.2833519145846366,
784
+ "epoch": 0.751626114189352,
785
+ "grad_norm": 0.05460638552904129,
786
+ "learning_rate": 0.0001578769438810007,
787
+ "loss": 1.3391889572143554,
788
+ "mean_token_accuracy": 0.699195285141468,
789
+ "num_tokens": 4577464.0,
790
+ "step": 780
791
+ },
792
+ {
793
+ "entropy": 1.301037323474884,
794
+ "epoch": 0.7612623464225488,
795
+ "grad_norm": 0.08067753911018372,
796
+ "learning_rate": 0.00015720081135902638,
797
+ "loss": 1.3280585289001465,
798
+ "mean_token_accuracy": 0.6965255320072175,
799
+ "num_tokens": 4640394.0,
800
+ "step": 790
801
+ },
802
+ {
803
+ "entropy": 1.1654298216104508,
804
+ "epoch": 0.7708985786557456,
805
+ "grad_norm": 0.06938653439283371,
806
+ "learning_rate": 0.00015652467883705206,
807
+ "loss": 1.2016778945922852,
808
+ "mean_token_accuracy": 0.7259130507707596,
809
+ "num_tokens": 4694747.0,
810
+ "step": 800
811
+ },
812
+ {
813
+ "entropy": 1.2972704201936722,
814
+ "epoch": 0.7805348108889424,
815
+ "grad_norm": 0.05404666066169739,
816
+ "learning_rate": 0.00015584854631507777,
817
+ "loss": 1.2715141296386718,
818
+ "mean_token_accuracy": 0.7007692798972129,
819
+ "num_tokens": 4754920.0,
820
+ "step": 810
821
+ },
822
+ {
823
+ "entropy": 1.191229759156704,
824
+ "epoch": 0.7901710431221393,
825
+ "grad_norm": 0.05804457888007164,
826
+ "learning_rate": 0.00015517241379310346,
827
+ "loss": 1.2225379943847656,
828
+ "mean_token_accuracy": 0.7155083760619163,
829
+ "num_tokens": 4813911.0,
830
+ "step": 820
831
+ },
832
+ {
833
+ "entropy": 1.1537679880857468,
834
+ "epoch": 0.799807275355336,
835
+ "grad_norm": 0.06406034529209137,
836
+ "learning_rate": 0.00015449628127112914,
837
+ "loss": 1.2089889526367188,
838
+ "mean_token_accuracy": 0.7305796332657337,
839
+ "num_tokens": 4867754.0,
840
+ "step": 830
841
+ },
842
+ {
843
+ "entropy": 1.3313012823462487,
844
+ "epoch": 0.8094435075885329,
845
+ "grad_norm": 0.05331671983003616,
846
+ "learning_rate": 0.00015382014874915485,
847
+ "loss": 1.3202786445617676,
848
+ "mean_token_accuracy": 0.6896555438637734,
849
+ "num_tokens": 4932220.0,
850
+ "step": 840
851
+ },
852
+ {
853
+ "entropy": 1.1428007125854491,
854
+ "epoch": 0.8190797398217297,
855
+ "grad_norm": 0.0543409064412117,
856
+ "learning_rate": 0.00015314401622718054,
857
+ "loss": 1.1924286842346192,
858
+ "mean_token_accuracy": 0.7276274234056472,
859
+ "num_tokens": 4991379.0,
860
+ "step": 850
861
+ },
862
+ {
863
+ "entropy": 1.1923618368804454,
864
+ "epoch": 0.8287159720549265,
865
+ "grad_norm": 0.04643765091896057,
866
+ "learning_rate": 0.00015246788370520625,
867
+ "loss": 1.2740966796875,
868
+ "mean_token_accuracy": 0.7193146347999573,
869
+ "num_tokens": 5050844.0,
870
+ "step": 860
871
+ },
872
+ {
873
+ "entropy": 1.2764008730649947,
874
+ "epoch": 0.8383522042881233,
875
+ "grad_norm": 0.06100849807262421,
876
+ "learning_rate": 0.00015179175118323193,
877
+ "loss": 1.2969582557678223,
878
+ "mean_token_accuracy": 0.7002251073718071,
879
+ "num_tokens": 5111937.0,
880
+ "step": 870
881
+ },
882
+ {
883
+ "entropy": 1.1852023541927337,
884
+ "epoch": 0.8479884365213202,
885
+ "grad_norm": 0.08129674941301346,
886
+ "learning_rate": 0.00015111561866125762,
887
+ "loss": 1.215758991241455,
888
+ "mean_token_accuracy": 0.7221846386790276,
889
+ "num_tokens": 5173122.0,
890
+ "step": 880
891
+ },
892
+ {
893
+ "entropy": 1.2100131824612617,
894
+ "epoch": 0.857624668754517,
895
+ "grad_norm": 0.05021384358406067,
896
+ "learning_rate": 0.0001504394861392833,
897
+ "loss": 1.245238971710205,
898
+ "mean_token_accuracy": 0.7174671500921249,
899
+ "num_tokens": 5237440.0,
900
+ "step": 890
901
+ },
902
+ {
903
+ "entropy": 1.2959269508719444,
904
+ "epoch": 0.8672609009877138,
905
+ "grad_norm": 0.06317298859357834,
906
+ "learning_rate": 0.000149763353617309,
907
+ "loss": 1.298589324951172,
908
+ "mean_token_accuracy": 0.696705661714077,
909
+ "num_tokens": 5306311.0,
910
+ "step": 900
911
+ },
912
+ {
913
+ "entropy": 1.178437228500843,
914
+ "epoch": 0.8768971332209107,
915
+ "grad_norm": 0.0590016208589077,
916
+ "learning_rate": 0.00014908722109533467,
917
+ "loss": 1.2140485763549804,
918
+ "mean_token_accuracy": 0.7231360018253327,
919
+ "num_tokens": 5365204.0,
920
+ "step": 910
921
+ },
922
+ {
923
+ "entropy": 1.1817002773284913,
924
+ "epoch": 0.8865333654541074,
925
+ "grad_norm": 0.06499218195676804,
926
+ "learning_rate": 0.00014841108857336038,
927
+ "loss": 1.2365409851074218,
928
+ "mean_token_accuracy": 0.7207681879401207,
929
+ "num_tokens": 5425136.0,
930
+ "step": 920
931
+ },
932
+ {
933
+ "entropy": 1.1871344536542892,
934
+ "epoch": 0.8961695976873043,
935
+ "grad_norm": 0.05251992866396904,
936
+ "learning_rate": 0.00014773495605138607,
937
+ "loss": 1.216776180267334,
938
+ "mean_token_accuracy": 0.7216181293129921,
939
+ "num_tokens": 5485557.0,
940
+ "step": 930
941
+ },
942
+ {
943
+ "entropy": 1.2011874541640282,
944
+ "epoch": 0.9058058299205011,
945
+ "grad_norm": 0.05413221940398216,
946
+ "learning_rate": 0.00014705882352941178,
947
+ "loss": 1.2213294982910157,
948
+ "mean_token_accuracy": 0.7150228247046471,
949
+ "num_tokens": 5545105.0,
950
+ "step": 940
951
+ },
952
+ {
953
+ "entropy": 1.2181889459490776,
954
+ "epoch": 0.9154420621536979,
955
+ "grad_norm": 0.056055545806884766,
956
+ "learning_rate": 0.00014638269100743746,
957
+ "loss": 1.25078125,
958
+ "mean_token_accuracy": 0.7119832545518875,
959
+ "num_tokens": 5605134.0,
960
+ "step": 950
961
+ },
962
+ {
963
+ "entropy": 1.1678502529859542,
964
+ "epoch": 0.9250782943868947,
965
+ "grad_norm": 0.05342623591423035,
966
+ "learning_rate": 0.00014570655848546315,
967
+ "loss": 1.2045835494995116,
968
+ "mean_token_accuracy": 0.7228824377059937,
969
+ "num_tokens": 5663877.0,
970
+ "step": 960
971
+ },
972
+ {
973
+ "entropy": 1.1557280227541924,
974
+ "epoch": 0.9347145266200916,
975
+ "grad_norm": 0.056248344480991364,
976
+ "learning_rate": 0.00014503042596348886,
977
+ "loss": 1.175156021118164,
978
+ "mean_token_accuracy": 0.7239007547497749,
979
+ "num_tokens": 5726645.0,
980
+ "step": 970
981
+ },
982
+ {
983
+ "entropy": 1.2256763622164726,
984
+ "epoch": 0.9443507588532883,
985
+ "grad_norm": 0.06137476861476898,
986
+ "learning_rate": 0.00014435429344151454,
987
+ "loss": 1.234114933013916,
988
+ "mean_token_accuracy": 0.7094191774725914,
989
+ "num_tokens": 5783468.0,
990
+ "step": 980
991
+ },
992
+ {
993
+ "entropy": 1.1099035568535327,
994
+ "epoch": 0.9539869910864852,
995
+ "grad_norm": 0.052310869097709656,
996
+ "learning_rate": 0.00014367816091954023,
997
+ "loss": 1.158639907836914,
998
+ "mean_token_accuracy": 0.7366219267249108,
999
+ "num_tokens": 5840101.0,
1000
+ "step": 990
1001
+ },
1002
+ {
1003
+ "entropy": 1.1962413311004638,
1004
+ "epoch": 0.963623223319682,
1005
+ "grad_norm": 0.055140018463134766,
1006
+ "learning_rate": 0.00014300202839756594,
1007
+ "loss": 1.210787010192871,
1008
+ "mean_token_accuracy": 0.7173429980874062,
1009
+ "num_tokens": 5903466.0,
1010
+ "step": 1000
1011
+ },
1012
+ {
1013
+ "entropy": 1.0866067253053189,
1014
+ "epoch": 0.9732594555528788,
1015
+ "grad_norm": 0.07032942771911621,
1016
+ "learning_rate": 0.00014232589587559162,
1017
+ "loss": 1.1129253387451172,
1018
+ "mean_token_accuracy": 0.7413682281970978,
1019
+ "num_tokens": 5958697.0,
1020
+ "step": 1010
1021
+ },
1022
+ {
1023
+ "entropy": 1.1695112690329552,
1024
+ "epoch": 0.9828956877860756,
1025
+ "grad_norm": 0.06965778768062592,
1026
+ "learning_rate": 0.00014164976335361734,
1027
+ "loss": 1.2002062797546387,
1028
+ "mean_token_accuracy": 0.7275112867355347,
1029
+ "num_tokens": 6018820.0,
1030
+ "step": 1020
1031
+ },
1032
+ {
1033
+ "entropy": 1.129349136352539,
1034
+ "epoch": 0.9925319200192725,
1035
+ "grad_norm": 0.0639411062002182,
1036
+ "learning_rate": 0.00014097363083164302,
1037
+ "loss": 1.1613442420959472,
1038
+ "mean_token_accuracy": 0.7342579141259193,
1039
+ "num_tokens": 6074337.0,
1040
+ "step": 1030
1041
+ },
1042
+ {
1043
+ "epoch": 1.0,
1044
+ "eval_entropy": 1.0300818726266783,
1045
+ "eval_loss": 1.032158374786377,
1046
+ "eval_mean_token_accuracy": 0.7592671683222431,
1047
+ "eval_num_tokens": 6115944.0,
1048
+ "eval_runtime": 513.4443,
1049
+ "eval_samples_per_second": 1.7,
1050
+ "eval_steps_per_second": 1.7,
1051
+ "step": 1038
1052
+ }
1053
+ ],
1054
+ "logging_steps": 10,
1055
+ "max_steps": 3114,
1056
+ "num_input_tokens_seen": 0,
1057
+ "num_train_epochs": 3,
1058
+ "save_steps": 500,
1059
+ "stateful_callbacks": {
1060
+ "TrainerControl": {
1061
+ "args": {
1062
+ "should_epoch_stop": false,
1063
+ "should_evaluate": false,
1064
+ "should_log": false,
1065
+ "should_save": true,
1066
+ "should_training_stop": false
1067
+ },
1068
+ "attributes": {}
1069
+ }
1070
+ },
1071
+ "total_flos": 2.1498282716531098e+18,
1072
+ "train_batch_size": 4,
1073
+ "trial_name": null,
1074
+ "trial_params": null
1075
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:16c55af3229f91defd719bd313afa179de5164961715a6136bf7587ace17f152
+ size 5240