agu18dec committed 29 days ago

Commit

3175b72

verified ·

1 Parent(s): 0621a31

add checkpoint cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/README.md +61 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/adapter_config.json +46 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/adapter_config.json +46 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/trainer_state.json +1044 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/adapter_config.json +46 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/trainer_state.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/adapter_config.json +46 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/tokenizer.json +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/tokenizer_config.json +207 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/trainer_state.json +2064 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/training_args.bin +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/vocab.json +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/README.md +209 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/adapter_config.json +46 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/adapter_model.safetensors +3 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/added_tokens.json +24 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/chat_template.jinja +54 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/merges.txt +0 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/special_tokens_map.json +31 -0
checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/tokenizer.json +3 -0

.gitattributes CHANGED Viewed

@@ -204,3 +204,14 @@ checkpoints/cat_qwen25_7b_r8_a32_e10_lr1e-4_s2_vt_perlayer_repro/tokenizer.json
 checkpoints/cat_qwen25_7b_r8_a32_e10_lr1e-4_s3_vt_perlayer_repro/checkpoint-8856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_e10_lr1e-4_s3_vt_perlayer_repro/checkpoint-9840/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_e10_lr1e-4_s3_vt_perlayer_repro/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/cat_qwen25_7b_r8_a32_e10_lr1e-4_s3_vt_perlayer_repro/checkpoint-8856/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_e10_lr1e-4_s3_vt_perlayer_repro/checkpoint-9840/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/cat_qwen25_7b_r8_a32_e10_lr1e-4_s3_vt_perlayer_repro/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-4076/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-5095/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-6114/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-7133/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-8152/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-9171/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+model_name: cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10
+This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/8jjpbuy7)
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.18.1
+- TRL: 0.28.0
+- Transformers: 4.57.6
+- Pytorch: 2.10.0
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08c759c17a3646bf0a7df306c92ac8a706f95b5f65ba67172c3c234d7a8851ab
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01cefddb25d5026a17c2c58d2bf6412fac70374bcad4e950ddf94d64c415e77a
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1044 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1019,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2483618259429932,
+      "epoch": 0.009813542688910697,
+      "grad_norm": 3.559945583343506,
+      "learning_rate": 1.7647058823529412e-06,
+      "loss": 0.8134,
+      "mean_token_accuracy": 0.7300779759883881,
+      "num_tokens": 13663.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.2403294801712037,
+      "epoch": 0.019627085377821395,
+      "grad_norm": 4.009516716003418,
+      "learning_rate": 3.7254901960784316e-06,
+      "loss": 0.7284,
+      "mean_token_accuracy": 0.7614562273025512,
+      "num_tokens": 27321.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.2553077578544616,
+      "epoch": 0.029440628066732092,
+      "grad_norm": 3.2608017921447754,
+      "learning_rate": 5.686274509803921e-06,
+      "loss": 0.7572,
+      "mean_token_accuracy": 0.7476559698581695,
+      "num_tokens": 41194.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2486775279045106,
+      "epoch": 0.03925417075564279,
+      "grad_norm": 2.9102795124053955,
+      "learning_rate": 7.647058823529413e-06,
+      "loss": 0.7251,
+      "mean_token_accuracy": 0.7451107144355774,
+      "num_tokens": 54431.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.2510095834732056,
+      "epoch": 0.04906771344455348,
+      "grad_norm": 2.0170955657958984,
+      "learning_rate": 9.607843137254903e-06,
+      "loss": 0.6338,
+      "mean_token_accuracy": 0.7795099318027496,
+      "num_tokens": 68329.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2673060655593873,
+      "epoch": 0.058881256133464184,
+      "grad_norm": 1.863154411315918,
+      "learning_rate": 1.1568627450980394e-05,
+      "loss": 0.6495,
+      "mean_token_accuracy": 0.7649777054786682,
+      "num_tokens": 81724.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.243555223941803,
+      "epoch": 0.06869479882237488,
+      "grad_norm": 2.1085784435272217,
+      "learning_rate": 1.3529411764705883e-05,
+      "loss": 0.6042,
+      "mean_token_accuracy": 0.7765651226043702,
+      "num_tokens": 95693.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.2313958764076234,
+      "epoch": 0.07850834151128558,
+      "grad_norm": 2.176846981048584,
+      "learning_rate": 1.5490196078431373e-05,
+      "loss": 0.5522,
+      "mean_token_accuracy": 0.796428507566452,
+      "num_tokens": 109253.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.241322922706604,
+      "epoch": 0.08832188420019627,
+      "grad_norm": 2.0627377033233643,
+      "learning_rate": 1.7450980392156862e-05,
+      "loss": 0.6493,
+      "mean_token_accuracy": 0.7616375625133515,
+      "num_tokens": 122810.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.202180063724518,
+      "epoch": 0.09813542688910697,
+      "grad_norm": 2.1048226356506348,
+      "learning_rate": 1.9411764705882355e-05,
+      "loss": 0.549,
+      "mean_token_accuracy": 0.7949124991893768,
+      "num_tokens": 136706.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.1756585597991944,
+      "epoch": 0.10794896957801767,
+      "grad_norm": 2.2883429527282715,
+      "learning_rate": 2.1372549019607844e-05,
+      "loss": 0.4989,
+      "mean_token_accuracy": 0.8112820327281952,
+      "num_tokens": 150742.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.1946371436119079,
+      "epoch": 0.11776251226692837,
+      "grad_norm": 2.2675540447235107,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 0.5786,
+      "mean_token_accuracy": 0.7798411428928376,
+      "num_tokens": 164305.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.19124995470047,
+      "epoch": 0.12757605495583907,
+      "grad_norm": 2.603771924972534,
+      "learning_rate": 2.5294117647058825e-05,
+      "loss": 0.5354,
+      "mean_token_accuracy": 0.7984326481819153,
+      "num_tokens": 177967.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.1813685297966003,
+      "epoch": 0.13738959764474976,
+      "grad_norm": 2.699855089187622,
+      "learning_rate": 2.7254901960784314e-05,
+      "loss": 0.5117,
+      "mean_token_accuracy": 0.8128579616546631,
+      "num_tokens": 192090.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.1989248156547547,
+      "epoch": 0.14720314033366044,
+      "grad_norm": 2.542903423309326,
+      "learning_rate": 2.9215686274509806e-05,
+      "loss": 0.5394,
+      "mean_token_accuracy": 0.7957534492015839,
+      "num_tokens": 206673.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.1441561222076415,
+      "epoch": 0.15701668302257116,
+      "grad_norm": 3.7011725902557373,
+      "learning_rate": 3.11764705882353e-05,
+      "loss": 0.4586,
+      "mean_token_accuracy": 0.8275609016418457,
+      "num_tokens": 220293.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.1597741365432739,
+      "epoch": 0.16683022571148184,
+      "grad_norm": 2.313307046890259,
+      "learning_rate": 3.3137254901960784e-05,
+      "loss": 0.4532,
+      "mean_token_accuracy": 0.8308035731315613,
+      "num_tokens": 234011.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.1612919807434081,
+      "epoch": 0.17664376840039253,
+      "grad_norm": 2.9966866970062256,
+      "learning_rate": 3.5098039215686276e-05,
+      "loss": 0.5044,
+      "mean_token_accuracy": 0.8123882353305817,
+      "num_tokens": 247118.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.1632250428199769,
+      "epoch": 0.18645731108930325,
+      "grad_norm": 3.3941473960876465,
+      "learning_rate": 3.705882352941177e-05,
+      "loss": 0.4761,
+      "mean_token_accuracy": 0.8212073087692261,
+      "num_tokens": 261365.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.1553410053253175,
+      "epoch": 0.19627085377821393,
+      "grad_norm": 2.857267379760742,
+      "learning_rate": 3.9019607843137254e-05,
+      "loss": 0.4553,
+      "mean_token_accuracy": 0.8259550392627716,
+      "num_tokens": 274990.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.152228093147278,
+      "epoch": 0.20608439646712462,
+      "grad_norm": 3.4860124588012695,
+      "learning_rate": 4.0980392156862746e-05,
+      "loss": 0.4613,
+      "mean_token_accuracy": 0.8292915642261505,
+      "num_tokens": 288512.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.131454634666443,
+      "epoch": 0.21589793915603533,
+      "grad_norm": 2.9565322399139404,
+      "learning_rate": 4.294117647058823e-05,
+      "loss": 0.4439,
+      "mean_token_accuracy": 0.8337770104408264,
+      "num_tokens": 301852.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.116087293624878,
+      "epoch": 0.22571148184494602,
+      "grad_norm": 3.363611936569214,
+      "learning_rate": 4.490196078431373e-05,
+      "loss": 0.4326,
+      "mean_token_accuracy": 0.836861526966095,
+      "num_tokens": 315850.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.1328616857528686,
+      "epoch": 0.23552502453385674,
+      "grad_norm": 2.8903441429138184,
+      "learning_rate": 4.6862745098039216e-05,
+      "loss": 0.4509,
+      "mean_token_accuracy": 0.8328852355480194,
+      "num_tokens": 329271.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.130015003681183,
+      "epoch": 0.24533856722276742,
+      "grad_norm": 3.509453773498535,
+      "learning_rate": 4.882352941176471e-05,
+      "loss": 0.4359,
+      "mean_token_accuracy": 0.8327082991600037,
+      "num_tokens": 343233.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.1058894276618958,
+      "epoch": 0.25515210991167814,
+      "grad_norm": 3.33721923828125,
+      "learning_rate": 5.0784313725490194e-05,
+      "loss": 0.4356,
+      "mean_token_accuracy": 0.8358817815780639,
+      "num_tokens": 356887.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.1200787544250488,
+      "epoch": 0.2649656526005888,
+      "grad_norm": 3.4578216075897217,
+      "learning_rate": 5.274509803921569e-05,
+      "loss": 0.4277,
+      "mean_token_accuracy": 0.8432585775852204,
+      "num_tokens": 370548.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.1384179592132568,
+      "epoch": 0.2747791952894995,
+      "grad_norm": 3.2266058921813965,
+      "learning_rate": 5.4705882352941185e-05,
+      "loss": 0.4176,
+      "mean_token_accuracy": 0.8439722537994385,
+      "num_tokens": 384687.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.1408962607383728,
+      "epoch": 0.2845927379784102,
+      "grad_norm": 3.087733030319214,
+      "learning_rate": 5.666666666666667e-05,
+      "loss": 0.445,
+      "mean_token_accuracy": 0.8325196325778961,
+      "num_tokens": 398358.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.1371111273765564,
+      "epoch": 0.2944062806673209,
+      "grad_norm": 3.274041175842285,
+      "learning_rate": 5.862745098039216e-05,
+      "loss": 0.4244,
+      "mean_token_accuracy": 0.8416358292102813,
+      "num_tokens": 412700.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.1113396763801575,
+      "epoch": 0.3042198233562316,
+      "grad_norm": 2.8629136085510254,
+      "learning_rate": 6.058823529411765e-05,
+      "loss": 0.3977,
+      "mean_token_accuracy": 0.8494885206222534,
+      "num_tokens": 427312.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.1260488390922547,
+      "epoch": 0.3140333660451423,
+      "grad_norm": 3.125242233276367,
+      "learning_rate": 6.254901960784314e-05,
+      "loss": 0.4073,
+      "mean_token_accuracy": 0.8544234335422516,
+      "num_tokens": 441507.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.1475743889808654,
+      "epoch": 0.323846908734053,
+      "grad_norm": 4.150859355926514,
+      "learning_rate": 6.450980392156864e-05,
+      "loss": 0.4422,
+      "mean_token_accuracy": 0.8314357757568359,
+      "num_tokens": 454411.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.1072185158729553,
+      "epoch": 0.3336604514229637,
+      "grad_norm": 3.3013269901275635,
+      "learning_rate": 6.647058823529411e-05,
+      "loss": 0.4181,
+      "mean_token_accuracy": 0.8433676600456238,
+      "num_tokens": 467641.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.115389347076416,
+      "epoch": 0.3434739941118744,
+      "grad_norm": 2.7073593139648438,
+      "learning_rate": 6.843137254901961e-05,
+      "loss": 0.45,
+      "mean_token_accuracy": 0.8306823253631592,
+      "num_tokens": 481724.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.0993050813674927,
+      "epoch": 0.35328753680078506,
+      "grad_norm": 4.06768798828125,
+      "learning_rate": 7.039215686274511e-05,
+      "loss": 0.3839,
+      "mean_token_accuracy": 0.8561393201351166,
+      "num_tokens": 495345.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.1100080490112305,
+      "epoch": 0.3631010794896958,
+      "grad_norm": 2.8841543197631836,
+      "learning_rate": 7.23529411764706e-05,
+      "loss": 0.4263,
+      "mean_token_accuracy": 0.8410856664180756,
+      "num_tokens": 509800.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.1247098922729493,
+      "epoch": 0.3729146221786065,
+      "grad_norm": 4.105075836181641,
+      "learning_rate": 7.431372549019608e-05,
+      "loss": 0.4157,
+      "mean_token_accuracy": 0.8427928566932679,
+      "num_tokens": 523514.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.1408787369728088,
+      "epoch": 0.38272816486751715,
+      "grad_norm": 2.9242143630981445,
+      "learning_rate": 7.627450980392157e-05,
+      "loss": 0.4266,
+      "mean_token_accuracy": 0.8396799921989441,
+      "num_tokens": 537693.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.1217655539512634,
+      "epoch": 0.39254170755642787,
+      "grad_norm": 2.826019763946533,
+      "learning_rate": 7.823529411764707e-05,
+      "loss": 0.416,
+      "mean_token_accuracy": 0.8408508718013763,
+      "num_tokens": 551457.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.157455313205719,
+      "epoch": 0.4023552502453386,
+      "grad_norm": 3.266930103302002,
+      "learning_rate": 8.019607843137255e-05,
+      "loss": 0.3848,
+      "mean_token_accuracy": 0.8575535297393799,
+      "num_tokens": 565157.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.1714832425117492,
+      "epoch": 0.41216879293424924,
+      "grad_norm": 2.520620584487915,
+      "learning_rate": 8.215686274509804e-05,
+      "loss": 0.3873,
+      "mean_token_accuracy": 0.8593390583992004,
+      "num_tokens": 579080.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.1691359281539917,
+      "epoch": 0.42198233562315995,
+      "grad_norm": 3.3553106784820557,
+      "learning_rate": 8.411764705882354e-05,
+      "loss": 0.3929,
+      "mean_token_accuracy": 0.848738569021225,
+      "num_tokens": 592922.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.161519956588745,
+      "epoch": 0.43179587831207067,
+      "grad_norm": 3.784954309463501,
+      "learning_rate": 8.607843137254903e-05,
+      "loss": 0.3447,
+      "mean_token_accuracy": 0.8669132351875305,
+      "num_tokens": 607114.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.175284707546234,
+      "epoch": 0.44160942100098133,
+      "grad_norm": 3.634345293045044,
+      "learning_rate": 8.80392156862745e-05,
+      "loss": 0.4092,
+      "mean_token_accuracy": 0.8398001670837403,
+      "num_tokens": 620907.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.1871889114379883,
+      "epoch": 0.45142296368989204,
+      "grad_norm": 3.3342034816741943,
+      "learning_rate": 9e-05,
+      "loss": 0.3739,
+      "mean_token_accuracy": 0.8601496338844299,
+      "num_tokens": 635363.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.163673198223114,
+      "epoch": 0.46123650637880276,
+      "grad_norm": 3.118102550506592,
+      "learning_rate": 9.196078431372549e-05,
+      "loss": 0.4217,
+      "mean_token_accuracy": 0.8399349391460419,
+      "num_tokens": 648200.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.206890833377838,
+      "epoch": 0.47105004906771347,
+      "grad_norm": 3.5182156562805176,
+      "learning_rate": 9.392156862745099e-05,
+      "loss": 0.4306,
+      "mean_token_accuracy": 0.8402868688106537,
+      "num_tokens": 662920.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.1812933087348938,
+      "epoch": 0.48086359175662413,
+      "grad_norm": 3.2047181129455566,
+      "learning_rate": 9.588235294117648e-05,
+      "loss": 0.3924,
+      "mean_token_accuracy": 0.854317981004715,
+      "num_tokens": 676646.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.214794135093689,
+      "epoch": 0.49067713444553485,
+      "grad_norm": 3.524360418319702,
+      "learning_rate": 9.784313725490196e-05,
+      "loss": 0.3901,
+      "mean_token_accuracy": 0.8527312636375427,
+      "num_tokens": 690435.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.223682713508606,
+      "epoch": 0.5004906771344455,
+      "grad_norm": 3.3886806964874268,
+      "learning_rate": 9.980392156862746e-05,
+      "loss": 0.3784,
+      "mean_token_accuracy": 0.8536435484886169,
+      "num_tokens": 704664.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.2168277263641358,
+      "epoch": 0.5103042198233563,
+      "grad_norm": 3.704810380935669,
+      "learning_rate": 9.999978670840125e-05,
+      "loss": 0.3918,
+      "mean_token_accuracy": 0.8539480805397034,
+      "num_tokens": 718834.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.2357110142707826,
+      "epoch": 0.5201177625122669,
+      "grad_norm": 3.3917276859283447,
+      "learning_rate": 9.999904940644553e-05,
+      "loss": 0.3874,
+      "mean_token_accuracy": 0.8504210293293,
+      "num_tokens": 733172.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.2259963989257812,
+      "epoch": 0.5299313052011776,
+      "grad_norm": 3.740762233734131,
+      "learning_rate": 9.999778546866733e-05,
+      "loss": 0.3979,
+      "mean_token_accuracy": 0.8438404977321625,
+      "num_tokens": 746577.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.199124240875244,
+      "epoch": 0.5397448478900884,
+      "grad_norm": 2.874812126159668,
+      "learning_rate": 9.999599490837959e-05,
+      "loss": 0.363,
+      "mean_token_accuracy": 0.8524531662464142,
+      "num_tokens": 760378.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.2345012068748473,
+      "epoch": 0.549558390578999,
+      "grad_norm": 2.862276554107666,
+      "learning_rate": 9.999367774444214e-05,
+      "loss": 0.4005,
+      "mean_token_accuracy": 0.846814078092575,
+      "num_tokens": 774528.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.1909167528152467,
+      "epoch": 0.5593719332679097,
+      "grad_norm": 2.375776529312134,
+      "learning_rate": 9.999083400126145e-05,
+      "loss": 0.3588,
+      "mean_token_accuracy": 0.8638595938682556,
+      "num_tokens": 787978.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.1990839481353759,
+      "epoch": 0.5691854759568205,
+      "grad_norm": 2.649693250656128,
+      "learning_rate": 9.998746370879049e-05,
+      "loss": 0.3629,
+      "mean_token_accuracy": 0.8618560075759888,
+      "num_tokens": 802122.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.1829696774482727,
+      "epoch": 0.5789990186457311,
+      "grad_norm": 2.828573703765869,
+      "learning_rate": 9.99835669025282e-05,
+      "loss": 0.3294,
+      "mean_token_accuracy": 0.8737357437610627,
+      "num_tokens": 816567.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.2039927005767823,
+      "epoch": 0.5888125613346418,
+      "grad_norm": 2.849315643310547,
+      "learning_rate": 9.997914362351934e-05,
+      "loss": 0.3045,
+      "mean_token_accuracy": 0.8843503415584564,
+      "num_tokens": 830550.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.1890666484832764,
+      "epoch": 0.5986261040235525,
+      "grad_norm": 2.388582229614258,
+      "learning_rate": 9.997419391835396e-05,
+      "loss": 0.3378,
+      "mean_token_accuracy": 0.8758296966552734,
+      "num_tokens": 843779.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.213183867931366,
+      "epoch": 0.6084396467124632,
+      "grad_norm": 3.474524736404419,
+      "learning_rate": 9.996871783916687e-05,
+      "loss": 0.3504,
+      "mean_token_accuracy": 0.8678486227989197,
+      "num_tokens": 857498.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.187168836593628,
+      "epoch": 0.6182531894013739,
+      "grad_norm": 3.741032123565674,
+      "learning_rate": 9.996271544363717e-05,
+      "loss": 0.3656,
+      "mean_token_accuracy": 0.8562004089355468,
+      "num_tokens": 871163.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.2092647194862365,
+      "epoch": 0.6280667320902846,
+      "grad_norm": 2.7920150756835938,
+      "learning_rate": 9.995618679498758e-05,
+      "loss": 0.3654,
+      "mean_token_accuracy": 0.864486688375473,
+      "num_tokens": 884912.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.1932833075523377,
+      "epoch": 0.6378802747791953,
+      "grad_norm": 3.4623348712921143,
+      "learning_rate": 9.994913196198381e-05,
+      "loss": 0.3384,
+      "mean_token_accuracy": 0.8657363414764404,
+      "num_tokens": 898102.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.210770559310913,
+      "epoch": 0.647693817468106,
+      "grad_norm": 3.1084299087524414,
+      "learning_rate": 9.994155101893386e-05,
+      "loss": 0.3564,
+      "mean_token_accuracy": 0.8611253321170806,
+      "num_tokens": 911902.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.2453441143035888,
+      "epoch": 0.6575073601570167,
+      "grad_norm": 2.4232285022735596,
+      "learning_rate": 9.993344404568712e-05,
+      "loss": 0.3996,
+      "mean_token_accuracy": 0.852339768409729,
+      "num_tokens": 926137.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.1893890380859375,
+      "epoch": 0.6673209028459274,
+      "grad_norm": 2.9850914478302,
+      "learning_rate": 9.992481112763372e-05,
+      "loss": 0.3151,
+      "mean_token_accuracy": 0.8825689435005188,
+      "num_tokens": 940507.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.1946087718009948,
+      "epoch": 0.677134445534838,
+      "grad_norm": 2.2807886600494385,
+      "learning_rate": 9.991565235570341e-05,
+      "loss": 0.3327,
+      "mean_token_accuracy": 0.8753289222717285,
+      "num_tokens": 954934.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.2035173773765564,
+      "epoch": 0.6869479882237488,
+      "grad_norm": 3.3408844470977783,
+      "learning_rate": 9.990596782636481e-05,
+      "loss": 0.3865,
+      "mean_token_accuracy": 0.855133694410324,
+      "num_tokens": 968760.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.1885886549949647,
+      "epoch": 0.6967615309126595,
+      "grad_norm": 3.780911922454834,
+      "learning_rate": 9.989575764162426e-05,
+      "loss": 0.3145,
+      "mean_token_accuracy": 0.8819782733917236,
+      "num_tokens": 982644.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.1796546936035157,
+      "epoch": 0.7065750736015701,
+      "grad_norm": 2.4805071353912354,
+      "learning_rate": 9.988502190902476e-05,
+      "loss": 0.3532,
+      "mean_token_accuracy": 0.8675242066383362,
+      "num_tokens": 996817.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.1823334097862244,
+      "epoch": 0.7163886162904809,
+      "grad_norm": 2.9788618087768555,
+      "learning_rate": 9.987376074164491e-05,
+      "loss": 0.3366,
+      "mean_token_accuracy": 0.8697103559970856,
+      "num_tokens": 1010008.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.1985557913780212,
+      "epoch": 0.7262021589793916,
+      "grad_norm": 2.5642588138580322,
+      "learning_rate": 9.986197425809766e-05,
+      "loss": 0.3305,
+      "mean_token_accuracy": 0.8674235641956329,
+      "num_tokens": 1023812.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.1962911128997802,
+      "epoch": 0.7360157016683022,
+      "grad_norm": 3.236114978790283,
+      "learning_rate": 9.984966258252903e-05,
+      "loss": 0.3707,
+      "mean_token_accuracy": 0.8536763250827789,
+      "num_tokens": 1037399.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.18121737241745,
+      "epoch": 0.745829244357213,
+      "grad_norm": 2.707700252532959,
+      "learning_rate": 9.983682584461688e-05,
+      "loss": 0.2948,
+      "mean_token_accuracy": 0.8834168791770936,
+      "num_tokens": 1050972.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.190599763393402,
+      "epoch": 0.7556427870461236,
+      "grad_norm": 3.883239507675171,
+      "learning_rate": 9.982346417956949e-05,
+      "loss": 0.3678,
+      "mean_token_accuracy": 0.8604642748832703,
+      "num_tokens": 1064581.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.1711848378181458,
+      "epoch": 0.7654563297350343,
+      "grad_norm": 2.420788049697876,
+      "learning_rate": 9.98095777281242e-05,
+      "loss": 0.3255,
+      "mean_token_accuracy": 0.8733546793460846,
+      "num_tokens": 1078433.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.175394594669342,
+      "epoch": 0.7752698724239451,
+      "grad_norm": 3.810960292816162,
+      "learning_rate": 9.979516663654582e-05,
+      "loss": 0.3239,
+      "mean_token_accuracy": 0.8758346498012543,
+      "num_tokens": 1091839.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.1753032803535461,
+      "epoch": 0.7850834151128557,
+      "grad_norm": 2.5673933029174805,
+      "learning_rate": 9.978023105662519e-05,
+      "loss": 0.3252,
+      "mean_token_accuracy": 0.8731696844100952,
+      "num_tokens": 1105747.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.1706650733947754,
+      "epoch": 0.7948969578017664,
+      "grad_norm": 3.5536954402923584,
+      "learning_rate": 9.976477114567752e-05,
+      "loss": 0.3254,
+      "mean_token_accuracy": 0.881614089012146,
+      "num_tokens": 1119344.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.1545325756072997,
+      "epoch": 0.8047105004906772,
+      "grad_norm": 3.0351312160491943,
+      "learning_rate": 9.974878706654076e-05,
+      "loss": 0.3075,
+      "mean_token_accuracy": 0.880529397726059,
+      "num_tokens": 1133252.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.1713720440864563,
+      "epoch": 0.8145240431795878,
+      "grad_norm": 2.81852126121521,
+      "learning_rate": 9.97322789875739e-05,
+      "loss": 0.3263,
+      "mean_token_accuracy": 0.8688643455505372,
+      "num_tokens": 1147368.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.152477788925171,
+      "epoch": 0.8243375858684985,
+      "grad_norm": 3.188549757003784,
+      "learning_rate": 9.971524708265515e-05,
+      "loss": 0.3062,
+      "mean_token_accuracy": 0.8836665332317353,
+      "num_tokens": 1161165.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.1690003156661988,
+      "epoch": 0.8341511285574092,
+      "grad_norm": 2.653815269470215,
+      "learning_rate": 9.969769153118014e-05,
+      "loss": 0.3191,
+      "mean_token_accuracy": 0.8780744910240174,
+      "num_tokens": 1174814.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.182269549369812,
+      "epoch": 0.8439646712463199,
+      "grad_norm": 2.8399338722229004,
+      "learning_rate": 9.967961251806005e-05,
+      "loss": 0.36,
+      "mean_token_accuracy": 0.864084666967392,
+      "num_tokens": 1188967.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.15756698846817,
+      "epoch": 0.8537782139352306,
+      "grad_norm": 3.0958690643310547,
+      "learning_rate": 9.966101023371961e-05,
+      "loss": 0.3226,
+      "mean_token_accuracy": 0.8756173610687256,
+      "num_tokens": 1202907.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.1475032567977905,
+      "epoch": 0.8635917566241413,
+      "grad_norm": 2.4389164447784424,
+      "learning_rate": 9.964188487409512e-05,
+      "loss": 0.2938,
+      "mean_token_accuracy": 0.8890304386615753,
+      "num_tokens": 1216865.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.1877331137657166,
+      "epoch": 0.873405299313052,
+      "grad_norm": 3.5680530071258545,
+      "learning_rate": 9.962223664063241e-05,
+      "loss": 0.304,
+      "mean_token_accuracy": 0.8802395105361939,
+      "num_tokens": 1230849.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.1612010836601256,
+      "epoch": 0.8832188420019627,
+      "grad_norm": 2.2012088298797607,
+      "learning_rate": 9.960206574028468e-05,
+      "loss": 0.3228,
+      "mean_token_accuracy": 0.8745718121528625,
+      "num_tokens": 1243965.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.1542456030845643,
+      "epoch": 0.8930323846908734,
+      "grad_norm": 4.392910480499268,
+      "learning_rate": 9.958137238551036e-05,
+      "loss": 0.3072,
+      "mean_token_accuracy": 0.8850542485713959,
+      "num_tokens": 1257918.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.1391899585723877,
+      "epoch": 0.9028459273797841,
+      "grad_norm": 2.334970712661743,
+      "learning_rate": 9.956015679427082e-05,
+      "loss": 0.324,
+      "mean_token_accuracy": 0.8772465825080872,
+      "num_tokens": 1271980.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.1350346326828002,
+      "epoch": 0.9126594700686947,
+      "grad_norm": 2.9113283157348633,
+      "learning_rate": 9.953841919002812e-05,
+      "loss": 0.3036,
+      "mean_token_accuracy": 0.8844376146793366,
+      "num_tokens": 1286287.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.1404588222503662,
+      "epoch": 0.9224730127576055,
+      "grad_norm": 2.446030855178833,
+      "learning_rate": 9.951615980174261e-05,
+      "loss": 0.2895,
+      "mean_token_accuracy": 0.8874838829040528,
+      "num_tokens": 1300381.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.139818048477173,
+      "epoch": 0.9322865554465162,
+      "grad_norm": 4.113491058349609,
+      "learning_rate": 9.94933788638706e-05,
+      "loss": 0.2981,
+      "mean_token_accuracy": 0.8840461432933807,
+      "num_tokens": 1314086.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.1538641214370728,
+      "epoch": 0.9421000981354269,
+      "grad_norm": 2.8096957206726074,
+      "learning_rate": 9.947007661636182e-05,
+      "loss": 0.3039,
+      "mean_token_accuracy": 0.8843685805797576,
+      "num_tokens": 1328434.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.129905092716217,
+      "epoch": 0.9519136408243376,
+      "grad_norm": 2.2897799015045166,
+      "learning_rate": 9.944625330465688e-05,
+      "loss": 0.307,
+      "mean_token_accuracy": 0.8870210766792297,
+      "num_tokens": 1341971.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.1422709345817565,
+      "epoch": 0.9617271835132483,
+      "grad_norm": 3.0245301723480225,
+      "learning_rate": 9.942190917968474e-05,
+      "loss": 0.267,
+      "mean_token_accuracy": 0.895936119556427,
+      "num_tokens": 1356685.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.1508997082710266,
+      "epoch": 0.971540726202159,
+      "grad_norm": 3.526949882507324,
+      "learning_rate": 9.939704449786005e-05,
+      "loss": 0.309,
+      "mean_token_accuracy": 0.8837933301925659,
+      "num_tokens": 1370541.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.1339004516601563,
+      "epoch": 0.9813542688910697,
+      "grad_norm": 5.214639663696289,
+      "learning_rate": 9.937165952108042e-05,
+      "loss": 0.2651,
+      "mean_token_accuracy": 0.8960758924484253,
+      "num_tokens": 1383962.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.1374788284301758,
+      "epoch": 0.9911678115799804,
+      "grad_norm": 3.573387622833252,
+      "learning_rate": 9.93457545167237e-05,
+      "loss": 0.2806,
+      "mean_token_accuracy": 0.8932440996170044,
+      "num_tokens": 1397902.0,
+      "step": 1010
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10190,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.993655431707238e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56509bf0973b5a62fa9102ae0f90af2527f08ae5fd5d1d953f30cc3fcb84c764
+size 6417

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-1019/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08c759c17a3646bf0a7df306c92ac8a706f95b5f65ba67172c3c234d7a8851ab
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56509bf0973b5a62fa9102ae0f90af2527f08ae5fd5d1d953f30cc3fcb84c764
+size 6417

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-10190/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4974da7397600085ed4c6d7bbd9497a5e8adb45b4be1059a8a4391a26344ea19
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2064 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 2038,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 1.2483618259429932,
+      "epoch": 0.009813542688910697,
+      "grad_norm": 3.559945583343506,
+      "learning_rate": 1.7647058823529412e-06,
+      "loss": 0.8134,
+      "mean_token_accuracy": 0.7300779759883881,
+      "num_tokens": 13663.0,
+      "step": 10
+    },
+    {
+      "entropy": 1.2403294801712037,
+      "epoch": 0.019627085377821395,
+      "grad_norm": 4.009516716003418,
+      "learning_rate": 3.7254901960784316e-06,
+      "loss": 0.7284,
+      "mean_token_accuracy": 0.7614562273025512,
+      "num_tokens": 27321.0,
+      "step": 20
+    },
+    {
+      "entropy": 1.2553077578544616,
+      "epoch": 0.029440628066732092,
+      "grad_norm": 3.2608017921447754,
+      "learning_rate": 5.686274509803921e-06,
+      "loss": 0.7572,
+      "mean_token_accuracy": 0.7476559698581695,
+      "num_tokens": 41194.0,
+      "step": 30
+    },
+    {
+      "entropy": 1.2486775279045106,
+      "epoch": 0.03925417075564279,
+      "grad_norm": 2.9102795124053955,
+      "learning_rate": 7.647058823529413e-06,
+      "loss": 0.7251,
+      "mean_token_accuracy": 0.7451107144355774,
+      "num_tokens": 54431.0,
+      "step": 40
+    },
+    {
+      "entropy": 1.2510095834732056,
+      "epoch": 0.04906771344455348,
+      "grad_norm": 2.0170955657958984,
+      "learning_rate": 9.607843137254903e-06,
+      "loss": 0.6338,
+      "mean_token_accuracy": 0.7795099318027496,
+      "num_tokens": 68329.0,
+      "step": 50
+    },
+    {
+      "entropy": 1.2673060655593873,
+      "epoch": 0.058881256133464184,
+      "grad_norm": 1.863154411315918,
+      "learning_rate": 1.1568627450980394e-05,
+      "loss": 0.6495,
+      "mean_token_accuracy": 0.7649777054786682,
+      "num_tokens": 81724.0,
+      "step": 60
+    },
+    {
+      "entropy": 1.243555223941803,
+      "epoch": 0.06869479882237488,
+      "grad_norm": 2.1085784435272217,
+      "learning_rate": 1.3529411764705883e-05,
+      "loss": 0.6042,
+      "mean_token_accuracy": 0.7765651226043702,
+      "num_tokens": 95693.0,
+      "step": 70
+    },
+    {
+      "entropy": 1.2313958764076234,
+      "epoch": 0.07850834151128558,
+      "grad_norm": 2.176846981048584,
+      "learning_rate": 1.5490196078431373e-05,
+      "loss": 0.5522,
+      "mean_token_accuracy": 0.796428507566452,
+      "num_tokens": 109253.0,
+      "step": 80
+    },
+    {
+      "entropy": 1.241322922706604,
+      "epoch": 0.08832188420019627,
+      "grad_norm": 2.0627377033233643,
+      "learning_rate": 1.7450980392156862e-05,
+      "loss": 0.6493,
+      "mean_token_accuracy": 0.7616375625133515,
+      "num_tokens": 122810.0,
+      "step": 90
+    },
+    {
+      "entropy": 1.202180063724518,
+      "epoch": 0.09813542688910697,
+      "grad_norm": 2.1048226356506348,
+      "learning_rate": 1.9411764705882355e-05,
+      "loss": 0.549,
+      "mean_token_accuracy": 0.7949124991893768,
+      "num_tokens": 136706.0,
+      "step": 100
+    },
+    {
+      "entropy": 1.1756585597991944,
+      "epoch": 0.10794896957801767,
+      "grad_norm": 2.2883429527282715,
+      "learning_rate": 2.1372549019607844e-05,
+      "loss": 0.4989,
+      "mean_token_accuracy": 0.8112820327281952,
+      "num_tokens": 150742.0,
+      "step": 110
+    },
+    {
+      "entropy": 1.1946371436119079,
+      "epoch": 0.11776251226692837,
+      "grad_norm": 2.2675540447235107,
+      "learning_rate": 2.3333333333333336e-05,
+      "loss": 0.5786,
+      "mean_token_accuracy": 0.7798411428928376,
+      "num_tokens": 164305.0,
+      "step": 120
+    },
+    {
+      "entropy": 1.19124995470047,
+      "epoch": 0.12757605495583907,
+      "grad_norm": 2.603771924972534,
+      "learning_rate": 2.5294117647058825e-05,
+      "loss": 0.5354,
+      "mean_token_accuracy": 0.7984326481819153,
+      "num_tokens": 177967.0,
+      "step": 130
+    },
+    {
+      "entropy": 1.1813685297966003,
+      "epoch": 0.13738959764474976,
+      "grad_norm": 2.699855089187622,
+      "learning_rate": 2.7254901960784314e-05,
+      "loss": 0.5117,
+      "mean_token_accuracy": 0.8128579616546631,
+      "num_tokens": 192090.0,
+      "step": 140
+    },
+    {
+      "entropy": 1.1989248156547547,
+      "epoch": 0.14720314033366044,
+      "grad_norm": 2.542903423309326,
+      "learning_rate": 2.9215686274509806e-05,
+      "loss": 0.5394,
+      "mean_token_accuracy": 0.7957534492015839,
+      "num_tokens": 206673.0,
+      "step": 150
+    },
+    {
+      "entropy": 1.1441561222076415,
+      "epoch": 0.15701668302257116,
+      "grad_norm": 3.7011725902557373,
+      "learning_rate": 3.11764705882353e-05,
+      "loss": 0.4586,
+      "mean_token_accuracy": 0.8275609016418457,
+      "num_tokens": 220293.0,
+      "step": 160
+    },
+    {
+      "entropy": 1.1597741365432739,
+      "epoch": 0.16683022571148184,
+      "grad_norm": 2.313307046890259,
+      "learning_rate": 3.3137254901960784e-05,
+      "loss": 0.4532,
+      "mean_token_accuracy": 0.8308035731315613,
+      "num_tokens": 234011.0,
+      "step": 170
+    },
+    {
+      "entropy": 1.1612919807434081,
+      "epoch": 0.17664376840039253,
+      "grad_norm": 2.9966866970062256,
+      "learning_rate": 3.5098039215686276e-05,
+      "loss": 0.5044,
+      "mean_token_accuracy": 0.8123882353305817,
+      "num_tokens": 247118.0,
+      "step": 180
+    },
+    {
+      "entropy": 1.1632250428199769,
+      "epoch": 0.18645731108930325,
+      "grad_norm": 3.3941473960876465,
+      "learning_rate": 3.705882352941177e-05,
+      "loss": 0.4761,
+      "mean_token_accuracy": 0.8212073087692261,
+      "num_tokens": 261365.0,
+      "step": 190
+    },
+    {
+      "entropy": 1.1553410053253175,
+      "epoch": 0.19627085377821393,
+      "grad_norm": 2.857267379760742,
+      "learning_rate": 3.9019607843137254e-05,
+      "loss": 0.4553,
+      "mean_token_accuracy": 0.8259550392627716,
+      "num_tokens": 274990.0,
+      "step": 200
+    },
+    {
+      "entropy": 1.152228093147278,
+      "epoch": 0.20608439646712462,
+      "grad_norm": 3.4860124588012695,
+      "learning_rate": 4.0980392156862746e-05,
+      "loss": 0.4613,
+      "mean_token_accuracy": 0.8292915642261505,
+      "num_tokens": 288512.0,
+      "step": 210
+    },
+    {
+      "entropy": 1.131454634666443,
+      "epoch": 0.21589793915603533,
+      "grad_norm": 2.9565322399139404,
+      "learning_rate": 4.294117647058823e-05,
+      "loss": 0.4439,
+      "mean_token_accuracy": 0.8337770104408264,
+      "num_tokens": 301852.0,
+      "step": 220
+    },
+    {
+      "entropy": 1.116087293624878,
+      "epoch": 0.22571148184494602,
+      "grad_norm": 3.363611936569214,
+      "learning_rate": 4.490196078431373e-05,
+      "loss": 0.4326,
+      "mean_token_accuracy": 0.836861526966095,
+      "num_tokens": 315850.0,
+      "step": 230
+    },
+    {
+      "entropy": 1.1328616857528686,
+      "epoch": 0.23552502453385674,
+      "grad_norm": 2.8903441429138184,
+      "learning_rate": 4.6862745098039216e-05,
+      "loss": 0.4509,
+      "mean_token_accuracy": 0.8328852355480194,
+      "num_tokens": 329271.0,
+      "step": 240
+    },
+    {
+      "entropy": 1.130015003681183,
+      "epoch": 0.24533856722276742,
+      "grad_norm": 3.509453773498535,
+      "learning_rate": 4.882352941176471e-05,
+      "loss": 0.4359,
+      "mean_token_accuracy": 0.8327082991600037,
+      "num_tokens": 343233.0,
+      "step": 250
+    },
+    {
+      "entropy": 1.1058894276618958,
+      "epoch": 0.25515210991167814,
+      "grad_norm": 3.33721923828125,
+      "learning_rate": 5.0784313725490194e-05,
+      "loss": 0.4356,
+      "mean_token_accuracy": 0.8358817815780639,
+      "num_tokens": 356887.0,
+      "step": 260
+    },
+    {
+      "entropy": 1.1200787544250488,
+      "epoch": 0.2649656526005888,
+      "grad_norm": 3.4578216075897217,
+      "learning_rate": 5.274509803921569e-05,
+      "loss": 0.4277,
+      "mean_token_accuracy": 0.8432585775852204,
+      "num_tokens": 370548.0,
+      "step": 270
+    },
+    {
+      "entropy": 1.1384179592132568,
+      "epoch": 0.2747791952894995,
+      "grad_norm": 3.2266058921813965,
+      "learning_rate": 5.4705882352941185e-05,
+      "loss": 0.4176,
+      "mean_token_accuracy": 0.8439722537994385,
+      "num_tokens": 384687.0,
+      "step": 280
+    },
+    {
+      "entropy": 1.1408962607383728,
+      "epoch": 0.2845927379784102,
+      "grad_norm": 3.087733030319214,
+      "learning_rate": 5.666666666666667e-05,
+      "loss": 0.445,
+      "mean_token_accuracy": 0.8325196325778961,
+      "num_tokens": 398358.0,
+      "step": 290
+    },
+    {
+      "entropy": 1.1371111273765564,
+      "epoch": 0.2944062806673209,
+      "grad_norm": 3.274041175842285,
+      "learning_rate": 5.862745098039216e-05,
+      "loss": 0.4244,
+      "mean_token_accuracy": 0.8416358292102813,
+      "num_tokens": 412700.0,
+      "step": 300
+    },
+    {
+      "entropy": 1.1113396763801575,
+      "epoch": 0.3042198233562316,
+      "grad_norm": 2.8629136085510254,
+      "learning_rate": 6.058823529411765e-05,
+      "loss": 0.3977,
+      "mean_token_accuracy": 0.8494885206222534,
+      "num_tokens": 427312.0,
+      "step": 310
+    },
+    {
+      "entropy": 1.1260488390922547,
+      "epoch": 0.3140333660451423,
+      "grad_norm": 3.125242233276367,
+      "learning_rate": 6.254901960784314e-05,
+      "loss": 0.4073,
+      "mean_token_accuracy": 0.8544234335422516,
+      "num_tokens": 441507.0,
+      "step": 320
+    },
+    {
+      "entropy": 1.1475743889808654,
+      "epoch": 0.323846908734053,
+      "grad_norm": 4.150859355926514,
+      "learning_rate": 6.450980392156864e-05,
+      "loss": 0.4422,
+      "mean_token_accuracy": 0.8314357757568359,
+      "num_tokens": 454411.0,
+      "step": 330
+    },
+    {
+      "entropy": 1.1072185158729553,
+      "epoch": 0.3336604514229637,
+      "grad_norm": 3.3013269901275635,
+      "learning_rate": 6.647058823529411e-05,
+      "loss": 0.4181,
+      "mean_token_accuracy": 0.8433676600456238,
+      "num_tokens": 467641.0,
+      "step": 340
+    },
+    {
+      "entropy": 1.115389347076416,
+      "epoch": 0.3434739941118744,
+      "grad_norm": 2.7073593139648438,
+      "learning_rate": 6.843137254901961e-05,
+      "loss": 0.45,
+      "mean_token_accuracy": 0.8306823253631592,
+      "num_tokens": 481724.0,
+      "step": 350
+    },
+    {
+      "entropy": 1.0993050813674927,
+      "epoch": 0.35328753680078506,
+      "grad_norm": 4.06768798828125,
+      "learning_rate": 7.039215686274511e-05,
+      "loss": 0.3839,
+      "mean_token_accuracy": 0.8561393201351166,
+      "num_tokens": 495345.0,
+      "step": 360
+    },
+    {
+      "entropy": 1.1100080490112305,
+      "epoch": 0.3631010794896958,
+      "grad_norm": 2.8841543197631836,
+      "learning_rate": 7.23529411764706e-05,
+      "loss": 0.4263,
+      "mean_token_accuracy": 0.8410856664180756,
+      "num_tokens": 509800.0,
+      "step": 370
+    },
+    {
+      "entropy": 1.1247098922729493,
+      "epoch": 0.3729146221786065,
+      "grad_norm": 4.105075836181641,
+      "learning_rate": 7.431372549019608e-05,
+      "loss": 0.4157,
+      "mean_token_accuracy": 0.8427928566932679,
+      "num_tokens": 523514.0,
+      "step": 380
+    },
+    {
+      "entropy": 1.1408787369728088,
+      "epoch": 0.38272816486751715,
+      "grad_norm": 2.9242143630981445,
+      "learning_rate": 7.627450980392157e-05,
+      "loss": 0.4266,
+      "mean_token_accuracy": 0.8396799921989441,
+      "num_tokens": 537693.0,
+      "step": 390
+    },
+    {
+      "entropy": 1.1217655539512634,
+      "epoch": 0.39254170755642787,
+      "grad_norm": 2.826019763946533,
+      "learning_rate": 7.823529411764707e-05,
+      "loss": 0.416,
+      "mean_token_accuracy": 0.8408508718013763,
+      "num_tokens": 551457.0,
+      "step": 400
+    },
+    {
+      "entropy": 1.157455313205719,
+      "epoch": 0.4023552502453386,
+      "grad_norm": 3.266930103302002,
+      "learning_rate": 8.019607843137255e-05,
+      "loss": 0.3848,
+      "mean_token_accuracy": 0.8575535297393799,
+      "num_tokens": 565157.0,
+      "step": 410
+    },
+    {
+      "entropy": 1.1714832425117492,
+      "epoch": 0.41216879293424924,
+      "grad_norm": 2.520620584487915,
+      "learning_rate": 8.215686274509804e-05,
+      "loss": 0.3873,
+      "mean_token_accuracy": 0.8593390583992004,
+      "num_tokens": 579080.0,
+      "step": 420
+    },
+    {
+      "entropy": 1.1691359281539917,
+      "epoch": 0.42198233562315995,
+      "grad_norm": 3.3553106784820557,
+      "learning_rate": 8.411764705882354e-05,
+      "loss": 0.3929,
+      "mean_token_accuracy": 0.848738569021225,
+      "num_tokens": 592922.0,
+      "step": 430
+    },
+    {
+      "entropy": 1.161519956588745,
+      "epoch": 0.43179587831207067,
+      "grad_norm": 3.784954309463501,
+      "learning_rate": 8.607843137254903e-05,
+      "loss": 0.3447,
+      "mean_token_accuracy": 0.8669132351875305,
+      "num_tokens": 607114.0,
+      "step": 440
+    },
+    {
+      "entropy": 1.175284707546234,
+      "epoch": 0.44160942100098133,
+      "grad_norm": 3.634345293045044,
+      "learning_rate": 8.80392156862745e-05,
+      "loss": 0.4092,
+      "mean_token_accuracy": 0.8398001670837403,
+      "num_tokens": 620907.0,
+      "step": 450
+    },
+    {
+      "entropy": 1.1871889114379883,
+      "epoch": 0.45142296368989204,
+      "grad_norm": 3.3342034816741943,
+      "learning_rate": 9e-05,
+      "loss": 0.3739,
+      "mean_token_accuracy": 0.8601496338844299,
+      "num_tokens": 635363.0,
+      "step": 460
+    },
+    {
+      "entropy": 1.163673198223114,
+      "epoch": 0.46123650637880276,
+      "grad_norm": 3.118102550506592,
+      "learning_rate": 9.196078431372549e-05,
+      "loss": 0.4217,
+      "mean_token_accuracy": 0.8399349391460419,
+      "num_tokens": 648200.0,
+      "step": 470
+    },
+    {
+      "entropy": 1.206890833377838,
+      "epoch": 0.47105004906771347,
+      "grad_norm": 3.5182156562805176,
+      "learning_rate": 9.392156862745099e-05,
+      "loss": 0.4306,
+      "mean_token_accuracy": 0.8402868688106537,
+      "num_tokens": 662920.0,
+      "step": 480
+    },
+    {
+      "entropy": 1.1812933087348938,
+      "epoch": 0.48086359175662413,
+      "grad_norm": 3.2047181129455566,
+      "learning_rate": 9.588235294117648e-05,
+      "loss": 0.3924,
+      "mean_token_accuracy": 0.854317981004715,
+      "num_tokens": 676646.0,
+      "step": 490
+    },
+    {
+      "entropy": 1.214794135093689,
+      "epoch": 0.49067713444553485,
+      "grad_norm": 3.524360418319702,
+      "learning_rate": 9.784313725490196e-05,
+      "loss": 0.3901,
+      "mean_token_accuracy": 0.8527312636375427,
+      "num_tokens": 690435.0,
+      "step": 500
+    },
+    {
+      "entropy": 1.223682713508606,
+      "epoch": 0.5004906771344455,
+      "grad_norm": 3.3886806964874268,
+      "learning_rate": 9.980392156862746e-05,
+      "loss": 0.3784,
+      "mean_token_accuracy": 0.8536435484886169,
+      "num_tokens": 704664.0,
+      "step": 510
+    },
+    {
+      "entropy": 1.2168277263641358,
+      "epoch": 0.5103042198233563,
+      "grad_norm": 3.704810380935669,
+      "learning_rate": 9.999978670840125e-05,
+      "loss": 0.3918,
+      "mean_token_accuracy": 0.8539480805397034,
+      "num_tokens": 718834.0,
+      "step": 520
+    },
+    {
+      "entropy": 1.2357110142707826,
+      "epoch": 0.5201177625122669,
+      "grad_norm": 3.3917276859283447,
+      "learning_rate": 9.999904940644553e-05,
+      "loss": 0.3874,
+      "mean_token_accuracy": 0.8504210293293,
+      "num_tokens": 733172.0,
+      "step": 530
+    },
+    {
+      "entropy": 1.2259963989257812,
+      "epoch": 0.5299313052011776,
+      "grad_norm": 3.740762233734131,
+      "learning_rate": 9.999778546866733e-05,
+      "loss": 0.3979,
+      "mean_token_accuracy": 0.8438404977321625,
+      "num_tokens": 746577.0,
+      "step": 540
+    },
+    {
+      "entropy": 1.199124240875244,
+      "epoch": 0.5397448478900884,
+      "grad_norm": 2.874812126159668,
+      "learning_rate": 9.999599490837959e-05,
+      "loss": 0.363,
+      "mean_token_accuracy": 0.8524531662464142,
+      "num_tokens": 760378.0,
+      "step": 550
+    },
+    {
+      "entropy": 1.2345012068748473,
+      "epoch": 0.549558390578999,
+      "grad_norm": 2.862276554107666,
+      "learning_rate": 9.999367774444214e-05,
+      "loss": 0.4005,
+      "mean_token_accuracy": 0.846814078092575,
+      "num_tokens": 774528.0,
+      "step": 560
+    },
+    {
+      "entropy": 1.1909167528152467,
+      "epoch": 0.5593719332679097,
+      "grad_norm": 2.375776529312134,
+      "learning_rate": 9.999083400126145e-05,
+      "loss": 0.3588,
+      "mean_token_accuracy": 0.8638595938682556,
+      "num_tokens": 787978.0,
+      "step": 570
+    },
+    {
+      "entropy": 1.1990839481353759,
+      "epoch": 0.5691854759568205,
+      "grad_norm": 2.649693250656128,
+      "learning_rate": 9.998746370879049e-05,
+      "loss": 0.3629,
+      "mean_token_accuracy": 0.8618560075759888,
+      "num_tokens": 802122.0,
+      "step": 580
+    },
+    {
+      "entropy": 1.1829696774482727,
+      "epoch": 0.5789990186457311,
+      "grad_norm": 2.828573703765869,
+      "learning_rate": 9.99835669025282e-05,
+      "loss": 0.3294,
+      "mean_token_accuracy": 0.8737357437610627,
+      "num_tokens": 816567.0,
+      "step": 590
+    },
+    {
+      "entropy": 1.2039927005767823,
+      "epoch": 0.5888125613346418,
+      "grad_norm": 2.849315643310547,
+      "learning_rate": 9.997914362351934e-05,
+      "loss": 0.3045,
+      "mean_token_accuracy": 0.8843503415584564,
+      "num_tokens": 830550.0,
+      "step": 600
+    },
+    {
+      "entropy": 1.1890666484832764,
+      "epoch": 0.5986261040235525,
+      "grad_norm": 2.388582229614258,
+      "learning_rate": 9.997419391835396e-05,
+      "loss": 0.3378,
+      "mean_token_accuracy": 0.8758296966552734,
+      "num_tokens": 843779.0,
+      "step": 610
+    },
+    {
+      "entropy": 1.213183867931366,
+      "epoch": 0.6084396467124632,
+      "grad_norm": 3.474524736404419,
+      "learning_rate": 9.996871783916687e-05,
+      "loss": 0.3504,
+      "mean_token_accuracy": 0.8678486227989197,
+      "num_tokens": 857498.0,
+      "step": 620
+    },
+    {
+      "entropy": 1.187168836593628,
+      "epoch": 0.6182531894013739,
+      "grad_norm": 3.741032123565674,
+      "learning_rate": 9.996271544363717e-05,
+      "loss": 0.3656,
+      "mean_token_accuracy": 0.8562004089355468,
+      "num_tokens": 871163.0,
+      "step": 630
+    },
+    {
+      "entropy": 1.2092647194862365,
+      "epoch": 0.6280667320902846,
+      "grad_norm": 2.7920150756835938,
+      "learning_rate": 9.995618679498758e-05,
+      "loss": 0.3654,
+      "mean_token_accuracy": 0.864486688375473,
+      "num_tokens": 884912.0,
+      "step": 640
+    },
+    {
+      "entropy": 1.1932833075523377,
+      "epoch": 0.6378802747791953,
+      "grad_norm": 3.4623348712921143,
+      "learning_rate": 9.994913196198381e-05,
+      "loss": 0.3384,
+      "mean_token_accuracy": 0.8657363414764404,
+      "num_tokens": 898102.0,
+      "step": 650
+    },
+    {
+      "entropy": 1.210770559310913,
+      "epoch": 0.647693817468106,
+      "grad_norm": 3.1084299087524414,
+      "learning_rate": 9.994155101893386e-05,
+      "loss": 0.3564,
+      "mean_token_accuracy": 0.8611253321170806,
+      "num_tokens": 911902.0,
+      "step": 660
+    },
+    {
+      "entropy": 1.2453441143035888,
+      "epoch": 0.6575073601570167,
+      "grad_norm": 2.4232285022735596,
+      "learning_rate": 9.993344404568712e-05,
+      "loss": 0.3996,
+      "mean_token_accuracy": 0.852339768409729,
+      "num_tokens": 926137.0,
+      "step": 670
+    },
+    {
+      "entropy": 1.1893890380859375,
+      "epoch": 0.6673209028459274,
+      "grad_norm": 2.9850914478302,
+      "learning_rate": 9.992481112763372e-05,
+      "loss": 0.3151,
+      "mean_token_accuracy": 0.8825689435005188,
+      "num_tokens": 940507.0,
+      "step": 680
+    },
+    {
+      "entropy": 1.1946087718009948,
+      "epoch": 0.677134445534838,
+      "grad_norm": 2.2807886600494385,
+      "learning_rate": 9.991565235570341e-05,
+      "loss": 0.3327,
+      "mean_token_accuracy": 0.8753289222717285,
+      "num_tokens": 954934.0,
+      "step": 690
+    },
+    {
+      "entropy": 1.2035173773765564,
+      "epoch": 0.6869479882237488,
+      "grad_norm": 3.3408844470977783,
+      "learning_rate": 9.990596782636481e-05,
+      "loss": 0.3865,
+      "mean_token_accuracy": 0.855133694410324,
+      "num_tokens": 968760.0,
+      "step": 700
+    },
+    {
+      "entropy": 1.1885886549949647,
+      "epoch": 0.6967615309126595,
+      "grad_norm": 3.780911922454834,
+      "learning_rate": 9.989575764162426e-05,
+      "loss": 0.3145,
+      "mean_token_accuracy": 0.8819782733917236,
+      "num_tokens": 982644.0,
+      "step": 710
+    },
+    {
+      "entropy": 1.1796546936035157,
+      "epoch": 0.7065750736015701,
+      "grad_norm": 2.4805071353912354,
+      "learning_rate": 9.988502190902476e-05,
+      "loss": 0.3532,
+      "mean_token_accuracy": 0.8675242066383362,
+      "num_tokens": 996817.0,
+      "step": 720
+    },
+    {
+      "entropy": 1.1823334097862244,
+      "epoch": 0.7163886162904809,
+      "grad_norm": 2.9788618087768555,
+      "learning_rate": 9.987376074164491e-05,
+      "loss": 0.3366,
+      "mean_token_accuracy": 0.8697103559970856,
+      "num_tokens": 1010008.0,
+      "step": 730
+    },
+    {
+      "entropy": 1.1985557913780212,
+      "epoch": 0.7262021589793916,
+      "grad_norm": 2.5642588138580322,
+      "learning_rate": 9.986197425809766e-05,
+      "loss": 0.3305,
+      "mean_token_accuracy": 0.8674235641956329,
+      "num_tokens": 1023812.0,
+      "step": 740
+    },
+    {
+      "entropy": 1.1962911128997802,
+      "epoch": 0.7360157016683022,
+      "grad_norm": 3.236114978790283,
+      "learning_rate": 9.984966258252903e-05,
+      "loss": 0.3707,
+      "mean_token_accuracy": 0.8536763250827789,
+      "num_tokens": 1037399.0,
+      "step": 750
+    },
+    {
+      "entropy": 1.18121737241745,
+      "epoch": 0.745829244357213,
+      "grad_norm": 2.707700252532959,
+      "learning_rate": 9.983682584461688e-05,
+      "loss": 0.2948,
+      "mean_token_accuracy": 0.8834168791770936,
+      "num_tokens": 1050972.0,
+      "step": 760
+    },
+    {
+      "entropy": 1.190599763393402,
+      "epoch": 0.7556427870461236,
+      "grad_norm": 3.883239507675171,
+      "learning_rate": 9.982346417956949e-05,
+      "loss": 0.3678,
+      "mean_token_accuracy": 0.8604642748832703,
+      "num_tokens": 1064581.0,
+      "step": 770
+    },
+    {
+      "entropy": 1.1711848378181458,
+      "epoch": 0.7654563297350343,
+      "grad_norm": 2.420788049697876,
+      "learning_rate": 9.98095777281242e-05,
+      "loss": 0.3255,
+      "mean_token_accuracy": 0.8733546793460846,
+      "num_tokens": 1078433.0,
+      "step": 780
+    },
+    {
+      "entropy": 1.175394594669342,
+      "epoch": 0.7752698724239451,
+      "grad_norm": 3.810960292816162,
+      "learning_rate": 9.979516663654582e-05,
+      "loss": 0.3239,
+      "mean_token_accuracy": 0.8758346498012543,
+      "num_tokens": 1091839.0,
+      "step": 790
+    },
+    {
+      "entropy": 1.1753032803535461,
+      "epoch": 0.7850834151128557,
+      "grad_norm": 2.5673933029174805,
+      "learning_rate": 9.978023105662519e-05,
+      "loss": 0.3252,
+      "mean_token_accuracy": 0.8731696844100952,
+      "num_tokens": 1105747.0,
+      "step": 800
+    },
+    {
+      "entropy": 1.1706650733947754,
+      "epoch": 0.7948969578017664,
+      "grad_norm": 3.5536954402923584,
+      "learning_rate": 9.976477114567752e-05,
+      "loss": 0.3254,
+      "mean_token_accuracy": 0.881614089012146,
+      "num_tokens": 1119344.0,
+      "step": 810
+    },
+    {
+      "entropy": 1.1545325756072997,
+      "epoch": 0.8047105004906772,
+      "grad_norm": 3.0351312160491943,
+      "learning_rate": 9.974878706654076e-05,
+      "loss": 0.3075,
+      "mean_token_accuracy": 0.880529397726059,
+      "num_tokens": 1133252.0,
+      "step": 820
+    },
+    {
+      "entropy": 1.1713720440864563,
+      "epoch": 0.8145240431795878,
+      "grad_norm": 2.81852126121521,
+      "learning_rate": 9.97322789875739e-05,
+      "loss": 0.3263,
+      "mean_token_accuracy": 0.8688643455505372,
+      "num_tokens": 1147368.0,
+      "step": 830
+    },
+    {
+      "entropy": 1.152477788925171,
+      "epoch": 0.8243375858684985,
+      "grad_norm": 3.188549757003784,
+      "learning_rate": 9.971524708265515e-05,
+      "loss": 0.3062,
+      "mean_token_accuracy": 0.8836665332317353,
+      "num_tokens": 1161165.0,
+      "step": 840
+    },
+    {
+      "entropy": 1.1690003156661988,
+      "epoch": 0.8341511285574092,
+      "grad_norm": 2.653815269470215,
+      "learning_rate": 9.969769153118014e-05,
+      "loss": 0.3191,
+      "mean_token_accuracy": 0.8780744910240174,
+      "num_tokens": 1174814.0,
+      "step": 850
+    },
+    {
+      "entropy": 1.182269549369812,
+      "epoch": 0.8439646712463199,
+      "grad_norm": 2.8399338722229004,
+      "learning_rate": 9.967961251806005e-05,
+      "loss": 0.36,
+      "mean_token_accuracy": 0.864084666967392,
+      "num_tokens": 1188967.0,
+      "step": 860
+    },
+    {
+      "entropy": 1.15756698846817,
+      "epoch": 0.8537782139352306,
+      "grad_norm": 3.0958690643310547,
+      "learning_rate": 9.966101023371961e-05,
+      "loss": 0.3226,
+      "mean_token_accuracy": 0.8756173610687256,
+      "num_tokens": 1202907.0,
+      "step": 870
+    },
+    {
+      "entropy": 1.1475032567977905,
+      "epoch": 0.8635917566241413,
+      "grad_norm": 2.4389164447784424,
+      "learning_rate": 9.964188487409512e-05,
+      "loss": 0.2938,
+      "mean_token_accuracy": 0.8890304386615753,
+      "num_tokens": 1216865.0,
+      "step": 880
+    },
+    {
+      "entropy": 1.1877331137657166,
+      "epoch": 0.873405299313052,
+      "grad_norm": 3.5680530071258545,
+      "learning_rate": 9.962223664063241e-05,
+      "loss": 0.304,
+      "mean_token_accuracy": 0.8802395105361939,
+      "num_tokens": 1230849.0,
+      "step": 890
+    },
+    {
+      "entropy": 1.1612010836601256,
+      "epoch": 0.8832188420019627,
+      "grad_norm": 2.2012088298797607,
+      "learning_rate": 9.960206574028468e-05,
+      "loss": 0.3228,
+      "mean_token_accuracy": 0.8745718121528625,
+      "num_tokens": 1243965.0,
+      "step": 900
+    },
+    {
+      "entropy": 1.1542456030845643,
+      "epoch": 0.8930323846908734,
+      "grad_norm": 4.392910480499268,
+      "learning_rate": 9.958137238551036e-05,
+      "loss": 0.3072,
+      "mean_token_accuracy": 0.8850542485713959,
+      "num_tokens": 1257918.0,
+      "step": 910
+    },
+    {
+      "entropy": 1.1391899585723877,
+      "epoch": 0.9028459273797841,
+      "grad_norm": 2.334970712661743,
+      "learning_rate": 9.956015679427082e-05,
+      "loss": 0.324,
+      "mean_token_accuracy": 0.8772465825080872,
+      "num_tokens": 1271980.0,
+      "step": 920
+    },
+    {
+      "entropy": 1.1350346326828002,
+      "epoch": 0.9126594700686947,
+      "grad_norm": 2.9113283157348633,
+      "learning_rate": 9.953841919002812e-05,
+      "loss": 0.3036,
+      "mean_token_accuracy": 0.8844376146793366,
+      "num_tokens": 1286287.0,
+      "step": 930
+    },
+    {
+      "entropy": 1.1404588222503662,
+      "epoch": 0.9224730127576055,
+      "grad_norm": 2.446030855178833,
+      "learning_rate": 9.951615980174261e-05,
+      "loss": 0.2895,
+      "mean_token_accuracy": 0.8874838829040528,
+      "num_tokens": 1300381.0,
+      "step": 940
+    },
+    {
+      "entropy": 1.139818048477173,
+      "epoch": 0.9322865554465162,
+      "grad_norm": 4.113491058349609,
+      "learning_rate": 9.94933788638706e-05,
+      "loss": 0.2981,
+      "mean_token_accuracy": 0.8840461432933807,
+      "num_tokens": 1314086.0,
+      "step": 950
+    },
+    {
+      "entropy": 1.1538641214370728,
+      "epoch": 0.9421000981354269,
+      "grad_norm": 2.8096957206726074,
+      "learning_rate": 9.947007661636182e-05,
+      "loss": 0.3039,
+      "mean_token_accuracy": 0.8843685805797576,
+      "num_tokens": 1328434.0,
+      "step": 960
+    },
+    {
+      "entropy": 1.129905092716217,
+      "epoch": 0.9519136408243376,
+      "grad_norm": 2.2897799015045166,
+      "learning_rate": 9.944625330465688e-05,
+      "loss": 0.307,
+      "mean_token_accuracy": 0.8870210766792297,
+      "num_tokens": 1341971.0,
+      "step": 970
+    },
+    {
+      "entropy": 1.1422709345817565,
+      "epoch": 0.9617271835132483,
+      "grad_norm": 3.0245301723480225,
+      "learning_rate": 9.942190917968474e-05,
+      "loss": 0.267,
+      "mean_token_accuracy": 0.895936119556427,
+      "num_tokens": 1356685.0,
+      "step": 980
+    },
+    {
+      "entropy": 1.1508997082710266,
+      "epoch": 0.971540726202159,
+      "grad_norm": 3.526949882507324,
+      "learning_rate": 9.939704449786005e-05,
+      "loss": 0.309,
+      "mean_token_accuracy": 0.8837933301925659,
+      "num_tokens": 1370541.0,
+      "step": 990
+    },
+    {
+      "entropy": 1.1339004516601563,
+      "epoch": 0.9813542688910697,
+      "grad_norm": 5.214639663696289,
+      "learning_rate": 9.937165952108042e-05,
+      "loss": 0.2651,
+      "mean_token_accuracy": 0.8960758924484253,
+      "num_tokens": 1383962.0,
+      "step": 1000
+    },
+    {
+      "entropy": 1.1374788284301758,
+      "epoch": 0.9911678115799804,
+      "grad_norm": 3.573387622833252,
+      "learning_rate": 9.93457545167237e-05,
+      "loss": 0.2806,
+      "mean_token_accuracy": 0.8932440996170044,
+      "num_tokens": 1397902.0,
+      "step": 1010
+    },
+    {
+      "entropy": 1.1547051310539245,
+      "epoch": 1.000981354268891,
+      "grad_norm": 2.75999116897583,
+      "learning_rate": 9.931932975764516e-05,
+      "loss": 0.2955,
+      "mean_token_accuracy": 0.8851269900798797,
+      "num_tokens": 1410184.0,
+      "step": 1020
+    },
+    {
+      "entropy": 1.1515445828437805,
+      "epoch": 1.0107948969578018,
+      "grad_norm": 3.056048631668091,
+      "learning_rate": 9.929238552217455e-05,
+      "loss": 0.2466,
+      "mean_token_accuracy": 0.9091300308704376,
+      "num_tokens": 1424018.0,
+      "step": 1030
+    },
+    {
+      "entropy": 1.1538839101791383,
+      "epoch": 1.0206084396467126,
+      "grad_norm": 3.0634031295776367,
+      "learning_rate": 9.92649220941133e-05,
+      "loss": 0.2364,
+      "mean_token_accuracy": 0.9111003875732422,
+      "num_tokens": 1438068.0,
+      "step": 1040
+    },
+    {
+      "entropy": 1.1284090638160706,
+      "epoch": 1.030421982335623,
+      "grad_norm": 3.0699167251586914,
+      "learning_rate": 9.923693976273139e-05,
+      "loss": 0.2111,
+      "mean_token_accuracy": 0.9171704292297364,
+      "num_tokens": 1452019.0,
+      "step": 1050
+    },
+    {
+      "entropy": 1.101210856437683,
+      "epoch": 1.0402355250245339,
+      "grad_norm": 4.536798477172852,
+      "learning_rate": 9.920843882276437e-05,
+      "loss": 0.2267,
+      "mean_token_accuracy": 0.9128627121448517,
+      "num_tokens": 1465698.0,
+      "step": 1060
+    },
+    {
+      "entropy": 1.1133004069328307,
+      "epoch": 1.0500490677134446,
+      "grad_norm": 3.4195828437805176,
+      "learning_rate": 9.917941957441028e-05,
+      "loss": 0.2301,
+      "mean_token_accuracy": 0.9139590561389923,
+      "num_tokens": 1479631.0,
+      "step": 1070
+    },
+    {
+      "entropy": 1.1176589846611023,
+      "epoch": 1.0598626104023552,
+      "grad_norm": 3.2385478019714355,
+      "learning_rate": 9.914988232332647e-05,
+      "loss": 0.2471,
+      "mean_token_accuracy": 0.9055424690246582,
+      "num_tokens": 1492799.0,
+      "step": 1080
+    },
+    {
+      "entropy": 1.121683955192566,
+      "epoch": 1.069676153091266,
+      "grad_norm": 3.5210251808166504,
+      "learning_rate": 9.91198273806263e-05,
+      "loss": 0.2187,
+      "mean_token_accuracy": 0.9202529132366181,
+      "num_tokens": 1506531.0,
+      "step": 1090
+    },
+    {
+      "entropy": 1.1105925798416139,
+      "epoch": 1.0794896957801767,
+      "grad_norm": 2.8446905612945557,
+      "learning_rate": 9.908925506287603e-05,
+      "loss": 0.2229,
+      "mean_token_accuracy": 0.9195864617824554,
+      "num_tokens": 1520031.0,
+      "step": 1100
+    },
+    {
+      "entropy": 1.126805567741394,
+      "epoch": 1.0893032384690873,
+      "grad_norm": 2.4356892108917236,
+      "learning_rate": 9.905816569209127e-05,
+      "loss": 0.2201,
+      "mean_token_accuracy": 0.9141518414020539,
+      "num_tokens": 1534436.0,
+      "step": 1110
+    },
+    {
+      "entropy": 1.1253270387649537,
+      "epoch": 1.099116781157998,
+      "grad_norm": 2.5282037258148193,
+      "learning_rate": 9.902655959573384e-05,
+      "loss": 0.2398,
+      "mean_token_accuracy": 0.906571364402771,
+      "num_tokens": 1547962.0,
+      "step": 1120
+    },
+    {
+      "entropy": 1.1043806791305542,
+      "epoch": 1.1089303238469088,
+      "grad_norm": 2.932882308959961,
+      "learning_rate": 9.899443710670807e-05,
+      "loss": 0.2462,
+      "mean_token_accuracy": 0.9070131957530976,
+      "num_tokens": 1561849.0,
+      "step": 1130
+    },
+    {
+      "entropy": 1.094444954395294,
+      "epoch": 1.1187438665358194,
+      "grad_norm": 3.0343003273010254,
+      "learning_rate": 9.896179856335749e-05,
+      "loss": 0.21,
+      "mean_token_accuracy": 0.9164992868900299,
+      "num_tokens": 1575503.0,
+      "step": 1140
+    },
+    {
+      "entropy": 1.1073559522628784,
+      "epoch": 1.1285574092247301,
+      "grad_norm": 2.9921181201934814,
+      "learning_rate": 9.892864430946114e-05,
+      "loss": 0.23,
+      "mean_token_accuracy": 0.9125438928604126,
+      "num_tokens": 1589893.0,
+      "step": 1150
+    },
+    {
+      "entropy": 1.1016913652420044,
+      "epoch": 1.138370951913641,
+      "grad_norm": 2.405503034591675,
+      "learning_rate": 9.889497469423004e-05,
+      "loss": 0.2017,
+      "mean_token_accuracy": 0.9252783298492432,
+      "num_tokens": 1603816.0,
+      "step": 1160
+    },
+    {
+      "entropy": 1.11280415058136,
+      "epoch": 1.1481844946025515,
+      "grad_norm": 3.991299629211426,
+      "learning_rate": 9.886079007230341e-05,
+      "loss": 0.226,
+      "mean_token_accuracy": 0.9137633264064788,
+      "num_tokens": 1617087.0,
+      "step": 1170
+    },
+    {
+      "entropy": 1.1015000700950623,
+      "epoch": 1.1579980372914622,
+      "grad_norm": 3.361809492111206,
+      "learning_rate": 9.882609080374505e-05,
+      "loss": 0.2151,
+      "mean_token_accuracy": 0.9169393658638001,
+      "num_tokens": 1631558.0,
+      "step": 1180
+    },
+    {
+      "entropy": 1.1231618046760559,
+      "epoch": 1.167811579980373,
+      "grad_norm": 3.2894387245178223,
+      "learning_rate": 9.879087725403947e-05,
+      "loss": 0.2256,
+      "mean_token_accuracy": 0.9099008500576019,
+      "num_tokens": 1645944.0,
+      "step": 1190
+    },
+    {
+      "entropy": 1.0991198301315308,
+      "epoch": 1.1776251226692835,
+      "grad_norm": 3.0466201305389404,
+      "learning_rate": 9.875514979408801e-05,
+      "loss": 0.2136,
+      "mean_token_accuracy": 0.9225036799907684,
+      "num_tokens": 1659636.0,
+      "step": 1200
+    },
+    {
+      "entropy": 1.093700921535492,
+      "epoch": 1.1874386653581943,
+      "grad_norm": 3.4141945838928223,
+      "learning_rate": 9.871890880020508e-05,
+      "loss": 0.2091,
+      "mean_token_accuracy": 0.9208812236785888,
+      "num_tokens": 1673078.0,
+      "step": 1210
+    },
+    {
+      "entropy": 1.1064150214195252,
+      "epoch": 1.197252208047105,
+      "grad_norm": 3.0487678050994873,
+      "learning_rate": 9.868215465411403e-05,
+      "loss": 0.2059,
+      "mean_token_accuracy": 0.9218286991119384,
+      "num_tokens": 1687154.0,
+      "step": 1220
+    },
+    {
+      "entropy": 1.109318232536316,
+      "epoch": 1.2070657507360156,
+      "grad_norm": 3.9022064208984375,
+      "learning_rate": 9.864488774294323e-05,
+      "loss": 0.2123,
+      "mean_token_accuracy": 0.9169949591159821,
+      "num_tokens": 1701533.0,
+      "step": 1230
+    },
+    {
+      "entropy": 1.1062457799911498,
+      "epoch": 1.2168792934249264,
+      "grad_norm": 4.482473850250244,
+      "learning_rate": 9.860710845922194e-05,
+      "loss": 0.2243,
+      "mean_token_accuracy": 0.9112108528614045,
+      "num_tokens": 1714722.0,
+      "step": 1240
+    },
+    {
+      "entropy": 1.0982243537902832,
+      "epoch": 1.2266928361138372,
+      "grad_norm": 2.623514175415039,
+      "learning_rate": 9.856881720087618e-05,
+      "loss": 0.2268,
+      "mean_token_accuracy": 0.9161077320575715,
+      "num_tokens": 1727989.0,
+      "step": 1250
+    },
+    {
+      "entropy": 1.1158525943756104,
+      "epoch": 1.2365063788027477,
+      "grad_norm": 3.0516302585601807,
+      "learning_rate": 9.853001437122465e-05,
+      "loss": 0.2153,
+      "mean_token_accuracy": 0.9184135735034943,
+      "num_tokens": 1742532.0,
+      "step": 1260
+    },
+    {
+      "entropy": 1.1110124349594117,
+      "epoch": 1.2463199214916585,
+      "grad_norm": 2.7834887504577637,
+      "learning_rate": 9.849070037897427e-05,
+      "loss": 0.1982,
+      "mean_token_accuracy": 0.9278346180915833,
+      "num_tokens": 1755690.0,
+      "step": 1270
+    },
+    {
+      "entropy": 1.0998762726783753,
+      "epoch": 1.2561334641805693,
+      "grad_norm": 3.024761438369751,
+      "learning_rate": 9.845087563821608e-05,
+      "loss": 0.2325,
+      "mean_token_accuracy": 0.9124217391014099,
+      "num_tokens": 1769918.0,
+      "step": 1280
+    },
+    {
+      "entropy": 1.105711567401886,
+      "epoch": 1.2659470068694798,
+      "grad_norm": 4.066706657409668,
+      "learning_rate": 9.841054056842073e-05,
+      "loss": 0.2054,
+      "mean_token_accuracy": 0.9234851360321045,
+      "num_tokens": 1783944.0,
+      "step": 1290
+    },
+    {
+      "entropy": 1.0856220126152039,
+      "epoch": 1.2757605495583906,
+      "grad_norm": 3.5071566104888916,
+      "learning_rate": 9.836969559443416e-05,
+      "loss": 0.1895,
+      "mean_token_accuracy": 0.9265620410442352,
+      "num_tokens": 1797715.0,
+      "step": 1300
+    },
+    {
+      "entropy": 1.082138192653656,
+      "epoch": 1.2855740922473013,
+      "grad_norm": 3.116482734680176,
+      "learning_rate": 9.832834114647307e-05,
+      "loss": 0.2269,
+      "mean_token_accuracy": 0.9138248920440674,
+      "num_tokens": 1811354.0,
+      "step": 1310
+    },
+    {
+      "entropy": 1.0945980429649353,
+      "epoch": 1.295387634936212,
+      "grad_norm": 3.037992238998413,
+      "learning_rate": 9.828647766012044e-05,
+      "loss": 0.2251,
+      "mean_token_accuracy": 0.9168272972106933,
+      "num_tokens": 1825286.0,
+      "step": 1320
+    },
+    {
+      "entropy": 1.1021912217140197,
+      "epoch": 1.3052011776251227,
+      "grad_norm": 3.164567232131958,
+      "learning_rate": 9.824410557632087e-05,
+      "loss": 0.2202,
+      "mean_token_accuracy": 0.9173984825611115,
+      "num_tokens": 1839043.0,
+      "step": 1330
+    },
+    {
+      "entropy": 1.0630751252174377,
+      "epoch": 1.3150147203140334,
+      "grad_norm": 2.8916590213775635,
+      "learning_rate": 9.820122534137597e-05,
+      "loss": 0.1924,
+      "mean_token_accuracy": 0.9297727942466736,
+      "num_tokens": 1852342.0,
+      "step": 1340
+    },
+    {
+      "entropy": 1.0630465269088745,
+      "epoch": 1.324828263002944,
+      "grad_norm": 4.064034461975098,
+      "learning_rate": 9.815783740693972e-05,
+      "loss": 0.2026,
+      "mean_token_accuracy": 0.9194791316986084,
+      "num_tokens": 1866189.0,
+      "step": 1350
+    },
+    {
+      "entropy": 1.0882034778594971,
+      "epoch": 1.3346418056918548,
+      "grad_norm": 3.0452957153320312,
+      "learning_rate": 9.811394223001358e-05,
+      "loss": 0.2095,
+      "mean_token_accuracy": 0.9215176224708557,
+      "num_tokens": 1880098.0,
+      "step": 1360
+    },
+    {
+      "entropy": 1.0704776525497437,
+      "epoch": 1.3444553483807655,
+      "grad_norm": 3.454646587371826,
+      "learning_rate": 9.806954027294182e-05,
+      "loss": 0.1889,
+      "mean_token_accuracy": 0.926536750793457,
+      "num_tokens": 1893007.0,
+      "step": 1370
+    },
+    {
+      "entropy": 1.0929898142814636,
+      "epoch": 1.354268891069676,
+      "grad_norm": 3.7232348918914795,
+      "learning_rate": 9.802463200340654e-05,
+      "loss": 0.2084,
+      "mean_token_accuracy": 0.9215791761875153,
+      "num_tokens": 1906405.0,
+      "step": 1380
+    },
+    {
+      "entropy": 1.1055084466934204,
+      "epoch": 1.3640824337585868,
+      "grad_norm": 4.541325569152832,
+      "learning_rate": 9.797921789442283e-05,
+      "loss": 0.2533,
+      "mean_token_accuracy": 0.9111249685287476,
+      "num_tokens": 1920751.0,
+      "step": 1390
+    },
+    {
+      "entropy": 1.111575174331665,
+      "epoch": 1.3738959764474976,
+      "grad_norm": 3.11545991897583,
+      "learning_rate": 9.793329842433369e-05,
+      "loss": 0.2231,
+      "mean_token_accuracy": 0.9171372473239898,
+      "num_tokens": 1934411.0,
+      "step": 1400
+    },
+    {
+      "entropy": 1.118805205821991,
+      "epoch": 1.3837095191364082,
+      "grad_norm": 3.4786877632141113,
+      "learning_rate": 9.788687407680508e-05,
+      "loss": 0.2282,
+      "mean_token_accuracy": 0.9180926561355591,
+      "num_tokens": 1948013.0,
+      "step": 1410
+    },
+    {
+      "entropy": 1.1229176759719848,
+      "epoch": 1.393523061825319,
+      "grad_norm": 3.463297128677368,
+      "learning_rate": 9.783994534082077e-05,
+      "loss": 0.2073,
+      "mean_token_accuracy": 0.9175416469573975,
+      "num_tokens": 1962274.0,
+      "step": 1420
+    },
+    {
+      "entropy": 1.1284404516220092,
+      "epoch": 1.4033366045142297,
+      "grad_norm": 3.737522840499878,
+      "learning_rate": 9.779251271067725e-05,
+      "loss": 0.2093,
+      "mean_token_accuracy": 0.9178255915641784,
+      "num_tokens": 1976604.0,
+      "step": 1430
+    },
+    {
+      "entropy": 1.119704294204712,
+      "epoch": 1.4131501472031402,
+      "grad_norm": 3.056716203689575,
+      "learning_rate": 9.774457668597845e-05,
+      "loss": 0.2116,
+      "mean_token_accuracy": 0.9212493300437927,
+      "num_tokens": 1991136.0,
+      "step": 1440
+    },
+    {
+      "entropy": 1.118160879611969,
+      "epoch": 1.422963689892051,
+      "grad_norm": 4.535443305969238,
+      "learning_rate": 9.769613777163054e-05,
+      "loss": 0.2337,
+      "mean_token_accuracy": 0.9148732900619507,
+      "num_tokens": 2005289.0,
+      "step": 1450
+    },
+    {
+      "entropy": 1.1039478302001953,
+      "epoch": 1.4327772325809618,
+      "grad_norm": 2.3329625129699707,
+      "learning_rate": 9.764719647783656e-05,
+      "loss": 0.2051,
+      "mean_token_accuracy": 0.9203830122947693,
+      "num_tokens": 2018916.0,
+      "step": 1460
+    },
+    {
+      "entropy": 1.1027318596839906,
+      "epoch": 1.4425907752698723,
+      "grad_norm": 3.346900701522827,
+      "learning_rate": 9.759775332009106e-05,
+      "loss": 0.1994,
+      "mean_token_accuracy": 0.9229048132896424,
+      "num_tokens": 2032597.0,
+      "step": 1470
+    },
+    {
+      "entropy": 1.1042916417121886,
+      "epoch": 1.452404317958783,
+      "grad_norm": 2.8728506565093994,
+      "learning_rate": 9.754780881917474e-05,
+      "loss": 0.2077,
+      "mean_token_accuracy": 0.9186806797981262,
+      "num_tokens": 2046720.0,
+      "step": 1480
+    },
+    {
+      "entropy": 1.1157737255096436,
+      "epoch": 1.4622178606476939,
+      "grad_norm": 3.852989435195923,
+      "learning_rate": 9.749736350114885e-05,
+      "loss": 0.2079,
+      "mean_token_accuracy": 0.9245038151741027,
+      "num_tokens": 2060943.0,
+      "step": 1490
+    },
+    {
+      "entropy": 1.1101845383644104,
+      "epoch": 1.4720314033366044,
+      "grad_norm": 3.331449508666992,
+      "learning_rate": 9.744641789734972e-05,
+      "loss": 0.206,
+      "mean_token_accuracy": 0.920949536561966,
+      "num_tokens": 2074710.0,
+      "step": 1500
+    },
+    {
+      "entropy": 1.1219327092170714,
+      "epoch": 1.4818449460255152,
+      "grad_norm": 2.983902931213379,
+      "learning_rate": 9.739497254438314e-05,
+      "loss": 0.2222,
+      "mean_token_accuracy": 0.9134875714778901,
+      "num_tokens": 2088707.0,
+      "step": 1510
+    },
+    {
+      "entropy": 1.1166712522506714,
+      "epoch": 1.491658488714426,
+      "grad_norm": 3.1242899894714355,
+      "learning_rate": 9.734302798411876e-05,
+      "loss": 0.2034,
+      "mean_token_accuracy": 0.9169783771038056,
+      "num_tokens": 2101428.0,
+      "step": 1520
+    },
+    {
+      "entropy": 1.1222842693328858,
+      "epoch": 1.5014720314033365,
+      "grad_norm": 2.415327548980713,
+      "learning_rate": 9.729058476368429e-05,
+      "loss": 0.2205,
+      "mean_token_accuracy": 0.9173868536949158,
+      "num_tokens": 2115066.0,
+      "step": 1530
+    },
+    {
+      "entropy": 1.1332029700279236,
+      "epoch": 1.5112855740922473,
+      "grad_norm": 3.2529125213623047,
+      "learning_rate": 9.723764343545978e-05,
+      "loss": 0.2225,
+      "mean_token_accuracy": 0.9092679500579834,
+      "num_tokens": 2128953.0,
+      "step": 1540
+    },
+    {
+      "entropy": 1.1292387008666993,
+      "epoch": 1.521099116781158,
+      "grad_norm": 3.5483853816986084,
+      "learning_rate": 9.718420455707187e-05,
+      "loss": 0.2304,
+      "mean_token_accuracy": 0.9117673516273499,
+      "num_tokens": 2142756.0,
+      "step": 1550
+    },
+    {
+      "entropy": 1.1128414750099183,
+      "epoch": 1.5309126594700686,
+      "grad_norm": 3.6198623180389404,
+      "learning_rate": 9.713026869138777e-05,
+      "loss": 0.216,
+      "mean_token_accuracy": 0.9205533683300018,
+      "num_tokens": 2156108.0,
+      "step": 1560
+    },
+    {
+      "entropy": 1.1110066413879394,
+      "epoch": 1.5407262021589794,
+      "grad_norm": 3.5513079166412354,
+      "learning_rate": 9.70758364065095e-05,
+      "loss": 0.223,
+      "mean_token_accuracy": 0.9193585395812989,
+      "num_tokens": 2169356.0,
+      "step": 1570
+    },
+    {
+      "entropy": 1.120824432373047,
+      "epoch": 1.5505397448478901,
+      "grad_norm": 3.5703399181365967,
+      "learning_rate": 9.702090827576776e-05,
+      "loss": 0.2194,
+      "mean_token_accuracy": 0.9192981481552124,
+      "num_tokens": 2183713.0,
+      "step": 1580
+    },
+    {
+      "entropy": 1.1208290219306947,
+      "epoch": 1.5603532875368007,
+      "grad_norm": 3.4128241539001465,
+      "learning_rate": 9.6965484877716e-05,
+      "loss": 0.214,
+      "mean_token_accuracy": 0.9178707063198089,
+      "num_tokens": 2198089.0,
+      "step": 1590
+    },
+    {
+      "entropy": 1.1338149905204773,
+      "epoch": 1.5701668302257115,
+      "grad_norm": 2.6194801330566406,
+      "learning_rate": 9.690956679612421e-05,
+      "loss": 0.2167,
+      "mean_token_accuracy": 0.9148365676403045,
+      "num_tokens": 2213070.0,
+      "step": 1600
+    },
+    {
+      "entropy": 1.1091657519340514,
+      "epoch": 1.5799803729146222,
+      "grad_norm": 2.8092269897460938,
+      "learning_rate": 9.685315461997293e-05,
+      "loss": 0.2007,
+      "mean_token_accuracy": 0.9189371585845947,
+      "num_tokens": 2227108.0,
+      "step": 1610
+    },
+    {
+      "entropy": 1.105021595954895,
+      "epoch": 1.5897939156035328,
+      "grad_norm": 3.600299119949341,
+      "learning_rate": 9.679624894344688e-05,
+      "loss": 0.2019,
+      "mean_token_accuracy": 0.9251212537288666,
+      "num_tokens": 2240704.0,
+      "step": 1620
+    },
+    {
+      "entropy": 1.1206649661064148,
+      "epoch": 1.5996074582924436,
+      "grad_norm": 3.357276201248169,
+      "learning_rate": 9.673885036592884e-05,
+      "loss": 0.2201,
+      "mean_token_accuracy": 0.9211730480194091,
+      "num_tokens": 2254968.0,
+      "step": 1630
+    },
+    {
+      "entropy": 1.1343384742736817,
+      "epoch": 1.6094210009813543,
+      "grad_norm": 3.7672576904296875,
+      "learning_rate": 9.668095949199324e-05,
+      "loss": 0.2089,
+      "mean_token_accuracy": 0.9167727410793305,
+      "num_tokens": 2269175.0,
+      "step": 1640
+    },
+    {
+      "entropy": 1.1191704750061036,
+      "epoch": 1.6192345436702649,
+      "grad_norm": 3.786659002304077,
+      "learning_rate": 9.662257693139981e-05,
+      "loss": 0.2218,
+      "mean_token_accuracy": 0.911541610956192,
+      "num_tokens": 2283760.0,
+      "step": 1650
+    },
+    {
+      "entropy": 1.1226627230644226,
+      "epoch": 1.6290480863591756,
+      "grad_norm": 2.6021649837493896,
+      "learning_rate": 9.656370329908721e-05,
+      "loss": 0.1986,
+      "mean_token_accuracy": 0.9222010493278503,
+      "num_tokens": 2297542.0,
+      "step": 1660
+    },
+    {
+      "entropy": 1.1173729181289673,
+      "epoch": 1.6388616290480864,
+      "grad_norm": 3.7053918838500977,
+      "learning_rate": 9.650433921516652e-05,
+      "loss": 0.2197,
+      "mean_token_accuracy": 0.9145894408226013,
+      "num_tokens": 2311381.0,
+      "step": 1670
+    },
+    {
+      "entropy": 1.0843405246734619,
+      "epoch": 1.648675171736997,
+      "grad_norm": 2.9174821376800537,
+      "learning_rate": 9.644448530491468e-05,
+      "loss": 0.2349,
+      "mean_token_accuracy": 0.9096992194652558,
+      "num_tokens": 2324526.0,
+      "step": 1680
+    },
+    {
+      "entropy": 1.0902127861976623,
+      "epoch": 1.6584887144259077,
+      "grad_norm": 2.771421194076538,
+      "learning_rate": 9.638414219876793e-05,
+      "loss": 0.2065,
+      "mean_token_accuracy": 0.9221897780895233,
+      "num_tokens": 2337382.0,
+      "step": 1690
+    },
+    {
+      "entropy": 1.087600040435791,
+      "epoch": 1.6683022571148185,
+      "grad_norm": 3.538593292236328,
+      "learning_rate": 9.632331053231519e-05,
+      "loss": 0.225,
+      "mean_token_accuracy": 0.9170648396015167,
+      "num_tokens": 2351843.0,
+      "step": 1700
+    },
+    {
+      "entropy": 1.0950422167778016,
+      "epoch": 1.678115799803729,
+      "grad_norm": 3.984433650970459,
+      "learning_rate": 9.626199094629131e-05,
+      "loss": 0.1952,
+      "mean_token_accuracy": 0.9273364424705506,
+      "num_tokens": 2365499.0,
+      "step": 1710
+    },
+    {
+      "entropy": 1.1038031935691834,
+      "epoch": 1.6879293424926398,
+      "grad_norm": 2.8499574661254883,
+      "learning_rate": 9.620018408657043e-05,
+      "loss": 0.2099,
+      "mean_token_accuracy": 0.916943472623825,
+      "num_tokens": 2379228.0,
+      "step": 1720
+    },
+    {
+      "entropy": 1.0952557563781737,
+      "epoch": 1.6977428851815506,
+      "grad_norm": 4.106947422027588,
+      "learning_rate": 9.613789060415904e-05,
+      "loss": 0.2355,
+      "mean_token_accuracy": 0.9108680546283722,
+      "num_tokens": 2392855.0,
+      "step": 1730
+    },
+    {
+      "entropy": 1.100940227508545,
+      "epoch": 1.7075564278704611,
+      "grad_norm": 3.1745007038116455,
+      "learning_rate": 9.60751111551892e-05,
+      "loss": 0.191,
+      "mean_token_accuracy": 0.9246671259403229,
+      "num_tokens": 2406729.0,
+      "step": 1740
+    },
+    {
+      "entropy": 1.0995466351509093,
+      "epoch": 1.717369970559372,
+      "grad_norm": 3.483813524246216,
+      "learning_rate": 9.601184640091157e-05,
+      "loss": 0.2024,
+      "mean_token_accuracy": 0.9217610597610474,
+      "num_tokens": 2420319.0,
+      "step": 1750
+    },
+    {
+      "entropy": 1.1046346068382262,
+      "epoch": 1.7271835132482827,
+      "grad_norm": 3.463531970977783,
+      "learning_rate": 9.594809700768861e-05,
+      "loss": 0.2081,
+      "mean_token_accuracy": 0.9199326515197754,
+      "num_tokens": 2434508.0,
+      "step": 1760
+    },
+    {
+      "entropy": 1.1144657969474792,
+      "epoch": 1.7369970559371932,
+      "grad_norm": 4.0844926834106445,
+      "learning_rate": 9.588386364698732e-05,
+      "loss": 0.2137,
+      "mean_token_accuracy": 0.9138104736804962,
+      "num_tokens": 2447872.0,
+      "step": 1770
+    },
+    {
+      "entropy": 1.100430142879486,
+      "epoch": 1.746810598626104,
+      "grad_norm": 3.3607583045959473,
+      "learning_rate": 9.581914699537236e-05,
+      "loss": 0.2115,
+      "mean_token_accuracy": 0.9221645057201385,
+      "num_tokens": 2461151.0,
+      "step": 1780
+    },
+    {
+      "entropy": 1.107939398288727,
+      "epoch": 1.7566241413150148,
+      "grad_norm": 2.5180768966674805,
+      "learning_rate": 9.57539477344988e-05,
+      "loss": 0.2314,
+      "mean_token_accuracy": 0.912526398897171,
+      "num_tokens": 2474907.0,
+      "step": 1790
+    },
+    {
+      "entropy": 1.1100846648216247,
+      "epoch": 1.7664376840039253,
+      "grad_norm": 3.041205883026123,
+      "learning_rate": 9.568826655110506e-05,
+      "loss": 0.2404,
+      "mean_token_accuracy": 0.9084027290344239,
+      "num_tokens": 2488408.0,
+      "step": 1800
+    },
+    {
+      "entropy": 1.1225266814231873,
+      "epoch": 1.776251226692836,
+      "grad_norm": 2.935957193374634,
+      "learning_rate": 9.562210413700556e-05,
+      "loss": 0.2033,
+      "mean_token_accuracy": 0.9195843636989594,
+      "num_tokens": 2503031.0,
+      "step": 1810
+    },
+    {
+      "entropy": 1.0919470191001892,
+      "epoch": 1.7860647693817469,
+      "grad_norm": 3.78452730178833,
+      "learning_rate": 9.555546118908352e-05,
+      "loss": 0.2226,
+      "mean_token_accuracy": 0.9140250861644745,
+      "num_tokens": 2516322.0,
+      "step": 1820
+    },
+    {
+      "entropy": 1.1023133754730225,
+      "epoch": 1.7958783120706574,
+      "grad_norm": 3.2781333923339844,
+      "learning_rate": 9.548833840928351e-05,
+      "loss": 0.2149,
+      "mean_token_accuracy": 0.9184568583965301,
+      "num_tokens": 2529795.0,
+      "step": 1830
+    },
+    {
+      "entropy": 1.1099633693695068,
+      "epoch": 1.8056918547595682,
+      "grad_norm": 2.7979705333709717,
+      "learning_rate": 9.54207365046042e-05,
+      "loss": 0.2206,
+      "mean_token_accuracy": 0.9137540102005005,
+      "num_tokens": 2542968.0,
+      "step": 1840
+    },
+    {
+      "entropy": 1.0898523092269898,
+      "epoch": 1.815505397448479,
+      "grad_norm": 2.7799179553985596,
+      "learning_rate": 9.535265618709083e-05,
+      "loss": 0.1835,
+      "mean_token_accuracy": 0.9309212148189545,
+      "num_tokens": 2556526.0,
+      "step": 1850
+    },
+    {
+      "entropy": 1.0890620946884155,
+      "epoch": 1.8253189401373895,
+      "grad_norm": 3.6468944549560547,
+      "learning_rate": 9.52840981738277e-05,
+      "loss": 0.2232,
+      "mean_token_accuracy": 0.91695157289505,
+      "num_tokens": 2569913.0,
+      "step": 1860
+    },
+    {
+      "entropy": 1.1135707020759582,
+      "epoch": 1.8351324828263003,
+      "grad_norm": 2.7046544551849365,
+      "learning_rate": 9.521506318693068e-05,
+      "loss": 0.2231,
+      "mean_token_accuracy": 0.9195302724838257,
+      "num_tokens": 2584227.0,
+      "step": 1870
+    },
+    {
+      "entropy": 1.1209636449813842,
+      "epoch": 1.844946025515211,
+      "grad_norm": 2.8242557048797607,
+      "learning_rate": 9.514555195353951e-05,
+      "loss": 0.2209,
+      "mean_token_accuracy": 0.9123826503753663,
+      "num_tokens": 2598410.0,
+      "step": 1880
+    },
+    {
+      "entropy": 1.114969253540039,
+      "epoch": 1.8547595682041216,
+      "grad_norm": 2.861948013305664,
+      "learning_rate": 9.507556520581022e-05,
+      "loss": 0.1797,
+      "mean_token_accuracy": 0.9330613136291503,
+      "num_tokens": 2613088.0,
+      "step": 1890
+    },
+    {
+      "entropy": 1.1052782654762268,
+      "epoch": 1.8645731108930323,
+      "grad_norm": 2.4157450199127197,
+      "learning_rate": 9.500510368090741e-05,
+      "loss": 0.1841,
+      "mean_token_accuracy": 0.9287303447723388,
+      "num_tokens": 2627112.0,
+      "step": 1900
+    },
+    {
+      "entropy": 1.098567581176758,
+      "epoch": 1.8743866535819431,
+      "grad_norm": 3.3976595401763916,
+      "learning_rate": 9.493416812099645e-05,
+      "loss": 0.2247,
+      "mean_token_accuracy": 0.9159882724285126,
+      "num_tokens": 2640791.0,
+      "step": 1910
+    },
+    {
+      "entropy": 1.1170581817626952,
+      "epoch": 1.8842001962708537,
+      "grad_norm": 3.141498565673828,
+      "learning_rate": 9.486275927323566e-05,
+      "loss": 0.2199,
+      "mean_token_accuracy": 0.9170309126377105,
+      "num_tokens": 2654993.0,
+      "step": 1920
+    },
+    {
+      "entropy": 1.0925970435142518,
+      "epoch": 1.8940137389597644,
+      "grad_norm": 4.0427327156066895,
+      "learning_rate": 9.479087788976851e-05,
+      "loss": 0.1939,
+      "mean_token_accuracy": 0.9262156009674072,
+      "num_tokens": 2669247.0,
+      "step": 1930
+    },
+    {
+      "entropy": 1.0885946393013,
+      "epoch": 1.9038272816486752,
+      "grad_norm": 3.7226552963256836,
+      "learning_rate": 9.471852472771562e-05,
+      "loss": 0.1836,
+      "mean_token_accuracy": 0.927420461177826,
+      "num_tokens": 2682844.0,
+      "step": 1940
+    },
+    {
+      "entropy": 1.10577894449234,
+      "epoch": 1.9136408243375858,
+      "grad_norm": 3.322232246398926,
+      "learning_rate": 9.464570054916681e-05,
+      "loss": 0.2279,
+      "mean_token_accuracy": 0.9126473367214203,
+      "num_tokens": 2696806.0,
+      "step": 1950
+    },
+    {
+      "entropy": 1.0942806005477905,
+      "epoch": 1.9234543670264965,
+      "grad_norm": 3.0909481048583984,
+      "learning_rate": 9.457240612117311e-05,
+      "loss": 0.2095,
+      "mean_token_accuracy": 0.9270996153354645,
+      "num_tokens": 2710979.0,
+      "step": 1960
+    },
+    {
+      "entropy": 1.1020437240600587,
+      "epoch": 1.9332679097154073,
+      "grad_norm": 3.2755632400512695,
+      "learning_rate": 9.449864221573859e-05,
+      "loss": 0.2239,
+      "mean_token_accuracy": 0.917670601606369,
+      "num_tokens": 2724880.0,
+      "step": 1970
+    },
+    {
+      "entropy": 1.1107500076293946,
+      "epoch": 1.9430814524043178,
+      "grad_norm": 3.545135021209717,
+      "learning_rate": 9.442440960981234e-05,
+      "loss": 0.2147,
+      "mean_token_accuracy": 0.9109222888946533,
+      "num_tokens": 2738562.0,
+      "step": 1980
+    },
+    {
+      "entropy": 1.0981928944587707,
+      "epoch": 1.9528949950932286,
+      "grad_norm": 2.158135175704956,
+      "learning_rate": 9.434970908528021e-05,
+      "loss": 0.1933,
+      "mean_token_accuracy": 0.9260489761829376,
+      "num_tokens": 2752355.0,
+      "step": 1990
+    },
+    {
+      "entropy": 1.108481192588806,
+      "epoch": 1.9627085377821394,
+      "grad_norm": 2.3290560245513916,
+      "learning_rate": 9.427454142895663e-05,
+      "loss": 0.2197,
+      "mean_token_accuracy": 0.9166393756866456,
+      "num_tokens": 2766143.0,
+      "step": 2000
+    },
+    {
+      "entropy": 1.0959125280380249,
+      "epoch": 1.97252208047105,
+      "grad_norm": 2.3369574546813965,
+      "learning_rate": 9.41989074325762e-05,
+      "loss": 0.2103,
+      "mean_token_accuracy": 0.9162816107273102,
+      "num_tokens": 2779801.0,
+      "step": 2010
+    },
+    {
+      "entropy": 1.0738348126411439,
+      "epoch": 1.9823356231599607,
+      "grad_norm": 3.7078335285186768,
+      "learning_rate": 9.412280789278557e-05,
+      "loss": 0.177,
+      "mean_token_accuracy": 0.9321493089199067,
+      "num_tokens": 2793763.0,
+      "step": 2020
+    },
+    {
+      "entropy": 1.075346255302429,
+      "epoch": 1.9921491658488715,
+      "grad_norm": 3.2744557857513428,
+      "learning_rate": 9.404624361113482e-05,
+      "loss": 0.1922,
+      "mean_token_accuracy": 0.9244012117385865,
+      "num_tokens": 2807624.0,
+      "step": 2030
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 10190,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.1987310863414477e+17,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56509bf0973b5a62fa9102ae0f90af2527f08ae5fd5d1d953f30cc3fcb84c764
+size 6417

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-2038/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+[More Information Needed]
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
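+LoRA adapter (r = 8, lora_alpha = 32, lora_dropout = 0.0, no bias) applied to the `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, and `down_proj` modules of Qwen/Qwen2.5-7B-Instruct, with task type `CAUSAL_LM`. The card tags indicate supervised fine-tuning (SFT) with TRL.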
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "up_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b557e73b7a9679e67d8521cf5f3bf58d53fcb2a17401f36d2261bc94b0ecfd6
+size 80792096

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/cat_qwen25_7b_r8_a32_adamw_e10_lr1e-4_s1_keep10/checkpoint-3057/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896