EvanSirius committed on Mar 4

Commit

fb5fb92

verified ·

1 Parent(s): 96fe7b0

Add InternVL3-1B LoRA checkpoints (5500/6000/6500/7000) for LEAP supplementary eval

Browse files

Files changed (50) hide show

.gitattributes +4 -0
internvl3_1b_lora_7000_20260304_104032/README.md +9 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/README.md +207 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/adapter_config.json +39 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/adapter_model.safetensors +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/added_tokens.json +34 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/chat_template.jinja +6 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/merges.txt +0 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/special_tokens_map.json +44 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/tokenizer.json +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/tokenizer_config.json +306 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/trainer_state.json +1966 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/training_args.bin +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/vocab.json +0 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/README.md +207 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/adapter_config.json +39 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/adapter_model.safetensors +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/added_tokens.json +34 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/chat_template.jinja +6 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/merges.txt +0 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/special_tokens_map.json +44 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/tokenizer.json +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/tokenizer_config.json +306 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/trainer_state.json +2141 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/training_args.bin +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/vocab.json +0 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/README.md +207 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/adapter_config.json +39 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/adapter_model.safetensors +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/added_tokens.json +34 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/chat_template.jinja +6 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/merges.txt +0 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/special_tokens_map.json +44 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/tokenizer.json +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/tokenizer_config.json +306 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/trainer_state.json +2316 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/training_args.bin +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/vocab.json +0 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/README.md +207 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/adapter_config.json +39 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/adapter_model.safetensors +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/added_tokens.json +34 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/chat_template.jinja +6 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/merges.txt +0 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/special_tokens_map.json +44 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/tokenizer.json +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/tokenizer_config.json +306 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/trainer_state.json +2491 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/training_args.bin +3 -0
internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -68,3 +68,7 @@ checkpoint-6600/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-6800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoint-6800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text

internvl3_1b_lora_7000_20260304_104032/README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+# InternVL3-1B LoRA checkpoints for LEAP (VLABench)
+Run: `internvl3_1b_lora_7000_20260304_104032`
+Included checkpoints: `5500`, `6000`, `6500`, `7000`.
+Base model: `OpenGVLab/InternVL3-1B-hf`
+These are LoRA adapter checkpoints used for VLABench supplementary experiments.

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: OpenGVLab/InternVL3-1B-hf
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:OpenGVLab/InternVL3-1B-hf
+- lora
+- transformers
+---
+# Model Card for InternVL3-1B LoRA adapter (checkpoint-5500)
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT; r=16, alpha=32) on the `q_proj`, `k_proj`, `v_proj`, and `o_proj` modules
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** `OpenGVLab/InternVL3-1B-hf`
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7056e78e3e12966ca518a2f1fbf2aa6eac74ed640ab2879144abc5f276426479
+size 18138288

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/added_tokens.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "</box>": 151673,
+  "</img>": 151666,
+  "</quad>": 151669,
+  "</ref>": 151671,
+  "</tool_call>": 151658,
+  "<IMG_CONTEXT>": 151667,
+  "<box>": 151672,
+  "<img>": 151665,
+  "<quad>": 151668,
+  "<ref>": 151670,
+  "<tool_call>": 151657,
+  "<video>": 151674,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,6 @@

+{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+'}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<IMG_CONTEXT>
+' }}{% elif content['type'] == 'video' %}{{ '<video>
+' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
+'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
+' }}{% endif %}

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "start_image_token": "<img>",
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cc80b7e20adf8bf6f6ca442bf1abfac8056bb3b7d3e0b11c9d497d3e79398c9
+size 11423732

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,306 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "context_image_token": "<IMG_CONTEXT>",
+    "end_image_token": "</img>",
+    "start_image_token": "<img>",
+    "video_token": "<video>"
+  },
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "return_token_type_ids": false,
+  "split_special_tokens": false,
+  "start_image_token": "<img>",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1966 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.5821183817108988,
+  "eval_steps": 500,
+  "global_step": 5500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00010583970576561797,
+      "grad_norm": 2.33493709564209,
+      "learning_rate": 0.0,
+      "loss": 1.2764,
+      "step": 1
+    },
+    {
+      "epoch": 0.0021167941153123595,
+      "grad_norm": 2.771491765975952,
+      "learning_rate": 9.5e-06,
+      "loss": 1.6307,
+      "step": 20
+    },
+    {
+      "epoch": 0.004233588230624719,
+      "grad_norm": 1.6168005466461182,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 1.5457,
+      "step": 40
+    },
+    {
+      "epoch": 0.006350382345937078,
+      "grad_norm": 1.033608317375183,
+      "learning_rate": 2.95e-05,
+      "loss": 1.333,
+      "step": 60
+    },
+    {
+      "epoch": 0.008467176461249438,
+      "grad_norm": 0.9740731716156006,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.1355,
+      "step": 80
+    },
+    {
+      "epoch": 0.010583970576561796,
+      "grad_norm": 1.2390116453170776,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 1.0417,
+      "step": 100
+    },
+    {
+      "epoch": 0.012700764691874157,
+      "grad_norm": 1.3689967393875122,
+      "learning_rate": 5.95e-05,
+      "loss": 0.9421,
+      "step": 120
+    },
+    {
+      "epoch": 0.014817558807186515,
+      "grad_norm": 2.2537403106689453,
+      "learning_rate": 6.95e-05,
+      "loss": 0.9362,
+      "step": 140
+    },
+    {
+      "epoch": 0.016934352922498876,
+      "grad_norm": 1.8407371044158936,
+      "learning_rate": 7.950000000000001e-05,
+      "loss": 0.8148,
+      "step": 160
+    },
+    {
+      "epoch": 0.019051147037811234,
+      "grad_norm": 1.8288472890853882,
+      "learning_rate": 8.950000000000001e-05,
+      "loss": 0.7605,
+      "step": 180
+    },
+    {
+      "epoch": 0.021167941153123593,
+      "grad_norm": 2.447781562805176,
+      "learning_rate": 9.95e-05,
+      "loss": 0.6912,
+      "step": 200
+    },
+    {
+      "epoch": 0.023284735268435955,
+      "grad_norm": 2.366830825805664,
+      "learning_rate": 9.972058823529412e-05,
+      "loss": 0.6242,
+      "step": 220
+    },
+    {
+      "epoch": 0.025401529383748313,
+      "grad_norm": 2.799335479736328,
+      "learning_rate": 9.94264705882353e-05,
+      "loss": 0.5381,
+      "step": 240
+    },
+    {
+      "epoch": 0.027518323499060672,
+      "grad_norm": 2.6497650146484375,
+      "learning_rate": 9.913235294117647e-05,
+      "loss": 0.5163,
+      "step": 260
+    },
+    {
+      "epoch": 0.02963511761437303,
+      "grad_norm": 3.2764251232147217,
+      "learning_rate": 9.883823529411765e-05,
+      "loss": 0.474,
+      "step": 280
+    },
+    {
+      "epoch": 0.03175191172968539,
+      "grad_norm": 3.223743200302124,
+      "learning_rate": 9.854411764705883e-05,
+      "loss": 0.461,
+      "step": 300
+    },
+    {
+      "epoch": 0.03386870584499775,
+      "grad_norm": 3.198038339614868,
+      "learning_rate": 9.825e-05,
+      "loss": 0.4216,
+      "step": 320
+    },
+    {
+      "epoch": 0.03598549996031011,
+      "grad_norm": 3.033092737197876,
+      "learning_rate": 9.795588235294119e-05,
+      "loss": 0.3453,
+      "step": 340
+    },
+    {
+      "epoch": 0.03810229407562247,
+      "grad_norm": 2.908698797225952,
+      "learning_rate": 9.766176470588236e-05,
+      "loss": 0.3506,
+      "step": 360
+    },
+    {
+      "epoch": 0.04021908819093483,
+      "grad_norm": 3.772873878479004,
+      "learning_rate": 9.736764705882353e-05,
+      "loss": 0.3381,
+      "step": 380
+    },
+    {
+      "epoch": 0.042335882306247186,
+      "grad_norm": 2.692840337753296,
+      "learning_rate": 9.707352941176471e-05,
+      "loss": 0.2868,
+      "step": 400
+    },
+    {
+      "epoch": 0.04445267642155955,
+      "grad_norm": 3.629152297973633,
+      "learning_rate": 9.677941176470589e-05,
+      "loss": 0.2845,
+      "step": 420
+    },
+    {
+      "epoch": 0.04656947053687191,
+      "grad_norm": 4.045558929443359,
+      "learning_rate": 9.648529411764706e-05,
+      "loss": 0.275,
+      "step": 440
+    },
+    {
+      "epoch": 0.048686264652184265,
+      "grad_norm": 2.3519065380096436,
+      "learning_rate": 9.619117647058824e-05,
+      "loss": 0.2254,
+      "step": 460
+    },
+    {
+      "epoch": 0.05080305876749663,
+      "grad_norm": 2.71055269241333,
+      "learning_rate": 9.589705882352941e-05,
+      "loss": 0.2356,
+      "step": 480
+    },
+    {
+      "epoch": 0.05291985288280899,
+      "grad_norm": 3.037832021713257,
+      "learning_rate": 9.560294117647059e-05,
+      "loss": 0.2,
+      "step": 500
+    },
+    {
+      "epoch": 0.055036646998121344,
+      "grad_norm": 3.366894245147705,
+      "learning_rate": 9.530882352941177e-05,
+      "loss": 0.1888,
+      "step": 520
+    },
+    {
+      "epoch": 0.057153441113433706,
+      "grad_norm": 2.728973865509033,
+      "learning_rate": 9.501470588235294e-05,
+      "loss": 0.1844,
+      "step": 540
+    },
+    {
+      "epoch": 0.05927023522874606,
+      "grad_norm": 2.229743719100952,
+      "learning_rate": 9.472058823529412e-05,
+      "loss": 0.1658,
+      "step": 560
+    },
+    {
+      "epoch": 0.06138702934405842,
+      "grad_norm": 2.3469460010528564,
+      "learning_rate": 9.44264705882353e-05,
+      "loss": 0.1556,
+      "step": 580
+    },
+    {
+      "epoch": 0.06350382345937078,
+      "grad_norm": 2.338606595993042,
+      "learning_rate": 9.413235294117647e-05,
+      "loss": 0.1391,
+      "step": 600
+    },
+    {
+      "epoch": 0.06562061757468314,
+      "grad_norm": 1.9111056327819824,
+      "learning_rate": 9.383823529411765e-05,
+      "loss": 0.1606,
+      "step": 620
+    },
+    {
+      "epoch": 0.0677374116899955,
+      "grad_norm": 3.3568716049194336,
+      "learning_rate": 9.354411764705883e-05,
+      "loss": 0.1664,
+      "step": 640
+    },
+    {
+      "epoch": 0.06985420580530786,
+      "grad_norm": 3.88547945022583,
+      "learning_rate": 9.325e-05,
+      "loss": 0.1494,
+      "step": 660
+    },
+    {
+      "epoch": 0.07197099992062023,
+      "grad_norm": 2.3967244625091553,
+      "learning_rate": 9.295588235294118e-05,
+      "loss": 0.141,
+      "step": 680
+    },
+    {
+      "epoch": 0.07408779403593257,
+      "grad_norm": 3.0165176391601562,
+      "learning_rate": 9.266176470588236e-05,
+      "loss": 0.1409,
+      "step": 700
+    },
+    {
+      "epoch": 0.07620458815124494,
+      "grad_norm": 3.2665436267852783,
+      "learning_rate": 9.236764705882353e-05,
+      "loss": 0.1112,
+      "step": 720
+    },
+    {
+      "epoch": 0.0783213822665573,
+      "grad_norm": 2.3310046195983887,
+      "learning_rate": 9.207352941176471e-05,
+      "loss": 0.1396,
+      "step": 740
+    },
+    {
+      "epoch": 0.08043817638186966,
+      "grad_norm": 1.8768619298934937,
+      "learning_rate": 9.177941176470589e-05,
+      "loss": 0.1114,
+      "step": 760
+    },
+    {
+      "epoch": 0.08255497049718202,
+      "grad_norm": 3.4282712936401367,
+      "learning_rate": 9.148529411764706e-05,
+      "loss": 0.1073,
+      "step": 780
+    },
+    {
+      "epoch": 0.08467176461249437,
+      "grad_norm": 3.2704601287841797,
+      "learning_rate": 9.119117647058824e-05,
+      "loss": 0.1281,
+      "step": 800
+    },
+    {
+      "epoch": 0.08678855872780673,
+      "grad_norm": 2.225818157196045,
+      "learning_rate": 9.089705882352942e-05,
+      "loss": 0.1046,
+      "step": 820
+    },
+    {
+      "epoch": 0.0889053528431191,
+      "grad_norm": 2.078011989593506,
+      "learning_rate": 9.060294117647059e-05,
+      "loss": 0.1033,
+      "step": 840
+    },
+    {
+      "epoch": 0.09102214695843146,
+      "grad_norm": 1.3325825929641724,
+      "learning_rate": 9.030882352941177e-05,
+      "loss": 0.0872,
+      "step": 860
+    },
+    {
+      "epoch": 0.09313894107374382,
+      "grad_norm": 3.0471086502075195,
+      "learning_rate": 9.001470588235294e-05,
+      "loss": 0.0927,
+      "step": 880
+    },
+    {
+      "epoch": 0.09525573518905617,
+      "grad_norm": 2.9685380458831787,
+      "learning_rate": 8.972058823529412e-05,
+      "loss": 0.0917,
+      "step": 900
+    },
+    {
+      "epoch": 0.09737252930436853,
+      "grad_norm": 1.8589142560958862,
+      "learning_rate": 8.94264705882353e-05,
+      "loss": 0.0836,
+      "step": 920
+    },
+    {
+      "epoch": 0.09948932341968089,
+      "grad_norm": 1.523457407951355,
+      "learning_rate": 8.913235294117647e-05,
+      "loss": 0.0862,
+      "step": 940
+    },
+    {
+      "epoch": 0.10160611753499325,
+      "grad_norm": 1.4009277820587158,
+      "learning_rate": 8.883823529411765e-05,
+      "loss": 0.068,
+      "step": 960
+    },
+    {
+      "epoch": 0.10372291165030562,
+      "grad_norm": 2.0816826820373535,
+      "learning_rate": 8.854411764705883e-05,
+      "loss": 0.0741,
+      "step": 980
+    },
+    {
+      "epoch": 0.10583970576561798,
+      "grad_norm": 2.218278408050537,
+      "learning_rate": 8.825e-05,
+      "loss": 0.0822,
+      "step": 1000
+    },
+    {
+      "epoch": 0.10795649988093033,
+      "grad_norm": 1.188503623008728,
+      "learning_rate": 8.795588235294118e-05,
+      "loss": 0.0792,
+      "step": 1020
+    },
+    {
+      "epoch": 0.11007329399624269,
+      "grad_norm": 0.9847146272659302,
+      "learning_rate": 8.766176470588236e-05,
+      "loss": 0.0696,
+      "step": 1040
+    },
+    {
+      "epoch": 0.11219008811155505,
+      "grad_norm": 3.0967068672180176,
+      "learning_rate": 8.736764705882353e-05,
+      "loss": 0.0842,
+      "step": 1060
+    },
+    {
+      "epoch": 0.11430688222686741,
+      "grad_norm": 2.4966516494750977,
+      "learning_rate": 8.707352941176471e-05,
+      "loss": 0.065,
+      "step": 1080
+    },
+    {
+      "epoch": 0.11642367634217977,
+      "grad_norm": 1.7355480194091797,
+      "learning_rate": 8.677941176470589e-05,
+      "loss": 0.0672,
+      "step": 1100
+    },
+    {
+      "epoch": 0.11854047045749212,
+      "grad_norm": 2.5048105716705322,
+      "learning_rate": 8.648529411764706e-05,
+      "loss": 0.0616,
+      "step": 1120
+    },
+    {
+      "epoch": 0.12065726457280448,
+      "grad_norm": 1.285093903541565,
+      "learning_rate": 8.619117647058824e-05,
+      "loss": 0.0676,
+      "step": 1140
+    },
+    {
+      "epoch": 0.12277405868811685,
+      "grad_norm": 1.58004891872406,
+      "learning_rate": 8.589705882352942e-05,
+      "loss": 0.0827,
+      "step": 1160
+    },
+    {
+      "epoch": 0.12489085280342921,
+      "grad_norm": 1.571897029876709,
+      "learning_rate": 8.560294117647059e-05,
+      "loss": 0.0663,
+      "step": 1180
+    },
+    {
+      "epoch": 0.12700764691874156,
+      "grad_norm": 0.8998542428016663,
+      "learning_rate": 8.530882352941177e-05,
+      "loss": 0.0448,
+      "step": 1200
+    },
+    {
+      "epoch": 0.12912444103405393,
+      "grad_norm": 1.577183485031128,
+      "learning_rate": 8.501470588235295e-05,
+      "loss": 0.0574,
+      "step": 1220
+    },
+    {
+      "epoch": 0.13124123514936628,
+      "grad_norm": 1.7241120338439941,
+      "learning_rate": 8.472058823529412e-05,
+      "loss": 0.0586,
+      "step": 1240
+    },
+    {
+      "epoch": 0.13335802926467866,
+      "grad_norm": 1.4512884616851807,
+      "learning_rate": 8.44264705882353e-05,
+      "loss": 0.0457,
+      "step": 1260
+    },
+    {
+      "epoch": 0.135474823379991,
+      "grad_norm": 1.4320181608200073,
+      "learning_rate": 8.413235294117647e-05,
+      "loss": 0.0572,
+      "step": 1280
+    },
+    {
+      "epoch": 0.13759161749530335,
+      "grad_norm": 2.4721877574920654,
+      "learning_rate": 8.383823529411765e-05,
+      "loss": 0.0539,
+      "step": 1300
+    },
+    {
+      "epoch": 0.13970841161061573,
+      "grad_norm": 1.230265498161316,
+      "learning_rate": 8.354411764705883e-05,
+      "loss": 0.0524,
+      "step": 1320
+    },
+    {
+      "epoch": 0.14182520572592808,
+      "grad_norm": 1.5039700269699097,
+      "learning_rate": 8.325e-05,
+      "loss": 0.0569,
+      "step": 1340
+    },
+    {
+      "epoch": 0.14394199984124045,
+      "grad_norm": 1.9928780794143677,
+      "learning_rate": 8.295588235294118e-05,
+      "loss": 0.0402,
+      "step": 1360
+    },
+    {
+      "epoch": 0.1460587939565528,
+      "grad_norm": 1.8550405502319336,
+      "learning_rate": 8.266176470588236e-05,
+      "loss": 0.054,
+      "step": 1380
+    },
+    {
+      "epoch": 0.14817558807186515,
+      "grad_norm": 1.0700241327285767,
+      "learning_rate": 8.236764705882353e-05,
+      "loss": 0.052,
+      "step": 1400
+    },
+    {
+      "epoch": 0.15029238218717753,
+      "grad_norm": 1.7121262550354004,
+      "learning_rate": 8.207352941176471e-05,
+      "loss": 0.0437,
+      "step": 1420
+    },
+    {
+      "epoch": 0.15240917630248987,
+      "grad_norm": 1.3593100309371948,
+      "learning_rate": 8.177941176470589e-05,
+      "loss": 0.0393,
+      "step": 1440
+    },
+    {
+      "epoch": 0.15452597041780225,
+      "grad_norm": 1.080735683441162,
+      "learning_rate": 8.148529411764706e-05,
+      "loss": 0.035,
+      "step": 1460
+    },
+    {
+      "epoch": 0.1566427645331146,
+      "grad_norm": 1.5516977310180664,
+      "learning_rate": 8.119117647058824e-05,
+      "loss": 0.0421,
+      "step": 1480
+    },
+    {
+      "epoch": 0.15875955864842695,
+      "grad_norm": 1.107473373413086,
+      "learning_rate": 8.089705882352942e-05,
+      "loss": 0.0442,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16087635276373932,
+      "grad_norm": 2.196147918701172,
+      "learning_rate": 8.060294117647059e-05,
+      "loss": 0.0443,
+      "step": 1520
+    },
+    {
+      "epoch": 0.16299314687905167,
+      "grad_norm": 1.4532606601715088,
+      "learning_rate": 8.030882352941177e-05,
+      "loss": 0.0417,
+      "step": 1540
+    },
+    {
+      "epoch": 0.16510994099436405,
+      "grad_norm": 3.0167882442474365,
+      "learning_rate": 8.001470588235295e-05,
+      "loss": 0.0472,
+      "step": 1560
+    },
+    {
+      "epoch": 0.1672267351096764,
+      "grad_norm": 1.764201283454895,
+      "learning_rate": 7.972058823529412e-05,
+      "loss": 0.031,
+      "step": 1580
+    },
+    {
+      "epoch": 0.16934352922498874,
+      "grad_norm": 0.8682387471199036,
+      "learning_rate": 7.94264705882353e-05,
+      "loss": 0.0291,
+      "step": 1600
+    },
+    {
+      "epoch": 0.17146032334030112,
+      "grad_norm": 0.660894513130188,
+      "learning_rate": 7.913235294117648e-05,
+      "loss": 0.0572,
+      "step": 1620
+    },
+    {
+      "epoch": 0.17357711745561347,
+      "grad_norm": 1.7611377239227295,
+      "learning_rate": 7.883823529411765e-05,
+      "loss": 0.0453,
+      "step": 1640
+    },
+    {
+      "epoch": 0.17569391157092584,
+      "grad_norm": 0.6341773867607117,
+      "learning_rate": 7.854411764705883e-05,
+      "loss": 0.0299,
+      "step": 1660
+    },
+    {
+      "epoch": 0.1778107056862382,
+      "grad_norm": 1.4031453132629395,
+      "learning_rate": 7.825e-05,
+      "loss": 0.0358,
+      "step": 1680
+    },
+    {
+      "epoch": 0.17992749980155054,
+      "grad_norm": 1.0830997228622437,
+      "learning_rate": 7.795588235294118e-05,
+      "loss": 0.0373,
+      "step": 1700
+    },
+    {
+      "epoch": 0.18204429391686291,
+      "grad_norm": 0.6576260924339294,
+      "learning_rate": 7.766176470588236e-05,
+      "loss": 0.0587,
+      "step": 1720
+    },
+    {
+      "epoch": 0.18416108803217526,
+      "grad_norm": 1.2640115022659302,
+      "learning_rate": 7.736764705882353e-05,
+      "loss": 0.0468,
+      "step": 1740
+    },
+    {
+      "epoch": 0.18627788214748764,
+      "grad_norm": 1.0660518407821655,
+      "learning_rate": 7.707352941176471e-05,
+      "loss": 0.0466,
+      "step": 1760
+    },
+    {
+      "epoch": 0.1883946762628,
+      "grad_norm": 1.22067129611969,
+      "learning_rate": 7.677941176470589e-05,
+      "loss": 0.0335,
+      "step": 1780
+    },
+    {
+      "epoch": 0.19051147037811234,
+      "grad_norm": 4.800387859344482,
+      "learning_rate": 7.648529411764706e-05,
+      "loss": 0.0461,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1926282644934247,
+      "grad_norm": 1.1434308290481567,
+      "learning_rate": 7.619117647058824e-05,
+      "loss": 0.0326,
+      "step": 1820
+    },
+    {
+      "epoch": 0.19474505860873706,
+      "grad_norm": 0.8925223350524902,
+      "learning_rate": 7.589705882352942e-05,
+      "loss": 0.0273,
+      "step": 1840
+    },
+    {
+      "epoch": 0.19686185272404944,
+      "grad_norm": 1.1678693294525146,
+      "learning_rate": 7.560294117647059e-05,
+      "loss": 0.0345,
+      "step": 1860
+    },
+    {
+      "epoch": 0.19897864683936178,
+      "grad_norm": 0.559644341468811,
+      "learning_rate": 7.530882352941177e-05,
+      "loss": 0.0394,
+      "step": 1880
+    },
+    {
+      "epoch": 0.20109544095467416,
+      "grad_norm": 1.4313390254974365,
+      "learning_rate": 7.501470588235295e-05,
+      "loss": 0.0475,
+      "step": 1900
+    },
+    {
+      "epoch": 0.2032122350699865,
+      "grad_norm": 1.2470778226852417,
+      "learning_rate": 7.472058823529412e-05,
+      "loss": 0.0317,
+      "step": 1920
+    },
+    {
+      "epoch": 0.20532902918529886,
+      "grad_norm": 1.390359878540039,
+      "learning_rate": 7.44264705882353e-05,
+      "loss": 0.0268,
+      "step": 1940
+    },
+    {
+      "epoch": 0.20744582330061123,
+      "grad_norm": 0.6755140423774719,
+      "learning_rate": 7.413235294117648e-05,
+      "loss": 0.0331,
+      "step": 1960
+    },
+    {
+      "epoch": 0.20956261741592358,
+      "grad_norm": 0.31457772850990295,
+      "learning_rate": 7.383823529411765e-05,
+      "loss": 0.0447,
+      "step": 1980
+    },
+    {
+      "epoch": 0.21167941153123596,
+      "grad_norm": 1.6619377136230469,
+      "learning_rate": 7.354411764705883e-05,
+      "loss": 0.0336,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2137962056465483,
+      "grad_norm": 1.033492088317871,
+      "learning_rate": 7.325e-05,
+      "loss": 0.0304,
+      "step": 2020
+    },
+    {
+      "epoch": 0.21591299976186065,
+      "grad_norm": 0.730675220489502,
+      "learning_rate": 7.295588235294118e-05,
+      "loss": 0.0311,
+      "step": 2040
+    },
+    {
+      "epoch": 0.21802979387717303,
+      "grad_norm": 0.6322308778762817,
+      "learning_rate": 7.266176470588236e-05,
+      "loss": 0.0258,
+      "step": 2060
+    },
+    {
+      "epoch": 0.22014658799248538,
+      "grad_norm": 0.7560809254646301,
+      "learning_rate": 7.236764705882353e-05,
+      "loss": 0.0213,
+      "step": 2080
+    },
+    {
+      "epoch": 0.22226338210779775,
+      "grad_norm": 1.1907991170883179,
+      "learning_rate": 7.207352941176471e-05,
+      "loss": 0.0311,
+      "step": 2100
+    },
+    {
+      "epoch": 0.2243801762231101,
+      "grad_norm": 0.6392427086830139,
+      "learning_rate": 7.177941176470589e-05,
+      "loss": 0.0302,
+      "step": 2120
+    },
+    {
+      "epoch": 0.22649697033842245,
+      "grad_norm": 1.0621793270111084,
+      "learning_rate": 7.148529411764706e-05,
+      "loss": 0.0257,
+      "step": 2140
+    },
+    {
+      "epoch": 0.22861376445373482,
+      "grad_norm": 0.8459914326667786,
+      "learning_rate": 7.119117647058824e-05,
+      "loss": 0.0249,
+      "step": 2160
+    },
+    {
+      "epoch": 0.23073055856904717,
+      "grad_norm": 1.5384963750839233,
+      "learning_rate": 7.089705882352942e-05,
+      "loss": 0.0221,
+      "step": 2180
+    },
+    {
+      "epoch": 0.23284735268435955,
+      "grad_norm": 0.920907199382782,
+      "learning_rate": 7.06029411764706e-05,
+      "loss": 0.0307,
+      "step": 2200
+    },
+    {
+      "epoch": 0.2349641467996719,
+      "grad_norm": 1.1640409231185913,
+      "learning_rate": 7.030882352941177e-05,
+      "loss": 0.0302,
+      "step": 2220
+    },
+    {
+      "epoch": 0.23708094091498425,
+      "grad_norm": 0.7336745858192444,
+      "learning_rate": 7.001470588235295e-05,
+      "loss": 0.0286,
+      "step": 2240
+    },
+    {
+      "epoch": 0.23919773503029662,
+      "grad_norm": 1.9110276699066162,
+      "learning_rate": 6.972058823529412e-05,
+      "loss": 0.0303,
+      "step": 2260
+    },
+    {
+      "epoch": 0.24131452914560897,
+      "grad_norm": 0.9055470824241638,
+      "learning_rate": 6.94264705882353e-05,
+      "loss": 0.0241,
+      "step": 2280
+    },
+    {
+      "epoch": 0.24343132326092135,
+      "grad_norm": 1.063379168510437,
+      "learning_rate": 6.913235294117648e-05,
+      "loss": 0.0244,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2455481173762337,
+      "grad_norm": 1.0067662000656128,
+      "learning_rate": 6.883823529411765e-05,
+      "loss": 0.026,
+      "step": 2320
+    },
+    {
+      "epoch": 0.24766491149154604,
+      "grad_norm": 1.1639182567596436,
+      "learning_rate": 6.854411764705883e-05,
+      "loss": 0.0253,
+      "step": 2340
+    },
+    {
+      "epoch": 0.24978170560685842,
+      "grad_norm": 0.9918274879455566,
+      "learning_rate": 6.825e-05,
+      "loss": 0.0218,
+      "step": 2360
+    },
+    {
+      "epoch": 0.25189849972217077,
+      "grad_norm": 0.7681129574775696,
+      "learning_rate": 6.795588235294118e-05,
+      "loss": 0.0212,
+      "step": 2380
+    },
+    {
+      "epoch": 0.2540152938374831,
+      "grad_norm": 0.7643230557441711,
+      "learning_rate": 6.766176470588236e-05,
+      "loss": 0.021,
+      "step": 2400
+    },
+    {
+      "epoch": 0.2561320879527955,
+      "grad_norm": 1.2285891771316528,
+      "learning_rate": 6.736764705882354e-05,
+      "loss": 0.0194,
+      "step": 2420
+    },
+    {
+      "epoch": 0.25824888206810787,
+      "grad_norm": 0.5345446467399597,
+      "learning_rate": 6.707352941176471e-05,
+      "loss": 0.0211,
+      "step": 2440
+    },
+    {
+      "epoch": 0.2603656761834202,
+      "grad_norm": 0.7964244484901428,
+      "learning_rate": 6.677941176470589e-05,
+      "loss": 0.024,
+      "step": 2460
+    },
+    {
+      "epoch": 0.26248247029873256,
+      "grad_norm": 0.5538131594657898,
+      "learning_rate": 6.648529411764705e-05,
+      "loss": 0.0258,
+      "step": 2480
+    },
+    {
+      "epoch": 0.2645992644140449,
+      "grad_norm": 0.9520718455314636,
+      "learning_rate": 6.619117647058823e-05,
+      "loss": 0.0178,
+      "step": 2500
+    },
+    {
+      "epoch": 0.2667160585293573,
+      "grad_norm": 0.6036665439605713,
+      "learning_rate": 6.589705882352942e-05,
+      "loss": 0.0193,
+      "step": 2520
+    },
+    {
+      "epoch": 0.26883285264466966,
+      "grad_norm": 0.37941470742225647,
+      "learning_rate": 6.56029411764706e-05,
+      "loss": 0.0184,
+      "step": 2540
+    },
+    {
+      "epoch": 0.270949646759982,
+      "grad_norm": 0.3956536650657654,
+      "learning_rate": 6.530882352941177e-05,
+      "loss": 0.0239,
+      "step": 2560
+    },
+    {
+      "epoch": 0.27306644087529436,
+      "grad_norm": 0.4313443899154663,
+      "learning_rate": 6.501470588235295e-05,
+      "loss": 0.0185,
+      "step": 2580
+    },
+    {
+      "epoch": 0.2751832349906067,
+      "grad_norm": 1.083382248878479,
+      "learning_rate": 6.472058823529412e-05,
+      "loss": 0.026,
+      "step": 2600
+    },
+    {
+      "epoch": 0.2773000291059191,
+      "grad_norm": 0.8067460060119629,
+      "learning_rate": 6.44264705882353e-05,
+      "loss": 0.0223,
+      "step": 2620
+    },
+    {
+      "epoch": 0.27941682322123146,
+      "grad_norm": 1.2681511640548706,
+      "learning_rate": 6.413235294117648e-05,
+      "loss": 0.0232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.2815336173365438,
+      "grad_norm": 0.5592957139015198,
+      "learning_rate": 6.383823529411765e-05,
+      "loss": 0.0184,
+      "step": 2660
+    },
+    {
+      "epoch": 0.28365041145185615,
+      "grad_norm": 0.5282326936721802,
+      "learning_rate": 6.354411764705883e-05,
+      "loss": 0.0195,
+      "step": 2680
+    },
+    {
+      "epoch": 0.2857672055671685,
+      "grad_norm": 0.5503069758415222,
+      "learning_rate": 6.324999999999999e-05,
+      "loss": 0.0182,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2878839996824809,
+      "grad_norm": 0.9767094254493713,
+      "learning_rate": 6.295588235294117e-05,
+      "loss": 0.0174,
+      "step": 2720
+    },
+    {
+      "epoch": 0.29000079379779325,
+      "grad_norm": 0.5078358054161072,
+      "learning_rate": 6.266176470588236e-05,
+      "loss": 0.0214,
+      "step": 2740
+    },
+    {
+      "epoch": 0.2921175879131056,
+      "grad_norm": 0.8082838654518127,
+      "learning_rate": 6.236764705882354e-05,
+      "loss": 0.0151,
+      "step": 2760
+    },
+    {
+      "epoch": 0.29423438202841795,
+      "grad_norm": 0.49735844135284424,
+      "learning_rate": 6.207352941176471e-05,
+      "loss": 0.0235,
+      "step": 2780
+    },
+    {
+      "epoch": 0.2963511761437303,
+      "grad_norm": 1.0940418243408203,
+      "learning_rate": 6.177941176470589e-05,
+      "loss": 0.016,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2984679702590427,
+      "grad_norm": 0.9790317416191101,
+      "learning_rate": 6.148529411764706e-05,
+      "loss": 0.0204,
+      "step": 2820
+    },
+    {
+      "epoch": 0.30058476437435505,
+      "grad_norm": 0.9905364513397217,
+      "learning_rate": 6.119117647058824e-05,
+      "loss": 0.0189,
+      "step": 2840
+    },
+    {
+      "epoch": 0.3027015584896674,
+      "grad_norm": 0.5084486603736877,
+      "learning_rate": 6.089705882352942e-05,
+      "loss": 0.0216,
+      "step": 2860
+    },
+    {
+      "epoch": 0.30481835260497975,
+      "grad_norm": 0.6312965750694275,
+      "learning_rate": 6.0602941176470594e-05,
+      "loss": 0.0197,
+      "step": 2880
+    },
+    {
+      "epoch": 0.3069351467202921,
+      "grad_norm": 1.0345927476882935,
+      "learning_rate": 6.0308823529411764e-05,
+      "loss": 0.0154,
+      "step": 2900
+    },
+    {
+      "epoch": 0.3090519408356045,
+      "grad_norm": 1.1944761276245117,
+      "learning_rate": 6.001470588235294e-05,
+      "loss": 0.017,
+      "step": 2920
+    },
+    {
+      "epoch": 0.31116873495091685,
+      "grad_norm": 0.6866488456726074,
+      "learning_rate": 5.972058823529412e-05,
+      "loss": 0.0158,
+      "step": 2940
+    },
+    {
+      "epoch": 0.3132855290662292,
+      "grad_norm": 1.0443695783615112,
+      "learning_rate": 5.9426470588235294e-05,
+      "loss": 0.0193,
+      "step": 2960
+    },
+    {
+      "epoch": 0.31540232318154154,
+      "grad_norm": 0.6489245891571045,
+      "learning_rate": 5.913235294117647e-05,
+      "loss": 0.016,
+      "step": 2980
+    },
+    {
+      "epoch": 0.3175191172968539,
+      "grad_norm": 1.388348937034607,
+      "learning_rate": 5.883823529411765e-05,
+      "loss": 0.0284,
+      "step": 3000
+    },
+    {
+      "epoch": 0.3196359114121663,
+      "grad_norm": 0.4919748306274414,
+      "learning_rate": 5.854411764705883e-05,
+      "loss": 0.0205,
+      "step": 3020
+    },
+    {
+      "epoch": 0.32175270552747864,
+      "grad_norm": 0.65608811378479,
+      "learning_rate": 5.8250000000000006e-05,
+      "loss": 0.0159,
+      "step": 3040
+    },
+    {
+      "epoch": 0.323869499642791,
+      "grad_norm": 0.4175134599208832,
+      "learning_rate": 5.795588235294118e-05,
+      "loss": 0.0159,
+      "step": 3060
+    },
+    {
+      "epoch": 0.32598629375810334,
+      "grad_norm": 0.6232139468193054,
+      "learning_rate": 5.766176470588236e-05,
+      "loss": 0.0177,
+      "step": 3080
+    },
+    {
+      "epoch": 0.3281030878734157,
+      "grad_norm": 0.4555909037590027,
+      "learning_rate": 5.7367647058823536e-05,
+      "loss": 0.0138,
+      "step": 3100
+    },
+    {
+      "epoch": 0.3302198819887281,
+      "grad_norm": 0.538420557975769,
+      "learning_rate": 5.7073529411764706e-05,
+      "loss": 0.0158,
+      "step": 3120
+    },
+    {
+      "epoch": 0.33233667610404044,
+      "grad_norm": 0.5802947878837585,
+      "learning_rate": 5.677941176470588e-05,
+      "loss": 0.0155,
+      "step": 3140
+    },
+    {
+      "epoch": 0.3344534702193528,
+      "grad_norm": 0.588239848613739,
+      "learning_rate": 5.648529411764706e-05,
+      "loss": 0.0187,
+      "step": 3160
+    },
+    {
+      "epoch": 0.33657026433466514,
+      "grad_norm": 0.5712038278579712,
+      "learning_rate": 5.6191176470588235e-05,
+      "loss": 0.013,
+      "step": 3180
+    },
+    {
+      "epoch": 0.3386870584499775,
+      "grad_norm": 0.4135841727256775,
+      "learning_rate": 5.589705882352941e-05,
+      "loss": 0.0171,
+      "step": 3200
+    },
+    {
+      "epoch": 0.3408038525652899,
+      "grad_norm": 0.7402490377426147,
+      "learning_rate": 5.560294117647059e-05,
+      "loss": 0.015,
+      "step": 3220
+    },
+    {
+      "epoch": 0.34292064668060224,
+      "grad_norm": 0.5647472143173218,
+      "learning_rate": 5.530882352941177e-05,
+      "loss": 0.0132,
+      "step": 3240
+    },
+    {
+      "epoch": 0.3450374407959146,
+      "grad_norm": 0.7440519332885742,
+      "learning_rate": 5.501470588235295e-05,
+      "loss": 0.0154,
+      "step": 3260
+    },
+    {
+      "epoch": 0.34715423491122693,
+      "grad_norm": 0.40782037377357483,
+      "learning_rate": 5.4720588235294124e-05,
+      "loss": 0.0168,
+      "step": 3280
+    },
+    {
+      "epoch": 0.3492710290265393,
+      "grad_norm": 0.3933939039707184,
+      "learning_rate": 5.44264705882353e-05,
+      "loss": 0.0161,
+      "step": 3300
+    },
+    {
+      "epoch": 0.3513878231418517,
+      "grad_norm": 0.29135826230049133,
+      "learning_rate": 5.413235294117648e-05,
+      "loss": 0.0189,
+      "step": 3320
+    },
+    {
+      "epoch": 0.35350461725716403,
+      "grad_norm": 0.581210196018219,
+      "learning_rate": 5.383823529411765e-05,
+      "loss": 0.0157,
+      "step": 3340
+    },
+    {
+      "epoch": 0.3556214113724764,
+      "grad_norm": 0.4485796391963959,
+      "learning_rate": 5.3544117647058824e-05,
+      "loss": 0.0142,
+      "step": 3360
+    },
+    {
+      "epoch": 0.35773820548778873,
+      "grad_norm": 0.4352544844150543,
+      "learning_rate": 5.325e-05,
+      "loss": 0.0153,
+      "step": 3380
+    },
+    {
+      "epoch": 0.3598549996031011,
+      "grad_norm": 1.0922011137008667,
+      "learning_rate": 5.2955882352941177e-05,
+      "loss": 0.0167,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3619717937184135,
+      "grad_norm": 0.2693778872489929,
+      "learning_rate": 5.266176470588235e-05,
+      "loss": 0.0137,
+      "step": 3420
+    },
+    {
+      "epoch": 0.36408858783372583,
+      "grad_norm": 1.5889476537704468,
+      "learning_rate": 5.236764705882353e-05,
+      "loss": 0.0127,
+      "step": 3440
+    },
+    {
+      "epoch": 0.3662053819490382,
+      "grad_norm": 2.3836777210235596,
+      "learning_rate": 5.207352941176471e-05,
+      "loss": 0.0196,
+      "step": 3460
+    },
+    {
+      "epoch": 0.3683221760643505,
+      "grad_norm": 0.6966289281845093,
+      "learning_rate": 5.177941176470589e-05,
+      "loss": 0.0138,
+      "step": 3480
+    },
+    {
+      "epoch": 0.3704389701796629,
+      "grad_norm": 0.7514053583145142,
+      "learning_rate": 5.1485294117647066e-05,
+      "loss": 0.0143,
+      "step": 3500
+    },
+    {
+      "epoch": 0.3725557642949753,
+      "grad_norm": 0.461103618144989,
+      "learning_rate": 5.119117647058824e-05,
+      "loss": 0.0146,
+      "step": 3520
+    },
+    {
+      "epoch": 0.3746725584102876,
+      "grad_norm": 0.7384988069534302,
+      "learning_rate": 5.089705882352941e-05,
+      "loss": 0.0167,
+      "step": 3540
+    },
+    {
+      "epoch": 0.3767893525256,
+      "grad_norm": 0.7363691329956055,
+      "learning_rate": 5.060294117647059e-05,
+      "loss": 0.0148,
+      "step": 3560
+    },
+    {
+      "epoch": 0.3789061466409123,
+      "grad_norm": 0.4628554582595825,
+      "learning_rate": 5.0308823529411765e-05,
+      "loss": 0.0138,
+      "step": 3580
+    },
+    {
+      "epoch": 0.38102294075622467,
+      "grad_norm": 0.48070573806762695,
+      "learning_rate": 5.001470588235294e-05,
+      "loss": 0.0148,
+      "step": 3600
+    },
+    {
+      "epoch": 0.3831397348715371,
+      "grad_norm": 0.913800835609436,
+      "learning_rate": 4.972058823529412e-05,
+      "loss": 0.0109,
+      "step": 3620
+    },
+    {
+      "epoch": 0.3852565289868494,
+      "grad_norm": 0.5302271842956543,
+      "learning_rate": 4.9426470588235295e-05,
+      "loss": 0.0129,
+      "step": 3640
+    },
+    {
+      "epoch": 0.38737332310216177,
+      "grad_norm": 0.5563445687294006,
+      "learning_rate": 4.913235294117647e-05,
+      "loss": 0.0155,
+      "step": 3660
+    },
+    {
+      "epoch": 0.3894901172174741,
+      "grad_norm": 0.7449616193771362,
+      "learning_rate": 4.8838235294117654e-05,
+      "loss": 0.0139,
+      "step": 3680
+    },
+    {
+      "epoch": 0.3916069113327865,
+      "grad_norm": 0.45803868770599365,
+      "learning_rate": 4.8544117647058824e-05,
+      "loss": 0.0134,
+      "step": 3700
+    },
+    {
+      "epoch": 0.39372370544809887,
+      "grad_norm": 0.4495037794113159,
+      "learning_rate": 4.825e-05,
+      "loss": 0.015,
+      "step": 3720
+    },
+    {
+      "epoch": 0.3958404995634112,
+      "grad_norm": 0.6490349769592285,
+      "learning_rate": 4.795588235294118e-05,
+      "loss": 0.0143,
+      "step": 3740
+    },
+    {
+      "epoch": 0.39795729367872357,
+      "grad_norm": 0.3576687276363373,
+      "learning_rate": 4.7661764705882354e-05,
+      "loss": 0.0118,
+      "step": 3760
+    },
+    {
+      "epoch": 0.4000740877940359,
+      "grad_norm": 0.5015860199928284,
+      "learning_rate": 4.736764705882353e-05,
+      "loss": 0.0169,
+      "step": 3780
+    },
+    {
+      "epoch": 0.4021908819093483,
+      "grad_norm": 1.0271028280258179,
+      "learning_rate": 4.707352941176471e-05,
+      "loss": 0.0119,
+      "step": 3800
+    },
+    {
+      "epoch": 0.40430767602466067,
+      "grad_norm": 0.4724489748477936,
+      "learning_rate": 4.677941176470588e-05,
+      "loss": 0.0112,
+      "step": 3820
+    },
+    {
+      "epoch": 0.406424470139973,
+      "grad_norm": 0.5578377842903137,
+      "learning_rate": 4.648529411764706e-05,
+      "loss": 0.013,
+      "step": 3840
+    },
+    {
+      "epoch": 0.40854126425528536,
+      "grad_norm": 0.6067779660224915,
+      "learning_rate": 4.6191176470588236e-05,
+      "loss": 0.0149,
+      "step": 3860
+    },
+    {
+      "epoch": 0.4106580583705977,
+      "grad_norm": 0.8015718460083008,
+      "learning_rate": 4.589705882352941e-05,
+      "loss": 0.0124,
+      "step": 3880
+    },
+    {
+      "epoch": 0.4127748524859101,
+      "grad_norm": 0.6352400183677673,
+      "learning_rate": 4.5602941176470596e-05,
+      "loss": 0.013,
+      "step": 3900
+    },
+    {
+      "epoch": 0.41489164660122246,
+      "grad_norm": 0.3545617163181305,
+      "learning_rate": 4.5308823529411765e-05,
+      "loss": 0.0117,
+      "step": 3920
+    },
+    {
+      "epoch": 0.4170084407165348,
+      "grad_norm": 0.4562068283557892,
+      "learning_rate": 4.501470588235294e-05,
+      "loss": 0.0118,
+      "step": 3940
+    },
+    {
+      "epoch": 0.41912523483184716,
+      "grad_norm": 0.8685987591743469,
+      "learning_rate": 4.472058823529412e-05,
+      "loss": 0.0126,
+      "step": 3960
+    },
+    {
+      "epoch": 0.4212420289471595,
+      "grad_norm": 0.49269378185272217,
+      "learning_rate": 4.4426470588235295e-05,
+      "loss": 0.0107,
+      "step": 3980
+    },
+    {
+      "epoch": 0.4233588230624719,
+      "grad_norm": 0.7156255841255188,
+      "learning_rate": 4.413235294117647e-05,
+      "loss": 0.0107,
+      "step": 4000
+    },
+    {
+      "epoch": 0.42547561717778426,
+      "grad_norm": 0.6339916586875916,
+      "learning_rate": 4.383823529411765e-05,
+      "loss": 0.0149,
+      "step": 4020
+    },
+    {
+      "epoch": 0.4275924112930966,
+      "grad_norm": 0.6008257269859314,
+      "learning_rate": 4.3544117647058824e-05,
+      "loss": 0.0121,
+      "step": 4040
+    },
+    {
+      "epoch": 0.42970920540840896,
+      "grad_norm": 0.34715619683265686,
+      "learning_rate": 4.325e-05,
+      "loss": 0.0115,
+      "step": 4060
+    },
+    {
+      "epoch": 0.4318259995237213,
+      "grad_norm": 0.6943634152412415,
+      "learning_rate": 4.295588235294118e-05,
+      "loss": 0.0115,
+      "step": 4080
+    },
+    {
+      "epoch": 0.4339427936390337,
+      "grad_norm": 0.5919560194015503,
+      "learning_rate": 4.2661764705882354e-05,
+      "loss": 0.0094,
+      "step": 4100
+    },
+    {
+      "epoch": 0.43605958775434606,
+      "grad_norm": 0.23244401812553406,
+      "learning_rate": 4.236764705882354e-05,
+      "loss": 0.0112,
+      "step": 4120
+    },
+    {
+      "epoch": 0.4381763818696584,
+      "grad_norm": 0.35059890151023865,
+      "learning_rate": 4.207352941176471e-05,
+      "loss": 0.0133,
+      "step": 4140
+    },
+    {
+      "epoch": 0.44029317598497075,
+      "grad_norm": 0.32678091526031494,
+      "learning_rate": 4.1779411764705883e-05,
+      "loss": 0.0113,
+      "step": 4160
+    },
+    {
+      "epoch": 0.4424099701002831,
+      "grad_norm": 0.6617632508277893,
+      "learning_rate": 4.148529411764706e-05,
+      "loss": 0.0105,
+      "step": 4180
+    },
+    {
+      "epoch": 0.4445267642155955,
+      "grad_norm": 0.27029886841773987,
+      "learning_rate": 4.1191176470588236e-05,
+      "loss": 0.0115,
+      "step": 4200
+    },
+    {
+      "epoch": 0.44664355833090785,
+      "grad_norm": 0.7106760144233704,
+      "learning_rate": 4.089705882352941e-05,
+      "loss": 0.0124,
+      "step": 4220
+    },
+    {
+      "epoch": 0.4487603524462202,
+      "grad_norm": 0.5163691639900208,
+      "learning_rate": 4.060294117647059e-05,
+      "loss": 0.0111,
+      "step": 4240
+    },
+    {
+      "epoch": 0.45087714656153255,
+      "grad_norm": 0.7228760123252869,
+      "learning_rate": 4.0308823529411766e-05,
+      "loss": 0.0113,
+      "step": 4260
+    },
+    {
+      "epoch": 0.4529939406768449,
+      "grad_norm": 0.5797919631004333,
+      "learning_rate": 4.001470588235294e-05,
+      "loss": 0.0118,
+      "step": 4280
+    },
+    {
+      "epoch": 0.4551107347921573,
+      "grad_norm": 1.233983039855957,
+      "learning_rate": 3.972058823529412e-05,
+      "loss": 0.0087,
+      "step": 4300
+    },
+    {
+      "epoch": 0.45722752890746965,
+      "grad_norm": 0.657342791557312,
+      "learning_rate": 3.9426470588235295e-05,
+      "loss": 0.0092,
+      "step": 4320
+    },
+    {
+      "epoch": 0.459344323022782,
+      "grad_norm": 0.4171401262283325,
+      "learning_rate": 3.913235294117647e-05,
+      "loss": 0.0161,
+      "step": 4340
+    },
+    {
+      "epoch": 0.46146111713809435,
+      "grad_norm": 0.34782201051712036,
+      "learning_rate": 3.883823529411765e-05,
+      "loss": 0.0103,
+      "step": 4360
+    },
+    {
+      "epoch": 0.4635779112534067,
+      "grad_norm": 0.5111158490180969,
+      "learning_rate": 3.8544117647058825e-05,
+      "loss": 0.0097,
+      "step": 4380
+    },
+    {
+      "epoch": 0.4656947053687191,
+      "grad_norm": 0.5910077095031738,
+      "learning_rate": 3.825e-05,
+      "loss": 0.0176,
+      "step": 4400
+    },
+    {
+      "epoch": 0.46781149948403145,
+      "grad_norm": 0.6808711290359497,
+      "learning_rate": 3.795588235294118e-05,
+      "loss": 0.009,
+      "step": 4420
+    },
+    {
+      "epoch": 0.4699282935993438,
+      "grad_norm": 0.4499869644641876,
+      "learning_rate": 3.7661764705882354e-05,
+      "loss": 0.0106,
+      "step": 4440
+    },
+    {
+      "epoch": 0.47204508771465614,
+      "grad_norm": 0.4361923336982727,
+      "learning_rate": 3.736764705882353e-05,
+      "loss": 0.0097,
+      "step": 4460
+    },
+    {
+      "epoch": 0.4741618818299685,
+      "grad_norm": 0.3171451985836029,
+      "learning_rate": 3.707352941176471e-05,
+      "loss": 0.0092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.4762786759452809,
+      "grad_norm": 0.28628259897232056,
+      "learning_rate": 3.6779411764705884e-05,
+      "loss": 0.0081,
+      "step": 4500
+    },
+    {
+      "epoch": 0.47839547006059324,
+      "grad_norm": 0.5043999552726746,
+      "learning_rate": 3.648529411764706e-05,
+      "loss": 0.0102,
+      "step": 4520
+    },
+    {
+      "epoch": 0.4805122641759056,
+      "grad_norm": 0.3881862163543701,
+      "learning_rate": 3.619117647058824e-05,
+      "loss": 0.0109,
+      "step": 4540
+    },
+    {
+      "epoch": 0.48262905829121794,
+      "grad_norm": 0.6093239188194275,
+      "learning_rate": 3.589705882352941e-05,
+      "loss": 0.0089,
+      "step": 4560
+    },
+    {
+      "epoch": 0.4847458524065303,
+      "grad_norm": 0.4642229378223419,
+      "learning_rate": 3.560294117647059e-05,
+      "loss": 0.0092,
+      "step": 4580
+    },
+    {
+      "epoch": 0.4868626465218427,
+      "grad_norm": 0.4857279062271118,
+      "learning_rate": 3.5308823529411766e-05,
+      "loss": 0.0081,
+      "step": 4600
+    },
+    {
+      "epoch": 0.48897944063715504,
+      "grad_norm": 0.40589526295661926,
+      "learning_rate": 3.501470588235294e-05,
+      "loss": 0.0098,
+      "step": 4620
+    },
+    {
+      "epoch": 0.4910962347524674,
+      "grad_norm": 0.2723426818847656,
+      "learning_rate": 3.472058823529412e-05,
+      "loss": 0.0133,
+      "step": 4640
+    },
+    {
+      "epoch": 0.49321302886777973,
+      "grad_norm": 0.7545261383056641,
+      "learning_rate": 3.4426470588235296e-05,
+      "loss": 0.0103,
+      "step": 4660
+    },
+    {
+      "epoch": 0.4953298229830921,
+      "grad_norm": 1.5047451257705688,
+      "learning_rate": 3.413235294117647e-05,
+      "loss": 0.0103,
+      "step": 4680
+    },
+    {
+      "epoch": 0.4974466170984045,
+      "grad_norm": 0.46020635962486267,
+      "learning_rate": 3.383823529411765e-05,
+      "loss": 0.0092,
+      "step": 4700
+    },
+    {
+      "epoch": 0.49956341121371683,
+      "grad_norm": 0.42124831676483154,
+      "learning_rate": 3.3544117647058825e-05,
+      "loss": 0.0112,
+      "step": 4720
+    },
+    {
+      "epoch": 0.5016802053290292,
+      "grad_norm": 0.18676140904426575,
+      "learning_rate": 3.325e-05,
+      "loss": 0.0096,
+      "step": 4740
+    },
+    {
+      "epoch": 0.5037969994443415,
+      "grad_norm": 0.41889238357543945,
+      "learning_rate": 3.295588235294118e-05,
+      "loss": 0.0112,
+      "step": 4760
+    },
+    {
+      "epoch": 0.5059137935596539,
+      "grad_norm": 0.5965830087661743,
+      "learning_rate": 3.2661764705882355e-05,
+      "loss": 0.0082,
+      "step": 4780
+    },
+    {
+      "epoch": 0.5080305876749662,
+      "grad_norm": 0.5901793837547302,
+      "learning_rate": 3.236764705882353e-05,
+      "loss": 0.0092,
+      "step": 4800
+    },
+    {
+      "epoch": 0.5101473817902786,
+      "grad_norm": 0.453032910823822,
+      "learning_rate": 3.207352941176471e-05,
+      "loss": 0.0104,
+      "step": 4820
+    },
+    {
+      "epoch": 0.512264175905591,
+      "grad_norm": 0.3099919557571411,
+      "learning_rate": 3.1779411764705884e-05,
+      "loss": 0.0097,
+      "step": 4840
+    },
+    {
+      "epoch": 0.5143809700209033,
+      "grad_norm": 0.28637203574180603,
+      "learning_rate": 3.148529411764706e-05,
+      "loss": 0.0074,
+      "step": 4860
+    },
+    {
+      "epoch": 0.5164977641362157,
+      "grad_norm": 0.45871102809906006,
+      "learning_rate": 3.119117647058824e-05,
+      "loss": 0.0093,
+      "step": 4880
+    },
+    {
+      "epoch": 0.518614558251528,
+      "grad_norm": 0.5844906568527222,
+      "learning_rate": 3.0897058823529414e-05,
+      "loss": 0.0097,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5207313523668404,
+      "grad_norm": 0.7102438807487488,
+      "learning_rate": 3.060294117647059e-05,
+      "loss": 0.0083,
+      "step": 4920
+    },
+    {
+      "epoch": 0.5228481464821528,
+      "grad_norm": 0.483784943819046,
+      "learning_rate": 3.0308823529411767e-05,
+      "loss": 0.0091,
+      "step": 4940
+    },
+    {
+      "epoch": 0.5249649405974651,
+      "grad_norm": 0.4747030436992645,
+      "learning_rate": 3.0014705882352943e-05,
+      "loss": 0.0091,
+      "step": 4960
+    },
+    {
+      "epoch": 0.5270817347127775,
+      "grad_norm": 0.3532012403011322,
+      "learning_rate": 2.9720588235294116e-05,
+      "loss": 0.0082,
+      "step": 4980
+    },
+    {
+      "epoch": 0.5291985288280898,
+      "grad_norm": 0.42889463901519775,
+      "learning_rate": 2.9426470588235293e-05,
+      "loss": 0.0091,
+      "step": 5000
+    },
+    {
+      "epoch": 0.5313153229434022,
+      "grad_norm": 0.4388155937194824,
+      "learning_rate": 2.9132352941176473e-05,
+      "loss": 0.0088,
+      "step": 5020
+    },
+    {
+      "epoch": 0.5334321170587146,
+      "grad_norm": 0.49440255761146545,
+      "learning_rate": 2.883823529411765e-05,
+      "loss": 0.0091,
+      "step": 5040
+    },
+    {
+      "epoch": 0.5355489111740269,
+      "grad_norm": 0.3930880129337311,
+      "learning_rate": 2.8544117647058826e-05,
+      "loss": 0.0114,
+      "step": 5060
+    },
+    {
+      "epoch": 0.5376657052893393,
+      "grad_norm": 0.380283921957016,
+      "learning_rate": 2.825e-05,
+      "loss": 0.0105,
+      "step": 5080
+    },
+    {
+      "epoch": 0.5397824994046516,
+      "grad_norm": 0.3737698793411255,
+      "learning_rate": 2.7955882352941175e-05,
+      "loss": 0.0132,
+      "step": 5100
+    },
+    {
+      "epoch": 0.541899293519964,
+      "grad_norm": 0.5393537282943726,
+      "learning_rate": 2.7661764705882355e-05,
+      "loss": 0.0118,
+      "step": 5120
+    },
+    {
+      "epoch": 0.5440160876352764,
+      "grad_norm": 0.3449922502040863,
+      "learning_rate": 2.7367647058823532e-05,
+      "loss": 0.0077,
+      "step": 5140
+    },
+    {
+      "epoch": 0.5461328817505887,
+      "grad_norm": 0.6629793643951416,
+      "learning_rate": 2.7073529411764708e-05,
+      "loss": 0.0084,
+      "step": 5160
+    },
+    {
+      "epoch": 0.5482496758659011,
+      "grad_norm": 0.7243732810020447,
+      "learning_rate": 2.6779411764705885e-05,
+      "loss": 0.0073,
+      "step": 5180
+    },
+    {
+      "epoch": 0.5503664699812134,
+      "grad_norm": 0.6006022691726685,
+      "learning_rate": 2.6485294117647058e-05,
+      "loss": 0.0084,
+      "step": 5200
+    },
+    {
+      "epoch": 0.5524832640965258,
+      "grad_norm": 0.5986945629119873,
+      "learning_rate": 2.6191176470588234e-05,
+      "loss": 0.0087,
+      "step": 5220
+    },
+    {
+      "epoch": 0.5546000582118382,
+      "grad_norm": 0.267560750246048,
+      "learning_rate": 2.5897058823529414e-05,
+      "loss": 0.0092,
+      "step": 5240
+    },
+    {
+      "epoch": 0.5567168523271505,
+      "grad_norm": 0.47937673330307007,
+      "learning_rate": 2.560294117647059e-05,
+      "loss": 0.0089,
+      "step": 5260
+    },
+    {
+      "epoch": 0.5588336464424629,
+      "grad_norm": 0.4451775550842285,
+      "learning_rate": 2.5308823529411767e-05,
+      "loss": 0.0082,
+      "step": 5280
+    },
+    {
+      "epoch": 0.5609504405577752,
+      "grad_norm": 0.7350065112113953,
+      "learning_rate": 2.501470588235294e-05,
+      "loss": 0.0087,
+      "step": 5300
+    },
+    {
+      "epoch": 0.5630672346730876,
+      "grad_norm": 0.43704766035079956,
+      "learning_rate": 2.4720588235294117e-05,
+      "loss": 0.0089,
+      "step": 5320
+    },
+    {
+      "epoch": 0.5651840287884,
+      "grad_norm": 0.29158827662467957,
+      "learning_rate": 2.4426470588235297e-05,
+      "loss": 0.0066,
+      "step": 5340
+    },
+    {
+      "epoch": 0.5673008229037123,
+      "grad_norm": 0.39838340878486633,
+      "learning_rate": 2.4132352941176473e-05,
+      "loss": 0.0081,
+      "step": 5360
+    },
+    {
+      "epoch": 0.5694176170190247,
+      "grad_norm": 0.4324835538864136,
+      "learning_rate": 2.3838235294117646e-05,
+      "loss": 0.008,
+      "step": 5380
+    },
+    {
+      "epoch": 0.571534411134337,
+      "grad_norm": 0.4358319938182831,
+      "learning_rate": 2.3544117647058826e-05,
+      "loss": 0.008,
+      "step": 5400
+    },
+    {
+      "epoch": 0.5736512052496494,
+      "grad_norm": 0.8966334462165833,
+      "learning_rate": 2.3250000000000003e-05,
+      "loss": 0.0078,
+      "step": 5420
+    },
+    {
+      "epoch": 0.5757679993649618,
+      "grad_norm": 0.9501079320907593,
+      "learning_rate": 2.2955882352941176e-05,
+      "loss": 0.0184,
+      "step": 5440
+    },
+    {
+      "epoch": 0.5778847934802741,
+      "grad_norm": 0.13483519852161407,
+      "learning_rate": 2.2661764705882356e-05,
+      "loss": 0.0154,
+      "step": 5460
+    },
+    {
+      "epoch": 0.5800015875955865,
+      "grad_norm": 0.4287421703338623,
+      "learning_rate": 2.236764705882353e-05,
+      "loss": 0.0084,
+      "step": 5480
+    },
+    {
+      "epoch": 0.5821183817108988,
+      "grad_norm": 0.1738578975200653,
+      "learning_rate": 2.2073529411764705e-05,
+      "loss": 0.0079,
+      "step": 5500
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 7000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.4978733484727508e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87c1838316ae8c7df2b7eb5f039022e01d00f71f28b5a09877cd72af98fb0743
+size 5969

internvl3_1b_lora_7000_20260304_104032/checkpoint-5500/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8e61f668388e8ada146ac126f552ea1bf7c2808f48f0fda53415e7f06cd006d9
+size 18138288

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "</box>": 151673,
+  "</img>": 151666,
+  "</quad>": 151669,
+  "</ref>": 151671,
+  "</tool_call>": 151658,
+  "<IMG_CONTEXT>": 151667,
+  "<box>": 151672,
+  "<img>": 151665,
+  "<quad>": 151668,
+  "<ref>": 151670,
+  "<tool_call>": 151657,
+  "<video>": 151674,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,6 @@

+{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+'}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<IMG_CONTEXT>
+' }}{% elif content['type'] == 'video' %}{{ '<video>
+' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
+'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
+' }}{% endif %}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "start_image_token": "<img>",
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cc80b7e20adf8bf6f6ca442bf1abfac8056bb3b7d3e0b11c9d497d3e79398c9
+size 11423732

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,306 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "context_image_token": "<IMG_CONTEXT>",
+    "end_image_token": "</img>",
+    "start_image_token": "<img>",
+    "video_token": "<video>"
+  },
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "return_token_type_ids": false,
+  "split_special_tokens": false,
+  "start_image_token": "<img>",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2141 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6350382345937078,
+  "eval_steps": 500,
+  "global_step": 6000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00010583970576561797,
+      "grad_norm": 2.33493709564209,
+      "learning_rate": 0.0,
+      "loss": 1.2764,
+      "step": 1
+    },
+    {
+      "epoch": 0.0021167941153123595,
+      "grad_norm": 2.771491765975952,
+      "learning_rate": 9.5e-06,
+      "loss": 1.6307,
+      "step": 20
+    },
+    {
+      "epoch": 0.004233588230624719,
+      "grad_norm": 1.6168005466461182,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 1.5457,
+      "step": 40
+    },
+    {
+      "epoch": 0.006350382345937078,
+      "grad_norm": 1.033608317375183,
+      "learning_rate": 2.95e-05,
+      "loss": 1.333,
+      "step": 60
+    },
+    {
+      "epoch": 0.008467176461249438,
+      "grad_norm": 0.9740731716156006,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.1355,
+      "step": 80
+    },
+    {
+      "epoch": 0.010583970576561796,
+      "grad_norm": 1.2390116453170776,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 1.0417,
+      "step": 100
+    },
+    {
+      "epoch": 0.012700764691874157,
+      "grad_norm": 1.3689967393875122,
+      "learning_rate": 5.95e-05,
+      "loss": 0.9421,
+      "step": 120
+    },
+    {
+      "epoch": 0.014817558807186515,
+      "grad_norm": 2.2537403106689453,
+      "learning_rate": 6.95e-05,
+      "loss": 0.9362,
+      "step": 140
+    },
+    {
+      "epoch": 0.016934352922498876,
+      "grad_norm": 1.8407371044158936,
+      "learning_rate": 7.950000000000001e-05,
+      "loss": 0.8148,
+      "step": 160
+    },
+    {
+      "epoch": 0.019051147037811234,
+      "grad_norm": 1.8288472890853882,
+      "learning_rate": 8.950000000000001e-05,
+      "loss": 0.7605,
+      "step": 180
+    },
+    {
+      "epoch": 0.021167941153123593,
+      "grad_norm": 2.447781562805176,
+      "learning_rate": 9.95e-05,
+      "loss": 0.6912,
+      "step": 200
+    },
+    {
+      "epoch": 0.023284735268435955,
+      "grad_norm": 2.366830825805664,
+      "learning_rate": 9.972058823529412e-05,
+      "loss": 0.6242,
+      "step": 220
+    },
+    {
+      "epoch": 0.025401529383748313,
+      "grad_norm": 2.799335479736328,
+      "learning_rate": 9.94264705882353e-05,
+      "loss": 0.5381,
+      "step": 240
+    },
+    {
+      "epoch": 0.027518323499060672,
+      "grad_norm": 2.6497650146484375,
+      "learning_rate": 9.913235294117647e-05,
+      "loss": 0.5163,
+      "step": 260
+    },
+    {
+      "epoch": 0.02963511761437303,
+      "grad_norm": 3.2764251232147217,
+      "learning_rate": 9.883823529411765e-05,
+      "loss": 0.474,
+      "step": 280
+    },
+    {
+      "epoch": 0.03175191172968539,
+      "grad_norm": 3.223743200302124,
+      "learning_rate": 9.854411764705883e-05,
+      "loss": 0.461,
+      "step": 300
+    },
+    {
+      "epoch": 0.03386870584499775,
+      "grad_norm": 3.198038339614868,
+      "learning_rate": 9.825e-05,
+      "loss": 0.4216,
+      "step": 320
+    },
+    {
+      "epoch": 0.03598549996031011,
+      "grad_norm": 3.033092737197876,
+      "learning_rate": 9.795588235294119e-05,
+      "loss": 0.3453,
+      "step": 340
+    },
+    {
+      "epoch": 0.03810229407562247,
+      "grad_norm": 2.908698797225952,
+      "learning_rate": 9.766176470588236e-05,
+      "loss": 0.3506,
+      "step": 360
+    },
+    {
+      "epoch": 0.04021908819093483,
+      "grad_norm": 3.772873878479004,
+      "learning_rate": 9.736764705882353e-05,
+      "loss": 0.3381,
+      "step": 380
+    },
+    {
+      "epoch": 0.042335882306247186,
+      "grad_norm": 2.692840337753296,
+      "learning_rate": 9.707352941176471e-05,
+      "loss": 0.2868,
+      "step": 400
+    },
+    {
+      "epoch": 0.04445267642155955,
+      "grad_norm": 3.629152297973633,
+      "learning_rate": 9.677941176470589e-05,
+      "loss": 0.2845,
+      "step": 420
+    },
+    {
+      "epoch": 0.04656947053687191,
+      "grad_norm": 4.045558929443359,
+      "learning_rate": 9.648529411764706e-05,
+      "loss": 0.275,
+      "step": 440
+    },
+    {
+      "epoch": 0.048686264652184265,
+      "grad_norm": 2.3519065380096436,
+      "learning_rate": 9.619117647058824e-05,
+      "loss": 0.2254,
+      "step": 460
+    },
+    {
+      "epoch": 0.05080305876749663,
+      "grad_norm": 2.71055269241333,
+      "learning_rate": 9.589705882352941e-05,
+      "loss": 0.2356,
+      "step": 480
+    },
+    {
+      "epoch": 0.05291985288280899,
+      "grad_norm": 3.037832021713257,
+      "learning_rate": 9.560294117647059e-05,
+      "loss": 0.2,
+      "step": 500
+    },
+    {
+      "epoch": 0.055036646998121344,
+      "grad_norm": 3.366894245147705,
+      "learning_rate": 9.530882352941177e-05,
+      "loss": 0.1888,
+      "step": 520
+    },
+    {
+      "epoch": 0.057153441113433706,
+      "grad_norm": 2.728973865509033,
+      "learning_rate": 9.501470588235294e-05,
+      "loss": 0.1844,
+      "step": 540
+    },
+    {
+      "epoch": 0.05927023522874606,
+      "grad_norm": 2.229743719100952,
+      "learning_rate": 9.472058823529412e-05,
+      "loss": 0.1658,
+      "step": 560
+    },
+    {
+      "epoch": 0.06138702934405842,
+      "grad_norm": 2.3469460010528564,
+      "learning_rate": 9.44264705882353e-05,
+      "loss": 0.1556,
+      "step": 580
+    },
+    {
+      "epoch": 0.06350382345937078,
+      "grad_norm": 2.338606595993042,
+      "learning_rate": 9.413235294117647e-05,
+      "loss": 0.1391,
+      "step": 600
+    },
+    {
+      "epoch": 0.06562061757468314,
+      "grad_norm": 1.9111056327819824,
+      "learning_rate": 9.383823529411765e-05,
+      "loss": 0.1606,
+      "step": 620
+    },
+    {
+      "epoch": 0.0677374116899955,
+      "grad_norm": 3.3568716049194336,
+      "learning_rate": 9.354411764705883e-05,
+      "loss": 0.1664,
+      "step": 640
+    },
+    {
+      "epoch": 0.06985420580530786,
+      "grad_norm": 3.88547945022583,
+      "learning_rate": 9.325e-05,
+      "loss": 0.1494,
+      "step": 660
+    },
+    {
+      "epoch": 0.07197099992062023,
+      "grad_norm": 2.3967244625091553,
+      "learning_rate": 9.295588235294118e-05,
+      "loss": 0.141,
+      "step": 680
+    },
+    {
+      "epoch": 0.07408779403593257,
+      "grad_norm": 3.0165176391601562,
+      "learning_rate": 9.266176470588236e-05,
+      "loss": 0.1409,
+      "step": 700
+    },
+    {
+      "epoch": 0.07620458815124494,
+      "grad_norm": 3.2665436267852783,
+      "learning_rate": 9.236764705882353e-05,
+      "loss": 0.1112,
+      "step": 720
+    },
+    {
+      "epoch": 0.0783213822665573,
+      "grad_norm": 2.3310046195983887,
+      "learning_rate": 9.207352941176471e-05,
+      "loss": 0.1396,
+      "step": 740
+    },
+    {
+      "epoch": 0.08043817638186966,
+      "grad_norm": 1.8768619298934937,
+      "learning_rate": 9.177941176470589e-05,
+      "loss": 0.1114,
+      "step": 760
+    },
+    {
+      "epoch": 0.08255497049718202,
+      "grad_norm": 3.4282712936401367,
+      "learning_rate": 9.148529411764706e-05,
+      "loss": 0.1073,
+      "step": 780
+    },
+    {
+      "epoch": 0.08467176461249437,
+      "grad_norm": 3.2704601287841797,
+      "learning_rate": 9.119117647058824e-05,
+      "loss": 0.1281,
+      "step": 800
+    },
+    {
+      "epoch": 0.08678855872780673,
+      "grad_norm": 2.225818157196045,
+      "learning_rate": 9.089705882352942e-05,
+      "loss": 0.1046,
+      "step": 820
+    },
+    {
+      "epoch": 0.0889053528431191,
+      "grad_norm": 2.078011989593506,
+      "learning_rate": 9.060294117647059e-05,
+      "loss": 0.1033,
+      "step": 840
+    },
+    {
+      "epoch": 0.09102214695843146,
+      "grad_norm": 1.3325825929641724,
+      "learning_rate": 9.030882352941177e-05,
+      "loss": 0.0872,
+      "step": 860
+    },
+    {
+      "epoch": 0.09313894107374382,
+      "grad_norm": 3.0471086502075195,
+      "learning_rate": 9.001470588235294e-05,
+      "loss": 0.0927,
+      "step": 880
+    },
+    {
+      "epoch": 0.09525573518905617,
+      "grad_norm": 2.9685380458831787,
+      "learning_rate": 8.972058823529412e-05,
+      "loss": 0.0917,
+      "step": 900
+    },
+    {
+      "epoch": 0.09737252930436853,
+      "grad_norm": 1.8589142560958862,
+      "learning_rate": 8.94264705882353e-05,
+      "loss": 0.0836,
+      "step": 920
+    },
+    {
+      "epoch": 0.09948932341968089,
+      "grad_norm": 1.523457407951355,
+      "learning_rate": 8.913235294117647e-05,
+      "loss": 0.0862,
+      "step": 940
+    },
+    {
+      "epoch": 0.10160611753499325,
+      "grad_norm": 1.4009277820587158,
+      "learning_rate": 8.883823529411765e-05,
+      "loss": 0.068,
+      "step": 960
+    },
+    {
+      "epoch": 0.10372291165030562,
+      "grad_norm": 2.0816826820373535,
+      "learning_rate": 8.854411764705883e-05,
+      "loss": 0.0741,
+      "step": 980
+    },
+    {
+      "epoch": 0.10583970576561798,
+      "grad_norm": 2.218278408050537,
+      "learning_rate": 8.825e-05,
+      "loss": 0.0822,
+      "step": 1000
+    },
+    {
+      "epoch": 0.10795649988093033,
+      "grad_norm": 1.188503623008728,
+      "learning_rate": 8.795588235294118e-05,
+      "loss": 0.0792,
+      "step": 1020
+    },
+    {
+      "epoch": 0.11007329399624269,
+      "grad_norm": 0.9847146272659302,
+      "learning_rate": 8.766176470588236e-05,
+      "loss": 0.0696,
+      "step": 1040
+    },
+    {
+      "epoch": 0.11219008811155505,
+      "grad_norm": 3.0967068672180176,
+      "learning_rate": 8.736764705882353e-05,
+      "loss": 0.0842,
+      "step": 1060
+    },
+    {
+      "epoch": 0.11430688222686741,
+      "grad_norm": 2.4966516494750977,
+      "learning_rate": 8.707352941176471e-05,
+      "loss": 0.065,
+      "step": 1080
+    },
+    {
+      "epoch": 0.11642367634217977,
+      "grad_norm": 1.7355480194091797,
+      "learning_rate": 8.677941176470589e-05,
+      "loss": 0.0672,
+      "step": 1100
+    },
+    {
+      "epoch": 0.11854047045749212,
+      "grad_norm": 2.5048105716705322,
+      "learning_rate": 8.648529411764706e-05,
+      "loss": 0.0616,
+      "step": 1120
+    },
+    {
+      "epoch": 0.12065726457280448,
+      "grad_norm": 1.285093903541565,
+      "learning_rate": 8.619117647058824e-05,
+      "loss": 0.0676,
+      "step": 1140
+    },
+    {
+      "epoch": 0.12277405868811685,
+      "grad_norm": 1.58004891872406,
+      "learning_rate": 8.589705882352942e-05,
+      "loss": 0.0827,
+      "step": 1160
+    },
+    {
+      "epoch": 0.12489085280342921,
+      "grad_norm": 1.571897029876709,
+      "learning_rate": 8.560294117647059e-05,
+      "loss": 0.0663,
+      "step": 1180
+    },
+    {
+      "epoch": 0.12700764691874156,
+      "grad_norm": 0.8998542428016663,
+      "learning_rate": 8.530882352941177e-05,
+      "loss": 0.0448,
+      "step": 1200
+    },
+    {
+      "epoch": 0.12912444103405393,
+      "grad_norm": 1.577183485031128,
+      "learning_rate": 8.501470588235295e-05,
+      "loss": 0.0574,
+      "step": 1220
+    },
+    {
+      "epoch": 0.13124123514936628,
+      "grad_norm": 1.7241120338439941,
+      "learning_rate": 8.472058823529412e-05,
+      "loss": 0.0586,
+      "step": 1240
+    },
+    {
+      "epoch": 0.13335802926467866,
+      "grad_norm": 1.4512884616851807,
+      "learning_rate": 8.44264705882353e-05,
+      "loss": 0.0457,
+      "step": 1260
+    },
+    {
+      "epoch": 0.135474823379991,
+      "grad_norm": 1.4320181608200073,
+      "learning_rate": 8.413235294117647e-05,
+      "loss": 0.0572,
+      "step": 1280
+    },
+    {
+      "epoch": 0.13759161749530335,
+      "grad_norm": 2.4721877574920654,
+      "learning_rate": 8.383823529411765e-05,
+      "loss": 0.0539,
+      "step": 1300
+    },
+    {
+      "epoch": 0.13970841161061573,
+      "grad_norm": 1.230265498161316,
+      "learning_rate": 8.354411764705883e-05,
+      "loss": 0.0524,
+      "step": 1320
+    },
+    {
+      "epoch": 0.14182520572592808,
+      "grad_norm": 1.5039700269699097,
+      "learning_rate": 8.325e-05,
+      "loss": 0.0569,
+      "step": 1340
+    },
+    {
+      "epoch": 0.14394199984124045,
+      "grad_norm": 1.9928780794143677,
+      "learning_rate": 8.295588235294118e-05,
+      "loss": 0.0402,
+      "step": 1360
+    },
+    {
+      "epoch": 0.1460587939565528,
+      "grad_norm": 1.8550405502319336,
+      "learning_rate": 8.266176470588236e-05,
+      "loss": 0.054,
+      "step": 1380
+    },
+    {
+      "epoch": 0.14817558807186515,
+      "grad_norm": 1.0700241327285767,
+      "learning_rate": 8.236764705882353e-05,
+      "loss": 0.052,
+      "step": 1400
+    },
+    {
+      "epoch": 0.15029238218717753,
+      "grad_norm": 1.7121262550354004,
+      "learning_rate": 8.207352941176471e-05,
+      "loss": 0.0437,
+      "step": 1420
+    },
+    {
+      "epoch": 0.15240917630248987,
+      "grad_norm": 1.3593100309371948,
+      "learning_rate": 8.177941176470589e-05,
+      "loss": 0.0393,
+      "step": 1440
+    },
+    {
+      "epoch": 0.15452597041780225,
+      "grad_norm": 1.080735683441162,
+      "learning_rate": 8.148529411764706e-05,
+      "loss": 0.035,
+      "step": 1460
+    },
+    {
+      "epoch": 0.1566427645331146,
+      "grad_norm": 1.5516977310180664,
+      "learning_rate": 8.119117647058824e-05,
+      "loss": 0.0421,
+      "step": 1480
+    },
+    {
+      "epoch": 0.15875955864842695,
+      "grad_norm": 1.107473373413086,
+      "learning_rate": 8.089705882352942e-05,
+      "loss": 0.0442,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16087635276373932,
+      "grad_norm": 2.196147918701172,
+      "learning_rate": 8.060294117647059e-05,
+      "loss": 0.0443,
+      "step": 1520
+    },
+    {
+      "epoch": 0.16299314687905167,
+      "grad_norm": 1.4532606601715088,
+      "learning_rate": 8.030882352941177e-05,
+      "loss": 0.0417,
+      "step": 1540
+    },
+    {
+      "epoch": 0.16510994099436405,
+      "grad_norm": 3.0167882442474365,
+      "learning_rate": 8.001470588235295e-05,
+      "loss": 0.0472,
+      "step": 1560
+    },
+    {
+      "epoch": 0.1672267351096764,
+      "grad_norm": 1.764201283454895,
+      "learning_rate": 7.972058823529412e-05,
+      "loss": 0.031,
+      "step": 1580
+    },
+    {
+      "epoch": 0.16934352922498874,
+      "grad_norm": 0.8682387471199036,
+      "learning_rate": 7.94264705882353e-05,
+      "loss": 0.0291,
+      "step": 1600
+    },
+    {
+      "epoch": 0.17146032334030112,
+      "grad_norm": 0.660894513130188,
+      "learning_rate": 7.913235294117648e-05,
+      "loss": 0.0572,
+      "step": 1620
+    },
+    {
+      "epoch": 0.17357711745561347,
+      "grad_norm": 1.7611377239227295,
+      "learning_rate": 7.883823529411765e-05,
+      "loss": 0.0453,
+      "step": 1640
+    },
+    {
+      "epoch": 0.17569391157092584,
+      "grad_norm": 0.6341773867607117,
+      "learning_rate": 7.854411764705883e-05,
+      "loss": 0.0299,
+      "step": 1660
+    },
+    {
+      "epoch": 0.1778107056862382,
+      "grad_norm": 1.4031453132629395,
+      "learning_rate": 7.825e-05,
+      "loss": 0.0358,
+      "step": 1680
+    },
+    {
+      "epoch": 0.17992749980155054,
+      "grad_norm": 1.0830997228622437,
+      "learning_rate": 7.795588235294118e-05,
+      "loss": 0.0373,
+      "step": 1700
+    },
+    {
+      "epoch": 0.18204429391686291,
+      "grad_norm": 0.6576260924339294,
+      "learning_rate": 7.766176470588236e-05,
+      "loss": 0.0587,
+      "step": 1720
+    },
+    {
+      "epoch": 0.18416108803217526,
+      "grad_norm": 1.2640115022659302,
+      "learning_rate": 7.736764705882353e-05,
+      "loss": 0.0468,
+      "step": 1740
+    },
+    {
+      "epoch": 0.18627788214748764,
+      "grad_norm": 1.0660518407821655,
+      "learning_rate": 7.707352941176471e-05,
+      "loss": 0.0466,
+      "step": 1760
+    },
+    {
+      "epoch": 0.1883946762628,
+      "grad_norm": 1.22067129611969,
+      "learning_rate": 7.677941176470589e-05,
+      "loss": 0.0335,
+      "step": 1780
+    },
+    {
+      "epoch": 0.19051147037811234,
+      "grad_norm": 4.800387859344482,
+      "learning_rate": 7.648529411764706e-05,
+      "loss": 0.0461,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1926282644934247,
+      "grad_norm": 1.1434308290481567,
+      "learning_rate": 7.619117647058824e-05,
+      "loss": 0.0326,
+      "step": 1820
+    },
+    {
+      "epoch": 0.19474505860873706,
+      "grad_norm": 0.8925223350524902,
+      "learning_rate": 7.589705882352942e-05,
+      "loss": 0.0273,
+      "step": 1840
+    },
+    {
+      "epoch": 0.19686185272404944,
+      "grad_norm": 1.1678693294525146,
+      "learning_rate": 7.560294117647059e-05,
+      "loss": 0.0345,
+      "step": 1860
+    },
+    {
+      "epoch": 0.19897864683936178,
+      "grad_norm": 0.559644341468811,
+      "learning_rate": 7.530882352941177e-05,
+      "loss": 0.0394,
+      "step": 1880
+    },
+    {
+      "epoch": 0.20109544095467416,
+      "grad_norm": 1.4313390254974365,
+      "learning_rate": 7.501470588235295e-05,
+      "loss": 0.0475,
+      "step": 1900
+    },
+    {
+      "epoch": 0.2032122350699865,
+      "grad_norm": 1.2470778226852417,
+      "learning_rate": 7.472058823529412e-05,
+      "loss": 0.0317,
+      "step": 1920
+    },
+    {
+      "epoch": 0.20532902918529886,
+      "grad_norm": 1.390359878540039,
+      "learning_rate": 7.44264705882353e-05,
+      "loss": 0.0268,
+      "step": 1940
+    },
+    {
+      "epoch": 0.20744582330061123,
+      "grad_norm": 0.6755140423774719,
+      "learning_rate": 7.413235294117648e-05,
+      "loss": 0.0331,
+      "step": 1960
+    },
+    {
+      "epoch": 0.20956261741592358,
+      "grad_norm": 0.31457772850990295,
+      "learning_rate": 7.383823529411765e-05,
+      "loss": 0.0447,
+      "step": 1980
+    },
+    {
+      "epoch": 0.21167941153123596,
+      "grad_norm": 1.6619377136230469,
+      "learning_rate": 7.354411764705883e-05,
+      "loss": 0.0336,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2137962056465483,
+      "grad_norm": 1.033492088317871,
+      "learning_rate": 7.325e-05,
+      "loss": 0.0304,
+      "step": 2020
+    },
+    {
+      "epoch": 0.21591299976186065,
+      "grad_norm": 0.730675220489502,
+      "learning_rate": 7.295588235294118e-05,
+      "loss": 0.0311,
+      "step": 2040
+    },
+    {
+      "epoch": 0.21802979387717303,
+      "grad_norm": 0.6322308778762817,
+      "learning_rate": 7.266176470588236e-05,
+      "loss": 0.0258,
+      "step": 2060
+    },
+    {
+      "epoch": 0.22014658799248538,
+      "grad_norm": 0.7560809254646301,
+      "learning_rate": 7.236764705882353e-05,
+      "loss": 0.0213,
+      "step": 2080
+    },
+    {
+      "epoch": 0.22226338210779775,
+      "grad_norm": 1.1907991170883179,
+      "learning_rate": 7.207352941176471e-05,
+      "loss": 0.0311,
+      "step": 2100
+    },
+    {
+      "epoch": 0.2243801762231101,
+      "grad_norm": 0.6392427086830139,
+      "learning_rate": 7.177941176470589e-05,
+      "loss": 0.0302,
+      "step": 2120
+    },
+    {
+      "epoch": 0.22649697033842245,
+      "grad_norm": 1.0621793270111084,
+      "learning_rate": 7.148529411764706e-05,
+      "loss": 0.0257,
+      "step": 2140
+    },
+    {
+      "epoch": 0.22861376445373482,
+      "grad_norm": 0.8459914326667786,
+      "learning_rate": 7.119117647058824e-05,
+      "loss": 0.0249,
+      "step": 2160
+    },
+    {
+      "epoch": 0.23073055856904717,
+      "grad_norm": 1.5384963750839233,
+      "learning_rate": 7.089705882352942e-05,
+      "loss": 0.0221,
+      "step": 2180
+    },
+    {
+      "epoch": 0.23284735268435955,
+      "grad_norm": 0.920907199382782,
+      "learning_rate": 7.06029411764706e-05,
+      "loss": 0.0307,
+      "step": 2200
+    },
+    {
+      "epoch": 0.2349641467996719,
+      "grad_norm": 1.1640409231185913,
+      "learning_rate": 7.030882352941177e-05,
+      "loss": 0.0302,
+      "step": 2220
+    },
+    {
+      "epoch": 0.23708094091498425,
+      "grad_norm": 0.7336745858192444,
+      "learning_rate": 7.001470588235295e-05,
+      "loss": 0.0286,
+      "step": 2240
+    },
+    {
+      "epoch": 0.23919773503029662,
+      "grad_norm": 1.9110276699066162,
+      "learning_rate": 6.972058823529412e-05,
+      "loss": 0.0303,
+      "step": 2260
+    },
+    {
+      "epoch": 0.24131452914560897,
+      "grad_norm": 0.9055470824241638,
+      "learning_rate": 6.94264705882353e-05,
+      "loss": 0.0241,
+      "step": 2280
+    },
+    {
+      "epoch": 0.24343132326092135,
+      "grad_norm": 1.063379168510437,
+      "learning_rate": 6.913235294117648e-05,
+      "loss": 0.0244,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2455481173762337,
+      "grad_norm": 1.0067662000656128,
+      "learning_rate": 6.883823529411765e-05,
+      "loss": 0.026,
+      "step": 2320
+    },
+    {
+      "epoch": 0.24766491149154604,
+      "grad_norm": 1.1639182567596436,
+      "learning_rate": 6.854411764705883e-05,
+      "loss": 0.0253,
+      "step": 2340
+    },
+    {
+      "epoch": 0.24978170560685842,
+      "grad_norm": 0.9918274879455566,
+      "learning_rate": 6.825e-05,
+      "loss": 0.0218,
+      "step": 2360
+    },
+    {
+      "epoch": 0.25189849972217077,
+      "grad_norm": 0.7681129574775696,
+      "learning_rate": 6.795588235294118e-05,
+      "loss": 0.0212,
+      "step": 2380
+    },
+    {
+      "epoch": 0.2540152938374831,
+      "grad_norm": 0.7643230557441711,
+      "learning_rate": 6.766176470588236e-05,
+      "loss": 0.021,
+      "step": 2400
+    },
+    {
+      "epoch": 0.2561320879527955,
+      "grad_norm": 1.2285891771316528,
+      "learning_rate": 6.736764705882354e-05,
+      "loss": 0.0194,
+      "step": 2420
+    },
+    {
+      "epoch": 0.25824888206810787,
+      "grad_norm": 0.5345446467399597,
+      "learning_rate": 6.707352941176471e-05,
+      "loss": 0.0211,
+      "step": 2440
+    },
+    {
+      "epoch": 0.2603656761834202,
+      "grad_norm": 0.7964244484901428,
+      "learning_rate": 6.677941176470589e-05,
+      "loss": 0.024,
+      "step": 2460
+    },
+    {
+      "epoch": 0.26248247029873256,
+      "grad_norm": 0.5538131594657898,
+      "learning_rate": 6.648529411764705e-05,
+      "loss": 0.0258,
+      "step": 2480
+    },
+    {
+      "epoch": 0.2645992644140449,
+      "grad_norm": 0.9520718455314636,
+      "learning_rate": 6.619117647058823e-05,
+      "loss": 0.0178,
+      "step": 2500
+    },
+    {
+      "epoch": 0.2667160585293573,
+      "grad_norm": 0.6036665439605713,
+      "learning_rate": 6.589705882352942e-05,
+      "loss": 0.0193,
+      "step": 2520
+    },
+    {
+      "epoch": 0.26883285264466966,
+      "grad_norm": 0.37941470742225647,
+      "learning_rate": 6.56029411764706e-05,
+      "loss": 0.0184,
+      "step": 2540
+    },
+    {
+      "epoch": 0.270949646759982,
+      "grad_norm": 0.3956536650657654,
+      "learning_rate": 6.530882352941177e-05,
+      "loss": 0.0239,
+      "step": 2560
+    },
+    {
+      "epoch": 0.27306644087529436,
+      "grad_norm": 0.4313443899154663,
+      "learning_rate": 6.501470588235295e-05,
+      "loss": 0.0185,
+      "step": 2580
+    },
+    {
+      "epoch": 0.2751832349906067,
+      "grad_norm": 1.083382248878479,
+      "learning_rate": 6.472058823529412e-05,
+      "loss": 0.026,
+      "step": 2600
+    },
+    {
+      "epoch": 0.2773000291059191,
+      "grad_norm": 0.8067460060119629,
+      "learning_rate": 6.44264705882353e-05,
+      "loss": 0.0223,
+      "step": 2620
+    },
+    {
+      "epoch": 0.27941682322123146,
+      "grad_norm": 1.2681511640548706,
+      "learning_rate": 6.413235294117648e-05,
+      "loss": 0.0232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.2815336173365438,
+      "grad_norm": 0.5592957139015198,
+      "learning_rate": 6.383823529411765e-05,
+      "loss": 0.0184,
+      "step": 2660
+    },
+    {
+      "epoch": 0.28365041145185615,
+      "grad_norm": 0.5282326936721802,
+      "learning_rate": 6.354411764705883e-05,
+      "loss": 0.0195,
+      "step": 2680
+    },
+    {
+      "epoch": 0.2857672055671685,
+      "grad_norm": 0.5503069758415222,
+      "learning_rate": 6.324999999999999e-05,
+      "loss": 0.0182,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2878839996824809,
+      "grad_norm": 0.9767094254493713,
+      "learning_rate": 6.295588235294117e-05,
+      "loss": 0.0174,
+      "step": 2720
+    },
+    {
+      "epoch": 0.29000079379779325,
+      "grad_norm": 0.5078358054161072,
+      "learning_rate": 6.266176470588236e-05,
+      "loss": 0.0214,
+      "step": 2740
+    },
+    {
+      "epoch": 0.2921175879131056,
+      "grad_norm": 0.8082838654518127,
+      "learning_rate": 6.236764705882354e-05,
+      "loss": 0.0151,
+      "step": 2760
+    },
+    {
+      "epoch": 0.29423438202841795,
+      "grad_norm": 0.49735844135284424,
+      "learning_rate": 6.207352941176471e-05,
+      "loss": 0.0235,
+      "step": 2780
+    },
+    {
+      "epoch": 0.2963511761437303,
+      "grad_norm": 1.0940418243408203,
+      "learning_rate": 6.177941176470589e-05,
+      "loss": 0.016,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2984679702590427,
+      "grad_norm": 0.9790317416191101,
+      "learning_rate": 6.148529411764706e-05,
+      "loss": 0.0204,
+      "step": 2820
+    },
+    {
+      "epoch": 0.30058476437435505,
+      "grad_norm": 0.9905364513397217,
+      "learning_rate": 6.119117647058824e-05,
+      "loss": 0.0189,
+      "step": 2840
+    },
+    {
+      "epoch": 0.3027015584896674,
+      "grad_norm": 0.5084486603736877,
+      "learning_rate": 6.089705882352942e-05,
+      "loss": 0.0216,
+      "step": 2860
+    },
+    {
+      "epoch": 0.30481835260497975,
+      "grad_norm": 0.6312965750694275,
+      "learning_rate": 6.0602941176470594e-05,
+      "loss": 0.0197,
+      "step": 2880
+    },
+    {
+      "epoch": 0.3069351467202921,
+      "grad_norm": 1.0345927476882935,
+      "learning_rate": 6.0308823529411764e-05,
+      "loss": 0.0154,
+      "step": 2900
+    },
+    {
+      "epoch": 0.3090519408356045,
+      "grad_norm": 1.1944761276245117,
+      "learning_rate": 6.001470588235294e-05,
+      "loss": 0.017,
+      "step": 2920
+    },
+    {
+      "epoch": 0.31116873495091685,
+      "grad_norm": 0.6866488456726074,
+      "learning_rate": 5.972058823529412e-05,
+      "loss": 0.0158,
+      "step": 2940
+    },
+    {
+      "epoch": 0.3132855290662292,
+      "grad_norm": 1.0443695783615112,
+      "learning_rate": 5.9426470588235294e-05,
+      "loss": 0.0193,
+      "step": 2960
+    },
+    {
+      "epoch": 0.31540232318154154,
+      "grad_norm": 0.6489245891571045,
+      "learning_rate": 5.913235294117647e-05,
+      "loss": 0.016,
+      "step": 2980
+    },
+    {
+      "epoch": 0.3175191172968539,
+      "grad_norm": 1.388348937034607,
+      "learning_rate": 5.883823529411765e-05,
+      "loss": 0.0284,
+      "step": 3000
+    },
+    {
+      "epoch": 0.3196359114121663,
+      "grad_norm": 0.4919748306274414,
+      "learning_rate": 5.854411764705883e-05,
+      "loss": 0.0205,
+      "step": 3020
+    },
+    {
+      "epoch": 0.32175270552747864,
+      "grad_norm": 0.65608811378479,
+      "learning_rate": 5.8250000000000006e-05,
+      "loss": 0.0159,
+      "step": 3040
+    },
+    {
+      "epoch": 0.323869499642791,
+      "grad_norm": 0.4175134599208832,
+      "learning_rate": 5.795588235294118e-05,
+      "loss": 0.0159,
+      "step": 3060
+    },
+    {
+      "epoch": 0.32598629375810334,
+      "grad_norm": 0.6232139468193054,
+      "learning_rate": 5.766176470588236e-05,
+      "loss": 0.0177,
+      "step": 3080
+    },
+    {
+      "epoch": 0.3281030878734157,
+      "grad_norm": 0.4555909037590027,
+      "learning_rate": 5.7367647058823536e-05,
+      "loss": 0.0138,
+      "step": 3100
+    },
+    {
+      "epoch": 0.3302198819887281,
+      "grad_norm": 0.538420557975769,
+      "learning_rate": 5.7073529411764706e-05,
+      "loss": 0.0158,
+      "step": 3120
+    },
+    {
+      "epoch": 0.33233667610404044,
+      "grad_norm": 0.5802947878837585,
+      "learning_rate": 5.677941176470588e-05,
+      "loss": 0.0155,
+      "step": 3140
+    },
+    {
+      "epoch": 0.3344534702193528,
+      "grad_norm": 0.588239848613739,
+      "learning_rate": 5.648529411764706e-05,
+      "loss": 0.0187,
+      "step": 3160
+    },
+    {
+      "epoch": 0.33657026433466514,
+      "grad_norm": 0.5712038278579712,
+      "learning_rate": 5.6191176470588235e-05,
+      "loss": 0.013,
+      "step": 3180
+    },
+    {
+      "epoch": 0.3386870584499775,
+      "grad_norm": 0.4135841727256775,
+      "learning_rate": 5.589705882352941e-05,
+      "loss": 0.0171,
+      "step": 3200
+    },
+    {
+      "epoch": 0.3408038525652899,
+      "grad_norm": 0.7402490377426147,
+      "learning_rate": 5.560294117647059e-05,
+      "loss": 0.015,
+      "step": 3220
+    },
+    {
+      "epoch": 0.34292064668060224,
+      "grad_norm": 0.5647472143173218,
+      "learning_rate": 5.530882352941177e-05,
+      "loss": 0.0132,
+      "step": 3240
+    },
+    {
+      "epoch": 0.3450374407959146,
+      "grad_norm": 0.7440519332885742,
+      "learning_rate": 5.501470588235295e-05,
+      "loss": 0.0154,
+      "step": 3260
+    },
+    {
+      "epoch": 0.34715423491122693,
+      "grad_norm": 0.40782037377357483,
+      "learning_rate": 5.4720588235294124e-05,
+      "loss": 0.0168,
+      "step": 3280
+    },
+    {
+      "epoch": 0.3492710290265393,
+      "grad_norm": 0.3933939039707184,
+      "learning_rate": 5.44264705882353e-05,
+      "loss": 0.0161,
+      "step": 3300
+    },
+    {
+      "epoch": 0.3513878231418517,
+      "grad_norm": 0.29135826230049133,
+      "learning_rate": 5.413235294117648e-05,
+      "loss": 0.0189,
+      "step": 3320
+    },
+    {
+      "epoch": 0.35350461725716403,
+      "grad_norm": 0.581210196018219,
+      "learning_rate": 5.383823529411765e-05,
+      "loss": 0.0157,
+      "step": 3340
+    },
+    {
+      "epoch": 0.3556214113724764,
+      "grad_norm": 0.4485796391963959,
+      "learning_rate": 5.3544117647058824e-05,
+      "loss": 0.0142,
+      "step": 3360
+    },
+    {
+      "epoch": 0.35773820548778873,
+      "grad_norm": 0.4352544844150543,
+      "learning_rate": 5.325e-05,
+      "loss": 0.0153,
+      "step": 3380
+    },
+    {
+      "epoch": 0.3598549996031011,
+      "grad_norm": 1.0922011137008667,
+      "learning_rate": 5.2955882352941177e-05,
+      "loss": 0.0167,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3619717937184135,
+      "grad_norm": 0.2693778872489929,
+      "learning_rate": 5.266176470588235e-05,
+      "loss": 0.0137,
+      "step": 3420
+    },
+    {
+      "epoch": 0.36408858783372583,
+      "grad_norm": 1.5889476537704468,
+      "learning_rate": 5.236764705882353e-05,
+      "loss": 0.0127,
+      "step": 3440
+    },
+    {
+      "epoch": 0.3662053819490382,
+      "grad_norm": 2.3836777210235596,
+      "learning_rate": 5.207352941176471e-05,
+      "loss": 0.0196,
+      "step": 3460
+    },
+    {
+      "epoch": 0.3683221760643505,
+      "grad_norm": 0.6966289281845093,
+      "learning_rate": 5.177941176470589e-05,
+      "loss": 0.0138,
+      "step": 3480
+    },
+    {
+      "epoch": 0.3704389701796629,
+      "grad_norm": 0.7514053583145142,
+      "learning_rate": 5.1485294117647066e-05,
+      "loss": 0.0143,
+      "step": 3500
+    },
+    {
+      "epoch": 0.3725557642949753,
+      "grad_norm": 0.461103618144989,
+      "learning_rate": 5.119117647058824e-05,
+      "loss": 0.0146,
+      "step": 3520
+    },
+    {
+      "epoch": 0.3746725584102876,
+      "grad_norm": 0.7384988069534302,
+      "learning_rate": 5.089705882352941e-05,
+      "loss": 0.0167,
+      "step": 3540
+    },
+    {
+      "epoch": 0.3767893525256,
+      "grad_norm": 0.7363691329956055,
+      "learning_rate": 5.060294117647059e-05,
+      "loss": 0.0148,
+      "step": 3560
+    },
+    {
+      "epoch": 0.3789061466409123,
+      "grad_norm": 0.4628554582595825,
+      "learning_rate": 5.0308823529411765e-05,
+      "loss": 0.0138,
+      "step": 3580
+    },
+    {
+      "epoch": 0.38102294075622467,
+      "grad_norm": 0.48070573806762695,
+      "learning_rate": 5.001470588235294e-05,
+      "loss": 0.0148,
+      "step": 3600
+    },
+    {
+      "epoch": 0.3831397348715371,
+      "grad_norm": 0.913800835609436,
+      "learning_rate": 4.972058823529412e-05,
+      "loss": 0.0109,
+      "step": 3620
+    },
+    {
+      "epoch": 0.3852565289868494,
+      "grad_norm": 0.5302271842956543,
+      "learning_rate": 4.9426470588235295e-05,
+      "loss": 0.0129,
+      "step": 3640
+    },
+    {
+      "epoch": 0.38737332310216177,
+      "grad_norm": 0.5563445687294006,
+      "learning_rate": 4.913235294117647e-05,
+      "loss": 0.0155,
+      "step": 3660
+    },
+    {
+      "epoch": 0.3894901172174741,
+      "grad_norm": 0.7449616193771362,
+      "learning_rate": 4.8838235294117654e-05,
+      "loss": 0.0139,
+      "step": 3680
+    },
+    {
+      "epoch": 0.3916069113327865,
+      "grad_norm": 0.45803868770599365,
+      "learning_rate": 4.8544117647058824e-05,
+      "loss": 0.0134,
+      "step": 3700
+    },
+    {
+      "epoch": 0.39372370544809887,
+      "grad_norm": 0.4495037794113159,
+      "learning_rate": 4.825e-05,
+      "loss": 0.015,
+      "step": 3720
+    },
+    {
+      "epoch": 0.3958404995634112,
+      "grad_norm": 0.6490349769592285,
+      "learning_rate": 4.795588235294118e-05,
+      "loss": 0.0143,
+      "step": 3740
+    },
+    {
+      "epoch": 0.39795729367872357,
+      "grad_norm": 0.3576687276363373,
+      "learning_rate": 4.7661764705882354e-05,
+      "loss": 0.0118,
+      "step": 3760
+    },
+    {
+      "epoch": 0.4000740877940359,
+      "grad_norm": 0.5015860199928284,
+      "learning_rate": 4.736764705882353e-05,
+      "loss": 0.0169,
+      "step": 3780
+    },
+    {
+      "epoch": 0.4021908819093483,
+      "grad_norm": 1.0271028280258179,
+      "learning_rate": 4.707352941176471e-05,
+      "loss": 0.0119,
+      "step": 3800
+    },
+    {
+      "epoch": 0.40430767602466067,
+      "grad_norm": 0.4724489748477936,
+      "learning_rate": 4.677941176470588e-05,
+      "loss": 0.0112,
+      "step": 3820
+    },
+    {
+      "epoch": 0.406424470139973,
+      "grad_norm": 0.5578377842903137,
+      "learning_rate": 4.648529411764706e-05,
+      "loss": 0.013,
+      "step": 3840
+    },
+    {
+      "epoch": 0.40854126425528536,
+      "grad_norm": 0.6067779660224915,
+      "learning_rate": 4.6191176470588236e-05,
+      "loss": 0.0149,
+      "step": 3860
+    },
+    {
+      "epoch": 0.4106580583705977,
+      "grad_norm": 0.8015718460083008,
+      "learning_rate": 4.589705882352941e-05,
+      "loss": 0.0124,
+      "step": 3880
+    },
+    {
+      "epoch": 0.4127748524859101,
+      "grad_norm": 0.6352400183677673,
+      "learning_rate": 4.5602941176470596e-05,
+      "loss": 0.013,
+      "step": 3900
+    },
+    {
+      "epoch": 0.41489164660122246,
+      "grad_norm": 0.3545617163181305,
+      "learning_rate": 4.5308823529411765e-05,
+      "loss": 0.0117,
+      "step": 3920
+    },
+    {
+      "epoch": 0.4170084407165348,
+      "grad_norm": 0.4562068283557892,
+      "learning_rate": 4.501470588235294e-05,
+      "loss": 0.0118,
+      "step": 3940
+    },
+    {
+      "epoch": 0.41912523483184716,
+      "grad_norm": 0.8685987591743469,
+      "learning_rate": 4.472058823529412e-05,
+      "loss": 0.0126,
+      "step": 3960
+    },
+    {
+      "epoch": 0.4212420289471595,
+      "grad_norm": 0.49269378185272217,
+      "learning_rate": 4.4426470588235295e-05,
+      "loss": 0.0107,
+      "step": 3980
+    },
+    {
+      "epoch": 0.4233588230624719,
+      "grad_norm": 0.7156255841255188,
+      "learning_rate": 4.413235294117647e-05,
+      "loss": 0.0107,
+      "step": 4000
+    },
+    {
+      "epoch": 0.42547561717778426,
+      "grad_norm": 0.6339916586875916,
+      "learning_rate": 4.383823529411765e-05,
+      "loss": 0.0149,
+      "step": 4020
+    },
+    {
+      "epoch": 0.4275924112930966,
+      "grad_norm": 0.6008257269859314,
+      "learning_rate": 4.3544117647058824e-05,
+      "loss": 0.0121,
+      "step": 4040
+    },
+    {
+      "epoch": 0.42970920540840896,
+      "grad_norm": 0.34715619683265686,
+      "learning_rate": 4.325e-05,
+      "loss": 0.0115,
+      "step": 4060
+    },
+    {
+      "epoch": 0.4318259995237213,
+      "grad_norm": 0.6943634152412415,
+      "learning_rate": 4.295588235294118e-05,
+      "loss": 0.0115,
+      "step": 4080
+    },
+    {
+      "epoch": 0.4339427936390337,
+      "grad_norm": 0.5919560194015503,
+      "learning_rate": 4.2661764705882354e-05,
+      "loss": 0.0094,
+      "step": 4100
+    },
+    {
+      "epoch": 0.43605958775434606,
+      "grad_norm": 0.23244401812553406,
+      "learning_rate": 4.236764705882354e-05,
+      "loss": 0.0112,
+      "step": 4120
+    },
+    {
+      "epoch": 0.4381763818696584,
+      "grad_norm": 0.35059890151023865,
+      "learning_rate": 4.207352941176471e-05,
+      "loss": 0.0133,
+      "step": 4140
+    },
+    {
+      "epoch": 0.44029317598497075,
+      "grad_norm": 0.32678091526031494,
+      "learning_rate": 4.1779411764705883e-05,
+      "loss": 0.0113,
+      "step": 4160
+    },
+    {
+      "epoch": 0.4424099701002831,
+      "grad_norm": 0.6617632508277893,
+      "learning_rate": 4.148529411764706e-05,
+      "loss": 0.0105,
+      "step": 4180
+    },
+    {
+      "epoch": 0.4445267642155955,
+      "grad_norm": 0.27029886841773987,
+      "learning_rate": 4.1191176470588236e-05,
+      "loss": 0.0115,
+      "step": 4200
+    },
+    {
+      "epoch": 0.44664355833090785,
+      "grad_norm": 0.7106760144233704,
+      "learning_rate": 4.089705882352941e-05,
+      "loss": 0.0124,
+      "step": 4220
+    },
+    {
+      "epoch": 0.4487603524462202,
+      "grad_norm": 0.5163691639900208,
+      "learning_rate": 4.060294117647059e-05,
+      "loss": 0.0111,
+      "step": 4240
+    },
+    {
+      "epoch": 0.45087714656153255,
+      "grad_norm": 0.7228760123252869,
+      "learning_rate": 4.0308823529411766e-05,
+      "loss": 0.0113,
+      "step": 4260
+    },
+    {
+      "epoch": 0.4529939406768449,
+      "grad_norm": 0.5797919631004333,
+      "learning_rate": 4.001470588235294e-05,
+      "loss": 0.0118,
+      "step": 4280
+    },
+    {
+      "epoch": 0.4551107347921573,
+      "grad_norm": 1.233983039855957,
+      "learning_rate": 3.972058823529412e-05,
+      "loss": 0.0087,
+      "step": 4300
+    },
+    {
+      "epoch": 0.45722752890746965,
+      "grad_norm": 0.657342791557312,
+      "learning_rate": 3.9426470588235295e-05,
+      "loss": 0.0092,
+      "step": 4320
+    },
+    {
+      "epoch": 0.459344323022782,
+      "grad_norm": 0.4171401262283325,
+      "learning_rate": 3.913235294117647e-05,
+      "loss": 0.0161,
+      "step": 4340
+    },
+    {
+      "epoch": 0.46146111713809435,
+      "grad_norm": 0.34782201051712036,
+      "learning_rate": 3.883823529411765e-05,
+      "loss": 0.0103,
+      "step": 4360
+    },
+    {
+      "epoch": 0.4635779112534067,
+      "grad_norm": 0.5111158490180969,
+      "learning_rate": 3.8544117647058825e-05,
+      "loss": 0.0097,
+      "step": 4380
+    },
+    {
+      "epoch": 0.4656947053687191,
+      "grad_norm": 0.5910077095031738,
+      "learning_rate": 3.825e-05,
+      "loss": 0.0176,
+      "step": 4400
+    },
+    {
+      "epoch": 0.46781149948403145,
+      "grad_norm": 0.6808711290359497,
+      "learning_rate": 3.795588235294118e-05,
+      "loss": 0.009,
+      "step": 4420
+    },
+    {
+      "epoch": 0.4699282935993438,
+      "grad_norm": 0.4499869644641876,
+      "learning_rate": 3.7661764705882354e-05,
+      "loss": 0.0106,
+      "step": 4440
+    },
+    {
+      "epoch": 0.47204508771465614,
+      "grad_norm": 0.4361923336982727,
+      "learning_rate": 3.736764705882353e-05,
+      "loss": 0.0097,
+      "step": 4460
+    },
+    {
+      "epoch": 0.4741618818299685,
+      "grad_norm": 0.3171451985836029,
+      "learning_rate": 3.707352941176471e-05,
+      "loss": 0.0092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.4762786759452809,
+      "grad_norm": 0.28628259897232056,
+      "learning_rate": 3.6779411764705884e-05,
+      "loss": 0.0081,
+      "step": 4500
+    },
+    {
+      "epoch": 0.47839547006059324,
+      "grad_norm": 0.5043999552726746,
+      "learning_rate": 3.648529411764706e-05,
+      "loss": 0.0102,
+      "step": 4520
+    },
+    {
+      "epoch": 0.4805122641759056,
+      "grad_norm": 0.3881862163543701,
+      "learning_rate": 3.619117647058824e-05,
+      "loss": 0.0109,
+      "step": 4540
+    },
+    {
+      "epoch": 0.48262905829121794,
+      "grad_norm": 0.6093239188194275,
+      "learning_rate": 3.589705882352941e-05,
+      "loss": 0.0089,
+      "step": 4560
+    },
+    {
+      "epoch": 0.4847458524065303,
+      "grad_norm": 0.4642229378223419,
+      "learning_rate": 3.560294117647059e-05,
+      "loss": 0.0092,
+      "step": 4580
+    },
+    {
+      "epoch": 0.4868626465218427,
+      "grad_norm": 0.4857279062271118,
+      "learning_rate": 3.5308823529411766e-05,
+      "loss": 0.0081,
+      "step": 4600
+    },
+    {
+      "epoch": 0.48897944063715504,
+      "grad_norm": 0.40589526295661926,
+      "learning_rate": 3.501470588235294e-05,
+      "loss": 0.0098,
+      "step": 4620
+    },
+    {
+      "epoch": 0.4910962347524674,
+      "grad_norm": 0.2723426818847656,
+      "learning_rate": 3.472058823529412e-05,
+      "loss": 0.0133,
+      "step": 4640
+    },
+    {
+      "epoch": 0.49321302886777973,
+      "grad_norm": 0.7545261383056641,
+      "learning_rate": 3.4426470588235296e-05,
+      "loss": 0.0103,
+      "step": 4660
+    },
+    {
+      "epoch": 0.4953298229830921,
+      "grad_norm": 1.5047451257705688,
+      "learning_rate": 3.413235294117647e-05,
+      "loss": 0.0103,
+      "step": 4680
+    },
+    {
+      "epoch": 0.4974466170984045,
+      "grad_norm": 0.46020635962486267,
+      "learning_rate": 3.383823529411765e-05,
+      "loss": 0.0092,
+      "step": 4700
+    },
+    {
+      "epoch": 0.49956341121371683,
+      "grad_norm": 0.42124831676483154,
+      "learning_rate": 3.3544117647058825e-05,
+      "loss": 0.0112,
+      "step": 4720
+    },
+    {
+      "epoch": 0.5016802053290292,
+      "grad_norm": 0.18676140904426575,
+      "learning_rate": 3.325e-05,
+      "loss": 0.0096,
+      "step": 4740
+    },
+    {
+      "epoch": 0.5037969994443415,
+      "grad_norm": 0.41889238357543945,
+      "learning_rate": 3.295588235294118e-05,
+      "loss": 0.0112,
+      "step": 4760
+    },
+    {
+      "epoch": 0.5059137935596539,
+      "grad_norm": 0.5965830087661743,
+      "learning_rate": 3.2661764705882355e-05,
+      "loss": 0.0082,
+      "step": 4780
+    },
+    {
+      "epoch": 0.5080305876749662,
+      "grad_norm": 0.5901793837547302,
+      "learning_rate": 3.236764705882353e-05,
+      "loss": 0.0092,
+      "step": 4800
+    },
+    {
+      "epoch": 0.5101473817902786,
+      "grad_norm": 0.453032910823822,
+      "learning_rate": 3.207352941176471e-05,
+      "loss": 0.0104,
+      "step": 4820
+    },
+    {
+      "epoch": 0.512264175905591,
+      "grad_norm": 0.3099919557571411,
+      "learning_rate": 3.1779411764705884e-05,
+      "loss": 0.0097,
+      "step": 4840
+    },
+    {
+      "epoch": 0.5143809700209033,
+      "grad_norm": 0.28637203574180603,
+      "learning_rate": 3.148529411764706e-05,
+      "loss": 0.0074,
+      "step": 4860
+    },
+    {
+      "epoch": 0.5164977641362157,
+      "grad_norm": 0.45871102809906006,
+      "learning_rate": 3.119117647058824e-05,
+      "loss": 0.0093,
+      "step": 4880
+    },
+    {
+      "epoch": 0.518614558251528,
+      "grad_norm": 0.5844906568527222,
+      "learning_rate": 3.0897058823529414e-05,
+      "loss": 0.0097,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5207313523668404,
+      "grad_norm": 0.7102438807487488,
+      "learning_rate": 3.060294117647059e-05,
+      "loss": 0.0083,
+      "step": 4920
+    },
+    {
+      "epoch": 0.5228481464821528,
+      "grad_norm": 0.483784943819046,
+      "learning_rate": 3.0308823529411767e-05,
+      "loss": 0.0091,
+      "step": 4940
+    },
+    {
+      "epoch": 0.5249649405974651,
+      "grad_norm": 0.4747030436992645,
+      "learning_rate": 3.0014705882352943e-05,
+      "loss": 0.0091,
+      "step": 4960
+    },
+    {
+      "epoch": 0.5270817347127775,
+      "grad_norm": 0.3532012403011322,
+      "learning_rate": 2.9720588235294116e-05,
+      "loss": 0.0082,
+      "step": 4980
+    },
+    {
+      "epoch": 0.5291985288280898,
+      "grad_norm": 0.42889463901519775,
+      "learning_rate": 2.9426470588235293e-05,
+      "loss": 0.0091,
+      "step": 5000
+    },
+    {
+      "epoch": 0.5313153229434022,
+      "grad_norm": 0.4388155937194824,
+      "learning_rate": 2.9132352941176473e-05,
+      "loss": 0.0088,
+      "step": 5020
+    },
+    {
+      "epoch": 0.5334321170587146,
+      "grad_norm": 0.49440255761146545,
+      "learning_rate": 2.883823529411765e-05,
+      "loss": 0.0091,
+      "step": 5040
+    },
+    {
+      "epoch": 0.5355489111740269,
+      "grad_norm": 0.3930880129337311,
+      "learning_rate": 2.8544117647058826e-05,
+      "loss": 0.0114,
+      "step": 5060
+    },
+    {
+      "epoch": 0.5376657052893393,
+      "grad_norm": 0.380283921957016,
+      "learning_rate": 2.825e-05,
+      "loss": 0.0105,
+      "step": 5080
+    },
+    {
+      "epoch": 0.5397824994046516,
+      "grad_norm": 0.3737698793411255,
+      "learning_rate": 2.7955882352941175e-05,
+      "loss": 0.0132,
+      "step": 5100
+    },
+    {
+      "epoch": 0.541899293519964,
+      "grad_norm": 0.5393537282943726,
+      "learning_rate": 2.7661764705882355e-05,
+      "loss": 0.0118,
+      "step": 5120
+    },
+    {
+      "epoch": 0.5440160876352764,
+      "grad_norm": 0.3449922502040863,
+      "learning_rate": 2.7367647058823532e-05,
+      "loss": 0.0077,
+      "step": 5140
+    },
+    {
+      "epoch": 0.5461328817505887,
+      "grad_norm": 0.6629793643951416,
+      "learning_rate": 2.7073529411764708e-05,
+      "loss": 0.0084,
+      "step": 5160
+    },
+    {
+      "epoch": 0.5482496758659011,
+      "grad_norm": 0.7243732810020447,
+      "learning_rate": 2.6779411764705885e-05,
+      "loss": 0.0073,
+      "step": 5180
+    },
+    {
+      "epoch": 0.5503664699812134,
+      "grad_norm": 0.6006022691726685,
+      "learning_rate": 2.6485294117647058e-05,
+      "loss": 0.0084,
+      "step": 5200
+    },
+    {
+      "epoch": 0.5524832640965258,
+      "grad_norm": 0.5986945629119873,
+      "learning_rate": 2.6191176470588234e-05,
+      "loss": 0.0087,
+      "step": 5220
+    },
+    {
+      "epoch": 0.5546000582118382,
+      "grad_norm": 0.267560750246048,
+      "learning_rate": 2.5897058823529414e-05,
+      "loss": 0.0092,
+      "step": 5240
+    },
+    {
+      "epoch": 0.5567168523271505,
+      "grad_norm": 0.47937673330307007,
+      "learning_rate": 2.560294117647059e-05,
+      "loss": 0.0089,
+      "step": 5260
+    },
+    {
+      "epoch": 0.5588336464424629,
+      "grad_norm": 0.4451775550842285,
+      "learning_rate": 2.5308823529411767e-05,
+      "loss": 0.0082,
+      "step": 5280
+    },
+    {
+      "epoch": 0.5609504405577752,
+      "grad_norm": 0.7350065112113953,
+      "learning_rate": 2.501470588235294e-05,
+      "loss": 0.0087,
+      "step": 5300
+    },
+    {
+      "epoch": 0.5630672346730876,
+      "grad_norm": 0.43704766035079956,
+      "learning_rate": 2.4720588235294117e-05,
+      "loss": 0.0089,
+      "step": 5320
+    },
+    {
+      "epoch": 0.5651840287884,
+      "grad_norm": 0.29158827662467957,
+      "learning_rate": 2.4426470588235297e-05,
+      "loss": 0.0066,
+      "step": 5340
+    },
+    {
+      "epoch": 0.5673008229037123,
+      "grad_norm": 0.39838340878486633,
+      "learning_rate": 2.4132352941176473e-05,
+      "loss": 0.0081,
+      "step": 5360
+    },
+    {
+      "epoch": 0.5694176170190247,
+      "grad_norm": 0.4324835538864136,
+      "learning_rate": 2.3838235294117646e-05,
+      "loss": 0.008,
+      "step": 5380
+    },
+    {
+      "epoch": 0.571534411134337,
+      "grad_norm": 0.4358319938182831,
+      "learning_rate": 2.3544117647058826e-05,
+      "loss": 0.008,
+      "step": 5400
+    },
+    {
+      "epoch": 0.5736512052496494,
+      "grad_norm": 0.8966334462165833,
+      "learning_rate": 2.3250000000000003e-05,
+      "loss": 0.0078,
+      "step": 5420
+    },
+    {
+      "epoch": 0.5757679993649618,
+      "grad_norm": 0.9501079320907593,
+      "learning_rate": 2.2955882352941176e-05,
+      "loss": 0.0184,
+      "step": 5440
+    },
+    {
+      "epoch": 0.5778847934802741,
+      "grad_norm": 0.13483519852161407,
+      "learning_rate": 2.2661764705882356e-05,
+      "loss": 0.0154,
+      "step": 5460
+    },
+    {
+      "epoch": 0.5800015875955865,
+      "grad_norm": 0.4287421703338623,
+      "learning_rate": 2.236764705882353e-05,
+      "loss": 0.0084,
+      "step": 5480
+    },
+    {
+      "epoch": 0.5821183817108988,
+      "grad_norm": 0.1738578975200653,
+      "learning_rate": 2.2073529411764705e-05,
+      "loss": 0.0079,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5842351758262112,
+      "grad_norm": 0.6555954217910767,
+      "learning_rate": 2.1779411764705885e-05,
+      "loss": 0.0091,
+      "step": 5520
+    },
+    {
+      "epoch": 0.5863519699415236,
+      "grad_norm": 0.5294132232666016,
+      "learning_rate": 2.1485294117647058e-05,
+      "loss": 0.007,
+      "step": 5540
+    },
+    {
+      "epoch": 0.5884687640568359,
+      "grad_norm": 0.3388701379299164,
+      "learning_rate": 2.1191176470588238e-05,
+      "loss": 0.007,
+      "step": 5560
+    },
+    {
+      "epoch": 0.5905855581721483,
+      "grad_norm": 0.4279813766479492,
+      "learning_rate": 2.0897058823529415e-05,
+      "loss": 0.0077,
+      "step": 5580
+    },
+    {
+      "epoch": 0.5927023522874606,
+      "grad_norm": 0.4467952847480774,
+      "learning_rate": 2.0602941176470588e-05,
+      "loss": 0.0083,
+      "step": 5600
+    },
+    {
+      "epoch": 0.594819146402773,
+      "grad_norm": 0.36640599370002747,
+      "learning_rate": 2.0308823529411768e-05,
+      "loss": 0.0081,
+      "step": 5620
+    },
+    {
+      "epoch": 0.5969359405180854,
+      "grad_norm": 0.2323896735906601,
+      "learning_rate": 2.001470588235294e-05,
+      "loss": 0.0065,
+      "step": 5640
+    },
+    {
+      "epoch": 0.5990527346333977,
+      "grad_norm": 0.5579979419708252,
+      "learning_rate": 1.9720588235294117e-05,
+      "loss": 0.0084,
+      "step": 5660
+    },
+    {
+      "epoch": 0.6011695287487101,
+      "grad_norm": 0.34144604206085205,
+      "learning_rate": 1.9426470588235297e-05,
+      "loss": 0.0069,
+      "step": 5680
+    },
+    {
+      "epoch": 0.6032863228640224,
+      "grad_norm": 0.5170475244522095,
+      "learning_rate": 1.913235294117647e-05,
+      "loss": 0.0074,
+      "step": 5700
+    },
+    {
+      "epoch": 0.6054031169793348,
+      "grad_norm": 0.34131792187690735,
+      "learning_rate": 1.8838235294117647e-05,
+      "loss": 0.0069,
+      "step": 5720
+    },
+    {
+      "epoch": 0.6075199110946472,
+      "grad_norm": 0.5252654552459717,
+      "learning_rate": 1.8544117647058827e-05,
+      "loss": 0.0074,
+      "step": 5740
+    },
+    {
+      "epoch": 0.6096367052099595,
+      "grad_norm": 0.23735718429088593,
+      "learning_rate": 1.825e-05,
+      "loss": 0.0079,
+      "step": 5760
+    },
+    {
+      "epoch": 0.6117534993252719,
+      "grad_norm": 0.3985564410686493,
+      "learning_rate": 1.7955882352941176e-05,
+      "loss": 0.0076,
+      "step": 5780
+    },
+    {
+      "epoch": 0.6138702934405842,
+      "grad_norm": 0.53111732006073,
+      "learning_rate": 1.7661764705882353e-05,
+      "loss": 0.0075,
+      "step": 5800
+    },
+    {
+      "epoch": 0.6159870875558966,
+      "grad_norm": 0.37471240758895874,
+      "learning_rate": 1.736764705882353e-05,
+      "loss": 0.007,
+      "step": 5820
+    },
+    {
+      "epoch": 0.618103881671209,
+      "grad_norm": 0.2607717514038086,
+      "learning_rate": 1.707352941176471e-05,
+      "loss": 0.0078,
+      "step": 5840
+    },
+    {
+      "epoch": 0.6202206757865213,
+      "grad_norm": 0.4577248990535736,
+      "learning_rate": 1.6779411764705882e-05,
+      "loss": 0.0099,
+      "step": 5860
+    },
+    {
+      "epoch": 0.6223374699018337,
+      "grad_norm": 0.44592851400375366,
+      "learning_rate": 1.648529411764706e-05,
+      "loss": 0.007,
+      "step": 5880
+    },
+    {
+      "epoch": 0.624454264017146,
+      "grad_norm": 0.4649290442466736,
+      "learning_rate": 1.619117647058824e-05,
+      "loss": 0.0078,
+      "step": 5900
+    },
+    {
+      "epoch": 0.6265710581324584,
+      "grad_norm": 0.5193443298339844,
+      "learning_rate": 1.5897058823529412e-05,
+      "loss": 0.0086,
+      "step": 5920
+    },
+    {
+      "epoch": 0.6286878522477708,
+      "grad_norm": 0.5165125131607056,
+      "learning_rate": 1.5602941176470588e-05,
+      "loss": 0.0079,
+      "step": 5940
+    },
+    {
+      "epoch": 0.6308046463630831,
+      "grad_norm": 0.5387499928474426,
+      "learning_rate": 1.5308823529411765e-05,
+      "loss": 0.0072,
+      "step": 5960
+    },
+    {
+      "epoch": 0.6329214404783955,
+      "grad_norm": 0.3668934404850006,
+      "learning_rate": 1.5014705882352941e-05,
+      "loss": 0.0063,
+      "step": 5980
+    },
+    {
+      "epoch": 0.6350382345937078,
+      "grad_norm": 0.880902886390686,
+      "learning_rate": 1.472058823529412e-05,
+      "loss": 0.0071,
+      "step": 6000
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 7000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.634014872464362e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87c1838316ae8c7df2b7eb5f039022e01d00f71f28b5a09877cd72af98fb0743
+size 5969

internvl3_1b_lora_7000_20260304_104032/checkpoint-6000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd3abe9bd3d14d831d66720b3e41b5ab90b1f076cfa716714f1e0ea67b1d6a0f
+size 18138288

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/added_tokens.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "</box>": 151673,
+  "</img>": 151666,
+  "</quad>": 151669,
+  "</ref>": 151671,
+  "</tool_call>": 151658,
+  "<IMG_CONTEXT>": 151667,
+  "<box>": 151672,
+  "<img>": 151665,
+  "<quad>": 151668,
+  "<ref>": 151670,
+  "<tool_call>": 151657,
+  "<video>": 151674,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,6 @@

+{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+'}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<IMG_CONTEXT>
+' }}{% elif content['type'] == 'video' %}{{ '<video>
+' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
+'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
+' }}{% endif %}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "start_image_token": "<img>",
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cc80b7e20adf8bf6f6ca442bf1abfac8056bb3b7d3e0b11c9d497d3e79398c9
+size 11423732

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,306 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "context_image_token": "<IMG_CONTEXT>",
+    "end_image_token": "</img>",
+    "start_image_token": "<img>",
+    "video_token": "<video>"
+  },
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "return_token_type_ids": false,
+  "split_special_tokens": false,
+  "start_image_token": "<img>",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2316 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.6879580874765168,
+  "eval_steps": 500,
+  "global_step": 6500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00010583970576561797,
+      "grad_norm": 2.33493709564209,
+      "learning_rate": 0.0,
+      "loss": 1.2764,
+      "step": 1
+    },
+    {
+      "epoch": 0.0021167941153123595,
+      "grad_norm": 2.771491765975952,
+      "learning_rate": 9.5e-06,
+      "loss": 1.6307,
+      "step": 20
+    },
+    {
+      "epoch": 0.004233588230624719,
+      "grad_norm": 1.6168005466461182,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 1.5457,
+      "step": 40
+    },
+    {
+      "epoch": 0.006350382345937078,
+      "grad_norm": 1.033608317375183,
+      "learning_rate": 2.95e-05,
+      "loss": 1.333,
+      "step": 60
+    },
+    {
+      "epoch": 0.008467176461249438,
+      "grad_norm": 0.9740731716156006,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.1355,
+      "step": 80
+    },
+    {
+      "epoch": 0.010583970576561796,
+      "grad_norm": 1.2390116453170776,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 1.0417,
+      "step": 100
+    },
+    {
+      "epoch": 0.012700764691874157,
+      "grad_norm": 1.3689967393875122,
+      "learning_rate": 5.95e-05,
+      "loss": 0.9421,
+      "step": 120
+    },
+    {
+      "epoch": 0.014817558807186515,
+      "grad_norm": 2.2537403106689453,
+      "learning_rate": 6.95e-05,
+      "loss": 0.9362,
+      "step": 140
+    },
+    {
+      "epoch": 0.016934352922498876,
+      "grad_norm": 1.8407371044158936,
+      "learning_rate": 7.950000000000001e-05,
+      "loss": 0.8148,
+      "step": 160
+    },
+    {
+      "epoch": 0.019051147037811234,
+      "grad_norm": 1.8288472890853882,
+      "learning_rate": 8.950000000000001e-05,
+      "loss": 0.7605,
+      "step": 180
+    },
+    {
+      "epoch": 0.021167941153123593,
+      "grad_norm": 2.447781562805176,
+      "learning_rate": 9.95e-05,
+      "loss": 0.6912,
+      "step": 200
+    },
+    {
+      "epoch": 0.023284735268435955,
+      "grad_norm": 2.366830825805664,
+      "learning_rate": 9.972058823529412e-05,
+      "loss": 0.6242,
+      "step": 220
+    },
+    {
+      "epoch": 0.025401529383748313,
+      "grad_norm": 2.799335479736328,
+      "learning_rate": 9.94264705882353e-05,
+      "loss": 0.5381,
+      "step": 240
+    },
+    {
+      "epoch": 0.027518323499060672,
+      "grad_norm": 2.6497650146484375,
+      "learning_rate": 9.913235294117647e-05,
+      "loss": 0.5163,
+      "step": 260
+    },
+    {
+      "epoch": 0.02963511761437303,
+      "grad_norm": 3.2764251232147217,
+      "learning_rate": 9.883823529411765e-05,
+      "loss": 0.474,
+      "step": 280
+    },
+    {
+      "epoch": 0.03175191172968539,
+      "grad_norm": 3.223743200302124,
+      "learning_rate": 9.854411764705883e-05,
+      "loss": 0.461,
+      "step": 300
+    },
+    {
+      "epoch": 0.03386870584499775,
+      "grad_norm": 3.198038339614868,
+      "learning_rate": 9.825e-05,
+      "loss": 0.4216,
+      "step": 320
+    },
+    {
+      "epoch": 0.03598549996031011,
+      "grad_norm": 3.033092737197876,
+      "learning_rate": 9.795588235294119e-05,
+      "loss": 0.3453,
+      "step": 340
+    },
+    {
+      "epoch": 0.03810229407562247,
+      "grad_norm": 2.908698797225952,
+      "learning_rate": 9.766176470588236e-05,
+      "loss": 0.3506,
+      "step": 360
+    },
+    {
+      "epoch": 0.04021908819093483,
+      "grad_norm": 3.772873878479004,
+      "learning_rate": 9.736764705882353e-05,
+      "loss": 0.3381,
+      "step": 380
+    },
+    {
+      "epoch": 0.042335882306247186,
+      "grad_norm": 2.692840337753296,
+      "learning_rate": 9.707352941176471e-05,
+      "loss": 0.2868,
+      "step": 400
+    },
+    {
+      "epoch": 0.04445267642155955,
+      "grad_norm": 3.629152297973633,
+      "learning_rate": 9.677941176470589e-05,
+      "loss": 0.2845,
+      "step": 420
+    },
+    {
+      "epoch": 0.04656947053687191,
+      "grad_norm": 4.045558929443359,
+      "learning_rate": 9.648529411764706e-05,
+      "loss": 0.275,
+      "step": 440
+    },
+    {
+      "epoch": 0.048686264652184265,
+      "grad_norm": 2.3519065380096436,
+      "learning_rate": 9.619117647058824e-05,
+      "loss": 0.2254,
+      "step": 460
+    },
+    {
+      "epoch": 0.05080305876749663,
+      "grad_norm": 2.71055269241333,
+      "learning_rate": 9.589705882352941e-05,
+      "loss": 0.2356,
+      "step": 480
+    },
+    {
+      "epoch": 0.05291985288280899,
+      "grad_norm": 3.037832021713257,
+      "learning_rate": 9.560294117647059e-05,
+      "loss": 0.2,
+      "step": 500
+    },
+    {
+      "epoch": 0.055036646998121344,
+      "grad_norm": 3.366894245147705,
+      "learning_rate": 9.530882352941177e-05,
+      "loss": 0.1888,
+      "step": 520
+    },
+    {
+      "epoch": 0.057153441113433706,
+      "grad_norm": 2.728973865509033,
+      "learning_rate": 9.501470588235294e-05,
+      "loss": 0.1844,
+      "step": 540
+    },
+    {
+      "epoch": 0.05927023522874606,
+      "grad_norm": 2.229743719100952,
+      "learning_rate": 9.472058823529412e-05,
+      "loss": 0.1658,
+      "step": 560
+    },
+    {
+      "epoch": 0.06138702934405842,
+      "grad_norm": 2.3469460010528564,
+      "learning_rate": 9.44264705882353e-05,
+      "loss": 0.1556,
+      "step": 580
+    },
+    {
+      "epoch": 0.06350382345937078,
+      "grad_norm": 2.338606595993042,
+      "learning_rate": 9.413235294117647e-05,
+      "loss": 0.1391,
+      "step": 600
+    },
+    {
+      "epoch": 0.06562061757468314,
+      "grad_norm": 1.9111056327819824,
+      "learning_rate": 9.383823529411765e-05,
+      "loss": 0.1606,
+      "step": 620
+    },
+    {
+      "epoch": 0.0677374116899955,
+      "grad_norm": 3.3568716049194336,
+      "learning_rate": 9.354411764705883e-05,
+      "loss": 0.1664,
+      "step": 640
+    },
+    {
+      "epoch": 0.06985420580530786,
+      "grad_norm": 3.88547945022583,
+      "learning_rate": 9.325e-05,
+      "loss": 0.1494,
+      "step": 660
+    },
+    {
+      "epoch": 0.07197099992062023,
+      "grad_norm": 2.3967244625091553,
+      "learning_rate": 9.295588235294118e-05,
+      "loss": 0.141,
+      "step": 680
+    },
+    {
+      "epoch": 0.07408779403593257,
+      "grad_norm": 3.0165176391601562,
+      "learning_rate": 9.266176470588236e-05,
+      "loss": 0.1409,
+      "step": 700
+    },
+    {
+      "epoch": 0.07620458815124494,
+      "grad_norm": 3.2665436267852783,
+      "learning_rate": 9.236764705882353e-05,
+      "loss": 0.1112,
+      "step": 720
+    },
+    {
+      "epoch": 0.0783213822665573,
+      "grad_norm": 2.3310046195983887,
+      "learning_rate": 9.207352941176471e-05,
+      "loss": 0.1396,
+      "step": 740
+    },
+    {
+      "epoch": 0.08043817638186966,
+      "grad_norm": 1.8768619298934937,
+      "learning_rate": 9.177941176470589e-05,
+      "loss": 0.1114,
+      "step": 760
+    },
+    {
+      "epoch": 0.08255497049718202,
+      "grad_norm": 3.4282712936401367,
+      "learning_rate": 9.148529411764706e-05,
+      "loss": 0.1073,
+      "step": 780
+    },
+    {
+      "epoch": 0.08467176461249437,
+      "grad_norm": 3.2704601287841797,
+      "learning_rate": 9.119117647058824e-05,
+      "loss": 0.1281,
+      "step": 800
+    },
+    {
+      "epoch": 0.08678855872780673,
+      "grad_norm": 2.225818157196045,
+      "learning_rate": 9.089705882352942e-05,
+      "loss": 0.1046,
+      "step": 820
+    },
+    {
+      "epoch": 0.0889053528431191,
+      "grad_norm": 2.078011989593506,
+      "learning_rate": 9.060294117647059e-05,
+      "loss": 0.1033,
+      "step": 840
+    },
+    {
+      "epoch": 0.09102214695843146,
+      "grad_norm": 1.3325825929641724,
+      "learning_rate": 9.030882352941177e-05,
+      "loss": 0.0872,
+      "step": 860
+    },
+    {
+      "epoch": 0.09313894107374382,
+      "grad_norm": 3.0471086502075195,
+      "learning_rate": 9.001470588235294e-05,
+      "loss": 0.0927,
+      "step": 880
+    },
+    {
+      "epoch": 0.09525573518905617,
+      "grad_norm": 2.9685380458831787,
+      "learning_rate": 8.972058823529412e-05,
+      "loss": 0.0917,
+      "step": 900
+    },
+    {
+      "epoch": 0.09737252930436853,
+      "grad_norm": 1.8589142560958862,
+      "learning_rate": 8.94264705882353e-05,
+      "loss": 0.0836,
+      "step": 920
+    },
+    {
+      "epoch": 0.09948932341968089,
+      "grad_norm": 1.523457407951355,
+      "learning_rate": 8.913235294117647e-05,
+      "loss": 0.0862,
+      "step": 940
+    },
+    {
+      "epoch": 0.10160611753499325,
+      "grad_norm": 1.4009277820587158,
+      "learning_rate": 8.883823529411765e-05,
+      "loss": 0.068,
+      "step": 960
+    },
+    {
+      "epoch": 0.10372291165030562,
+      "grad_norm": 2.0816826820373535,
+      "learning_rate": 8.854411764705883e-05,
+      "loss": 0.0741,
+      "step": 980
+    },
+    {
+      "epoch": 0.10583970576561798,
+      "grad_norm": 2.218278408050537,
+      "learning_rate": 8.825e-05,
+      "loss": 0.0822,
+      "step": 1000
+    },
+    {
+      "epoch": 0.10795649988093033,
+      "grad_norm": 1.188503623008728,
+      "learning_rate": 8.795588235294118e-05,
+      "loss": 0.0792,
+      "step": 1020
+    },
+    {
+      "epoch": 0.11007329399624269,
+      "grad_norm": 0.9847146272659302,
+      "learning_rate": 8.766176470588236e-05,
+      "loss": 0.0696,
+      "step": 1040
+    },
+    {
+      "epoch": 0.11219008811155505,
+      "grad_norm": 3.0967068672180176,
+      "learning_rate": 8.736764705882353e-05,
+      "loss": 0.0842,
+      "step": 1060
+    },
+    {
+      "epoch": 0.11430688222686741,
+      "grad_norm": 2.4966516494750977,
+      "learning_rate": 8.707352941176471e-05,
+      "loss": 0.065,
+      "step": 1080
+    },
+    {
+      "epoch": 0.11642367634217977,
+      "grad_norm": 1.7355480194091797,
+      "learning_rate": 8.677941176470589e-05,
+      "loss": 0.0672,
+      "step": 1100
+    },
+    {
+      "epoch": 0.11854047045749212,
+      "grad_norm": 2.5048105716705322,
+      "learning_rate": 8.648529411764706e-05,
+      "loss": 0.0616,
+      "step": 1120
+    },
+    {
+      "epoch": 0.12065726457280448,
+      "grad_norm": 1.285093903541565,
+      "learning_rate": 8.619117647058824e-05,
+      "loss": 0.0676,
+      "step": 1140
+    },
+    {
+      "epoch": 0.12277405868811685,
+      "grad_norm": 1.58004891872406,
+      "learning_rate": 8.589705882352942e-05,
+      "loss": 0.0827,
+      "step": 1160
+    },
+    {
+      "epoch": 0.12489085280342921,
+      "grad_norm": 1.571897029876709,
+      "learning_rate": 8.560294117647059e-05,
+      "loss": 0.0663,
+      "step": 1180
+    },
+    {
+      "epoch": 0.12700764691874156,
+      "grad_norm": 0.8998542428016663,
+      "learning_rate": 8.530882352941177e-05,
+      "loss": 0.0448,
+      "step": 1200
+    },
+    {
+      "epoch": 0.12912444103405393,
+      "grad_norm": 1.577183485031128,
+      "learning_rate": 8.501470588235295e-05,
+      "loss": 0.0574,
+      "step": 1220
+    },
+    {
+      "epoch": 0.13124123514936628,
+      "grad_norm": 1.7241120338439941,
+      "learning_rate": 8.472058823529412e-05,
+      "loss": 0.0586,
+      "step": 1240
+    },
+    {
+      "epoch": 0.13335802926467866,
+      "grad_norm": 1.4512884616851807,
+      "learning_rate": 8.44264705882353e-05,
+      "loss": 0.0457,
+      "step": 1260
+    },
+    {
+      "epoch": 0.135474823379991,
+      "grad_norm": 1.4320181608200073,
+      "learning_rate": 8.413235294117647e-05,
+      "loss": 0.0572,
+      "step": 1280
+    },
+    {
+      "epoch": 0.13759161749530335,
+      "grad_norm": 2.4721877574920654,
+      "learning_rate": 8.383823529411765e-05,
+      "loss": 0.0539,
+      "step": 1300
+    },
+    {
+      "epoch": 0.13970841161061573,
+      "grad_norm": 1.230265498161316,
+      "learning_rate": 8.354411764705883e-05,
+      "loss": 0.0524,
+      "step": 1320
+    },
+    {
+      "epoch": 0.14182520572592808,
+      "grad_norm": 1.5039700269699097,
+      "learning_rate": 8.325e-05,
+      "loss": 0.0569,
+      "step": 1340
+    },
+    {
+      "epoch": 0.14394199984124045,
+      "grad_norm": 1.9928780794143677,
+      "learning_rate": 8.295588235294118e-05,
+      "loss": 0.0402,
+      "step": 1360
+    },
+    {
+      "epoch": 0.1460587939565528,
+      "grad_norm": 1.8550405502319336,
+      "learning_rate": 8.266176470588236e-05,
+      "loss": 0.054,
+      "step": 1380
+    },
+    {
+      "epoch": 0.14817558807186515,
+      "grad_norm": 1.0700241327285767,
+      "learning_rate": 8.236764705882353e-05,
+      "loss": 0.052,
+      "step": 1400
+    },
+    {
+      "epoch": 0.15029238218717753,
+      "grad_norm": 1.7121262550354004,
+      "learning_rate": 8.207352941176471e-05,
+      "loss": 0.0437,
+      "step": 1420
+    },
+    {
+      "epoch": 0.15240917630248987,
+      "grad_norm": 1.3593100309371948,
+      "learning_rate": 8.177941176470589e-05,
+      "loss": 0.0393,
+      "step": 1440
+    },
+    {
+      "epoch": 0.15452597041780225,
+      "grad_norm": 1.080735683441162,
+      "learning_rate": 8.148529411764706e-05,
+      "loss": 0.035,
+      "step": 1460
+    },
+    {
+      "epoch": 0.1566427645331146,
+      "grad_norm": 1.5516977310180664,
+      "learning_rate": 8.119117647058824e-05,
+      "loss": 0.0421,
+      "step": 1480
+    },
+    {
+      "epoch": 0.15875955864842695,
+      "grad_norm": 1.107473373413086,
+      "learning_rate": 8.089705882352942e-05,
+      "loss": 0.0442,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16087635276373932,
+      "grad_norm": 2.196147918701172,
+      "learning_rate": 8.060294117647059e-05,
+      "loss": 0.0443,
+      "step": 1520
+    },
+    {
+      "epoch": 0.16299314687905167,
+      "grad_norm": 1.4532606601715088,
+      "learning_rate": 8.030882352941177e-05,
+      "loss": 0.0417,
+      "step": 1540
+    },
+    {
+      "epoch": 0.16510994099436405,
+      "grad_norm": 3.0167882442474365,
+      "learning_rate": 8.001470588235295e-05,
+      "loss": 0.0472,
+      "step": 1560
+    },
+    {
+      "epoch": 0.1672267351096764,
+      "grad_norm": 1.764201283454895,
+      "learning_rate": 7.972058823529412e-05,
+      "loss": 0.031,
+      "step": 1580
+    },
+    {
+      "epoch": 0.16934352922498874,
+      "grad_norm": 0.8682387471199036,
+      "learning_rate": 7.94264705882353e-05,
+      "loss": 0.0291,
+      "step": 1600
+    },
+    {
+      "epoch": 0.17146032334030112,
+      "grad_norm": 0.660894513130188,
+      "learning_rate": 7.913235294117648e-05,
+      "loss": 0.0572,
+      "step": 1620
+    },
+    {
+      "epoch": 0.17357711745561347,
+      "grad_norm": 1.7611377239227295,
+      "learning_rate": 7.883823529411765e-05,
+      "loss": 0.0453,
+      "step": 1640
+    },
+    {
+      "epoch": 0.17569391157092584,
+      "grad_norm": 0.6341773867607117,
+      "learning_rate": 7.854411764705883e-05,
+      "loss": 0.0299,
+      "step": 1660
+    },
+    {
+      "epoch": 0.1778107056862382,
+      "grad_norm": 1.4031453132629395,
+      "learning_rate": 7.825e-05,
+      "loss": 0.0358,
+      "step": 1680
+    },
+    {
+      "epoch": 0.17992749980155054,
+      "grad_norm": 1.0830997228622437,
+      "learning_rate": 7.795588235294118e-05,
+      "loss": 0.0373,
+      "step": 1700
+    },
+    {
+      "epoch": 0.18204429391686291,
+      "grad_norm": 0.6576260924339294,
+      "learning_rate": 7.766176470588236e-05,
+      "loss": 0.0587,
+      "step": 1720
+    },
+    {
+      "epoch": 0.18416108803217526,
+      "grad_norm": 1.2640115022659302,
+      "learning_rate": 7.736764705882353e-05,
+      "loss": 0.0468,
+      "step": 1740
+    },
+    {
+      "epoch": 0.18627788214748764,
+      "grad_norm": 1.0660518407821655,
+      "learning_rate": 7.707352941176471e-05,
+      "loss": 0.0466,
+      "step": 1760
+    },
+    {
+      "epoch": 0.1883946762628,
+      "grad_norm": 1.22067129611969,
+      "learning_rate": 7.677941176470589e-05,
+      "loss": 0.0335,
+      "step": 1780
+    },
+    {
+      "epoch": 0.19051147037811234,
+      "grad_norm": 4.800387859344482,
+      "learning_rate": 7.648529411764706e-05,
+      "loss": 0.0461,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1926282644934247,
+      "grad_norm": 1.1434308290481567,
+      "learning_rate": 7.619117647058824e-05,
+      "loss": 0.0326,
+      "step": 1820
+    },
+    {
+      "epoch": 0.19474505860873706,
+      "grad_norm": 0.8925223350524902,
+      "learning_rate": 7.589705882352942e-05,
+      "loss": 0.0273,
+      "step": 1840
+    },
+    {
+      "epoch": 0.19686185272404944,
+      "grad_norm": 1.1678693294525146,
+      "learning_rate": 7.560294117647059e-05,
+      "loss": 0.0345,
+      "step": 1860
+    },
+    {
+      "epoch": 0.19897864683936178,
+      "grad_norm": 0.559644341468811,
+      "learning_rate": 7.530882352941177e-05,
+      "loss": 0.0394,
+      "step": 1880
+    },
+    {
+      "epoch": 0.20109544095467416,
+      "grad_norm": 1.4313390254974365,
+      "learning_rate": 7.501470588235295e-05,
+      "loss": 0.0475,
+      "step": 1900
+    },
+    {
+      "epoch": 0.2032122350699865,
+      "grad_norm": 1.2470778226852417,
+      "learning_rate": 7.472058823529412e-05,
+      "loss": 0.0317,
+      "step": 1920
+    },
+    {
+      "epoch": 0.20532902918529886,
+      "grad_norm": 1.390359878540039,
+      "learning_rate": 7.44264705882353e-05,
+      "loss": 0.0268,
+      "step": 1940
+    },
+    {
+      "epoch": 0.20744582330061123,
+      "grad_norm": 0.6755140423774719,
+      "learning_rate": 7.413235294117648e-05,
+      "loss": 0.0331,
+      "step": 1960
+    },
+    {
+      "epoch": 0.20956261741592358,
+      "grad_norm": 0.31457772850990295,
+      "learning_rate": 7.383823529411765e-05,
+      "loss": 0.0447,
+      "step": 1980
+    },
+    {
+      "epoch": 0.21167941153123596,
+      "grad_norm": 1.6619377136230469,
+      "learning_rate": 7.354411764705883e-05,
+      "loss": 0.0336,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2137962056465483,
+      "grad_norm": 1.033492088317871,
+      "learning_rate": 7.325e-05,
+      "loss": 0.0304,
+      "step": 2020
+    },
+    {
+      "epoch": 0.21591299976186065,
+      "grad_norm": 0.730675220489502,
+      "learning_rate": 7.295588235294118e-05,
+      "loss": 0.0311,
+      "step": 2040
+    },
+    {
+      "epoch": 0.21802979387717303,
+      "grad_norm": 0.6322308778762817,
+      "learning_rate": 7.266176470588236e-05,
+      "loss": 0.0258,
+      "step": 2060
+    },
+    {
+      "epoch": 0.22014658799248538,
+      "grad_norm": 0.7560809254646301,
+      "learning_rate": 7.236764705882353e-05,
+      "loss": 0.0213,
+      "step": 2080
+    },
+    {
+      "epoch": 0.22226338210779775,
+      "grad_norm": 1.1907991170883179,
+      "learning_rate": 7.207352941176471e-05,
+      "loss": 0.0311,
+      "step": 2100
+    },
+    {
+      "epoch": 0.2243801762231101,
+      "grad_norm": 0.6392427086830139,
+      "learning_rate": 7.177941176470589e-05,
+      "loss": 0.0302,
+      "step": 2120
+    },
+    {
+      "epoch": 0.22649697033842245,
+      "grad_norm": 1.0621793270111084,
+      "learning_rate": 7.148529411764706e-05,
+      "loss": 0.0257,
+      "step": 2140
+    },
+    {
+      "epoch": 0.22861376445373482,
+      "grad_norm": 0.8459914326667786,
+      "learning_rate": 7.119117647058824e-05,
+      "loss": 0.0249,
+      "step": 2160
+    },
+    {
+      "epoch": 0.23073055856904717,
+      "grad_norm": 1.5384963750839233,
+      "learning_rate": 7.089705882352942e-05,
+      "loss": 0.0221,
+      "step": 2180
+    },
+    {
+      "epoch": 0.23284735268435955,
+      "grad_norm": 0.920907199382782,
+      "learning_rate": 7.06029411764706e-05,
+      "loss": 0.0307,
+      "step": 2200
+    },
+    {
+      "epoch": 0.2349641467996719,
+      "grad_norm": 1.1640409231185913,
+      "learning_rate": 7.030882352941177e-05,
+      "loss": 0.0302,
+      "step": 2220
+    },
+    {
+      "epoch": 0.23708094091498425,
+      "grad_norm": 0.7336745858192444,
+      "learning_rate": 7.001470588235295e-05,
+      "loss": 0.0286,
+      "step": 2240
+    },
+    {
+      "epoch": 0.23919773503029662,
+      "grad_norm": 1.9110276699066162,
+      "learning_rate": 6.972058823529412e-05,
+      "loss": 0.0303,
+      "step": 2260
+    },
+    {
+      "epoch": 0.24131452914560897,
+      "grad_norm": 0.9055470824241638,
+      "learning_rate": 6.94264705882353e-05,
+      "loss": 0.0241,
+      "step": 2280
+    },
+    {
+      "epoch": 0.24343132326092135,
+      "grad_norm": 1.063379168510437,
+      "learning_rate": 6.913235294117648e-05,
+      "loss": 0.0244,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2455481173762337,
+      "grad_norm": 1.0067662000656128,
+      "learning_rate": 6.883823529411765e-05,
+      "loss": 0.026,
+      "step": 2320
+    },
+    {
+      "epoch": 0.24766491149154604,
+      "grad_norm": 1.1639182567596436,
+      "learning_rate": 6.854411764705883e-05,
+      "loss": 0.0253,
+      "step": 2340
+    },
+    {
+      "epoch": 0.24978170560685842,
+      "grad_norm": 0.9918274879455566,
+      "learning_rate": 6.825e-05,
+      "loss": 0.0218,
+      "step": 2360
+    },
+    {
+      "epoch": 0.25189849972217077,
+      "grad_norm": 0.7681129574775696,
+      "learning_rate": 6.795588235294118e-05,
+      "loss": 0.0212,
+      "step": 2380
+    },
+    {
+      "epoch": 0.2540152938374831,
+      "grad_norm": 0.7643230557441711,
+      "learning_rate": 6.766176470588236e-05,
+      "loss": 0.021,
+      "step": 2400
+    },
+    {
+      "epoch": 0.2561320879527955,
+      "grad_norm": 1.2285891771316528,
+      "learning_rate": 6.736764705882354e-05,
+      "loss": 0.0194,
+      "step": 2420
+    },
+    {
+      "epoch": 0.25824888206810787,
+      "grad_norm": 0.5345446467399597,
+      "learning_rate": 6.707352941176471e-05,
+      "loss": 0.0211,
+      "step": 2440
+    },
+    {
+      "epoch": 0.2603656761834202,
+      "grad_norm": 0.7964244484901428,
+      "learning_rate": 6.677941176470589e-05,
+      "loss": 0.024,
+      "step": 2460
+    },
+    {
+      "epoch": 0.26248247029873256,
+      "grad_norm": 0.5538131594657898,
+      "learning_rate": 6.648529411764705e-05,
+      "loss": 0.0258,
+      "step": 2480
+    },
+    {
+      "epoch": 0.2645992644140449,
+      "grad_norm": 0.9520718455314636,
+      "learning_rate": 6.619117647058823e-05,
+      "loss": 0.0178,
+      "step": 2500
+    },
+    {
+      "epoch": 0.2667160585293573,
+      "grad_norm": 0.6036665439605713,
+      "learning_rate": 6.589705882352942e-05,
+      "loss": 0.0193,
+      "step": 2520
+    },
+    {
+      "epoch": 0.26883285264466966,
+      "grad_norm": 0.37941470742225647,
+      "learning_rate": 6.56029411764706e-05,
+      "loss": 0.0184,
+      "step": 2540
+    },
+    {
+      "epoch": 0.270949646759982,
+      "grad_norm": 0.3956536650657654,
+      "learning_rate": 6.530882352941177e-05,
+      "loss": 0.0239,
+      "step": 2560
+    },
+    {
+      "epoch": 0.27306644087529436,
+      "grad_norm": 0.4313443899154663,
+      "learning_rate": 6.501470588235295e-05,
+      "loss": 0.0185,
+      "step": 2580
+    },
+    {
+      "epoch": 0.2751832349906067,
+      "grad_norm": 1.083382248878479,
+      "learning_rate": 6.472058823529412e-05,
+      "loss": 0.026,
+      "step": 2600
+    },
+    {
+      "epoch": 0.2773000291059191,
+      "grad_norm": 0.8067460060119629,
+      "learning_rate": 6.44264705882353e-05,
+      "loss": 0.0223,
+      "step": 2620
+    },
+    {
+      "epoch": 0.27941682322123146,
+      "grad_norm": 1.2681511640548706,
+      "learning_rate": 6.413235294117648e-05,
+      "loss": 0.0232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.2815336173365438,
+      "grad_norm": 0.5592957139015198,
+      "learning_rate": 6.383823529411765e-05,
+      "loss": 0.0184,
+      "step": 2660
+    },
+    {
+      "epoch": 0.28365041145185615,
+      "grad_norm": 0.5282326936721802,
+      "learning_rate": 6.354411764705883e-05,
+      "loss": 0.0195,
+      "step": 2680
+    },
+    {
+      "epoch": 0.2857672055671685,
+      "grad_norm": 0.5503069758415222,
+      "learning_rate": 6.324999999999999e-05,
+      "loss": 0.0182,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2878839996824809,
+      "grad_norm": 0.9767094254493713,
+      "learning_rate": 6.295588235294117e-05,
+      "loss": 0.0174,
+      "step": 2720
+    },
+    {
+      "epoch": 0.29000079379779325,
+      "grad_norm": 0.5078358054161072,
+      "learning_rate": 6.266176470588236e-05,
+      "loss": 0.0214,
+      "step": 2740
+    },
+    {
+      "epoch": 0.2921175879131056,
+      "grad_norm": 0.8082838654518127,
+      "learning_rate": 6.236764705882354e-05,
+      "loss": 0.0151,
+      "step": 2760
+    },
+    {
+      "epoch": 0.29423438202841795,
+      "grad_norm": 0.49735844135284424,
+      "learning_rate": 6.207352941176471e-05,
+      "loss": 0.0235,
+      "step": 2780
+    },
+    {
+      "epoch": 0.2963511761437303,
+      "grad_norm": 1.0940418243408203,
+      "learning_rate": 6.177941176470589e-05,
+      "loss": 0.016,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2984679702590427,
+      "grad_norm": 0.9790317416191101,
+      "learning_rate": 6.148529411764706e-05,
+      "loss": 0.0204,
+      "step": 2820
+    },
+    {
+      "epoch": 0.30058476437435505,
+      "grad_norm": 0.9905364513397217,
+      "learning_rate": 6.119117647058824e-05,
+      "loss": 0.0189,
+      "step": 2840
+    },
+    {
+      "epoch": 0.3027015584896674,
+      "grad_norm": 0.5084486603736877,
+      "learning_rate": 6.089705882352942e-05,
+      "loss": 0.0216,
+      "step": 2860
+    },
+    {
+      "epoch": 0.30481835260497975,
+      "grad_norm": 0.6312965750694275,
+      "learning_rate": 6.0602941176470594e-05,
+      "loss": 0.0197,
+      "step": 2880
+    },
+    {
+      "epoch": 0.3069351467202921,
+      "grad_norm": 1.0345927476882935,
+      "learning_rate": 6.0308823529411764e-05,
+      "loss": 0.0154,
+      "step": 2900
+    },
+    {
+      "epoch": 0.3090519408356045,
+      "grad_norm": 1.1944761276245117,
+      "learning_rate": 6.001470588235294e-05,
+      "loss": 0.017,
+      "step": 2920
+    },
+    {
+      "epoch": 0.31116873495091685,
+      "grad_norm": 0.6866488456726074,
+      "learning_rate": 5.972058823529412e-05,
+      "loss": 0.0158,
+      "step": 2940
+    },
+    {
+      "epoch": 0.3132855290662292,
+      "grad_norm": 1.0443695783615112,
+      "learning_rate": 5.9426470588235294e-05,
+      "loss": 0.0193,
+      "step": 2960
+    },
+    {
+      "epoch": 0.31540232318154154,
+      "grad_norm": 0.6489245891571045,
+      "learning_rate": 5.913235294117647e-05,
+      "loss": 0.016,
+      "step": 2980
+    },
+    {
+      "epoch": 0.3175191172968539,
+      "grad_norm": 1.388348937034607,
+      "learning_rate": 5.883823529411765e-05,
+      "loss": 0.0284,
+      "step": 3000
+    },
+    {
+      "epoch": 0.3196359114121663,
+      "grad_norm": 0.4919748306274414,
+      "learning_rate": 5.854411764705883e-05,
+      "loss": 0.0205,
+      "step": 3020
+    },
+    {
+      "epoch": 0.32175270552747864,
+      "grad_norm": 0.65608811378479,
+      "learning_rate": 5.8250000000000006e-05,
+      "loss": 0.0159,
+      "step": 3040
+    },
+    {
+      "epoch": 0.323869499642791,
+      "grad_norm": 0.4175134599208832,
+      "learning_rate": 5.795588235294118e-05,
+      "loss": 0.0159,
+      "step": 3060
+    },
+    {
+      "epoch": 0.32598629375810334,
+      "grad_norm": 0.6232139468193054,
+      "learning_rate": 5.766176470588236e-05,
+      "loss": 0.0177,
+      "step": 3080
+    },
+    {
+      "epoch": 0.3281030878734157,
+      "grad_norm": 0.4555909037590027,
+      "learning_rate": 5.7367647058823536e-05,
+      "loss": 0.0138,
+      "step": 3100
+    },
+    {
+      "epoch": 0.3302198819887281,
+      "grad_norm": 0.538420557975769,
+      "learning_rate": 5.7073529411764706e-05,
+      "loss": 0.0158,
+      "step": 3120
+    },
+    {
+      "epoch": 0.33233667610404044,
+      "grad_norm": 0.5802947878837585,
+      "learning_rate": 5.677941176470588e-05,
+      "loss": 0.0155,
+      "step": 3140
+    },
+    {
+      "epoch": 0.3344534702193528,
+      "grad_norm": 0.588239848613739,
+      "learning_rate": 5.648529411764706e-05,
+      "loss": 0.0187,
+      "step": 3160
+    },
+    {
+      "epoch": 0.33657026433466514,
+      "grad_norm": 0.5712038278579712,
+      "learning_rate": 5.6191176470588235e-05,
+      "loss": 0.013,
+      "step": 3180
+    },
+    {
+      "epoch": 0.3386870584499775,
+      "grad_norm": 0.4135841727256775,
+      "learning_rate": 5.589705882352941e-05,
+      "loss": 0.0171,
+      "step": 3200
+    },
+    {
+      "epoch": 0.3408038525652899,
+      "grad_norm": 0.7402490377426147,
+      "learning_rate": 5.560294117647059e-05,
+      "loss": 0.015,
+      "step": 3220
+    },
+    {
+      "epoch": 0.34292064668060224,
+      "grad_norm": 0.5647472143173218,
+      "learning_rate": 5.530882352941177e-05,
+      "loss": 0.0132,
+      "step": 3240
+    },
+    {
+      "epoch": 0.3450374407959146,
+      "grad_norm": 0.7440519332885742,
+      "learning_rate": 5.501470588235295e-05,
+      "loss": 0.0154,
+      "step": 3260
+    },
+    {
+      "epoch": 0.34715423491122693,
+      "grad_norm": 0.40782037377357483,
+      "learning_rate": 5.4720588235294124e-05,
+      "loss": 0.0168,
+      "step": 3280
+    },
+    {
+      "epoch": 0.3492710290265393,
+      "grad_norm": 0.3933939039707184,
+      "learning_rate": 5.44264705882353e-05,
+      "loss": 0.0161,
+      "step": 3300
+    },
+    {
+      "epoch": 0.3513878231418517,
+      "grad_norm": 0.29135826230049133,
+      "learning_rate": 5.413235294117648e-05,
+      "loss": 0.0189,
+      "step": 3320
+    },
+    {
+      "epoch": 0.35350461725716403,
+      "grad_norm": 0.581210196018219,
+      "learning_rate": 5.383823529411765e-05,
+      "loss": 0.0157,
+      "step": 3340
+    },
+    {
+      "epoch": 0.3556214113724764,
+      "grad_norm": 0.4485796391963959,
+      "learning_rate": 5.3544117647058824e-05,
+      "loss": 0.0142,
+      "step": 3360
+    },
+    {
+      "epoch": 0.35773820548778873,
+      "grad_norm": 0.4352544844150543,
+      "learning_rate": 5.325e-05,
+      "loss": 0.0153,
+      "step": 3380
+    },
+    {
+      "epoch": 0.3598549996031011,
+      "grad_norm": 1.0922011137008667,
+      "learning_rate": 5.2955882352941177e-05,
+      "loss": 0.0167,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3619717937184135,
+      "grad_norm": 0.2693778872489929,
+      "learning_rate": 5.266176470588235e-05,
+      "loss": 0.0137,
+      "step": 3420
+    },
+    {
+      "epoch": 0.36408858783372583,
+      "grad_norm": 1.5889476537704468,
+      "learning_rate": 5.236764705882353e-05,
+      "loss": 0.0127,
+      "step": 3440
+    },
+    {
+      "epoch": 0.3662053819490382,
+      "grad_norm": 2.3836777210235596,
+      "learning_rate": 5.207352941176471e-05,
+      "loss": 0.0196,
+      "step": 3460
+    },
+    {
+      "epoch": 0.3683221760643505,
+      "grad_norm": 0.6966289281845093,
+      "learning_rate": 5.177941176470589e-05,
+      "loss": 0.0138,
+      "step": 3480
+    },
+    {
+      "epoch": 0.3704389701796629,
+      "grad_norm": 0.7514053583145142,
+      "learning_rate": 5.1485294117647066e-05,
+      "loss": 0.0143,
+      "step": 3500
+    },
+    {
+      "epoch": 0.3725557642949753,
+      "grad_norm": 0.461103618144989,
+      "learning_rate": 5.119117647058824e-05,
+      "loss": 0.0146,
+      "step": 3520
+    },
+    {
+      "epoch": 0.3746725584102876,
+      "grad_norm": 0.7384988069534302,
+      "learning_rate": 5.089705882352941e-05,
+      "loss": 0.0167,
+      "step": 3540
+    },
+    {
+      "epoch": 0.3767893525256,
+      "grad_norm": 0.7363691329956055,
+      "learning_rate": 5.060294117647059e-05,
+      "loss": 0.0148,
+      "step": 3560
+    },
+    {
+      "epoch": 0.3789061466409123,
+      "grad_norm": 0.4628554582595825,
+      "learning_rate": 5.0308823529411765e-05,
+      "loss": 0.0138,
+      "step": 3580
+    },
+    {
+      "epoch": 0.38102294075622467,
+      "grad_norm": 0.48070573806762695,
+      "learning_rate": 5.001470588235294e-05,
+      "loss": 0.0148,
+      "step": 3600
+    },
+    {
+      "epoch": 0.3831397348715371,
+      "grad_norm": 0.913800835609436,
+      "learning_rate": 4.972058823529412e-05,
+      "loss": 0.0109,
+      "step": 3620
+    },
+    {
+      "epoch": 0.3852565289868494,
+      "grad_norm": 0.5302271842956543,
+      "learning_rate": 4.9426470588235295e-05,
+      "loss": 0.0129,
+      "step": 3640
+    },
+    {
+      "epoch": 0.38737332310216177,
+      "grad_norm": 0.5563445687294006,
+      "learning_rate": 4.913235294117647e-05,
+      "loss": 0.0155,
+      "step": 3660
+    },
+    {
+      "epoch": 0.3894901172174741,
+      "grad_norm": 0.7449616193771362,
+      "learning_rate": 4.8838235294117654e-05,
+      "loss": 0.0139,
+      "step": 3680
+    },
+    {
+      "epoch": 0.3916069113327865,
+      "grad_norm": 0.45803868770599365,
+      "learning_rate": 4.8544117647058824e-05,
+      "loss": 0.0134,
+      "step": 3700
+    },
+    {
+      "epoch": 0.39372370544809887,
+      "grad_norm": 0.4495037794113159,
+      "learning_rate": 4.825e-05,
+      "loss": 0.015,
+      "step": 3720
+    },
+    {
+      "epoch": 0.3958404995634112,
+      "grad_norm": 0.6490349769592285,
+      "learning_rate": 4.795588235294118e-05,
+      "loss": 0.0143,
+      "step": 3740
+    },
+    {
+      "epoch": 0.39795729367872357,
+      "grad_norm": 0.3576687276363373,
+      "learning_rate": 4.7661764705882354e-05,
+      "loss": 0.0118,
+      "step": 3760
+    },
+    {
+      "epoch": 0.4000740877940359,
+      "grad_norm": 0.5015860199928284,
+      "learning_rate": 4.736764705882353e-05,
+      "loss": 0.0169,
+      "step": 3780
+    },
+    {
+      "epoch": 0.4021908819093483,
+      "grad_norm": 1.0271028280258179,
+      "learning_rate": 4.707352941176471e-05,
+      "loss": 0.0119,
+      "step": 3800
+    },
+    {
+      "epoch": 0.40430767602466067,
+      "grad_norm": 0.4724489748477936,
+      "learning_rate": 4.677941176470588e-05,
+      "loss": 0.0112,
+      "step": 3820
+    },
+    {
+      "epoch": 0.406424470139973,
+      "grad_norm": 0.5578377842903137,
+      "learning_rate": 4.648529411764706e-05,
+      "loss": 0.013,
+      "step": 3840
+    },
+    {
+      "epoch": 0.40854126425528536,
+      "grad_norm": 0.6067779660224915,
+      "learning_rate": 4.6191176470588236e-05,
+      "loss": 0.0149,
+      "step": 3860
+    },
+    {
+      "epoch": 0.4106580583705977,
+      "grad_norm": 0.8015718460083008,
+      "learning_rate": 4.589705882352941e-05,
+      "loss": 0.0124,
+      "step": 3880
+    },
+    {
+      "epoch": 0.4127748524859101,
+      "grad_norm": 0.6352400183677673,
+      "learning_rate": 4.5602941176470596e-05,
+      "loss": 0.013,
+      "step": 3900
+    },
+    {
+      "epoch": 0.41489164660122246,
+      "grad_norm": 0.3545617163181305,
+      "learning_rate": 4.5308823529411765e-05,
+      "loss": 0.0117,
+      "step": 3920
+    },
+    {
+      "epoch": 0.4170084407165348,
+      "grad_norm": 0.4562068283557892,
+      "learning_rate": 4.501470588235294e-05,
+      "loss": 0.0118,
+      "step": 3940
+    },
+    {
+      "epoch": 0.41912523483184716,
+      "grad_norm": 0.8685987591743469,
+      "learning_rate": 4.472058823529412e-05,
+      "loss": 0.0126,
+      "step": 3960
+    },
+    {
+      "epoch": 0.4212420289471595,
+      "grad_norm": 0.49269378185272217,
+      "learning_rate": 4.4426470588235295e-05,
+      "loss": 0.0107,
+      "step": 3980
+    },
+    {
+      "epoch": 0.4233588230624719,
+      "grad_norm": 0.7156255841255188,
+      "learning_rate": 4.413235294117647e-05,
+      "loss": 0.0107,
+      "step": 4000
+    },
+    {
+      "epoch": 0.42547561717778426,
+      "grad_norm": 0.6339916586875916,
+      "learning_rate": 4.383823529411765e-05,
+      "loss": 0.0149,
+      "step": 4020
+    },
+    {
+      "epoch": 0.4275924112930966,
+      "grad_norm": 0.6008257269859314,
+      "learning_rate": 4.3544117647058824e-05,
+      "loss": 0.0121,
+      "step": 4040
+    },
+    {
+      "epoch": 0.42970920540840896,
+      "grad_norm": 0.34715619683265686,
+      "learning_rate": 4.325e-05,
+      "loss": 0.0115,
+      "step": 4060
+    },
+    {
+      "epoch": 0.4318259995237213,
+      "grad_norm": 0.6943634152412415,
+      "learning_rate": 4.295588235294118e-05,
+      "loss": 0.0115,
+      "step": 4080
+    },
+    {
+      "epoch": 0.4339427936390337,
+      "grad_norm": 0.5919560194015503,
+      "learning_rate": 4.2661764705882354e-05,
+      "loss": 0.0094,
+      "step": 4100
+    },
+    {
+      "epoch": 0.43605958775434606,
+      "grad_norm": 0.23244401812553406,
+      "learning_rate": 4.236764705882354e-05,
+      "loss": 0.0112,
+      "step": 4120
+    },
+    {
+      "epoch": 0.4381763818696584,
+      "grad_norm": 0.35059890151023865,
+      "learning_rate": 4.207352941176471e-05,
+      "loss": 0.0133,
+      "step": 4140
+    },
+    {
+      "epoch": 0.44029317598497075,
+      "grad_norm": 0.32678091526031494,
+      "learning_rate": 4.1779411764705883e-05,
+      "loss": 0.0113,
+      "step": 4160
+    },
+    {
+      "epoch": 0.4424099701002831,
+      "grad_norm": 0.6617632508277893,
+      "learning_rate": 4.148529411764706e-05,
+      "loss": 0.0105,
+      "step": 4180
+    },
+    {
+      "epoch": 0.4445267642155955,
+      "grad_norm": 0.27029886841773987,
+      "learning_rate": 4.1191176470588236e-05,
+      "loss": 0.0115,
+      "step": 4200
+    },
+    {
+      "epoch": 0.44664355833090785,
+      "grad_norm": 0.7106760144233704,
+      "learning_rate": 4.089705882352941e-05,
+      "loss": 0.0124,
+      "step": 4220
+    },
+    {
+      "epoch": 0.4487603524462202,
+      "grad_norm": 0.5163691639900208,
+      "learning_rate": 4.060294117647059e-05,
+      "loss": 0.0111,
+      "step": 4240
+    },
+    {
+      "epoch": 0.45087714656153255,
+      "grad_norm": 0.7228760123252869,
+      "learning_rate": 4.0308823529411766e-05,
+      "loss": 0.0113,
+      "step": 4260
+    },
+    {
+      "epoch": 0.4529939406768449,
+      "grad_norm": 0.5797919631004333,
+      "learning_rate": 4.001470588235294e-05,
+      "loss": 0.0118,
+      "step": 4280
+    },
+    {
+      "epoch": 0.4551107347921573,
+      "grad_norm": 1.233983039855957,
+      "learning_rate": 3.972058823529412e-05,
+      "loss": 0.0087,
+      "step": 4300
+    },
+    {
+      "epoch": 0.45722752890746965,
+      "grad_norm": 0.657342791557312,
+      "learning_rate": 3.9426470588235295e-05,
+      "loss": 0.0092,
+      "step": 4320
+    },
+    {
+      "epoch": 0.459344323022782,
+      "grad_norm": 0.4171401262283325,
+      "learning_rate": 3.913235294117647e-05,
+      "loss": 0.0161,
+      "step": 4340
+    },
+    {
+      "epoch": 0.46146111713809435,
+      "grad_norm": 0.34782201051712036,
+      "learning_rate": 3.883823529411765e-05,
+      "loss": 0.0103,
+      "step": 4360
+    },
+    {
+      "epoch": 0.4635779112534067,
+      "grad_norm": 0.5111158490180969,
+      "learning_rate": 3.8544117647058825e-05,
+      "loss": 0.0097,
+      "step": 4380
+    },
+    {
+      "epoch": 0.4656947053687191,
+      "grad_norm": 0.5910077095031738,
+      "learning_rate": 3.825e-05,
+      "loss": 0.0176,
+      "step": 4400
+    },
+    {
+      "epoch": 0.46781149948403145,
+      "grad_norm": 0.6808711290359497,
+      "learning_rate": 3.795588235294118e-05,
+      "loss": 0.009,
+      "step": 4420
+    },
+    {
+      "epoch": 0.4699282935993438,
+      "grad_norm": 0.4499869644641876,
+      "learning_rate": 3.7661764705882354e-05,
+      "loss": 0.0106,
+      "step": 4440
+    },
+    {
+      "epoch": 0.47204508771465614,
+      "grad_norm": 0.4361923336982727,
+      "learning_rate": 3.736764705882353e-05,
+      "loss": 0.0097,
+      "step": 4460
+    },
+    {
+      "epoch": 0.4741618818299685,
+      "grad_norm": 0.3171451985836029,
+      "learning_rate": 3.707352941176471e-05,
+      "loss": 0.0092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.4762786759452809,
+      "grad_norm": 0.28628259897232056,
+      "learning_rate": 3.6779411764705884e-05,
+      "loss": 0.0081,
+      "step": 4500
+    },
+    {
+      "epoch": 0.47839547006059324,
+      "grad_norm": 0.5043999552726746,
+      "learning_rate": 3.648529411764706e-05,
+      "loss": 0.0102,
+      "step": 4520
+    },
+    {
+      "epoch": 0.4805122641759056,
+      "grad_norm": 0.3881862163543701,
+      "learning_rate": 3.619117647058824e-05,
+      "loss": 0.0109,
+      "step": 4540
+    },
+    {
+      "epoch": 0.48262905829121794,
+      "grad_norm": 0.6093239188194275,
+      "learning_rate": 3.589705882352941e-05,
+      "loss": 0.0089,
+      "step": 4560
+    },
+    {
+      "epoch": 0.4847458524065303,
+      "grad_norm": 0.4642229378223419,
+      "learning_rate": 3.560294117647059e-05,
+      "loss": 0.0092,
+      "step": 4580
+    },
+    {
+      "epoch": 0.4868626465218427,
+      "grad_norm": 0.4857279062271118,
+      "learning_rate": 3.5308823529411766e-05,
+      "loss": 0.0081,
+      "step": 4600
+    },
+    {
+      "epoch": 0.48897944063715504,
+      "grad_norm": 0.40589526295661926,
+      "learning_rate": 3.501470588235294e-05,
+      "loss": 0.0098,
+      "step": 4620
+    },
+    {
+      "epoch": 0.4910962347524674,
+      "grad_norm": 0.2723426818847656,
+      "learning_rate": 3.472058823529412e-05,
+      "loss": 0.0133,
+      "step": 4640
+    },
+    {
+      "epoch": 0.49321302886777973,
+      "grad_norm": 0.7545261383056641,
+      "learning_rate": 3.4426470588235296e-05,
+      "loss": 0.0103,
+      "step": 4660
+    },
+    {
+      "epoch": 0.4953298229830921,
+      "grad_norm": 1.5047451257705688,
+      "learning_rate": 3.413235294117647e-05,
+      "loss": 0.0103,
+      "step": 4680
+    },
+    {
+      "epoch": 0.4974466170984045,
+      "grad_norm": 0.46020635962486267,
+      "learning_rate": 3.383823529411765e-05,
+      "loss": 0.0092,
+      "step": 4700
+    },
+    {
+      "epoch": 0.49956341121371683,
+      "grad_norm": 0.42124831676483154,
+      "learning_rate": 3.3544117647058825e-05,
+      "loss": 0.0112,
+      "step": 4720
+    },
+    {
+      "epoch": 0.5016802053290292,
+      "grad_norm": 0.18676140904426575,
+      "learning_rate": 3.325e-05,
+      "loss": 0.0096,
+      "step": 4740
+    },
+    {
+      "epoch": 0.5037969994443415,
+      "grad_norm": 0.41889238357543945,
+      "learning_rate": 3.295588235294118e-05,
+      "loss": 0.0112,
+      "step": 4760
+    },
+    {
+      "epoch": 0.5059137935596539,
+      "grad_norm": 0.5965830087661743,
+      "learning_rate": 3.2661764705882355e-05,
+      "loss": 0.0082,
+      "step": 4780
+    },
+    {
+      "epoch": 0.5080305876749662,
+      "grad_norm": 0.5901793837547302,
+      "learning_rate": 3.236764705882353e-05,
+      "loss": 0.0092,
+      "step": 4800
+    },
+    {
+      "epoch": 0.5101473817902786,
+      "grad_norm": 0.453032910823822,
+      "learning_rate": 3.207352941176471e-05,
+      "loss": 0.0104,
+      "step": 4820
+    },
+    {
+      "epoch": 0.512264175905591,
+      "grad_norm": 0.3099919557571411,
+      "learning_rate": 3.1779411764705884e-05,
+      "loss": 0.0097,
+      "step": 4840
+    },
+    {
+      "epoch": 0.5143809700209033,
+      "grad_norm": 0.28637203574180603,
+      "learning_rate": 3.148529411764706e-05,
+      "loss": 0.0074,
+      "step": 4860
+    },
+    {
+      "epoch": 0.5164977641362157,
+      "grad_norm": 0.45871102809906006,
+      "learning_rate": 3.119117647058824e-05,
+      "loss": 0.0093,
+      "step": 4880
+    },
+    {
+      "epoch": 0.518614558251528,
+      "grad_norm": 0.5844906568527222,
+      "learning_rate": 3.0897058823529414e-05,
+      "loss": 0.0097,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5207313523668404,
+      "grad_norm": 0.7102438807487488,
+      "learning_rate": 3.060294117647059e-05,
+      "loss": 0.0083,
+      "step": 4920
+    },
+    {
+      "epoch": 0.5228481464821528,
+      "grad_norm": 0.483784943819046,
+      "learning_rate": 3.0308823529411767e-05,
+      "loss": 0.0091,
+      "step": 4940
+    },
+    {
+      "epoch": 0.5249649405974651,
+      "grad_norm": 0.4747030436992645,
+      "learning_rate": 3.0014705882352943e-05,
+      "loss": 0.0091,
+      "step": 4960
+    },
+    {
+      "epoch": 0.5270817347127775,
+      "grad_norm": 0.3532012403011322,
+      "learning_rate": 2.9720588235294116e-05,
+      "loss": 0.0082,
+      "step": 4980
+    },
+    {
+      "epoch": 0.5291985288280898,
+      "grad_norm": 0.42889463901519775,
+      "learning_rate": 2.9426470588235293e-05,
+      "loss": 0.0091,
+      "step": 5000
+    },
+    {
+      "epoch": 0.5313153229434022,
+      "grad_norm": 0.4388155937194824,
+      "learning_rate": 2.9132352941176473e-05,
+      "loss": 0.0088,
+      "step": 5020
+    },
+    {
+      "epoch": 0.5334321170587146,
+      "grad_norm": 0.49440255761146545,
+      "learning_rate": 2.883823529411765e-05,
+      "loss": 0.0091,
+      "step": 5040
+    },
+    {
+      "epoch": 0.5355489111740269,
+      "grad_norm": 0.3930880129337311,
+      "learning_rate": 2.8544117647058826e-05,
+      "loss": 0.0114,
+      "step": 5060
+    },
+    {
+      "epoch": 0.5376657052893393,
+      "grad_norm": 0.380283921957016,
+      "learning_rate": 2.825e-05,
+      "loss": 0.0105,
+      "step": 5080
+    },
+    {
+      "epoch": 0.5397824994046516,
+      "grad_norm": 0.3737698793411255,
+      "learning_rate": 2.7955882352941175e-05,
+      "loss": 0.0132,
+      "step": 5100
+    },
+    {
+      "epoch": 0.541899293519964,
+      "grad_norm": 0.5393537282943726,
+      "learning_rate": 2.7661764705882355e-05,
+      "loss": 0.0118,
+      "step": 5120
+    },
+    {
+      "epoch": 0.5440160876352764,
+      "grad_norm": 0.3449922502040863,
+      "learning_rate": 2.7367647058823532e-05,
+      "loss": 0.0077,
+      "step": 5140
+    },
+    {
+      "epoch": 0.5461328817505887,
+      "grad_norm": 0.6629793643951416,
+      "learning_rate": 2.7073529411764708e-05,
+      "loss": 0.0084,
+      "step": 5160
+    },
+    {
+      "epoch": 0.5482496758659011,
+      "grad_norm": 0.7243732810020447,
+      "learning_rate": 2.6779411764705885e-05,
+      "loss": 0.0073,
+      "step": 5180
+    },
+    {
+      "epoch": 0.5503664699812134,
+      "grad_norm": 0.6006022691726685,
+      "learning_rate": 2.6485294117647058e-05,
+      "loss": 0.0084,
+      "step": 5200
+    },
+    {
+      "epoch": 0.5524832640965258,
+      "grad_norm": 0.5986945629119873,
+      "learning_rate": 2.6191176470588234e-05,
+      "loss": 0.0087,
+      "step": 5220
+    },
+    {
+      "epoch": 0.5546000582118382,
+      "grad_norm": 0.267560750246048,
+      "learning_rate": 2.5897058823529414e-05,
+      "loss": 0.0092,
+      "step": 5240
+    },
+    {
+      "epoch": 0.5567168523271505,
+      "grad_norm": 0.47937673330307007,
+      "learning_rate": 2.560294117647059e-05,
+      "loss": 0.0089,
+      "step": 5260
+    },
+    {
+      "epoch": 0.5588336464424629,
+      "grad_norm": 0.4451775550842285,
+      "learning_rate": 2.5308823529411767e-05,
+      "loss": 0.0082,
+      "step": 5280
+    },
+    {
+      "epoch": 0.5609504405577752,
+      "grad_norm": 0.7350065112113953,
+      "learning_rate": 2.501470588235294e-05,
+      "loss": 0.0087,
+      "step": 5300
+    },
+    {
+      "epoch": 0.5630672346730876,
+      "grad_norm": 0.43704766035079956,
+      "learning_rate": 2.4720588235294117e-05,
+      "loss": 0.0089,
+      "step": 5320
+    },
+    {
+      "epoch": 0.5651840287884,
+      "grad_norm": 0.29158827662467957,
+      "learning_rate": 2.4426470588235297e-05,
+      "loss": 0.0066,
+      "step": 5340
+    },
+    {
+      "epoch": 0.5673008229037123,
+      "grad_norm": 0.39838340878486633,
+      "learning_rate": 2.4132352941176473e-05,
+      "loss": 0.0081,
+      "step": 5360
+    },
+    {
+      "epoch": 0.5694176170190247,
+      "grad_norm": 0.4324835538864136,
+      "learning_rate": 2.3838235294117646e-05,
+      "loss": 0.008,
+      "step": 5380
+    },
+    {
+      "epoch": 0.571534411134337,
+      "grad_norm": 0.4358319938182831,
+      "learning_rate": 2.3544117647058826e-05,
+      "loss": 0.008,
+      "step": 5400
+    },
+    {
+      "epoch": 0.5736512052496494,
+      "grad_norm": 0.8966334462165833,
+      "learning_rate": 2.3250000000000003e-05,
+      "loss": 0.0078,
+      "step": 5420
+    },
+    {
+      "epoch": 0.5757679993649618,
+      "grad_norm": 0.9501079320907593,
+      "learning_rate": 2.2955882352941176e-05,
+      "loss": 0.0184,
+      "step": 5440
+    },
+    {
+      "epoch": 0.5778847934802741,
+      "grad_norm": 0.13483519852161407,
+      "learning_rate": 2.2661764705882356e-05,
+      "loss": 0.0154,
+      "step": 5460
+    },
+    {
+      "epoch": 0.5800015875955865,
+      "grad_norm": 0.4287421703338623,
+      "learning_rate": 2.236764705882353e-05,
+      "loss": 0.0084,
+      "step": 5480
+    },
+    {
+      "epoch": 0.5821183817108988,
+      "grad_norm": 0.1738578975200653,
+      "learning_rate": 2.2073529411764705e-05,
+      "loss": 0.0079,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5842351758262112,
+      "grad_norm": 0.6555954217910767,
+      "learning_rate": 2.1779411764705885e-05,
+      "loss": 0.0091,
+      "step": 5520
+    },
+    {
+      "epoch": 0.5863519699415236,
+      "grad_norm": 0.5294132232666016,
+      "learning_rate": 2.1485294117647058e-05,
+      "loss": 0.007,
+      "step": 5540
+    },
+    {
+      "epoch": 0.5884687640568359,
+      "grad_norm": 0.3388701379299164,
+      "learning_rate": 2.1191176470588238e-05,
+      "loss": 0.007,
+      "step": 5560
+    },
+    {
+      "epoch": 0.5905855581721483,
+      "grad_norm": 0.4279813766479492,
+      "learning_rate": 2.0897058823529415e-05,
+      "loss": 0.0077,
+      "step": 5580
+    },
+    {
+      "epoch": 0.5927023522874606,
+      "grad_norm": 0.4467952847480774,
+      "learning_rate": 2.0602941176470588e-05,
+      "loss": 0.0083,
+      "step": 5600
+    },
+    {
+      "epoch": 0.594819146402773,
+      "grad_norm": 0.36640599370002747,
+      "learning_rate": 2.0308823529411768e-05,
+      "loss": 0.0081,
+      "step": 5620
+    },
+    {
+      "epoch": 0.5969359405180854,
+      "grad_norm": 0.2323896735906601,
+      "learning_rate": 2.001470588235294e-05,
+      "loss": 0.0065,
+      "step": 5640
+    },
+    {
+      "epoch": 0.5990527346333977,
+      "grad_norm": 0.5579979419708252,
+      "learning_rate": 1.9720588235294117e-05,
+      "loss": 0.0084,
+      "step": 5660
+    },
+    {
+      "epoch": 0.6011695287487101,
+      "grad_norm": 0.34144604206085205,
+      "learning_rate": 1.9426470588235297e-05,
+      "loss": 0.0069,
+      "step": 5680
+    },
+    {
+      "epoch": 0.6032863228640224,
+      "grad_norm": 0.5170475244522095,
+      "learning_rate": 1.913235294117647e-05,
+      "loss": 0.0074,
+      "step": 5700
+    },
+    {
+      "epoch": 0.6054031169793348,
+      "grad_norm": 0.34131792187690735,
+      "learning_rate": 1.8838235294117647e-05,
+      "loss": 0.0069,
+      "step": 5720
+    },
+    {
+      "epoch": 0.6075199110946472,
+      "grad_norm": 0.5252654552459717,
+      "learning_rate": 1.8544117647058827e-05,
+      "loss": 0.0074,
+      "step": 5740
+    },
+    {
+      "epoch": 0.6096367052099595,
+      "grad_norm": 0.23735718429088593,
+      "learning_rate": 1.825e-05,
+      "loss": 0.0079,
+      "step": 5760
+    },
+    {
+      "epoch": 0.6117534993252719,
+      "grad_norm": 0.3985564410686493,
+      "learning_rate": 1.7955882352941176e-05,
+      "loss": 0.0076,
+      "step": 5780
+    },
+    {
+      "epoch": 0.6138702934405842,
+      "grad_norm": 0.53111732006073,
+      "learning_rate": 1.7661764705882353e-05,
+      "loss": 0.0075,
+      "step": 5800
+    },
+    {
+      "epoch": 0.6159870875558966,
+      "grad_norm": 0.37471240758895874,
+      "learning_rate": 1.736764705882353e-05,
+      "loss": 0.007,
+      "step": 5820
+    },
+    {
+      "epoch": 0.618103881671209,
+      "grad_norm": 0.2607717514038086,
+      "learning_rate": 1.707352941176471e-05,
+      "loss": 0.0078,
+      "step": 5840
+    },
+    {
+      "epoch": 0.6202206757865213,
+      "grad_norm": 0.4577248990535736,
+      "learning_rate": 1.6779411764705882e-05,
+      "loss": 0.0099,
+      "step": 5860
+    },
+    {
+      "epoch": 0.6223374699018337,
+      "grad_norm": 0.44592851400375366,
+      "learning_rate": 1.648529411764706e-05,
+      "loss": 0.007,
+      "step": 5880
+    },
+    {
+      "epoch": 0.624454264017146,
+      "grad_norm": 0.4649290442466736,
+      "learning_rate": 1.619117647058824e-05,
+      "loss": 0.0078,
+      "step": 5900
+    },
+    {
+      "epoch": 0.6265710581324584,
+      "grad_norm": 0.5193443298339844,
+      "learning_rate": 1.5897058823529412e-05,
+      "loss": 0.0086,
+      "step": 5920
+    },
+    {
+      "epoch": 0.6286878522477708,
+      "grad_norm": 0.5165125131607056,
+      "learning_rate": 1.5602941176470588e-05,
+      "loss": 0.0079,
+      "step": 5940
+    },
+    {
+      "epoch": 0.6308046463630831,
+      "grad_norm": 0.5387499928474426,
+      "learning_rate": 1.5308823529411765e-05,
+      "loss": 0.0072,
+      "step": 5960
+    },
+    {
+      "epoch": 0.6329214404783955,
+      "grad_norm": 0.3668934404850006,
+      "learning_rate": 1.5014705882352941e-05,
+      "loss": 0.0063,
+      "step": 5980
+    },
+    {
+      "epoch": 0.6350382345937078,
+      "grad_norm": 0.880902886390686,
+      "learning_rate": 1.472058823529412e-05,
+      "loss": 0.0071,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6371550287090202,
+      "grad_norm": 0.6322916150093079,
+      "learning_rate": 1.4426470588235294e-05,
+      "loss": 0.0066,
+      "step": 6020
+    },
+    {
+      "epoch": 0.6392718228243326,
+      "grad_norm": 0.6748706698417664,
+      "learning_rate": 1.4132352941176472e-05,
+      "loss": 0.0071,
+      "step": 6040
+    },
+    {
+      "epoch": 0.6413886169396449,
+      "grad_norm": 0.20786982774734497,
+      "learning_rate": 1.3838235294117649e-05,
+      "loss": 0.0062,
+      "step": 6060
+    },
+    {
+      "epoch": 0.6435054110549573,
+      "grad_norm": 0.913157045841217,
+      "learning_rate": 1.3544117647058824e-05,
+      "loss": 0.0071,
+      "step": 6080
+    },
+    {
+      "epoch": 0.6456222051702696,
+      "grad_norm": 0.38529446721076965,
+      "learning_rate": 1.3250000000000002e-05,
+      "loss": 0.0061,
+      "step": 6100
+    },
+    {
+      "epoch": 0.647738999285582,
+      "grad_norm": 0.35093656182289124,
+      "learning_rate": 1.2955882352941177e-05,
+      "loss": 0.0064,
+      "step": 6120
+    },
+    {
+      "epoch": 0.6498557934008944,
+      "grad_norm": 0.4418446123600006,
+      "learning_rate": 1.2661764705882353e-05,
+      "loss": 0.0069,
+      "step": 6140
+    },
+    {
+      "epoch": 0.6519725875162067,
+      "grad_norm": 0.20957966148853302,
+      "learning_rate": 1.236764705882353e-05,
+      "loss": 0.0059,
+      "step": 6160
+    },
+    {
+      "epoch": 0.6540893816315191,
+      "grad_norm": 0.2830004394054413,
+      "learning_rate": 1.2073529411764708e-05,
+      "loss": 0.0068,
+      "step": 6180
+    },
+    {
+      "epoch": 0.6562061757468314,
+      "grad_norm": 0.20361077785491943,
+      "learning_rate": 1.1779411764705883e-05,
+      "loss": 0.0075,
+      "step": 6200
+    },
+    {
+      "epoch": 0.6583229698621438,
+      "grad_norm": 0.44711142778396606,
+      "learning_rate": 1.148529411764706e-05,
+      "loss": 0.0063,
+      "step": 6220
+    },
+    {
+      "epoch": 0.6604397639774562,
+      "grad_norm": 0.361794650554657,
+      "learning_rate": 1.1191176470588236e-05,
+      "loss": 0.0087,
+      "step": 6240
+    },
+    {
+      "epoch": 0.6625565580927685,
+      "grad_norm": 0.4405740797519684,
+      "learning_rate": 1.0897058823529412e-05,
+      "loss": 0.0067,
+      "step": 6260
+    },
+    {
+      "epoch": 0.6646733522080809,
+      "grad_norm": 0.4276063144207001,
+      "learning_rate": 1.0602941176470589e-05,
+      "loss": 0.0058,
+      "step": 6280
+    },
+    {
+      "epoch": 0.6667901463233932,
+      "grad_norm": 0.267135351896286,
+      "learning_rate": 1.0308823529411765e-05,
+      "loss": 0.0065,
+      "step": 6300
+    },
+    {
+      "epoch": 0.6689069404387056,
+      "grad_norm": 0.41200748085975647,
+      "learning_rate": 1.0014705882352942e-05,
+      "loss": 0.0064,
+      "step": 6320
+    },
+    {
+      "epoch": 0.671023734554018,
+      "grad_norm": 0.30442219972610474,
+      "learning_rate": 9.720588235294118e-06,
+      "loss": 0.0071,
+      "step": 6340
+    },
+    {
+      "epoch": 0.6731405286693303,
+      "grad_norm": 0.5211018919944763,
+      "learning_rate": 9.426470588235295e-06,
+      "loss": 0.0052,
+      "step": 6360
+    },
+    {
+      "epoch": 0.6752573227846427,
+      "grad_norm": 0.3820931315422058,
+      "learning_rate": 9.132352941176471e-06,
+      "loss": 0.0058,
+      "step": 6380
+    },
+    {
+      "epoch": 0.677374116899955,
+      "grad_norm": 0.37423545122146606,
+      "learning_rate": 8.838235294117648e-06,
+      "loss": 0.0059,
+      "step": 6400
+    },
+    {
+      "epoch": 0.6794909110152674,
+      "grad_norm": 0.42695438861846924,
+      "learning_rate": 8.544117647058822e-06,
+      "loss": 0.006,
+      "step": 6420
+    },
+    {
+      "epoch": 0.6816077051305798,
+      "grad_norm": 0.37079957127571106,
+      "learning_rate": 8.25e-06,
+      "loss": 0.0076,
+      "step": 6440
+    },
+    {
+      "epoch": 0.6837244992458921,
+      "grad_norm": 0.17029517889022827,
+      "learning_rate": 7.955882352941177e-06,
+      "loss": 0.0055,
+      "step": 6460
+    },
+    {
+      "epoch": 0.6858412933612045,
+      "grad_norm": 0.30178797245025635,
+      "learning_rate": 7.661764705882354e-06,
+      "loss": 0.0053,
+      "step": 6480
+    },
+    {
+      "epoch": 0.6879580874765168,
+      "grad_norm": 0.2955467104911804,
+      "learning_rate": 7.367647058823529e-06,
+      "loss": 0.0069,
+      "step": 6500
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 7000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.770133440952634e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87c1838316ae8c7df2b7eb5f039022e01d00f71f28b5a09877cd72af98fb0743
+size 5969

internvl3_1b_lora_7000_20260304_104032/checkpoint-6500/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: /root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:/root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "/root/autodl-tmp/LEAP_assets/models/InternVL3-1B-hf",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "q_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:477228bf6d531c12034f0409dbc13c57f3297227d18aa23baa6c90c90351224d
+size 18138288

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "</box>": 151673,
+  "</img>": 151666,
+  "</quad>": 151669,
+  "</ref>": 151671,
+  "</tool_call>": 151658,
+  "<IMG_CONTEXT>": 151667,
+  "<box>": 151672,
+  "<img>": 151665,
+  "<quad>": 151668,
+  "<ref>": 151670,
+  "<tool_call>": 151657,
+  "<video>": 151674,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,6 @@

+{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+'}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<IMG_CONTEXT>
+' }}{% elif content['type'] == 'video' %}{{ '<video>
+' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
+'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
+' }}{% endif %}

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,44 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "start_image_token": "<img>",
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7cc80b7e20adf8bf6f6ca442bf1abfac8056bb3b7d3e0b11c9d497d3e79398c9
+size 11423732

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,306 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151665": {
+      "content": "<img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151666": {
+      "content": "</img>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151667": {
+      "content": "<IMG_CONTEXT>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151668": {
+      "content": "<quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151669": {
+      "content": "</quad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151670": {
+      "content": "<ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151671": {
+      "content": "</ref>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151672": {
+      "content": "<box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151673": {
+      "content": "</box>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151674": {
+      "content": "<video>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>",
+    "<img>",
+    "</img>",
+    "<IMG_CONTEXT>",
+    "<quad>",
+    "</quad>",
+    "<ref>",
+    "</ref>",
+    "<box>",
+    "</box>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "context_image_token": "<IMG_CONTEXT>",
+  "end_image_token": "</img>",
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {
+    "context_image_token": "<IMG_CONTEXT>",
+    "end_image_token": "</img>",
+    "start_image_token": "<img>",
+    "video_token": "<video>"
+  },
+  "model_max_length": 4096,
+  "pad_token": "<|endoftext|>",
+  "return_token_type_ids": false,
+  "split_special_tokens": false,
+  "start_image_token": "<img>",
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "video_token": "<video>"
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2491 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.7408779403593257,
+  "eval_steps": 500,
+  "global_step": 7000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00010583970576561797,
+      "grad_norm": 2.33493709564209,
+      "learning_rate": 0.0,
+      "loss": 1.2764,
+      "step": 1
+    },
+    {
+      "epoch": 0.0021167941153123595,
+      "grad_norm": 2.771491765975952,
+      "learning_rate": 9.5e-06,
+      "loss": 1.6307,
+      "step": 20
+    },
+    {
+      "epoch": 0.004233588230624719,
+      "grad_norm": 1.6168005466461182,
+      "learning_rate": 1.9500000000000003e-05,
+      "loss": 1.5457,
+      "step": 40
+    },
+    {
+      "epoch": 0.006350382345937078,
+      "grad_norm": 1.033608317375183,
+      "learning_rate": 2.95e-05,
+      "loss": 1.333,
+      "step": 60
+    },
+    {
+      "epoch": 0.008467176461249438,
+      "grad_norm": 0.9740731716156006,
+      "learning_rate": 3.9500000000000005e-05,
+      "loss": 1.1355,
+      "step": 80
+    },
+    {
+      "epoch": 0.010583970576561796,
+      "grad_norm": 1.2390116453170776,
+      "learning_rate": 4.9500000000000004e-05,
+      "loss": 1.0417,
+      "step": 100
+    },
+    {
+      "epoch": 0.012700764691874157,
+      "grad_norm": 1.3689967393875122,
+      "learning_rate": 5.95e-05,
+      "loss": 0.9421,
+      "step": 120
+    },
+    {
+      "epoch": 0.014817558807186515,
+      "grad_norm": 2.2537403106689453,
+      "learning_rate": 6.95e-05,
+      "loss": 0.9362,
+      "step": 140
+    },
+    {
+      "epoch": 0.016934352922498876,
+      "grad_norm": 1.8407371044158936,
+      "learning_rate": 7.950000000000001e-05,
+      "loss": 0.8148,
+      "step": 160
+    },
+    {
+      "epoch": 0.019051147037811234,
+      "grad_norm": 1.8288472890853882,
+      "learning_rate": 8.950000000000001e-05,
+      "loss": 0.7605,
+      "step": 180
+    },
+    {
+      "epoch": 0.021167941153123593,
+      "grad_norm": 2.447781562805176,
+      "learning_rate": 9.95e-05,
+      "loss": 0.6912,
+      "step": 200
+    },
+    {
+      "epoch": 0.023284735268435955,
+      "grad_norm": 2.366830825805664,
+      "learning_rate": 9.972058823529412e-05,
+      "loss": 0.6242,
+      "step": 220
+    },
+    {
+      "epoch": 0.025401529383748313,
+      "grad_norm": 2.799335479736328,
+      "learning_rate": 9.94264705882353e-05,
+      "loss": 0.5381,
+      "step": 240
+    },
+    {
+      "epoch": 0.027518323499060672,
+      "grad_norm": 2.6497650146484375,
+      "learning_rate": 9.913235294117647e-05,
+      "loss": 0.5163,
+      "step": 260
+    },
+    {
+      "epoch": 0.02963511761437303,
+      "grad_norm": 3.2764251232147217,
+      "learning_rate": 9.883823529411765e-05,
+      "loss": 0.474,
+      "step": 280
+    },
+    {
+      "epoch": 0.03175191172968539,
+      "grad_norm": 3.223743200302124,
+      "learning_rate": 9.854411764705883e-05,
+      "loss": 0.461,
+      "step": 300
+    },
+    {
+      "epoch": 0.03386870584499775,
+      "grad_norm": 3.198038339614868,
+      "learning_rate": 9.825e-05,
+      "loss": 0.4216,
+      "step": 320
+    },
+    {
+      "epoch": 0.03598549996031011,
+      "grad_norm": 3.033092737197876,
+      "learning_rate": 9.795588235294119e-05,
+      "loss": 0.3453,
+      "step": 340
+    },
+    {
+      "epoch": 0.03810229407562247,
+      "grad_norm": 2.908698797225952,
+      "learning_rate": 9.766176470588236e-05,
+      "loss": 0.3506,
+      "step": 360
+    },
+    {
+      "epoch": 0.04021908819093483,
+      "grad_norm": 3.772873878479004,
+      "learning_rate": 9.736764705882353e-05,
+      "loss": 0.3381,
+      "step": 380
+    },
+    {
+      "epoch": 0.042335882306247186,
+      "grad_norm": 2.692840337753296,
+      "learning_rate": 9.707352941176471e-05,
+      "loss": 0.2868,
+      "step": 400
+    },
+    {
+      "epoch": 0.04445267642155955,
+      "grad_norm": 3.629152297973633,
+      "learning_rate": 9.677941176470589e-05,
+      "loss": 0.2845,
+      "step": 420
+    },
+    {
+      "epoch": 0.04656947053687191,
+      "grad_norm": 4.045558929443359,
+      "learning_rate": 9.648529411764706e-05,
+      "loss": 0.275,
+      "step": 440
+    },
+    {
+      "epoch": 0.048686264652184265,
+      "grad_norm": 2.3519065380096436,
+      "learning_rate": 9.619117647058824e-05,
+      "loss": 0.2254,
+      "step": 460
+    },
+    {
+      "epoch": 0.05080305876749663,
+      "grad_norm": 2.71055269241333,
+      "learning_rate": 9.589705882352941e-05,
+      "loss": 0.2356,
+      "step": 480
+    },
+    {
+      "epoch": 0.05291985288280899,
+      "grad_norm": 3.037832021713257,
+      "learning_rate": 9.560294117647059e-05,
+      "loss": 0.2,
+      "step": 500
+    },
+    {
+      "epoch": 0.055036646998121344,
+      "grad_norm": 3.366894245147705,
+      "learning_rate": 9.530882352941177e-05,
+      "loss": 0.1888,
+      "step": 520
+    },
+    {
+      "epoch": 0.057153441113433706,
+      "grad_norm": 2.728973865509033,
+      "learning_rate": 9.501470588235294e-05,
+      "loss": 0.1844,
+      "step": 540
+    },
+    {
+      "epoch": 0.05927023522874606,
+      "grad_norm": 2.229743719100952,
+      "learning_rate": 9.472058823529412e-05,
+      "loss": 0.1658,
+      "step": 560
+    },
+    {
+      "epoch": 0.06138702934405842,
+      "grad_norm": 2.3469460010528564,
+      "learning_rate": 9.44264705882353e-05,
+      "loss": 0.1556,
+      "step": 580
+    },
+    {
+      "epoch": 0.06350382345937078,
+      "grad_norm": 2.338606595993042,
+      "learning_rate": 9.413235294117647e-05,
+      "loss": 0.1391,
+      "step": 600
+    },
+    {
+      "epoch": 0.06562061757468314,
+      "grad_norm": 1.9111056327819824,
+      "learning_rate": 9.383823529411765e-05,
+      "loss": 0.1606,
+      "step": 620
+    },
+    {
+      "epoch": 0.0677374116899955,
+      "grad_norm": 3.3568716049194336,
+      "learning_rate": 9.354411764705883e-05,
+      "loss": 0.1664,
+      "step": 640
+    },
+    {
+      "epoch": 0.06985420580530786,
+      "grad_norm": 3.88547945022583,
+      "learning_rate": 9.325e-05,
+      "loss": 0.1494,
+      "step": 660
+    },
+    {
+      "epoch": 0.07197099992062023,
+      "grad_norm": 2.3967244625091553,
+      "learning_rate": 9.295588235294118e-05,
+      "loss": 0.141,
+      "step": 680
+    },
+    {
+      "epoch": 0.07408779403593257,
+      "grad_norm": 3.0165176391601562,
+      "learning_rate": 9.266176470588236e-05,
+      "loss": 0.1409,
+      "step": 700
+    },
+    {
+      "epoch": 0.07620458815124494,
+      "grad_norm": 3.2665436267852783,
+      "learning_rate": 9.236764705882353e-05,
+      "loss": 0.1112,
+      "step": 720
+    },
+    {
+      "epoch": 0.0783213822665573,
+      "grad_norm": 2.3310046195983887,
+      "learning_rate": 9.207352941176471e-05,
+      "loss": 0.1396,
+      "step": 740
+    },
+    {
+      "epoch": 0.08043817638186966,
+      "grad_norm": 1.8768619298934937,
+      "learning_rate": 9.177941176470589e-05,
+      "loss": 0.1114,
+      "step": 760
+    },
+    {
+      "epoch": 0.08255497049718202,
+      "grad_norm": 3.4282712936401367,
+      "learning_rate": 9.148529411764706e-05,
+      "loss": 0.1073,
+      "step": 780
+    },
+    {
+      "epoch": 0.08467176461249437,
+      "grad_norm": 3.2704601287841797,
+      "learning_rate": 9.119117647058824e-05,
+      "loss": 0.1281,
+      "step": 800
+    },
+    {
+      "epoch": 0.08678855872780673,
+      "grad_norm": 2.225818157196045,
+      "learning_rate": 9.089705882352942e-05,
+      "loss": 0.1046,
+      "step": 820
+    },
+    {
+      "epoch": 0.0889053528431191,
+      "grad_norm": 2.078011989593506,
+      "learning_rate": 9.060294117647059e-05,
+      "loss": 0.1033,
+      "step": 840
+    },
+    {
+      "epoch": 0.09102214695843146,
+      "grad_norm": 1.3325825929641724,
+      "learning_rate": 9.030882352941177e-05,
+      "loss": 0.0872,
+      "step": 860
+    },
+    {
+      "epoch": 0.09313894107374382,
+      "grad_norm": 3.0471086502075195,
+      "learning_rate": 9.001470588235294e-05,
+      "loss": 0.0927,
+      "step": 880
+    },
+    {
+      "epoch": 0.09525573518905617,
+      "grad_norm": 2.9685380458831787,
+      "learning_rate": 8.972058823529412e-05,
+      "loss": 0.0917,
+      "step": 900
+    },
+    {
+      "epoch": 0.09737252930436853,
+      "grad_norm": 1.8589142560958862,
+      "learning_rate": 8.94264705882353e-05,
+      "loss": 0.0836,
+      "step": 920
+    },
+    {
+      "epoch": 0.09948932341968089,
+      "grad_norm": 1.523457407951355,
+      "learning_rate": 8.913235294117647e-05,
+      "loss": 0.0862,
+      "step": 940
+    },
+    {
+      "epoch": 0.10160611753499325,
+      "grad_norm": 1.4009277820587158,
+      "learning_rate": 8.883823529411765e-05,
+      "loss": 0.068,
+      "step": 960
+    },
+    {
+      "epoch": 0.10372291165030562,
+      "grad_norm": 2.0816826820373535,
+      "learning_rate": 8.854411764705883e-05,
+      "loss": 0.0741,
+      "step": 980
+    },
+    {
+      "epoch": 0.10583970576561798,
+      "grad_norm": 2.218278408050537,
+      "learning_rate": 8.825e-05,
+      "loss": 0.0822,
+      "step": 1000
+    },
+    {
+      "epoch": 0.10795649988093033,
+      "grad_norm": 1.188503623008728,
+      "learning_rate": 8.795588235294118e-05,
+      "loss": 0.0792,
+      "step": 1020
+    },
+    {
+      "epoch": 0.11007329399624269,
+      "grad_norm": 0.9847146272659302,
+      "learning_rate": 8.766176470588236e-05,
+      "loss": 0.0696,
+      "step": 1040
+    },
+    {
+      "epoch": 0.11219008811155505,
+      "grad_norm": 3.0967068672180176,
+      "learning_rate": 8.736764705882353e-05,
+      "loss": 0.0842,
+      "step": 1060
+    },
+    {
+      "epoch": 0.11430688222686741,
+      "grad_norm": 2.4966516494750977,
+      "learning_rate": 8.707352941176471e-05,
+      "loss": 0.065,
+      "step": 1080
+    },
+    {
+      "epoch": 0.11642367634217977,
+      "grad_norm": 1.7355480194091797,
+      "learning_rate": 8.677941176470589e-05,
+      "loss": 0.0672,
+      "step": 1100
+    },
+    {
+      "epoch": 0.11854047045749212,
+      "grad_norm": 2.5048105716705322,
+      "learning_rate": 8.648529411764706e-05,
+      "loss": 0.0616,
+      "step": 1120
+    },
+    {
+      "epoch": 0.12065726457280448,
+      "grad_norm": 1.285093903541565,
+      "learning_rate": 8.619117647058824e-05,
+      "loss": 0.0676,
+      "step": 1140
+    },
+    {
+      "epoch": 0.12277405868811685,
+      "grad_norm": 1.58004891872406,
+      "learning_rate": 8.589705882352942e-05,
+      "loss": 0.0827,
+      "step": 1160
+    },
+    {
+      "epoch": 0.12489085280342921,
+      "grad_norm": 1.571897029876709,
+      "learning_rate": 8.560294117647059e-05,
+      "loss": 0.0663,
+      "step": 1180
+    },
+    {
+      "epoch": 0.12700764691874156,
+      "grad_norm": 0.8998542428016663,
+      "learning_rate": 8.530882352941177e-05,
+      "loss": 0.0448,
+      "step": 1200
+    },
+    {
+      "epoch": 0.12912444103405393,
+      "grad_norm": 1.577183485031128,
+      "learning_rate": 8.501470588235295e-05,
+      "loss": 0.0574,
+      "step": 1220
+    },
+    {
+      "epoch": 0.13124123514936628,
+      "grad_norm": 1.7241120338439941,
+      "learning_rate": 8.472058823529412e-05,
+      "loss": 0.0586,
+      "step": 1240
+    },
+    {
+      "epoch": 0.13335802926467866,
+      "grad_norm": 1.4512884616851807,
+      "learning_rate": 8.44264705882353e-05,
+      "loss": 0.0457,
+      "step": 1260
+    },
+    {
+      "epoch": 0.135474823379991,
+      "grad_norm": 1.4320181608200073,
+      "learning_rate": 8.413235294117647e-05,
+      "loss": 0.0572,
+      "step": 1280
+    },
+    {
+      "epoch": 0.13759161749530335,
+      "grad_norm": 2.4721877574920654,
+      "learning_rate": 8.383823529411765e-05,
+      "loss": 0.0539,
+      "step": 1300
+    },
+    {
+      "epoch": 0.13970841161061573,
+      "grad_norm": 1.230265498161316,
+      "learning_rate": 8.354411764705883e-05,
+      "loss": 0.0524,
+      "step": 1320
+    },
+    {
+      "epoch": 0.14182520572592808,
+      "grad_norm": 1.5039700269699097,
+      "learning_rate": 8.325e-05,
+      "loss": 0.0569,
+      "step": 1340
+    },
+    {
+      "epoch": 0.14394199984124045,
+      "grad_norm": 1.9928780794143677,
+      "learning_rate": 8.295588235294118e-05,
+      "loss": 0.0402,
+      "step": 1360
+    },
+    {
+      "epoch": 0.1460587939565528,
+      "grad_norm": 1.8550405502319336,
+      "learning_rate": 8.266176470588236e-05,
+      "loss": 0.054,
+      "step": 1380
+    },
+    {
+      "epoch": 0.14817558807186515,
+      "grad_norm": 1.0700241327285767,
+      "learning_rate": 8.236764705882353e-05,
+      "loss": 0.052,
+      "step": 1400
+    },
+    {
+      "epoch": 0.15029238218717753,
+      "grad_norm": 1.7121262550354004,
+      "learning_rate": 8.207352941176471e-05,
+      "loss": 0.0437,
+      "step": 1420
+    },
+    {
+      "epoch": 0.15240917630248987,
+      "grad_norm": 1.3593100309371948,
+      "learning_rate": 8.177941176470589e-05,
+      "loss": 0.0393,
+      "step": 1440
+    },
+    {
+      "epoch": 0.15452597041780225,
+      "grad_norm": 1.080735683441162,
+      "learning_rate": 8.148529411764706e-05,
+      "loss": 0.035,
+      "step": 1460
+    },
+    {
+      "epoch": 0.1566427645331146,
+      "grad_norm": 1.5516977310180664,
+      "learning_rate": 8.119117647058824e-05,
+      "loss": 0.0421,
+      "step": 1480
+    },
+    {
+      "epoch": 0.15875955864842695,
+      "grad_norm": 1.107473373413086,
+      "learning_rate": 8.089705882352942e-05,
+      "loss": 0.0442,
+      "step": 1500
+    },
+    {
+      "epoch": 0.16087635276373932,
+      "grad_norm": 2.196147918701172,
+      "learning_rate": 8.060294117647059e-05,
+      "loss": 0.0443,
+      "step": 1520
+    },
+    {
+      "epoch": 0.16299314687905167,
+      "grad_norm": 1.4532606601715088,
+      "learning_rate": 8.030882352941177e-05,
+      "loss": 0.0417,
+      "step": 1540
+    },
+    {
+      "epoch": 0.16510994099436405,
+      "grad_norm": 3.0167882442474365,
+      "learning_rate": 8.001470588235295e-05,
+      "loss": 0.0472,
+      "step": 1560
+    },
+    {
+      "epoch": 0.1672267351096764,
+      "grad_norm": 1.764201283454895,
+      "learning_rate": 7.972058823529412e-05,
+      "loss": 0.031,
+      "step": 1580
+    },
+    {
+      "epoch": 0.16934352922498874,
+      "grad_norm": 0.8682387471199036,
+      "learning_rate": 7.94264705882353e-05,
+      "loss": 0.0291,
+      "step": 1600
+    },
+    {
+      "epoch": 0.17146032334030112,
+      "grad_norm": 0.660894513130188,
+      "learning_rate": 7.913235294117648e-05,
+      "loss": 0.0572,
+      "step": 1620
+    },
+    {
+      "epoch": 0.17357711745561347,
+      "grad_norm": 1.7611377239227295,
+      "learning_rate": 7.883823529411765e-05,
+      "loss": 0.0453,
+      "step": 1640
+    },
+    {
+      "epoch": 0.17569391157092584,
+      "grad_norm": 0.6341773867607117,
+      "learning_rate": 7.854411764705883e-05,
+      "loss": 0.0299,
+      "step": 1660
+    },
+    {
+      "epoch": 0.1778107056862382,
+      "grad_norm": 1.4031453132629395,
+      "learning_rate": 7.825e-05,
+      "loss": 0.0358,
+      "step": 1680
+    },
+    {
+      "epoch": 0.17992749980155054,
+      "grad_norm": 1.0830997228622437,
+      "learning_rate": 7.795588235294118e-05,
+      "loss": 0.0373,
+      "step": 1700
+    },
+    {
+      "epoch": 0.18204429391686291,
+      "grad_norm": 0.6576260924339294,
+      "learning_rate": 7.766176470588236e-05,
+      "loss": 0.0587,
+      "step": 1720
+    },
+    {
+      "epoch": 0.18416108803217526,
+      "grad_norm": 1.2640115022659302,
+      "learning_rate": 7.736764705882353e-05,
+      "loss": 0.0468,
+      "step": 1740
+    },
+    {
+      "epoch": 0.18627788214748764,
+      "grad_norm": 1.0660518407821655,
+      "learning_rate": 7.707352941176471e-05,
+      "loss": 0.0466,
+      "step": 1760
+    },
+    {
+      "epoch": 0.1883946762628,
+      "grad_norm": 1.22067129611969,
+      "learning_rate": 7.677941176470589e-05,
+      "loss": 0.0335,
+      "step": 1780
+    },
+    {
+      "epoch": 0.19051147037811234,
+      "grad_norm": 4.800387859344482,
+      "learning_rate": 7.648529411764706e-05,
+      "loss": 0.0461,
+      "step": 1800
+    },
+    {
+      "epoch": 0.1926282644934247,
+      "grad_norm": 1.1434308290481567,
+      "learning_rate": 7.619117647058824e-05,
+      "loss": 0.0326,
+      "step": 1820
+    },
+    {
+      "epoch": 0.19474505860873706,
+      "grad_norm": 0.8925223350524902,
+      "learning_rate": 7.589705882352942e-05,
+      "loss": 0.0273,
+      "step": 1840
+    },
+    {
+      "epoch": 0.19686185272404944,
+      "grad_norm": 1.1678693294525146,
+      "learning_rate": 7.560294117647059e-05,
+      "loss": 0.0345,
+      "step": 1860
+    },
+    {
+      "epoch": 0.19897864683936178,
+      "grad_norm": 0.559644341468811,
+      "learning_rate": 7.530882352941177e-05,
+      "loss": 0.0394,
+      "step": 1880
+    },
+    {
+      "epoch": 0.20109544095467416,
+      "grad_norm": 1.4313390254974365,
+      "learning_rate": 7.501470588235295e-05,
+      "loss": 0.0475,
+      "step": 1900
+    },
+    {
+      "epoch": 0.2032122350699865,
+      "grad_norm": 1.2470778226852417,
+      "learning_rate": 7.472058823529412e-05,
+      "loss": 0.0317,
+      "step": 1920
+    },
+    {
+      "epoch": 0.20532902918529886,
+      "grad_norm": 1.390359878540039,
+      "learning_rate": 7.44264705882353e-05,
+      "loss": 0.0268,
+      "step": 1940
+    },
+    {
+      "epoch": 0.20744582330061123,
+      "grad_norm": 0.6755140423774719,
+      "learning_rate": 7.413235294117648e-05,
+      "loss": 0.0331,
+      "step": 1960
+    },
+    {
+      "epoch": 0.20956261741592358,
+      "grad_norm": 0.31457772850990295,
+      "learning_rate": 7.383823529411765e-05,
+      "loss": 0.0447,
+      "step": 1980
+    },
+    {
+      "epoch": 0.21167941153123596,
+      "grad_norm": 1.6619377136230469,
+      "learning_rate": 7.354411764705883e-05,
+      "loss": 0.0336,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2137962056465483,
+      "grad_norm": 1.033492088317871,
+      "learning_rate": 7.325e-05,
+      "loss": 0.0304,
+      "step": 2020
+    },
+    {
+      "epoch": 0.21591299976186065,
+      "grad_norm": 0.730675220489502,
+      "learning_rate": 7.295588235294118e-05,
+      "loss": 0.0311,
+      "step": 2040
+    },
+    {
+      "epoch": 0.21802979387717303,
+      "grad_norm": 0.6322308778762817,
+      "learning_rate": 7.266176470588236e-05,
+      "loss": 0.0258,
+      "step": 2060
+    },
+    {
+      "epoch": 0.22014658799248538,
+      "grad_norm": 0.7560809254646301,
+      "learning_rate": 7.236764705882353e-05,
+      "loss": 0.0213,
+      "step": 2080
+    },
+    {
+      "epoch": 0.22226338210779775,
+      "grad_norm": 1.1907991170883179,
+      "learning_rate": 7.207352941176471e-05,
+      "loss": 0.0311,
+      "step": 2100
+    },
+    {
+      "epoch": 0.2243801762231101,
+      "grad_norm": 0.6392427086830139,
+      "learning_rate": 7.177941176470589e-05,
+      "loss": 0.0302,
+      "step": 2120
+    },
+    {
+      "epoch": 0.22649697033842245,
+      "grad_norm": 1.0621793270111084,
+      "learning_rate": 7.148529411764706e-05,
+      "loss": 0.0257,
+      "step": 2140
+    },
+    {
+      "epoch": 0.22861376445373482,
+      "grad_norm": 0.8459914326667786,
+      "learning_rate": 7.119117647058824e-05,
+      "loss": 0.0249,
+      "step": 2160
+    },
+    {
+      "epoch": 0.23073055856904717,
+      "grad_norm": 1.5384963750839233,
+      "learning_rate": 7.089705882352942e-05,
+      "loss": 0.0221,
+      "step": 2180
+    },
+    {
+      "epoch": 0.23284735268435955,
+      "grad_norm": 0.920907199382782,
+      "learning_rate": 7.06029411764706e-05,
+      "loss": 0.0307,
+      "step": 2200
+    },
+    {
+      "epoch": 0.2349641467996719,
+      "grad_norm": 1.1640409231185913,
+      "learning_rate": 7.030882352941177e-05,
+      "loss": 0.0302,
+      "step": 2220
+    },
+    {
+      "epoch": 0.23708094091498425,
+      "grad_norm": 0.7336745858192444,
+      "learning_rate": 7.001470588235295e-05,
+      "loss": 0.0286,
+      "step": 2240
+    },
+    {
+      "epoch": 0.23919773503029662,
+      "grad_norm": 1.9110276699066162,
+      "learning_rate": 6.972058823529412e-05,
+      "loss": 0.0303,
+      "step": 2260
+    },
+    {
+      "epoch": 0.24131452914560897,
+      "grad_norm": 0.9055470824241638,
+      "learning_rate": 6.94264705882353e-05,
+      "loss": 0.0241,
+      "step": 2280
+    },
+    {
+      "epoch": 0.24343132326092135,
+      "grad_norm": 1.063379168510437,
+      "learning_rate": 6.913235294117648e-05,
+      "loss": 0.0244,
+      "step": 2300
+    },
+    {
+      "epoch": 0.2455481173762337,
+      "grad_norm": 1.0067662000656128,
+      "learning_rate": 6.883823529411765e-05,
+      "loss": 0.026,
+      "step": 2320
+    },
+    {
+      "epoch": 0.24766491149154604,
+      "grad_norm": 1.1639182567596436,
+      "learning_rate": 6.854411764705883e-05,
+      "loss": 0.0253,
+      "step": 2340
+    },
+    {
+      "epoch": 0.24978170560685842,
+      "grad_norm": 0.9918274879455566,
+      "learning_rate": 6.825e-05,
+      "loss": 0.0218,
+      "step": 2360
+    },
+    {
+      "epoch": 0.25189849972217077,
+      "grad_norm": 0.7681129574775696,
+      "learning_rate": 6.795588235294118e-05,
+      "loss": 0.0212,
+      "step": 2380
+    },
+    {
+      "epoch": 0.2540152938374831,
+      "grad_norm": 0.7643230557441711,
+      "learning_rate": 6.766176470588236e-05,
+      "loss": 0.021,
+      "step": 2400
+    },
+    {
+      "epoch": 0.2561320879527955,
+      "grad_norm": 1.2285891771316528,
+      "learning_rate": 6.736764705882354e-05,
+      "loss": 0.0194,
+      "step": 2420
+    },
+    {
+      "epoch": 0.25824888206810787,
+      "grad_norm": 0.5345446467399597,
+      "learning_rate": 6.707352941176471e-05,
+      "loss": 0.0211,
+      "step": 2440
+    },
+    {
+      "epoch": 0.2603656761834202,
+      "grad_norm": 0.7964244484901428,
+      "learning_rate": 6.677941176470589e-05,
+      "loss": 0.024,
+      "step": 2460
+    },
+    {
+      "epoch": 0.26248247029873256,
+      "grad_norm": 0.5538131594657898,
+      "learning_rate": 6.648529411764705e-05,
+      "loss": 0.0258,
+      "step": 2480
+    },
+    {
+      "epoch": 0.2645992644140449,
+      "grad_norm": 0.9520718455314636,
+      "learning_rate": 6.619117647058823e-05,
+      "loss": 0.0178,
+      "step": 2500
+    },
+    {
+      "epoch": 0.2667160585293573,
+      "grad_norm": 0.6036665439605713,
+      "learning_rate": 6.589705882352942e-05,
+      "loss": 0.0193,
+      "step": 2520
+    },
+    {
+      "epoch": 0.26883285264466966,
+      "grad_norm": 0.37941470742225647,
+      "learning_rate": 6.56029411764706e-05,
+      "loss": 0.0184,
+      "step": 2540
+    },
+    {
+      "epoch": 0.270949646759982,
+      "grad_norm": 0.3956536650657654,
+      "learning_rate": 6.530882352941177e-05,
+      "loss": 0.0239,
+      "step": 2560
+    },
+    {
+      "epoch": 0.27306644087529436,
+      "grad_norm": 0.4313443899154663,
+      "learning_rate": 6.501470588235295e-05,
+      "loss": 0.0185,
+      "step": 2580
+    },
+    {
+      "epoch": 0.2751832349906067,
+      "grad_norm": 1.083382248878479,
+      "learning_rate": 6.472058823529412e-05,
+      "loss": 0.026,
+      "step": 2600
+    },
+    {
+      "epoch": 0.2773000291059191,
+      "grad_norm": 0.8067460060119629,
+      "learning_rate": 6.44264705882353e-05,
+      "loss": 0.0223,
+      "step": 2620
+    },
+    {
+      "epoch": 0.27941682322123146,
+      "grad_norm": 1.2681511640548706,
+      "learning_rate": 6.413235294117648e-05,
+      "loss": 0.0232,
+      "step": 2640
+    },
+    {
+      "epoch": 0.2815336173365438,
+      "grad_norm": 0.5592957139015198,
+      "learning_rate": 6.383823529411765e-05,
+      "loss": 0.0184,
+      "step": 2660
+    },
+    {
+      "epoch": 0.28365041145185615,
+      "grad_norm": 0.5282326936721802,
+      "learning_rate": 6.354411764705883e-05,
+      "loss": 0.0195,
+      "step": 2680
+    },
+    {
+      "epoch": 0.2857672055671685,
+      "grad_norm": 0.5503069758415222,
+      "learning_rate": 6.324999999999999e-05,
+      "loss": 0.0182,
+      "step": 2700
+    },
+    {
+      "epoch": 0.2878839996824809,
+      "grad_norm": 0.9767094254493713,
+      "learning_rate": 6.295588235294117e-05,
+      "loss": 0.0174,
+      "step": 2720
+    },
+    {
+      "epoch": 0.29000079379779325,
+      "grad_norm": 0.5078358054161072,
+      "learning_rate": 6.266176470588236e-05,
+      "loss": 0.0214,
+      "step": 2740
+    },
+    {
+      "epoch": 0.2921175879131056,
+      "grad_norm": 0.8082838654518127,
+      "learning_rate": 6.236764705882354e-05,
+      "loss": 0.0151,
+      "step": 2760
+    },
+    {
+      "epoch": 0.29423438202841795,
+      "grad_norm": 0.49735844135284424,
+      "learning_rate": 6.207352941176471e-05,
+      "loss": 0.0235,
+      "step": 2780
+    },
+    {
+      "epoch": 0.2963511761437303,
+      "grad_norm": 1.0940418243408203,
+      "learning_rate": 6.177941176470589e-05,
+      "loss": 0.016,
+      "step": 2800
+    },
+    {
+      "epoch": 0.2984679702590427,
+      "grad_norm": 0.9790317416191101,
+      "learning_rate": 6.148529411764706e-05,
+      "loss": 0.0204,
+      "step": 2820
+    },
+    {
+      "epoch": 0.30058476437435505,
+      "grad_norm": 0.9905364513397217,
+      "learning_rate": 6.119117647058824e-05,
+      "loss": 0.0189,
+      "step": 2840
+    },
+    {
+      "epoch": 0.3027015584896674,
+      "grad_norm": 0.5084486603736877,
+      "learning_rate": 6.089705882352942e-05,
+      "loss": 0.0216,
+      "step": 2860
+    },
+    {
+      "epoch": 0.30481835260497975,
+      "grad_norm": 0.6312965750694275,
+      "learning_rate": 6.0602941176470594e-05,
+      "loss": 0.0197,
+      "step": 2880
+    },
+    {
+      "epoch": 0.3069351467202921,
+      "grad_norm": 1.0345927476882935,
+      "learning_rate": 6.0308823529411764e-05,
+      "loss": 0.0154,
+      "step": 2900
+    },
+    {
+      "epoch": 0.3090519408356045,
+      "grad_norm": 1.1944761276245117,
+      "learning_rate": 6.001470588235294e-05,
+      "loss": 0.017,
+      "step": 2920
+    },
+    {
+      "epoch": 0.31116873495091685,
+      "grad_norm": 0.6866488456726074,
+      "learning_rate": 5.972058823529412e-05,
+      "loss": 0.0158,
+      "step": 2940
+    },
+    {
+      "epoch": 0.3132855290662292,
+      "grad_norm": 1.0443695783615112,
+      "learning_rate": 5.9426470588235294e-05,
+      "loss": 0.0193,
+      "step": 2960
+    },
+    {
+      "epoch": 0.31540232318154154,
+      "grad_norm": 0.6489245891571045,
+      "learning_rate": 5.913235294117647e-05,
+      "loss": 0.016,
+      "step": 2980
+    },
+    {
+      "epoch": 0.3175191172968539,
+      "grad_norm": 1.388348937034607,
+      "learning_rate": 5.883823529411765e-05,
+      "loss": 0.0284,
+      "step": 3000
+    },
+    {
+      "epoch": 0.3196359114121663,
+      "grad_norm": 0.4919748306274414,
+      "learning_rate": 5.854411764705883e-05,
+      "loss": 0.0205,
+      "step": 3020
+    },
+    {
+      "epoch": 0.32175270552747864,
+      "grad_norm": 0.65608811378479,
+      "learning_rate": 5.8250000000000006e-05,
+      "loss": 0.0159,
+      "step": 3040
+    },
+    {
+      "epoch": 0.323869499642791,
+      "grad_norm": 0.4175134599208832,
+      "learning_rate": 5.795588235294118e-05,
+      "loss": 0.0159,
+      "step": 3060
+    },
+    {
+      "epoch": 0.32598629375810334,
+      "grad_norm": 0.6232139468193054,
+      "learning_rate": 5.766176470588236e-05,
+      "loss": 0.0177,
+      "step": 3080
+    },
+    {
+      "epoch": 0.3281030878734157,
+      "grad_norm": 0.4555909037590027,
+      "learning_rate": 5.7367647058823536e-05,
+      "loss": 0.0138,
+      "step": 3100
+    },
+    {
+      "epoch": 0.3302198819887281,
+      "grad_norm": 0.538420557975769,
+      "learning_rate": 5.7073529411764706e-05,
+      "loss": 0.0158,
+      "step": 3120
+    },
+    {
+      "epoch": 0.33233667610404044,
+      "grad_norm": 0.5802947878837585,
+      "learning_rate": 5.677941176470588e-05,
+      "loss": 0.0155,
+      "step": 3140
+    },
+    {
+      "epoch": 0.3344534702193528,
+      "grad_norm": 0.588239848613739,
+      "learning_rate": 5.648529411764706e-05,
+      "loss": 0.0187,
+      "step": 3160
+    },
+    {
+      "epoch": 0.33657026433466514,
+      "grad_norm": 0.5712038278579712,
+      "learning_rate": 5.6191176470588235e-05,
+      "loss": 0.013,
+      "step": 3180
+    },
+    {
+      "epoch": 0.3386870584499775,
+      "grad_norm": 0.4135841727256775,
+      "learning_rate": 5.589705882352941e-05,
+      "loss": 0.0171,
+      "step": 3200
+    },
+    {
+      "epoch": 0.3408038525652899,
+      "grad_norm": 0.7402490377426147,
+      "learning_rate": 5.560294117647059e-05,
+      "loss": 0.015,
+      "step": 3220
+    },
+    {
+      "epoch": 0.34292064668060224,
+      "grad_norm": 0.5647472143173218,
+      "learning_rate": 5.530882352941177e-05,
+      "loss": 0.0132,
+      "step": 3240
+    },
+    {
+      "epoch": 0.3450374407959146,
+      "grad_norm": 0.7440519332885742,
+      "learning_rate": 5.501470588235295e-05,
+      "loss": 0.0154,
+      "step": 3260
+    },
+    {
+      "epoch": 0.34715423491122693,
+      "grad_norm": 0.40782037377357483,
+      "learning_rate": 5.4720588235294124e-05,
+      "loss": 0.0168,
+      "step": 3280
+    },
+    {
+      "epoch": 0.3492710290265393,
+      "grad_norm": 0.3933939039707184,
+      "learning_rate": 5.44264705882353e-05,
+      "loss": 0.0161,
+      "step": 3300
+    },
+    {
+      "epoch": 0.3513878231418517,
+      "grad_norm": 0.29135826230049133,
+      "learning_rate": 5.413235294117648e-05,
+      "loss": 0.0189,
+      "step": 3320
+    },
+    {
+      "epoch": 0.35350461725716403,
+      "grad_norm": 0.581210196018219,
+      "learning_rate": 5.383823529411765e-05,
+      "loss": 0.0157,
+      "step": 3340
+    },
+    {
+      "epoch": 0.3556214113724764,
+      "grad_norm": 0.4485796391963959,
+      "learning_rate": 5.3544117647058824e-05,
+      "loss": 0.0142,
+      "step": 3360
+    },
+    {
+      "epoch": 0.35773820548778873,
+      "grad_norm": 0.4352544844150543,
+      "learning_rate": 5.325e-05,
+      "loss": 0.0153,
+      "step": 3380
+    },
+    {
+      "epoch": 0.3598549996031011,
+      "grad_norm": 1.0922011137008667,
+      "learning_rate": 5.2955882352941177e-05,
+      "loss": 0.0167,
+      "step": 3400
+    },
+    {
+      "epoch": 0.3619717937184135,
+      "grad_norm": 0.2693778872489929,
+      "learning_rate": 5.266176470588235e-05,
+      "loss": 0.0137,
+      "step": 3420
+    },
+    {
+      "epoch": 0.36408858783372583,
+      "grad_norm": 1.5889476537704468,
+      "learning_rate": 5.236764705882353e-05,
+      "loss": 0.0127,
+      "step": 3440
+    },
+    {
+      "epoch": 0.3662053819490382,
+      "grad_norm": 2.3836777210235596,
+      "learning_rate": 5.207352941176471e-05,
+      "loss": 0.0196,
+      "step": 3460
+    },
+    {
+      "epoch": 0.3683221760643505,
+      "grad_norm": 0.6966289281845093,
+      "learning_rate": 5.177941176470589e-05,
+      "loss": 0.0138,
+      "step": 3480
+    },
+    {
+      "epoch": 0.3704389701796629,
+      "grad_norm": 0.7514053583145142,
+      "learning_rate": 5.1485294117647066e-05,
+      "loss": 0.0143,
+      "step": 3500
+    },
+    {
+      "epoch": 0.3725557642949753,
+      "grad_norm": 0.461103618144989,
+      "learning_rate": 5.119117647058824e-05,
+      "loss": 0.0146,
+      "step": 3520
+    },
+    {
+      "epoch": 0.3746725584102876,
+      "grad_norm": 0.7384988069534302,
+      "learning_rate": 5.089705882352941e-05,
+      "loss": 0.0167,
+      "step": 3540
+    },
+    {
+      "epoch": 0.3767893525256,
+      "grad_norm": 0.7363691329956055,
+      "learning_rate": 5.060294117647059e-05,
+      "loss": 0.0148,
+      "step": 3560
+    },
+    {
+      "epoch": 0.3789061466409123,
+      "grad_norm": 0.4628554582595825,
+      "learning_rate": 5.0308823529411765e-05,
+      "loss": 0.0138,
+      "step": 3580
+    },
+    {
+      "epoch": 0.38102294075622467,
+      "grad_norm": 0.48070573806762695,
+      "learning_rate": 5.001470588235294e-05,
+      "loss": 0.0148,
+      "step": 3600
+    },
+    {
+      "epoch": 0.3831397348715371,
+      "grad_norm": 0.913800835609436,
+      "learning_rate": 4.972058823529412e-05,
+      "loss": 0.0109,
+      "step": 3620
+    },
+    {
+      "epoch": 0.3852565289868494,
+      "grad_norm": 0.5302271842956543,
+      "learning_rate": 4.9426470588235295e-05,
+      "loss": 0.0129,
+      "step": 3640
+    },
+    {
+      "epoch": 0.38737332310216177,
+      "grad_norm": 0.5563445687294006,
+      "learning_rate": 4.913235294117647e-05,
+      "loss": 0.0155,
+      "step": 3660
+    },
+    {
+      "epoch": 0.3894901172174741,
+      "grad_norm": 0.7449616193771362,
+      "learning_rate": 4.8838235294117654e-05,
+      "loss": 0.0139,
+      "step": 3680
+    },
+    {
+      "epoch": 0.3916069113327865,
+      "grad_norm": 0.45803868770599365,
+      "learning_rate": 4.8544117647058824e-05,
+      "loss": 0.0134,
+      "step": 3700
+    },
+    {
+      "epoch": 0.39372370544809887,
+      "grad_norm": 0.4495037794113159,
+      "learning_rate": 4.825e-05,
+      "loss": 0.015,
+      "step": 3720
+    },
+    {
+      "epoch": 0.3958404995634112,
+      "grad_norm": 0.6490349769592285,
+      "learning_rate": 4.795588235294118e-05,
+      "loss": 0.0143,
+      "step": 3740
+    },
+    {
+      "epoch": 0.39795729367872357,
+      "grad_norm": 0.3576687276363373,
+      "learning_rate": 4.7661764705882354e-05,
+      "loss": 0.0118,
+      "step": 3760
+    },
+    {
+      "epoch": 0.4000740877940359,
+      "grad_norm": 0.5015860199928284,
+      "learning_rate": 4.736764705882353e-05,
+      "loss": 0.0169,
+      "step": 3780
+    },
+    {
+      "epoch": 0.4021908819093483,
+      "grad_norm": 1.0271028280258179,
+      "learning_rate": 4.707352941176471e-05,
+      "loss": 0.0119,
+      "step": 3800
+    },
+    {
+      "epoch": 0.40430767602466067,
+      "grad_norm": 0.4724489748477936,
+      "learning_rate": 4.677941176470588e-05,
+      "loss": 0.0112,
+      "step": 3820
+    },
+    {
+      "epoch": 0.406424470139973,
+      "grad_norm": 0.5578377842903137,
+      "learning_rate": 4.648529411764706e-05,
+      "loss": 0.013,
+      "step": 3840
+    },
+    {
+      "epoch": 0.40854126425528536,
+      "grad_norm": 0.6067779660224915,
+      "learning_rate": 4.6191176470588236e-05,
+      "loss": 0.0149,
+      "step": 3860
+    },
+    {
+      "epoch": 0.4106580583705977,
+      "grad_norm": 0.8015718460083008,
+      "learning_rate": 4.589705882352941e-05,
+      "loss": 0.0124,
+      "step": 3880
+    },
+    {
+      "epoch": 0.4127748524859101,
+      "grad_norm": 0.6352400183677673,
+      "learning_rate": 4.5602941176470596e-05,
+      "loss": 0.013,
+      "step": 3900
+    },
+    {
+      "epoch": 0.41489164660122246,
+      "grad_norm": 0.3545617163181305,
+      "learning_rate": 4.5308823529411765e-05,
+      "loss": 0.0117,
+      "step": 3920
+    },
+    {
+      "epoch": 0.4170084407165348,
+      "grad_norm": 0.4562068283557892,
+      "learning_rate": 4.501470588235294e-05,
+      "loss": 0.0118,
+      "step": 3940
+    },
+    {
+      "epoch": 0.41912523483184716,
+      "grad_norm": 0.8685987591743469,
+      "learning_rate": 4.472058823529412e-05,
+      "loss": 0.0126,
+      "step": 3960
+    },
+    {
+      "epoch": 0.4212420289471595,
+      "grad_norm": 0.49269378185272217,
+      "learning_rate": 4.4426470588235295e-05,
+      "loss": 0.0107,
+      "step": 3980
+    },
+    {
+      "epoch": 0.4233588230624719,
+      "grad_norm": 0.7156255841255188,
+      "learning_rate": 4.413235294117647e-05,
+      "loss": 0.0107,
+      "step": 4000
+    },
+    {
+      "epoch": 0.42547561717778426,
+      "grad_norm": 0.6339916586875916,
+      "learning_rate": 4.383823529411765e-05,
+      "loss": 0.0149,
+      "step": 4020
+    },
+    {
+      "epoch": 0.4275924112930966,
+      "grad_norm": 0.6008257269859314,
+      "learning_rate": 4.3544117647058824e-05,
+      "loss": 0.0121,
+      "step": 4040
+    },
+    {
+      "epoch": 0.42970920540840896,
+      "grad_norm": 0.34715619683265686,
+      "learning_rate": 4.325e-05,
+      "loss": 0.0115,
+      "step": 4060
+    },
+    {
+      "epoch": 0.4318259995237213,
+      "grad_norm": 0.6943634152412415,
+      "learning_rate": 4.295588235294118e-05,
+      "loss": 0.0115,
+      "step": 4080
+    },
+    {
+      "epoch": 0.4339427936390337,
+      "grad_norm": 0.5919560194015503,
+      "learning_rate": 4.2661764705882354e-05,
+      "loss": 0.0094,
+      "step": 4100
+    },
+    {
+      "epoch": 0.43605958775434606,
+      "grad_norm": 0.23244401812553406,
+      "learning_rate": 4.236764705882354e-05,
+      "loss": 0.0112,
+      "step": 4120
+    },
+    {
+      "epoch": 0.4381763818696584,
+      "grad_norm": 0.35059890151023865,
+      "learning_rate": 4.207352941176471e-05,
+      "loss": 0.0133,
+      "step": 4140
+    },
+    {
+      "epoch": 0.44029317598497075,
+      "grad_norm": 0.32678091526031494,
+      "learning_rate": 4.1779411764705883e-05,
+      "loss": 0.0113,
+      "step": 4160
+    },
+    {
+      "epoch": 0.4424099701002831,
+      "grad_norm": 0.6617632508277893,
+      "learning_rate": 4.148529411764706e-05,
+      "loss": 0.0105,
+      "step": 4180
+    },
+    {
+      "epoch": 0.4445267642155955,
+      "grad_norm": 0.27029886841773987,
+      "learning_rate": 4.1191176470588236e-05,
+      "loss": 0.0115,
+      "step": 4200
+    },
+    {
+      "epoch": 0.44664355833090785,
+      "grad_norm": 0.7106760144233704,
+      "learning_rate": 4.089705882352941e-05,
+      "loss": 0.0124,
+      "step": 4220
+    },
+    {
+      "epoch": 0.4487603524462202,
+      "grad_norm": 0.5163691639900208,
+      "learning_rate": 4.060294117647059e-05,
+      "loss": 0.0111,
+      "step": 4240
+    },
+    {
+      "epoch": 0.45087714656153255,
+      "grad_norm": 0.7228760123252869,
+      "learning_rate": 4.0308823529411766e-05,
+      "loss": 0.0113,
+      "step": 4260
+    },
+    {
+      "epoch": 0.4529939406768449,
+      "grad_norm": 0.5797919631004333,
+      "learning_rate": 4.001470588235294e-05,
+      "loss": 0.0118,
+      "step": 4280
+    },
+    {
+      "epoch": 0.4551107347921573,
+      "grad_norm": 1.233983039855957,
+      "learning_rate": 3.972058823529412e-05,
+      "loss": 0.0087,
+      "step": 4300
+    },
+    {
+      "epoch": 0.45722752890746965,
+      "grad_norm": 0.657342791557312,
+      "learning_rate": 3.9426470588235295e-05,
+      "loss": 0.0092,
+      "step": 4320
+    },
+    {
+      "epoch": 0.459344323022782,
+      "grad_norm": 0.4171401262283325,
+      "learning_rate": 3.913235294117647e-05,
+      "loss": 0.0161,
+      "step": 4340
+    },
+    {
+      "epoch": 0.46146111713809435,
+      "grad_norm": 0.34782201051712036,
+      "learning_rate": 3.883823529411765e-05,
+      "loss": 0.0103,
+      "step": 4360
+    },
+    {
+      "epoch": 0.4635779112534067,
+      "grad_norm": 0.5111158490180969,
+      "learning_rate": 3.8544117647058825e-05,
+      "loss": 0.0097,
+      "step": 4380
+    },
+    {
+      "epoch": 0.4656947053687191,
+      "grad_norm": 0.5910077095031738,
+      "learning_rate": 3.825e-05,
+      "loss": 0.0176,
+      "step": 4400
+    },
+    {
+      "epoch": 0.46781149948403145,
+      "grad_norm": 0.6808711290359497,
+      "learning_rate": 3.795588235294118e-05,
+      "loss": 0.009,
+      "step": 4420
+    },
+    {
+      "epoch": 0.4699282935993438,
+      "grad_norm": 0.4499869644641876,
+      "learning_rate": 3.7661764705882354e-05,
+      "loss": 0.0106,
+      "step": 4440
+    },
+    {
+      "epoch": 0.47204508771465614,
+      "grad_norm": 0.4361923336982727,
+      "learning_rate": 3.736764705882353e-05,
+      "loss": 0.0097,
+      "step": 4460
+    },
+    {
+      "epoch": 0.4741618818299685,
+      "grad_norm": 0.3171451985836029,
+      "learning_rate": 3.707352941176471e-05,
+      "loss": 0.0092,
+      "step": 4480
+    },
+    {
+      "epoch": 0.4762786759452809,
+      "grad_norm": 0.28628259897232056,
+      "learning_rate": 3.6779411764705884e-05,
+      "loss": 0.0081,
+      "step": 4500
+    },
+    {
+      "epoch": 0.47839547006059324,
+      "grad_norm": 0.5043999552726746,
+      "learning_rate": 3.648529411764706e-05,
+      "loss": 0.0102,
+      "step": 4520
+    },
+    {
+      "epoch": 0.4805122641759056,
+      "grad_norm": 0.3881862163543701,
+      "learning_rate": 3.619117647058824e-05,
+      "loss": 0.0109,
+      "step": 4540
+    },
+    {
+      "epoch": 0.48262905829121794,
+      "grad_norm": 0.6093239188194275,
+      "learning_rate": 3.589705882352941e-05,
+      "loss": 0.0089,
+      "step": 4560
+    },
+    {
+      "epoch": 0.4847458524065303,
+      "grad_norm": 0.4642229378223419,
+      "learning_rate": 3.560294117647059e-05,
+      "loss": 0.0092,
+      "step": 4580
+    },
+    {
+      "epoch": 0.4868626465218427,
+      "grad_norm": 0.4857279062271118,
+      "learning_rate": 3.5308823529411766e-05,
+      "loss": 0.0081,
+      "step": 4600
+    },
+    {
+      "epoch": 0.48897944063715504,
+      "grad_norm": 0.40589526295661926,
+      "learning_rate": 3.501470588235294e-05,
+      "loss": 0.0098,
+      "step": 4620
+    },
+    {
+      "epoch": 0.4910962347524674,
+      "grad_norm": 0.2723426818847656,
+      "learning_rate": 3.472058823529412e-05,
+      "loss": 0.0133,
+      "step": 4640
+    },
+    {
+      "epoch": 0.49321302886777973,
+      "grad_norm": 0.7545261383056641,
+      "learning_rate": 3.4426470588235296e-05,
+      "loss": 0.0103,
+      "step": 4660
+    },
+    {
+      "epoch": 0.4953298229830921,
+      "grad_norm": 1.5047451257705688,
+      "learning_rate": 3.413235294117647e-05,
+      "loss": 0.0103,
+      "step": 4680
+    },
+    {
+      "epoch": 0.4974466170984045,
+      "grad_norm": 0.46020635962486267,
+      "learning_rate": 3.383823529411765e-05,
+      "loss": 0.0092,
+      "step": 4700
+    },
+    {
+      "epoch": 0.49956341121371683,
+      "grad_norm": 0.42124831676483154,
+      "learning_rate": 3.3544117647058825e-05,
+      "loss": 0.0112,
+      "step": 4720
+    },
+    {
+      "epoch": 0.5016802053290292,
+      "grad_norm": 0.18676140904426575,
+      "learning_rate": 3.325e-05,
+      "loss": 0.0096,
+      "step": 4740
+    },
+    {
+      "epoch": 0.5037969994443415,
+      "grad_norm": 0.41889238357543945,
+      "learning_rate": 3.295588235294118e-05,
+      "loss": 0.0112,
+      "step": 4760
+    },
+    {
+      "epoch": 0.5059137935596539,
+      "grad_norm": 0.5965830087661743,
+      "learning_rate": 3.2661764705882355e-05,
+      "loss": 0.0082,
+      "step": 4780
+    },
+    {
+      "epoch": 0.5080305876749662,
+      "grad_norm": 0.5901793837547302,
+      "learning_rate": 3.236764705882353e-05,
+      "loss": 0.0092,
+      "step": 4800
+    },
+    {
+      "epoch": 0.5101473817902786,
+      "grad_norm": 0.453032910823822,
+      "learning_rate": 3.207352941176471e-05,
+      "loss": 0.0104,
+      "step": 4820
+    },
+    {
+      "epoch": 0.512264175905591,
+      "grad_norm": 0.3099919557571411,
+      "learning_rate": 3.1779411764705884e-05,
+      "loss": 0.0097,
+      "step": 4840
+    },
+    {
+      "epoch": 0.5143809700209033,
+      "grad_norm": 0.28637203574180603,
+      "learning_rate": 3.148529411764706e-05,
+      "loss": 0.0074,
+      "step": 4860
+    },
+    {
+      "epoch": 0.5164977641362157,
+      "grad_norm": 0.45871102809906006,
+      "learning_rate": 3.119117647058824e-05,
+      "loss": 0.0093,
+      "step": 4880
+    },
+    {
+      "epoch": 0.518614558251528,
+      "grad_norm": 0.5844906568527222,
+      "learning_rate": 3.0897058823529414e-05,
+      "loss": 0.0097,
+      "step": 4900
+    },
+    {
+      "epoch": 0.5207313523668404,
+      "grad_norm": 0.7102438807487488,
+      "learning_rate": 3.060294117647059e-05,
+      "loss": 0.0083,
+      "step": 4920
+    },
+    {
+      "epoch": 0.5228481464821528,
+      "grad_norm": 0.483784943819046,
+      "learning_rate": 3.0308823529411767e-05,
+      "loss": 0.0091,
+      "step": 4940
+    },
+    {
+      "epoch": 0.5249649405974651,
+      "grad_norm": 0.4747030436992645,
+      "learning_rate": 3.0014705882352943e-05,
+      "loss": 0.0091,
+      "step": 4960
+    },
+    {
+      "epoch": 0.5270817347127775,
+      "grad_norm": 0.3532012403011322,
+      "learning_rate": 2.9720588235294116e-05,
+      "loss": 0.0082,
+      "step": 4980
+    },
+    {
+      "epoch": 0.5291985288280898,
+      "grad_norm": 0.42889463901519775,
+      "learning_rate": 2.9426470588235293e-05,
+      "loss": 0.0091,
+      "step": 5000
+    },
+    {
+      "epoch": 0.5313153229434022,
+      "grad_norm": 0.4388155937194824,
+      "learning_rate": 2.9132352941176473e-05,
+      "loss": 0.0088,
+      "step": 5020
+    },
+    {
+      "epoch": 0.5334321170587146,
+      "grad_norm": 0.49440255761146545,
+      "learning_rate": 2.883823529411765e-05,
+      "loss": 0.0091,
+      "step": 5040
+    },
+    {
+      "epoch": 0.5355489111740269,
+      "grad_norm": 0.3930880129337311,
+      "learning_rate": 2.8544117647058826e-05,
+      "loss": 0.0114,
+      "step": 5060
+    },
+    {
+      "epoch": 0.5376657052893393,
+      "grad_norm": 0.380283921957016,
+      "learning_rate": 2.825e-05,
+      "loss": 0.0105,
+      "step": 5080
+    },
+    {
+      "epoch": 0.5397824994046516,
+      "grad_norm": 0.3737698793411255,
+      "learning_rate": 2.7955882352941175e-05,
+      "loss": 0.0132,
+      "step": 5100
+    },
+    {
+      "epoch": 0.541899293519964,
+      "grad_norm": 0.5393537282943726,
+      "learning_rate": 2.7661764705882355e-05,
+      "loss": 0.0118,
+      "step": 5120
+    },
+    {
+      "epoch": 0.5440160876352764,
+      "grad_norm": 0.3449922502040863,
+      "learning_rate": 2.7367647058823532e-05,
+      "loss": 0.0077,
+      "step": 5140
+    },
+    {
+      "epoch": 0.5461328817505887,
+      "grad_norm": 0.6629793643951416,
+      "learning_rate": 2.7073529411764708e-05,
+      "loss": 0.0084,
+      "step": 5160
+    },
+    {
+      "epoch": 0.5482496758659011,
+      "grad_norm": 0.7243732810020447,
+      "learning_rate": 2.6779411764705885e-05,
+      "loss": 0.0073,
+      "step": 5180
+    },
+    {
+      "epoch": 0.5503664699812134,
+      "grad_norm": 0.6006022691726685,
+      "learning_rate": 2.6485294117647058e-05,
+      "loss": 0.0084,
+      "step": 5200
+    },
+    {
+      "epoch": 0.5524832640965258,
+      "grad_norm": 0.5986945629119873,
+      "learning_rate": 2.6191176470588234e-05,
+      "loss": 0.0087,
+      "step": 5220
+    },
+    {
+      "epoch": 0.5546000582118382,
+      "grad_norm": 0.267560750246048,
+      "learning_rate": 2.5897058823529414e-05,
+      "loss": 0.0092,
+      "step": 5240
+    },
+    {
+      "epoch": 0.5567168523271505,
+      "grad_norm": 0.47937673330307007,
+      "learning_rate": 2.560294117647059e-05,
+      "loss": 0.0089,
+      "step": 5260
+    },
+    {
+      "epoch": 0.5588336464424629,
+      "grad_norm": 0.4451775550842285,
+      "learning_rate": 2.5308823529411767e-05,
+      "loss": 0.0082,
+      "step": 5280
+    },
+    {
+      "epoch": 0.5609504405577752,
+      "grad_norm": 0.7350065112113953,
+      "learning_rate": 2.501470588235294e-05,
+      "loss": 0.0087,
+      "step": 5300
+    },
+    {
+      "epoch": 0.5630672346730876,
+      "grad_norm": 0.43704766035079956,
+      "learning_rate": 2.4720588235294117e-05,
+      "loss": 0.0089,
+      "step": 5320
+    },
+    {
+      "epoch": 0.5651840287884,
+      "grad_norm": 0.29158827662467957,
+      "learning_rate": 2.4426470588235297e-05,
+      "loss": 0.0066,
+      "step": 5340
+    },
+    {
+      "epoch": 0.5673008229037123,
+      "grad_norm": 0.39838340878486633,
+      "learning_rate": 2.4132352941176473e-05,
+      "loss": 0.0081,
+      "step": 5360
+    },
+    {
+      "epoch": 0.5694176170190247,
+      "grad_norm": 0.4324835538864136,
+      "learning_rate": 2.3838235294117646e-05,
+      "loss": 0.008,
+      "step": 5380
+    },
+    {
+      "epoch": 0.571534411134337,
+      "grad_norm": 0.4358319938182831,
+      "learning_rate": 2.3544117647058826e-05,
+      "loss": 0.008,
+      "step": 5400
+    },
+    {
+      "epoch": 0.5736512052496494,
+      "grad_norm": 0.8966334462165833,
+      "learning_rate": 2.3250000000000003e-05,
+      "loss": 0.0078,
+      "step": 5420
+    },
+    {
+      "epoch": 0.5757679993649618,
+      "grad_norm": 0.9501079320907593,
+      "learning_rate": 2.2955882352941176e-05,
+      "loss": 0.0184,
+      "step": 5440
+    },
+    {
+      "epoch": 0.5778847934802741,
+      "grad_norm": 0.13483519852161407,
+      "learning_rate": 2.2661764705882356e-05,
+      "loss": 0.0154,
+      "step": 5460
+    },
+    {
+      "epoch": 0.5800015875955865,
+      "grad_norm": 0.4287421703338623,
+      "learning_rate": 2.236764705882353e-05,
+      "loss": 0.0084,
+      "step": 5480
+    },
+    {
+      "epoch": 0.5821183817108988,
+      "grad_norm": 0.1738578975200653,
+      "learning_rate": 2.2073529411764705e-05,
+      "loss": 0.0079,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5842351758262112,
+      "grad_norm": 0.6555954217910767,
+      "learning_rate": 2.1779411764705885e-05,
+      "loss": 0.0091,
+      "step": 5520
+    },
+    {
+      "epoch": 0.5863519699415236,
+      "grad_norm": 0.5294132232666016,
+      "learning_rate": 2.1485294117647058e-05,
+      "loss": 0.007,
+      "step": 5540
+    },
+    {
+      "epoch": 0.5884687640568359,
+      "grad_norm": 0.3388701379299164,
+      "learning_rate": 2.1191176470588238e-05,
+      "loss": 0.007,
+      "step": 5560
+    },
+    {
+      "epoch": 0.5905855581721483,
+      "grad_norm": 0.4279813766479492,
+      "learning_rate": 2.0897058823529415e-05,
+      "loss": 0.0077,
+      "step": 5580
+    },
+    {
+      "epoch": 0.5927023522874606,
+      "grad_norm": 0.4467952847480774,
+      "learning_rate": 2.0602941176470588e-05,
+      "loss": 0.0083,
+      "step": 5600
+    },
+    {
+      "epoch": 0.594819146402773,
+      "grad_norm": 0.36640599370002747,
+      "learning_rate": 2.0308823529411768e-05,
+      "loss": 0.0081,
+      "step": 5620
+    },
+    {
+      "epoch": 0.5969359405180854,
+      "grad_norm": 0.2323896735906601,
+      "learning_rate": 2.001470588235294e-05,
+      "loss": 0.0065,
+      "step": 5640
+    },
+    {
+      "epoch": 0.5990527346333977,
+      "grad_norm": 0.5579979419708252,
+      "learning_rate": 1.9720588235294117e-05,
+      "loss": 0.0084,
+      "step": 5660
+    },
+    {
+      "epoch": 0.6011695287487101,
+      "grad_norm": 0.34144604206085205,
+      "learning_rate": 1.9426470588235297e-05,
+      "loss": 0.0069,
+      "step": 5680
+    },
+    {
+      "epoch": 0.6032863228640224,
+      "grad_norm": 0.5170475244522095,
+      "learning_rate": 1.913235294117647e-05,
+      "loss": 0.0074,
+      "step": 5700
+    },
+    {
+      "epoch": 0.6054031169793348,
+      "grad_norm": 0.34131792187690735,
+      "learning_rate": 1.8838235294117647e-05,
+      "loss": 0.0069,
+      "step": 5720
+    },
+    {
+      "epoch": 0.6075199110946472,
+      "grad_norm": 0.5252654552459717,
+      "learning_rate": 1.8544117647058827e-05,
+      "loss": 0.0074,
+      "step": 5740
+    },
+    {
+      "epoch": 0.6096367052099595,
+      "grad_norm": 0.23735718429088593,
+      "learning_rate": 1.825e-05,
+      "loss": 0.0079,
+      "step": 5760
+    },
+    {
+      "epoch": 0.6117534993252719,
+      "grad_norm": 0.3985564410686493,
+      "learning_rate": 1.7955882352941176e-05,
+      "loss": 0.0076,
+      "step": 5780
+    },
+    {
+      "epoch": 0.6138702934405842,
+      "grad_norm": 0.53111732006073,
+      "learning_rate": 1.7661764705882353e-05,
+      "loss": 0.0075,
+      "step": 5800
+    },
+    {
+      "epoch": 0.6159870875558966,
+      "grad_norm": 0.37471240758895874,
+      "learning_rate": 1.736764705882353e-05,
+      "loss": 0.007,
+      "step": 5820
+    },
+    {
+      "epoch": 0.618103881671209,
+      "grad_norm": 0.2607717514038086,
+      "learning_rate": 1.707352941176471e-05,
+      "loss": 0.0078,
+      "step": 5840
+    },
+    {
+      "epoch": 0.6202206757865213,
+      "grad_norm": 0.4577248990535736,
+      "learning_rate": 1.6779411764705882e-05,
+      "loss": 0.0099,
+      "step": 5860
+    },
+    {
+      "epoch": 0.6223374699018337,
+      "grad_norm": 0.44592851400375366,
+      "learning_rate": 1.648529411764706e-05,
+      "loss": 0.007,
+      "step": 5880
+    },
+    {
+      "epoch": 0.624454264017146,
+      "grad_norm": 0.4649290442466736,
+      "learning_rate": 1.619117647058824e-05,
+      "loss": 0.0078,
+      "step": 5900
+    },
+    {
+      "epoch": 0.6265710581324584,
+      "grad_norm": 0.5193443298339844,
+      "learning_rate": 1.5897058823529412e-05,
+      "loss": 0.0086,
+      "step": 5920
+    },
+    {
+      "epoch": 0.6286878522477708,
+      "grad_norm": 0.5165125131607056,
+      "learning_rate": 1.5602941176470588e-05,
+      "loss": 0.0079,
+      "step": 5940
+    },
+    {
+      "epoch": 0.6308046463630831,
+      "grad_norm": 0.5387499928474426,
+      "learning_rate": 1.5308823529411765e-05,
+      "loss": 0.0072,
+      "step": 5960
+    },
+    {
+      "epoch": 0.6329214404783955,
+      "grad_norm": 0.3668934404850006,
+      "learning_rate": 1.5014705882352941e-05,
+      "loss": 0.0063,
+      "step": 5980
+    },
+    {
+      "epoch": 0.6350382345937078,
+      "grad_norm": 0.880902886390686,
+      "learning_rate": 1.472058823529412e-05,
+      "loss": 0.0071,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6371550287090202,
+      "grad_norm": 0.6322916150093079,
+      "learning_rate": 1.4426470588235294e-05,
+      "loss": 0.0066,
+      "step": 6020
+    },
+    {
+      "epoch": 0.6392718228243326,
+      "grad_norm": 0.6748706698417664,
+      "learning_rate": 1.4132352941176472e-05,
+      "loss": 0.0071,
+      "step": 6040
+    },
+    {
+      "epoch": 0.6413886169396449,
+      "grad_norm": 0.20786982774734497,
+      "learning_rate": 1.3838235294117649e-05,
+      "loss": 0.0062,
+      "step": 6060
+    },
+    {
+      "epoch": 0.6435054110549573,
+      "grad_norm": 0.913157045841217,
+      "learning_rate": 1.3544117647058824e-05,
+      "loss": 0.0071,
+      "step": 6080
+    },
+    {
+      "epoch": 0.6456222051702696,
+      "grad_norm": 0.38529446721076965,
+      "learning_rate": 1.3250000000000002e-05,
+      "loss": 0.0061,
+      "step": 6100
+    },
+    {
+      "epoch": 0.647738999285582,
+      "grad_norm": 0.35093656182289124,
+      "learning_rate": 1.2955882352941177e-05,
+      "loss": 0.0064,
+      "step": 6120
+    },
+    {
+      "epoch": 0.6498557934008944,
+      "grad_norm": 0.4418446123600006,
+      "learning_rate": 1.2661764705882353e-05,
+      "loss": 0.0069,
+      "step": 6140
+    },
+    {
+      "epoch": 0.6519725875162067,
+      "grad_norm": 0.20957966148853302,
+      "learning_rate": 1.236764705882353e-05,
+      "loss": 0.0059,
+      "step": 6160
+    },
+    {
+      "epoch": 0.6540893816315191,
+      "grad_norm": 0.2830004394054413,
+      "learning_rate": 1.2073529411764708e-05,
+      "loss": 0.0068,
+      "step": 6180
+    },
+    {
+      "epoch": 0.6562061757468314,
+      "grad_norm": 0.20361077785491943,
+      "learning_rate": 1.1779411764705883e-05,
+      "loss": 0.0075,
+      "step": 6200
+    },
+    {
+      "epoch": 0.6583229698621438,
+      "grad_norm": 0.44711142778396606,
+      "learning_rate": 1.148529411764706e-05,
+      "loss": 0.0063,
+      "step": 6220
+    },
+    {
+      "epoch": 0.6604397639774562,
+      "grad_norm": 0.361794650554657,
+      "learning_rate": 1.1191176470588236e-05,
+      "loss": 0.0087,
+      "step": 6240
+    },
+    {
+      "epoch": 0.6625565580927685,
+      "grad_norm": 0.4405740797519684,
+      "learning_rate": 1.0897058823529412e-05,
+      "loss": 0.0067,
+      "step": 6260
+    },
+    {
+      "epoch": 0.6646733522080809,
+      "grad_norm": 0.4276063144207001,
+      "learning_rate": 1.0602941176470589e-05,
+      "loss": 0.0058,
+      "step": 6280
+    },
+    {
+      "epoch": 0.6667901463233932,
+      "grad_norm": 0.267135351896286,
+      "learning_rate": 1.0308823529411765e-05,
+      "loss": 0.0065,
+      "step": 6300
+    },
+    {
+      "epoch": 0.6689069404387056,
+      "grad_norm": 0.41200748085975647,
+      "learning_rate": 1.0014705882352942e-05,
+      "loss": 0.0064,
+      "step": 6320
+    },
+    {
+      "epoch": 0.671023734554018,
+      "grad_norm": 0.30442219972610474,
+      "learning_rate": 9.720588235294118e-06,
+      "loss": 0.0071,
+      "step": 6340
+    },
+    {
+      "epoch": 0.6731405286693303,
+      "grad_norm": 0.5211018919944763,
+      "learning_rate": 9.426470588235295e-06,
+      "loss": 0.0052,
+      "step": 6360
+    },
+    {
+      "epoch": 0.6752573227846427,
+      "grad_norm": 0.3820931315422058,
+      "learning_rate": 9.132352941176471e-06,
+      "loss": 0.0058,
+      "step": 6380
+    },
+    {
+      "epoch": 0.677374116899955,
+      "grad_norm": 0.37423545122146606,
+      "learning_rate": 8.838235294117648e-06,
+      "loss": 0.0059,
+      "step": 6400
+    },
+    {
+      "epoch": 0.6794909110152674,
+      "grad_norm": 0.42695438861846924,
+      "learning_rate": 8.544117647058822e-06,
+      "loss": 0.006,
+      "step": 6420
+    },
+    {
+      "epoch": 0.6816077051305798,
+      "grad_norm": 0.37079957127571106,
+      "learning_rate": 8.25e-06,
+      "loss": 0.0076,
+      "step": 6440
+    },
+    {
+      "epoch": 0.6837244992458921,
+      "grad_norm": 0.17029517889022827,
+      "learning_rate": 7.955882352941177e-06,
+      "loss": 0.0055,
+      "step": 6460
+    },
+    {
+      "epoch": 0.6858412933612045,
+      "grad_norm": 0.30178797245025635,
+      "learning_rate": 7.661764705882354e-06,
+      "loss": 0.0053,
+      "step": 6480
+    },
+    {
+      "epoch": 0.6879580874765168,
+      "grad_norm": 0.2955467104911804,
+      "learning_rate": 7.367647058823529e-06,
+      "loss": 0.0069,
+      "step": 6500
+    },
+    {
+      "epoch": 0.6900748815918292,
+      "grad_norm": 0.3518420159816742,
+      "learning_rate": 7.073529411764707e-06,
+      "loss": 0.006,
+      "step": 6520
+    },
+    {
+      "epoch": 0.6921916757071416,
+      "grad_norm": 0.40877994894981384,
+      "learning_rate": 6.779411764705883e-06,
+      "loss": 0.006,
+      "step": 6540
+    },
+    {
+      "epoch": 0.6943084698224539,
+      "grad_norm": 0.3902340233325958,
+      "learning_rate": 6.485294117647059e-06,
+      "loss": 0.0064,
+      "step": 6560
+    },
+    {
+      "epoch": 0.6964252639377663,
+      "grad_norm": 0.24104374647140503,
+      "learning_rate": 6.191176470588235e-06,
+      "loss": 0.0057,
+      "step": 6580
+    },
+    {
+      "epoch": 0.6985420580530786,
+      "grad_norm": 0.6368237137794495,
+      "learning_rate": 5.897058823529412e-06,
+      "loss": 0.0059,
+      "step": 6600
+    },
+    {
+      "epoch": 0.700658852168391,
+      "grad_norm": 0.5903158783912659,
+      "learning_rate": 5.602941176470588e-06,
+      "loss": 0.0057,
+      "step": 6620
+    },
+    {
+      "epoch": 0.7027756462837034,
+      "grad_norm": 0.2732548713684082,
+      "learning_rate": 5.308823529411765e-06,
+      "loss": 0.0068,
+      "step": 6640
+    },
+    {
+      "epoch": 0.7048924403990157,
+      "grad_norm": 0.4018927812576294,
+      "learning_rate": 5.014705882352941e-06,
+      "loss": 0.0069,
+      "step": 6660
+    },
+    {
+      "epoch": 0.7070092345143281,
+      "grad_norm": 0.36459019780158997,
+      "learning_rate": 4.720588235294118e-06,
+      "loss": 0.0057,
+      "step": 6680
+    },
+    {
+      "epoch": 0.7091260286296404,
+      "grad_norm": 0.5796602368354797,
+      "learning_rate": 4.426470588235294e-06,
+      "loss": 0.0067,
+      "step": 6700
+    },
+    {
+      "epoch": 0.7112428227449528,
+      "grad_norm": 0.23343797028064728,
+      "learning_rate": 4.132352941176471e-06,
+      "loss": 0.0061,
+      "step": 6720
+    },
+    {
+      "epoch": 0.7133596168602652,
+      "grad_norm": 0.38248059153556824,
+      "learning_rate": 3.838235294117647e-06,
+      "loss": 0.0067,
+      "step": 6740
+    },
+    {
+      "epoch": 0.7154764109755775,
+      "grad_norm": 0.17756353318691254,
+      "learning_rate": 3.5441176470588233e-06,
+      "loss": 0.0051,
+      "step": 6760
+    },
+    {
+      "epoch": 0.7175932050908899,
+      "grad_norm": 0.512367844581604,
+      "learning_rate": 3.2500000000000002e-06,
+      "loss": 0.0059,
+      "step": 6780
+    },
+    {
+      "epoch": 0.7197099992062022,
+      "grad_norm": 0.599236249923706,
+      "learning_rate": 2.9558823529411767e-06,
+      "loss": 0.0065,
+      "step": 6800
+    },
+    {
+      "epoch": 0.7218267933215146,
+      "grad_norm": 0.3208792805671692,
+      "learning_rate": 2.6617647058823532e-06,
+      "loss": 0.0059,
+      "step": 6820
+    },
+    {
+      "epoch": 0.723943587436827,
+      "grad_norm": 0.31455111503601074,
+      "learning_rate": 2.3676470588235297e-06,
+      "loss": 0.0062,
+      "step": 6840
+    },
+    {
+      "epoch": 0.7260603815521393,
+      "grad_norm": 0.47607216238975525,
+      "learning_rate": 2.073529411764706e-06,
+      "loss": 0.0065,
+      "step": 6860
+    },
+    {
+      "epoch": 0.7281771756674517,
+      "grad_norm": 0.20253852009773254,
+      "learning_rate": 1.7794117647058825e-06,
+      "loss": 0.0055,
+      "step": 6880
+    },
+    {
+      "epoch": 0.730293969782764,
+      "grad_norm": 0.4787275493144989,
+      "learning_rate": 1.4852941176470588e-06,
+      "loss": 0.0056,
+      "step": 6900
+    },
+    {
+      "epoch": 0.7324107638980764,
+      "grad_norm": 0.3596799969673157,
+      "learning_rate": 1.1911764705882353e-06,
+      "loss": 0.0047,
+      "step": 6920
+    },
+    {
+      "epoch": 0.7345275580133888,
+      "grad_norm": 0.31353089213371277,
+      "learning_rate": 8.970588235294118e-07,
+      "loss": 0.0053,
+      "step": 6940
+    },
+    {
+      "epoch": 0.736644352128701,
+      "grad_norm": 0.3764742314815521,
+      "learning_rate": 6.029411764705883e-07,
+      "loss": 0.0048,
+      "step": 6960
+    },
+    {
+      "epoch": 0.7387611462440135,
+      "grad_norm": 0.4484200179576874,
+      "learning_rate": 3.088235294117647e-07,
+      "loss": 0.0062,
+      "step": 6980
+    },
+    {
+      "epoch": 0.7408779403593257,
+      "grad_norm": 0.4392724335193634,
+      "learning_rate": 1.4705882352941176e-08,
+      "loss": 0.0063,
+      "step": 7000
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 7000,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.9062733238823237e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87c1838316ae8c7df2b7eb5f039022e01d00f71f28b5a09877cd72af98fb0743
+size 5969

internvl3_1b_lora_7000_20260304_104032/checkpoint-7000/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff