agu18dec committed 22 days ago

Commit

2d29b9b

verified ·

1 Parent(s): c9ab9cf

add checkpoint otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +11 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/README.md +61 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/adapter_config.json +48 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/adapter_model.safetensors +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/added_tokens.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/chat_template.jinja +47 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/README.md +209 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/adapter_config.json +48 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/adapter_model.safetensors +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/added_tokens.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/chat_template.jinja +47 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/preprocessor_config.json +29 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/processor_config.json +4 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/special_tokens_map.json +33 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/tokenizer.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/tokenizer.model +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/tokenizer_config.json +0 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/trainer_state.json +0 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/training_args.bin +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/README.md +209 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/adapter_config.json +48 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/adapter_model.safetensors +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/added_tokens.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/chat_template.jinja +47 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/preprocessor_config.json +29 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/processor_config.json +4 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/special_tokens_map.json +33 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/tokenizer.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/tokenizer.model +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/tokenizer_config.json +0 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/trainer_state.json +0 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/training_args.bin +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/README.md +209 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/adapter_config.json +48 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/adapter_model.safetensors +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/added_tokens.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/chat_template.jinja +47 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/preprocessor_config.json +29 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/processor_config.json +4 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/special_tokens_map.json +33 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/tokenizer.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/tokenizer.model +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/tokenizer_config.json +0 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/trainer_state.json +1284 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/training_args.bin +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/README.md +209 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/adapter_config.json +48 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/adapter_model.safetensors +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/added_tokens.json +3 -0
checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/chat_template.jinja +47 -0

.gitattributes CHANGED Viewed

@@ -382,3 +382,14 @@ checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s2_baseline/checkpoint-6250/
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s2_baseline/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s2_baseline/checkpoint-8750/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s2_baseline/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s2_baseline/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s2_baseline/checkpoint-8750/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s2_baseline/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-3750/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-6250/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-8750/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+model_name: otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline
+tags:
+- base_model:adapter:google/gemma-3-4b-it
+- lora
+- sft
+- transformers
+- trl
+licence: license
+pipeline_tag: text-generation
+---
+# Model Card for otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline
+This model is a fine-tuned version of [google/gemma-3-4b-it](https://huggingface.co/google/gemma-3-4b-it).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/agam-research/huggingface/runs/byv5l5w7)
+This model was trained with SFT.
+### Framework versions
+- PEFT: 0.19.1
+- TRL: 0.28.0
+- Transformers: 4.57.6
+- Pytorch: 2.9.1
+- Datasets: 4.5.0
+- Tokenizers: 0.22.2
+## Citations
+Cite TRL as:
+```bibtex
+@software{vonwerra2020trl,
+  title   = {{TRL: Transformers Reinforcement Learning}},
+  author  = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
+  license = {Apache-2.0},
+  url     = {https://github.com/huggingface/trl},
+  year    = {2020}
+}
+```

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb70dcda4b42fd284f3a34794b8278f4ecb4d143e214d35a922517e5cff749e6
+size 65674128

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:google/gemma-3-4b-it
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17e1fee558d19cd4a85f315d65639fef9378acf4e913c092402ae56d721142b5
+size 65674128

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pan_and_scan": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Gemma3ImageProcessor",
+  "image_seq_length": 256,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "pan_and_scan_max_num_crops": null,
+  "pan_and_scan_min_crop_size": null,
+  "pan_and_scan_min_ratio_to_activate": null,
+  "processor_class": "Gemma3Processor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/processor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "image_seq_length": 256,
+  "processor_class": "Gemma3Processor"
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-10000/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b25b8d8cffd67ea351659b350427764e9a6dcc4d1f692fe42a9968c21bd1cc6
+size 6417

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:google/gemma-3-4b-it
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ebf28306f055adc5a30bba39695813a9244199947770236f5685a06b4c77062
+size 65674128

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pan_and_scan": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Gemma3ImageProcessor",
+  "image_seq_length": 256,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "pan_and_scan_max_num_crops": null,
+  "pan_and_scan_min_crop_size": null,
+  "pan_and_scan_min_ratio_to_activate": null,
+  "processor_class": "Gemma3Processor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/processor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "image_seq_length": 256,
+  "processor_class": "Gemma3Processor"
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-11250/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b25b8d8cffd67ea351659b350427764e9a6dcc4d1f692fe42a9968c21bd1cc6
+size 6417

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:google/gemma-3-4b-it
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:371a515c36314e7cb665e296701075657197f9b4f555bb915c16308f987c82db
+size 65674128

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "do_convert_rgb": null,
+  "do_normalize": true,
+  "do_pan_and_scan": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Gemma3ImageProcessor",
+  "image_seq_length": 256,
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "pan_and_scan_max_num_crops": null,
+  "pan_and_scan_min_crop_size": null,
+  "pan_and_scan_min_ratio_to_activate": null,
+  "processor_class": "Gemma3Processor",
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 896,
+    "width": 896
+  }
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/processor_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "image_seq_length": 256,
+  "processor_class": "Gemma3Processor"
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "boi_token": "<start_of_image>",
+  "bos_token": {
+    "content": "<bos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eoi_token": "<end_of_image>",
+  "eos_token": {
+    "content": "<eos>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "image_token": "<image_soft_token>",
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+size 33384568

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+size 4689074

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1284 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 1250,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "entropy": 0.6517447352409362,
+      "epoch": 0.008,
+      "grad_norm": 13.304323196411133,
+      "learning_rate": 1.44e-06,
+      "loss": 1.2051,
+      "mean_token_accuracy": 0.7518506407737732,
+      "num_tokens": 10055.0,
+      "step": 10
+    },
+    {
+      "entropy": 0.6488827526569366,
+      "epoch": 0.016,
+      "grad_norm": 14.009625434875488,
+      "learning_rate": 3.04e-06,
+      "loss": 1.1682,
+      "mean_token_accuracy": 0.7532002747058868,
+      "num_tokens": 19963.0,
+      "step": 20
+    },
+    {
+      "entropy": 0.6849160015583038,
+      "epoch": 0.024,
+      "grad_norm": 14.544610023498535,
+      "learning_rate": 4.64e-06,
+      "loss": 1.0825,
+      "mean_token_accuracy": 0.7331112623214722,
+      "num_tokens": 30054.0,
+      "step": 30
+    },
+    {
+      "entropy": 0.6808853566646575,
+      "epoch": 0.032,
+      "grad_norm": 10.353639602661133,
+      "learning_rate": 6.24e-06,
+      "loss": 0.7754,
+      "mean_token_accuracy": 0.7654528617858887,
+      "num_tokens": 39938.0,
+      "step": 40
+    },
+    {
+      "entropy": 0.7086315333843232,
+      "epoch": 0.04,
+      "grad_norm": 7.067413806915283,
+      "learning_rate": 7.84e-06,
+      "loss": 0.5946,
+      "mean_token_accuracy": 0.8002863466739655,
+      "num_tokens": 50166.0,
+      "step": 50
+    },
+    {
+      "entropy": 0.7584897458553315,
+      "epoch": 0.048,
+      "grad_norm": 5.474998950958252,
+      "learning_rate": 9.44e-06,
+      "loss": 0.6412,
+      "mean_token_accuracy": 0.7837531447410584,
+      "num_tokens": 60059.0,
+      "step": 60
+    },
+    {
+      "entropy": 0.7520301103591919,
+      "epoch": 0.056,
+      "grad_norm": 4.642393112182617,
+      "learning_rate": 1.1040000000000001e-05,
+      "loss": 0.5667,
+      "mean_token_accuracy": 0.8009410679340363,
+      "num_tokens": 70143.0,
+      "step": 70
+    },
+    {
+      "entropy": 0.7466191172599792,
+      "epoch": 0.064,
+      "grad_norm": 6.564879894256592,
+      "learning_rate": 1.2640000000000003e-05,
+      "loss": 0.562,
+      "mean_token_accuracy": 0.8044180452823639,
+      "num_tokens": 80014.0,
+      "step": 80
+    },
+    {
+      "entropy": 0.737699156999588,
+      "epoch": 0.072,
+      "grad_norm": 5.0010881423950195,
+      "learning_rate": 1.4240000000000001e-05,
+      "loss": 0.5172,
+      "mean_token_accuracy": 0.8152086019515992,
+      "num_tokens": 90095.0,
+      "step": 90
+    },
+    {
+      "entropy": 0.7431365609169006,
+      "epoch": 0.08,
+      "grad_norm": 5.928659439086914,
+      "learning_rate": 1.584e-05,
+      "loss": 0.5221,
+      "mean_token_accuracy": 0.8172156095504761,
+      "num_tokens": 100362.0,
+      "step": 100
+    },
+    {
+      "entropy": 0.7396394312381744,
+      "epoch": 0.088,
+      "grad_norm": 4.8456950187683105,
+      "learning_rate": 1.7440000000000002e-05,
+      "loss": 0.5085,
+      "mean_token_accuracy": 0.8155511736869812,
+      "num_tokens": 110373.0,
+      "step": 110
+    },
+    {
+      "entropy": 0.7305532455444336,
+      "epoch": 0.096,
+      "grad_norm": 5.342586040496826,
+      "learning_rate": 1.904e-05,
+      "loss": 0.473,
+      "mean_token_accuracy": 0.8297189176082611,
+      "num_tokens": 120330.0,
+      "step": 120
+    },
+    {
+      "entropy": 0.7010469496250152,
+      "epoch": 0.104,
+      "grad_norm": 4.477506637573242,
+      "learning_rate": 2.0640000000000002e-05,
+      "loss": 0.4808,
+      "mean_token_accuracy": 0.8355620563030243,
+      "num_tokens": 130258.0,
+      "step": 130
+    },
+    {
+      "entropy": 0.7325722575187683,
+      "epoch": 0.112,
+      "grad_norm": 5.776942253112793,
+      "learning_rate": 2.224e-05,
+      "loss": 0.4747,
+      "mean_token_accuracy": 0.8217672824859619,
+      "num_tokens": 140436.0,
+      "step": 140
+    },
+    {
+      "entropy": 0.7283274173736572,
+      "epoch": 0.12,
+      "grad_norm": 4.776106834411621,
+      "learning_rate": 2.3840000000000002e-05,
+      "loss": 0.4878,
+      "mean_token_accuracy": 0.8249702036380768,
+      "num_tokens": 150558.0,
+      "step": 150
+    },
+    {
+      "entropy": 0.7461902379989624,
+      "epoch": 0.128,
+      "grad_norm": 5.491858959197998,
+      "learning_rate": 2.5440000000000004e-05,
+      "loss": 0.5055,
+      "mean_token_accuracy": 0.8181215822696686,
+      "num_tokens": 160454.0,
+      "step": 160
+    },
+    {
+      "entropy": 0.7387523293495178,
+      "epoch": 0.136,
+      "grad_norm": 4.031186103820801,
+      "learning_rate": 2.704e-05,
+      "loss": 0.4892,
+      "mean_token_accuracy": 0.8191783010959626,
+      "num_tokens": 170479.0,
+      "step": 170
+    },
+    {
+      "entropy": 0.7172978222370148,
+      "epoch": 0.144,
+      "grad_norm": 4.5944061279296875,
+      "learning_rate": 2.864e-05,
+      "loss": 0.4381,
+      "mean_token_accuracy": 0.8387054204940796,
+      "num_tokens": 180442.0,
+      "step": 180
+    },
+    {
+      "entropy": 0.711635684967041,
+      "epoch": 0.152,
+      "grad_norm": 3.530456066131592,
+      "learning_rate": 3.0240000000000002e-05,
+      "loss": 0.4265,
+      "mean_token_accuracy": 0.8430639147758484,
+      "num_tokens": 190507.0,
+      "step": 190
+    },
+    {
+      "entropy": 0.7339554727077484,
+      "epoch": 0.16,
+      "grad_norm": 3.645368814468384,
+      "learning_rate": 3.184e-05,
+      "loss": 0.465,
+      "mean_token_accuracy": 0.8289207279682159,
+      "num_tokens": 200508.0,
+      "step": 200
+    },
+    {
+      "entropy": 0.7236140966415405,
+      "epoch": 0.168,
+      "grad_norm": 4.230995178222656,
+      "learning_rate": 3.344e-05,
+      "loss": 0.4398,
+      "mean_token_accuracy": 0.8397033035755157,
+      "num_tokens": 210394.0,
+      "step": 210
+    },
+    {
+      "entropy": 0.7196837246418,
+      "epoch": 0.176,
+      "grad_norm": 3.253856897354126,
+      "learning_rate": 3.504e-05,
+      "loss": 0.4359,
+      "mean_token_accuracy": 0.840765792131424,
+      "num_tokens": 220420.0,
+      "step": 220
+    },
+    {
+      "entropy": 0.7125779867172242,
+      "epoch": 0.184,
+      "grad_norm": 4.0387163162231445,
+      "learning_rate": 3.664e-05,
+      "loss": 0.4118,
+      "mean_token_accuracy": 0.8490993976593018,
+      "num_tokens": 230359.0,
+      "step": 230
+    },
+    {
+      "entropy": 0.724159049987793,
+      "epoch": 0.192,
+      "grad_norm": 3.5710225105285645,
+      "learning_rate": 3.8240000000000007e-05,
+      "loss": 0.4602,
+      "mean_token_accuracy": 0.8292092382907867,
+      "num_tokens": 240142.0,
+      "step": 240
+    },
+    {
+      "entropy": 0.7112815618515015,
+      "epoch": 0.2,
+      "grad_norm": 4.761449337005615,
+      "learning_rate": 3.984e-05,
+      "loss": 0.4411,
+      "mean_token_accuracy": 0.8410702764987945,
+      "num_tokens": 250381.0,
+      "step": 250
+    },
+    {
+      "entropy": 0.7280168414115906,
+      "epoch": 0.208,
+      "grad_norm": 3.5113282203674316,
+      "learning_rate": 4.144e-05,
+      "loss": 0.4277,
+      "mean_token_accuracy": 0.8378039479255677,
+      "num_tokens": 260388.0,
+      "step": 260
+    },
+    {
+      "entropy": 0.715729832649231,
+      "epoch": 0.216,
+      "grad_norm": 3.883734703063965,
+      "learning_rate": 4.304e-05,
+      "loss": 0.4467,
+      "mean_token_accuracy": 0.8387612402439117,
+      "num_tokens": 270483.0,
+      "step": 270
+    },
+    {
+      "entropy": 0.6883831679821014,
+      "epoch": 0.224,
+      "grad_norm": 3.4563710689544678,
+      "learning_rate": 4.4640000000000006e-05,
+      "loss": 0.3854,
+      "mean_token_accuracy": 0.8561008334159851,
+      "num_tokens": 280550.0,
+      "step": 280
+    },
+    {
+      "entropy": 0.7114541232585907,
+      "epoch": 0.232,
+      "grad_norm": 4.078945636749268,
+      "learning_rate": 4.624e-05,
+      "loss": 0.4239,
+      "mean_token_accuracy": 0.8356595575809479,
+      "num_tokens": 290635.0,
+      "step": 290
+    },
+    {
+      "entropy": 0.7148261308670044,
+      "epoch": 0.24,
+      "grad_norm": 4.1704840660095215,
+      "learning_rate": 4.784e-05,
+      "loss": 0.4053,
+      "mean_token_accuracy": 0.8485374748706818,
+      "num_tokens": 300716.0,
+      "step": 300
+    },
+    {
+      "entropy": 0.6976641476154327,
+      "epoch": 0.248,
+      "grad_norm": 6.769944190979004,
+      "learning_rate": 4.944e-05,
+      "loss": 0.4463,
+      "mean_token_accuracy": 0.8296632826328277,
+      "num_tokens": 310763.0,
+      "step": 310
+    },
+    {
+      "entropy": 0.7336399972438812,
+      "epoch": 0.256,
+      "grad_norm": 3.4586522579193115,
+      "learning_rate": 5.104e-05,
+      "loss": 0.428,
+      "mean_token_accuracy": 0.839785122871399,
+      "num_tokens": 320739.0,
+      "step": 320
+    },
+    {
+      "entropy": 0.7039563238620759,
+      "epoch": 0.264,
+      "grad_norm": 4.9698591232299805,
+      "learning_rate": 5.264e-05,
+      "loss": 0.3886,
+      "mean_token_accuracy": 0.8518033504486084,
+      "num_tokens": 330765.0,
+      "step": 330
+    },
+    {
+      "entropy": 0.7286165356636047,
+      "epoch": 0.272,
+      "grad_norm": 4.455258369445801,
+      "learning_rate": 5.424e-05,
+      "loss": 0.4379,
+      "mean_token_accuracy": 0.8310850203037262,
+      "num_tokens": 340774.0,
+      "step": 340
+    },
+    {
+      "entropy": 0.7566006064414978,
+      "epoch": 0.28,
+      "grad_norm": 3.1348392963409424,
+      "learning_rate": 5.584e-05,
+      "loss": 0.4275,
+      "mean_token_accuracy": 0.8401350796222686,
+      "num_tokens": 350950.0,
+      "step": 350
+    },
+    {
+      "entropy": 0.7498272895812989,
+      "epoch": 0.288,
+      "grad_norm": 4.724481582641602,
+      "learning_rate": 5.7440000000000006e-05,
+      "loss": 0.4217,
+      "mean_token_accuracy": 0.8399569928646088,
+      "num_tokens": 361030.0,
+      "step": 360
+    },
+    {
+      "entropy": 0.7364974081516266,
+      "epoch": 0.296,
+      "grad_norm": 4.332886695861816,
+      "learning_rate": 5.9040000000000004e-05,
+      "loss": 0.3882,
+      "mean_token_accuracy": 0.8492311537265778,
+      "num_tokens": 371103.0,
+      "step": 370
+    },
+    {
+      "entropy": 0.7401327073574067,
+      "epoch": 0.304,
+      "grad_norm": 3.2729179859161377,
+      "learning_rate": 6.064000000000001e-05,
+      "loss": 0.4078,
+      "mean_token_accuracy": 0.8473742246627808,
+      "num_tokens": 381221.0,
+      "step": 380
+    },
+    {
+      "entropy": 0.7454682946205139,
+      "epoch": 0.312,
+      "grad_norm": 4.101840972900391,
+      "learning_rate": 6.224e-05,
+      "loss": 0.3827,
+      "mean_token_accuracy": 0.8516541600227356,
+      "num_tokens": 391401.0,
+      "step": 390
+    },
+    {
+      "entropy": 0.7540267109870911,
+      "epoch": 0.32,
+      "grad_norm": 4.584408760070801,
+      "learning_rate": 6.384e-05,
+      "loss": 0.3883,
+      "mean_token_accuracy": 0.8520707130432129,
+      "num_tokens": 401517.0,
+      "step": 400
+    },
+    {
+      "entropy": 0.7678898334503174,
+      "epoch": 0.328,
+      "grad_norm": 3.7839274406433105,
+      "learning_rate": 6.544e-05,
+      "loss": 0.4136,
+      "mean_token_accuracy": 0.8492289066314698,
+      "num_tokens": 411527.0,
+      "step": 410
+    },
+    {
+      "entropy": 0.75483837723732,
+      "epoch": 0.336,
+      "grad_norm": 3.7808070182800293,
+      "learning_rate": 6.704000000000001e-05,
+      "loss": 0.3864,
+      "mean_token_accuracy": 0.8533409416675568,
+      "num_tokens": 421379.0,
+      "step": 420
+    },
+    {
+      "entropy": 0.7809491693973541,
+      "epoch": 0.344,
+      "grad_norm": 4.6651201248168945,
+      "learning_rate": 6.864000000000001e-05,
+      "loss": 0.4003,
+      "mean_token_accuracy": 0.8476006865501404,
+      "num_tokens": 431499.0,
+      "step": 430
+    },
+    {
+      "entropy": 0.7637096405029297,
+      "epoch": 0.352,
+      "grad_norm": 3.700791597366333,
+      "learning_rate": 7.024e-05,
+      "loss": 0.4135,
+      "mean_token_accuracy": 0.8419409990310669,
+      "num_tokens": 441544.0,
+      "step": 440
+    },
+    {
+      "entropy": 0.7409621119499207,
+      "epoch": 0.36,
+      "grad_norm": 4.35798978805542,
+      "learning_rate": 7.184e-05,
+      "loss": 0.3577,
+      "mean_token_accuracy": 0.8639594912528992,
+      "num_tokens": 451566.0,
+      "step": 450
+    },
+    {
+      "entropy": 0.743074893951416,
+      "epoch": 0.368,
+      "grad_norm": 3.672800064086914,
+      "learning_rate": 7.344000000000002e-05,
+      "loss": 0.3981,
+      "mean_token_accuracy": 0.854806250333786,
+      "num_tokens": 461625.0,
+      "step": 460
+    },
+    {
+      "entropy": 0.7390820384025574,
+      "epoch": 0.376,
+      "grad_norm": 4.984609603881836,
+      "learning_rate": 7.504e-05,
+      "loss": 0.4038,
+      "mean_token_accuracy": 0.849916672706604,
+      "num_tokens": 471667.0,
+      "step": 470
+    },
+    {
+      "entropy": 0.7496122181415558,
+      "epoch": 0.384,
+      "grad_norm": 3.7070982456207275,
+      "learning_rate": 7.664e-05,
+      "loss": 0.364,
+      "mean_token_accuracy": 0.8587370991706849,
+      "num_tokens": 481767.0,
+      "step": 480
+    },
+    {
+      "entropy": 0.7406512439250946,
+      "epoch": 0.392,
+      "grad_norm": 4.055230617523193,
+      "learning_rate": 7.824e-05,
+      "loss": 0.3866,
+      "mean_token_accuracy": 0.853862851858139,
+      "num_tokens": 491859.0,
+      "step": 490
+    },
+    {
+      "entropy": 0.6691052734851837,
+      "epoch": 0.4,
+      "grad_norm": 6.206707000732422,
+      "learning_rate": 7.984000000000001e-05,
+      "loss": 0.3584,
+      "mean_token_accuracy": 0.8651340901851654,
+      "num_tokens": 502103.0,
+      "step": 500
+    },
+    {
+      "entropy": 0.7127264261245727,
+      "epoch": 0.408,
+      "grad_norm": 2.857799530029297,
+      "learning_rate": 8.144e-05,
+      "loss": 0.3749,
+      "mean_token_accuracy": 0.8569707155227662,
+      "num_tokens": 512215.0,
+      "step": 510
+    },
+    {
+      "entropy": 0.6989680230617523,
+      "epoch": 0.416,
+      "grad_norm": 3.7977161407470703,
+      "learning_rate": 8.304e-05,
+      "loss": 0.3696,
+      "mean_token_accuracy": 0.862414437532425,
+      "num_tokens": 522207.0,
+      "step": 520
+    },
+    {
+      "entropy": 0.7255340337753295,
+      "epoch": 0.424,
+      "grad_norm": 2.523075580596924,
+      "learning_rate": 8.464e-05,
+      "loss": 0.3501,
+      "mean_token_accuracy": 0.8673852860927582,
+      "num_tokens": 532139.0,
+      "step": 530
+    },
+    {
+      "entropy": 0.7114777028560638,
+      "epoch": 0.432,
+      "grad_norm": 3.5952746868133545,
+      "learning_rate": 8.624000000000001e-05,
+      "loss": 0.3632,
+      "mean_token_accuracy": 0.8614569187164307,
+      "num_tokens": 542125.0,
+      "step": 540
+    },
+    {
+      "entropy": 0.749547004699707,
+      "epoch": 0.44,
+      "grad_norm": 4.453001976013184,
+      "learning_rate": 8.784e-05,
+      "loss": 0.3985,
+      "mean_token_accuracy": 0.8507158756256104,
+      "num_tokens": 552410.0,
+      "step": 550
+    },
+    {
+      "entropy": 0.7473198175430298,
+      "epoch": 0.448,
+      "grad_norm": 3.6924071311950684,
+      "learning_rate": 8.944e-05,
+      "loss": 0.3995,
+      "mean_token_accuracy": 0.8486853897571563,
+      "num_tokens": 562606.0,
+      "step": 560
+    },
+    {
+      "entropy": 0.711093932390213,
+      "epoch": 0.456,
+      "grad_norm": 3.1549296379089355,
+      "learning_rate": 9.104000000000001e-05,
+      "loss": 0.3878,
+      "mean_token_accuracy": 0.8508474111557007,
+      "num_tokens": 572537.0,
+      "step": 570
+    },
+    {
+      "entropy": 0.7114456057548523,
+      "epoch": 0.464,
+      "grad_norm": 3.332331657409668,
+      "learning_rate": 9.264000000000001e-05,
+      "loss": 0.3725,
+      "mean_token_accuracy": 0.8580802261829377,
+      "num_tokens": 582661.0,
+      "step": 580
+    },
+    {
+      "entropy": 0.7008551478385925,
+      "epoch": 0.472,
+      "grad_norm": 4.147095203399658,
+      "learning_rate": 9.424e-05,
+      "loss": 0.3391,
+      "mean_token_accuracy": 0.870452344417572,
+      "num_tokens": 592642.0,
+      "step": 590
+    },
+    {
+      "entropy": 0.7253452599048614,
+      "epoch": 0.48,
+      "grad_norm": 4.314738750457764,
+      "learning_rate": 9.584e-05,
+      "loss": 0.3666,
+      "mean_token_accuracy": 0.8685142874717713,
+      "num_tokens": 602797.0,
+      "step": 600
+    },
+    {
+      "entropy": 0.7198026478290558,
+      "epoch": 0.488,
+      "grad_norm": 3.4705069065093994,
+      "learning_rate": 9.744000000000002e-05,
+      "loss": 0.3458,
+      "mean_token_accuracy": 0.8706252574920654,
+      "num_tokens": 612563.0,
+      "step": 610
+    },
+    {
+      "entropy": 0.750393933057785,
+      "epoch": 0.496,
+      "grad_norm": 3.8125782012939453,
+      "learning_rate": 9.904e-05,
+      "loss": 0.3761,
+      "mean_token_accuracy": 0.8550220787525177,
+      "num_tokens": 622605.0,
+      "step": 620
+    },
+    {
+      "entropy": 0.769760149717331,
+      "epoch": 0.504,
+      "grad_norm": 4.06261682510376,
+      "learning_rate": 9.999997200422726e-05,
+      "loss": 0.4081,
+      "mean_token_accuracy": 0.841312825679779,
+      "num_tokens": 632493.0,
+      "step": 630
+    },
+    {
+      "entropy": 0.7663174033164978,
+      "epoch": 0.512,
+      "grad_norm": 3.6885271072387695,
+      "learning_rate": 9.999965705214383e-05,
+      "loss": 0.3844,
+      "mean_token_accuracy": 0.8508110165596008,
+      "num_tokens": 642455.0,
+      "step": 640
+    },
+    {
+      "entropy": 0.7547533094882966,
+      "epoch": 0.52,
+      "grad_norm": 4.812737941741943,
+      "learning_rate": 9.999899215547273e-05,
+      "loss": 0.36,
+      "mean_token_accuracy": 0.8604591488838196,
+      "num_tokens": 652518.0,
+      "step": 650
+    },
+    {
+      "entropy": 0.7455555617809295,
+      "epoch": 0.528,
+      "grad_norm": 4.497723579406738,
+      "learning_rate": 9.99979773188675e-05,
+      "loss": 0.3779,
+      "mean_token_accuracy": 0.8556386411190033,
+      "num_tokens": 662510.0,
+      "step": 660
+    },
+    {
+      "entropy": 0.7497033298015594,
+      "epoch": 0.536,
+      "grad_norm": 2.8484747409820557,
+      "learning_rate": 9.999661254943096e-05,
+      "loss": 0.3728,
+      "mean_token_accuracy": 0.8578177571296692,
+      "num_tokens": 672542.0,
+      "step": 670
+    },
+    {
+      "entropy": 0.7327328979969024,
+      "epoch": 0.544,
+      "grad_norm": 3.5998294353485107,
+      "learning_rate": 9.999489785671501e-05,
+      "loss": 0.3552,
+      "mean_token_accuracy": 0.8648241460323334,
+      "num_tokens": 682614.0,
+      "step": 680
+    },
+    {
+      "entropy": 0.7525190949440003,
+      "epoch": 0.552,
+      "grad_norm": 3.5004866123199463,
+      "learning_rate": 9.99928332527207e-05,
+      "loss": 0.3843,
+      "mean_token_accuracy": 0.8582637786865235,
+      "num_tokens": 692618.0,
+      "step": 690
+    },
+    {
+      "entropy": 0.7497900485992431,
+      "epoch": 0.56,
+      "grad_norm": 4.519836902618408,
+      "learning_rate": 9.999041875189808e-05,
+      "loss": 0.3431,
+      "mean_token_accuracy": 0.8670572757720947,
+      "num_tokens": 702633.0,
+      "step": 700
+    },
+    {
+      "entropy": 0.77483931183815,
+      "epoch": 0.568,
+      "grad_norm": 3.1947054862976074,
+      "learning_rate": 9.998765437114606e-05,
+      "loss": 0.4002,
+      "mean_token_accuracy": 0.8486331045627594,
+      "num_tokens": 712707.0,
+      "step": 710
+    },
+    {
+      "entropy": 0.7584193348884583,
+      "epoch": 0.576,
+      "grad_norm": 3.9025089740753174,
+      "learning_rate": 9.998454012981241e-05,
+      "loss": 0.3547,
+      "mean_token_accuracy": 0.8697766363620758,
+      "num_tokens": 722760.0,
+      "step": 720
+    },
+    {
+      "entropy": 0.7460690081119538,
+      "epoch": 0.584,
+      "grad_norm": 4.149205684661865,
+      "learning_rate": 9.99810760496935e-05,
+      "loss": 0.3444,
+      "mean_token_accuracy": 0.8667214512825012,
+      "num_tokens": 732748.0,
+      "step": 730
+    },
+    {
+      "entropy": 0.7551231920719147,
+      "epoch": 0.592,
+      "grad_norm": 2.7930212020874023,
+      "learning_rate": 9.997726215503422e-05,
+      "loss": 0.3648,
+      "mean_token_accuracy": 0.8609663844108582,
+      "num_tokens": 742797.0,
+      "step": 740
+    },
+    {
+      "entropy": 0.7414549589157104,
+      "epoch": 0.6,
+      "grad_norm": 3.0811150074005127,
+      "learning_rate": 9.997309847252781e-05,
+      "loss": 0.3594,
+      "mean_token_accuracy": 0.873071813583374,
+      "num_tokens": 752710.0,
+      "step": 750
+    },
+    {
+      "entropy": 0.7489046216011047,
+      "epoch": 0.608,
+      "grad_norm": 3.5512616634368896,
+      "learning_rate": 9.99685850313156e-05,
+      "loss": 0.3546,
+      "mean_token_accuracy": 0.8655755579471588,
+      "num_tokens": 762688.0,
+      "step": 760
+    },
+    {
+      "entropy": 0.733052384853363,
+      "epoch": 0.616,
+      "grad_norm": 4.369715213775635,
+      "learning_rate": 9.99637218629869e-05,
+      "loss": 0.3417,
+      "mean_token_accuracy": 0.8683482468128204,
+      "num_tokens": 772650.0,
+      "step": 770
+    },
+    {
+      "entropy": 0.7423468112945557,
+      "epoch": 0.624,
+      "grad_norm": 4.777336597442627,
+      "learning_rate": 9.995850900157875e-05,
+      "loss": 0.357,
+      "mean_token_accuracy": 0.8606196939945221,
+      "num_tokens": 782702.0,
+      "step": 780
+    },
+    {
+      "entropy": 0.7481508612632751,
+      "epoch": 0.632,
+      "grad_norm": 4.678175926208496,
+      "learning_rate": 9.995294648357565e-05,
+      "loss": 0.3567,
+      "mean_token_accuracy": 0.865706866979599,
+      "num_tokens": 792664.0,
+      "step": 790
+    },
+    {
+      "entropy": 0.7382378697395324,
+      "epoch": 0.64,
+      "grad_norm": 3.274829149246216,
+      "learning_rate": 9.99470343479093e-05,
+      "loss": 0.3603,
+      "mean_token_accuracy": 0.8626951456069947,
+      "num_tokens": 802737.0,
+      "step": 800
+    },
+    {
+      "entropy": 0.7525390565395356,
+      "epoch": 0.648,
+      "grad_norm": 5.102212905883789,
+      "learning_rate": 9.994077263595842e-05,
+      "loss": 0.3633,
+      "mean_token_accuracy": 0.8564347326755524,
+      "num_tokens": 812785.0,
+      "step": 810
+    },
+    {
+      "entropy": 0.7348288774490357,
+      "epoch": 0.656,
+      "grad_norm": 3.2849156856536865,
+      "learning_rate": 9.993416139154834e-05,
+      "loss": 0.3207,
+      "mean_token_accuracy": 0.8772452592849731,
+      "num_tokens": 822907.0,
+      "step": 820
+    },
+    {
+      "entropy": 0.7165950059890747,
+      "epoch": 0.664,
+      "grad_norm": 4.360360145568848,
+      "learning_rate": 9.992720066095074e-05,
+      "loss": 0.319,
+      "mean_token_accuracy": 0.8817264854907989,
+      "num_tokens": 832978.0,
+      "step": 830
+    },
+    {
+      "entropy": 0.7572665691375733,
+      "epoch": 0.672,
+      "grad_norm": 3.737704038619995,
+      "learning_rate": 9.99198904928834e-05,
+      "loss": 0.3518,
+      "mean_token_accuracy": 0.8672956347465515,
+      "num_tokens": 843010.0,
+      "step": 840
+    },
+    {
+      "entropy": 0.7376816928386688,
+      "epoch": 0.68,
+      "grad_norm": 3.763667345046997,
+      "learning_rate": 9.99122309385097e-05,
+      "loss": 0.3343,
+      "mean_token_accuracy": 0.8720453321933747,
+      "num_tokens": 853158.0,
+      "step": 850
+    },
+    {
+      "entropy": 0.738130909204483,
+      "epoch": 0.688,
+      "grad_norm": 3.451171636581421,
+      "learning_rate": 9.990422205143842e-05,
+      "loss": 0.3337,
+      "mean_token_accuracy": 0.8736945927143097,
+      "num_tokens": 863197.0,
+      "step": 860
+    },
+    {
+      "entropy": 0.7318816602230072,
+      "epoch": 0.696,
+      "grad_norm": 4.782471656799316,
+      "learning_rate": 9.989586388772327e-05,
+      "loss": 0.3302,
+      "mean_token_accuracy": 0.8774139165878296,
+      "num_tokens": 873186.0,
+      "step": 870
+    },
+    {
+      "entropy": 0.7257109045982361,
+      "epoch": 0.704,
+      "grad_norm": 4.2577714920043945,
+      "learning_rate": 9.988715650586255e-05,
+      "loss": 0.3267,
+      "mean_token_accuracy": 0.8722849547863006,
+      "num_tokens": 882902.0,
+      "step": 880
+    },
+    {
+      "entropy": 0.7249192893505096,
+      "epoch": 0.712,
+      "grad_norm": 4.787215709686279,
+      "learning_rate": 9.987809996679868e-05,
+      "loss": 0.3538,
+      "mean_token_accuracy": 0.8667027592658997,
+      "num_tokens": 892970.0,
+      "step": 890
+    },
+    {
+      "entropy": 0.734320092201233,
+      "epoch": 0.72,
+      "grad_norm": 4.3855061531066895,
+      "learning_rate": 9.986869433391786e-05,
+      "loss": 0.3629,
+      "mean_token_accuracy": 0.8576407015323639,
+      "num_tokens": 902925.0,
+      "step": 900
+    },
+    {
+      "entropy": 0.7197588086128235,
+      "epoch": 0.728,
+      "grad_norm": 3.845344305038452,
+      "learning_rate": 9.985893967304953e-05,
+      "loss": 0.3334,
+      "mean_token_accuracy": 0.8752871453762054,
+      "num_tokens": 913073.0,
+      "step": 910
+    },
+    {
+      "entropy": 0.7076159596443177,
+      "epoch": 0.736,
+      "grad_norm": 4.655667304992676,
+      "learning_rate": 9.984883605246596e-05,
+      "loss": 0.3553,
+      "mean_token_accuracy": 0.85735222697258,
+      "num_tokens": 923177.0,
+      "step": 920
+    },
+    {
+      "entropy": 0.6949739217758178,
+      "epoch": 0.744,
+      "grad_norm": 4.328896999359131,
+      "learning_rate": 9.983838354288181e-05,
+      "loss": 0.3114,
+      "mean_token_accuracy": 0.8794396817684174,
+      "num_tokens": 933175.0,
+      "step": 930
+    },
+    {
+      "entropy": 0.7136138260364533,
+      "epoch": 0.752,
+      "grad_norm": 3.6101481914520264,
+      "learning_rate": 9.982758221745355e-05,
+      "loss": 0.3398,
+      "mean_token_accuracy": 0.8673068225383759,
+      "num_tokens": 943216.0,
+      "step": 940
+    },
+    {
+      "entropy": 0.7149393677711486,
+      "epoch": 0.76,
+      "grad_norm": 4.676361560821533,
+      "learning_rate": 9.981643215177901e-05,
+      "loss": 0.3454,
+      "mean_token_accuracy": 0.8639320135116577,
+      "num_tokens": 953195.0,
+      "step": 950
+    },
+    {
+      "entropy": 0.7293311178684234,
+      "epoch": 0.768,
+      "grad_norm": 4.241588592529297,
+      "learning_rate": 9.98049334238968e-05,
+      "loss": 0.3267,
+      "mean_token_accuracy": 0.8770649194717407,
+      "num_tokens": 963295.0,
+      "step": 960
+    },
+    {
+      "entropy": 0.6864521920680999,
+      "epoch": 0.776,
+      "grad_norm": 4.124149322509766,
+      "learning_rate": 9.979308611428588e-05,
+      "loss": 0.3475,
+      "mean_token_accuracy": 0.8687545955181122,
+      "num_tokens": 973211.0,
+      "step": 970
+    },
+    {
+      "entropy": 0.7115203201770782,
+      "epoch": 0.784,
+      "grad_norm": 2.776160478591919,
+      "learning_rate": 9.978089030586482e-05,
+      "loss": 0.3283,
+      "mean_token_accuracy": 0.8781632363796235,
+      "num_tokens": 983423.0,
+      "step": 980
+    },
+    {
+      "entropy": 0.70510174036026,
+      "epoch": 0.792,
+      "grad_norm": 4.5464186668396,
+      "learning_rate": 9.976834608399135e-05,
+      "loss": 0.3233,
+      "mean_token_accuracy": 0.8736263334751129,
+      "num_tokens": 993383.0,
+      "step": 990
+    },
+    {
+      "entropy": 0.6919874429702759,
+      "epoch": 0.8,
+      "grad_norm": 3.289616346359253,
+      "learning_rate": 9.975545353646172e-05,
+      "loss": 0.317,
+      "mean_token_accuracy": 0.8773585438728333,
+      "num_tokens": 1003617.0,
+      "step": 1000
+    },
+    {
+      "entropy": 0.7038690447807312,
+      "epoch": 0.808,
+      "grad_norm": 3.4383840560913086,
+      "learning_rate": 9.974221275351012e-05,
+      "loss": 0.3038,
+      "mean_token_accuracy": 0.8848366022109986,
+      "num_tokens": 1013574.0,
+      "step": 1010
+    },
+    {
+      "entropy": 0.6912132382392884,
+      "epoch": 0.816,
+      "grad_norm": 3.7912769317626953,
+      "learning_rate": 9.972862382780795e-05,
+      "loss": 0.3291,
+      "mean_token_accuracy": 0.8741970837116242,
+      "num_tokens": 1023678.0,
+      "step": 1020
+    },
+    {
+      "entropy": 0.6911644101142883,
+      "epoch": 0.824,
+      "grad_norm": 3.905447006225586,
+      "learning_rate": 9.971468685446332e-05,
+      "loss": 0.2983,
+      "mean_token_accuracy": 0.8851586222648621,
+      "num_tokens": 1033772.0,
+      "step": 1030
+    },
+    {
+      "entropy": 0.7238000333309174,
+      "epoch": 0.832,
+      "grad_norm": 3.6505768299102783,
+      "learning_rate": 9.970040193102024e-05,
+      "loss": 0.322,
+      "mean_token_accuracy": 0.8771763265132904,
+      "num_tokens": 1043688.0,
+      "step": 1040
+    },
+    {
+      "entropy": 0.7082896769046784,
+      "epoch": 0.84,
+      "grad_norm": 2.9899954795837402,
+      "learning_rate": 9.968576915745807e-05,
+      "loss": 0.3202,
+      "mean_token_accuracy": 0.8733674585819244,
+      "num_tokens": 1053816.0,
+      "step": 1050
+    },
+    {
+      "entropy": 0.726912397146225,
+      "epoch": 0.848,
+      "grad_norm": 3.2128050327301025,
+      "learning_rate": 9.967078863619065e-05,
+      "loss": 0.3469,
+      "mean_token_accuracy": 0.8619545817375183,
+      "num_tokens": 1063713.0,
+      "step": 1060
+    },
+    {
+      "entropy": 0.7287257611751556,
+      "epoch": 0.856,
+      "grad_norm": 4.8999714851379395,
+      "learning_rate": 9.96554604720658e-05,
+      "loss": 0.3387,
+      "mean_token_accuracy": 0.8722339987754821,
+      "num_tokens": 1073905.0,
+      "step": 1070
+    },
+    {
+      "entropy": 0.7408074200153351,
+      "epoch": 0.864,
+      "grad_norm": 2.664285898208618,
+      "learning_rate": 9.963978477236437e-05,
+      "loss": 0.3379,
+      "mean_token_accuracy": 0.8713842570781708,
+      "num_tokens": 1083866.0,
+      "step": 1080
+    },
+    {
+      "entropy": 0.7120798885822296,
+      "epoch": 0.872,
+      "grad_norm": 4.573596954345703,
+      "learning_rate": 9.962376164679968e-05,
+      "loss": 0.3133,
+      "mean_token_accuracy": 0.8781111538410187,
+      "num_tokens": 1093886.0,
+      "step": 1090
+    },
+    {
+      "entropy": 0.6998519122600555,
+      "epoch": 0.88,
+      "grad_norm": 2.1679275035858154,
+      "learning_rate": 9.960739120751661e-05,
+      "loss": 0.3348,
+      "mean_token_accuracy": 0.8770085752010346,
+      "num_tokens": 1103980.0,
+      "step": 1100
+    },
+    {
+      "entropy": 0.7093372285366059,
+      "epoch": 0.888,
+      "grad_norm": 4.117437839508057,
+      "learning_rate": 9.959067356909086e-05,
+      "loss": 0.3239,
+      "mean_token_accuracy": 0.8707855939865112,
+      "num_tokens": 1113959.0,
+      "step": 1110
+    },
+    {
+      "entropy": 0.7140217363834381,
+      "epoch": 0.896,
+      "grad_norm": 3.9262735843658447,
+      "learning_rate": 9.957360884852817e-05,
+      "loss": 0.3105,
+      "mean_token_accuracy": 0.8786974430084229,
+      "num_tokens": 1123920.0,
+      "step": 1120
+    },
+    {
+      "entropy": 0.7165806293487549,
+      "epoch": 0.904,
+      "grad_norm": 3.4237773418426514,
+      "learning_rate": 9.955619716526355e-05,
+      "loss": 0.2988,
+      "mean_token_accuracy": 0.8893916308879852,
+      "num_tokens": 1133883.0,
+      "step": 1130
+    },
+    {
+      "entropy": 0.7115492939949035,
+      "epoch": 0.912,
+      "grad_norm": 5.193104267120361,
+      "learning_rate": 9.953843864116024e-05,
+      "loss": 0.2975,
+      "mean_token_accuracy": 0.8788837730884552,
+      "num_tokens": 1143953.0,
+      "step": 1140
+    },
+    {
+      "entropy": 0.738818782567978,
+      "epoch": 0.92,
+      "grad_norm": 4.208317279815674,
+      "learning_rate": 9.952033340050914e-05,
+      "loss": 0.3082,
+      "mean_token_accuracy": 0.8904856204986572,
+      "num_tokens": 1153991.0,
+      "step": 1150
+    },
+    {
+      "entropy": 0.7402492463588715,
+      "epoch": 0.928,
+      "grad_norm": 3.7344741821289062,
+      "learning_rate": 9.95018815700277e-05,
+      "loss": 0.2897,
+      "mean_token_accuracy": 0.8875938773155212,
+      "num_tokens": 1163916.0,
+      "step": 1160
+    },
+    {
+      "entropy": 0.7180757343769073,
+      "epoch": 0.936,
+      "grad_norm": 3.379906415939331,
+      "learning_rate": 9.948308327885921e-05,
+      "loss": 0.3012,
+      "mean_token_accuracy": 0.882535457611084,
+      "num_tokens": 1173898.0,
+      "step": 1170
+    },
+    {
+      "entropy": 0.7472505629062652,
+      "epoch": 0.944,
+      "grad_norm": 3.1298375129699707,
+      "learning_rate": 9.946393865857175e-05,
+      "loss": 0.3208,
+      "mean_token_accuracy": 0.8773401141166687,
+      "num_tokens": 1183791.0,
+      "step": 1180
+    },
+    {
+      "entropy": 0.705861485004425,
+      "epoch": 0.952,
+      "grad_norm": 4.013529300689697,
+      "learning_rate": 9.944444784315737e-05,
+      "loss": 0.2821,
+      "mean_token_accuracy": 0.8861022651195526,
+      "num_tokens": 1193813.0,
+      "step": 1190
+    },
+    {
+      "entropy": 0.699193000793457,
+      "epoch": 0.96,
+      "grad_norm": 5.319438457489014,
+      "learning_rate": 9.942461096903111e-05,
+      "loss": 0.3386,
+      "mean_token_accuracy": 0.8674045443534851,
+      "num_tokens": 1203738.0,
+      "step": 1200
+    },
+    {
+      "entropy": 0.7452460646629333,
+      "epoch": 0.968,
+      "grad_norm": 4.127198696136475,
+      "learning_rate": 9.940442817503006e-05,
+      "loss": 0.3312,
+      "mean_token_accuracy": 0.8679324150085449,
+      "num_tokens": 1213769.0,
+      "step": 1210
+    },
+    {
+      "entropy": 0.709743332862854,
+      "epoch": 0.976,
+      "grad_norm": 3.213073492050171,
+      "learning_rate": 9.938389960241237e-05,
+      "loss": 0.3247,
+      "mean_token_accuracy": 0.8760505378246307,
+      "num_tokens": 1223599.0,
+      "step": 1220
+    },
+    {
+      "entropy": 0.6922394454479217,
+      "epoch": 0.984,
+      "grad_norm": 5.325689315795898,
+      "learning_rate": 9.93630253948563e-05,
+      "loss": 0.3179,
+      "mean_token_accuracy": 0.8699888408184051,
+      "num_tokens": 1233633.0,
+      "step": 1230
+    },
+    {
+      "entropy": 0.7295576393604278,
+      "epoch": 0.992,
+      "grad_norm": 3.195913791656494,
+      "learning_rate": 9.934180569845917e-05,
+      "loss": 0.3064,
+      "mean_token_accuracy": 0.8825580894947052,
+      "num_tokens": 1243671.0,
+      "step": 1240
+    },
+    {
+      "entropy": 0.7127263069152832,
+      "epoch": 1.0,
+      "grad_norm": 3.3763585090637207,
+      "learning_rate": 9.932024066173635e-05,
+      "loss": 0.3251,
+      "mean_token_accuracy": 0.8774601638317108,
+      "num_tokens": 1253644.0,
+      "step": 1250
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 12500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 10,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.097584287889408e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-1250/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b25b8d8cffd67ea351659b350427764e9a6dcc4d1f692fe42a9968c21bd1cc6
+size 6417

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/README.md ADDED Viewed

	@@ -0,0 +1,209 @@

+---
+base_model: google/gemma-3-4b-it
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:google/gemma-3-4b-it
+- lora
+- sft
+- transformers
+- trl
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.19.1

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/adapter_config.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "google/gemma-3-4b-it",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "lora_ga_config": null,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.19.1",
+  "qalora_group_size": 16,
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "v_proj",
+    "q_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_bdlora": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb70dcda4b42fd284f3a34794b8278f4ecb4d143e214d35a922517e5cff749e6
+size 65674128

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/added_tokens.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+  "<image_soft_token>": 262144
+}

checkpoints/otter_gemma3_4b_r8_a32_adamw_e10_lr1e-4_s3_baseline/checkpoint-12500/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,47 @@

+{{ bos_token }}
+{%- if messages[0]['role'] == 'system' -%}
+    {%- if messages[0]['content'] is string -%}
+        {%- set first_user_prefix = messages[0]['content'] + '
+' -%}
+    {%- else -%}
+        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
+' -%}
+    {%- endif -%}
+    {%- set loop_messages = messages[1:] -%}
+{%- else -%}
+    {%- set first_user_prefix = "" -%}
+    {%- set loop_messages = messages -%}
+{%- endif -%}
+{%- for message in loop_messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
+    {%- endif -%}
+    {%- if (message['role'] == 'assistant') -%}
+        {%- set role = "model" -%}
+    {%- else -%}
+        {%- set role = message['role'] -%}
+    {%- endif -%}
+    {{ '<start_of_turn>' + role + '
+' + (first_user_prefix if loop.first else "") }}
+    {%- if message['content'] is string -%}
+        {{ message['content'] | trim }}
+    {%- elif message['content'] is iterable -%}
+        {%- for item in message['content'] -%}
+            {%- if item['type'] == 'image' -%}
+                {{ '<start_of_image>' }}
+            {%- elif item['type'] == 'text' -%}
+                {{ item['text'] | trim }}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- else -%}
+        {{ raise_exception("Invalid content type") }}
+    {%- endif -%}
+    {{ '<end_of_turn>
+' }}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {{'<start_of_turn>model
+'}}
+{%- endif -%}