Add files using upload-large-folder tool
Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md +58 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md +58 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md +209 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json +40 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json +54 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json +297 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md +209 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json +40 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json +54 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json +378 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md +209 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json +40 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json +54 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json +469 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md +209 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json +40 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json +54 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json +560 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md +209 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json +40 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json +54 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json +641 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md +209 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json +40 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json +54 -0
- DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json +732 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md +58 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md +209 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json +40 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json +54 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json +287 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md +209 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json +40 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json +54 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json +368 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md +209 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json +40 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json +54 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json +459 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md +209 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json +40 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json +54 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json +540 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md +209 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json +40 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json +54 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json +631 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md +209 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json +40 -0
- DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json +54 -0
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: transformers
|
| 4 |
+
model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test1
|
| 5 |
+
tags:
|
| 6 |
+
- generated_from_trainer
|
| 7 |
+
- trl
|
| 8 |
+
- sft
|
| 9 |
+
licence: license
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test1
|
| 13 |
+
|
| 14 |
+
This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B).
|
| 15 |
+
It has been trained using [TRL](https://github.com/huggingface/trl).
|
| 16 |
+
|
| 17 |
+
## Quick start
|
| 18 |
+
|
| 19 |
+
```python
|
| 20 |
+
from transformers import pipeline
|
| 21 |
+
|
| 22 |
+
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
|
| 23 |
+
generator = pipeline("text-generation", model="None", device="cuda")
|
| 24 |
+
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
|
| 25 |
+
print(output["generated_text"])
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Training procedure
|
| 29 |
+
|
| 30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/sfblzvnx)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
This model was trained with SFT.
|
| 35 |
+
|
| 36 |
+
### Framework versions
|
| 37 |
+
|
| 38 |
+
- TRL: 0.29.0
|
| 39 |
+
- Transformers: 5.5.4
|
| 40 |
+
- PyTorch: 2.10.0
|
| 41 |
+
- Datasets: 4.6.1
|
| 42 |
+
- Tokenizers: 0.22.2
|
| 43 |
+
|
| 44 |
+
## Citations
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
Cite TRL as:
|
| 49 |
+
|
| 50 |
+
```bibtex
|
| 51 |
+
@software{vonwerra2020trl,
|
| 52 |
+
title = {{TRL: Transformers Reinforcement Learning}},
|
| 53 |
+
author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
|
| 54 |
+
license = {Apache-2.0},
|
| 55 |
+
url = {https://github.com/huggingface/trl},
|
| 56 |
+
year = {2020}
|
| 57 |
+
}
|
| 58 |
+
```
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: transformers
|
| 4 |
+
model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test2
|
| 5 |
+
tags:
|
| 6 |
+
- generated_from_trainer
|
| 7 |
+
- trl
|
| 8 |
+
- sft
|
| 9 |
+
licence: license
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test2
|
| 13 |
+
|
| 14 |
+
This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B).
|
| 15 |
+
It has been trained using [TRL](https://github.com/huggingface/trl).
|
| 16 |
+
|
| 17 |
+
## Quick start
|
| 18 |
+
|
| 19 |
+
```python
|
| 20 |
+
from transformers import pipeline
|
| 21 |
+
|
| 22 |
+
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
|
| 23 |
+
generator = pipeline("text-generation", model="None", device="cuda")
|
| 24 |
+
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
|
| 25 |
+
print(output["generated_text"])
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Training procedure
|
| 29 |
+
|
| 30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/ncgnoczk)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
This model was trained with SFT.
|
| 35 |
+
|
| 36 |
+
### Framework versions
|
| 37 |
+
|
| 38 |
+
- TRL: 0.29.0
|
| 39 |
+
- Transformers: 5.5.4
|
| 40 |
+
- PyTorch: 2.10.0
|
| 41 |
+
- Datasets: 4.6.1
|
| 42 |
+
- Tokenizers: 0.22.2
|
| 43 |
+
|
| 44 |
+
## Citations
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
Cite TRL as:
|
| 49 |
+
|
| 50 |
+
```bibtex
|
| 51 |
+
@software{vonwerra2020trl,
|
| 52 |
+
title = {{TRL: Transformers Reinforcement Learning}},
|
| 53 |
+
author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
|
| 54 |
+
license = {Apache-2.0},
|
| 55 |
+
url = {https://github.com/huggingface/trl},
|
| 56 |
+
year = {2020}
|
| 57 |
+
}
|
| 58 |
+
```
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.00985279561940916,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 16,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json
ADDED
|
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1155,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.353258643448353,
|
| 14 |
+
"epoch": 0.1299545159194282,
|
| 15 |
+
"grad_norm": 3.010725975036621,
|
| 16 |
+
"learning_rate": 4.8475852375026876e-05,
|
| 17 |
+
"loss": 5.475971069335937,
|
| 18 |
+
"mean_token_accuracy": 0.7263440760970116,
|
| 19 |
+
"num_tokens": 128842.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.649170914888382,
|
| 24 |
+
"epoch": 0.2599090318388564,
|
| 25 |
+
"grad_norm": 1.9099390506744385,
|
| 26 |
+
"learning_rate": 9.794100785974817e-05,
|
| 27 |
+
"loss": 2.55168701171875,
|
| 28 |
+
"mean_token_accuracy": 0.8364580717682838,
|
| 29 |
+
"num_tokens": 255497.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5930788792669773,
|
| 34 |
+
"epoch": 0.3898635477582846,
|
| 35 |
+
"grad_norm": 2.1239051818847656,
|
| 36 |
+
"learning_rate": 0.0001474061633444695,
|
| 37 |
+
"loss": 2.3440716552734373,
|
| 38 |
+
"mean_token_accuracy": 0.8452290838956833,
|
| 39 |
+
"num_tokens": 372014.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.5564522063732147,
|
| 44 |
+
"epoch": 0.5198180636777128,
|
| 45 |
+
"grad_norm": 411.71807861328125,
|
| 46 |
+
"learning_rate": 0.00019687131882919077,
|
| 47 |
+
"loss": 2.2838446044921876,
|
| 48 |
+
"mean_token_accuracy": 0.8498487600684166,
|
| 49 |
+
"num_tokens": 500623.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.5539529167115689,
|
| 54 |
+
"epoch": 0.649772579597141,
|
| 55 |
+
"grad_norm": 2.1969902515411377,
|
| 56 |
+
"learning_rate": 0.0002463364743139121,
|
| 57 |
+
"loss": 2.675394287109375,
|
| 58 |
+
"mean_token_accuracy": 0.8430694487690925,
|
| 59 |
+
"num_tokens": 616223.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.5719467167556286,
|
| 64 |
+
"epoch": 0.7797270955165692,
|
| 65 |
+
"grad_norm": 1.98796546459198,
|
| 66 |
+
"learning_rate": 0.00029580162979863343,
|
| 67 |
+
"loss": 2.2434300231933593,
|
| 68 |
+
"mean_token_accuracy": 0.851241897046566,
|
| 69 |
+
"num_tokens": 737263.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.5502805083990097,
|
| 74 |
+
"epoch": 0.9096816114359974,
|
| 75 |
+
"grad_norm": 2.0211398601531982,
|
| 76 |
+
"learning_rate": 0.0003452667852833547,
|
| 77 |
+
"loss": 2.1729367065429686,
|
| 78 |
+
"mean_token_accuracy": 0.8554597494006156,
|
| 79 |
+
"num_tokens": 861477.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5580813550891784,
|
| 85 |
+
"eval_loss": 0.5830356478691101,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8432669037809739,
|
| 87 |
+
"eval_num_tokens": 944782.0,
|
| 88 |
+
"eval_runtime": 90.3664,
|
| 89 |
+
"eval_samples_per_second": 18.336,
|
| 90 |
+
"eval_steps_per_second": 2.302,
|
| 91 |
+
"step": 385
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.5498402091725987,
|
| 95 |
+
"epoch": 1.0389863547758285,
|
| 96 |
+
"grad_norm": 3.8034188747406006,
|
| 97 |
+
"learning_rate": 0.000380866355527619,
|
| 98 |
+
"loss": 2.113946990966797,
|
| 99 |
+
"mean_token_accuracy": 0.8578129452676629,
|
| 100 |
+
"num_tokens": 982803.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.5182110907137394,
|
| 105 |
+
"epoch": 1.1689408706952567,
|
| 106 |
+
"grad_norm": 2.7830824851989746,
|
| 107 |
+
"learning_rate": 0.0003805611725593471,
|
| 108 |
+
"loss": 1.9833453369140626,
|
| 109 |
+
"mean_token_accuracy": 0.8656822636723518,
|
| 110 |
+
"num_tokens": 1105926.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.5260789206624031,
|
| 115 |
+
"epoch": 1.2988953866146848,
|
| 116 |
+
"grad_norm": 1.7993361949920654,
|
| 117 |
+
"learning_rate": 0.0003798653399371568,
|
| 118 |
+
"loss": 2.006897430419922,
|
| 119 |
+
"mean_token_accuracy": 0.8631055191159248,
|
| 120 |
+
"num_tokens": 1229857.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.5327546864748001,
|
| 125 |
+
"epoch": 1.428849902534113,
|
| 126 |
+
"grad_norm": 1.7606678009033203,
|
| 127 |
+
"learning_rate": 0.0003787802874228295,
|
| 128 |
+
"loss": 2.020283050537109,
|
| 129 |
+
"mean_token_accuracy": 0.8638329988718033,
|
| 130 |
+
"num_tokens": 1352330.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.5285360223054886,
|
| 135 |
+
"epoch": 1.5588044184535412,
|
| 136 |
+
"grad_norm": 4.76006555557251,
|
| 137 |
+
"learning_rate": 0.00037730824452755275,
|
| 138 |
+
"loss": 1.9987391662597656,
|
| 139 |
+
"mean_token_accuracy": 0.8644696187973022,
|
| 140 |
+
"num_tokens": 1474790.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.5134804363548756,
|
| 145 |
+
"epoch": 1.6887589343729694,
|
| 146 |
+
"grad_norm": 1.8447264432907104,
|
| 147 |
+
"learning_rate": 0.000375452235930833,
|
| 148 |
+
"loss": 1.9669386291503905,
|
| 149 |
+
"mean_token_accuracy": 0.8659948265552521,
|
| 150 |
+
"num_tokens": 1600381.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.5371069309115409,
|
| 155 |
+
"epoch": 1.8187134502923976,
|
| 156 |
+
"grad_norm": 1.6537392139434814,
|
| 157 |
+
"learning_rate": 0.00037321607526553675,
|
| 158 |
+
"loss": 2.0411550903320315,
|
| 159 |
+
"mean_token_accuracy": 0.8624854254722595,
|
| 160 |
+
"num_tokens": 1716827.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"entropy": 0.5270501750707627,
|
| 165 |
+
"epoch": 1.9486679662118258,
|
| 166 |
+
"grad_norm": 2.6990911960601807,
|
| 167 |
+
"learning_rate": 0.00037060435728183,
|
| 168 |
+
"loss": 2.015792236328125,
|
| 169 |
+
"mean_token_accuracy": 0.8631013777852058,
|
| 170 |
+
"num_tokens": 1842798.0,
|
| 171 |
+
"step": 750
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.0,
|
| 175 |
+
"eval_entropy": 0.5477195472384875,
|
| 176 |
+
"eval_loss": 0.5585702657699585,
|
| 177 |
+
"eval_mean_token_accuracy": 0.8486175815073344,
|
| 178 |
+
"eval_num_tokens": 1889564.0,
|
| 179 |
+
"eval_runtime": 90.2194,
|
| 180 |
+
"eval_samples_per_second": 18.366,
|
| 181 |
+
"eval_steps_per_second": 2.305,
|
| 182 |
+
"step": 770
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.4782189565088282,
|
| 186 |
+
"epoch": 2.077972709551657,
|
| 187 |
+
"grad_norm": 2.041952610015869,
|
| 188 |
+
"learning_rate": 0.0003676224484061175,
|
| 189 |
+
"loss": 1.7843829345703126,
|
| 190 |
+
"mean_token_accuracy": 0.8739750406250881,
|
| 191 |
+
"num_tokens": 1959778.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.4443667846918106,
|
| 196 |
+
"epoch": 2.207927225471085,
|
| 197 |
+
"grad_norm": 16.27313804626465,
|
| 198 |
+
"learning_rate": 0.00036427647571437996,
|
| 199 |
+
"loss": 1.6559255981445313,
|
| 200 |
+
"mean_token_accuracy": 0.8808386281132699,
|
| 201 |
+
"num_tokens": 2087384.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.44861202985048293,
|
| 206 |
+
"epoch": 2.3378817413905133,
|
| 207 |
+
"grad_norm": 1.648870587348938,
|
| 208 |
+
"learning_rate": 0.0003605733143425679,
|
| 209 |
+
"loss": 1.677943878173828,
|
| 210 |
+
"mean_token_accuracy": 0.879555520415306,
|
| 211 |
+
"num_tokens": 2211962.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.4568726105988026,
|
| 216 |
+
"epoch": 2.4678362573099415,
|
| 217 |
+
"grad_norm": 1.7573126554489136,
|
| 218 |
+
"learning_rate": 0.00035652057335991866,
|
| 219 |
+
"loss": 1.6760734558105468,
|
| 220 |
+
"mean_token_accuracy": 0.8791913360357284,
|
| 221 |
+
"num_tokens": 2334838.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.44863338857889173,
|
| 226 |
+
"epoch": 2.5977907732293697,
|
| 227 |
+
"grad_norm": 1.8639047145843506,
|
| 228 |
+
"learning_rate": 0.00035212658013422465,
|
| 229 |
+
"loss": 1.6799411010742187,
|
| 230 |
+
"mean_token_accuracy": 0.8790675121545791,
|
| 231 |
+
"num_tokens": 2461732.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.4585830120742321,
|
| 236 |
+
"epoch": 2.727745289148798,
|
| 237 |
+
"grad_norm": 1.9825985431671143,
|
| 238 |
+
"learning_rate": 0.0003474003632211781,
|
| 239 |
+
"loss": 1.7172026062011718,
|
| 240 |
+
"mean_token_accuracy": 0.8782495930790901,
|
| 241 |
+
"num_tokens": 2580026.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.45422692246735097,
|
| 246 |
+
"epoch": 2.857699805068226,
|
| 247 |
+
"grad_norm": 1.7149962186813354,
|
| 248 |
+
"learning_rate": 0.00034235163381294995,
|
| 249 |
+
"loss": 1.679084014892578,
|
| 250 |
+
"mean_token_accuracy": 0.8795321774482727,
|
| 251 |
+
"num_tokens": 2705600.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"entropy": 0.47297614574432373,
|
| 256 |
+
"epoch": 2.9876543209876543,
|
| 257 |
+
"grad_norm": 1.7435617446899414,
|
| 258 |
+
"learning_rate": 0.0003369907657841221,
|
| 259 |
+
"loss": 1.7386201477050782,
|
| 260 |
+
"mean_token_accuracy": 0.8779115182161331,
|
| 261 |
+
"num_tokens": 2822808.0,
|
| 262 |
+
"step": 1150
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 3.0,
|
| 266 |
+
"eval_entropy": 0.5031588454372607,
|
| 267 |
+
"eval_loss": 0.5551120638847351,
|
| 268 |
+
"eval_mean_token_accuracy": 0.8531603300227568,
|
| 269 |
+
"eval_num_tokens": 2834346.0,
|
| 270 |
+
"eval_runtime": 90.2397,
|
| 271 |
+
"eval_samples_per_second": 18.362,
|
| 272 |
+
"eval_steps_per_second": 2.305,
|
| 273 |
+
"step": 1155
|
| 274 |
+
}
|
| 275 |
+
],
|
| 276 |
+
"logging_steps": 50,
|
| 277 |
+
"max_steps": 3850,
|
| 278 |
+
"num_input_tokens_seen": 0,
|
| 279 |
+
"num_train_epochs": 10,
|
| 280 |
+
"save_steps": 500,
|
| 281 |
+
"stateful_callbacks": {
|
| 282 |
+
"TrainerControl": {
|
| 283 |
+
"args": {
|
| 284 |
+
"should_epoch_stop": false,
|
| 285 |
+
"should_evaluate": false,
|
| 286 |
+
"should_log": false,
|
| 287 |
+
"should_save": true,
|
| 288 |
+
"should_training_stop": false
|
| 289 |
+
},
|
| 290 |
+
"attributes": {}
|
| 291 |
+
}
|
| 292 |
+
},
|
| 293 |
+
"total_flos": 9.957948339009064e+17,
|
| 294 |
+
"train_batch_size": 4,
|
| 295 |
+
"trial_name": null,
|
| 296 |
+
"trial_params": null
|
| 297 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.00985279561940916,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 16,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json
ADDED
|
@@ -0,0 +1,378 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 4.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1540,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.353258643448353,
|
| 14 |
+
"epoch": 0.1299545159194282,
|
| 15 |
+
"grad_norm": 3.010725975036621,
|
| 16 |
+
"learning_rate": 4.8475852375026876e-05,
|
| 17 |
+
"loss": 5.475971069335937,
|
| 18 |
+
"mean_token_accuracy": 0.7263440760970116,
|
| 19 |
+
"num_tokens": 128842.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.649170914888382,
|
| 24 |
+
"epoch": 0.2599090318388564,
|
| 25 |
+
"grad_norm": 1.9099390506744385,
|
| 26 |
+
"learning_rate": 9.794100785974817e-05,
|
| 27 |
+
"loss": 2.55168701171875,
|
| 28 |
+
"mean_token_accuracy": 0.8364580717682838,
|
| 29 |
+
"num_tokens": 255497.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5930788792669773,
|
| 34 |
+
"epoch": 0.3898635477582846,
|
| 35 |
+
"grad_norm": 2.1239051818847656,
|
| 36 |
+
"learning_rate": 0.0001474061633444695,
|
| 37 |
+
"loss": 2.3440716552734373,
|
| 38 |
+
"mean_token_accuracy": 0.8452290838956833,
|
| 39 |
+
"num_tokens": 372014.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.5564522063732147,
|
| 44 |
+
"epoch": 0.5198180636777128,
|
| 45 |
+
"grad_norm": 411.71807861328125,
|
| 46 |
+
"learning_rate": 0.00019687131882919077,
|
| 47 |
+
"loss": 2.2838446044921876,
|
| 48 |
+
"mean_token_accuracy": 0.8498487600684166,
|
| 49 |
+
"num_tokens": 500623.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.5539529167115689,
|
| 54 |
+
"epoch": 0.649772579597141,
|
| 55 |
+
"grad_norm": 2.1969902515411377,
|
| 56 |
+
"learning_rate": 0.0002463364743139121,
|
| 57 |
+
"loss": 2.675394287109375,
|
| 58 |
+
"mean_token_accuracy": 0.8430694487690925,
|
| 59 |
+
"num_tokens": 616223.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.5719467167556286,
|
| 64 |
+
"epoch": 0.7797270955165692,
|
| 65 |
+
"grad_norm": 1.98796546459198,
|
| 66 |
+
"learning_rate": 0.00029580162979863343,
|
| 67 |
+
"loss": 2.2434300231933593,
|
| 68 |
+
"mean_token_accuracy": 0.851241897046566,
|
| 69 |
+
"num_tokens": 737263.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.5502805083990097,
|
| 74 |
+
"epoch": 0.9096816114359974,
|
| 75 |
+
"grad_norm": 2.0211398601531982,
|
| 76 |
+
"learning_rate": 0.0003452667852833547,
|
| 77 |
+
"loss": 2.1729367065429686,
|
| 78 |
+
"mean_token_accuracy": 0.8554597494006156,
|
| 79 |
+
"num_tokens": 861477.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5580813550891784,
|
| 85 |
+
"eval_loss": 0.5830356478691101,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8432669037809739,
|
| 87 |
+
"eval_num_tokens": 944782.0,
|
| 88 |
+
"eval_runtime": 90.3664,
|
| 89 |
+
"eval_samples_per_second": 18.336,
|
| 90 |
+
"eval_steps_per_second": 2.302,
|
| 91 |
+
"step": 385
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.5498402091725987,
|
| 95 |
+
"epoch": 1.0389863547758285,
|
| 96 |
+
"grad_norm": 3.8034188747406006,
|
| 97 |
+
"learning_rate": 0.000380866355527619,
|
| 98 |
+
"loss": 2.113946990966797,
|
| 99 |
+
"mean_token_accuracy": 0.8578129452676629,
|
| 100 |
+
"num_tokens": 982803.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.5182110907137394,
|
| 105 |
+
"epoch": 1.1689408706952567,
|
| 106 |
+
"grad_norm": 2.7830824851989746,
|
| 107 |
+
"learning_rate": 0.0003805611725593471,
|
| 108 |
+
"loss": 1.9833453369140626,
|
| 109 |
+
"mean_token_accuracy": 0.8656822636723518,
|
| 110 |
+
"num_tokens": 1105926.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.5260789206624031,
|
| 115 |
+
"epoch": 1.2988953866146848,
|
| 116 |
+
"grad_norm": 1.7993361949920654,
|
| 117 |
+
"learning_rate": 0.0003798653399371568,
|
| 118 |
+
"loss": 2.006897430419922,
|
| 119 |
+
"mean_token_accuracy": 0.8631055191159248,
|
| 120 |
+
"num_tokens": 1229857.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.5327546864748001,
|
| 125 |
+
"epoch": 1.428849902534113,
|
| 126 |
+
"grad_norm": 1.7606678009033203,
|
| 127 |
+
"learning_rate": 0.0003787802874228295,
|
| 128 |
+
"loss": 2.020283050537109,
|
| 129 |
+
"mean_token_accuracy": 0.8638329988718033,
|
| 130 |
+
"num_tokens": 1352330.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.5285360223054886,
|
| 135 |
+
"epoch": 1.5588044184535412,
|
| 136 |
+
"grad_norm": 4.76006555557251,
|
| 137 |
+
"learning_rate": 0.00037730824452755275,
|
| 138 |
+
"loss": 1.9987391662597656,
|
| 139 |
+
"mean_token_accuracy": 0.8644696187973022,
|
| 140 |
+
"num_tokens": 1474790.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.5134804363548756,
|
| 145 |
+
"epoch": 1.6887589343729694,
|
| 146 |
+
"grad_norm": 1.8447264432907104,
|
| 147 |
+
"learning_rate": 0.000375452235930833,
|
| 148 |
+
"loss": 1.9669386291503905,
|
| 149 |
+
"mean_token_accuracy": 0.8659948265552521,
|
| 150 |
+
"num_tokens": 1600381.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.5371069309115409,
|
| 155 |
+
"epoch": 1.8187134502923976,
|
| 156 |
+
"grad_norm": 1.6537392139434814,
|
| 157 |
+
"learning_rate": 0.00037321607526553675,
|
| 158 |
+
"loss": 2.0411550903320315,
|
| 159 |
+
"mean_token_accuracy": 0.8624854254722595,
|
| 160 |
+
"num_tokens": 1716827.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"entropy": 0.5270501750707627,
|
| 165 |
+
"epoch": 1.9486679662118258,
|
| 166 |
+
"grad_norm": 2.6990911960601807,
|
| 167 |
+
"learning_rate": 0.00037060435728183,
|
| 168 |
+
"loss": 2.015792236328125,
|
| 169 |
+
"mean_token_accuracy": 0.8631013777852058,
|
| 170 |
+
"num_tokens": 1842798.0,
|
| 171 |
+
"step": 750
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.0,
|
| 175 |
+
"eval_entropy": 0.5477195472384875,
|
| 176 |
+
"eval_loss": 0.5585702657699585,
|
| 177 |
+
"eval_mean_token_accuracy": 0.8486175815073344,
|
| 178 |
+
"eval_num_tokens": 1889564.0,
|
| 179 |
+
"eval_runtime": 90.2194,
|
| 180 |
+
"eval_samples_per_second": 18.366,
|
| 181 |
+
"eval_steps_per_second": 2.305,
|
| 182 |
+
"step": 770
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.4782189565088282,
|
| 186 |
+
"epoch": 2.077972709551657,
|
| 187 |
+
"grad_norm": 2.041952610015869,
|
| 188 |
+
"learning_rate": 0.0003676224484061175,
|
| 189 |
+
"loss": 1.7843829345703126,
|
| 190 |
+
"mean_token_accuracy": 0.8739750406250881,
|
| 191 |
+
"num_tokens": 1959778.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.4443667846918106,
|
| 196 |
+
"epoch": 2.207927225471085,
|
| 197 |
+
"grad_norm": 16.27313804626465,
|
| 198 |
+
"learning_rate": 0.00036427647571437996,
|
| 199 |
+
"loss": 1.6559255981445313,
|
| 200 |
+
"mean_token_accuracy": 0.8808386281132699,
|
| 201 |
+
"num_tokens": 2087384.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.44861202985048293,
|
| 206 |
+
"epoch": 2.3378817413905133,
|
| 207 |
+
"grad_norm": 1.648870587348938,
|
| 208 |
+
"learning_rate": 0.0003605733143425679,
|
| 209 |
+
"loss": 1.677943878173828,
|
| 210 |
+
"mean_token_accuracy": 0.879555520415306,
|
| 211 |
+
"num_tokens": 2211962.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.4568726105988026,
|
| 216 |
+
"epoch": 2.4678362573099415,
|
| 217 |
+
"grad_norm": 1.7573126554489136,
|
| 218 |
+
"learning_rate": 0.00035652057335991866,
|
| 219 |
+
"loss": 1.6760734558105468,
|
| 220 |
+
"mean_token_accuracy": 0.8791913360357284,
|
| 221 |
+
"num_tokens": 2334838.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.44863338857889173,
|
| 226 |
+
"epoch": 2.5977907732293697,
|
| 227 |
+
"grad_norm": 1.8639047145843506,
|
| 228 |
+
"learning_rate": 0.00035212658013422465,
|
| 229 |
+
"loss": 1.6799411010742187,
|
| 230 |
+
"mean_token_accuracy": 0.8790675121545791,
|
| 231 |
+
"num_tokens": 2461732.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.4585830120742321,
|
| 236 |
+
"epoch": 2.727745289148798,
|
| 237 |
+
"grad_norm": 1.9825985431671143,
|
| 238 |
+
"learning_rate": 0.0003474003632211781,
|
| 239 |
+
"loss": 1.7172026062011718,
|
| 240 |
+
"mean_token_accuracy": 0.8782495930790901,
|
| 241 |
+
"num_tokens": 2580026.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.45422692246735097,
|
| 246 |
+
"epoch": 2.857699805068226,
|
| 247 |
+
"grad_norm": 1.7149962186813354,
|
| 248 |
+
"learning_rate": 0.00034235163381294995,
|
| 249 |
+
"loss": 1.679084014892578,
|
| 250 |
+
"mean_token_accuracy": 0.8795321774482727,
|
| 251 |
+
"num_tokens": 2705600.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"entropy": 0.47297614574432373,
|
| 256 |
+
"epoch": 2.9876543209876543,
|
| 257 |
+
"grad_norm": 1.7435617446899414,
|
| 258 |
+
"learning_rate": 0.0003369907657841221,
|
| 259 |
+
"loss": 1.7386201477050782,
|
| 260 |
+
"mean_token_accuracy": 0.8779115182161331,
|
| 261 |
+
"num_tokens": 2822808.0,
|
| 262 |
+
"step": 1150
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 3.0,
|
| 266 |
+
"eval_entropy": 0.5031588454372607,
|
| 267 |
+
"eval_loss": 0.5551120638847351,
|
| 268 |
+
"eval_mean_token_accuracy": 0.8531603300227568,
|
| 269 |
+
"eval_num_tokens": 2834346.0,
|
| 270 |
+
"eval_runtime": 90.2397,
|
| 271 |
+
"eval_samples_per_second": 18.362,
|
| 272 |
+
"eval_steps_per_second": 2.305,
|
| 273 |
+
"step": 1155
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.37655152073457615,
|
| 277 |
+
"epoch": 3.116959064327485,
|
| 278 |
+
"grad_norm": 1.504384160041809,
|
| 279 |
+
"learning_rate": 0.0003313287743759729,
|
| 280 |
+
"loss": 1.3653451538085937,
|
| 281 |
+
"mean_token_accuracy": 0.8971295344769655,
|
| 282 |
+
"num_tokens": 2939773.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.37069276951253416,
|
| 287 |
+
"epoch": 3.246913580246914,
|
| 288 |
+
"grad_norm": 1.9665946960449219,
|
| 289 |
+
"learning_rate": 0.0003253772935629151,
|
| 290 |
+
"loss": 1.3458108520507812,
|
| 291 |
+
"mean_token_accuracy": 0.8982205548882485,
|
| 292 |
+
"num_tokens": 3063617.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.37295883789658546,
|
| 297 |
+
"epoch": 3.3768680961663415,
|
| 298 |
+
"grad_norm": 1.7501362562179565,
|
| 299 |
+
"learning_rate": 0.00031914855214759165,
|
| 300 |
+
"loss": 1.357562255859375,
|
| 301 |
+
"mean_token_accuracy": 0.8977113124728203,
|
| 302 |
+
"num_tokens": 3189800.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.3805788069963455,
|
| 307 |
+
"epoch": 3.50682261208577,
|
| 308 |
+
"grad_norm": 1.7277154922485352,
|
| 309 |
+
"learning_rate": 0.00031265534863374894,
|
| 310 |
+
"loss": 1.3735618591308594,
|
| 311 |
+
"mean_token_accuracy": 0.8962143072485924,
|
| 312 |
+
"num_tokens": 3311908.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.3840580120682716,
|
| 317 |
+
"epoch": 3.636777128005198,
|
| 318 |
+
"grad_norm": 2.2338802814483643,
|
| 319 |
+
"learning_rate": 0.0003059110249285165,
|
| 320 |
+
"loss": 1.3903216552734374,
|
| 321 |
+
"mean_token_accuracy": 0.8958476388454437,
|
| 322 |
+
"num_tokens": 3432934.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.37621145449578763,
|
| 327 |
+
"epoch": 3.7667316439246266,
|
| 328 |
+
"grad_norm": 1.9029661417007446,
|
| 329 |
+
"learning_rate": 0.00029892943892812944,
|
| 330 |
+
"loss": 1.3776657104492187,
|
| 331 |
+
"mean_token_accuracy": 0.8964926180243492,
|
| 332 |
+
"num_tokens": 3561408.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"entropy": 0.3784803995490074,
|
| 337 |
+
"epoch": 3.8966861598440543,
|
| 338 |
+
"grad_norm": 2.089708089828491,
|
| 339 |
+
"learning_rate": 0.00029172493604342163,
|
| 340 |
+
"loss": 1.3816807556152344,
|
| 341 |
+
"mean_token_accuracy": 0.8962833172082901,
|
| 342 |
+
"num_tokens": 3684624.0,
|
| 343 |
+
"step": 1500
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"epoch": 4.0,
|
| 347 |
+
"eval_entropy": 0.4351254403591156,
|
| 348 |
+
"eval_loss": 0.5814722180366516,
|
| 349 |
+
"eval_mean_token_accuracy": 0.8530604747625498,
|
| 350 |
+
"eval_num_tokens": 3779128.0,
|
| 351 |
+
"eval_runtime": 90.2232,
|
| 352 |
+
"eval_samples_per_second": 18.366,
|
| 353 |
+
"eval_steps_per_second": 2.305,
|
| 354 |
+
"step": 1540
|
| 355 |
+
}
|
| 356 |
+
],
|
| 357 |
+
"logging_steps": 50,
|
| 358 |
+
"max_steps": 3850,
|
| 359 |
+
"num_input_tokens_seen": 0,
|
| 360 |
+
"num_train_epochs": 10,
|
| 361 |
+
"save_steps": 500,
|
| 362 |
+
"stateful_callbacks": {
|
| 363 |
+
"TrainerControl": {
|
| 364 |
+
"args": {
|
| 365 |
+
"should_epoch_stop": false,
|
| 366 |
+
"should_evaluate": false,
|
| 367 |
+
"should_log": false,
|
| 368 |
+
"should_save": true,
|
| 369 |
+
"should_training_stop": false
|
| 370 |
+
},
|
| 371 |
+
"attributes": {}
|
| 372 |
+
}
|
| 373 |
+
},
|
| 374 |
+
"total_flos": 1.3259599564032195e+18,
|
| 375 |
+
"train_batch_size": 4,
|
| 376 |
+
"trial_name": null,
|
| 377 |
+
"trial_params": null
|
| 378 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.00985279561940916,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 16,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 5.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1925,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.353258643448353,
|
| 14 |
+
"epoch": 0.1299545159194282,
|
| 15 |
+
"grad_norm": 3.010725975036621,
|
| 16 |
+
"learning_rate": 4.8475852375026876e-05,
|
| 17 |
+
"loss": 5.475971069335937,
|
| 18 |
+
"mean_token_accuracy": 0.7263440760970116,
|
| 19 |
+
"num_tokens": 128842.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.649170914888382,
|
| 24 |
+
"epoch": 0.2599090318388564,
|
| 25 |
+
"grad_norm": 1.9099390506744385,
|
| 26 |
+
"learning_rate": 9.794100785974817e-05,
|
| 27 |
+
"loss": 2.55168701171875,
|
| 28 |
+
"mean_token_accuracy": 0.8364580717682838,
|
| 29 |
+
"num_tokens": 255497.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5930788792669773,
|
| 34 |
+
"epoch": 0.3898635477582846,
|
| 35 |
+
"grad_norm": 2.1239051818847656,
|
| 36 |
+
"learning_rate": 0.0001474061633444695,
|
| 37 |
+
"loss": 2.3440716552734373,
|
| 38 |
+
"mean_token_accuracy": 0.8452290838956833,
|
| 39 |
+
"num_tokens": 372014.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.5564522063732147,
|
| 44 |
+
"epoch": 0.5198180636777128,
|
| 45 |
+
"grad_norm": 411.71807861328125,
|
| 46 |
+
"learning_rate": 0.00019687131882919077,
|
| 47 |
+
"loss": 2.2838446044921876,
|
| 48 |
+
"mean_token_accuracy": 0.8498487600684166,
|
| 49 |
+
"num_tokens": 500623.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.5539529167115689,
|
| 54 |
+
"epoch": 0.649772579597141,
|
| 55 |
+
"grad_norm": 2.1969902515411377,
|
| 56 |
+
"learning_rate": 0.0002463364743139121,
|
| 57 |
+
"loss": 2.675394287109375,
|
| 58 |
+
"mean_token_accuracy": 0.8430694487690925,
|
| 59 |
+
"num_tokens": 616223.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.5719467167556286,
|
| 64 |
+
"epoch": 0.7797270955165692,
|
| 65 |
+
"grad_norm": 1.98796546459198,
|
| 66 |
+
"learning_rate": 0.00029580162979863343,
|
| 67 |
+
"loss": 2.2434300231933593,
|
| 68 |
+
"mean_token_accuracy": 0.851241897046566,
|
| 69 |
+
"num_tokens": 737263.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.5502805083990097,
|
| 74 |
+
"epoch": 0.9096816114359974,
|
| 75 |
+
"grad_norm": 2.0211398601531982,
|
| 76 |
+
"learning_rate": 0.0003452667852833547,
|
| 77 |
+
"loss": 2.1729367065429686,
|
| 78 |
+
"mean_token_accuracy": 0.8554597494006156,
|
| 79 |
+
"num_tokens": 861477.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5580813550891784,
|
| 85 |
+
"eval_loss": 0.5830356478691101,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8432669037809739,
|
| 87 |
+
"eval_num_tokens": 944782.0,
|
| 88 |
+
"eval_runtime": 90.3664,
|
| 89 |
+
"eval_samples_per_second": 18.336,
|
| 90 |
+
"eval_steps_per_second": 2.302,
|
| 91 |
+
"step": 385
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.5498402091725987,
|
| 95 |
+
"epoch": 1.0389863547758285,
|
| 96 |
+
"grad_norm": 3.8034188747406006,
|
| 97 |
+
"learning_rate": 0.000380866355527619,
|
| 98 |
+
"loss": 2.113946990966797,
|
| 99 |
+
"mean_token_accuracy": 0.8578129452676629,
|
| 100 |
+
"num_tokens": 982803.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.5182110907137394,
|
| 105 |
+
"epoch": 1.1689408706952567,
|
| 106 |
+
"grad_norm": 2.7830824851989746,
|
| 107 |
+
"learning_rate": 0.0003805611725593471,
|
| 108 |
+
"loss": 1.9833453369140626,
|
| 109 |
+
"mean_token_accuracy": 0.8656822636723518,
|
| 110 |
+
"num_tokens": 1105926.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.5260789206624031,
|
| 115 |
+
"epoch": 1.2988953866146848,
|
| 116 |
+
"grad_norm": 1.7993361949920654,
|
| 117 |
+
"learning_rate": 0.0003798653399371568,
|
| 118 |
+
"loss": 2.006897430419922,
|
| 119 |
+
"mean_token_accuracy": 0.8631055191159248,
|
| 120 |
+
"num_tokens": 1229857.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.5327546864748001,
|
| 125 |
+
"epoch": 1.428849902534113,
|
| 126 |
+
"grad_norm": 1.7606678009033203,
|
| 127 |
+
"learning_rate": 0.0003787802874228295,
|
| 128 |
+
"loss": 2.020283050537109,
|
| 129 |
+
"mean_token_accuracy": 0.8638329988718033,
|
| 130 |
+
"num_tokens": 1352330.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.5285360223054886,
|
| 135 |
+
"epoch": 1.5588044184535412,
|
| 136 |
+
"grad_norm": 4.76006555557251,
|
| 137 |
+
"learning_rate": 0.00037730824452755275,
|
| 138 |
+
"loss": 1.9987391662597656,
|
| 139 |
+
"mean_token_accuracy": 0.8644696187973022,
|
| 140 |
+
"num_tokens": 1474790.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.5134804363548756,
|
| 145 |
+
"epoch": 1.6887589343729694,
|
| 146 |
+
"grad_norm": 1.8447264432907104,
|
| 147 |
+
"learning_rate": 0.000375452235930833,
|
| 148 |
+
"loss": 1.9669386291503905,
|
| 149 |
+
"mean_token_accuracy": 0.8659948265552521,
|
| 150 |
+
"num_tokens": 1600381.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.5371069309115409,
|
| 155 |
+
"epoch": 1.8187134502923976,
|
| 156 |
+
"grad_norm": 1.6537392139434814,
|
| 157 |
+
"learning_rate": 0.00037321607526553675,
|
| 158 |
+
"loss": 2.0411550903320315,
|
| 159 |
+
"mean_token_accuracy": 0.8624854254722595,
|
| 160 |
+
"num_tokens": 1716827.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"entropy": 0.5270501750707627,
|
| 165 |
+
"epoch": 1.9486679662118258,
|
| 166 |
+
"grad_norm": 2.6990911960601807,
|
| 167 |
+
"learning_rate": 0.00037060435728183,
|
| 168 |
+
"loss": 2.015792236328125,
|
| 169 |
+
"mean_token_accuracy": 0.8631013777852058,
|
| 170 |
+
"num_tokens": 1842798.0,
|
| 171 |
+
"step": 750
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.0,
|
| 175 |
+
"eval_entropy": 0.5477195472384875,
|
| 176 |
+
"eval_loss": 0.5585702657699585,
|
| 177 |
+
"eval_mean_token_accuracy": 0.8486175815073344,
|
| 178 |
+
"eval_num_tokens": 1889564.0,
|
| 179 |
+
"eval_runtime": 90.2194,
|
| 180 |
+
"eval_samples_per_second": 18.366,
|
| 181 |
+
"eval_steps_per_second": 2.305,
|
| 182 |
+
"step": 770
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.4782189565088282,
|
| 186 |
+
"epoch": 2.077972709551657,
|
| 187 |
+
"grad_norm": 2.041952610015869,
|
| 188 |
+
"learning_rate": 0.0003676224484061175,
|
| 189 |
+
"loss": 1.7843829345703126,
|
| 190 |
+
"mean_token_accuracy": 0.8739750406250881,
|
| 191 |
+
"num_tokens": 1959778.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.4443667846918106,
|
| 196 |
+
"epoch": 2.207927225471085,
|
| 197 |
+
"grad_norm": 16.27313804626465,
|
| 198 |
+
"learning_rate": 0.00036427647571437996,
|
| 199 |
+
"loss": 1.6559255981445313,
|
| 200 |
+
"mean_token_accuracy": 0.8808386281132699,
|
| 201 |
+
"num_tokens": 2087384.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.44861202985048293,
|
| 206 |
+
"epoch": 2.3378817413905133,
|
| 207 |
+
"grad_norm": 1.648870587348938,
|
| 208 |
+
"learning_rate": 0.0003605733143425679,
|
| 209 |
+
"loss": 1.677943878173828,
|
| 210 |
+
"mean_token_accuracy": 0.879555520415306,
|
| 211 |
+
"num_tokens": 2211962.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.4568726105988026,
|
| 216 |
+
"epoch": 2.4678362573099415,
|
| 217 |
+
"grad_norm": 1.7573126554489136,
|
| 218 |
+
"learning_rate": 0.00035652057335991866,
|
| 219 |
+
"loss": 1.6760734558105468,
|
| 220 |
+
"mean_token_accuracy": 0.8791913360357284,
|
| 221 |
+
"num_tokens": 2334838.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.44863338857889173,
|
| 226 |
+
"epoch": 2.5977907732293697,
|
| 227 |
+
"grad_norm": 1.8639047145843506,
|
| 228 |
+
"learning_rate": 0.00035212658013422465,
|
| 229 |
+
"loss": 1.6799411010742187,
|
| 230 |
+
"mean_token_accuracy": 0.8790675121545791,
|
| 231 |
+
"num_tokens": 2461732.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.4585830120742321,
|
| 236 |
+
"epoch": 2.727745289148798,
|
| 237 |
+
"grad_norm": 1.9825985431671143,
|
| 238 |
+
"learning_rate": 0.0003474003632211781,
|
| 239 |
+
"loss": 1.7172026062011718,
|
| 240 |
+
"mean_token_accuracy": 0.8782495930790901,
|
| 241 |
+
"num_tokens": 2580026.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.45422692246735097,
|
| 246 |
+
"epoch": 2.857699805068226,
|
| 247 |
+
"grad_norm": 1.7149962186813354,
|
| 248 |
+
"learning_rate": 0.00034235163381294995,
|
| 249 |
+
"loss": 1.679084014892578,
|
| 250 |
+
"mean_token_accuracy": 0.8795321774482727,
|
| 251 |
+
"num_tokens": 2705600.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"entropy": 0.47297614574432373,
|
| 256 |
+
"epoch": 2.9876543209876543,
|
| 257 |
+
"grad_norm": 1.7435617446899414,
|
| 258 |
+
"learning_rate": 0.0003369907657841221,
|
| 259 |
+
"loss": 1.7386201477050782,
|
| 260 |
+
"mean_token_accuracy": 0.8779115182161331,
|
| 261 |
+
"num_tokens": 2822808.0,
|
| 262 |
+
"step": 1150
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 3.0,
|
| 266 |
+
"eval_entropy": 0.5031588454372607,
|
| 267 |
+
"eval_loss": 0.5551120638847351,
|
| 268 |
+
"eval_mean_token_accuracy": 0.8531603300227568,
|
| 269 |
+
"eval_num_tokens": 2834346.0,
|
| 270 |
+
"eval_runtime": 90.2397,
|
| 271 |
+
"eval_samples_per_second": 18.362,
|
| 272 |
+
"eval_steps_per_second": 2.305,
|
| 273 |
+
"step": 1155
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.37655152073457615,
|
| 277 |
+
"epoch": 3.116959064327485,
|
| 278 |
+
"grad_norm": 1.504384160041809,
|
| 279 |
+
"learning_rate": 0.0003313287743759729,
|
| 280 |
+
"loss": 1.3653451538085937,
|
| 281 |
+
"mean_token_accuracy": 0.8971295344769655,
|
| 282 |
+
"num_tokens": 2939773.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.37069276951253416,
|
| 287 |
+
"epoch": 3.246913580246914,
|
| 288 |
+
"grad_norm": 1.9665946960449219,
|
| 289 |
+
"learning_rate": 0.0003253772935629151,
|
| 290 |
+
"loss": 1.3458108520507812,
|
| 291 |
+
"mean_token_accuracy": 0.8982205548882485,
|
| 292 |
+
"num_tokens": 3063617.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.37295883789658546,
|
| 297 |
+
"epoch": 3.3768680961663415,
|
| 298 |
+
"grad_norm": 1.7501362562179565,
|
| 299 |
+
"learning_rate": 0.00031914855214759165,
|
| 300 |
+
"loss": 1.357562255859375,
|
| 301 |
+
"mean_token_accuracy": 0.8977113124728203,
|
| 302 |
+
"num_tokens": 3189800.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.3805788069963455,
|
| 307 |
+
"epoch": 3.50682261208577,
|
| 308 |
+
"grad_norm": 1.7277154922485352,
|
| 309 |
+
"learning_rate": 0.00031265534863374894,
|
| 310 |
+
"loss": 1.3735618591308594,
|
| 311 |
+
"mean_token_accuracy": 0.8962143072485924,
|
| 312 |
+
"num_tokens": 3311908.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.3840580120682716,
|
| 317 |
+
"epoch": 3.636777128005198,
|
| 318 |
+
"grad_norm": 2.2338802814483643,
|
| 319 |
+
"learning_rate": 0.0003059110249285165,
|
| 320 |
+
"loss": 1.3903216552734374,
|
| 321 |
+
"mean_token_accuracy": 0.8958476388454437,
|
| 322 |
+
"num_tokens": 3432934.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.37621145449578763,
|
| 327 |
+
"epoch": 3.7667316439246266,
|
| 328 |
+
"grad_norm": 1.9029661417007446,
|
| 329 |
+
"learning_rate": 0.00029892943892812944,
|
| 330 |
+
"loss": 1.3776657104492187,
|
| 331 |
+
"mean_token_accuracy": 0.8964926180243492,
|
| 332 |
+
"num_tokens": 3561408.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"entropy": 0.3784803995490074,
|
| 337 |
+
"epoch": 3.8966861598440543,
|
| 338 |
+
"grad_norm": 2.089708089828491,
|
| 339 |
+
"learning_rate": 0.00029172493604342163,
|
| 340 |
+
"loss": 1.3816807556152344,
|
| 341 |
+
"mean_token_accuracy": 0.8962833172082901,
|
| 342 |
+
"num_tokens": 3684624.0,
|
| 343 |
+
"step": 1500
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"epoch": 4.0,
|
| 347 |
+
"eval_entropy": 0.4351254403591156,
|
| 348 |
+
"eval_loss": 0.5814722180366516,
|
| 349 |
+
"eval_mean_token_accuracy": 0.8530604747625498,
|
| 350 |
+
"eval_num_tokens": 3779128.0,
|
| 351 |
+
"eval_runtime": 90.2232,
|
| 352 |
+
"eval_samples_per_second": 18.366,
|
| 353 |
+
"eval_steps_per_second": 2.305,
|
| 354 |
+
"step": 1540
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"entropy": 0.36326556409423677,
|
| 358 |
+
"epoch": 4.025990903183885,
|
| 359 |
+
"grad_norm": 2.1354947090148926,
|
| 360 |
+
"learning_rate": 0.0002843123197235993,
|
| 361 |
+
"loss": 1.3295362854003907,
|
| 362 |
+
"mean_token_accuracy": 0.8993093811686913,
|
| 363 |
+
"num_tokens": 3804993.0,
|
| 364 |
+
"step": 1550
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"entropy": 0.2879397062957287,
|
| 368 |
+
"epoch": 4.155945419103314,
|
| 369 |
+
"grad_norm": 2.201097011566162,
|
| 370 |
+
"learning_rate": 0.0002767068210388601,
|
| 371 |
+
"loss": 1.0272974395751953,
|
| 372 |
+
"mean_token_accuracy": 0.9182627710700035,
|
| 373 |
+
"num_tokens": 3928162.0,
|
| 374 |
+
"step": 1600
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"entropy": 0.2848948486149311,
|
| 378 |
+
"epoch": 4.2858999350227425,
|
| 379 |
+
"grad_norm": 2.01479172706604,
|
| 380 |
+
"learning_rate": 0.000268924067384358,
|
| 381 |
+
"loss": 1.0278727722167968,
|
| 382 |
+
"mean_token_accuracy": 0.9194766515493393,
|
| 383 |
+
"num_tokens": 4049012.0,
|
| 384 |
+
"step": 1650
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"entropy": 0.2940504560619593,
|
| 388 |
+
"epoch": 4.41585445094217,
|
| 389 |
+
"grad_norm": 2.0893027782440186,
|
| 390 |
+
"learning_rate": 0.00026098005036982003,
|
| 391 |
+
"loss": 1.0586751556396485,
|
| 392 |
+
"mean_token_accuracy": 0.9167885810136795,
|
| 393 |
+
"num_tokens": 4167845.0,
|
| 394 |
+
"step": 1700
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"entropy": 0.293505182415247,
|
| 398 |
+
"epoch": 4.545808966861598,
|
| 399 |
+
"grad_norm": 1.6346389055252075,
|
| 400 |
+
"learning_rate": 0.0002528910929607928,
|
| 401 |
+
"loss": 1.0669570922851563,
|
| 402 |
+
"mean_token_accuracy": 0.9160876458883286,
|
| 403 |
+
"num_tokens": 4287505.0,
|
| 404 |
+
"step": 1750
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"entropy": 0.2898535231500864,
|
| 408 |
+
"epoch": 4.675763482781027,
|
| 409 |
+
"grad_norm": 1.6645033359527588,
|
| 410 |
+
"learning_rate": 0.0002446738159390364,
|
| 411 |
+
"loss": 1.0582612609863282,
|
| 412 |
+
"mean_token_accuracy": 0.9177632886171341,
|
| 413 |
+
"num_tokens": 4412221.0,
|
| 414 |
+
"step": 1800
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"entropy": 0.2842763290554285,
|
| 418 |
+
"epoch": 4.805717998700455,
|
| 419 |
+
"grad_norm": 2.4594268798828125,
|
| 420 |
+
"learning_rate": 0.0002363451037509798,
|
| 421 |
+
"loss": 1.0467537689208983,
|
| 422 |
+
"mean_token_accuracy": 0.9177608361840248,
|
| 423 |
+
"num_tokens": 4537178.0,
|
| 424 |
+
"step": 1850
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"entropy": 0.284430123642087,
|
| 428 |
+
"epoch": 4.935672514619883,
|
| 429 |
+
"grad_norm": 2.1724514961242676,
|
| 430 |
+
"learning_rate": 0.00022792206981441223,
|
| 431 |
+
"loss": 1.0753899383544923,
|
| 432 |
+
"mean_token_accuracy": 0.915192686021328,
|
| 433 |
+
"num_tokens": 4664196.0,
|
| 434 |
+
"step": 1900
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 5.0,
|
| 438 |
+
"eval_entropy": 0.3632780872285366,
|
| 439 |
+
"eval_loss": 0.6438126564025879,
|
| 440 |
+
"eval_mean_token_accuracy": 0.8511462942338907,
|
| 441 |
+
"eval_num_tokens": 4723910.0,
|
| 442 |
+
"eval_runtime": 90.1846,
|
| 443 |
+
"eval_samples_per_second": 18.373,
|
| 444 |
+
"eval_steps_per_second": 2.306,
|
| 445 |
+
"step": 1925
|
| 446 |
+
}
|
| 447 |
+
],
|
| 448 |
+
"logging_steps": 50,
|
| 449 |
+
"max_steps": 3850,
|
| 450 |
+
"num_input_tokens_seen": 0,
|
| 451 |
+
"num_train_epochs": 10,
|
| 452 |
+
"save_steps": 500,
|
| 453 |
+
"stateful_callbacks": {
|
| 454 |
+
"TrainerControl": {
|
| 455 |
+
"args": {
|
| 456 |
+
"should_epoch_stop": false,
|
| 457 |
+
"should_evaluate": false,
|
| 458 |
+
"should_log": false,
|
| 459 |
+
"should_save": true,
|
| 460 |
+
"should_training_stop": false
|
| 461 |
+
},
|
| 462 |
+
"attributes": {}
|
| 463 |
+
}
|
| 464 |
+
},
|
| 465 |
+
"total_flos": 1.6564080889424607e+18,
|
| 466 |
+
"train_batch_size": 4,
|
| 467 |
+
"trial_name": null,
|
| 468 |
+
"trial_params": null
|
| 469 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.00985279561940916,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 16,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json
ADDED
|
@@ -0,0 +1,560 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 6.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2310,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.353258643448353,
|
| 14 |
+
"epoch": 0.1299545159194282,
|
| 15 |
+
"grad_norm": 3.010725975036621,
|
| 16 |
+
"learning_rate": 4.8475852375026876e-05,
|
| 17 |
+
"loss": 5.475971069335937,
|
| 18 |
+
"mean_token_accuracy": 0.7263440760970116,
|
| 19 |
+
"num_tokens": 128842.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.649170914888382,
|
| 24 |
+
"epoch": 0.2599090318388564,
|
| 25 |
+
"grad_norm": 1.9099390506744385,
|
| 26 |
+
"learning_rate": 9.794100785974817e-05,
|
| 27 |
+
"loss": 2.55168701171875,
|
| 28 |
+
"mean_token_accuracy": 0.8364580717682838,
|
| 29 |
+
"num_tokens": 255497.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5930788792669773,
|
| 34 |
+
"epoch": 0.3898635477582846,
|
| 35 |
+
"grad_norm": 2.1239051818847656,
|
| 36 |
+
"learning_rate": 0.0001474061633444695,
|
| 37 |
+
"loss": 2.3440716552734373,
|
| 38 |
+
"mean_token_accuracy": 0.8452290838956833,
|
| 39 |
+
"num_tokens": 372014.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.5564522063732147,
|
| 44 |
+
"epoch": 0.5198180636777128,
|
| 45 |
+
"grad_norm": 411.71807861328125,
|
| 46 |
+
"learning_rate": 0.00019687131882919077,
|
| 47 |
+
"loss": 2.2838446044921876,
|
| 48 |
+
"mean_token_accuracy": 0.8498487600684166,
|
| 49 |
+
"num_tokens": 500623.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.5539529167115689,
|
| 54 |
+
"epoch": 0.649772579597141,
|
| 55 |
+
"grad_norm": 2.1969902515411377,
|
| 56 |
+
"learning_rate": 0.0002463364743139121,
|
| 57 |
+
"loss": 2.675394287109375,
|
| 58 |
+
"mean_token_accuracy": 0.8430694487690925,
|
| 59 |
+
"num_tokens": 616223.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.5719467167556286,
|
| 64 |
+
"epoch": 0.7797270955165692,
|
| 65 |
+
"grad_norm": 1.98796546459198,
|
| 66 |
+
"learning_rate": 0.00029580162979863343,
|
| 67 |
+
"loss": 2.2434300231933593,
|
| 68 |
+
"mean_token_accuracy": 0.851241897046566,
|
| 69 |
+
"num_tokens": 737263.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.5502805083990097,
|
| 74 |
+
"epoch": 0.9096816114359974,
|
| 75 |
+
"grad_norm": 2.0211398601531982,
|
| 76 |
+
"learning_rate": 0.0003452667852833547,
|
| 77 |
+
"loss": 2.1729367065429686,
|
| 78 |
+
"mean_token_accuracy": 0.8554597494006156,
|
| 79 |
+
"num_tokens": 861477.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5580813550891784,
|
| 85 |
+
"eval_loss": 0.5830356478691101,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8432669037809739,
|
| 87 |
+
"eval_num_tokens": 944782.0,
|
| 88 |
+
"eval_runtime": 90.3664,
|
| 89 |
+
"eval_samples_per_second": 18.336,
|
| 90 |
+
"eval_steps_per_second": 2.302,
|
| 91 |
+
"step": 385
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.5498402091725987,
|
| 95 |
+
"epoch": 1.0389863547758285,
|
| 96 |
+
"grad_norm": 3.8034188747406006,
|
| 97 |
+
"learning_rate": 0.000380866355527619,
|
| 98 |
+
"loss": 2.113946990966797,
|
| 99 |
+
"mean_token_accuracy": 0.8578129452676629,
|
| 100 |
+
"num_tokens": 982803.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.5182110907137394,
|
| 105 |
+
"epoch": 1.1689408706952567,
|
| 106 |
+
"grad_norm": 2.7830824851989746,
|
| 107 |
+
"learning_rate": 0.0003805611725593471,
|
| 108 |
+
"loss": 1.9833453369140626,
|
| 109 |
+
"mean_token_accuracy": 0.8656822636723518,
|
| 110 |
+
"num_tokens": 1105926.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.5260789206624031,
|
| 115 |
+
"epoch": 1.2988953866146848,
|
| 116 |
+
"grad_norm": 1.7993361949920654,
|
| 117 |
+
"learning_rate": 0.0003798653399371568,
|
| 118 |
+
"loss": 2.006897430419922,
|
| 119 |
+
"mean_token_accuracy": 0.8631055191159248,
|
| 120 |
+
"num_tokens": 1229857.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.5327546864748001,
|
| 125 |
+
"epoch": 1.428849902534113,
|
| 126 |
+
"grad_norm": 1.7606678009033203,
|
| 127 |
+
"learning_rate": 0.0003787802874228295,
|
| 128 |
+
"loss": 2.020283050537109,
|
| 129 |
+
"mean_token_accuracy": 0.8638329988718033,
|
| 130 |
+
"num_tokens": 1352330.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.5285360223054886,
|
| 135 |
+
"epoch": 1.5588044184535412,
|
| 136 |
+
"grad_norm": 4.76006555557251,
|
| 137 |
+
"learning_rate": 0.00037730824452755275,
|
| 138 |
+
"loss": 1.9987391662597656,
|
| 139 |
+
"mean_token_accuracy": 0.8644696187973022,
|
| 140 |
+
"num_tokens": 1474790.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.5134804363548756,
|
| 145 |
+
"epoch": 1.6887589343729694,
|
| 146 |
+
"grad_norm": 1.8447264432907104,
|
| 147 |
+
"learning_rate": 0.000375452235930833,
|
| 148 |
+
"loss": 1.9669386291503905,
|
| 149 |
+
"mean_token_accuracy": 0.8659948265552521,
|
| 150 |
+
"num_tokens": 1600381.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.5371069309115409,
|
| 155 |
+
"epoch": 1.8187134502923976,
|
| 156 |
+
"grad_norm": 1.6537392139434814,
|
| 157 |
+
"learning_rate": 0.00037321607526553675,
|
| 158 |
+
"loss": 2.0411550903320315,
|
| 159 |
+
"mean_token_accuracy": 0.8624854254722595,
|
| 160 |
+
"num_tokens": 1716827.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"entropy": 0.5270501750707627,
|
| 165 |
+
"epoch": 1.9486679662118258,
|
| 166 |
+
"grad_norm": 2.6990911960601807,
|
| 167 |
+
"learning_rate": 0.00037060435728183,
|
| 168 |
+
"loss": 2.015792236328125,
|
| 169 |
+
"mean_token_accuracy": 0.8631013777852058,
|
| 170 |
+
"num_tokens": 1842798.0,
|
| 171 |
+
"step": 750
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.0,
|
| 175 |
+
"eval_entropy": 0.5477195472384875,
|
| 176 |
+
"eval_loss": 0.5585702657699585,
|
| 177 |
+
"eval_mean_token_accuracy": 0.8486175815073344,
|
| 178 |
+
"eval_num_tokens": 1889564.0,
|
| 179 |
+
"eval_runtime": 90.2194,
|
| 180 |
+
"eval_samples_per_second": 18.366,
|
| 181 |
+
"eval_steps_per_second": 2.305,
|
| 182 |
+
"step": 770
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.4782189565088282,
|
| 186 |
+
"epoch": 2.077972709551657,
|
| 187 |
+
"grad_norm": 2.041952610015869,
|
| 188 |
+
"learning_rate": 0.0003676224484061175,
|
| 189 |
+
"loss": 1.7843829345703126,
|
| 190 |
+
"mean_token_accuracy": 0.8739750406250881,
|
| 191 |
+
"num_tokens": 1959778.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.4443667846918106,
|
| 196 |
+
"epoch": 2.207927225471085,
|
| 197 |
+
"grad_norm": 16.27313804626465,
|
| 198 |
+
"learning_rate": 0.00036427647571437996,
|
| 199 |
+
"loss": 1.6559255981445313,
|
| 200 |
+
"mean_token_accuracy": 0.8808386281132699,
|
| 201 |
+
"num_tokens": 2087384.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.44861202985048293,
|
| 206 |
+
"epoch": 2.3378817413905133,
|
| 207 |
+
"grad_norm": 1.648870587348938,
|
| 208 |
+
"learning_rate": 0.0003605733143425679,
|
| 209 |
+
"loss": 1.677943878173828,
|
| 210 |
+
"mean_token_accuracy": 0.879555520415306,
|
| 211 |
+
"num_tokens": 2211962.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.4568726105988026,
|
| 216 |
+
"epoch": 2.4678362573099415,
|
| 217 |
+
"grad_norm": 1.7573126554489136,
|
| 218 |
+
"learning_rate": 0.00035652057335991866,
|
| 219 |
+
"loss": 1.6760734558105468,
|
| 220 |
+
"mean_token_accuracy": 0.8791913360357284,
|
| 221 |
+
"num_tokens": 2334838.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.44863338857889173,
|
| 226 |
+
"epoch": 2.5977907732293697,
|
| 227 |
+
"grad_norm": 1.8639047145843506,
|
| 228 |
+
"learning_rate": 0.00035212658013422465,
|
| 229 |
+
"loss": 1.6799411010742187,
|
| 230 |
+
"mean_token_accuracy": 0.8790675121545791,
|
| 231 |
+
"num_tokens": 2461732.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.4585830120742321,
|
| 236 |
+
"epoch": 2.727745289148798,
|
| 237 |
+
"grad_norm": 1.9825985431671143,
|
| 238 |
+
"learning_rate": 0.0003474003632211781,
|
| 239 |
+
"loss": 1.7172026062011718,
|
| 240 |
+
"mean_token_accuracy": 0.8782495930790901,
|
| 241 |
+
"num_tokens": 2580026.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.45422692246735097,
|
| 246 |
+
"epoch": 2.857699805068226,
|
| 247 |
+
"grad_norm": 1.7149962186813354,
|
| 248 |
+
"learning_rate": 0.00034235163381294995,
|
| 249 |
+
"loss": 1.679084014892578,
|
| 250 |
+
"mean_token_accuracy": 0.8795321774482727,
|
| 251 |
+
"num_tokens": 2705600.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"entropy": 0.47297614574432373,
|
| 256 |
+
"epoch": 2.9876543209876543,
|
| 257 |
+
"grad_norm": 1.7435617446899414,
|
| 258 |
+
"learning_rate": 0.0003369907657841221,
|
| 259 |
+
"loss": 1.7386201477050782,
|
| 260 |
+
"mean_token_accuracy": 0.8779115182161331,
|
| 261 |
+
"num_tokens": 2822808.0,
|
| 262 |
+
"step": 1150
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 3.0,
|
| 266 |
+
"eval_entropy": 0.5031588454372607,
|
| 267 |
+
"eval_loss": 0.5551120638847351,
|
| 268 |
+
"eval_mean_token_accuracy": 0.8531603300227568,
|
| 269 |
+
"eval_num_tokens": 2834346.0,
|
| 270 |
+
"eval_runtime": 90.2397,
|
| 271 |
+
"eval_samples_per_second": 18.362,
|
| 272 |
+
"eval_steps_per_second": 2.305,
|
| 273 |
+
"step": 1155
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.37655152073457615,
|
| 277 |
+
"epoch": 3.116959064327485,
|
| 278 |
+
"grad_norm": 1.504384160041809,
|
| 279 |
+
"learning_rate": 0.0003313287743759729,
|
| 280 |
+
"loss": 1.3653451538085937,
|
| 281 |
+
"mean_token_accuracy": 0.8971295344769655,
|
| 282 |
+
"num_tokens": 2939773.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.37069276951253416,
|
| 287 |
+
"epoch": 3.246913580246914,
|
| 288 |
+
"grad_norm": 1.9665946960449219,
|
| 289 |
+
"learning_rate": 0.0003253772935629151,
|
| 290 |
+
"loss": 1.3458108520507812,
|
| 291 |
+
"mean_token_accuracy": 0.8982205548882485,
|
| 292 |
+
"num_tokens": 3063617.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.37295883789658546,
|
| 297 |
+
"epoch": 3.3768680961663415,
|
| 298 |
+
"grad_norm": 1.7501362562179565,
|
| 299 |
+
"learning_rate": 0.00031914855214759165,
|
| 300 |
+
"loss": 1.357562255859375,
|
| 301 |
+
"mean_token_accuracy": 0.8977113124728203,
|
| 302 |
+
"num_tokens": 3189800.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.3805788069963455,
|
| 307 |
+
"epoch": 3.50682261208577,
|
| 308 |
+
"grad_norm": 1.7277154922485352,
|
| 309 |
+
"learning_rate": 0.00031265534863374894,
|
| 310 |
+
"loss": 1.3735618591308594,
|
| 311 |
+
"mean_token_accuracy": 0.8962143072485924,
|
| 312 |
+
"num_tokens": 3311908.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.3840580120682716,
|
| 317 |
+
"epoch": 3.636777128005198,
|
| 318 |
+
"grad_norm": 2.2338802814483643,
|
| 319 |
+
"learning_rate": 0.0003059110249285165,
|
| 320 |
+
"loss": 1.3903216552734374,
|
| 321 |
+
"mean_token_accuracy": 0.8958476388454437,
|
| 322 |
+
"num_tokens": 3432934.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.37621145449578763,
|
| 327 |
+
"epoch": 3.7667316439246266,
|
| 328 |
+
"grad_norm": 1.9029661417007446,
|
| 329 |
+
"learning_rate": 0.00029892943892812944,
|
| 330 |
+
"loss": 1.3776657104492187,
|
| 331 |
+
"mean_token_accuracy": 0.8964926180243492,
|
| 332 |
+
"num_tokens": 3561408.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"entropy": 0.3784803995490074,
|
| 337 |
+
"epoch": 3.8966861598440543,
|
| 338 |
+
"grad_norm": 2.089708089828491,
|
| 339 |
+
"learning_rate": 0.00029172493604342163,
|
| 340 |
+
"loss": 1.3816807556152344,
|
| 341 |
+
"mean_token_accuracy": 0.8962833172082901,
|
| 342 |
+
"num_tokens": 3684624.0,
|
| 343 |
+
"step": 1500
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"epoch": 4.0,
|
| 347 |
+
"eval_entropy": 0.4351254403591156,
|
| 348 |
+
"eval_loss": 0.5814722180366516,
|
| 349 |
+
"eval_mean_token_accuracy": 0.8530604747625498,
|
| 350 |
+
"eval_num_tokens": 3779128.0,
|
| 351 |
+
"eval_runtime": 90.2232,
|
| 352 |
+
"eval_samples_per_second": 18.366,
|
| 353 |
+
"eval_steps_per_second": 2.305,
|
| 354 |
+
"step": 1540
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"entropy": 0.36326556409423677,
|
| 358 |
+
"epoch": 4.025990903183885,
|
| 359 |
+
"grad_norm": 2.1354947090148926,
|
| 360 |
+
"learning_rate": 0.0002843123197235993,
|
| 361 |
+
"loss": 1.3295362854003907,
|
| 362 |
+
"mean_token_accuracy": 0.8993093811686913,
|
| 363 |
+
"num_tokens": 3804993.0,
|
| 364 |
+
"step": 1550
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"entropy": 0.2879397062957287,
|
| 368 |
+
"epoch": 4.155945419103314,
|
| 369 |
+
"grad_norm": 2.201097011566162,
|
| 370 |
+
"learning_rate": 0.0002767068210388601,
|
| 371 |
+
"loss": 1.0272974395751953,
|
| 372 |
+
"mean_token_accuracy": 0.9182627710700035,
|
| 373 |
+
"num_tokens": 3928162.0,
|
| 374 |
+
"step": 1600
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"entropy": 0.2848948486149311,
|
| 378 |
+
"epoch": 4.2858999350227425,
|
| 379 |
+
"grad_norm": 2.01479172706604,
|
| 380 |
+
"learning_rate": 0.000268924067384358,
|
| 381 |
+
"loss": 1.0278727722167968,
|
| 382 |
+
"mean_token_accuracy": 0.9194766515493393,
|
| 383 |
+
"num_tokens": 4049012.0,
|
| 384 |
+
"step": 1650
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"entropy": 0.2940504560619593,
|
| 388 |
+
"epoch": 4.41585445094217,
|
| 389 |
+
"grad_norm": 2.0893027782440186,
|
| 390 |
+
"learning_rate": 0.00026098005036982003,
|
| 391 |
+
"loss": 1.0586751556396485,
|
| 392 |
+
"mean_token_accuracy": 0.9167885810136795,
|
| 393 |
+
"num_tokens": 4167845.0,
|
| 394 |
+
"step": 1700
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"entropy": 0.293505182415247,
|
| 398 |
+
"epoch": 4.545808966861598,
|
| 399 |
+
"grad_norm": 1.6346389055252075,
|
| 400 |
+
"learning_rate": 0.0002528910929607928,
|
| 401 |
+
"loss": 1.0669570922851563,
|
| 402 |
+
"mean_token_accuracy": 0.9160876458883286,
|
| 403 |
+
"num_tokens": 4287505.0,
|
| 404 |
+
"step": 1750
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"entropy": 0.2898535231500864,
|
| 408 |
+
"epoch": 4.675763482781027,
|
| 409 |
+
"grad_norm": 1.6645033359527588,
|
| 410 |
+
"learning_rate": 0.0002446738159390364,
|
| 411 |
+
"loss": 1.0582612609863282,
|
| 412 |
+
"mean_token_accuracy": 0.9177632886171341,
|
| 413 |
+
"num_tokens": 4412221.0,
|
| 414 |
+
"step": 1800
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"entropy": 0.2842763290554285,
|
| 418 |
+
"epoch": 4.805717998700455,
|
| 419 |
+
"grad_norm": 2.4594268798828125,
|
| 420 |
+
"learning_rate": 0.0002363451037509798,
|
| 421 |
+
"loss": 1.0467537689208983,
|
| 422 |
+
"mean_token_accuracy": 0.9177608361840248,
|
| 423 |
+
"num_tokens": 4537178.0,
|
| 424 |
+
"step": 1850
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"entropy": 0.284430123642087,
|
| 428 |
+
"epoch": 4.935672514619883,
|
| 429 |
+
"grad_norm": 2.1724514961242676,
|
| 430 |
+
"learning_rate": 0.00022792206981441223,
|
| 431 |
+
"loss": 1.0753899383544923,
|
| 432 |
+
"mean_token_accuracy": 0.915192686021328,
|
| 433 |
+
"num_tokens": 4664196.0,
|
| 434 |
+
"step": 1900
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 5.0,
|
| 438 |
+
"eval_entropy": 0.3632780872285366,
|
| 439 |
+
"eval_loss": 0.6438126564025879,
|
| 440 |
+
"eval_mean_token_accuracy": 0.8511462942338907,
|
| 441 |
+
"eval_num_tokens": 4723910.0,
|
| 442 |
+
"eval_runtime": 90.1846,
|
| 443 |
+
"eval_samples_per_second": 18.373,
|
| 444 |
+
"eval_steps_per_second": 2.306,
|
| 445 |
+
"step": 1925
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"entropy": 0.23515464736139355,
|
| 449 |
+
"epoch": 5.064977257959714,
|
| 450 |
+
"grad_norm": 1.651587724685669,
|
| 451 |
+
"learning_rate": 0.00021942202135469513,
|
| 452 |
+
"loss": 0.8597064971923828,
|
| 453 |
+
"mean_token_accuracy": 0.9324622603517082,
|
| 454 |
+
"num_tokens": 4789568.0,
|
| 455 |
+
"step": 1950
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"entropy": 0.1958953895419836,
|
| 459 |
+
"epoch": 5.1949317738791425,
|
| 460 |
+
"grad_norm": 1.923292636871338,
|
| 461 |
+
"learning_rate": 0.0002108624238427481,
|
| 462 |
+
"loss": 0.7188112640380859,
|
| 463 |
+
"mean_token_accuracy": 0.9416415295004845,
|
| 464 |
+
"num_tokens": 4913407.0,
|
| 465 |
+
"step": 2000
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"entropy": 0.21068542070686816,
|
| 469 |
+
"epoch": 5.32488628979857,
|
| 470 |
+
"grad_norm": 2.299356460571289,
|
| 471 |
+
"learning_rate": 0.0002022608651078804,
|
| 472 |
+
"loss": 0.7712985229492187,
|
| 473 |
+
"mean_token_accuracy": 0.9386440163850784,
|
| 474 |
+
"num_tokens": 5032951.0,
|
| 475 |
+
"step": 2050
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"entropy": 0.21234643168747425,
|
| 479 |
+
"epoch": 5.454840805717999,
|
| 480 |
+
"grad_norm": 2.2119295597076416,
|
| 481 |
+
"learning_rate": 0.00019363501919920608,
|
| 482 |
+
"loss": 0.7650181579589844,
|
| 483 |
+
"mean_token_accuracy": 0.938471505343914,
|
| 484 |
+
"num_tokens": 5156908.0,
|
| 485 |
+
"step": 2100
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"entropy": 0.21658269092440605,
|
| 489 |
+
"epoch": 5.584795321637427,
|
| 490 |
+
"grad_norm": 1.5394288301467896,
|
| 491 |
+
"learning_rate": 0.00018500261006989887,
|
| 492 |
+
"loss": 0.7784209442138672,
|
| 493 |
+
"mean_token_accuracy": 0.9371598136425018,
|
| 494 |
+
"num_tokens": 5276087.0,
|
| 495 |
+
"step": 2150
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"entropy": 0.2045296123996377,
|
| 499 |
+
"epoch": 5.714749837556855,
|
| 500 |
+
"grad_norm": 1.913680076599121,
|
| 501 |
+
"learning_rate": 0.00017638137515890763,
|
| 502 |
+
"loss": 0.7638166046142578,
|
| 503 |
+
"mean_token_accuracy": 0.9378301629424095,
|
| 504 |
+
"num_tokens": 5398787.0,
|
| 505 |
+
"step": 2200
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"entropy": 0.20917976945638656,
|
| 509 |
+
"epoch": 5.844704353476283,
|
| 510 |
+
"grad_norm": 2.0847299098968506,
|
| 511 |
+
"learning_rate": 0.00016778902894496063,
|
| 512 |
+
"loss": 0.7631703186035156,
|
| 513 |
+
"mean_token_accuracy": 0.9387557968497277,
|
| 514 |
+
"num_tokens": 5522332.0,
|
| 515 |
+
"step": 2250
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"entropy": 0.22262076318264007,
|
| 519 |
+
"epoch": 5.974658869395712,
|
| 520 |
+
"grad_norm": 2.1597352027893066,
|
| 521 |
+
"learning_rate": 0.0001592432265477485,
|
| 522 |
+
"loss": 0.798133773803711,
|
| 523 |
+
"mean_token_accuracy": 0.936034984588623,
|
| 524 |
+
"num_tokens": 5642361.0,
|
| 525 |
+
"step": 2300
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"epoch": 6.0,
|
| 529 |
+
"eval_entropy": 0.31502799331568754,
|
| 530 |
+
"eval_loss": 0.7417300343513489,
|
| 531 |
+
"eval_mean_token_accuracy": 0.8477253922476218,
|
| 532 |
+
"eval_num_tokens": 5668692.0,
|
| 533 |
+
"eval_runtime": 90.4252,
|
| 534 |
+
"eval_samples_per_second": 18.325,
|
| 535 |
+
"eval_steps_per_second": 2.3,
|
| 536 |
+
"step": 2310
|
| 537 |
+
}
|
| 538 |
+
],
|
| 539 |
+
"logging_steps": 50,
|
| 540 |
+
"max_steps": 3850,
|
| 541 |
+
"num_input_tokens_seen": 0,
|
| 542 |
+
"num_train_epochs": 10,
|
| 543 |
+
"save_steps": 500,
|
| 544 |
+
"stateful_callbacks": {
|
| 545 |
+
"TrainerControl": {
|
| 546 |
+
"args": {
|
| 547 |
+
"should_epoch_stop": false,
|
| 548 |
+
"should_evaluate": false,
|
| 549 |
+
"should_log": false,
|
| 550 |
+
"should_save": true,
|
| 551 |
+
"should_training_stop": false
|
| 552 |
+
},
|
| 553 |
+
"attributes": {}
|
| 554 |
+
}
|
| 555 |
+
},
|
| 556 |
+
"total_flos": 1.9871331143277489e+18,
|
| 557 |
+
"train_batch_size": 4,
|
| 558 |
+
"trial_name": null,
|
| 559 |
+
"trial_params": null
|
| 560 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.00985279561940916,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 16,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json
ADDED
|
@@ -0,0 +1,641 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 7.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2695,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.353258643448353,
|
| 14 |
+
"epoch": 0.1299545159194282,
|
| 15 |
+
"grad_norm": 3.010725975036621,
|
| 16 |
+
"learning_rate": 4.8475852375026876e-05,
|
| 17 |
+
"loss": 5.475971069335937,
|
| 18 |
+
"mean_token_accuracy": 0.7263440760970116,
|
| 19 |
+
"num_tokens": 128842.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.649170914888382,
|
| 24 |
+
"epoch": 0.2599090318388564,
|
| 25 |
+
"grad_norm": 1.9099390506744385,
|
| 26 |
+
"learning_rate": 9.794100785974817e-05,
|
| 27 |
+
"loss": 2.55168701171875,
|
| 28 |
+
"mean_token_accuracy": 0.8364580717682838,
|
| 29 |
+
"num_tokens": 255497.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5930788792669773,
|
| 34 |
+
"epoch": 0.3898635477582846,
|
| 35 |
+
"grad_norm": 2.1239051818847656,
|
| 36 |
+
"learning_rate": 0.0001474061633444695,
|
| 37 |
+
"loss": 2.3440716552734373,
|
| 38 |
+
"mean_token_accuracy": 0.8452290838956833,
|
| 39 |
+
"num_tokens": 372014.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.5564522063732147,
|
| 44 |
+
"epoch": 0.5198180636777128,
|
| 45 |
+
"grad_norm": 411.71807861328125,
|
| 46 |
+
"learning_rate": 0.00019687131882919077,
|
| 47 |
+
"loss": 2.2838446044921876,
|
| 48 |
+
"mean_token_accuracy": 0.8498487600684166,
|
| 49 |
+
"num_tokens": 500623.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.5539529167115689,
|
| 54 |
+
"epoch": 0.649772579597141,
|
| 55 |
+
"grad_norm": 2.1969902515411377,
|
| 56 |
+
"learning_rate": 0.0002463364743139121,
|
| 57 |
+
"loss": 2.675394287109375,
|
| 58 |
+
"mean_token_accuracy": 0.8430694487690925,
|
| 59 |
+
"num_tokens": 616223.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.5719467167556286,
|
| 64 |
+
"epoch": 0.7797270955165692,
|
| 65 |
+
"grad_norm": 1.98796546459198,
|
| 66 |
+
"learning_rate": 0.00029580162979863343,
|
| 67 |
+
"loss": 2.2434300231933593,
|
| 68 |
+
"mean_token_accuracy": 0.851241897046566,
|
| 69 |
+
"num_tokens": 737263.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.5502805083990097,
|
| 74 |
+
"epoch": 0.9096816114359974,
|
| 75 |
+
"grad_norm": 2.0211398601531982,
|
| 76 |
+
"learning_rate": 0.0003452667852833547,
|
| 77 |
+
"loss": 2.1729367065429686,
|
| 78 |
+
"mean_token_accuracy": 0.8554597494006156,
|
| 79 |
+
"num_tokens": 861477.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5580813550891784,
|
| 85 |
+
"eval_loss": 0.5830356478691101,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8432669037809739,
|
| 87 |
+
"eval_num_tokens": 944782.0,
|
| 88 |
+
"eval_runtime": 90.3664,
|
| 89 |
+
"eval_samples_per_second": 18.336,
|
| 90 |
+
"eval_steps_per_second": 2.302,
|
| 91 |
+
"step": 385
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.5498402091725987,
|
| 95 |
+
"epoch": 1.0389863547758285,
|
| 96 |
+
"grad_norm": 3.8034188747406006,
|
| 97 |
+
"learning_rate": 0.000380866355527619,
|
| 98 |
+
"loss": 2.113946990966797,
|
| 99 |
+
"mean_token_accuracy": 0.8578129452676629,
|
| 100 |
+
"num_tokens": 982803.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.5182110907137394,
|
| 105 |
+
"epoch": 1.1689408706952567,
|
| 106 |
+
"grad_norm": 2.7830824851989746,
|
| 107 |
+
"learning_rate": 0.0003805611725593471,
|
| 108 |
+
"loss": 1.9833453369140626,
|
| 109 |
+
"mean_token_accuracy": 0.8656822636723518,
|
| 110 |
+
"num_tokens": 1105926.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.5260789206624031,
|
| 115 |
+
"epoch": 1.2988953866146848,
|
| 116 |
+
"grad_norm": 1.7993361949920654,
|
| 117 |
+
"learning_rate": 0.0003798653399371568,
|
| 118 |
+
"loss": 2.006897430419922,
|
| 119 |
+
"mean_token_accuracy": 0.8631055191159248,
|
| 120 |
+
"num_tokens": 1229857.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.5327546864748001,
|
| 125 |
+
"epoch": 1.428849902534113,
|
| 126 |
+
"grad_norm": 1.7606678009033203,
|
| 127 |
+
"learning_rate": 0.0003787802874228295,
|
| 128 |
+
"loss": 2.020283050537109,
|
| 129 |
+
"mean_token_accuracy": 0.8638329988718033,
|
| 130 |
+
"num_tokens": 1352330.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.5285360223054886,
|
| 135 |
+
"epoch": 1.5588044184535412,
|
| 136 |
+
"grad_norm": 4.76006555557251,
|
| 137 |
+
"learning_rate": 0.00037730824452755275,
|
| 138 |
+
"loss": 1.9987391662597656,
|
| 139 |
+
"mean_token_accuracy": 0.8644696187973022,
|
| 140 |
+
"num_tokens": 1474790.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.5134804363548756,
|
| 145 |
+
"epoch": 1.6887589343729694,
|
| 146 |
+
"grad_norm": 1.8447264432907104,
|
| 147 |
+
"learning_rate": 0.000375452235930833,
|
| 148 |
+
"loss": 1.9669386291503905,
|
| 149 |
+
"mean_token_accuracy": 0.8659948265552521,
|
| 150 |
+
"num_tokens": 1600381.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.5371069309115409,
|
| 155 |
+
"epoch": 1.8187134502923976,
|
| 156 |
+
"grad_norm": 1.6537392139434814,
|
| 157 |
+
"learning_rate": 0.00037321607526553675,
|
| 158 |
+
"loss": 2.0411550903320315,
|
| 159 |
+
"mean_token_accuracy": 0.8624854254722595,
|
| 160 |
+
"num_tokens": 1716827.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"entropy": 0.5270501750707627,
|
| 165 |
+
"epoch": 1.9486679662118258,
|
| 166 |
+
"grad_norm": 2.6990911960601807,
|
| 167 |
+
"learning_rate": 0.00037060435728183,
|
| 168 |
+
"loss": 2.015792236328125,
|
| 169 |
+
"mean_token_accuracy": 0.8631013777852058,
|
| 170 |
+
"num_tokens": 1842798.0,
|
| 171 |
+
"step": 750
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.0,
|
| 175 |
+
"eval_entropy": 0.5477195472384875,
|
| 176 |
+
"eval_loss": 0.5585702657699585,
|
| 177 |
+
"eval_mean_token_accuracy": 0.8486175815073344,
|
| 178 |
+
"eval_num_tokens": 1889564.0,
|
| 179 |
+
"eval_runtime": 90.2194,
|
| 180 |
+
"eval_samples_per_second": 18.366,
|
| 181 |
+
"eval_steps_per_second": 2.305,
|
| 182 |
+
"step": 770
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.4782189565088282,
|
| 186 |
+
"epoch": 2.077972709551657,
|
| 187 |
+
"grad_norm": 2.041952610015869,
|
| 188 |
+
"learning_rate": 0.0003676224484061175,
|
| 189 |
+
"loss": 1.7843829345703126,
|
| 190 |
+
"mean_token_accuracy": 0.8739750406250881,
|
| 191 |
+
"num_tokens": 1959778.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.4443667846918106,
|
| 196 |
+
"epoch": 2.207927225471085,
|
| 197 |
+
"grad_norm": 16.27313804626465,
|
| 198 |
+
"learning_rate": 0.00036427647571437996,
|
| 199 |
+
"loss": 1.6559255981445313,
|
| 200 |
+
"mean_token_accuracy": 0.8808386281132699,
|
| 201 |
+
"num_tokens": 2087384.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.44861202985048293,
|
| 206 |
+
"epoch": 2.3378817413905133,
|
| 207 |
+
"grad_norm": 1.648870587348938,
|
| 208 |
+
"learning_rate": 0.0003605733143425679,
|
| 209 |
+
"loss": 1.677943878173828,
|
| 210 |
+
"mean_token_accuracy": 0.879555520415306,
|
| 211 |
+
"num_tokens": 2211962.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.4568726105988026,
|
| 216 |
+
"epoch": 2.4678362573099415,
|
| 217 |
+
"grad_norm": 1.7573126554489136,
|
| 218 |
+
"learning_rate": 0.00035652057335991866,
|
| 219 |
+
"loss": 1.6760734558105468,
|
| 220 |
+
"mean_token_accuracy": 0.8791913360357284,
|
| 221 |
+
"num_tokens": 2334838.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.44863338857889173,
|
| 226 |
+
"epoch": 2.5977907732293697,
|
| 227 |
+
"grad_norm": 1.8639047145843506,
|
| 228 |
+
"learning_rate": 0.00035212658013422465,
|
| 229 |
+
"loss": 1.6799411010742187,
|
| 230 |
+
"mean_token_accuracy": 0.8790675121545791,
|
| 231 |
+
"num_tokens": 2461732.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.4585830120742321,
|
| 236 |
+
"epoch": 2.727745289148798,
|
| 237 |
+
"grad_norm": 1.9825985431671143,
|
| 238 |
+
"learning_rate": 0.0003474003632211781,
|
| 239 |
+
"loss": 1.7172026062011718,
|
| 240 |
+
"mean_token_accuracy": 0.8782495930790901,
|
| 241 |
+
"num_tokens": 2580026.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.45422692246735097,
|
| 246 |
+
"epoch": 2.857699805068226,
|
| 247 |
+
"grad_norm": 1.7149962186813354,
|
| 248 |
+
"learning_rate": 0.00034235163381294995,
|
| 249 |
+
"loss": 1.679084014892578,
|
| 250 |
+
"mean_token_accuracy": 0.8795321774482727,
|
| 251 |
+
"num_tokens": 2705600.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"entropy": 0.47297614574432373,
|
| 256 |
+
"epoch": 2.9876543209876543,
|
| 257 |
+
"grad_norm": 1.7435617446899414,
|
| 258 |
+
"learning_rate": 0.0003369907657841221,
|
| 259 |
+
"loss": 1.7386201477050782,
|
| 260 |
+
"mean_token_accuracy": 0.8779115182161331,
|
| 261 |
+
"num_tokens": 2822808.0,
|
| 262 |
+
"step": 1150
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 3.0,
|
| 266 |
+
"eval_entropy": 0.5031588454372607,
|
| 267 |
+
"eval_loss": 0.5551120638847351,
|
| 268 |
+
"eval_mean_token_accuracy": 0.8531603300227568,
|
| 269 |
+
"eval_num_tokens": 2834346.0,
|
| 270 |
+
"eval_runtime": 90.2397,
|
| 271 |
+
"eval_samples_per_second": 18.362,
|
| 272 |
+
"eval_steps_per_second": 2.305,
|
| 273 |
+
"step": 1155
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.37655152073457615,
|
| 277 |
+
"epoch": 3.116959064327485,
|
| 278 |
+
"grad_norm": 1.504384160041809,
|
| 279 |
+
"learning_rate": 0.0003313287743759729,
|
| 280 |
+
"loss": 1.3653451538085937,
|
| 281 |
+
"mean_token_accuracy": 0.8971295344769655,
|
| 282 |
+
"num_tokens": 2939773.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.37069276951253416,
|
| 287 |
+
"epoch": 3.246913580246914,
|
| 288 |
+
"grad_norm": 1.9665946960449219,
|
| 289 |
+
"learning_rate": 0.0003253772935629151,
|
| 290 |
+
"loss": 1.3458108520507812,
|
| 291 |
+
"mean_token_accuracy": 0.8982205548882485,
|
| 292 |
+
"num_tokens": 3063617.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.37295883789658546,
|
| 297 |
+
"epoch": 3.3768680961663415,
|
| 298 |
+
"grad_norm": 1.7501362562179565,
|
| 299 |
+
"learning_rate": 0.00031914855214759165,
|
| 300 |
+
"loss": 1.357562255859375,
|
| 301 |
+
"mean_token_accuracy": 0.8977113124728203,
|
| 302 |
+
"num_tokens": 3189800.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.3805788069963455,
|
| 307 |
+
"epoch": 3.50682261208577,
|
| 308 |
+
"grad_norm": 1.7277154922485352,
|
| 309 |
+
"learning_rate": 0.00031265534863374894,
|
| 310 |
+
"loss": 1.3735618591308594,
|
| 311 |
+
"mean_token_accuracy": 0.8962143072485924,
|
| 312 |
+
"num_tokens": 3311908.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.3840580120682716,
|
| 317 |
+
"epoch": 3.636777128005198,
|
| 318 |
+
"grad_norm": 2.2338802814483643,
|
| 319 |
+
"learning_rate": 0.0003059110249285165,
|
| 320 |
+
"loss": 1.3903216552734374,
|
| 321 |
+
"mean_token_accuracy": 0.8958476388454437,
|
| 322 |
+
"num_tokens": 3432934.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.37621145449578763,
|
| 327 |
+
"epoch": 3.7667316439246266,
|
| 328 |
+
"grad_norm": 1.9029661417007446,
|
| 329 |
+
"learning_rate": 0.00029892943892812944,
|
| 330 |
+
"loss": 1.3776657104492187,
|
| 331 |
+
"mean_token_accuracy": 0.8964926180243492,
|
| 332 |
+
"num_tokens": 3561408.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"entropy": 0.3784803995490074,
|
| 337 |
+
"epoch": 3.8966861598440543,
|
| 338 |
+
"grad_norm": 2.089708089828491,
|
| 339 |
+
"learning_rate": 0.00029172493604342163,
|
| 340 |
+
"loss": 1.3816807556152344,
|
| 341 |
+
"mean_token_accuracy": 0.8962833172082901,
|
| 342 |
+
"num_tokens": 3684624.0,
|
| 343 |
+
"step": 1500
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"epoch": 4.0,
|
| 347 |
+
"eval_entropy": 0.4351254403591156,
|
| 348 |
+
"eval_loss": 0.5814722180366516,
|
| 349 |
+
"eval_mean_token_accuracy": 0.8530604747625498,
|
| 350 |
+
"eval_num_tokens": 3779128.0,
|
| 351 |
+
"eval_runtime": 90.2232,
|
| 352 |
+
"eval_samples_per_second": 18.366,
|
| 353 |
+
"eval_steps_per_second": 2.305,
|
| 354 |
+
"step": 1540
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"entropy": 0.36326556409423677,
|
| 358 |
+
"epoch": 4.025990903183885,
|
| 359 |
+
"grad_norm": 2.1354947090148926,
|
| 360 |
+
"learning_rate": 0.0002843123197235993,
|
| 361 |
+
"loss": 1.3295362854003907,
|
| 362 |
+
"mean_token_accuracy": 0.8993093811686913,
|
| 363 |
+
"num_tokens": 3804993.0,
|
| 364 |
+
"step": 1550
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"entropy": 0.2879397062957287,
|
| 368 |
+
"epoch": 4.155945419103314,
|
| 369 |
+
"grad_norm": 2.201097011566162,
|
| 370 |
+
"learning_rate": 0.0002767068210388601,
|
| 371 |
+
"loss": 1.0272974395751953,
|
| 372 |
+
"mean_token_accuracy": 0.9182627710700035,
|
| 373 |
+
"num_tokens": 3928162.0,
|
| 374 |
+
"step": 1600
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"entropy": 0.2848948486149311,
|
| 378 |
+
"epoch": 4.2858999350227425,
|
| 379 |
+
"grad_norm": 2.01479172706604,
|
| 380 |
+
"learning_rate": 0.000268924067384358,
|
| 381 |
+
"loss": 1.0278727722167968,
|
| 382 |
+
"mean_token_accuracy": 0.9194766515493393,
|
| 383 |
+
"num_tokens": 4049012.0,
|
| 384 |
+
"step": 1650
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"entropy": 0.2940504560619593,
|
| 388 |
+
"epoch": 4.41585445094217,
|
| 389 |
+
"grad_norm": 2.0893027782440186,
|
| 390 |
+
"learning_rate": 0.00026098005036982003,
|
| 391 |
+
"loss": 1.0586751556396485,
|
| 392 |
+
"mean_token_accuracy": 0.9167885810136795,
|
| 393 |
+
"num_tokens": 4167845.0,
|
| 394 |
+
"step": 1700
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"entropy": 0.293505182415247,
|
| 398 |
+
"epoch": 4.545808966861598,
|
| 399 |
+
"grad_norm": 1.6346389055252075,
|
| 400 |
+
"learning_rate": 0.0002528910929607928,
|
| 401 |
+
"loss": 1.0669570922851563,
|
| 402 |
+
"mean_token_accuracy": 0.9160876458883286,
|
| 403 |
+
"num_tokens": 4287505.0,
|
| 404 |
+
"step": 1750
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"entropy": 0.2898535231500864,
|
| 408 |
+
"epoch": 4.675763482781027,
|
| 409 |
+
"grad_norm": 1.6645033359527588,
|
| 410 |
+
"learning_rate": 0.0002446738159390364,
|
| 411 |
+
"loss": 1.0582612609863282,
|
| 412 |
+
"mean_token_accuracy": 0.9177632886171341,
|
| 413 |
+
"num_tokens": 4412221.0,
|
| 414 |
+
"step": 1800
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"entropy": 0.2842763290554285,
|
| 418 |
+
"epoch": 4.805717998700455,
|
| 419 |
+
"grad_norm": 2.4594268798828125,
|
| 420 |
+
"learning_rate": 0.0002363451037509798,
|
| 421 |
+
"loss": 1.0467537689208983,
|
| 422 |
+
"mean_token_accuracy": 0.9177608361840248,
|
| 423 |
+
"num_tokens": 4537178.0,
|
| 424 |
+
"step": 1850
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"entropy": 0.284430123642087,
|
| 428 |
+
"epoch": 4.935672514619883,
|
| 429 |
+
"grad_norm": 2.1724514961242676,
|
| 430 |
+
"learning_rate": 0.00022792206981441223,
|
| 431 |
+
"loss": 1.0753899383544923,
|
| 432 |
+
"mean_token_accuracy": 0.915192686021328,
|
| 433 |
+
"num_tokens": 4664196.0,
|
| 434 |
+
"step": 1900
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 5.0,
|
| 438 |
+
"eval_entropy": 0.3632780872285366,
|
| 439 |
+
"eval_loss": 0.6438126564025879,
|
| 440 |
+
"eval_mean_token_accuracy": 0.8511462942338907,
|
| 441 |
+
"eval_num_tokens": 4723910.0,
|
| 442 |
+
"eval_runtime": 90.1846,
|
| 443 |
+
"eval_samples_per_second": 18.373,
|
| 444 |
+
"eval_steps_per_second": 2.306,
|
| 445 |
+
"step": 1925
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"entropy": 0.23515464736139355,
|
| 449 |
+
"epoch": 5.064977257959714,
|
| 450 |
+
"grad_norm": 1.651587724685669,
|
| 451 |
+
"learning_rate": 0.00021942202135469513,
|
| 452 |
+
"loss": 0.8597064971923828,
|
| 453 |
+
"mean_token_accuracy": 0.9324622603517082,
|
| 454 |
+
"num_tokens": 4789568.0,
|
| 455 |
+
"step": 1950
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"entropy": 0.1958953895419836,
|
| 459 |
+
"epoch": 5.1949317738791425,
|
| 460 |
+
"grad_norm": 1.923292636871338,
|
| 461 |
+
"learning_rate": 0.0002108624238427481,
|
| 462 |
+
"loss": 0.7188112640380859,
|
| 463 |
+
"mean_token_accuracy": 0.9416415295004845,
|
| 464 |
+
"num_tokens": 4913407.0,
|
| 465 |
+
"step": 2000
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"entropy": 0.21068542070686816,
|
| 469 |
+
"epoch": 5.32488628979857,
|
| 470 |
+
"grad_norm": 2.299356460571289,
|
| 471 |
+
"learning_rate": 0.0002022608651078804,
|
| 472 |
+
"loss": 0.7712985229492187,
|
| 473 |
+
"mean_token_accuracy": 0.9386440163850784,
|
| 474 |
+
"num_tokens": 5032951.0,
|
| 475 |
+
"step": 2050
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"entropy": 0.21234643168747425,
|
| 479 |
+
"epoch": 5.454840805717999,
|
| 480 |
+
"grad_norm": 2.2119295597076416,
|
| 481 |
+
"learning_rate": 0.00019363501919920608,
|
| 482 |
+
"loss": 0.7650181579589844,
|
| 483 |
+
"mean_token_accuracy": 0.938471505343914,
|
| 484 |
+
"num_tokens": 5156908.0,
|
| 485 |
+
"step": 2100
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"entropy": 0.21658269092440605,
|
| 489 |
+
"epoch": 5.584795321637427,
|
| 490 |
+
"grad_norm": 1.5394288301467896,
|
| 491 |
+
"learning_rate": 0.00018500261006989887,
|
| 492 |
+
"loss": 0.7784209442138672,
|
| 493 |
+
"mean_token_accuracy": 0.9371598136425018,
|
| 494 |
+
"num_tokens": 5276087.0,
|
| 495 |
+
"step": 2150
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"entropy": 0.2045296123996377,
|
| 499 |
+
"epoch": 5.714749837556855,
|
| 500 |
+
"grad_norm": 1.913680076599121,
|
| 501 |
+
"learning_rate": 0.00017638137515890763,
|
| 502 |
+
"loss": 0.7638166046142578,
|
| 503 |
+
"mean_token_accuracy": 0.9378301629424095,
|
| 504 |
+
"num_tokens": 5398787.0,
|
| 505 |
+
"step": 2200
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"entropy": 0.20917976945638656,
|
| 509 |
+
"epoch": 5.844704353476283,
|
| 510 |
+
"grad_norm": 2.0847299098968506,
|
| 511 |
+
"learning_rate": 0.00016778902894496063,
|
| 512 |
+
"loss": 0.7631703186035156,
|
| 513 |
+
"mean_token_accuracy": 0.9387557968497277,
|
| 514 |
+
"num_tokens": 5522332.0,
|
| 515 |
+
"step": 2250
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"entropy": 0.22262076318264007,
|
| 519 |
+
"epoch": 5.974658869395712,
|
| 520 |
+
"grad_norm": 2.1597352027893066,
|
| 521 |
+
"learning_rate": 0.0001592432265477485,
|
| 522 |
+
"loss": 0.798133773803711,
|
| 523 |
+
"mean_token_accuracy": 0.936034984588623,
|
| 524 |
+
"num_tokens": 5642361.0,
|
| 525 |
+
"step": 2300
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"epoch": 6.0,
|
| 529 |
+
"eval_entropy": 0.31502799331568754,
|
| 530 |
+
"eval_loss": 0.7417300343513489,
|
| 531 |
+
"eval_mean_token_accuracy": 0.8477253922476218,
|
| 532 |
+
"eval_num_tokens": 5668692.0,
|
| 533 |
+
"eval_runtime": 90.4252,
|
| 534 |
+
"eval_samples_per_second": 18.325,
|
| 535 |
+
"eval_steps_per_second": 2.3,
|
| 536 |
+
"step": 2310
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"entropy": 0.16796037876725795,
|
| 540 |
+
"epoch": 6.1039636127355426,
|
| 541 |
+
"grad_norm": 2.2228569984436035,
|
| 542 |
+
"learning_rate": 0.00015076152745107442,
|
| 543 |
+
"loss": 0.5835284805297851,
|
| 544 |
+
"mean_token_accuracy": 0.9529892874123463,
|
| 545 |
+
"num_tokens": 5766129.0,
|
| 546 |
+
"step": 2350
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"entropy": 0.14919219192117453,
|
| 550 |
+
"epoch": 6.23391812865497,
|
| 551 |
+
"grad_norm": 1.408840298652649,
|
| 552 |
+
"learning_rate": 0.00014236135942251215,
|
| 553 |
+
"loss": 0.5310631561279296,
|
| 554 |
+
"mean_token_accuracy": 0.9586454060673714,
|
| 555 |
+
"num_tokens": 5888746.0,
|
| 556 |
+
"step": 2400
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"entropy": 0.1499051059409976,
|
| 560 |
+
"epoch": 6.363872644574399,
|
| 561 |
+
"grad_norm": 1.8611102104187012,
|
| 562 |
+
"learning_rate": 0.00013405998270370849,
|
| 563 |
+
"loss": 0.5127810668945313,
|
| 564 |
+
"mean_token_accuracy": 0.9591325157880783,
|
| 565 |
+
"num_tokens": 6014455.0,
|
| 566 |
+
"step": 2450
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"entropy": 0.15334193099290133,
|
| 570 |
+
"epoch": 6.493827160493828,
|
| 571 |
+
"grad_norm": 1.6051015853881836,
|
| 572 |
+
"learning_rate": 0.00012587445454490892,
|
| 573 |
+
"loss": 0.5349758529663086,
|
| 574 |
+
"mean_token_accuracy": 0.9574431091547012,
|
| 575 |
+
"num_tokens": 6141229.0,
|
| 576 |
+
"step": 2500
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"entropy": 0.15982334002852439,
|
| 580 |
+
"epoch": 6.623781676413255,
|
| 581 |
+
"grad_norm": 3.7065205574035645,
|
| 582 |
+
"learning_rate": 0.00011782159415658008,
|
| 583 |
+
"loss": 0.5602469253540039,
|
| 584 |
+
"mean_token_accuracy": 0.9555372184515,
|
| 585 |
+
"num_tokens": 6257983.0,
|
| 586 |
+
"step": 2550
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"entropy": 0.16072992872446776,
|
| 590 |
+
"epoch": 6.753736192332683,
|
| 591 |
+
"grad_norm": 2.282320976257324,
|
| 592 |
+
"learning_rate": 0.00010991794815014401,
|
| 593 |
+
"loss": 0.5657939910888672,
|
| 594 |
+
"mean_token_accuracy": 0.9550630164146423,
|
| 595 |
+
"num_tokens": 6376198.0,
|
| 596 |
+
"step": 2600
|
| 597 |
+
},
|
| 598 |
+
{
|
| 599 |
+
"entropy": 0.1512781011685729,
|
| 600 |
+
"epoch": 6.883690708252112,
|
| 601 |
+
"grad_norm": 1.3716893196105957,
|
| 602 |
+
"learning_rate": 0.00010217975653883603,
|
| 603 |
+
"loss": 0.5340792465209961,
|
| 604 |
+
"mean_token_accuracy": 0.9578188157081604,
|
| 605 |
+
"num_tokens": 6502526.0,
|
| 606 |
+
"step": 2650
|
| 607 |
+
},
|
| 608 |
+
{
|
| 609 |
+
"epoch": 7.0,
|
| 610 |
+
"eval_entropy": 0.2444461930829745,
|
| 611 |
+
"eval_loss": 0.8798949718475342,
|
| 612 |
+
"eval_mean_token_accuracy": 0.8457763839799625,
|
| 613 |
+
"eval_num_tokens": 6613474.0,
|
| 614 |
+
"eval_runtime": 90.2868,
|
| 615 |
+
"eval_samples_per_second": 18.353,
|
| 616 |
+
"eval_steps_per_second": 2.304,
|
| 617 |
+
"step": 2695
|
| 618 |
+
}
|
| 619 |
+
],
|
| 620 |
+
"logging_steps": 50,
|
| 621 |
+
"max_steps": 3850,
|
| 622 |
+
"num_input_tokens_seen": 0,
|
| 623 |
+
"num_train_epochs": 10,
|
| 624 |
+
"save_steps": 500,
|
| 625 |
+
"stateful_callbacks": {
|
| 626 |
+
"TrainerControl": {
|
| 627 |
+
"args": {
|
| 628 |
+
"should_epoch_stop": false,
|
| 629 |
+
"should_evaluate": false,
|
| 630 |
+
"should_log": false,
|
| 631 |
+
"should_save": true,
|
| 632 |
+
"should_training_stop": false
|
| 633 |
+
},
|
| 634 |
+
"attributes": {}
|
| 635 |
+
}
|
| 636 |
+
},
|
| 637 |
+
"total_flos": 2.31810912445653e+18,
|
| 638 |
+
"train_batch_size": 4,
|
| 639 |
+
"trial_name": null,
|
| 640 |
+
"trial_params": null
|
| 641 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 16,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.00985279561940916,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 16,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json
ADDED
|
@@ -0,0 +1,732 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 8.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 3080,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.353258643448353,
|
| 14 |
+
"epoch": 0.1299545159194282,
|
| 15 |
+
"grad_norm": 3.010725975036621,
|
| 16 |
+
"learning_rate": 4.8475852375026876e-05,
|
| 17 |
+
"loss": 5.475971069335937,
|
| 18 |
+
"mean_token_accuracy": 0.7263440760970116,
|
| 19 |
+
"num_tokens": 128842.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.649170914888382,
|
| 24 |
+
"epoch": 0.2599090318388564,
|
| 25 |
+
"grad_norm": 1.9099390506744385,
|
| 26 |
+
"learning_rate": 9.794100785974817e-05,
|
| 27 |
+
"loss": 2.55168701171875,
|
| 28 |
+
"mean_token_accuracy": 0.8364580717682838,
|
| 29 |
+
"num_tokens": 255497.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5930788792669773,
|
| 34 |
+
"epoch": 0.3898635477582846,
|
| 35 |
+
"grad_norm": 2.1239051818847656,
|
| 36 |
+
"learning_rate": 0.0001474061633444695,
|
| 37 |
+
"loss": 2.3440716552734373,
|
| 38 |
+
"mean_token_accuracy": 0.8452290838956833,
|
| 39 |
+
"num_tokens": 372014.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.5564522063732147,
|
| 44 |
+
"epoch": 0.5198180636777128,
|
| 45 |
+
"grad_norm": 411.71807861328125,
|
| 46 |
+
"learning_rate": 0.00019687131882919077,
|
| 47 |
+
"loss": 2.2838446044921876,
|
| 48 |
+
"mean_token_accuracy": 0.8498487600684166,
|
| 49 |
+
"num_tokens": 500623.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.5539529167115689,
|
| 54 |
+
"epoch": 0.649772579597141,
|
| 55 |
+
"grad_norm": 2.1969902515411377,
|
| 56 |
+
"learning_rate": 0.0002463364743139121,
|
| 57 |
+
"loss": 2.675394287109375,
|
| 58 |
+
"mean_token_accuracy": 0.8430694487690925,
|
| 59 |
+
"num_tokens": 616223.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.5719467167556286,
|
| 64 |
+
"epoch": 0.7797270955165692,
|
| 65 |
+
"grad_norm": 1.98796546459198,
|
| 66 |
+
"learning_rate": 0.00029580162979863343,
|
| 67 |
+
"loss": 2.2434300231933593,
|
| 68 |
+
"mean_token_accuracy": 0.851241897046566,
|
| 69 |
+
"num_tokens": 737263.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.5502805083990097,
|
| 74 |
+
"epoch": 0.9096816114359974,
|
| 75 |
+
"grad_norm": 2.0211398601531982,
|
| 76 |
+
"learning_rate": 0.0003452667852833547,
|
| 77 |
+
"loss": 2.1729367065429686,
|
| 78 |
+
"mean_token_accuracy": 0.8554597494006156,
|
| 79 |
+
"num_tokens": 861477.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5580813550891784,
|
| 85 |
+
"eval_loss": 0.5830356478691101,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8432669037809739,
|
| 87 |
+
"eval_num_tokens": 944782.0,
|
| 88 |
+
"eval_runtime": 90.3664,
|
| 89 |
+
"eval_samples_per_second": 18.336,
|
| 90 |
+
"eval_steps_per_second": 2.302,
|
| 91 |
+
"step": 385
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.5498402091725987,
|
| 95 |
+
"epoch": 1.0389863547758285,
|
| 96 |
+
"grad_norm": 3.8034188747406006,
|
| 97 |
+
"learning_rate": 0.000380866355527619,
|
| 98 |
+
"loss": 2.113946990966797,
|
| 99 |
+
"mean_token_accuracy": 0.8578129452676629,
|
| 100 |
+
"num_tokens": 982803.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.5182110907137394,
|
| 105 |
+
"epoch": 1.1689408706952567,
|
| 106 |
+
"grad_norm": 2.7830824851989746,
|
| 107 |
+
"learning_rate": 0.0003805611725593471,
|
| 108 |
+
"loss": 1.9833453369140626,
|
| 109 |
+
"mean_token_accuracy": 0.8656822636723518,
|
| 110 |
+
"num_tokens": 1105926.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.5260789206624031,
|
| 115 |
+
"epoch": 1.2988953866146848,
|
| 116 |
+
"grad_norm": 1.7993361949920654,
|
| 117 |
+
"learning_rate": 0.0003798653399371568,
|
| 118 |
+
"loss": 2.006897430419922,
|
| 119 |
+
"mean_token_accuracy": 0.8631055191159248,
|
| 120 |
+
"num_tokens": 1229857.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.5327546864748001,
|
| 125 |
+
"epoch": 1.428849902534113,
|
| 126 |
+
"grad_norm": 1.7606678009033203,
|
| 127 |
+
"learning_rate": 0.0003787802874228295,
|
| 128 |
+
"loss": 2.020283050537109,
|
| 129 |
+
"mean_token_accuracy": 0.8638329988718033,
|
| 130 |
+
"num_tokens": 1352330.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.5285360223054886,
|
| 135 |
+
"epoch": 1.5588044184535412,
|
| 136 |
+
"grad_norm": 4.76006555557251,
|
| 137 |
+
"learning_rate": 0.00037730824452755275,
|
| 138 |
+
"loss": 1.9987391662597656,
|
| 139 |
+
"mean_token_accuracy": 0.8644696187973022,
|
| 140 |
+
"num_tokens": 1474790.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.5134804363548756,
|
| 145 |
+
"epoch": 1.6887589343729694,
|
| 146 |
+
"grad_norm": 1.8447264432907104,
|
| 147 |
+
"learning_rate": 0.000375452235930833,
|
| 148 |
+
"loss": 1.9669386291503905,
|
| 149 |
+
"mean_token_accuracy": 0.8659948265552521,
|
| 150 |
+
"num_tokens": 1600381.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.5371069309115409,
|
| 155 |
+
"epoch": 1.8187134502923976,
|
| 156 |
+
"grad_norm": 1.6537392139434814,
|
| 157 |
+
"learning_rate": 0.00037321607526553675,
|
| 158 |
+
"loss": 2.0411550903320315,
|
| 159 |
+
"mean_token_accuracy": 0.8624854254722595,
|
| 160 |
+
"num_tokens": 1716827.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"entropy": 0.5270501750707627,
|
| 165 |
+
"epoch": 1.9486679662118258,
|
| 166 |
+
"grad_norm": 2.6990911960601807,
|
| 167 |
+
"learning_rate": 0.00037060435728183,
|
| 168 |
+
"loss": 2.015792236328125,
|
| 169 |
+
"mean_token_accuracy": 0.8631013777852058,
|
| 170 |
+
"num_tokens": 1842798.0,
|
| 171 |
+
"step": 750
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 2.0,
|
| 175 |
+
"eval_entropy": 0.5477195472384875,
|
| 176 |
+
"eval_loss": 0.5585702657699585,
|
| 177 |
+
"eval_mean_token_accuracy": 0.8486175815073344,
|
| 178 |
+
"eval_num_tokens": 1889564.0,
|
| 179 |
+
"eval_runtime": 90.2194,
|
| 180 |
+
"eval_samples_per_second": 18.366,
|
| 181 |
+
"eval_steps_per_second": 2.305,
|
| 182 |
+
"step": 770
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.4782189565088282,
|
| 186 |
+
"epoch": 2.077972709551657,
|
| 187 |
+
"grad_norm": 2.041952610015869,
|
| 188 |
+
"learning_rate": 0.0003676224484061175,
|
| 189 |
+
"loss": 1.7843829345703126,
|
| 190 |
+
"mean_token_accuracy": 0.8739750406250881,
|
| 191 |
+
"num_tokens": 1959778.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.4443667846918106,
|
| 196 |
+
"epoch": 2.207927225471085,
|
| 197 |
+
"grad_norm": 16.27313804626465,
|
| 198 |
+
"learning_rate": 0.00036427647571437996,
|
| 199 |
+
"loss": 1.6559255981445313,
|
| 200 |
+
"mean_token_accuracy": 0.8808386281132699,
|
| 201 |
+
"num_tokens": 2087384.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.44861202985048293,
|
| 206 |
+
"epoch": 2.3378817413905133,
|
| 207 |
+
"grad_norm": 1.648870587348938,
|
| 208 |
+
"learning_rate": 0.0003605733143425679,
|
| 209 |
+
"loss": 1.677943878173828,
|
| 210 |
+
"mean_token_accuracy": 0.879555520415306,
|
| 211 |
+
"num_tokens": 2211962.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.4568726105988026,
|
| 216 |
+
"epoch": 2.4678362573099415,
|
| 217 |
+
"grad_norm": 1.7573126554489136,
|
| 218 |
+
"learning_rate": 0.00035652057335991866,
|
| 219 |
+
"loss": 1.6760734558105468,
|
| 220 |
+
"mean_token_accuracy": 0.8791913360357284,
|
| 221 |
+
"num_tokens": 2334838.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.44863338857889173,
|
| 226 |
+
"epoch": 2.5977907732293697,
|
| 227 |
+
"grad_norm": 1.8639047145843506,
|
| 228 |
+
"learning_rate": 0.00035212658013422465,
|
| 229 |
+
"loss": 1.6799411010742187,
|
| 230 |
+
"mean_token_accuracy": 0.8790675121545791,
|
| 231 |
+
"num_tokens": 2461732.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.4585830120742321,
|
| 236 |
+
"epoch": 2.727745289148798,
|
| 237 |
+
"grad_norm": 1.9825985431671143,
|
| 238 |
+
"learning_rate": 0.0003474003632211781,
|
| 239 |
+
"loss": 1.7172026062011718,
|
| 240 |
+
"mean_token_accuracy": 0.8782495930790901,
|
| 241 |
+
"num_tokens": 2580026.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.45422692246735097,
|
| 246 |
+
"epoch": 2.857699805068226,
|
| 247 |
+
"grad_norm": 1.7149962186813354,
|
| 248 |
+
"learning_rate": 0.00034235163381294995,
|
| 249 |
+
"loss": 1.679084014892578,
|
| 250 |
+
"mean_token_accuracy": 0.8795321774482727,
|
| 251 |
+
"num_tokens": 2705600.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"entropy": 0.47297614574432373,
|
| 256 |
+
"epoch": 2.9876543209876543,
|
| 257 |
+
"grad_norm": 1.7435617446899414,
|
| 258 |
+
"learning_rate": 0.0003369907657841221,
|
| 259 |
+
"loss": 1.7386201477050782,
|
| 260 |
+
"mean_token_accuracy": 0.8779115182161331,
|
| 261 |
+
"num_tokens": 2822808.0,
|
| 262 |
+
"step": 1150
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 3.0,
|
| 266 |
+
"eval_entropy": 0.5031588454372607,
|
| 267 |
+
"eval_loss": 0.5551120638847351,
|
| 268 |
+
"eval_mean_token_accuracy": 0.8531603300227568,
|
| 269 |
+
"eval_num_tokens": 2834346.0,
|
| 270 |
+
"eval_runtime": 90.2397,
|
| 271 |
+
"eval_samples_per_second": 18.362,
|
| 272 |
+
"eval_steps_per_second": 2.305,
|
| 273 |
+
"step": 1155
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.37655152073457615,
|
| 277 |
+
"epoch": 3.116959064327485,
|
| 278 |
+
"grad_norm": 1.504384160041809,
|
| 279 |
+
"learning_rate": 0.0003313287743759729,
|
| 280 |
+
"loss": 1.3653451538085937,
|
| 281 |
+
"mean_token_accuracy": 0.8971295344769655,
|
| 282 |
+
"num_tokens": 2939773.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.37069276951253416,
|
| 287 |
+
"epoch": 3.246913580246914,
|
| 288 |
+
"grad_norm": 1.9665946960449219,
|
| 289 |
+
"learning_rate": 0.0003253772935629151,
|
| 290 |
+
"loss": 1.3458108520507812,
|
| 291 |
+
"mean_token_accuracy": 0.8982205548882485,
|
| 292 |
+
"num_tokens": 3063617.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.37295883789658546,
|
| 297 |
+
"epoch": 3.3768680961663415,
|
| 298 |
+
"grad_norm": 1.7501362562179565,
|
| 299 |
+
"learning_rate": 0.00031914855214759165,
|
| 300 |
+
"loss": 1.357562255859375,
|
| 301 |
+
"mean_token_accuracy": 0.8977113124728203,
|
| 302 |
+
"num_tokens": 3189800.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.3805788069963455,
|
| 307 |
+
"epoch": 3.50682261208577,
|
| 308 |
+
"grad_norm": 1.7277154922485352,
|
| 309 |
+
"learning_rate": 0.00031265534863374894,
|
| 310 |
+
"loss": 1.3735618591308594,
|
| 311 |
+
"mean_token_accuracy": 0.8962143072485924,
|
| 312 |
+
"num_tokens": 3311908.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.3840580120682716,
|
| 317 |
+
"epoch": 3.636777128005198,
|
| 318 |
+
"grad_norm": 2.2338802814483643,
|
| 319 |
+
"learning_rate": 0.0003059110249285165,
|
| 320 |
+
"loss": 1.3903216552734374,
|
| 321 |
+
"mean_token_accuracy": 0.8958476388454437,
|
| 322 |
+
"num_tokens": 3432934.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.37621145449578763,
|
| 327 |
+
"epoch": 3.7667316439246266,
|
| 328 |
+
"grad_norm": 1.9029661417007446,
|
| 329 |
+
"learning_rate": 0.00029892943892812944,
|
| 330 |
+
"loss": 1.3776657104492187,
|
| 331 |
+
"mean_token_accuracy": 0.8964926180243492,
|
| 332 |
+
"num_tokens": 3561408.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"entropy": 0.3784803995490074,
|
| 337 |
+
"epoch": 3.8966861598440543,
|
| 338 |
+
"grad_norm": 2.089708089828491,
|
| 339 |
+
"learning_rate": 0.00029172493604342163,
|
| 340 |
+
"loss": 1.3816807556152344,
|
| 341 |
+
"mean_token_accuracy": 0.8962833172082901,
|
| 342 |
+
"num_tokens": 3684624.0,
|
| 343 |
+
"step": 1500
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"epoch": 4.0,
|
| 347 |
+
"eval_entropy": 0.4351254403591156,
|
| 348 |
+
"eval_loss": 0.5814722180366516,
|
| 349 |
+
"eval_mean_token_accuracy": 0.8530604747625498,
|
| 350 |
+
"eval_num_tokens": 3779128.0,
|
| 351 |
+
"eval_runtime": 90.2232,
|
| 352 |
+
"eval_samples_per_second": 18.366,
|
| 353 |
+
"eval_steps_per_second": 2.305,
|
| 354 |
+
"step": 1540
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"entropy": 0.36326556409423677,
|
| 358 |
+
"epoch": 4.025990903183885,
|
| 359 |
+
"grad_norm": 2.1354947090148926,
|
| 360 |
+
"learning_rate": 0.0002843123197235993,
|
| 361 |
+
"loss": 1.3295362854003907,
|
| 362 |
+
"mean_token_accuracy": 0.8993093811686913,
|
| 363 |
+
"num_tokens": 3804993.0,
|
| 364 |
+
"step": 1550
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"entropy": 0.2879397062957287,
|
| 368 |
+
"epoch": 4.155945419103314,
|
| 369 |
+
"grad_norm": 2.201097011566162,
|
| 370 |
+
"learning_rate": 0.0002767068210388601,
|
| 371 |
+
"loss": 1.0272974395751953,
|
| 372 |
+
"mean_token_accuracy": 0.9182627710700035,
|
| 373 |
+
"num_tokens": 3928162.0,
|
| 374 |
+
"step": 1600
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"entropy": 0.2848948486149311,
|
| 378 |
+
"epoch": 4.2858999350227425,
|
| 379 |
+
"grad_norm": 2.01479172706604,
|
| 380 |
+
"learning_rate": 0.000268924067384358,
|
| 381 |
+
"loss": 1.0278727722167968,
|
| 382 |
+
"mean_token_accuracy": 0.9194766515493393,
|
| 383 |
+
"num_tokens": 4049012.0,
|
| 384 |
+
"step": 1650
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"entropy": 0.2940504560619593,
|
| 388 |
+
"epoch": 4.41585445094217,
|
| 389 |
+
"grad_norm": 2.0893027782440186,
|
| 390 |
+
"learning_rate": 0.00026098005036982003,
|
| 391 |
+
"loss": 1.0586751556396485,
|
| 392 |
+
"mean_token_accuracy": 0.9167885810136795,
|
| 393 |
+
"num_tokens": 4167845.0,
|
| 394 |
+
"step": 1700
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"entropy": 0.293505182415247,
|
| 398 |
+
"epoch": 4.545808966861598,
|
| 399 |
+
"grad_norm": 1.6346389055252075,
|
| 400 |
+
"learning_rate": 0.0002528910929607928,
|
| 401 |
+
"loss": 1.0669570922851563,
|
| 402 |
+
"mean_token_accuracy": 0.9160876458883286,
|
| 403 |
+
"num_tokens": 4287505.0,
|
| 404 |
+
"step": 1750
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"entropy": 0.2898535231500864,
|
| 408 |
+
"epoch": 4.675763482781027,
|
| 409 |
+
"grad_norm": 1.6645033359527588,
|
| 410 |
+
"learning_rate": 0.0002446738159390364,
|
| 411 |
+
"loss": 1.0582612609863282,
|
| 412 |
+
"mean_token_accuracy": 0.9177632886171341,
|
| 413 |
+
"num_tokens": 4412221.0,
|
| 414 |
+
"step": 1800
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"entropy": 0.2842763290554285,
|
| 418 |
+
"epoch": 4.805717998700455,
|
| 419 |
+
"grad_norm": 2.4594268798828125,
|
| 420 |
+
"learning_rate": 0.0002363451037509798,
|
| 421 |
+
"loss": 1.0467537689208983,
|
| 422 |
+
"mean_token_accuracy": 0.9177608361840248,
|
| 423 |
+
"num_tokens": 4537178.0,
|
| 424 |
+
"step": 1850
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"entropy": 0.284430123642087,
|
| 428 |
+
"epoch": 4.935672514619883,
|
| 429 |
+
"grad_norm": 2.1724514961242676,
|
| 430 |
+
"learning_rate": 0.00022792206981441223,
|
| 431 |
+
"loss": 1.0753899383544923,
|
| 432 |
+
"mean_token_accuracy": 0.915192686021328,
|
| 433 |
+
"num_tokens": 4664196.0,
|
| 434 |
+
"step": 1900
|
| 435 |
+
},
|
| 436 |
+
{
|
| 437 |
+
"epoch": 5.0,
|
| 438 |
+
"eval_entropy": 0.3632780872285366,
|
| 439 |
+
"eval_loss": 0.6438126564025879,
|
| 440 |
+
"eval_mean_token_accuracy": 0.8511462942338907,
|
| 441 |
+
"eval_num_tokens": 4723910.0,
|
| 442 |
+
"eval_runtime": 90.1846,
|
| 443 |
+
"eval_samples_per_second": 18.373,
|
| 444 |
+
"eval_steps_per_second": 2.306,
|
| 445 |
+
"step": 1925
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"entropy": 0.23515464736139355,
|
| 449 |
+
"epoch": 5.064977257959714,
|
| 450 |
+
"grad_norm": 1.651587724685669,
|
| 451 |
+
"learning_rate": 0.00021942202135469513,
|
| 452 |
+
"loss": 0.8597064971923828,
|
| 453 |
+
"mean_token_accuracy": 0.9324622603517082,
|
| 454 |
+
"num_tokens": 4789568.0,
|
| 455 |
+
"step": 1950
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"entropy": 0.1958953895419836,
|
| 459 |
+
"epoch": 5.1949317738791425,
|
| 460 |
+
"grad_norm": 1.923292636871338,
|
| 461 |
+
"learning_rate": 0.0002108624238427481,
|
| 462 |
+
"loss": 0.7188112640380859,
|
| 463 |
+
"mean_token_accuracy": 0.9416415295004845,
|
| 464 |
+
"num_tokens": 4913407.0,
|
| 465 |
+
"step": 2000
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"entropy": 0.21068542070686816,
|
| 469 |
+
"epoch": 5.32488628979857,
|
| 470 |
+
"grad_norm": 2.299356460571289,
|
| 471 |
+
"learning_rate": 0.0002022608651078804,
|
| 472 |
+
"loss": 0.7712985229492187,
|
| 473 |
+
"mean_token_accuracy": 0.9386440163850784,
|
| 474 |
+
"num_tokens": 5032951.0,
|
| 475 |
+
"step": 2050
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"entropy": 0.21234643168747425,
|
| 479 |
+
"epoch": 5.454840805717999,
|
| 480 |
+
"grad_norm": 2.2119295597076416,
|
| 481 |
+
"learning_rate": 0.00019363501919920608,
|
| 482 |
+
"loss": 0.7650181579589844,
|
| 483 |
+
"mean_token_accuracy": 0.938471505343914,
|
| 484 |
+
"num_tokens": 5156908.0,
|
| 485 |
+
"step": 2100
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"entropy": 0.21658269092440605,
|
| 489 |
+
"epoch": 5.584795321637427,
|
| 490 |
+
"grad_norm": 1.5394288301467896,
|
| 491 |
+
"learning_rate": 0.00018500261006989887,
|
| 492 |
+
"loss": 0.7784209442138672,
|
| 493 |
+
"mean_token_accuracy": 0.9371598136425018,
|
| 494 |
+
"num_tokens": 5276087.0,
|
| 495 |
+
"step": 2150
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"entropy": 0.2045296123996377,
|
| 499 |
+
"epoch": 5.714749837556855,
|
| 500 |
+
"grad_norm": 1.913680076599121,
|
| 501 |
+
"learning_rate": 0.00017638137515890763,
|
| 502 |
+
"loss": 0.7638166046142578,
|
| 503 |
+
"mean_token_accuracy": 0.9378301629424095,
|
| 504 |
+
"num_tokens": 5398787.0,
|
| 505 |
+
"step": 2200
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"entropy": 0.20917976945638656,
|
| 509 |
+
"epoch": 5.844704353476283,
|
| 510 |
+
"grad_norm": 2.0847299098968506,
|
| 511 |
+
"learning_rate": 0.00016778902894496063,
|
| 512 |
+
"loss": 0.7631703186035156,
|
| 513 |
+
"mean_token_accuracy": 0.9387557968497277,
|
| 514 |
+
"num_tokens": 5522332.0,
|
| 515 |
+
"step": 2250
|
| 516 |
+
},
|
| 517 |
+
{
|
| 518 |
+
"entropy": 0.22262076318264007,
|
| 519 |
+
"epoch": 5.974658869395712,
|
| 520 |
+
"grad_norm": 2.1597352027893066,
|
| 521 |
+
"learning_rate": 0.0001592432265477485,
|
| 522 |
+
"loss": 0.798133773803711,
|
| 523 |
+
"mean_token_accuracy": 0.936034984588623,
|
| 524 |
+
"num_tokens": 5642361.0,
|
| 525 |
+
"step": 2300
|
| 526 |
+
},
|
| 527 |
+
{
|
| 528 |
+
"epoch": 6.0,
|
| 529 |
+
"eval_entropy": 0.31502799331568754,
|
| 530 |
+
"eval_loss": 0.7417300343513489,
|
| 531 |
+
"eval_mean_token_accuracy": 0.8477253922476218,
|
| 532 |
+
"eval_num_tokens": 5668692.0,
|
| 533 |
+
"eval_runtime": 90.4252,
|
| 534 |
+
"eval_samples_per_second": 18.325,
|
| 535 |
+
"eval_steps_per_second": 2.3,
|
| 536 |
+
"step": 2310
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"entropy": 0.16796037876725795,
|
| 540 |
+
"epoch": 6.1039636127355426,
|
| 541 |
+
"grad_norm": 2.2228569984436035,
|
| 542 |
+
"learning_rate": 0.00015076152745107442,
|
| 543 |
+
"loss": 0.5835284805297851,
|
| 544 |
+
"mean_token_accuracy": 0.9529892874123463,
|
| 545 |
+
"num_tokens": 5766129.0,
|
| 546 |
+
"step": 2350
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"entropy": 0.14919219192117453,
|
| 550 |
+
"epoch": 6.23391812865497,
|
| 551 |
+
"grad_norm": 1.408840298652649,
|
| 552 |
+
"learning_rate": 0.00014236135942251215,
|
| 553 |
+
"loss": 0.5310631561279296,
|
| 554 |
+
"mean_token_accuracy": 0.9586454060673714,
|
| 555 |
+
"num_tokens": 5888746.0,
|
| 556 |
+
"step": 2400
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"entropy": 0.1499051059409976,
|
| 560 |
+
"epoch": 6.363872644574399,
|
| 561 |
+
"grad_norm": 1.8611102104187012,
|
| 562 |
+
"learning_rate": 0.00013405998270370849,
|
| 563 |
+
"loss": 0.5127810668945313,
|
| 564 |
+
"mean_token_accuracy": 0.9591325157880783,
|
| 565 |
+
"num_tokens": 6014455.0,
|
| 566 |
+
"step": 2450
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"entropy": 0.15334193099290133,
|
| 570 |
+
"epoch": 6.493827160493828,
|
| 571 |
+
"grad_norm": 1.6051015853881836,
|
| 572 |
+
"learning_rate": 0.00012587445454490892,
|
| 573 |
+
"loss": 0.5349758529663086,
|
| 574 |
+
"mean_token_accuracy": 0.9574431091547012,
|
| 575 |
+
"num_tokens": 6141229.0,
|
| 576 |
+
"step": 2500
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"entropy": 0.15982334002852439,
|
| 580 |
+
"epoch": 6.623781676413255,
|
| 581 |
+
"grad_norm": 3.7065205574035645,
|
| 582 |
+
"learning_rate": 0.00011782159415658008,
|
| 583 |
+
"loss": 0.5602469253540039,
|
| 584 |
+
"mean_token_accuracy": 0.9555372184515,
|
| 585 |
+
"num_tokens": 6257983.0,
|
| 586 |
+
"step": 2550
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"entropy": 0.16072992872446776,
|
| 590 |
+
"epoch": 6.753736192332683,
|
| 591 |
+
"grad_norm": 2.282320976257324,
|
| 592 |
+
"learning_rate": 0.00010991794815014401,
|
| 593 |
+
"loss": 0.5657939910888672,
|
| 594 |
+
"mean_token_accuracy": 0.9550630164146423,
|
| 595 |
+
"num_tokens": 6376198.0,
|
| 596 |
+
"step": 2600
|
| 597 |
+
},
|
| 598 |
+
{
|
| 599 |
+
"entropy": 0.1512781011685729,
|
| 600 |
+
"epoch": 6.883690708252112,
|
| 601 |
+
"grad_norm": 1.3716893196105957,
|
| 602 |
+
"learning_rate": 0.00010217975653883603,
|
| 603 |
+
"loss": 0.5340792465209961,
|
| 604 |
+
"mean_token_accuracy": 0.9578188157081604,
|
| 605 |
+
"num_tokens": 6502526.0,
|
| 606 |
+
"step": 2650
|
| 607 |
+
},
|
| 608 |
+
{
|
| 609 |
+
"epoch": 7.0,
|
| 610 |
+
"eval_entropy": 0.2444461930829745,
|
| 611 |
+
"eval_loss": 0.8798949718475342,
|
| 612 |
+
"eval_mean_token_accuracy": 0.8457763839799625,
|
| 613 |
+
"eval_num_tokens": 6613474.0,
|
| 614 |
+
"eval_runtime": 90.2868,
|
| 615 |
+
"eval_samples_per_second": 18.353,
|
| 616 |
+
"eval_steps_per_second": 2.304,
|
| 617 |
+
"step": 2695
|
| 618 |
+
},
|
| 619 |
+
{
|
| 620 |
+
"entropy": 0.1444593005668578,
|
| 621 |
+
"epoch": 7.012995451591943,
|
| 622 |
+
"grad_norm": 1.0965569019317627,
|
| 623 |
+
"learning_rate": 9.462291936854386e-05,
|
| 624 |
+
"loss": 0.511833839416504,
|
| 625 |
+
"mean_token_accuracy": 0.9595773016388093,
|
| 626 |
+
"num_tokens": 6626464.0,
|
| 627 |
+
"step": 2700
|
| 628 |
+
},
|
| 629 |
+
{
|
| 630 |
+
"entropy": 0.10985541097819805,
|
| 631 |
+
"epoch": 7.142949967511371,
|
| 632 |
+
"grad_norm": 1.8079149723052979,
|
| 633 |
+
"learning_rate": 8.726296404719584e-05,
|
| 634 |
+
"loss": 0.3876673126220703,
|
| 635 |
+
"mean_token_accuracy": 0.9704919803142548,
|
| 636 |
+
"num_tokens": 6746276.0,
|
| 637 |
+
"step": 2750
|
| 638 |
+
},
|
| 639 |
+
{
|
| 640 |
+
"entropy": 0.11304264679551125,
|
| 641 |
+
"epoch": 7.272904483430799,
|
| 642 |
+
"grad_norm": 1.5228444337844849,
|
| 643 |
+
"learning_rate": 8.01150134398253e-05,
|
| 644 |
+
"loss": 0.39335052490234373,
|
| 645 |
+
"mean_token_accuracy": 0.9695766788721084,
|
| 646 |
+
"num_tokens": 6868131.0,
|
| 647 |
+
"step": 2800
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"entropy": 0.11066193280741572,
|
| 651 |
+
"epoch": 7.402858999350228,
|
| 652 |
+
"grad_norm": 2.265174388885498,
|
| 653 |
+
"learning_rate": 7.319375479487112e-05,
|
| 654 |
+
"loss": 0.38289966583251955,
|
| 655 |
+
"mean_token_accuracy": 0.9707033503055572,
|
| 656 |
+
"num_tokens": 6993803.0,
|
| 657 |
+
"step": 2850
|
| 658 |
+
},
|
| 659 |
+
{
|
| 660 |
+
"entropy": 0.12022399662062526,
|
| 661 |
+
"epoch": 7.532813515269655,
|
| 662 |
+
"grad_norm": 1.0657345056533813,
|
| 663 |
+
"learning_rate": 6.65134095655596e-05,
|
| 664 |
+
"loss": 0.4089087677001953,
|
| 665 |
+
"mean_token_accuracy": 0.9689779531955719,
|
| 666 |
+
"num_tokens": 7113063.0,
|
| 667 |
+
"step": 2900
|
| 668 |
+
},
|
| 669 |
+
{
|
| 670 |
+
"entropy": 0.11429863104596734,
|
| 671 |
+
"epoch": 7.662768031189084,
|
| 672 |
+
"grad_norm": 1.3440358638763428,
|
| 673 |
+
"learning_rate": 6.008770418837973e-05,
|
| 674 |
+
"loss": 0.3935198593139648,
|
| 675 |
+
"mean_token_accuracy": 0.9698223957419395,
|
| 676 |
+
"num_tokens": 7237174.0,
|
| 677 |
+
"step": 2950
|
| 678 |
+
},
|
| 679 |
+
{
|
| 680 |
+
"entropy": 0.11748226622119545,
|
| 681 |
+
"epoch": 7.792722547108512,
|
| 682 |
+
"grad_norm": 1.4607034921646118,
|
| 683 |
+
"learning_rate": 5.3929841878693804e-05,
|
| 684 |
+
"loss": 0.40399799346923826,
|
| 685 |
+
"mean_token_accuracy": 0.9695871344208717,
|
| 686 |
+
"num_tokens": 7357301.0,
|
| 687 |
+
"step": 3000
|
| 688 |
+
},
|
| 689 |
+
{
|
| 690 |
+
"entropy": 0.11790506653487683,
|
| 691 |
+
"epoch": 7.92267706302794,
|
| 692 |
+
"grad_norm": 1.4574708938598633,
|
| 693 |
+
"learning_rate": 4.805247550143646e-05,
|
| 694 |
+
"loss": 0.4049314880371094,
|
| 695 |
+
"mean_token_accuracy": 0.9693469110131264,
|
| 696 |
+
"num_tokens": 7482431.0,
|
| 697 |
+
"step": 3050
|
| 698 |
+
},
|
| 699 |
+
{
|
| 700 |
+
"epoch": 8.0,
|
| 701 |
+
"eval_entropy": 0.2104659411483086,
|
| 702 |
+
"eval_loss": 0.9939886927604675,
|
| 703 |
+
"eval_mean_token_accuracy": 0.8444042455118436,
|
| 704 |
+
"eval_num_tokens": 7558256.0,
|
| 705 |
+
"eval_runtime": 90.3118,
|
| 706 |
+
"eval_samples_per_second": 18.348,
|
| 707 |
+
"eval_steps_per_second": 2.303,
|
| 708 |
+
"step": 3080
|
| 709 |
+
}
|
| 710 |
+
],
|
| 711 |
+
"logging_steps": 50,
|
| 712 |
+
"max_steps": 3850,
|
| 713 |
+
"num_input_tokens_seen": 0,
|
| 714 |
+
"num_train_epochs": 10,
|
| 715 |
+
"save_steps": 500,
|
| 716 |
+
"stateful_callbacks": {
|
| 717 |
+
"TrainerControl": {
|
| 718 |
+
"args": {
|
| 719 |
+
"should_epoch_stop": false,
|
| 720 |
+
"should_evaluate": false,
|
| 721 |
+
"should_log": false,
|
| 722 |
+
"should_save": true,
|
| 723 |
+
"should_training_stop": false
|
| 724 |
+
},
|
| 725 |
+
"attributes": {}
|
| 726 |
+
}
|
| 727 |
+
},
|
| 728 |
+
"total_flos": 2.648642717750723e+18,
|
| 729 |
+
"train_batch_size": 4,
|
| 730 |
+
"trial_name": null,
|
| 731 |
+
"trial_params": null
|
| 732 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: transformers
|
| 4 |
+
model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test1
|
| 5 |
+
tags:
|
| 6 |
+
- generated_from_trainer
|
| 7 |
+
- sft
|
| 8 |
+
- trl
|
| 9 |
+
licence: license
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test1
|
| 13 |
+
|
| 14 |
+
This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B).
|
| 15 |
+
It has been trained using [TRL](https://github.com/huggingface/trl).
|
| 16 |
+
|
| 17 |
+
## Quick start
|
| 18 |
+
|
| 19 |
+
```python
|
| 20 |
+
from transformers import pipeline
|
| 21 |
+
|
| 22 |
+
question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
|
| 23 |
+
generator = pipeline("text-generation", model="None", device="cuda")
|
| 24 |
+
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
|
| 25 |
+
print(output["generated_text"])
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
## Training procedure
|
| 29 |
+
|
| 30 |
+
[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/rfqns0wc)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
This model was trained with SFT.
|
| 35 |
+
|
| 36 |
+
### Framework versions
|
| 37 |
+
|
| 38 |
+
- TRL: 0.29.0
|
| 39 |
+
- Transformers: 5.5.4
|
| 40 |
+
- Pytorch: 2.10.0
|
| 41 |
+
- Datasets: 4.6.1
|
| 42 |
+
- Tokenizers: 0.22.2
|
| 43 |
+
|
| 44 |
+
## Citations
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
Cite TRL as:
|
| 49 |
+
|
| 50 |
+
```bibtex
|
| 51 |
+
@software{vonwerra2020trl,
|
| 52 |
+
title = {{TRL: Transformers Reinforcement Learning}},
|
| 53 |
+
author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
|
| 54 |
+
license = {Apache-2.0},
|
| 55 |
+
url = {https://github.com/huggingface/trl},
|
| 56 |
+
year = {2020}
|
| 57 |
+
}
|
| 58 |
+
```
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.015034304668777832,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 64,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1122,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.3355020767450332,
|
| 14 |
+
"epoch": 0.13386880856760375,
|
| 15 |
+
"grad_norm": 3.2956597805023193,
|
| 16 |
+
"learning_rate": 1.628530639938585e-05,
|
| 17 |
+
"loss": 5.349910278320312,
|
| 18 |
+
"mean_token_accuracy": 0.7383818039298058,
|
| 19 |
+
"num_tokens": 116199.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.5958842460811138,
|
| 24 |
+
"epoch": 0.2677376171352075,
|
| 25 |
+
"grad_norm": 2.5947492122650146,
|
| 26 |
+
"learning_rate": 3.290296599059591e-05,
|
| 27 |
+
"loss": 2.312855072021484,
|
| 28 |
+
"mean_token_accuracy": 0.8520967712998391,
|
| 29 |
+
"num_tokens": 232864.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5190362003445625,
|
| 34 |
+
"epoch": 0.40160642570281124,
|
| 35 |
+
"grad_norm": 1.5038394927978516,
|
| 36 |
+
"learning_rate": 4.9520625581805955e-05,
|
| 37 |
+
"loss": 2.0574468994140624,
|
| 38 |
+
"mean_token_accuracy": 0.8657039344310761,
|
| 39 |
+
"num_tokens": 352382.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.4922871346771717,
|
| 44 |
+
"epoch": 0.535475234270415,
|
| 45 |
+
"grad_norm": 1.645923137664795,
|
| 46 |
+
"learning_rate": 6.613828517301602e-05,
|
| 47 |
+
"loss": 1.916438446044922,
|
| 48 |
+
"mean_token_accuracy": 0.8717759534716606,
|
| 49 |
+
"num_tokens": 474532.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.491110111027956,
|
| 54 |
+
"epoch": 0.6693440428380187,
|
| 55 |
+
"grad_norm": 1.866817593574524,
|
| 56 |
+
"learning_rate": 8.275594476422607e-05,
|
| 57 |
+
"loss": 1.9421713256835937,
|
| 58 |
+
"mean_token_accuracy": 0.8710730043053627,
|
| 59 |
+
"num_tokens": 589198.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.47134352535009383,
|
| 64 |
+
"epoch": 0.8032128514056225,
|
| 65 |
+
"grad_norm": 117.62409210205078,
|
| 66 |
+
"learning_rate": 9.937360435543611e-05,
|
| 67 |
+
"loss": 1.9768324279785157,
|
| 68 |
+
"mean_token_accuracy": 0.8741078078746796,
|
| 69 |
+
"num_tokens": 707057.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.4820582258701325,
|
| 74 |
+
"epoch": 0.9370816599732262,
|
| 75 |
+
"grad_norm": 2.3274827003479004,
|
| 76 |
+
"learning_rate": 0.00011599126394664616,
|
| 77 |
+
"loss": 2.2025875854492187,
|
| 78 |
+
"mean_token_accuracy": 0.8697148504853248,
|
| 79 |
+
"num_tokens": 822888.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5010400542616844,
|
| 85 |
+
"eval_loss": 0.5114277601242065,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8587275749444961,
|
| 87 |
+
"eval_num_tokens": 872247.0,
|
| 88 |
+
"eval_runtime": 96.5515,
|
| 89 |
+
"eval_samples_per_second": 16.561,
|
| 90 |
+
"eval_steps_per_second": 2.071,
|
| 91 |
+
"step": 374
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.4708875769918615,
|
| 95 |
+
"epoch": 1.069611780455154,
|
| 96 |
+
"grad_norm": 3.3712940216064453,
|
| 97 |
+
"learning_rate": 0.00012428317596508976,
|
| 98 |
+
"loss": 1.83294189453125,
|
| 99 |
+
"mean_token_accuracy": 0.8772370366737096,
|
| 100 |
+
"num_tokens": 929365.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.44804590195417404,
|
| 105 |
+
"epoch": 1.2034805890227578,
|
| 106 |
+
"grad_norm": 1.4833389520645142,
|
| 107 |
+
"learning_rate": 0.00012414788900475706,
|
| 108 |
+
"loss": 1.7768891906738282,
|
| 109 |
+
"mean_token_accuracy": 0.8791097947955131,
|
| 110 |
+
"num_tokens": 1046629.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.4510513086616993,
|
| 115 |
+
"epoch": 1.3373493975903614,
|
| 116 |
+
"grad_norm": 2.814790964126587,
|
| 117 |
+
"learning_rate": 0.00012387760965418496,
|
| 118 |
+
"loss": 1.7745071411132813,
|
| 119 |
+
"mean_token_accuracy": 0.8813075706362724,
|
| 120 |
+
"num_tokens": 1165744.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.4479117552936077,
|
| 125 |
+
"epoch": 1.4712182061579653,
|
| 126 |
+
"grad_norm": 1.855610728263855,
|
| 127 |
+
"learning_rate": 0.00012347292641217135,
|
| 128 |
+
"loss": 1.7583291625976563,
|
| 129 |
+
"mean_token_accuracy": 0.8815277495980263,
|
| 130 |
+
"num_tokens": 1284843.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.4380264139175415,
|
| 135 |
+
"epoch": 1.605087014725569,
|
| 136 |
+
"grad_norm": 1.383190631866455,
|
| 137 |
+
"learning_rate": 0.00012293472042483757,
|
| 138 |
+
"loss": 1.7229583740234375,
|
| 139 |
+
"mean_token_accuracy": 0.8832098203897476,
|
| 140 |
+
"num_tokens": 1406485.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.4342571949958801,
|
| 145 |
+
"epoch": 1.7389558232931726,
|
| 146 |
+
"grad_norm": 1.4977834224700928,
|
| 147 |
+
"learning_rate": 0.00012226416356704526,
|
| 148 |
+
"loss": 1.7174737548828125,
|
| 149 |
+
"mean_token_accuracy": 0.8834967383742333,
|
| 150 |
+
"num_tokens": 1525460.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.42700962007045745,
|
| 155 |
+
"epoch": 1.8728246318607764,
|
| 156 |
+
"grad_norm": 1.6156537532806396,
|
| 157 |
+
"learning_rate": 0.00012146271589078838,
|
| 158 |
+
"loss": 1.682061767578125,
|
| 159 |
+
"mean_token_accuracy": 0.8858474844694137,
|
| 160 |
+
"num_tokens": 1638984.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"epoch": 2.0,
|
| 165 |
+
"eval_entropy": 0.4838937771320343,
|
| 166 |
+
"eval_loss": 0.4826815128326416,
|
| 167 |
+
"eval_mean_token_accuracy": 0.8682844692468643,
|
| 168 |
+
"eval_num_tokens": 1744494.0,
|
| 169 |
+
"eval_runtime": 96.5071,
|
| 170 |
+
"eval_samples_per_second": 16.569,
|
| 171 |
+
"eval_steps_per_second": 2.072,
|
| 172 |
+
"step": 748
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"entropy": 0.4378527848407476,
|
| 176 |
+
"epoch": 2.005354752342704,
|
| 177 |
+
"grad_norm": 1.400229573249817,
|
| 178 |
+
"learning_rate": 0.0001205321224461161,
|
| 179 |
+
"loss": 1.7096096801757812,
|
| 180 |
+
"mean_token_accuracy": 0.8838462468349573,
|
| 181 |
+
"num_tokens": 1749755.0,
|
| 182 |
+
"step": 750
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.3559799794852734,
|
| 186 |
+
"epoch": 2.139223560910308,
|
| 187 |
+
"grad_norm": 1.7168083190917969,
|
| 188 |
+
"learning_rate": 0.0001194744094815093,
|
| 189 |
+
"loss": 1.3893603515625,
|
| 190 |
+
"mean_token_accuracy": 0.9004731178283691,
|
| 191 |
+
"num_tokens": 1868231.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.3671448823064566,
|
| 196 |
+
"epoch": 2.2730923694779115,
|
| 197 |
+
"grad_norm": 1.9720135927200317,
|
| 198 |
+
"learning_rate": 0.00011829188003198282,
|
| 199 |
+
"loss": 1.429988555908203,
|
| 200 |
+
"mean_token_accuracy": 0.8970818132162094,
|
| 201 |
+
"num_tokens": 1979116.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.3597494306415319,
|
| 206 |
+
"epoch": 2.4069611780455156,
|
| 207 |
+
"grad_norm": 1.4947372674942017,
|
| 208 |
+
"learning_rate": 0.00011698710890452068,
|
| 209 |
+
"loss": 1.418173828125,
|
| 210 |
+
"mean_token_accuracy": 0.8994651186466217,
|
| 211 |
+
"num_tokens": 2094539.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.36254502907395364,
|
| 216 |
+
"epoch": 2.540829986613119,
|
| 217 |
+
"grad_norm": 1.6768454313278198,
|
| 218 |
+
"learning_rate": 0.00011556293707176242,
|
| 219 |
+
"loss": 1.4158590698242188,
|
| 220 |
+
"mean_token_accuracy": 0.8995477721095085,
|
| 221 |
+
"num_tokens": 2209415.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.36290778368711474,
|
| 226 |
+
"epoch": 2.674698795180723,
|
| 227 |
+
"grad_norm": 1.6033697128295898,
|
| 228 |
+
"learning_rate": 0.00011402246548614765,
|
| 229 |
+
"loss": 1.4300469970703125,
|
| 230 |
+
"mean_token_accuracy": 0.8986452376842499,
|
| 231 |
+
"num_tokens": 2324269.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.3635872249305248,
|
| 236 |
+
"epoch": 2.8085676037483265,
|
| 237 |
+
"grad_norm": 1.546893835067749,
|
| 238 |
+
"learning_rate": 0.00011236904832798785,
|
| 239 |
+
"loss": 1.42587646484375,
|
| 240 |
+
"mean_token_accuracy": 0.9003903394937516,
|
| 241 |
+
"num_tokens": 2447336.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.36871150620281695,
|
| 246 |
+
"epoch": 2.9424364123159306,
|
| 247 |
+
"grad_norm": 1.2951405048370361,
|
| 248 |
+
"learning_rate": 0.0001106062857021667,
|
| 249 |
+
"loss": 1.448046875,
|
| 250 |
+
"mean_token_accuracy": 0.8967258337140084,
|
| 251 |
+
"num_tokens": 2565837.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"epoch": 3.0,
|
| 256 |
+
"eval_entropy": 0.4225208269059658,
|
| 257 |
+
"eval_loss": 0.489418089389801,
|
| 258 |
+
"eval_mean_token_accuracy": 0.8697815361618996,
|
| 259 |
+
"eval_num_tokens": 2616741.0,
|
| 260 |
+
"eval_runtime": 96.4058,
|
| 261 |
+
"eval_samples_per_second": 16.586,
|
| 262 |
+
"eval_steps_per_second": 2.075,
|
| 263 |
+
"step": 1122
|
| 264 |
+
}
|
| 265 |
+
],
|
| 266 |
+
"logging_steps": 50,
|
| 267 |
+
"max_steps": 3740,
|
| 268 |
+
"num_input_tokens_seen": 0,
|
| 269 |
+
"num_train_epochs": 10,
|
| 270 |
+
"save_steps": 500,
|
| 271 |
+
"stateful_callbacks": {
|
| 272 |
+
"TrainerControl": {
|
| 273 |
+
"args": {
|
| 274 |
+
"should_epoch_stop": false,
|
| 275 |
+
"should_evaluate": false,
|
| 276 |
+
"should_log": false,
|
| 277 |
+
"should_save": true,
|
| 278 |
+
"should_training_stop": false
|
| 279 |
+
},
|
| 280 |
+
"attributes": {}
|
| 281 |
+
}
|
| 282 |
+
},
|
| 283 |
+
"total_flos": 8.979346498185751e+17,
|
| 284 |
+
"train_batch_size": 4,
|
| 285 |
+
"trial_name": null,
|
| 286 |
+
"trial_params": null
|
| 287 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.015034304668777832,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 64,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json
ADDED
|
@@ -0,0 +1,368 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 4.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1496,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.3355020767450332,
|
| 14 |
+
"epoch": 0.13386880856760375,
|
| 15 |
+
"grad_norm": 3.2956597805023193,
|
| 16 |
+
"learning_rate": 1.628530639938585e-05,
|
| 17 |
+
"loss": 5.349910278320312,
|
| 18 |
+
"mean_token_accuracy": 0.7383818039298058,
|
| 19 |
+
"num_tokens": 116199.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.5958842460811138,
|
| 24 |
+
"epoch": 0.2677376171352075,
|
| 25 |
+
"grad_norm": 2.5947492122650146,
|
| 26 |
+
"learning_rate": 3.290296599059591e-05,
|
| 27 |
+
"loss": 2.312855072021484,
|
| 28 |
+
"mean_token_accuracy": 0.8520967712998391,
|
| 29 |
+
"num_tokens": 232864.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5190362003445625,
|
| 34 |
+
"epoch": 0.40160642570281124,
|
| 35 |
+
"grad_norm": 1.5038394927978516,
|
| 36 |
+
"learning_rate": 4.9520625581805955e-05,
|
| 37 |
+
"loss": 2.0574468994140624,
|
| 38 |
+
"mean_token_accuracy": 0.8657039344310761,
|
| 39 |
+
"num_tokens": 352382.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.4922871346771717,
|
| 44 |
+
"epoch": 0.535475234270415,
|
| 45 |
+
"grad_norm": 1.645923137664795,
|
| 46 |
+
"learning_rate": 6.613828517301602e-05,
|
| 47 |
+
"loss": 1.916438446044922,
|
| 48 |
+
"mean_token_accuracy": 0.8717759534716606,
|
| 49 |
+
"num_tokens": 474532.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.491110111027956,
|
| 54 |
+
"epoch": 0.6693440428380187,
|
| 55 |
+
"grad_norm": 1.866817593574524,
|
| 56 |
+
"learning_rate": 8.275594476422607e-05,
|
| 57 |
+
"loss": 1.9421713256835937,
|
| 58 |
+
"mean_token_accuracy": 0.8710730043053627,
|
| 59 |
+
"num_tokens": 589198.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.47134352535009383,
|
| 64 |
+
"epoch": 0.8032128514056225,
|
| 65 |
+
"grad_norm": 117.62409210205078,
|
| 66 |
+
"learning_rate": 9.937360435543611e-05,
|
| 67 |
+
"loss": 1.9768324279785157,
|
| 68 |
+
"mean_token_accuracy": 0.8741078078746796,
|
| 69 |
+
"num_tokens": 707057.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.4820582258701325,
|
| 74 |
+
"epoch": 0.9370816599732262,
|
| 75 |
+
"grad_norm": 2.3274827003479004,
|
| 76 |
+
"learning_rate": 0.00011599126394664616,
|
| 77 |
+
"loss": 2.2025875854492187,
|
| 78 |
+
"mean_token_accuracy": 0.8697148504853248,
|
| 79 |
+
"num_tokens": 822888.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5010400542616844,
|
| 85 |
+
"eval_loss": 0.5114277601242065,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8587275749444961,
|
| 87 |
+
"eval_num_tokens": 872247.0,
|
| 88 |
+
"eval_runtime": 96.5515,
|
| 89 |
+
"eval_samples_per_second": 16.561,
|
| 90 |
+
"eval_steps_per_second": 2.071,
|
| 91 |
+
"step": 374
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.4708875769918615,
|
| 95 |
+
"epoch": 1.069611780455154,
|
| 96 |
+
"grad_norm": 3.3712940216064453,
|
| 97 |
+
"learning_rate": 0.00012428317596508976,
|
| 98 |
+
"loss": 1.83294189453125,
|
| 99 |
+
"mean_token_accuracy": 0.8772370366737096,
|
| 100 |
+
"num_tokens": 929365.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.44804590195417404,
|
| 105 |
+
"epoch": 1.2034805890227578,
|
| 106 |
+
"grad_norm": 1.4833389520645142,
|
| 107 |
+
"learning_rate": 0.00012414788900475706,
|
| 108 |
+
"loss": 1.7768891906738282,
|
| 109 |
+
"mean_token_accuracy": 0.8791097947955131,
|
| 110 |
+
"num_tokens": 1046629.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.4510513086616993,
|
| 115 |
+
"epoch": 1.3373493975903614,
|
| 116 |
+
"grad_norm": 2.814790964126587,
|
| 117 |
+
"learning_rate": 0.00012387760965418496,
|
| 118 |
+
"loss": 1.7745071411132813,
|
| 119 |
+
"mean_token_accuracy": 0.8813075706362724,
|
| 120 |
+
"num_tokens": 1165744.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.4479117552936077,
|
| 125 |
+
"epoch": 1.4712182061579653,
|
| 126 |
+
"grad_norm": 1.855610728263855,
|
| 127 |
+
"learning_rate": 0.00012347292641217135,
|
| 128 |
+
"loss": 1.7583291625976563,
|
| 129 |
+
"mean_token_accuracy": 0.8815277495980263,
|
| 130 |
+
"num_tokens": 1284843.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.4380264139175415,
|
| 135 |
+
"epoch": 1.605087014725569,
|
| 136 |
+
"grad_norm": 1.383190631866455,
|
| 137 |
+
"learning_rate": 0.00012293472042483757,
|
| 138 |
+
"loss": 1.7229583740234375,
|
| 139 |
+
"mean_token_accuracy": 0.8832098203897476,
|
| 140 |
+
"num_tokens": 1406485.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.4342571949958801,
|
| 145 |
+
"epoch": 1.7389558232931726,
|
| 146 |
+
"grad_norm": 1.4977834224700928,
|
| 147 |
+
"learning_rate": 0.00012226416356704526,
|
| 148 |
+
"loss": 1.7174737548828125,
|
| 149 |
+
"mean_token_accuracy": 0.8834967383742333,
|
| 150 |
+
"num_tokens": 1525460.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.42700962007045745,
|
| 155 |
+
"epoch": 1.8728246318607764,
|
| 156 |
+
"grad_norm": 1.6156537532806396,
|
| 157 |
+
"learning_rate": 0.00012146271589078838,
|
| 158 |
+
"loss": 1.682061767578125,
|
| 159 |
+
"mean_token_accuracy": 0.8858474844694137,
|
| 160 |
+
"num_tokens": 1638984.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"epoch": 2.0,
|
| 165 |
+
"eval_entropy": 0.4838937771320343,
|
| 166 |
+
"eval_loss": 0.4826815128326416,
|
| 167 |
+
"eval_mean_token_accuracy": 0.8682844692468643,
|
| 168 |
+
"eval_num_tokens": 1744494.0,
|
| 169 |
+
"eval_runtime": 96.5071,
|
| 170 |
+
"eval_samples_per_second": 16.569,
|
| 171 |
+
"eval_steps_per_second": 2.072,
|
| 172 |
+
"step": 748
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"entropy": 0.4378527848407476,
|
| 176 |
+
"epoch": 2.005354752342704,
|
| 177 |
+
"grad_norm": 1.400229573249817,
|
| 178 |
+
"learning_rate": 0.0001205321224461161,
|
| 179 |
+
"loss": 1.7096096801757812,
|
| 180 |
+
"mean_token_accuracy": 0.8838462468349573,
|
| 181 |
+
"num_tokens": 1749755.0,
|
| 182 |
+
"step": 750
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.3559799794852734,
|
| 186 |
+
"epoch": 2.139223560910308,
|
| 187 |
+
"grad_norm": 1.7168083190917969,
|
| 188 |
+
"learning_rate": 0.0001194744094815093,
|
| 189 |
+
"loss": 1.3893603515625,
|
| 190 |
+
"mean_token_accuracy": 0.9004731178283691,
|
| 191 |
+
"num_tokens": 1868231.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.3671448823064566,
|
| 196 |
+
"epoch": 2.2730923694779115,
|
| 197 |
+
"grad_norm": 1.9720135927200317,
|
| 198 |
+
"learning_rate": 0.00011829188003198282,
|
| 199 |
+
"loss": 1.429988555908203,
|
| 200 |
+
"mean_token_accuracy": 0.8970818132162094,
|
| 201 |
+
"num_tokens": 1979116.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.3597494306415319,
|
| 206 |
+
"epoch": 2.4069611780455156,
|
| 207 |
+
"grad_norm": 1.4947372674942017,
|
| 208 |
+
"learning_rate": 0.00011698710890452068,
|
| 209 |
+
"loss": 1.418173828125,
|
| 210 |
+
"mean_token_accuracy": 0.8994651186466217,
|
| 211 |
+
"num_tokens": 2094539.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.36254502907395364,
|
| 216 |
+
"epoch": 2.540829986613119,
|
| 217 |
+
"grad_norm": 1.6768454313278198,
|
| 218 |
+
"learning_rate": 0.00011556293707176242,
|
| 219 |
+
"loss": 1.4158590698242188,
|
| 220 |
+
"mean_token_accuracy": 0.8995477721095085,
|
| 221 |
+
"num_tokens": 2209415.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.36290778368711474,
|
| 226 |
+
"epoch": 2.674698795180723,
|
| 227 |
+
"grad_norm": 1.6033697128295898,
|
| 228 |
+
"learning_rate": 0.00011402246548614765,
|
| 229 |
+
"loss": 1.4300469970703125,
|
| 230 |
+
"mean_token_accuracy": 0.8986452376842499,
|
| 231 |
+
"num_tokens": 2324269.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.3635872249305248,
|
| 236 |
+
"epoch": 2.8085676037483265,
|
| 237 |
+
"grad_norm": 1.546893835067749,
|
| 238 |
+
"learning_rate": 0.00011236904832798785,
|
| 239 |
+
"loss": 1.42587646484375,
|
| 240 |
+
"mean_token_accuracy": 0.9003903394937516,
|
| 241 |
+
"num_tokens": 2447336.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.36871150620281695,
|
| 246 |
+
"epoch": 2.9424364123159306,
|
| 247 |
+
"grad_norm": 1.2951405048370361,
|
| 248 |
+
"learning_rate": 0.0001106062857021667,
|
| 249 |
+
"loss": 1.448046875,
|
| 250 |
+
"mean_token_accuracy": 0.8967258337140084,
|
| 251 |
+
"num_tokens": 2565837.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"epoch": 3.0,
|
| 256 |
+
"eval_entropy": 0.4225208269059658,
|
| 257 |
+
"eval_loss": 0.489418089389801,
|
| 258 |
+
"eval_mean_token_accuracy": 0.8697815361618996,
|
| 259 |
+
"eval_num_tokens": 2616741.0,
|
| 260 |
+
"eval_runtime": 96.4058,
|
| 261 |
+
"eval_samples_per_second": 16.586,
|
| 262 |
+
"eval_steps_per_second": 2.075,
|
| 263 |
+
"step": 1122
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"entropy": 0.3120347365285411,
|
| 267 |
+
"epoch": 3.074966532797858,
|
| 268 |
+
"grad_norm": 1.639520287513733,
|
| 269 |
+
"learning_rate": 0.00010873801579937106,
|
| 270 |
+
"loss": 1.1941973876953125,
|
| 271 |
+
"mean_token_accuracy": 0.9117801315856703,
|
| 272 |
+
"num_tokens": 2685975.0,
|
| 273 |
+
"step": 1150
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.28257040068507194,
|
| 277 |
+
"epoch": 3.208835341365462,
|
| 278 |
+
"grad_norm": 1.7459681034088135,
|
| 279 |
+
"learning_rate": 0.00010676830653892058,
|
| 280 |
+
"loss": 1.0850601196289062,
|
| 281 |
+
"mean_token_accuracy": 0.9177472350001336,
|
| 282 |
+
"num_tokens": 2798277.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.27802520349621773,
|
| 287 |
+
"epoch": 3.3427041499330654,
|
| 288 |
+
"grad_norm": 1.5176103115081787,
|
| 289 |
+
"learning_rate": 0.00010470144671139238,
|
| 290 |
+
"loss": 1.0840838623046876,
|
| 291 |
+
"mean_token_accuracy": 0.9179763168096542,
|
| 292 |
+
"num_tokens": 2918973.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.280417420566082,
|
| 297 |
+
"epoch": 3.4765729585006695,
|
| 298 |
+
"grad_norm": 1.3774974346160889,
|
| 299 |
+
"learning_rate": 0.00010254193664032686,
|
| 300 |
+
"loss": 1.0911756896972655,
|
| 301 |
+
"mean_token_accuracy": 0.9162956389784813,
|
| 302 |
+
"num_tokens": 3039073.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.2834589210152626,
|
| 307 |
+
"epoch": 3.610441767068273,
|
| 308 |
+
"grad_norm": 1.5929396152496338,
|
| 309 |
+
"learning_rate": 0.00010029447838334742,
|
| 310 |
+
"loss": 1.0985262298583984,
|
| 311 |
+
"mean_token_accuracy": 0.9174074530601501,
|
| 312 |
+
"num_tokens": 3153710.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.282296127229929,
|
| 317 |
+
"epoch": 3.7443105756358768,
|
| 318 |
+
"grad_norm": 1.50350022315979,
|
| 319 |
+
"learning_rate": 9.796396549403e-05,
|
| 320 |
+
"loss": 1.101386260986328,
|
| 321 |
+
"mean_token_accuracy": 0.9168545073270797,
|
| 322 |
+
"num_tokens": 3263594.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.279728781580925,
|
| 327 |
+
"epoch": 3.878179384203481,
|
| 328 |
+
"grad_norm": 1.4728187322616577,
|
| 329 |
+
"learning_rate": 9.555547236681456e-05,
|
| 330 |
+
"loss": 1.0859880065917968,
|
| 331 |
+
"mean_token_accuracy": 0.9178367125988006,
|
| 332 |
+
"num_tokens": 3386033.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"epoch": 4.0,
|
| 337 |
+
"eval_entropy": 0.34304031178355215,
|
| 338 |
+
"eval_loss": 0.5295785665512085,
|
| 339 |
+
"eval_mean_token_accuracy": 0.8698753178119659,
|
| 340 |
+
"eval_num_tokens": 3488988.0,
|
| 341 |
+
"eval_runtime": 96.3616,
|
| 342 |
+
"eval_samples_per_second": 16.594,
|
| 343 |
+
"eval_steps_per_second": 2.076,
|
| 344 |
+
"step": 1496
|
| 345 |
+
}
|
| 346 |
+
],
|
| 347 |
+
"logging_steps": 50,
|
| 348 |
+
"max_steps": 3740,
|
| 349 |
+
"num_input_tokens_seen": 0,
|
| 350 |
+
"num_train_epochs": 10,
|
| 351 |
+
"save_steps": 500,
|
| 352 |
+
"stateful_callbacks": {
|
| 353 |
+
"TrainerControl": {
|
| 354 |
+
"args": {
|
| 355 |
+
"should_epoch_stop": false,
|
| 356 |
+
"should_evaluate": false,
|
| 357 |
+
"should_log": false,
|
| 358 |
+
"should_save": true,
|
| 359 |
+
"should_training_stop": false
|
| 360 |
+
},
|
| 361 |
+
"attributes": {}
|
| 362 |
+
}
|
| 363 |
+
},
|
| 364 |
+
"total_flos": 1.1971161045794035e+18,
|
| 365 |
+
"train_batch_size": 4,
|
| 366 |
+
"trial_name": null,
|
| 367 |
+
"trial_params": null
|
| 368 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.015034304668777832,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 64,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json
ADDED
|
@@ -0,0 +1,459 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 5.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1870,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.3355020767450332,
|
| 14 |
+
"epoch": 0.13386880856760375,
|
| 15 |
+
"grad_norm": 3.2956597805023193,
|
| 16 |
+
"learning_rate": 1.628530639938585e-05,
|
| 17 |
+
"loss": 5.349910278320312,
|
| 18 |
+
"mean_token_accuracy": 0.7383818039298058,
|
| 19 |
+
"num_tokens": 116199.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.5958842460811138,
|
| 24 |
+
"epoch": 0.2677376171352075,
|
| 25 |
+
"grad_norm": 2.5947492122650146,
|
| 26 |
+
"learning_rate": 3.290296599059591e-05,
|
| 27 |
+
"loss": 2.312855072021484,
|
| 28 |
+
"mean_token_accuracy": 0.8520967712998391,
|
| 29 |
+
"num_tokens": 232864.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5190362003445625,
|
| 34 |
+
"epoch": 0.40160642570281124,
|
| 35 |
+
"grad_norm": 1.5038394927978516,
|
| 36 |
+
"learning_rate": 4.9520625581805955e-05,
|
| 37 |
+
"loss": 2.0574468994140624,
|
| 38 |
+
"mean_token_accuracy": 0.8657039344310761,
|
| 39 |
+
"num_tokens": 352382.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.4922871346771717,
|
| 44 |
+
"epoch": 0.535475234270415,
|
| 45 |
+
"grad_norm": 1.645923137664795,
|
| 46 |
+
"learning_rate": 6.613828517301602e-05,
|
| 47 |
+
"loss": 1.916438446044922,
|
| 48 |
+
"mean_token_accuracy": 0.8717759534716606,
|
| 49 |
+
"num_tokens": 474532.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.491110111027956,
|
| 54 |
+
"epoch": 0.6693440428380187,
|
| 55 |
+
"grad_norm": 1.866817593574524,
|
| 56 |
+
"learning_rate": 8.275594476422607e-05,
|
| 57 |
+
"loss": 1.9421713256835937,
|
| 58 |
+
"mean_token_accuracy": 0.8710730043053627,
|
| 59 |
+
"num_tokens": 589198.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.47134352535009383,
|
| 64 |
+
"epoch": 0.8032128514056225,
|
| 65 |
+
"grad_norm": 117.62409210205078,
|
| 66 |
+
"learning_rate": 9.937360435543611e-05,
|
| 67 |
+
"loss": 1.9768324279785157,
|
| 68 |
+
"mean_token_accuracy": 0.8741078078746796,
|
| 69 |
+
"num_tokens": 707057.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.4820582258701325,
|
| 74 |
+
"epoch": 0.9370816599732262,
|
| 75 |
+
"grad_norm": 2.3274827003479004,
|
| 76 |
+
"learning_rate": 0.00011599126394664616,
|
| 77 |
+
"loss": 2.2025875854492187,
|
| 78 |
+
"mean_token_accuracy": 0.8697148504853248,
|
| 79 |
+
"num_tokens": 822888.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5010400542616844,
|
| 85 |
+
"eval_loss": 0.5114277601242065,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8587275749444961,
|
| 87 |
+
"eval_num_tokens": 872247.0,
|
| 88 |
+
"eval_runtime": 96.5515,
|
| 89 |
+
"eval_samples_per_second": 16.561,
|
| 90 |
+
"eval_steps_per_second": 2.071,
|
| 91 |
+
"step": 374
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.4708875769918615,
|
| 95 |
+
"epoch": 1.069611780455154,
|
| 96 |
+
"grad_norm": 3.3712940216064453,
|
| 97 |
+
"learning_rate": 0.00012428317596508976,
|
| 98 |
+
"loss": 1.83294189453125,
|
| 99 |
+
"mean_token_accuracy": 0.8772370366737096,
|
| 100 |
+
"num_tokens": 929365.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.44804590195417404,
|
| 105 |
+
"epoch": 1.2034805890227578,
|
| 106 |
+
"grad_norm": 1.4833389520645142,
|
| 107 |
+
"learning_rate": 0.00012414788900475706,
|
| 108 |
+
"loss": 1.7768891906738282,
|
| 109 |
+
"mean_token_accuracy": 0.8791097947955131,
|
| 110 |
+
"num_tokens": 1046629.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.4510513086616993,
|
| 115 |
+
"epoch": 1.3373493975903614,
|
| 116 |
+
"grad_norm": 2.814790964126587,
|
| 117 |
+
"learning_rate": 0.00012387760965418496,
|
| 118 |
+
"loss": 1.7745071411132813,
|
| 119 |
+
"mean_token_accuracy": 0.8813075706362724,
|
| 120 |
+
"num_tokens": 1165744.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.4479117552936077,
|
| 125 |
+
"epoch": 1.4712182061579653,
|
| 126 |
+
"grad_norm": 1.855610728263855,
|
| 127 |
+
"learning_rate": 0.00012347292641217135,
|
| 128 |
+
"loss": 1.7583291625976563,
|
| 129 |
+
"mean_token_accuracy": 0.8815277495980263,
|
| 130 |
+
"num_tokens": 1284843.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.4380264139175415,
|
| 135 |
+
"epoch": 1.605087014725569,
|
| 136 |
+
"grad_norm": 1.383190631866455,
|
| 137 |
+
"learning_rate": 0.00012293472042483757,
|
| 138 |
+
"loss": 1.7229583740234375,
|
| 139 |
+
"mean_token_accuracy": 0.8832098203897476,
|
| 140 |
+
"num_tokens": 1406485.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.4342571949958801,
|
| 145 |
+
"epoch": 1.7389558232931726,
|
| 146 |
+
"grad_norm": 1.4977834224700928,
|
| 147 |
+
"learning_rate": 0.00012226416356704526,
|
| 148 |
+
"loss": 1.7174737548828125,
|
| 149 |
+
"mean_token_accuracy": 0.8834967383742333,
|
| 150 |
+
"num_tokens": 1525460.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.42700962007045745,
|
| 155 |
+
"epoch": 1.8728246318607764,
|
| 156 |
+
"grad_norm": 1.6156537532806396,
|
| 157 |
+
"learning_rate": 0.00012146271589078838,
|
| 158 |
+
"loss": 1.682061767578125,
|
| 159 |
+
"mean_token_accuracy": 0.8858474844694137,
|
| 160 |
+
"num_tokens": 1638984.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"epoch": 2.0,
|
| 165 |
+
"eval_entropy": 0.4838937771320343,
|
| 166 |
+
"eval_loss": 0.4826815128326416,
|
| 167 |
+
"eval_mean_token_accuracy": 0.8682844692468643,
|
| 168 |
+
"eval_num_tokens": 1744494.0,
|
| 169 |
+
"eval_runtime": 96.5071,
|
| 170 |
+
"eval_samples_per_second": 16.569,
|
| 171 |
+
"eval_steps_per_second": 2.072,
|
| 172 |
+
"step": 748
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"entropy": 0.4378527848407476,
|
| 176 |
+
"epoch": 2.005354752342704,
|
| 177 |
+
"grad_norm": 1.400229573249817,
|
| 178 |
+
"learning_rate": 0.0001205321224461161,
|
| 179 |
+
"loss": 1.7096096801757812,
|
| 180 |
+
"mean_token_accuracy": 0.8838462468349573,
|
| 181 |
+
"num_tokens": 1749755.0,
|
| 182 |
+
"step": 750
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.3559799794852734,
|
| 186 |
+
"epoch": 2.139223560910308,
|
| 187 |
+
"grad_norm": 1.7168083190917969,
|
| 188 |
+
"learning_rate": 0.0001194744094815093,
|
| 189 |
+
"loss": 1.3893603515625,
|
| 190 |
+
"mean_token_accuracy": 0.9004731178283691,
|
| 191 |
+
"num_tokens": 1868231.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.3671448823064566,
|
| 196 |
+
"epoch": 2.2730923694779115,
|
| 197 |
+
"grad_norm": 1.9720135927200317,
|
| 198 |
+
"learning_rate": 0.00011829188003198282,
|
| 199 |
+
"loss": 1.429988555908203,
|
| 200 |
+
"mean_token_accuracy": 0.8970818132162094,
|
| 201 |
+
"num_tokens": 1979116.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.3597494306415319,
|
| 206 |
+
"epoch": 2.4069611780455156,
|
| 207 |
+
"grad_norm": 1.4947372674942017,
|
| 208 |
+
"learning_rate": 0.00011698710890452068,
|
| 209 |
+
"loss": 1.418173828125,
|
| 210 |
+
"mean_token_accuracy": 0.8994651186466217,
|
| 211 |
+
"num_tokens": 2094539.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.36254502907395364,
|
| 216 |
+
"epoch": 2.540829986613119,
|
| 217 |
+
"grad_norm": 1.6768454313278198,
|
| 218 |
+
"learning_rate": 0.00011556293707176242,
|
| 219 |
+
"loss": 1.4158590698242188,
|
| 220 |
+
"mean_token_accuracy": 0.8995477721095085,
|
| 221 |
+
"num_tokens": 2209415.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.36290778368711474,
|
| 226 |
+
"epoch": 2.674698795180723,
|
| 227 |
+
"grad_norm": 1.6033697128295898,
|
| 228 |
+
"learning_rate": 0.00011402246548614765,
|
| 229 |
+
"loss": 1.4300469970703125,
|
| 230 |
+
"mean_token_accuracy": 0.8986452376842499,
|
| 231 |
+
"num_tokens": 2324269.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.3635872249305248,
|
| 236 |
+
"epoch": 2.8085676037483265,
|
| 237 |
+
"grad_norm": 1.546893835067749,
|
| 238 |
+
"learning_rate": 0.00011236904832798785,
|
| 239 |
+
"loss": 1.42587646484375,
|
| 240 |
+
"mean_token_accuracy": 0.9003903394937516,
|
| 241 |
+
"num_tokens": 2447336.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.36871150620281695,
|
| 246 |
+
"epoch": 2.9424364123159306,
|
| 247 |
+
"grad_norm": 1.2951405048370361,
|
| 248 |
+
"learning_rate": 0.0001106062857021667,
|
| 249 |
+
"loss": 1.448046875,
|
| 250 |
+
"mean_token_accuracy": 0.8967258337140084,
|
| 251 |
+
"num_tokens": 2565837.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"epoch": 3.0,
|
| 256 |
+
"eval_entropy": 0.4225208269059658,
|
| 257 |
+
"eval_loss": 0.489418089389801,
|
| 258 |
+
"eval_mean_token_accuracy": 0.8697815361618996,
|
| 259 |
+
"eval_num_tokens": 2616741.0,
|
| 260 |
+
"eval_runtime": 96.4058,
|
| 261 |
+
"eval_samples_per_second": 16.586,
|
| 262 |
+
"eval_steps_per_second": 2.075,
|
| 263 |
+
"step": 1122
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"entropy": 0.3120347365285411,
|
| 267 |
+
"epoch": 3.074966532797858,
|
| 268 |
+
"grad_norm": 1.639520287513733,
|
| 269 |
+
"learning_rate": 0.00010873801579937106,
|
| 270 |
+
"loss": 1.1941973876953125,
|
| 271 |
+
"mean_token_accuracy": 0.9117801315856703,
|
| 272 |
+
"num_tokens": 2685975.0,
|
| 273 |
+
"step": 1150
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.28257040068507194,
|
| 277 |
+
"epoch": 3.208835341365462,
|
| 278 |
+
"grad_norm": 1.7459681034088135,
|
| 279 |
+
"learning_rate": 0.00010676830653892058,
|
| 280 |
+
"loss": 1.0850601196289062,
|
| 281 |
+
"mean_token_accuracy": 0.9177472350001336,
|
| 282 |
+
"num_tokens": 2798277.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.27802520349621773,
|
| 287 |
+
"epoch": 3.3427041499330654,
|
| 288 |
+
"grad_norm": 1.5176103115081787,
|
| 289 |
+
"learning_rate": 0.00010470144671139238,
|
| 290 |
+
"loss": 1.0840838623046876,
|
| 291 |
+
"mean_token_accuracy": 0.9179763168096542,
|
| 292 |
+
"num_tokens": 2918973.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.280417420566082,
|
| 297 |
+
"epoch": 3.4765729585006695,
|
| 298 |
+
"grad_norm": 1.3774974346160889,
|
| 299 |
+
"learning_rate": 0.00010254193664032686,
|
| 300 |
+
"loss": 1.0911756896972655,
|
| 301 |
+
"mean_token_accuracy": 0.9162956389784813,
|
| 302 |
+
"num_tokens": 3039073.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.2834589210152626,
|
| 307 |
+
"epoch": 3.610441767068273,
|
| 308 |
+
"grad_norm": 1.5929396152496338,
|
| 309 |
+
"learning_rate": 0.00010029447838334742,
|
| 310 |
+
"loss": 1.0985262298583984,
|
| 311 |
+
"mean_token_accuracy": 0.9174074530601501,
|
| 312 |
+
"num_tokens": 3153710.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.282296127229929,
|
| 317 |
+
"epoch": 3.7443105756358768,
|
| 318 |
+
"grad_norm": 1.50350022315979,
|
| 319 |
+
"learning_rate": 9.796396549403e-05,
|
| 320 |
+
"loss": 1.101386260986328,
|
| 321 |
+
"mean_token_accuracy": 0.9168545073270797,
|
| 322 |
+
"num_tokens": 3263594.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.279728781580925,
|
| 327 |
+
"epoch": 3.878179384203481,
|
| 328 |
+
"grad_norm": 1.4728187322616577,
|
| 329 |
+
"learning_rate": 9.555547236681456e-05,
|
| 330 |
+
"loss": 1.0859880065917968,
|
| 331 |
+
"mean_token_accuracy": 0.9178367125988006,
|
| 332 |
+
"num_tokens": 3386033.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"epoch": 4.0,
|
| 337 |
+
"eval_entropy": 0.34304031178355215,
|
| 338 |
+
"eval_loss": 0.5295785665512085,
|
| 339 |
+
"eval_mean_token_accuracy": 0.8698753178119659,
|
| 340 |
+
"eval_num_tokens": 3488988.0,
|
| 341 |
+
"eval_runtime": 96.3616,
|
| 342 |
+
"eval_samples_per_second": 16.594,
|
| 343 |
+
"eval_steps_per_second": 2.076,
|
| 344 |
+
"step": 1496
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"entropy": 0.27893446536377225,
|
| 348 |
+
"epoch": 4.010709504685408,
|
| 349 |
+
"grad_norm": 1.545491337776184,
|
| 350 |
+
"learning_rate": 9.30742431881587e-05,
|
| 351 |
+
"loss": 1.0577442169189453,
|
| 352 |
+
"mean_token_accuracy": 0.9191552999645772,
|
| 353 |
+
"num_tokens": 3498406.0,
|
| 354 |
+
"step": 1500
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"entropy": 0.19769302535802125,
|
| 358 |
+
"epoch": 4.144578313253012,
|
| 359 |
+
"grad_norm": 2.10296893119812,
|
| 360 |
+
"learning_rate": 9.052568051799083e-05,
|
| 361 |
+
"loss": 0.7461458587646485,
|
| 362 |
+
"mean_token_accuracy": 0.9415343621373177,
|
| 363 |
+
"num_tokens": 3614301.0,
|
| 364 |
+
"step": 1550
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"entropy": 0.1981763695180416,
|
| 368 |
+
"epoch": 4.278447121820616,
|
| 369 |
+
"grad_norm": 2.067410945892334,
|
| 370 |
+
"learning_rate": 8.791533352632524e-05,
|
| 371 |
+
"loss": 0.7580889892578125,
|
| 372 |
+
"mean_token_accuracy": 0.9396374526619912,
|
| 373 |
+
"num_tokens": 3735705.0,
|
| 374 |
+
"step": 1600
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"entropy": 0.19850988369435071,
|
| 378 |
+
"epoch": 4.412315930388219,
|
| 379 |
+
"grad_norm": 1.9034850597381592,
|
| 380 |
+
"learning_rate": 8.524888591065258e-05,
|
| 381 |
+
"loss": 0.7526986694335938,
|
| 382 |
+
"mean_token_accuracy": 0.9402479353547096,
|
| 383 |
+
"num_tokens": 3854287.0,
|
| 384 |
+
"step": 1650
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"entropy": 0.19905407220125199,
|
| 388 |
+
"epoch": 4.546184738955823,
|
| 389 |
+
"grad_norm": 2.1477949619293213,
|
| 390 |
+
"learning_rate": 8.253214352041379e-05,
|
| 391 |
+
"loss": 0.7603612518310547,
|
| 392 |
+
"mean_token_accuracy": 0.9396576225757599,
|
| 393 |
+
"num_tokens": 3967362.0,
|
| 394 |
+
"step": 1700
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"entropy": 0.20251497332006693,
|
| 398 |
+
"epoch": 4.680053547523427,
|
| 399 |
+
"grad_norm": 1.5489246845245361,
|
| 400 |
+
"learning_rate": 7.97710217155036e-05,
|
| 401 |
+
"loss": 0.7711930084228515,
|
| 402 |
+
"mean_token_accuracy": 0.9400961664319039,
|
| 403 |
+
"num_tokens": 4081441.0,
|
| 404 |
+
"step": 1750
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"entropy": 0.1991352306306362,
|
| 408 |
+
"epoch": 4.813922356091031,
|
| 409 |
+
"grad_norm": 1.969994068145752,
|
| 410 |
+
"learning_rate": 7.697153248632946e-05,
|
| 411 |
+
"loss": 0.7681967163085938,
|
| 412 |
+
"mean_token_accuracy": 0.9399621617794037,
|
| 413 |
+
"num_tokens": 4197604.0,
|
| 414 |
+
"step": 1800
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"entropy": 0.20229352474212647,
|
| 418 |
+
"epoch": 4.947791164658635,
|
| 419 |
+
"grad_norm": 2.2329719066619873,
|
| 420 |
+
"learning_rate": 7.41397713634694e-05,
|
| 421 |
+
"loss": 0.7733911895751953,
|
| 422 |
+
"mean_token_accuracy": 0.9396535342931748,
|
| 423 |
+
"num_tokens": 4318894.0,
|
| 424 |
+
"step": 1850
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"epoch": 5.0,
|
| 428 |
+
"eval_entropy": 0.270584502145648,
|
| 429 |
+
"eval_loss": 0.6255385875701904,
|
| 430 |
+
"eval_mean_token_accuracy": 0.8687835082411766,
|
| 431 |
+
"eval_num_tokens": 4361235.0,
|
| 432 |
+
"eval_runtime": 96.6331,
|
| 433 |
+
"eval_samples_per_second": 16.547,
|
| 434 |
+
"eval_steps_per_second": 2.07,
|
| 435 |
+
"step": 1870
|
| 436 |
+
}
|
| 437 |
+
],
|
| 438 |
+
"logging_steps": 50,
|
| 439 |
+
"max_steps": 3740,
|
| 440 |
+
"num_input_tokens_seen": 0,
|
| 441 |
+
"num_train_epochs": 10,
|
| 442 |
+
"save_steps": 500,
|
| 443 |
+
"stateful_callbacks": {
|
| 444 |
+
"TrainerControl": {
|
| 445 |
+
"args": {
|
| 446 |
+
"should_epoch_stop": false,
|
| 447 |
+
"should_evaluate": false,
|
| 448 |
+
"should_log": false,
|
| 449 |
+
"should_save": true,
|
| 450 |
+
"should_training_stop": false
|
| 451 |
+
},
|
| 452 |
+
"attributes": {}
|
| 453 |
+
}
|
| 454 |
+
},
|
| 455 |
+
"total_flos": 1.4947622783933181e+18,
|
| 456 |
+
"train_batch_size": 4,
|
| 457 |
+
"trial_name": null,
|
| 458 |
+
"trial_params": null
|
| 459 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.015034304668777832,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 64,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json
ADDED
|
@@ -0,0 +1,540 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 6.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2244,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.3355020767450332,
|
| 14 |
+
"epoch": 0.13386880856760375,
|
| 15 |
+
"grad_norm": 3.2956597805023193,
|
| 16 |
+
"learning_rate": 1.628530639938585e-05,
|
| 17 |
+
"loss": 5.349910278320312,
|
| 18 |
+
"mean_token_accuracy": 0.7383818039298058,
|
| 19 |
+
"num_tokens": 116199.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.5958842460811138,
|
| 24 |
+
"epoch": 0.2677376171352075,
|
| 25 |
+
"grad_norm": 2.5947492122650146,
|
| 26 |
+
"learning_rate": 3.290296599059591e-05,
|
| 27 |
+
"loss": 2.312855072021484,
|
| 28 |
+
"mean_token_accuracy": 0.8520967712998391,
|
| 29 |
+
"num_tokens": 232864.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5190362003445625,
|
| 34 |
+
"epoch": 0.40160642570281124,
|
| 35 |
+
"grad_norm": 1.5038394927978516,
|
| 36 |
+
"learning_rate": 4.9520625581805955e-05,
|
| 37 |
+
"loss": 2.0574468994140624,
|
| 38 |
+
"mean_token_accuracy": 0.8657039344310761,
|
| 39 |
+
"num_tokens": 352382.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.4922871346771717,
|
| 44 |
+
"epoch": 0.535475234270415,
|
| 45 |
+
"grad_norm": 1.645923137664795,
|
| 46 |
+
"learning_rate": 6.613828517301602e-05,
|
| 47 |
+
"loss": 1.916438446044922,
|
| 48 |
+
"mean_token_accuracy": 0.8717759534716606,
|
| 49 |
+
"num_tokens": 474532.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.491110111027956,
|
| 54 |
+
"epoch": 0.6693440428380187,
|
| 55 |
+
"grad_norm": 1.866817593574524,
|
| 56 |
+
"learning_rate": 8.275594476422607e-05,
|
| 57 |
+
"loss": 1.9421713256835937,
|
| 58 |
+
"mean_token_accuracy": 0.8710730043053627,
|
| 59 |
+
"num_tokens": 589198.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.47134352535009383,
|
| 64 |
+
"epoch": 0.8032128514056225,
|
| 65 |
+
"grad_norm": 117.62409210205078,
|
| 66 |
+
"learning_rate": 9.937360435543611e-05,
|
| 67 |
+
"loss": 1.9768324279785157,
|
| 68 |
+
"mean_token_accuracy": 0.8741078078746796,
|
| 69 |
+
"num_tokens": 707057.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.4820582258701325,
|
| 74 |
+
"epoch": 0.9370816599732262,
|
| 75 |
+
"grad_norm": 2.3274827003479004,
|
| 76 |
+
"learning_rate": 0.00011599126394664616,
|
| 77 |
+
"loss": 2.2025875854492187,
|
| 78 |
+
"mean_token_accuracy": 0.8697148504853248,
|
| 79 |
+
"num_tokens": 822888.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5010400542616844,
|
| 85 |
+
"eval_loss": 0.5114277601242065,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8587275749444961,
|
| 87 |
+
"eval_num_tokens": 872247.0,
|
| 88 |
+
"eval_runtime": 96.5515,
|
| 89 |
+
"eval_samples_per_second": 16.561,
|
| 90 |
+
"eval_steps_per_second": 2.071,
|
| 91 |
+
"step": 374
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.4708875769918615,
|
| 95 |
+
"epoch": 1.069611780455154,
|
| 96 |
+
"grad_norm": 3.3712940216064453,
|
| 97 |
+
"learning_rate": 0.00012428317596508976,
|
| 98 |
+
"loss": 1.83294189453125,
|
| 99 |
+
"mean_token_accuracy": 0.8772370366737096,
|
| 100 |
+
"num_tokens": 929365.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.44804590195417404,
|
| 105 |
+
"epoch": 1.2034805890227578,
|
| 106 |
+
"grad_norm": 1.4833389520645142,
|
| 107 |
+
"learning_rate": 0.00012414788900475706,
|
| 108 |
+
"loss": 1.7768891906738282,
|
| 109 |
+
"mean_token_accuracy": 0.8791097947955131,
|
| 110 |
+
"num_tokens": 1046629.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.4510513086616993,
|
| 115 |
+
"epoch": 1.3373493975903614,
|
| 116 |
+
"grad_norm": 2.814790964126587,
|
| 117 |
+
"learning_rate": 0.00012387760965418496,
|
| 118 |
+
"loss": 1.7745071411132813,
|
| 119 |
+
"mean_token_accuracy": 0.8813075706362724,
|
| 120 |
+
"num_tokens": 1165744.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.4479117552936077,
|
| 125 |
+
"epoch": 1.4712182061579653,
|
| 126 |
+
"grad_norm": 1.855610728263855,
|
| 127 |
+
"learning_rate": 0.00012347292641217135,
|
| 128 |
+
"loss": 1.7583291625976563,
|
| 129 |
+
"mean_token_accuracy": 0.8815277495980263,
|
| 130 |
+
"num_tokens": 1284843.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.4380264139175415,
|
| 135 |
+
"epoch": 1.605087014725569,
|
| 136 |
+
"grad_norm": 1.383190631866455,
|
| 137 |
+
"learning_rate": 0.00012293472042483757,
|
| 138 |
+
"loss": 1.7229583740234375,
|
| 139 |
+
"mean_token_accuracy": 0.8832098203897476,
|
| 140 |
+
"num_tokens": 1406485.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.4342571949958801,
|
| 145 |
+
"epoch": 1.7389558232931726,
|
| 146 |
+
"grad_norm": 1.4977834224700928,
|
| 147 |
+
"learning_rate": 0.00012226416356704526,
|
| 148 |
+
"loss": 1.7174737548828125,
|
| 149 |
+
"mean_token_accuracy": 0.8834967383742333,
|
| 150 |
+
"num_tokens": 1525460.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.42700962007045745,
|
| 155 |
+
"epoch": 1.8728246318607764,
|
| 156 |
+
"grad_norm": 1.6156537532806396,
|
| 157 |
+
"learning_rate": 0.00012146271589078838,
|
| 158 |
+
"loss": 1.682061767578125,
|
| 159 |
+
"mean_token_accuracy": 0.8858474844694137,
|
| 160 |
+
"num_tokens": 1638984.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"epoch": 2.0,
|
| 165 |
+
"eval_entropy": 0.4838937771320343,
|
| 166 |
+
"eval_loss": 0.4826815128326416,
|
| 167 |
+
"eval_mean_token_accuracy": 0.8682844692468643,
|
| 168 |
+
"eval_num_tokens": 1744494.0,
|
| 169 |
+
"eval_runtime": 96.5071,
|
| 170 |
+
"eval_samples_per_second": 16.569,
|
| 171 |
+
"eval_steps_per_second": 2.072,
|
| 172 |
+
"step": 748
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"entropy": 0.4378527848407476,
|
| 176 |
+
"epoch": 2.005354752342704,
|
| 177 |
+
"grad_norm": 1.400229573249817,
|
| 178 |
+
"learning_rate": 0.0001205321224461161,
|
| 179 |
+
"loss": 1.7096096801757812,
|
| 180 |
+
"mean_token_accuracy": 0.8838462468349573,
|
| 181 |
+
"num_tokens": 1749755.0,
|
| 182 |
+
"step": 750
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.3559799794852734,
|
| 186 |
+
"epoch": 2.139223560910308,
|
| 187 |
+
"grad_norm": 1.7168083190917969,
|
| 188 |
+
"learning_rate": 0.0001194744094815093,
|
| 189 |
+
"loss": 1.3893603515625,
|
| 190 |
+
"mean_token_accuracy": 0.9004731178283691,
|
| 191 |
+
"num_tokens": 1868231.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.3671448823064566,
|
| 196 |
+
"epoch": 2.2730923694779115,
|
| 197 |
+
"grad_norm": 1.9720135927200317,
|
| 198 |
+
"learning_rate": 0.00011829188003198282,
|
| 199 |
+
"loss": 1.429988555908203,
|
| 200 |
+
"mean_token_accuracy": 0.8970818132162094,
|
| 201 |
+
"num_tokens": 1979116.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.3597494306415319,
|
| 206 |
+
"epoch": 2.4069611780455156,
|
| 207 |
+
"grad_norm": 1.4947372674942017,
|
| 208 |
+
"learning_rate": 0.00011698710890452068,
|
| 209 |
+
"loss": 1.418173828125,
|
| 210 |
+
"mean_token_accuracy": 0.8994651186466217,
|
| 211 |
+
"num_tokens": 2094539.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.36254502907395364,
|
| 216 |
+
"epoch": 2.540829986613119,
|
| 217 |
+
"grad_norm": 1.6768454313278198,
|
| 218 |
+
"learning_rate": 0.00011556293707176242,
|
| 219 |
+
"loss": 1.4158590698242188,
|
| 220 |
+
"mean_token_accuracy": 0.8995477721095085,
|
| 221 |
+
"num_tokens": 2209415.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.36290778368711474,
|
| 226 |
+
"epoch": 2.674698795180723,
|
| 227 |
+
"grad_norm": 1.6033697128295898,
|
| 228 |
+
"learning_rate": 0.00011402246548614765,
|
| 229 |
+
"loss": 1.4300469970703125,
|
| 230 |
+
"mean_token_accuracy": 0.8986452376842499,
|
| 231 |
+
"num_tokens": 2324269.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.3635872249305248,
|
| 236 |
+
"epoch": 2.8085676037483265,
|
| 237 |
+
"grad_norm": 1.546893835067749,
|
| 238 |
+
"learning_rate": 0.00011236904832798785,
|
| 239 |
+
"loss": 1.42587646484375,
|
| 240 |
+
"mean_token_accuracy": 0.9003903394937516,
|
| 241 |
+
"num_tokens": 2447336.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.36871150620281695,
|
| 246 |
+
"epoch": 2.9424364123159306,
|
| 247 |
+
"grad_norm": 1.2951405048370361,
|
| 248 |
+
"learning_rate": 0.0001106062857021667,
|
| 249 |
+
"loss": 1.448046875,
|
| 250 |
+
"mean_token_accuracy": 0.8967258337140084,
|
| 251 |
+
"num_tokens": 2565837.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"epoch": 3.0,
|
| 256 |
+
"eval_entropy": 0.4225208269059658,
|
| 257 |
+
"eval_loss": 0.489418089389801,
|
| 258 |
+
"eval_mean_token_accuracy": 0.8697815361618996,
|
| 259 |
+
"eval_num_tokens": 2616741.0,
|
| 260 |
+
"eval_runtime": 96.4058,
|
| 261 |
+
"eval_samples_per_second": 16.586,
|
| 262 |
+
"eval_steps_per_second": 2.075,
|
| 263 |
+
"step": 1122
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"entropy": 0.3120347365285411,
|
| 267 |
+
"epoch": 3.074966532797858,
|
| 268 |
+
"grad_norm": 1.639520287513733,
|
| 269 |
+
"learning_rate": 0.00010873801579937106,
|
| 270 |
+
"loss": 1.1941973876953125,
|
| 271 |
+
"mean_token_accuracy": 0.9117801315856703,
|
| 272 |
+
"num_tokens": 2685975.0,
|
| 273 |
+
"step": 1150
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.28257040068507194,
|
| 277 |
+
"epoch": 3.208835341365462,
|
| 278 |
+
"grad_norm": 1.7459681034088135,
|
| 279 |
+
"learning_rate": 0.00010676830653892058,
|
| 280 |
+
"loss": 1.0850601196289062,
|
| 281 |
+
"mean_token_accuracy": 0.9177472350001336,
|
| 282 |
+
"num_tokens": 2798277.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.27802520349621773,
|
| 287 |
+
"epoch": 3.3427041499330654,
|
| 288 |
+
"grad_norm": 1.5176103115081787,
|
| 289 |
+
"learning_rate": 0.00010470144671139238,
|
| 290 |
+
"loss": 1.0840838623046876,
|
| 291 |
+
"mean_token_accuracy": 0.9179763168096542,
|
| 292 |
+
"num_tokens": 2918973.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.280417420566082,
|
| 297 |
+
"epoch": 3.4765729585006695,
|
| 298 |
+
"grad_norm": 1.3774974346160889,
|
| 299 |
+
"learning_rate": 0.00010254193664032686,
|
| 300 |
+
"loss": 1.0911756896972655,
|
| 301 |
+
"mean_token_accuracy": 0.9162956389784813,
|
| 302 |
+
"num_tokens": 3039073.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.2834589210152626,
|
| 307 |
+
"epoch": 3.610441767068273,
|
| 308 |
+
"grad_norm": 1.5929396152496338,
|
| 309 |
+
"learning_rate": 0.00010029447838334742,
|
| 310 |
+
"loss": 1.0985262298583984,
|
| 311 |
+
"mean_token_accuracy": 0.9174074530601501,
|
| 312 |
+
"num_tokens": 3153710.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.282296127229929,
|
| 317 |
+
"epoch": 3.7443105756358768,
|
| 318 |
+
"grad_norm": 1.50350022315979,
|
| 319 |
+
"learning_rate": 9.796396549403e-05,
|
| 320 |
+
"loss": 1.101386260986328,
|
| 321 |
+
"mean_token_accuracy": 0.9168545073270797,
|
| 322 |
+
"num_tokens": 3263594.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.279728781580925,
|
| 327 |
+
"epoch": 3.878179384203481,
|
| 328 |
+
"grad_norm": 1.4728187322616577,
|
| 329 |
+
"learning_rate": 9.555547236681456e-05,
|
| 330 |
+
"loss": 1.0859880065917968,
|
| 331 |
+
"mean_token_accuracy": 0.9178367125988006,
|
| 332 |
+
"num_tokens": 3386033.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"epoch": 4.0,
|
| 337 |
+
"eval_entropy": 0.34304031178355215,
|
| 338 |
+
"eval_loss": 0.5295785665512085,
|
| 339 |
+
"eval_mean_token_accuracy": 0.8698753178119659,
|
| 340 |
+
"eval_num_tokens": 3488988.0,
|
| 341 |
+
"eval_runtime": 96.3616,
|
| 342 |
+
"eval_samples_per_second": 16.594,
|
| 343 |
+
"eval_steps_per_second": 2.076,
|
| 344 |
+
"step": 1496
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"entropy": 0.27893446536377225,
|
| 348 |
+
"epoch": 4.010709504685408,
|
| 349 |
+
"grad_norm": 1.545491337776184,
|
| 350 |
+
"learning_rate": 9.30742431881587e-05,
|
| 351 |
+
"loss": 1.0577442169189453,
|
| 352 |
+
"mean_token_accuracy": 0.9191552999645772,
|
| 353 |
+
"num_tokens": 3498406.0,
|
| 354 |
+
"step": 1500
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"entropy": 0.19769302535802125,
|
| 358 |
+
"epoch": 4.144578313253012,
|
| 359 |
+
"grad_norm": 2.10296893119812,
|
| 360 |
+
"learning_rate": 9.052568051799083e-05,
|
| 361 |
+
"loss": 0.7461458587646485,
|
| 362 |
+
"mean_token_accuracy": 0.9415343621373177,
|
| 363 |
+
"num_tokens": 3614301.0,
|
| 364 |
+
"step": 1550
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"entropy": 0.1981763695180416,
|
| 368 |
+
"epoch": 4.278447121820616,
|
| 369 |
+
"grad_norm": 2.067410945892334,
|
| 370 |
+
"learning_rate": 8.791533352632524e-05,
|
| 371 |
+
"loss": 0.7580889892578125,
|
| 372 |
+
"mean_token_accuracy": 0.9396374526619912,
|
| 373 |
+
"num_tokens": 3735705.0,
|
| 374 |
+
"step": 1600
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"entropy": 0.19850988369435071,
|
| 378 |
+
"epoch": 4.412315930388219,
|
| 379 |
+
"grad_norm": 1.9034850597381592,
|
| 380 |
+
"learning_rate": 8.524888591065258e-05,
|
| 381 |
+
"loss": 0.7526986694335938,
|
| 382 |
+
"mean_token_accuracy": 0.9402479353547096,
|
| 383 |
+
"num_tokens": 3854287.0,
|
| 384 |
+
"step": 1650
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"entropy": 0.19905407220125199,
|
| 388 |
+
"epoch": 4.546184738955823,
|
| 389 |
+
"grad_norm": 2.1477949619293213,
|
| 390 |
+
"learning_rate": 8.253214352041379e-05,
|
| 391 |
+
"loss": 0.7603612518310547,
|
| 392 |
+
"mean_token_accuracy": 0.9396576225757599,
|
| 393 |
+
"num_tokens": 3967362.0,
|
| 394 |
+
"step": 1700
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"entropy": 0.20251497332006693,
|
| 398 |
+
"epoch": 4.680053547523427,
|
| 399 |
+
"grad_norm": 1.5489246845245361,
|
| 400 |
+
"learning_rate": 7.97710217155036e-05,
|
| 401 |
+
"loss": 0.7711930084228515,
|
| 402 |
+
"mean_token_accuracy": 0.9400961664319039,
|
| 403 |
+
"num_tokens": 4081441.0,
|
| 404 |
+
"step": 1750
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"entropy": 0.1991352306306362,
|
| 408 |
+
"epoch": 4.813922356091031,
|
| 409 |
+
"grad_norm": 1.969994068145752,
|
| 410 |
+
"learning_rate": 7.697153248632946e-05,
|
| 411 |
+
"loss": 0.7681967163085938,
|
| 412 |
+
"mean_token_accuracy": 0.9399621617794037,
|
| 413 |
+
"num_tokens": 4197604.0,
|
| 414 |
+
"step": 1800
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"entropy": 0.20229352474212647,
|
| 418 |
+
"epoch": 4.947791164658635,
|
| 419 |
+
"grad_norm": 2.2329719066619873,
|
| 420 |
+
"learning_rate": 7.41397713634694e-05,
|
| 421 |
+
"loss": 0.7733911895751953,
|
| 422 |
+
"mean_token_accuracy": 0.9396535342931748,
|
| 423 |
+
"num_tokens": 4318894.0,
|
| 424 |
+
"step": 1850
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"epoch": 5.0,
|
| 428 |
+
"eval_entropy": 0.270584502145648,
|
| 429 |
+
"eval_loss": 0.6255385875701904,
|
| 430 |
+
"eval_mean_token_accuracy": 0.8687835082411766,
|
| 431 |
+
"eval_num_tokens": 4361235.0,
|
| 432 |
+
"eval_runtime": 96.6331,
|
| 433 |
+
"eval_samples_per_second": 16.547,
|
| 434 |
+
"eval_steps_per_second": 2.07,
|
| 435 |
+
"step": 1870
|
| 436 |
+
},
|
| 437 |
+
{
|
| 438 |
+
"entropy": 0.16372355209155517,
|
| 439 |
+
"epoch": 5.080321285140562,
|
| 440 |
+
"grad_norm": 8.029130935668945,
|
| 441 |
+
"learning_rate": 7.128190414543193e-05,
|
| 442 |
+
"loss": 0.6145073699951172,
|
| 443 |
+
"mean_token_accuracy": 0.9516371590922578,
|
| 444 |
+
"num_tokens": 4434412.0,
|
| 445 |
+
"step": 1900
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"entropy": 0.14057113960385323,
|
| 449 |
+
"epoch": 5.214190093708166,
|
| 450 |
+
"grad_norm": 2.23626446723938,
|
| 451 |
+
"learning_rate": 6.840415347341672e-05,
|
| 452 |
+
"loss": 0.5295140075683594,
|
| 453 |
+
"mean_token_accuracy": 0.9593333688378334,
|
| 454 |
+
"num_tokens": 4548703.0,
|
| 455 |
+
"step": 1950
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"entropy": 0.14139273861423135,
|
| 459 |
+
"epoch": 5.34805890227577,
|
| 460 |
+
"grad_norm": 2.0157318115234375,
|
| 461 |
+
"learning_rate": 6.551278528230729e-05,
|
| 462 |
+
"loss": 0.5296827697753906,
|
| 463 |
+
"mean_token_accuracy": 0.9590813705325126,
|
| 464 |
+
"num_tokens": 4665542.0,
|
| 465 |
+
"step": 2000
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"entropy": 0.14537794288247824,
|
| 469 |
+
"epoch": 5.481927710843373,
|
| 470 |
+
"grad_norm": 1.5371013879776,
|
| 471 |
+
"learning_rate": 6.261409515739736e-05,
|
| 472 |
+
"loss": 0.5478645706176758,
|
| 473 |
+
"mean_token_accuracy": 0.9577724316716194,
|
| 474 |
+
"num_tokens": 4778075.0,
|
| 475 |
+
"step": 2050
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"entropy": 0.14534839443862438,
|
| 479 |
+
"epoch": 5.615796519410977,
|
| 480 |
+
"grad_norm": 2.0134589672088623,
|
| 481 |
+
"learning_rate": 5.971439462655727e-05,
|
| 482 |
+
"loss": 0.5426230239868164,
|
| 483 |
+
"mean_token_accuracy": 0.9581041479110718,
|
| 484 |
+
"num_tokens": 4897453.0,
|
| 485 |
+
"step": 2100
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"entropy": 0.14614912170916797,
|
| 489 |
+
"epoch": 5.749665327978581,
|
| 490 |
+
"grad_norm": 1.286437749862671,
|
| 491 |
+
"learning_rate": 5.6819997417687274e-05,
|
| 492 |
+
"loss": 0.5487421798706055,
|
| 493 |
+
"mean_token_accuracy": 0.9563529288768768,
|
| 494 |
+
"num_tokens": 5012767.0,
|
| 495 |
+
"step": 2150
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"entropy": 0.13987606402486563,
|
| 499 |
+
"epoch": 5.883534136546185,
|
| 500 |
+
"grad_norm": 1.7586702108383179,
|
| 501 |
+
"learning_rate": 5.393720571138079e-05,
|
| 502 |
+
"loss": 0.5254617309570313,
|
| 503 |
+
"mean_token_accuracy": 0.9590577334165573,
|
| 504 |
+
"num_tokens": 5129878.0,
|
| 505 |
+
"step": 2200
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"epoch": 6.0,
|
| 509 |
+
"eval_entropy": 0.2240281231701374,
|
| 510 |
+
"eval_loss": 0.7485206723213196,
|
| 511 |
+
"eval_mean_token_accuracy": 0.8668996468186378,
|
| 512 |
+
"eval_num_tokens": 5233482.0,
|
| 513 |
+
"eval_runtime": 96.4089,
|
| 514 |
+
"eval_samples_per_second": 16.586,
|
| 515 |
+
"eval_steps_per_second": 2.074,
|
| 516 |
+
"step": 2244
|
| 517 |
+
}
|
| 518 |
+
],
|
| 519 |
+
"logging_steps": 50,
|
| 520 |
+
"max_steps": 3740,
|
| 521 |
+
"num_input_tokens_seen": 0,
|
| 522 |
+
"num_train_epochs": 10,
|
| 523 |
+
"save_steps": 500,
|
| 524 |
+
"stateful_callbacks": {
|
| 525 |
+
"TrainerControl": {
|
| 526 |
+
"args": {
|
| 527 |
+
"should_epoch_stop": false,
|
| 528 |
+
"should_evaluate": false,
|
| 529 |
+
"should_log": false,
|
| 530 |
+
"should_save": true,
|
| 531 |
+
"should_training_stop": false
|
| 532 |
+
},
|
| 533 |
+
"attributes": {}
|
| 534 |
+
}
|
| 535 |
+
},
|
| 536 |
+
"total_flos": 1.7914914724245857e+18,
|
| 537 |
+
"train_batch_size": 4,
|
| 538 |
+
"trial_name": null,
|
| 539 |
+
"trial_params": null
|
| 540 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.015034304668777832,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 64,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json
ADDED
|
@@ -0,0 +1,631 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 7.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2618,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"entropy": 1.3355020767450332,
|
| 14 |
+
"epoch": 0.13386880856760375,
|
| 15 |
+
"grad_norm": 3.2956597805023193,
|
| 16 |
+
"learning_rate": 1.628530639938585e-05,
|
| 17 |
+
"loss": 5.349910278320312,
|
| 18 |
+
"mean_token_accuracy": 0.7383818039298058,
|
| 19 |
+
"num_tokens": 116199.0,
|
| 20 |
+
"step": 50
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"entropy": 0.5958842460811138,
|
| 24 |
+
"epoch": 0.2677376171352075,
|
| 25 |
+
"grad_norm": 2.5947492122650146,
|
| 26 |
+
"learning_rate": 3.290296599059591e-05,
|
| 27 |
+
"loss": 2.312855072021484,
|
| 28 |
+
"mean_token_accuracy": 0.8520967712998391,
|
| 29 |
+
"num_tokens": 232864.0,
|
| 30 |
+
"step": 100
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"entropy": 0.5190362003445625,
|
| 34 |
+
"epoch": 0.40160642570281124,
|
| 35 |
+
"grad_norm": 1.5038394927978516,
|
| 36 |
+
"learning_rate": 4.9520625581805955e-05,
|
| 37 |
+
"loss": 2.0574468994140624,
|
| 38 |
+
"mean_token_accuracy": 0.8657039344310761,
|
| 39 |
+
"num_tokens": 352382.0,
|
| 40 |
+
"step": 150
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"entropy": 0.4922871346771717,
|
| 44 |
+
"epoch": 0.535475234270415,
|
| 45 |
+
"grad_norm": 1.645923137664795,
|
| 46 |
+
"learning_rate": 6.613828517301602e-05,
|
| 47 |
+
"loss": 1.916438446044922,
|
| 48 |
+
"mean_token_accuracy": 0.8717759534716606,
|
| 49 |
+
"num_tokens": 474532.0,
|
| 50 |
+
"step": 200
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"entropy": 0.491110111027956,
|
| 54 |
+
"epoch": 0.6693440428380187,
|
| 55 |
+
"grad_norm": 1.866817593574524,
|
| 56 |
+
"learning_rate": 8.275594476422607e-05,
|
| 57 |
+
"loss": 1.9421713256835937,
|
| 58 |
+
"mean_token_accuracy": 0.8710730043053627,
|
| 59 |
+
"num_tokens": 589198.0,
|
| 60 |
+
"step": 250
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"entropy": 0.47134352535009383,
|
| 64 |
+
"epoch": 0.8032128514056225,
|
| 65 |
+
"grad_norm": 117.62409210205078,
|
| 66 |
+
"learning_rate": 9.937360435543611e-05,
|
| 67 |
+
"loss": 1.9768324279785157,
|
| 68 |
+
"mean_token_accuracy": 0.8741078078746796,
|
| 69 |
+
"num_tokens": 707057.0,
|
| 70 |
+
"step": 300
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"entropy": 0.4820582258701325,
|
| 74 |
+
"epoch": 0.9370816599732262,
|
| 75 |
+
"grad_norm": 2.3274827003479004,
|
| 76 |
+
"learning_rate": 0.00011599126394664616,
|
| 77 |
+
"loss": 2.2025875854492187,
|
| 78 |
+
"mean_token_accuracy": 0.8697148504853248,
|
| 79 |
+
"num_tokens": 822888.0,
|
| 80 |
+
"step": 350
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 1.0,
|
| 84 |
+
"eval_entropy": 0.5010400542616844,
|
| 85 |
+
"eval_loss": 0.5114277601242065,
|
| 86 |
+
"eval_mean_token_accuracy": 0.8587275749444961,
|
| 87 |
+
"eval_num_tokens": 872247.0,
|
| 88 |
+
"eval_runtime": 96.5515,
|
| 89 |
+
"eval_samples_per_second": 16.561,
|
| 90 |
+
"eval_steps_per_second": 2.071,
|
| 91 |
+
"step": 374
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"entropy": 0.4708875769918615,
|
| 95 |
+
"epoch": 1.069611780455154,
|
| 96 |
+
"grad_norm": 3.3712940216064453,
|
| 97 |
+
"learning_rate": 0.00012428317596508976,
|
| 98 |
+
"loss": 1.83294189453125,
|
| 99 |
+
"mean_token_accuracy": 0.8772370366737096,
|
| 100 |
+
"num_tokens": 929365.0,
|
| 101 |
+
"step": 400
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"entropy": 0.44804590195417404,
|
| 105 |
+
"epoch": 1.2034805890227578,
|
| 106 |
+
"grad_norm": 1.4833389520645142,
|
| 107 |
+
"learning_rate": 0.00012414788900475706,
|
| 108 |
+
"loss": 1.7768891906738282,
|
| 109 |
+
"mean_token_accuracy": 0.8791097947955131,
|
| 110 |
+
"num_tokens": 1046629.0,
|
| 111 |
+
"step": 450
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"entropy": 0.4510513086616993,
|
| 115 |
+
"epoch": 1.3373493975903614,
|
| 116 |
+
"grad_norm": 2.814790964126587,
|
| 117 |
+
"learning_rate": 0.00012387760965418496,
|
| 118 |
+
"loss": 1.7745071411132813,
|
| 119 |
+
"mean_token_accuracy": 0.8813075706362724,
|
| 120 |
+
"num_tokens": 1165744.0,
|
| 121 |
+
"step": 500
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"entropy": 0.4479117552936077,
|
| 125 |
+
"epoch": 1.4712182061579653,
|
| 126 |
+
"grad_norm": 1.855610728263855,
|
| 127 |
+
"learning_rate": 0.00012347292641217135,
|
| 128 |
+
"loss": 1.7583291625976563,
|
| 129 |
+
"mean_token_accuracy": 0.8815277495980263,
|
| 130 |
+
"num_tokens": 1284843.0,
|
| 131 |
+
"step": 550
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"entropy": 0.4380264139175415,
|
| 135 |
+
"epoch": 1.605087014725569,
|
| 136 |
+
"grad_norm": 1.383190631866455,
|
| 137 |
+
"learning_rate": 0.00012293472042483757,
|
| 138 |
+
"loss": 1.7229583740234375,
|
| 139 |
+
"mean_token_accuracy": 0.8832098203897476,
|
| 140 |
+
"num_tokens": 1406485.0,
|
| 141 |
+
"step": 600
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"entropy": 0.4342571949958801,
|
| 145 |
+
"epoch": 1.7389558232931726,
|
| 146 |
+
"grad_norm": 1.4977834224700928,
|
| 147 |
+
"learning_rate": 0.00012226416356704526,
|
| 148 |
+
"loss": 1.7174737548828125,
|
| 149 |
+
"mean_token_accuracy": 0.8834967383742333,
|
| 150 |
+
"num_tokens": 1525460.0,
|
| 151 |
+
"step": 650
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"entropy": 0.42700962007045745,
|
| 155 |
+
"epoch": 1.8728246318607764,
|
| 156 |
+
"grad_norm": 1.6156537532806396,
|
| 157 |
+
"learning_rate": 0.00012146271589078838,
|
| 158 |
+
"loss": 1.682061767578125,
|
| 159 |
+
"mean_token_accuracy": 0.8858474844694137,
|
| 160 |
+
"num_tokens": 1638984.0,
|
| 161 |
+
"step": 700
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"epoch": 2.0,
|
| 165 |
+
"eval_entropy": 0.4838937771320343,
|
| 166 |
+
"eval_loss": 0.4826815128326416,
|
| 167 |
+
"eval_mean_token_accuracy": 0.8682844692468643,
|
| 168 |
+
"eval_num_tokens": 1744494.0,
|
| 169 |
+
"eval_runtime": 96.5071,
|
| 170 |
+
"eval_samples_per_second": 16.569,
|
| 171 |
+
"eval_steps_per_second": 2.072,
|
| 172 |
+
"step": 748
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"entropy": 0.4378527848407476,
|
| 176 |
+
"epoch": 2.005354752342704,
|
| 177 |
+
"grad_norm": 1.400229573249817,
|
| 178 |
+
"learning_rate": 0.0001205321224461161,
|
| 179 |
+
"loss": 1.7096096801757812,
|
| 180 |
+
"mean_token_accuracy": 0.8838462468349573,
|
| 181 |
+
"num_tokens": 1749755.0,
|
| 182 |
+
"step": 750
|
| 183 |
+
},
|
| 184 |
+
{
|
| 185 |
+
"entropy": 0.3559799794852734,
|
| 186 |
+
"epoch": 2.139223560910308,
|
| 187 |
+
"grad_norm": 1.7168083190917969,
|
| 188 |
+
"learning_rate": 0.0001194744094815093,
|
| 189 |
+
"loss": 1.3893603515625,
|
| 190 |
+
"mean_token_accuracy": 0.9004731178283691,
|
| 191 |
+
"num_tokens": 1868231.0,
|
| 192 |
+
"step": 800
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"entropy": 0.3671448823064566,
|
| 196 |
+
"epoch": 2.2730923694779115,
|
| 197 |
+
"grad_norm": 1.9720135927200317,
|
| 198 |
+
"learning_rate": 0.00011829188003198282,
|
| 199 |
+
"loss": 1.429988555908203,
|
| 200 |
+
"mean_token_accuracy": 0.8970818132162094,
|
| 201 |
+
"num_tokens": 1979116.0,
|
| 202 |
+
"step": 850
|
| 203 |
+
},
|
| 204 |
+
{
|
| 205 |
+
"entropy": 0.3597494306415319,
|
| 206 |
+
"epoch": 2.4069611780455156,
|
| 207 |
+
"grad_norm": 1.4947372674942017,
|
| 208 |
+
"learning_rate": 0.00011698710890452068,
|
| 209 |
+
"loss": 1.418173828125,
|
| 210 |
+
"mean_token_accuracy": 0.8994651186466217,
|
| 211 |
+
"num_tokens": 2094539.0,
|
| 212 |
+
"step": 900
|
| 213 |
+
},
|
| 214 |
+
{
|
| 215 |
+
"entropy": 0.36254502907395364,
|
| 216 |
+
"epoch": 2.540829986613119,
|
| 217 |
+
"grad_norm": 1.6768454313278198,
|
| 218 |
+
"learning_rate": 0.00011556293707176242,
|
| 219 |
+
"loss": 1.4158590698242188,
|
| 220 |
+
"mean_token_accuracy": 0.8995477721095085,
|
| 221 |
+
"num_tokens": 2209415.0,
|
| 222 |
+
"step": 950
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"entropy": 0.36290778368711474,
|
| 226 |
+
"epoch": 2.674698795180723,
|
| 227 |
+
"grad_norm": 1.6033697128295898,
|
| 228 |
+
"learning_rate": 0.00011402246548614765,
|
| 229 |
+
"loss": 1.4300469970703125,
|
| 230 |
+
"mean_token_accuracy": 0.8986452376842499,
|
| 231 |
+
"num_tokens": 2324269.0,
|
| 232 |
+
"step": 1000
|
| 233 |
+
},
|
| 234 |
+
{
|
| 235 |
+
"entropy": 0.3635872249305248,
|
| 236 |
+
"epoch": 2.8085676037483265,
|
| 237 |
+
"grad_norm": 1.546893835067749,
|
| 238 |
+
"learning_rate": 0.00011236904832798785,
|
| 239 |
+
"loss": 1.42587646484375,
|
| 240 |
+
"mean_token_accuracy": 0.9003903394937516,
|
| 241 |
+
"num_tokens": 2447336.0,
|
| 242 |
+
"step": 1050
|
| 243 |
+
},
|
| 244 |
+
{
|
| 245 |
+
"entropy": 0.36871150620281695,
|
| 246 |
+
"epoch": 2.9424364123159306,
|
| 247 |
+
"grad_norm": 1.2951405048370361,
|
| 248 |
+
"learning_rate": 0.0001106062857021667,
|
| 249 |
+
"loss": 1.448046875,
|
| 250 |
+
"mean_token_accuracy": 0.8967258337140084,
|
| 251 |
+
"num_tokens": 2565837.0,
|
| 252 |
+
"step": 1100
|
| 253 |
+
},
|
| 254 |
+
{
|
| 255 |
+
"epoch": 3.0,
|
| 256 |
+
"eval_entropy": 0.4225208269059658,
|
| 257 |
+
"eval_loss": 0.489418089389801,
|
| 258 |
+
"eval_mean_token_accuracy": 0.8697815361618996,
|
| 259 |
+
"eval_num_tokens": 2616741.0,
|
| 260 |
+
"eval_runtime": 96.4058,
|
| 261 |
+
"eval_samples_per_second": 16.586,
|
| 262 |
+
"eval_steps_per_second": 2.075,
|
| 263 |
+
"step": 1122
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"entropy": 0.3120347365285411,
|
| 267 |
+
"epoch": 3.074966532797858,
|
| 268 |
+
"grad_norm": 1.639520287513733,
|
| 269 |
+
"learning_rate": 0.00010873801579937106,
|
| 270 |
+
"loss": 1.1941973876953125,
|
| 271 |
+
"mean_token_accuracy": 0.9117801315856703,
|
| 272 |
+
"num_tokens": 2685975.0,
|
| 273 |
+
"step": 1150
|
| 274 |
+
},
|
| 275 |
+
{
|
| 276 |
+
"entropy": 0.28257040068507194,
|
| 277 |
+
"epoch": 3.208835341365462,
|
| 278 |
+
"grad_norm": 1.7459681034088135,
|
| 279 |
+
"learning_rate": 0.00010676830653892058,
|
| 280 |
+
"loss": 1.0850601196289062,
|
| 281 |
+
"mean_token_accuracy": 0.9177472350001336,
|
| 282 |
+
"num_tokens": 2798277.0,
|
| 283 |
+
"step": 1200
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"entropy": 0.27802520349621773,
|
| 287 |
+
"epoch": 3.3427041499330654,
|
| 288 |
+
"grad_norm": 1.5176103115081787,
|
| 289 |
+
"learning_rate": 0.00010470144671139238,
|
| 290 |
+
"loss": 1.0840838623046876,
|
| 291 |
+
"mean_token_accuracy": 0.9179763168096542,
|
| 292 |
+
"num_tokens": 2918973.0,
|
| 293 |
+
"step": 1250
|
| 294 |
+
},
|
| 295 |
+
{
|
| 296 |
+
"entropy": 0.280417420566082,
|
| 297 |
+
"epoch": 3.4765729585006695,
|
| 298 |
+
"grad_norm": 1.3774974346160889,
|
| 299 |
+
"learning_rate": 0.00010254193664032686,
|
| 300 |
+
"loss": 1.0911756896972655,
|
| 301 |
+
"mean_token_accuracy": 0.9162956389784813,
|
| 302 |
+
"num_tokens": 3039073.0,
|
| 303 |
+
"step": 1300
|
| 304 |
+
},
|
| 305 |
+
{
|
| 306 |
+
"entropy": 0.2834589210152626,
|
| 307 |
+
"epoch": 3.610441767068273,
|
| 308 |
+
"grad_norm": 1.5929396152496338,
|
| 309 |
+
"learning_rate": 0.00010029447838334742,
|
| 310 |
+
"loss": 1.0985262298583984,
|
| 311 |
+
"mean_token_accuracy": 0.9174074530601501,
|
| 312 |
+
"num_tokens": 3153710.0,
|
| 313 |
+
"step": 1350
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"entropy": 0.282296127229929,
|
| 317 |
+
"epoch": 3.7443105756358768,
|
| 318 |
+
"grad_norm": 1.50350022315979,
|
| 319 |
+
"learning_rate": 9.796396549403e-05,
|
| 320 |
+
"loss": 1.101386260986328,
|
| 321 |
+
"mean_token_accuracy": 0.9168545073270797,
|
| 322 |
+
"num_tokens": 3263594.0,
|
| 323 |
+
"step": 1400
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"entropy": 0.279728781580925,
|
| 327 |
+
"epoch": 3.878179384203481,
|
| 328 |
+
"grad_norm": 1.4728187322616577,
|
| 329 |
+
"learning_rate": 9.555547236681456e-05,
|
| 330 |
+
"loss": 1.0859880065917968,
|
| 331 |
+
"mean_token_accuracy": 0.9178367125988006,
|
| 332 |
+
"num_tokens": 3386033.0,
|
| 333 |
+
"step": 1450
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"epoch": 4.0,
|
| 337 |
+
"eval_entropy": 0.34304031178355215,
|
| 338 |
+
"eval_loss": 0.5295785665512085,
|
| 339 |
+
"eval_mean_token_accuracy": 0.8698753178119659,
|
| 340 |
+
"eval_num_tokens": 3488988.0,
|
| 341 |
+
"eval_runtime": 96.3616,
|
| 342 |
+
"eval_samples_per_second": 16.594,
|
| 343 |
+
"eval_steps_per_second": 2.076,
|
| 344 |
+
"step": 1496
|
| 345 |
+
},
|
| 346 |
+
{
|
| 347 |
+
"entropy": 0.27893446536377225,
|
| 348 |
+
"epoch": 4.010709504685408,
|
| 349 |
+
"grad_norm": 1.545491337776184,
|
| 350 |
+
"learning_rate": 9.30742431881587e-05,
|
| 351 |
+
"loss": 1.0577442169189453,
|
| 352 |
+
"mean_token_accuracy": 0.9191552999645772,
|
| 353 |
+
"num_tokens": 3498406.0,
|
| 354 |
+
"step": 1500
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"entropy": 0.19769302535802125,
|
| 358 |
+
"epoch": 4.144578313253012,
|
| 359 |
+
"grad_norm": 2.10296893119812,
|
| 360 |
+
"learning_rate": 9.052568051799083e-05,
|
| 361 |
+
"loss": 0.7461458587646485,
|
| 362 |
+
"mean_token_accuracy": 0.9415343621373177,
|
| 363 |
+
"num_tokens": 3614301.0,
|
| 364 |
+
"step": 1550
|
| 365 |
+
},
|
| 366 |
+
{
|
| 367 |
+
"entropy": 0.1981763695180416,
|
| 368 |
+
"epoch": 4.278447121820616,
|
| 369 |
+
"grad_norm": 2.067410945892334,
|
| 370 |
+
"learning_rate": 8.791533352632524e-05,
|
| 371 |
+
"loss": 0.7580889892578125,
|
| 372 |
+
"mean_token_accuracy": 0.9396374526619912,
|
| 373 |
+
"num_tokens": 3735705.0,
|
| 374 |
+
"step": 1600
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"entropy": 0.19850988369435071,
|
| 378 |
+
"epoch": 4.412315930388219,
|
| 379 |
+
"grad_norm": 1.9034850597381592,
|
| 380 |
+
"learning_rate": 8.524888591065258e-05,
|
| 381 |
+
"loss": 0.7526986694335938,
|
| 382 |
+
"mean_token_accuracy": 0.9402479353547096,
|
| 383 |
+
"num_tokens": 3854287.0,
|
| 384 |
+
"step": 1650
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"entropy": 0.19905407220125199,
|
| 388 |
+
"epoch": 4.546184738955823,
|
| 389 |
+
"grad_norm": 2.1477949619293213,
|
| 390 |
+
"learning_rate": 8.253214352041379e-05,
|
| 391 |
+
"loss": 0.7603612518310547,
|
| 392 |
+
"mean_token_accuracy": 0.9396576225757599,
|
| 393 |
+
"num_tokens": 3967362.0,
|
| 394 |
+
"step": 1700
|
| 395 |
+
},
|
| 396 |
+
{
|
| 397 |
+
"entropy": 0.20251497332006693,
|
| 398 |
+
"epoch": 4.680053547523427,
|
| 399 |
+
"grad_norm": 1.5489246845245361,
|
| 400 |
+
"learning_rate": 7.97710217155036e-05,
|
| 401 |
+
"loss": 0.7711930084228515,
|
| 402 |
+
"mean_token_accuracy": 0.9400961664319039,
|
| 403 |
+
"num_tokens": 4081441.0,
|
| 404 |
+
"step": 1750
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"entropy": 0.1991352306306362,
|
| 408 |
+
"epoch": 4.813922356091031,
|
| 409 |
+
"grad_norm": 1.969994068145752,
|
| 410 |
+
"learning_rate": 7.697153248632946e-05,
|
| 411 |
+
"loss": 0.7681967163085938,
|
| 412 |
+
"mean_token_accuracy": 0.9399621617794037,
|
| 413 |
+
"num_tokens": 4197604.0,
|
| 414 |
+
"step": 1800
|
| 415 |
+
},
|
| 416 |
+
{
|
| 417 |
+
"entropy": 0.20229352474212647,
|
| 418 |
+
"epoch": 4.947791164658635,
|
| 419 |
+
"grad_norm": 2.2329719066619873,
|
| 420 |
+
"learning_rate": 7.41397713634694e-05,
|
| 421 |
+
"loss": 0.7733911895751953,
|
| 422 |
+
"mean_token_accuracy": 0.9396535342931748,
|
| 423 |
+
"num_tokens": 4318894.0,
|
| 424 |
+
"step": 1850
|
| 425 |
+
},
|
| 426 |
+
{
|
| 427 |
+
"epoch": 5.0,
|
| 428 |
+
"eval_entropy": 0.270584502145648,
|
| 429 |
+
"eval_loss": 0.6255385875701904,
|
| 430 |
+
"eval_mean_token_accuracy": 0.8687835082411766,
|
| 431 |
+
"eval_num_tokens": 4361235.0,
|
| 432 |
+
"eval_runtime": 96.6331,
|
| 433 |
+
"eval_samples_per_second": 16.547,
|
| 434 |
+
"eval_steps_per_second": 2.07,
|
| 435 |
+
"step": 1870
|
| 436 |
+
},
|
| 437 |
+
{
|
| 438 |
+
"entropy": 0.16372355209155517,
|
| 439 |
+
"epoch": 5.080321285140562,
|
| 440 |
+
"grad_norm": 8.029130935668945,
|
| 441 |
+
"learning_rate": 7.128190414543193e-05,
|
| 442 |
+
"loss": 0.6145073699951172,
|
| 443 |
+
"mean_token_accuracy": 0.9516371590922578,
|
| 444 |
+
"num_tokens": 4434412.0,
|
| 445 |
+
"step": 1900
|
| 446 |
+
},
|
| 447 |
+
{
|
| 448 |
+
"entropy": 0.14057113960385323,
|
| 449 |
+
"epoch": 5.214190093708166,
|
| 450 |
+
"grad_norm": 2.23626446723938,
|
| 451 |
+
"learning_rate": 6.840415347341672e-05,
|
| 452 |
+
"loss": 0.5295140075683594,
|
| 453 |
+
"mean_token_accuracy": 0.9593333688378334,
|
| 454 |
+
"num_tokens": 4548703.0,
|
| 455 |
+
"step": 1950
|
| 456 |
+
},
|
| 457 |
+
{
|
| 458 |
+
"entropy": 0.14139273861423135,
|
| 459 |
+
"epoch": 5.34805890227577,
|
| 460 |
+
"grad_norm": 2.0157318115234375,
|
| 461 |
+
"learning_rate": 6.551278528230729e-05,
|
| 462 |
+
"loss": 0.5296827697753906,
|
| 463 |
+
"mean_token_accuracy": 0.9590813705325126,
|
| 464 |
+
"num_tokens": 4665542.0,
|
| 465 |
+
"step": 2000
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"entropy": 0.14537794288247824,
|
| 469 |
+
"epoch": 5.481927710843373,
|
| 470 |
+
"grad_norm": 1.5371013879776,
|
| 471 |
+
"learning_rate": 6.261409515739736e-05,
|
| 472 |
+
"loss": 0.5478645706176758,
|
| 473 |
+
"mean_token_accuracy": 0.9577724316716194,
|
| 474 |
+
"num_tokens": 4778075.0,
|
| 475 |
+
"step": 2050
|
| 476 |
+
},
|
| 477 |
+
{
|
| 478 |
+
"entropy": 0.14534839443862438,
|
| 479 |
+
"epoch": 5.615796519410977,
|
| 480 |
+
"grad_norm": 2.0134589672088623,
|
| 481 |
+
"learning_rate": 5.971439462655727e-05,
|
| 482 |
+
"loss": 0.5426230239868164,
|
| 483 |
+
"mean_token_accuracy": 0.9581041479110718,
|
| 484 |
+
"num_tokens": 4897453.0,
|
| 485 |
+
"step": 2100
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"entropy": 0.14614912170916797,
|
| 489 |
+
"epoch": 5.749665327978581,
|
| 490 |
+
"grad_norm": 1.286437749862671,
|
| 491 |
+
"learning_rate": 5.6819997417687274e-05,
|
| 492 |
+
"loss": 0.5487421798706055,
|
| 493 |
+
"mean_token_accuracy": 0.9563529288768768,
|
| 494 |
+
"num_tokens": 5012767.0,
|
| 495 |
+
"step": 2150
|
| 496 |
+
},
|
| 497 |
+
{
|
| 498 |
+
"entropy": 0.13987606402486563,
|
| 499 |
+
"epoch": 5.883534136546185,
|
| 500 |
+
"grad_norm": 1.7586702108383179,
|
| 501 |
+
"learning_rate": 5.393720571138079e-05,
|
| 502 |
+
"loss": 0.5254617309570313,
|
| 503 |
+
"mean_token_accuracy": 0.9590577334165573,
|
| 504 |
+
"num_tokens": 5129878.0,
|
| 505 |
+
"step": 2200
|
| 506 |
+
},
|
| 507 |
+
{
|
| 508 |
+
"epoch": 6.0,
|
| 509 |
+
"eval_entropy": 0.2240281231701374,
|
| 510 |
+
"eval_loss": 0.7485206723213196,
|
| 511 |
+
"eval_mean_token_accuracy": 0.8668996468186378,
|
| 512 |
+
"eval_num_tokens": 5233482.0,
|
| 513 |
+
"eval_runtime": 96.4089,
|
| 514 |
+
"eval_samples_per_second": 16.586,
|
| 515 |
+
"eval_steps_per_second": 2.074,
|
| 516 |
+
"step": 2244
|
| 517 |
+
},
|
| 518 |
+
{
|
| 519 |
+
"entropy": 0.1413771447283451,
|
| 520 |
+
"epoch": 6.016064257028113,
|
| 521 |
+
"grad_norm": 1.2926467657089233,
|
| 522 |
+
"learning_rate": 5.1072296418730254e-05,
|
| 523 |
+
"loss": 0.5202234649658203,
|
| 524 |
+
"mean_token_accuracy": 0.9594009392189257,
|
| 525 |
+
"num_tokens": 5246734.0,
|
| 526 |
+
"step": 2250
|
| 527 |
+
},
|
| 528 |
+
{
|
| 529 |
+
"entropy": 0.1042403375543654,
|
| 530 |
+
"epoch": 6.149933065595716,
|
| 531 |
+
"grad_norm": 1.9540276527404785,
|
| 532 |
+
"learning_rate": 4.8231507514154216e-05,
|
| 533 |
+
"loss": 0.39597846984863283,
|
| 534 |
+
"mean_token_accuracy": 0.9706364983320236,
|
| 535 |
+
"num_tokens": 5366334.0,
|
| 536 |
+
"step": 2300
|
| 537 |
+
},
|
| 538 |
+
{
|
| 539 |
+
"entropy": 0.10351455600932241,
|
| 540 |
+
"epoch": 6.28380187416332,
|
| 541 |
+
"grad_norm": 2.139054775238037,
|
| 542 |
+
"learning_rate": 4.542102445300397e-05,
|
| 543 |
+
"loss": 0.38731266021728517,
|
| 544 |
+
"mean_token_accuracy": 0.9703371664881706,
|
| 545 |
+
"num_tokens": 5487013.0,
|
| 546 |
+
"step": 2350
|
| 547 |
+
},
|
| 548 |
+
{
|
| 549 |
+
"entropy": 0.11232182893902064,
|
| 550 |
+
"epoch": 6.417670682730924,
|
| 551 |
+
"grad_norm": 1.6526401042938232,
|
| 552 |
+
"learning_rate": 4.264696670352381e-05,
|
| 553 |
+
"loss": 0.42091716766357423,
|
| 554 |
+
"mean_token_accuracy": 0.9684987756609916,
|
| 555 |
+
"num_tokens": 5599415.0,
|
| 556 |
+
"step": 2400
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"entropy": 0.10796859875321388,
|
| 560 |
+
"epoch": 6.551539491298527,
|
| 561 |
+
"grad_norm": 1.297956109046936,
|
| 562 |
+
"learning_rate": 3.9915374422489785e-05,
|
| 563 |
+
"loss": 0.40640792846679685,
|
| 564 |
+
"mean_token_accuracy": 0.9703203043341637,
|
| 565 |
+
"num_tokens": 5718099.0,
|
| 566 |
+
"step": 2450
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"entropy": 0.10999857917428017,
|
| 570 |
+
"epoch": 6.685408299866131,
|
| 571 |
+
"grad_norm": 1.5105161666870117,
|
| 572 |
+
"learning_rate": 3.723219530353909e-05,
|
| 573 |
+
"loss": 0.4118352508544922,
|
| 574 |
+
"mean_token_accuracy": 0.9697986772656441,
|
| 575 |
+
"num_tokens": 5833902.0,
|
| 576 |
+
"step": 2500
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"entropy": 0.11099046738818288,
|
| 580 |
+
"epoch": 6.8192771084337345,
|
| 581 |
+
"grad_norm": 1.8809560537338257,
|
| 582 |
+
"learning_rate": 3.460327162682602e-05,
|
| 583 |
+
"loss": 0.41624794006347654,
|
| 584 |
+
"mean_token_accuracy": 0.9690032437443733,
|
| 585 |
+
"num_tokens": 5948132.0,
|
| 586 |
+
"step": 2550
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"entropy": 0.11062245365232229,
|
| 590 |
+
"epoch": 6.953145917001339,
|
| 591 |
+
"grad_norm": 1.0219827890396118,
|
| 592 |
+
"learning_rate": 3.2034327538202464e-05,
|
| 593 |
+
"loss": 0.41484325408935546,
|
| 594 |
+
"mean_token_accuracy": 0.9690453514456749,
|
| 595 |
+
"num_tokens": 6066224.0,
|
| 596 |
+
"step": 2600
|
| 597 |
+
},
|
| 598 |
+
{
|
| 599 |
+
"epoch": 7.0,
|
| 600 |
+
"eval_entropy": 0.18908375523984433,
|
| 601 |
+
"eval_loss": 0.8491571545600891,
|
| 602 |
+
"eval_mean_token_accuracy": 0.8642131051421166,
|
| 603 |
+
"eval_num_tokens": 6105729.0,
|
| 604 |
+
"eval_runtime": 96.4633,
|
| 605 |
+
"eval_samples_per_second": 16.576,
|
| 606 |
+
"eval_steps_per_second": 2.073,
|
| 607 |
+
"step": 2618
|
| 608 |
+
}
|
| 609 |
+
],
|
| 610 |
+
"logging_steps": 50,
|
| 611 |
+
"max_steps": 3740,
|
| 612 |
+
"num_input_tokens_seen": 0,
|
| 613 |
+
"num_train_epochs": 10,
|
| 614 |
+
"save_steps": 500,
|
| 615 |
+
"stateful_callbacks": {
|
| 616 |
+
"TrainerControl": {
|
| 617 |
+
"args": {
|
| 618 |
+
"should_epoch_stop": false,
|
| 619 |
+
"should_evaluate": false,
|
| 620 |
+
"should_log": false,
|
| 621 |
+
"should_save": true,
|
| 622 |
+
"should_training_stop": false
|
| 623 |
+
},
|
| 624 |
+
"attributes": {}
|
| 625 |
+
}
|
| 626 |
+
},
|
| 627 |
+
"total_flos": 2.0923154774653926e+18,
|
| 628 |
+
"train_batch_size": 4,
|
| 629 |
+
"trial_name": null,
|
| 630 |
+
"trial_params": null
|
| 631 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: google/gemma-4-31B
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:google/gemma-4-31B
|
| 7 |
+
- lora
|
| 8 |
+
- sft
|
| 9 |
+
- transformers
|
| 10 |
+
- trl
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Model Card for Model ID
|
| 14 |
+
|
| 15 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
## Model Details
|
| 20 |
+
|
| 21 |
+
### Model Description
|
| 22 |
+
|
| 23 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
- **Developed by:** [More Information Needed]
|
| 28 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 29 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 30 |
+
- **Model type:** [More Information Needed]
|
| 31 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 32 |
+
- **License:** [More Information Needed]
|
| 33 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 34 |
+
|
| 35 |
+
### Model Sources [optional]
|
| 36 |
+
|
| 37 |
+
<!-- Provide the basic links for the model. -->
|
| 38 |
+
|
| 39 |
+
- **Repository:** [More Information Needed]
|
| 40 |
+
- **Paper [optional]:** [More Information Needed]
|
| 41 |
+
- **Demo [optional]:** [More Information Needed]
|
| 42 |
+
|
| 43 |
+
## Uses
|
| 44 |
+
|
| 45 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 46 |
+
|
| 47 |
+
### Direct Use
|
| 48 |
+
|
| 49 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 50 |
+
|
| 51 |
+
[More Information Needed]
|
| 52 |
+
|
| 53 |
+
### Downstream Use [optional]
|
| 54 |
+
|
| 55 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 56 |
+
|
| 57 |
+
[More Information Needed]
|
| 58 |
+
|
| 59 |
+
### Out-of-Scope Use
|
| 60 |
+
|
| 61 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 62 |
+
|
| 63 |
+
[More Information Needed]
|
| 64 |
+
|
| 65 |
+
## Bias, Risks, and Limitations
|
| 66 |
+
|
| 67 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 68 |
+
|
| 69 |
+
[More Information Needed]
|
| 70 |
+
|
| 71 |
+
### Recommendations
|
| 72 |
+
|
| 73 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 74 |
+
|
| 75 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 76 |
+
|
| 77 |
+
## How to Get Started with the Model
|
| 78 |
+
|
| 79 |
+
Use the code below to get started with the model.
|
| 80 |
+
|
| 81 |
+
[More Information Needed]
|
| 82 |
+
|
| 83 |
+
## Training Details
|
| 84 |
+
|
| 85 |
+
### Training Data
|
| 86 |
+
|
| 87 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 88 |
+
|
| 89 |
+
[More Information Needed]
|
| 90 |
+
|
| 91 |
+
### Training Procedure
|
| 92 |
+
|
| 93 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 94 |
+
|
| 95 |
+
#### Preprocessing [optional]
|
| 96 |
+
|
| 97 |
+
[More Information Needed]
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
#### Training Hyperparameters
|
| 101 |
+
|
| 102 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 103 |
+
|
| 104 |
+
#### Speeds, Sizes, Times [optional]
|
| 105 |
+
|
| 106 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 107 |
+
|
| 108 |
+
[More Information Needed]
|
| 109 |
+
|
| 110 |
+
## Evaluation
|
| 111 |
+
|
| 112 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 113 |
+
|
| 114 |
+
### Testing Data, Factors & Metrics
|
| 115 |
+
|
| 116 |
+
#### Testing Data
|
| 117 |
+
|
| 118 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 119 |
+
|
| 120 |
+
[More Information Needed]
|
| 121 |
+
|
| 122 |
+
#### Factors
|
| 123 |
+
|
| 124 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 125 |
+
|
| 126 |
+
[More Information Needed]
|
| 127 |
+
|
| 128 |
+
#### Metrics
|
| 129 |
+
|
| 130 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 131 |
+
|
| 132 |
+
[More Information Needed]
|
| 133 |
+
|
| 134 |
+
### Results
|
| 135 |
+
|
| 136 |
+
[More Information Needed]
|
| 137 |
+
|
| 138 |
+
#### Summary
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
## Model Examination [optional]
|
| 143 |
+
|
| 144 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 145 |
+
|
| 146 |
+
[More Information Needed]
|
| 147 |
+
|
| 148 |
+
## Environmental Impact
|
| 149 |
+
|
| 150 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 151 |
+
|
| 152 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 153 |
+
|
| 154 |
+
- **Hardware Type:** [More Information Needed]
|
| 155 |
+
- **Hours used:** [More Information Needed]
|
| 156 |
+
- **Cloud Provider:** [More Information Needed]
|
| 157 |
+
- **Compute Region:** [More Information Needed]
|
| 158 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 159 |
+
|
| 160 |
+
## Technical Specifications [optional]
|
| 161 |
+
|
| 162 |
+
### Model Architecture and Objective
|
| 163 |
+
|
| 164 |
+
[More Information Needed]
|
| 165 |
+
|
| 166 |
+
### Compute Infrastructure
|
| 167 |
+
|
| 168 |
+
[More Information Needed]
|
| 169 |
+
|
| 170 |
+
#### Hardware
|
| 171 |
+
|
| 172 |
+
[More Information Needed]
|
| 173 |
+
|
| 174 |
+
#### Software
|
| 175 |
+
|
| 176 |
+
[More Information Needed]
|
| 177 |
+
|
| 178 |
+
## Citation [optional]
|
| 179 |
+
|
| 180 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 181 |
+
|
| 182 |
+
**BibTeX:**
|
| 183 |
+
|
| 184 |
+
[More Information Needed]
|
| 185 |
+
|
| 186 |
+
**APA:**
|
| 187 |
+
|
| 188 |
+
[More Information Needed]
|
| 189 |
+
|
| 190 |
+
## Glossary [optional]
|
| 191 |
+
|
| 192 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 193 |
+
|
| 194 |
+
[More Information Needed]
|
| 195 |
+
|
| 196 |
+
## More Information [optional]
|
| 197 |
+
|
| 198 |
+
[More Information Needed]
|
| 199 |
+
|
| 200 |
+
## Model Card Authors [optional]
|
| 201 |
+
|
| 202 |
+
[More Information Needed]
|
| 203 |
+
|
| 204 |
+
## Model Card Contact
|
| 205 |
+
|
| 206 |
+
[More Information Needed]
|
| 207 |
+
### Framework versions
|
| 208 |
+
|
| 209 |
+
- PEFT 0.19.1
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "google/gemma-4-31B",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 64,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.015034304668777832,
|
| 22 |
+
"lora_ga_config": null,
|
| 23 |
+
"megatron_config": null,
|
| 24 |
+
"megatron_core": "megatron.core",
|
| 25 |
+
"modules_to_save": null,
|
| 26 |
+
"peft_type": "LORA",
|
| 27 |
+
"peft_version": "0.19.1",
|
| 28 |
+
"qalora_group_size": 16,
|
| 29 |
+
"r": 64,
|
| 30 |
+
"rank_pattern": {},
|
| 31 |
+
"revision": null,
|
| 32 |
+
"target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
|
| 33 |
+
"target_parameters": null,
|
| 34 |
+
"task_type": "CAUSAL_LM",
|
| 35 |
+
"trainable_token_indices": null,
|
| 36 |
+
"use_bdlora": null,
|
| 37 |
+
"use_dora": false,
|
| 38 |
+
"use_qalora": false,
|
| 39 |
+
"use_rslora": false
|
| 40 |
+
}
|
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"audio_token": "<|audio|>",
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"boa_token": "<|audio>",
|
| 5 |
+
"boi_token": "<|image>",
|
| 6 |
+
"bos_token": "<bos>",
|
| 7 |
+
"eoa_token": "<audio|>",
|
| 8 |
+
"eoc_token": "<channel|>",
|
| 9 |
+
"eoi_token": "<image|>",
|
| 10 |
+
"eos_token": "<eos>",
|
| 11 |
+
"eot_token": "<turn|>",
|
| 12 |
+
"escape_token": "<|\"|>",
|
| 13 |
+
"etc_token": "<tool_call|>",
|
| 14 |
+
"etd_token": "<tool|>",
|
| 15 |
+
"etr_token": "<tool_response|>",
|
| 16 |
+
"extra_special_tokens": [
|
| 17 |
+
"<|video|>"
|
| 18 |
+
],
|
| 19 |
+
"image_token": "<|image|>",
|
| 20 |
+
"is_local": false,
|
| 21 |
+
"mask_token": "<mask>",
|
| 22 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 23 |
+
"model_specific_special_tokens": {
|
| 24 |
+
"audio_token": "<|audio|>",
|
| 25 |
+
"boa_token": "<|audio>",
|
| 26 |
+
"boi_token": "<|image>",
|
| 27 |
+
"eoa_token": "<audio|>",
|
| 28 |
+
"eoc_token": "<channel|>",
|
| 29 |
+
"eoi_token": "<image|>",
|
| 30 |
+
"eot_token": "<turn|>",
|
| 31 |
+
"escape_token": "<|\"|>",
|
| 32 |
+
"etc_token": "<tool_call|>",
|
| 33 |
+
"etd_token": "<tool|>",
|
| 34 |
+
"etr_token": "<tool_response|>",
|
| 35 |
+
"image_token": "<|image|>",
|
| 36 |
+
"soc_token": "<|channel>",
|
| 37 |
+
"sot_token": "<|turn>",
|
| 38 |
+
"stc_token": "<|tool_call>",
|
| 39 |
+
"std_token": "<|tool>",
|
| 40 |
+
"str_token": "<|tool_response>",
|
| 41 |
+
"think_token": "<|think|>"
|
| 42 |
+
},
|
| 43 |
+
"pad_token": "<pad>",
|
| 44 |
+
"padding_side": "left",
|
| 45 |
+
"processor_class": "Gemma4Processor",
|
| 46 |
+
"soc_token": "<|channel>",
|
| 47 |
+
"sot_token": "<|turn>",
|
| 48 |
+
"stc_token": "<|tool_call>",
|
| 49 |
+
"std_token": "<|tool>",
|
| 50 |
+
"str_token": "<|tool_response>",
|
| 51 |
+
"think_token": "<|think|>",
|
| 52 |
+
"tokenizer_class": "GemmaTokenizer",
|
| 53 |
+
"unk_token": "<unk>"
|
| 54 |
+
}
|