K2triinK committed on
Commit
d54e212
·
verified ·
1 Parent(s): 912fc28

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md +58 -0
  2. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md +58 -0
  3. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md +209 -0
  4. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json +40 -0
  5. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json +54 -0
  6. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json +297 -0
  7. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md +209 -0
  8. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json +40 -0
  9. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json +54 -0
  10. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json +378 -0
  11. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md +209 -0
  12. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json +40 -0
  13. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json +54 -0
  14. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json +469 -0
  15. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md +209 -0
  16. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json +40 -0
  17. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json +54 -0
  18. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json +560 -0
  19. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md +209 -0
  20. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json +40 -0
  21. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json +54 -0
  22. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json +641 -0
  23. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md +209 -0
  24. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json +40 -0
  25. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json +54 -0
  26. DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json +732 -0
  27. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md +58 -0
  28. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md +209 -0
  29. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json +40 -0
  30. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json +54 -0
  31. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json +287 -0
  32. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md +209 -0
  33. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json +40 -0
  34. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json +54 -0
  35. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json +368 -0
  36. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md +209 -0
  37. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json +40 -0
  38. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json +54 -0
  39. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json +459 -0
  40. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md +209 -0
  41. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json +40 -0
  42. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json +54 -0
  43. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json +540 -0
  44. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md +209 -0
  45. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json +40 -0
  46. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json +54 -0
  47. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json +631 -0
  48. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md +209 -0
  49. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json +40 -0
  50. DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json +54 -0
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: transformers
4
+ model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test1
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test1
13
+
14
+ This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="<path-or-hub-id-of-this-model>", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/sfblzvnx)
31
+
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 0.29.0
39
+ - Transformers: 5.5.4
40
+ - PyTorch: 2.10.0
41
+ - Datasets: 4.6.1
42
+ - Tokenizers: 0.22.2
43
+
44
+ ## Citations
45
+
46
+
47
+
48
+ Cite TRL as:
49
+
50
+ ```bibtex
51
+ @software{vonwerra2020trl,
52
+ title = {{TRL: Transformers Reinforcement Learning}},
53
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
54
+ license = {Apache-2.0},
55
+ url = {https://github.com/huggingface/trl},
56
+ year = {2020}
57
+ }
58
+ ```
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: transformers
4
+ model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test2
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - sft
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test2
13
+
14
+ This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="<path-or-hub-id-of-this-model>", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/ncgnoczk)
31
+
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 0.29.0
39
+ - Transformers: 5.5.4
40
+ - PyTorch: 2.10.0
41
+ - Datasets: 4.6.1
42
+ - Tokenizers: 0.22.2
43
+
44
+ ## Citations
45
+
46
+
47
+
48
+ Cite TRL as:
49
+
50
+ ```bibtex
51
+ @software{vonwerra2020trl,
52
+ title = {{TRL: Transformers Reinforcement Learning}},
53
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
54
+ license = {Apache-2.0},
55
+ url = {https://github.com/huggingface/trl},
56
+ year = {2020}
57
+ }
58
+ ```
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.00985279561940916,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1155/trainer_state.json ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1155,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.353258643448353,
14
+ "epoch": 0.1299545159194282,
15
+ "grad_norm": 3.010725975036621,
16
+ "learning_rate": 4.8475852375026876e-05,
17
+ "loss": 5.475971069335937,
18
+ "mean_token_accuracy": 0.7263440760970116,
19
+ "num_tokens": 128842.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.649170914888382,
24
+ "epoch": 0.2599090318388564,
25
+ "grad_norm": 1.9099390506744385,
26
+ "learning_rate": 9.794100785974817e-05,
27
+ "loss": 2.55168701171875,
28
+ "mean_token_accuracy": 0.8364580717682838,
29
+ "num_tokens": 255497.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5930788792669773,
34
+ "epoch": 0.3898635477582846,
35
+ "grad_norm": 2.1239051818847656,
36
+ "learning_rate": 0.0001474061633444695,
37
+ "loss": 2.3440716552734373,
38
+ "mean_token_accuracy": 0.8452290838956833,
39
+ "num_tokens": 372014.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.5564522063732147,
44
+ "epoch": 0.5198180636777128,
45
+ "grad_norm": 411.71807861328125,
46
+ "learning_rate": 0.00019687131882919077,
47
+ "loss": 2.2838446044921876,
48
+ "mean_token_accuracy": 0.8498487600684166,
49
+ "num_tokens": 500623.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.5539529167115689,
54
+ "epoch": 0.649772579597141,
55
+ "grad_norm": 2.1969902515411377,
56
+ "learning_rate": 0.0002463364743139121,
57
+ "loss": 2.675394287109375,
58
+ "mean_token_accuracy": 0.8430694487690925,
59
+ "num_tokens": 616223.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.5719467167556286,
64
+ "epoch": 0.7797270955165692,
65
+ "grad_norm": 1.98796546459198,
66
+ "learning_rate": 0.00029580162979863343,
67
+ "loss": 2.2434300231933593,
68
+ "mean_token_accuracy": 0.851241897046566,
69
+ "num_tokens": 737263.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.5502805083990097,
74
+ "epoch": 0.9096816114359974,
75
+ "grad_norm": 2.0211398601531982,
76
+ "learning_rate": 0.0003452667852833547,
77
+ "loss": 2.1729367065429686,
78
+ "mean_token_accuracy": 0.8554597494006156,
79
+ "num_tokens": 861477.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5580813550891784,
85
+ "eval_loss": 0.5830356478691101,
86
+ "eval_mean_token_accuracy": 0.8432669037809739,
87
+ "eval_num_tokens": 944782.0,
88
+ "eval_runtime": 90.3664,
89
+ "eval_samples_per_second": 18.336,
90
+ "eval_steps_per_second": 2.302,
91
+ "step": 385
92
+ },
93
+ {
94
+ "entropy": 0.5498402091725987,
95
+ "epoch": 1.0389863547758285,
96
+ "grad_norm": 3.8034188747406006,
97
+ "learning_rate": 0.000380866355527619,
98
+ "loss": 2.113946990966797,
99
+ "mean_token_accuracy": 0.8578129452676629,
100
+ "num_tokens": 982803.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.5182110907137394,
105
+ "epoch": 1.1689408706952567,
106
+ "grad_norm": 2.7830824851989746,
107
+ "learning_rate": 0.0003805611725593471,
108
+ "loss": 1.9833453369140626,
109
+ "mean_token_accuracy": 0.8656822636723518,
110
+ "num_tokens": 1105926.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.5260789206624031,
115
+ "epoch": 1.2988953866146848,
116
+ "grad_norm": 1.7993361949920654,
117
+ "learning_rate": 0.0003798653399371568,
118
+ "loss": 2.006897430419922,
119
+ "mean_token_accuracy": 0.8631055191159248,
120
+ "num_tokens": 1229857.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.5327546864748001,
125
+ "epoch": 1.428849902534113,
126
+ "grad_norm": 1.7606678009033203,
127
+ "learning_rate": 0.0003787802874228295,
128
+ "loss": 2.020283050537109,
129
+ "mean_token_accuracy": 0.8638329988718033,
130
+ "num_tokens": 1352330.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.5285360223054886,
135
+ "epoch": 1.5588044184535412,
136
+ "grad_norm": 4.76006555557251,
137
+ "learning_rate": 0.00037730824452755275,
138
+ "loss": 1.9987391662597656,
139
+ "mean_token_accuracy": 0.8644696187973022,
140
+ "num_tokens": 1474790.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.5134804363548756,
145
+ "epoch": 1.6887589343729694,
146
+ "grad_norm": 1.8447264432907104,
147
+ "learning_rate": 0.000375452235930833,
148
+ "loss": 1.9669386291503905,
149
+ "mean_token_accuracy": 0.8659948265552521,
150
+ "num_tokens": 1600381.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.5371069309115409,
155
+ "epoch": 1.8187134502923976,
156
+ "grad_norm": 1.6537392139434814,
157
+ "learning_rate": 0.00037321607526553675,
158
+ "loss": 2.0411550903320315,
159
+ "mean_token_accuracy": 0.8624854254722595,
160
+ "num_tokens": 1716827.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "entropy": 0.5270501750707627,
165
+ "epoch": 1.9486679662118258,
166
+ "grad_norm": 2.6990911960601807,
167
+ "learning_rate": 0.00037060435728183,
168
+ "loss": 2.015792236328125,
169
+ "mean_token_accuracy": 0.8631013777852058,
170
+ "num_tokens": 1842798.0,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 2.0,
175
+ "eval_entropy": 0.5477195472384875,
176
+ "eval_loss": 0.5585702657699585,
177
+ "eval_mean_token_accuracy": 0.8486175815073344,
178
+ "eval_num_tokens": 1889564.0,
179
+ "eval_runtime": 90.2194,
180
+ "eval_samples_per_second": 18.366,
181
+ "eval_steps_per_second": 2.305,
182
+ "step": 770
183
+ },
184
+ {
185
+ "entropy": 0.4782189565088282,
186
+ "epoch": 2.077972709551657,
187
+ "grad_norm": 2.041952610015869,
188
+ "learning_rate": 0.0003676224484061175,
189
+ "loss": 1.7843829345703126,
190
+ "mean_token_accuracy": 0.8739750406250881,
191
+ "num_tokens": 1959778.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.4443667846918106,
196
+ "epoch": 2.207927225471085,
197
+ "grad_norm": 16.27313804626465,
198
+ "learning_rate": 0.00036427647571437996,
199
+ "loss": 1.6559255981445313,
200
+ "mean_token_accuracy": 0.8808386281132699,
201
+ "num_tokens": 2087384.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.44861202985048293,
206
+ "epoch": 2.3378817413905133,
207
+ "grad_norm": 1.648870587348938,
208
+ "learning_rate": 0.0003605733143425679,
209
+ "loss": 1.677943878173828,
210
+ "mean_token_accuracy": 0.879555520415306,
211
+ "num_tokens": 2211962.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.4568726105988026,
216
+ "epoch": 2.4678362573099415,
217
+ "grad_norm": 1.7573126554489136,
218
+ "learning_rate": 0.00035652057335991866,
219
+ "loss": 1.6760734558105468,
220
+ "mean_token_accuracy": 0.8791913360357284,
221
+ "num_tokens": 2334838.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.44863338857889173,
226
+ "epoch": 2.5977907732293697,
227
+ "grad_norm": 1.8639047145843506,
228
+ "learning_rate": 0.00035212658013422465,
229
+ "loss": 1.6799411010742187,
230
+ "mean_token_accuracy": 0.8790675121545791,
231
+ "num_tokens": 2461732.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.4585830120742321,
236
+ "epoch": 2.727745289148798,
237
+ "grad_norm": 1.9825985431671143,
238
+ "learning_rate": 0.0003474003632211781,
239
+ "loss": 1.7172026062011718,
240
+ "mean_token_accuracy": 0.8782495930790901,
241
+ "num_tokens": 2580026.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.45422692246735097,
246
+ "epoch": 2.857699805068226,
247
+ "grad_norm": 1.7149962186813354,
248
+ "learning_rate": 0.00034235163381294995,
249
+ "loss": 1.679084014892578,
250
+ "mean_token_accuracy": 0.8795321774482727,
251
+ "num_tokens": 2705600.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "entropy": 0.47297614574432373,
256
+ "epoch": 2.9876543209876543,
257
+ "grad_norm": 1.7435617446899414,
258
+ "learning_rate": 0.0003369907657841221,
259
+ "loss": 1.7386201477050782,
260
+ "mean_token_accuracy": 0.8779115182161331,
261
+ "num_tokens": 2822808.0,
262
+ "step": 1150
263
+ },
264
+ {
265
+ "epoch": 3.0,
266
+ "eval_entropy": 0.5031588454372607,
267
+ "eval_loss": 0.5551120638847351,
268
+ "eval_mean_token_accuracy": 0.8531603300227568,
269
+ "eval_num_tokens": 2834346.0,
270
+ "eval_runtime": 90.2397,
271
+ "eval_samples_per_second": 18.362,
272
+ "eval_steps_per_second": 2.305,
273
+ "step": 1155
274
+ }
275
+ ],
276
+ "logging_steps": 50,
277
+ "max_steps": 3850,
278
+ "num_input_tokens_seen": 0,
279
+ "num_train_epochs": 10,
280
+ "save_steps": 500,
281
+ "stateful_callbacks": {
282
+ "TrainerControl": {
283
+ "args": {
284
+ "should_epoch_stop": false,
285
+ "should_evaluate": false,
286
+ "should_log": false,
287
+ "should_save": true,
288
+ "should_training_stop": false
289
+ },
290
+ "attributes": {}
291
+ }
292
+ },
293
+ "total_flos": 9.957948339009064e+17,
294
+ "train_batch_size": 4,
295
+ "trial_name": null,
296
+ "trial_params": null
297
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.00985279561940916,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1540/trainer_state.json ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1540,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.353258643448353,
14
+ "epoch": 0.1299545159194282,
15
+ "grad_norm": 3.010725975036621,
16
+ "learning_rate": 4.8475852375026876e-05,
17
+ "loss": 5.475971069335937,
18
+ "mean_token_accuracy": 0.7263440760970116,
19
+ "num_tokens": 128842.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.649170914888382,
24
+ "epoch": 0.2599090318388564,
25
+ "grad_norm": 1.9099390506744385,
26
+ "learning_rate": 9.794100785974817e-05,
27
+ "loss": 2.55168701171875,
28
+ "mean_token_accuracy": 0.8364580717682838,
29
+ "num_tokens": 255497.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5930788792669773,
34
+ "epoch": 0.3898635477582846,
35
+ "grad_norm": 2.1239051818847656,
36
+ "learning_rate": 0.0001474061633444695,
37
+ "loss": 2.3440716552734373,
38
+ "mean_token_accuracy": 0.8452290838956833,
39
+ "num_tokens": 372014.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.5564522063732147,
44
+ "epoch": 0.5198180636777128,
45
+ "grad_norm": 411.71807861328125,
46
+ "learning_rate": 0.00019687131882919077,
47
+ "loss": 2.2838446044921876,
48
+ "mean_token_accuracy": 0.8498487600684166,
49
+ "num_tokens": 500623.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.5539529167115689,
54
+ "epoch": 0.649772579597141,
55
+ "grad_norm": 2.1969902515411377,
56
+ "learning_rate": 0.0002463364743139121,
57
+ "loss": 2.675394287109375,
58
+ "mean_token_accuracy": 0.8430694487690925,
59
+ "num_tokens": 616223.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.5719467167556286,
64
+ "epoch": 0.7797270955165692,
65
+ "grad_norm": 1.98796546459198,
66
+ "learning_rate": 0.00029580162979863343,
67
+ "loss": 2.2434300231933593,
68
+ "mean_token_accuracy": 0.851241897046566,
69
+ "num_tokens": 737263.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.5502805083990097,
74
+ "epoch": 0.9096816114359974,
75
+ "grad_norm": 2.0211398601531982,
76
+ "learning_rate": 0.0003452667852833547,
77
+ "loss": 2.1729367065429686,
78
+ "mean_token_accuracy": 0.8554597494006156,
79
+ "num_tokens": 861477.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5580813550891784,
85
+ "eval_loss": 0.5830356478691101,
86
+ "eval_mean_token_accuracy": 0.8432669037809739,
87
+ "eval_num_tokens": 944782.0,
88
+ "eval_runtime": 90.3664,
89
+ "eval_samples_per_second": 18.336,
90
+ "eval_steps_per_second": 2.302,
91
+ "step": 385
92
+ },
93
+ {
94
+ "entropy": 0.5498402091725987,
95
+ "epoch": 1.0389863547758285,
96
+ "grad_norm": 3.8034188747406006,
97
+ "learning_rate": 0.000380866355527619,
98
+ "loss": 2.113946990966797,
99
+ "mean_token_accuracy": 0.8578129452676629,
100
+ "num_tokens": 982803.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.5182110907137394,
105
+ "epoch": 1.1689408706952567,
106
+ "grad_norm": 2.7830824851989746,
107
+ "learning_rate": 0.0003805611725593471,
108
+ "loss": 1.9833453369140626,
109
+ "mean_token_accuracy": 0.8656822636723518,
110
+ "num_tokens": 1105926.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.5260789206624031,
115
+ "epoch": 1.2988953866146848,
116
+ "grad_norm": 1.7993361949920654,
117
+ "learning_rate": 0.0003798653399371568,
118
+ "loss": 2.006897430419922,
119
+ "mean_token_accuracy": 0.8631055191159248,
120
+ "num_tokens": 1229857.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.5327546864748001,
125
+ "epoch": 1.428849902534113,
126
+ "grad_norm": 1.7606678009033203,
127
+ "learning_rate": 0.0003787802874228295,
128
+ "loss": 2.020283050537109,
129
+ "mean_token_accuracy": 0.8638329988718033,
130
+ "num_tokens": 1352330.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.5285360223054886,
135
+ "epoch": 1.5588044184535412,
136
+ "grad_norm": 4.76006555557251,
137
+ "learning_rate": 0.00037730824452755275,
138
+ "loss": 1.9987391662597656,
139
+ "mean_token_accuracy": 0.8644696187973022,
140
+ "num_tokens": 1474790.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.5134804363548756,
145
+ "epoch": 1.6887589343729694,
146
+ "grad_norm": 1.8447264432907104,
147
+ "learning_rate": 0.000375452235930833,
148
+ "loss": 1.9669386291503905,
149
+ "mean_token_accuracy": 0.8659948265552521,
150
+ "num_tokens": 1600381.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.5371069309115409,
155
+ "epoch": 1.8187134502923976,
156
+ "grad_norm": 1.6537392139434814,
157
+ "learning_rate": 0.00037321607526553675,
158
+ "loss": 2.0411550903320315,
159
+ "mean_token_accuracy": 0.8624854254722595,
160
+ "num_tokens": 1716827.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "entropy": 0.5270501750707627,
165
+ "epoch": 1.9486679662118258,
166
+ "grad_norm": 2.6990911960601807,
167
+ "learning_rate": 0.00037060435728183,
168
+ "loss": 2.015792236328125,
169
+ "mean_token_accuracy": 0.8631013777852058,
170
+ "num_tokens": 1842798.0,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 2.0,
175
+ "eval_entropy": 0.5477195472384875,
176
+ "eval_loss": 0.5585702657699585,
177
+ "eval_mean_token_accuracy": 0.8486175815073344,
178
+ "eval_num_tokens": 1889564.0,
179
+ "eval_runtime": 90.2194,
180
+ "eval_samples_per_second": 18.366,
181
+ "eval_steps_per_second": 2.305,
182
+ "step": 770
183
+ },
184
+ {
185
+ "entropy": 0.4782189565088282,
186
+ "epoch": 2.077972709551657,
187
+ "grad_norm": 2.041952610015869,
188
+ "learning_rate": 0.0003676224484061175,
189
+ "loss": 1.7843829345703126,
190
+ "mean_token_accuracy": 0.8739750406250881,
191
+ "num_tokens": 1959778.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.4443667846918106,
196
+ "epoch": 2.207927225471085,
197
+ "grad_norm": 16.27313804626465,
198
+ "learning_rate": 0.00036427647571437996,
199
+ "loss": 1.6559255981445313,
200
+ "mean_token_accuracy": 0.8808386281132699,
201
+ "num_tokens": 2087384.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.44861202985048293,
206
+ "epoch": 2.3378817413905133,
207
+ "grad_norm": 1.648870587348938,
208
+ "learning_rate": 0.0003605733143425679,
209
+ "loss": 1.677943878173828,
210
+ "mean_token_accuracy": 0.879555520415306,
211
+ "num_tokens": 2211962.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.4568726105988026,
216
+ "epoch": 2.4678362573099415,
217
+ "grad_norm": 1.7573126554489136,
218
+ "learning_rate": 0.00035652057335991866,
219
+ "loss": 1.6760734558105468,
220
+ "mean_token_accuracy": 0.8791913360357284,
221
+ "num_tokens": 2334838.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.44863338857889173,
226
+ "epoch": 2.5977907732293697,
227
+ "grad_norm": 1.8639047145843506,
228
+ "learning_rate": 0.00035212658013422465,
229
+ "loss": 1.6799411010742187,
230
+ "mean_token_accuracy": 0.8790675121545791,
231
+ "num_tokens": 2461732.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.4585830120742321,
236
+ "epoch": 2.727745289148798,
237
+ "grad_norm": 1.9825985431671143,
238
+ "learning_rate": 0.0003474003632211781,
239
+ "loss": 1.7172026062011718,
240
+ "mean_token_accuracy": 0.8782495930790901,
241
+ "num_tokens": 2580026.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.45422692246735097,
246
+ "epoch": 2.857699805068226,
247
+ "grad_norm": 1.7149962186813354,
248
+ "learning_rate": 0.00034235163381294995,
249
+ "loss": 1.679084014892578,
250
+ "mean_token_accuracy": 0.8795321774482727,
251
+ "num_tokens": 2705600.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "entropy": 0.47297614574432373,
256
+ "epoch": 2.9876543209876543,
257
+ "grad_norm": 1.7435617446899414,
258
+ "learning_rate": 0.0003369907657841221,
259
+ "loss": 1.7386201477050782,
260
+ "mean_token_accuracy": 0.8779115182161331,
261
+ "num_tokens": 2822808.0,
262
+ "step": 1150
263
+ },
264
+ {
265
+ "epoch": 3.0,
266
+ "eval_entropy": 0.5031588454372607,
267
+ "eval_loss": 0.5551120638847351,
268
+ "eval_mean_token_accuracy": 0.8531603300227568,
269
+ "eval_num_tokens": 2834346.0,
270
+ "eval_runtime": 90.2397,
271
+ "eval_samples_per_second": 18.362,
272
+ "eval_steps_per_second": 2.305,
273
+ "step": 1155
274
+ },
275
+ {
276
+ "entropy": 0.37655152073457615,
277
+ "epoch": 3.116959064327485,
278
+ "grad_norm": 1.504384160041809,
279
+ "learning_rate": 0.0003313287743759729,
280
+ "loss": 1.3653451538085937,
281
+ "mean_token_accuracy": 0.8971295344769655,
282
+ "num_tokens": 2939773.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.37069276951253416,
287
+ "epoch": 3.246913580246914,
288
+ "grad_norm": 1.9665946960449219,
289
+ "learning_rate": 0.0003253772935629151,
290
+ "loss": 1.3458108520507812,
291
+ "mean_token_accuracy": 0.8982205548882485,
292
+ "num_tokens": 3063617.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.37295883789658546,
297
+ "epoch": 3.3768680961663415,
298
+ "grad_norm": 1.7501362562179565,
299
+ "learning_rate": 0.00031914855214759165,
300
+ "loss": 1.357562255859375,
301
+ "mean_token_accuracy": 0.8977113124728203,
302
+ "num_tokens": 3189800.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.3805788069963455,
307
+ "epoch": 3.50682261208577,
308
+ "grad_norm": 1.7277154922485352,
309
+ "learning_rate": 0.00031265534863374894,
310
+ "loss": 1.3735618591308594,
311
+ "mean_token_accuracy": 0.8962143072485924,
312
+ "num_tokens": 3311908.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.3840580120682716,
317
+ "epoch": 3.636777128005198,
318
+ "grad_norm": 2.2338802814483643,
319
+ "learning_rate": 0.0003059110249285165,
320
+ "loss": 1.3903216552734374,
321
+ "mean_token_accuracy": 0.8958476388454437,
322
+ "num_tokens": 3432934.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.37621145449578763,
327
+ "epoch": 3.7667316439246266,
328
+ "grad_norm": 1.9029661417007446,
329
+ "learning_rate": 0.00029892943892812944,
330
+ "loss": 1.3776657104492187,
331
+ "mean_token_accuracy": 0.8964926180243492,
332
+ "num_tokens": 3561408.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "entropy": 0.3784803995490074,
337
+ "epoch": 3.8966861598440543,
338
+ "grad_norm": 2.089708089828491,
339
+ "learning_rate": 0.00029172493604342163,
340
+ "loss": 1.3816807556152344,
341
+ "mean_token_accuracy": 0.8962833172082901,
342
+ "num_tokens": 3684624.0,
343
+ "step": 1500
344
+ },
345
+ {
346
+ "epoch": 4.0,
347
+ "eval_entropy": 0.4351254403591156,
348
+ "eval_loss": 0.5814722180366516,
349
+ "eval_mean_token_accuracy": 0.8530604747625498,
350
+ "eval_num_tokens": 3779128.0,
351
+ "eval_runtime": 90.2232,
352
+ "eval_samples_per_second": 18.366,
353
+ "eval_steps_per_second": 2.305,
354
+ "step": 1540
355
+ }
356
+ ],
357
+ "logging_steps": 50,
358
+ "max_steps": 3850,
359
+ "num_input_tokens_seen": 0,
360
+ "num_train_epochs": 10,
361
+ "save_steps": 500,
362
+ "stateful_callbacks": {
363
+ "TrainerControl": {
364
+ "args": {
365
+ "should_epoch_stop": false,
366
+ "should_evaluate": false,
367
+ "should_log": false,
368
+ "should_save": true,
369
+ "should_training_stop": false
370
+ },
371
+ "attributes": {}
372
+ }
373
+ },
374
+ "total_flos": 1.3259599564032195e+18,
375
+ "train_batch_size": 4,
376
+ "trial_name": null,
377
+ "trial_params": null
378
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.00985279561940916,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-1925/trainer_state.json ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1925,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.353258643448353,
14
+ "epoch": 0.1299545159194282,
15
+ "grad_norm": 3.010725975036621,
16
+ "learning_rate": 4.8475852375026876e-05,
17
+ "loss": 5.475971069335937,
18
+ "mean_token_accuracy": 0.7263440760970116,
19
+ "num_tokens": 128842.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.649170914888382,
24
+ "epoch": 0.2599090318388564,
25
+ "grad_norm": 1.9099390506744385,
26
+ "learning_rate": 9.794100785974817e-05,
27
+ "loss": 2.55168701171875,
28
+ "mean_token_accuracy": 0.8364580717682838,
29
+ "num_tokens": 255497.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5930788792669773,
34
+ "epoch": 0.3898635477582846,
35
+ "grad_norm": 2.1239051818847656,
36
+ "learning_rate": 0.0001474061633444695,
37
+ "loss": 2.3440716552734373,
38
+ "mean_token_accuracy": 0.8452290838956833,
39
+ "num_tokens": 372014.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.5564522063732147,
44
+ "epoch": 0.5198180636777128,
45
+ "grad_norm": 411.71807861328125,
46
+ "learning_rate": 0.00019687131882919077,
47
+ "loss": 2.2838446044921876,
48
+ "mean_token_accuracy": 0.8498487600684166,
49
+ "num_tokens": 500623.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.5539529167115689,
54
+ "epoch": 0.649772579597141,
55
+ "grad_norm": 2.1969902515411377,
56
+ "learning_rate": 0.0002463364743139121,
57
+ "loss": 2.675394287109375,
58
+ "mean_token_accuracy": 0.8430694487690925,
59
+ "num_tokens": 616223.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.5719467167556286,
64
+ "epoch": 0.7797270955165692,
65
+ "grad_norm": 1.98796546459198,
66
+ "learning_rate": 0.00029580162979863343,
67
+ "loss": 2.2434300231933593,
68
+ "mean_token_accuracy": 0.851241897046566,
69
+ "num_tokens": 737263.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.5502805083990097,
74
+ "epoch": 0.9096816114359974,
75
+ "grad_norm": 2.0211398601531982,
76
+ "learning_rate": 0.0003452667852833547,
77
+ "loss": 2.1729367065429686,
78
+ "mean_token_accuracy": 0.8554597494006156,
79
+ "num_tokens": 861477.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5580813550891784,
85
+ "eval_loss": 0.5830356478691101,
86
+ "eval_mean_token_accuracy": 0.8432669037809739,
87
+ "eval_num_tokens": 944782.0,
88
+ "eval_runtime": 90.3664,
89
+ "eval_samples_per_second": 18.336,
90
+ "eval_steps_per_second": 2.302,
91
+ "step": 385
92
+ },
93
+ {
94
+ "entropy": 0.5498402091725987,
95
+ "epoch": 1.0389863547758285,
96
+ "grad_norm": 3.8034188747406006,
97
+ "learning_rate": 0.000380866355527619,
98
+ "loss": 2.113946990966797,
99
+ "mean_token_accuracy": 0.8578129452676629,
100
+ "num_tokens": 982803.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.5182110907137394,
105
+ "epoch": 1.1689408706952567,
106
+ "grad_norm": 2.7830824851989746,
107
+ "learning_rate": 0.0003805611725593471,
108
+ "loss": 1.9833453369140626,
109
+ "mean_token_accuracy": 0.8656822636723518,
110
+ "num_tokens": 1105926.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.5260789206624031,
115
+ "epoch": 1.2988953866146848,
116
+ "grad_norm": 1.7993361949920654,
117
+ "learning_rate": 0.0003798653399371568,
118
+ "loss": 2.006897430419922,
119
+ "mean_token_accuracy": 0.8631055191159248,
120
+ "num_tokens": 1229857.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.5327546864748001,
125
+ "epoch": 1.428849902534113,
126
+ "grad_norm": 1.7606678009033203,
127
+ "learning_rate": 0.0003787802874228295,
128
+ "loss": 2.020283050537109,
129
+ "mean_token_accuracy": 0.8638329988718033,
130
+ "num_tokens": 1352330.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.5285360223054886,
135
+ "epoch": 1.5588044184535412,
136
+ "grad_norm": 4.76006555557251,
137
+ "learning_rate": 0.00037730824452755275,
138
+ "loss": 1.9987391662597656,
139
+ "mean_token_accuracy": 0.8644696187973022,
140
+ "num_tokens": 1474790.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.5134804363548756,
145
+ "epoch": 1.6887589343729694,
146
+ "grad_norm": 1.8447264432907104,
147
+ "learning_rate": 0.000375452235930833,
148
+ "loss": 1.9669386291503905,
149
+ "mean_token_accuracy": 0.8659948265552521,
150
+ "num_tokens": 1600381.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.5371069309115409,
155
+ "epoch": 1.8187134502923976,
156
+ "grad_norm": 1.6537392139434814,
157
+ "learning_rate": 0.00037321607526553675,
158
+ "loss": 2.0411550903320315,
159
+ "mean_token_accuracy": 0.8624854254722595,
160
+ "num_tokens": 1716827.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "entropy": 0.5270501750707627,
165
+ "epoch": 1.9486679662118258,
166
+ "grad_norm": 2.6990911960601807,
167
+ "learning_rate": 0.00037060435728183,
168
+ "loss": 2.015792236328125,
169
+ "mean_token_accuracy": 0.8631013777852058,
170
+ "num_tokens": 1842798.0,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 2.0,
175
+ "eval_entropy": 0.5477195472384875,
176
+ "eval_loss": 0.5585702657699585,
177
+ "eval_mean_token_accuracy": 0.8486175815073344,
178
+ "eval_num_tokens": 1889564.0,
179
+ "eval_runtime": 90.2194,
180
+ "eval_samples_per_second": 18.366,
181
+ "eval_steps_per_second": 2.305,
182
+ "step": 770
183
+ },
184
+ {
185
+ "entropy": 0.4782189565088282,
186
+ "epoch": 2.077972709551657,
187
+ "grad_norm": 2.041952610015869,
188
+ "learning_rate": 0.0003676224484061175,
189
+ "loss": 1.7843829345703126,
190
+ "mean_token_accuracy": 0.8739750406250881,
191
+ "num_tokens": 1959778.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.4443667846918106,
196
+ "epoch": 2.207927225471085,
197
+ "grad_norm": 16.27313804626465,
198
+ "learning_rate": 0.00036427647571437996,
199
+ "loss": 1.6559255981445313,
200
+ "mean_token_accuracy": 0.8808386281132699,
201
+ "num_tokens": 2087384.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.44861202985048293,
206
+ "epoch": 2.3378817413905133,
207
+ "grad_norm": 1.648870587348938,
208
+ "learning_rate": 0.0003605733143425679,
209
+ "loss": 1.677943878173828,
210
+ "mean_token_accuracy": 0.879555520415306,
211
+ "num_tokens": 2211962.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.4568726105988026,
216
+ "epoch": 2.4678362573099415,
217
+ "grad_norm": 1.7573126554489136,
218
+ "learning_rate": 0.00035652057335991866,
219
+ "loss": 1.6760734558105468,
220
+ "mean_token_accuracy": 0.8791913360357284,
221
+ "num_tokens": 2334838.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.44863338857889173,
226
+ "epoch": 2.5977907732293697,
227
+ "grad_norm": 1.8639047145843506,
228
+ "learning_rate": 0.00035212658013422465,
229
+ "loss": 1.6799411010742187,
230
+ "mean_token_accuracy": 0.8790675121545791,
231
+ "num_tokens": 2461732.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.4585830120742321,
236
+ "epoch": 2.727745289148798,
237
+ "grad_norm": 1.9825985431671143,
238
+ "learning_rate": 0.0003474003632211781,
239
+ "loss": 1.7172026062011718,
240
+ "mean_token_accuracy": 0.8782495930790901,
241
+ "num_tokens": 2580026.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.45422692246735097,
246
+ "epoch": 2.857699805068226,
247
+ "grad_norm": 1.7149962186813354,
248
+ "learning_rate": 0.00034235163381294995,
249
+ "loss": 1.679084014892578,
250
+ "mean_token_accuracy": 0.8795321774482727,
251
+ "num_tokens": 2705600.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "entropy": 0.47297614574432373,
256
+ "epoch": 2.9876543209876543,
257
+ "grad_norm": 1.7435617446899414,
258
+ "learning_rate": 0.0003369907657841221,
259
+ "loss": 1.7386201477050782,
260
+ "mean_token_accuracy": 0.8779115182161331,
261
+ "num_tokens": 2822808.0,
262
+ "step": 1150
263
+ },
264
+ {
265
+ "epoch": 3.0,
266
+ "eval_entropy": 0.5031588454372607,
267
+ "eval_loss": 0.5551120638847351,
268
+ "eval_mean_token_accuracy": 0.8531603300227568,
269
+ "eval_num_tokens": 2834346.0,
270
+ "eval_runtime": 90.2397,
271
+ "eval_samples_per_second": 18.362,
272
+ "eval_steps_per_second": 2.305,
273
+ "step": 1155
274
+ },
275
+ {
276
+ "entropy": 0.37655152073457615,
277
+ "epoch": 3.116959064327485,
278
+ "grad_norm": 1.504384160041809,
279
+ "learning_rate": 0.0003313287743759729,
280
+ "loss": 1.3653451538085937,
281
+ "mean_token_accuracy": 0.8971295344769655,
282
+ "num_tokens": 2939773.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.37069276951253416,
287
+ "epoch": 3.246913580246914,
288
+ "grad_norm": 1.9665946960449219,
289
+ "learning_rate": 0.0003253772935629151,
290
+ "loss": 1.3458108520507812,
291
+ "mean_token_accuracy": 0.8982205548882485,
292
+ "num_tokens": 3063617.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.37295883789658546,
297
+ "epoch": 3.3768680961663415,
298
+ "grad_norm": 1.7501362562179565,
299
+ "learning_rate": 0.00031914855214759165,
300
+ "loss": 1.357562255859375,
301
+ "mean_token_accuracy": 0.8977113124728203,
302
+ "num_tokens": 3189800.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.3805788069963455,
307
+ "epoch": 3.50682261208577,
308
+ "grad_norm": 1.7277154922485352,
309
+ "learning_rate": 0.00031265534863374894,
310
+ "loss": 1.3735618591308594,
311
+ "mean_token_accuracy": 0.8962143072485924,
312
+ "num_tokens": 3311908.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.3840580120682716,
317
+ "epoch": 3.636777128005198,
318
+ "grad_norm": 2.2338802814483643,
319
+ "learning_rate": 0.0003059110249285165,
320
+ "loss": 1.3903216552734374,
321
+ "mean_token_accuracy": 0.8958476388454437,
322
+ "num_tokens": 3432934.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.37621145449578763,
327
+ "epoch": 3.7667316439246266,
328
+ "grad_norm": 1.9029661417007446,
329
+ "learning_rate": 0.00029892943892812944,
330
+ "loss": 1.3776657104492187,
331
+ "mean_token_accuracy": 0.8964926180243492,
332
+ "num_tokens": 3561408.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "entropy": 0.3784803995490074,
337
+ "epoch": 3.8966861598440543,
338
+ "grad_norm": 2.089708089828491,
339
+ "learning_rate": 0.00029172493604342163,
340
+ "loss": 1.3816807556152344,
341
+ "mean_token_accuracy": 0.8962833172082901,
342
+ "num_tokens": 3684624.0,
343
+ "step": 1500
344
+ },
345
+ {
346
+ "epoch": 4.0,
347
+ "eval_entropy": 0.4351254403591156,
348
+ "eval_loss": 0.5814722180366516,
349
+ "eval_mean_token_accuracy": 0.8530604747625498,
350
+ "eval_num_tokens": 3779128.0,
351
+ "eval_runtime": 90.2232,
352
+ "eval_samples_per_second": 18.366,
353
+ "eval_steps_per_second": 2.305,
354
+ "step": 1540
355
+ },
356
+ {
357
+ "entropy": 0.36326556409423677,
358
+ "epoch": 4.025990903183885,
359
+ "grad_norm": 2.1354947090148926,
360
+ "learning_rate": 0.0002843123197235993,
361
+ "loss": 1.3295362854003907,
362
+ "mean_token_accuracy": 0.8993093811686913,
363
+ "num_tokens": 3804993.0,
364
+ "step": 1550
365
+ },
366
+ {
367
+ "entropy": 0.2879397062957287,
368
+ "epoch": 4.155945419103314,
369
+ "grad_norm": 2.201097011566162,
370
+ "learning_rate": 0.0002767068210388601,
371
+ "loss": 1.0272974395751953,
372
+ "mean_token_accuracy": 0.9182627710700035,
373
+ "num_tokens": 3928162.0,
374
+ "step": 1600
375
+ },
376
+ {
377
+ "entropy": 0.2848948486149311,
378
+ "epoch": 4.2858999350227425,
379
+ "grad_norm": 2.01479172706604,
380
+ "learning_rate": 0.000268924067384358,
381
+ "loss": 1.0278727722167968,
382
+ "mean_token_accuracy": 0.9194766515493393,
383
+ "num_tokens": 4049012.0,
384
+ "step": 1650
385
+ },
386
+ {
387
+ "entropy": 0.2940504560619593,
388
+ "epoch": 4.41585445094217,
389
+ "grad_norm": 2.0893027782440186,
390
+ "learning_rate": 0.00026098005036982003,
391
+ "loss": 1.0586751556396485,
392
+ "mean_token_accuracy": 0.9167885810136795,
393
+ "num_tokens": 4167845.0,
394
+ "step": 1700
395
+ },
396
+ {
397
+ "entropy": 0.293505182415247,
398
+ "epoch": 4.545808966861598,
399
+ "grad_norm": 1.6346389055252075,
400
+ "learning_rate": 0.0002528910929607928,
401
+ "loss": 1.0669570922851563,
402
+ "mean_token_accuracy": 0.9160876458883286,
403
+ "num_tokens": 4287505.0,
404
+ "step": 1750
405
+ },
406
+ {
407
+ "entropy": 0.2898535231500864,
408
+ "epoch": 4.675763482781027,
409
+ "grad_norm": 1.6645033359527588,
410
+ "learning_rate": 0.0002446738159390364,
411
+ "loss": 1.0582612609863282,
412
+ "mean_token_accuracy": 0.9177632886171341,
413
+ "num_tokens": 4412221.0,
414
+ "step": 1800
415
+ },
416
+ {
417
+ "entropy": 0.2842763290554285,
418
+ "epoch": 4.805717998700455,
419
+ "grad_norm": 2.4594268798828125,
420
+ "learning_rate": 0.0002363451037509798,
421
+ "loss": 1.0467537689208983,
422
+ "mean_token_accuracy": 0.9177608361840248,
423
+ "num_tokens": 4537178.0,
424
+ "step": 1850
425
+ },
426
+ {
427
+ "entropy": 0.284430123642087,
428
+ "epoch": 4.935672514619883,
429
+ "grad_norm": 2.1724514961242676,
430
+ "learning_rate": 0.00022792206981441223,
431
+ "loss": 1.0753899383544923,
432
+ "mean_token_accuracy": 0.915192686021328,
433
+ "num_tokens": 4664196.0,
434
+ "step": 1900
435
+ },
436
+ {
437
+ "epoch": 5.0,
438
+ "eval_entropy": 0.3632780872285366,
439
+ "eval_loss": 0.6438126564025879,
440
+ "eval_mean_token_accuracy": 0.8511462942338907,
441
+ "eval_num_tokens": 4723910.0,
442
+ "eval_runtime": 90.1846,
443
+ "eval_samples_per_second": 18.373,
444
+ "eval_steps_per_second": 2.306,
445
+ "step": 1925
446
+ }
447
+ ],
448
+ "logging_steps": 50,
449
+ "max_steps": 3850,
450
+ "num_input_tokens_seen": 0,
451
+ "num_train_epochs": 10,
452
+ "save_steps": 500,
453
+ "stateful_callbacks": {
454
+ "TrainerControl": {
455
+ "args": {
456
+ "should_epoch_stop": false,
457
+ "should_evaluate": false,
458
+ "should_log": false,
459
+ "should_save": true,
460
+ "should_training_stop": false
461
+ },
462
+ "attributes": {}
463
+ }
464
+ },
465
+ "total_flos": 1.6564080889424607e+18,
466
+ "train_batch_size": 4,
467
+ "trial_name": null,
468
+ "trial_params": null
469
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.00985279561940916,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2310/trainer_state.json ADDED
@@ -0,0 +1,560 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 6.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2310,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.353258643448353,
14
+ "epoch": 0.1299545159194282,
15
+ "grad_norm": 3.010725975036621,
16
+ "learning_rate": 4.8475852375026876e-05,
17
+ "loss": 5.475971069335937,
18
+ "mean_token_accuracy": 0.7263440760970116,
19
+ "num_tokens": 128842.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.649170914888382,
24
+ "epoch": 0.2599090318388564,
25
+ "grad_norm": 1.9099390506744385,
26
+ "learning_rate": 9.794100785974817e-05,
27
+ "loss": 2.55168701171875,
28
+ "mean_token_accuracy": 0.8364580717682838,
29
+ "num_tokens": 255497.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5930788792669773,
34
+ "epoch": 0.3898635477582846,
35
+ "grad_norm": 2.1239051818847656,
36
+ "learning_rate": 0.0001474061633444695,
37
+ "loss": 2.3440716552734373,
38
+ "mean_token_accuracy": 0.8452290838956833,
39
+ "num_tokens": 372014.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.5564522063732147,
44
+ "epoch": 0.5198180636777128,
45
+ "grad_norm": 411.71807861328125,
46
+ "learning_rate": 0.00019687131882919077,
47
+ "loss": 2.2838446044921876,
48
+ "mean_token_accuracy": 0.8498487600684166,
49
+ "num_tokens": 500623.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.5539529167115689,
54
+ "epoch": 0.649772579597141,
55
+ "grad_norm": 2.1969902515411377,
56
+ "learning_rate": 0.0002463364743139121,
57
+ "loss": 2.675394287109375,
58
+ "mean_token_accuracy": 0.8430694487690925,
59
+ "num_tokens": 616223.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.5719467167556286,
64
+ "epoch": 0.7797270955165692,
65
+ "grad_norm": 1.98796546459198,
66
+ "learning_rate": 0.00029580162979863343,
67
+ "loss": 2.2434300231933593,
68
+ "mean_token_accuracy": 0.851241897046566,
69
+ "num_tokens": 737263.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.5502805083990097,
74
+ "epoch": 0.9096816114359974,
75
+ "grad_norm": 2.0211398601531982,
76
+ "learning_rate": 0.0003452667852833547,
77
+ "loss": 2.1729367065429686,
78
+ "mean_token_accuracy": 0.8554597494006156,
79
+ "num_tokens": 861477.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5580813550891784,
85
+ "eval_loss": 0.5830356478691101,
86
+ "eval_mean_token_accuracy": 0.8432669037809739,
87
+ "eval_num_tokens": 944782.0,
88
+ "eval_runtime": 90.3664,
89
+ "eval_samples_per_second": 18.336,
90
+ "eval_steps_per_second": 2.302,
91
+ "step": 385
92
+ },
93
+ {
94
+ "entropy": 0.5498402091725987,
95
+ "epoch": 1.0389863547758285,
96
+ "grad_norm": 3.8034188747406006,
97
+ "learning_rate": 0.000380866355527619,
98
+ "loss": 2.113946990966797,
99
+ "mean_token_accuracy": 0.8578129452676629,
100
+ "num_tokens": 982803.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.5182110907137394,
105
+ "epoch": 1.1689408706952567,
106
+ "grad_norm": 2.7830824851989746,
107
+ "learning_rate": 0.0003805611725593471,
108
+ "loss": 1.9833453369140626,
109
+ "mean_token_accuracy": 0.8656822636723518,
110
+ "num_tokens": 1105926.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.5260789206624031,
115
+ "epoch": 1.2988953866146848,
116
+ "grad_norm": 1.7993361949920654,
117
+ "learning_rate": 0.0003798653399371568,
118
+ "loss": 2.006897430419922,
119
+ "mean_token_accuracy": 0.8631055191159248,
120
+ "num_tokens": 1229857.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.5327546864748001,
125
+ "epoch": 1.428849902534113,
126
+ "grad_norm": 1.7606678009033203,
127
+ "learning_rate": 0.0003787802874228295,
128
+ "loss": 2.020283050537109,
129
+ "mean_token_accuracy": 0.8638329988718033,
130
+ "num_tokens": 1352330.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.5285360223054886,
135
+ "epoch": 1.5588044184535412,
136
+ "grad_norm": 4.76006555557251,
137
+ "learning_rate": 0.00037730824452755275,
138
+ "loss": 1.9987391662597656,
139
+ "mean_token_accuracy": 0.8644696187973022,
140
+ "num_tokens": 1474790.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.5134804363548756,
145
+ "epoch": 1.6887589343729694,
146
+ "grad_norm": 1.8447264432907104,
147
+ "learning_rate": 0.000375452235930833,
148
+ "loss": 1.9669386291503905,
149
+ "mean_token_accuracy": 0.8659948265552521,
150
+ "num_tokens": 1600381.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.5371069309115409,
155
+ "epoch": 1.8187134502923976,
156
+ "grad_norm": 1.6537392139434814,
157
+ "learning_rate": 0.00037321607526553675,
158
+ "loss": 2.0411550903320315,
159
+ "mean_token_accuracy": 0.8624854254722595,
160
+ "num_tokens": 1716827.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "entropy": 0.5270501750707627,
165
+ "epoch": 1.9486679662118258,
166
+ "grad_norm": 2.6990911960601807,
167
+ "learning_rate": 0.00037060435728183,
168
+ "loss": 2.015792236328125,
169
+ "mean_token_accuracy": 0.8631013777852058,
170
+ "num_tokens": 1842798.0,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 2.0,
175
+ "eval_entropy": 0.5477195472384875,
176
+ "eval_loss": 0.5585702657699585,
177
+ "eval_mean_token_accuracy": 0.8486175815073344,
178
+ "eval_num_tokens": 1889564.0,
179
+ "eval_runtime": 90.2194,
180
+ "eval_samples_per_second": 18.366,
181
+ "eval_steps_per_second": 2.305,
182
+ "step": 770
183
+ },
184
+ {
185
+ "entropy": 0.4782189565088282,
186
+ "epoch": 2.077972709551657,
187
+ "grad_norm": 2.041952610015869,
188
+ "learning_rate": 0.0003676224484061175,
189
+ "loss": 1.7843829345703126,
190
+ "mean_token_accuracy": 0.8739750406250881,
191
+ "num_tokens": 1959778.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.4443667846918106,
196
+ "epoch": 2.207927225471085,
197
+ "grad_norm": 16.27313804626465,
198
+ "learning_rate": 0.00036427647571437996,
199
+ "loss": 1.6559255981445313,
200
+ "mean_token_accuracy": 0.8808386281132699,
201
+ "num_tokens": 2087384.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.44861202985048293,
206
+ "epoch": 2.3378817413905133,
207
+ "grad_norm": 1.648870587348938,
208
+ "learning_rate": 0.0003605733143425679,
209
+ "loss": 1.677943878173828,
210
+ "mean_token_accuracy": 0.879555520415306,
211
+ "num_tokens": 2211962.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.4568726105988026,
216
+ "epoch": 2.4678362573099415,
217
+ "grad_norm": 1.7573126554489136,
218
+ "learning_rate": 0.00035652057335991866,
219
+ "loss": 1.6760734558105468,
220
+ "mean_token_accuracy": 0.8791913360357284,
221
+ "num_tokens": 2334838.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.44863338857889173,
226
+ "epoch": 2.5977907732293697,
227
+ "grad_norm": 1.8639047145843506,
228
+ "learning_rate": 0.00035212658013422465,
229
+ "loss": 1.6799411010742187,
230
+ "mean_token_accuracy": 0.8790675121545791,
231
+ "num_tokens": 2461732.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.4585830120742321,
236
+ "epoch": 2.727745289148798,
237
+ "grad_norm": 1.9825985431671143,
238
+ "learning_rate": 0.0003474003632211781,
239
+ "loss": 1.7172026062011718,
240
+ "mean_token_accuracy": 0.8782495930790901,
241
+ "num_tokens": 2580026.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.45422692246735097,
246
+ "epoch": 2.857699805068226,
247
+ "grad_norm": 1.7149962186813354,
248
+ "learning_rate": 0.00034235163381294995,
249
+ "loss": 1.679084014892578,
250
+ "mean_token_accuracy": 0.8795321774482727,
251
+ "num_tokens": 2705600.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "entropy": 0.47297614574432373,
256
+ "epoch": 2.9876543209876543,
257
+ "grad_norm": 1.7435617446899414,
258
+ "learning_rate": 0.0003369907657841221,
259
+ "loss": 1.7386201477050782,
260
+ "mean_token_accuracy": 0.8779115182161331,
261
+ "num_tokens": 2822808.0,
262
+ "step": 1150
263
+ },
264
+ {
265
+ "epoch": 3.0,
266
+ "eval_entropy": 0.5031588454372607,
267
+ "eval_loss": 0.5551120638847351,
268
+ "eval_mean_token_accuracy": 0.8531603300227568,
269
+ "eval_num_tokens": 2834346.0,
270
+ "eval_runtime": 90.2397,
271
+ "eval_samples_per_second": 18.362,
272
+ "eval_steps_per_second": 2.305,
273
+ "step": 1155
274
+ },
275
+ {
276
+ "entropy": 0.37655152073457615,
277
+ "epoch": 3.116959064327485,
278
+ "grad_norm": 1.504384160041809,
279
+ "learning_rate": 0.0003313287743759729,
280
+ "loss": 1.3653451538085937,
281
+ "mean_token_accuracy": 0.8971295344769655,
282
+ "num_tokens": 2939773.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.37069276951253416,
287
+ "epoch": 3.246913580246914,
288
+ "grad_norm": 1.9665946960449219,
289
+ "learning_rate": 0.0003253772935629151,
290
+ "loss": 1.3458108520507812,
291
+ "mean_token_accuracy": 0.8982205548882485,
292
+ "num_tokens": 3063617.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.37295883789658546,
297
+ "epoch": 3.3768680961663415,
298
+ "grad_norm": 1.7501362562179565,
299
+ "learning_rate": 0.00031914855214759165,
300
+ "loss": 1.357562255859375,
301
+ "mean_token_accuracy": 0.8977113124728203,
302
+ "num_tokens": 3189800.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.3805788069963455,
307
+ "epoch": 3.50682261208577,
308
+ "grad_norm": 1.7277154922485352,
309
+ "learning_rate": 0.00031265534863374894,
310
+ "loss": 1.3735618591308594,
311
+ "mean_token_accuracy": 0.8962143072485924,
312
+ "num_tokens": 3311908.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.3840580120682716,
317
+ "epoch": 3.636777128005198,
318
+ "grad_norm": 2.2338802814483643,
319
+ "learning_rate": 0.0003059110249285165,
320
+ "loss": 1.3903216552734374,
321
+ "mean_token_accuracy": 0.8958476388454437,
322
+ "num_tokens": 3432934.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.37621145449578763,
327
+ "epoch": 3.7667316439246266,
328
+ "grad_norm": 1.9029661417007446,
329
+ "learning_rate": 0.00029892943892812944,
330
+ "loss": 1.3776657104492187,
331
+ "mean_token_accuracy": 0.8964926180243492,
332
+ "num_tokens": 3561408.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "entropy": 0.3784803995490074,
337
+ "epoch": 3.8966861598440543,
338
+ "grad_norm": 2.089708089828491,
339
+ "learning_rate": 0.00029172493604342163,
340
+ "loss": 1.3816807556152344,
341
+ "mean_token_accuracy": 0.8962833172082901,
342
+ "num_tokens": 3684624.0,
343
+ "step": 1500
344
+ },
345
+ {
346
+ "epoch": 4.0,
347
+ "eval_entropy": 0.4351254403591156,
348
+ "eval_loss": 0.5814722180366516,
349
+ "eval_mean_token_accuracy": 0.8530604747625498,
350
+ "eval_num_tokens": 3779128.0,
351
+ "eval_runtime": 90.2232,
352
+ "eval_samples_per_second": 18.366,
353
+ "eval_steps_per_second": 2.305,
354
+ "step": 1540
355
+ },
356
+ {
357
+ "entropy": 0.36326556409423677,
358
+ "epoch": 4.025990903183885,
359
+ "grad_norm": 2.1354947090148926,
360
+ "learning_rate": 0.0002843123197235993,
361
+ "loss": 1.3295362854003907,
362
+ "mean_token_accuracy": 0.8993093811686913,
363
+ "num_tokens": 3804993.0,
364
+ "step": 1550
365
+ },
366
+ {
367
+ "entropy": 0.2879397062957287,
368
+ "epoch": 4.155945419103314,
369
+ "grad_norm": 2.201097011566162,
370
+ "learning_rate": 0.0002767068210388601,
371
+ "loss": 1.0272974395751953,
372
+ "mean_token_accuracy": 0.9182627710700035,
373
+ "num_tokens": 3928162.0,
374
+ "step": 1600
375
+ },
376
+ {
377
+ "entropy": 0.2848948486149311,
378
+ "epoch": 4.2858999350227425,
379
+ "grad_norm": 2.01479172706604,
380
+ "learning_rate": 0.000268924067384358,
381
+ "loss": 1.0278727722167968,
382
+ "mean_token_accuracy": 0.9194766515493393,
383
+ "num_tokens": 4049012.0,
384
+ "step": 1650
385
+ },
386
+ {
387
+ "entropy": 0.2940504560619593,
388
+ "epoch": 4.41585445094217,
389
+ "grad_norm": 2.0893027782440186,
390
+ "learning_rate": 0.00026098005036982003,
391
+ "loss": 1.0586751556396485,
392
+ "mean_token_accuracy": 0.9167885810136795,
393
+ "num_tokens": 4167845.0,
394
+ "step": 1700
395
+ },
396
+ {
397
+ "entropy": 0.293505182415247,
398
+ "epoch": 4.545808966861598,
399
+ "grad_norm": 1.6346389055252075,
400
+ "learning_rate": 0.0002528910929607928,
401
+ "loss": 1.0669570922851563,
402
+ "mean_token_accuracy": 0.9160876458883286,
403
+ "num_tokens": 4287505.0,
404
+ "step": 1750
405
+ },
406
+ {
407
+ "entropy": 0.2898535231500864,
408
+ "epoch": 4.675763482781027,
409
+ "grad_norm": 1.6645033359527588,
410
+ "learning_rate": 0.0002446738159390364,
411
+ "loss": 1.0582612609863282,
412
+ "mean_token_accuracy": 0.9177632886171341,
413
+ "num_tokens": 4412221.0,
414
+ "step": 1800
415
+ },
416
+ {
417
+ "entropy": 0.2842763290554285,
418
+ "epoch": 4.805717998700455,
419
+ "grad_norm": 2.4594268798828125,
420
+ "learning_rate": 0.0002363451037509798,
421
+ "loss": 1.0467537689208983,
422
+ "mean_token_accuracy": 0.9177608361840248,
423
+ "num_tokens": 4537178.0,
424
+ "step": 1850
425
+ },
426
+ {
427
+ "entropy": 0.284430123642087,
428
+ "epoch": 4.935672514619883,
429
+ "grad_norm": 2.1724514961242676,
430
+ "learning_rate": 0.00022792206981441223,
431
+ "loss": 1.0753899383544923,
432
+ "mean_token_accuracy": 0.915192686021328,
433
+ "num_tokens": 4664196.0,
434
+ "step": 1900
435
+ },
436
+ {
437
+ "epoch": 5.0,
438
+ "eval_entropy": 0.3632780872285366,
439
+ "eval_loss": 0.6438126564025879,
440
+ "eval_mean_token_accuracy": 0.8511462942338907,
441
+ "eval_num_tokens": 4723910.0,
442
+ "eval_runtime": 90.1846,
443
+ "eval_samples_per_second": 18.373,
444
+ "eval_steps_per_second": 2.306,
445
+ "step": 1925
446
+ },
447
+ {
448
+ "entropy": 0.23515464736139355,
449
+ "epoch": 5.064977257959714,
450
+ "grad_norm": 1.651587724685669,
451
+ "learning_rate": 0.00021942202135469513,
452
+ "loss": 0.8597064971923828,
453
+ "mean_token_accuracy": 0.9324622603517082,
454
+ "num_tokens": 4789568.0,
455
+ "step": 1950
456
+ },
457
+ {
458
+ "entropy": 0.1958953895419836,
459
+ "epoch": 5.1949317738791425,
460
+ "grad_norm": 1.923292636871338,
461
+ "learning_rate": 0.0002108624238427481,
462
+ "loss": 0.7188112640380859,
463
+ "mean_token_accuracy": 0.9416415295004845,
464
+ "num_tokens": 4913407.0,
465
+ "step": 2000
466
+ },
467
+ {
468
+ "entropy": 0.21068542070686816,
469
+ "epoch": 5.32488628979857,
470
+ "grad_norm": 2.299356460571289,
471
+ "learning_rate": 0.0002022608651078804,
472
+ "loss": 0.7712985229492187,
473
+ "mean_token_accuracy": 0.9386440163850784,
474
+ "num_tokens": 5032951.0,
475
+ "step": 2050
476
+ },
477
+ {
478
+ "entropy": 0.21234643168747425,
479
+ "epoch": 5.454840805717999,
480
+ "grad_norm": 2.2119295597076416,
481
+ "learning_rate": 0.00019363501919920608,
482
+ "loss": 0.7650181579589844,
483
+ "mean_token_accuracy": 0.938471505343914,
484
+ "num_tokens": 5156908.0,
485
+ "step": 2100
486
+ },
487
+ {
488
+ "entropy": 0.21658269092440605,
489
+ "epoch": 5.584795321637427,
490
+ "grad_norm": 1.5394288301467896,
491
+ "learning_rate": 0.00018500261006989887,
492
+ "loss": 0.7784209442138672,
493
+ "mean_token_accuracy": 0.9371598136425018,
494
+ "num_tokens": 5276087.0,
495
+ "step": 2150
496
+ },
497
+ {
498
+ "entropy": 0.2045296123996377,
499
+ "epoch": 5.714749837556855,
500
+ "grad_norm": 1.913680076599121,
501
+ "learning_rate": 0.00017638137515890763,
502
+ "loss": 0.7638166046142578,
503
+ "mean_token_accuracy": 0.9378301629424095,
504
+ "num_tokens": 5398787.0,
505
+ "step": 2200
506
+ },
507
+ {
508
+ "entropy": 0.20917976945638656,
509
+ "epoch": 5.844704353476283,
510
+ "grad_norm": 2.0847299098968506,
511
+ "learning_rate": 0.00016778902894496063,
512
+ "loss": 0.7631703186035156,
513
+ "mean_token_accuracy": 0.9387557968497277,
514
+ "num_tokens": 5522332.0,
515
+ "step": 2250
516
+ },
517
+ {
518
+ "entropy": 0.22262076318264007,
519
+ "epoch": 5.974658869395712,
520
+ "grad_norm": 2.1597352027893066,
521
+ "learning_rate": 0.0001592432265477485,
522
+ "loss": 0.798133773803711,
523
+ "mean_token_accuracy": 0.936034984588623,
524
+ "num_tokens": 5642361.0,
525
+ "step": 2300
526
+ },
527
+ {
528
+ "epoch": 6.0,
529
+ "eval_entropy": 0.31502799331568754,
530
+ "eval_loss": 0.7417300343513489,
531
+ "eval_mean_token_accuracy": 0.8477253922476218,
532
+ "eval_num_tokens": 5668692.0,
533
+ "eval_runtime": 90.4252,
534
+ "eval_samples_per_second": 18.325,
535
+ "eval_steps_per_second": 2.3,
536
+ "step": 2310
537
+ }
538
+ ],
539
+ "logging_steps": 50,
540
+ "max_steps": 3850,
541
+ "num_input_tokens_seen": 0,
542
+ "num_train_epochs": 10,
543
+ "save_steps": 500,
544
+ "stateful_callbacks": {
545
+ "TrainerControl": {
546
+ "args": {
547
+ "should_epoch_stop": false,
548
+ "should_evaluate": false,
549
+ "should_log": false,
550
+ "should_save": true,
551
+ "should_training_stop": false
552
+ },
553
+ "attributes": {}
554
+ }
555
+ },
556
+ "total_flos": 1.9871331143277489e+18,
557
+ "train_batch_size": 4,
558
+ "trial_name": null,
559
+ "trial_params": null
560
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.00985279561940916,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-2695/trainer_state.json ADDED
@@ -0,0 +1,641 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 7.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2695,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.353258643448353,
14
+ "epoch": 0.1299545159194282,
15
+ "grad_norm": 3.010725975036621,
16
+ "learning_rate": 4.8475852375026876e-05,
17
+ "loss": 5.475971069335937,
18
+ "mean_token_accuracy": 0.7263440760970116,
19
+ "num_tokens": 128842.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.649170914888382,
24
+ "epoch": 0.2599090318388564,
25
+ "grad_norm": 1.9099390506744385,
26
+ "learning_rate": 9.794100785974817e-05,
27
+ "loss": 2.55168701171875,
28
+ "mean_token_accuracy": 0.8364580717682838,
29
+ "num_tokens": 255497.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5930788792669773,
34
+ "epoch": 0.3898635477582846,
35
+ "grad_norm": 2.1239051818847656,
36
+ "learning_rate": 0.0001474061633444695,
37
+ "loss": 2.3440716552734373,
38
+ "mean_token_accuracy": 0.8452290838956833,
39
+ "num_tokens": 372014.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.5564522063732147,
44
+ "epoch": 0.5198180636777128,
45
+ "grad_norm": 411.71807861328125,
46
+ "learning_rate": 0.00019687131882919077,
47
+ "loss": 2.2838446044921876,
48
+ "mean_token_accuracy": 0.8498487600684166,
49
+ "num_tokens": 500623.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.5539529167115689,
54
+ "epoch": 0.649772579597141,
55
+ "grad_norm": 2.1969902515411377,
56
+ "learning_rate": 0.0002463364743139121,
57
+ "loss": 2.675394287109375,
58
+ "mean_token_accuracy": 0.8430694487690925,
59
+ "num_tokens": 616223.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.5719467167556286,
64
+ "epoch": 0.7797270955165692,
65
+ "grad_norm": 1.98796546459198,
66
+ "learning_rate": 0.00029580162979863343,
67
+ "loss": 2.2434300231933593,
68
+ "mean_token_accuracy": 0.851241897046566,
69
+ "num_tokens": 737263.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.5502805083990097,
74
+ "epoch": 0.9096816114359974,
75
+ "grad_norm": 2.0211398601531982,
76
+ "learning_rate": 0.0003452667852833547,
77
+ "loss": 2.1729367065429686,
78
+ "mean_token_accuracy": 0.8554597494006156,
79
+ "num_tokens": 861477.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5580813550891784,
85
+ "eval_loss": 0.5830356478691101,
86
+ "eval_mean_token_accuracy": 0.8432669037809739,
87
+ "eval_num_tokens": 944782.0,
88
+ "eval_runtime": 90.3664,
89
+ "eval_samples_per_second": 18.336,
90
+ "eval_steps_per_second": 2.302,
91
+ "step": 385
92
+ },
93
+ {
94
+ "entropy": 0.5498402091725987,
95
+ "epoch": 1.0389863547758285,
96
+ "grad_norm": 3.8034188747406006,
97
+ "learning_rate": 0.000380866355527619,
98
+ "loss": 2.113946990966797,
99
+ "mean_token_accuracy": 0.8578129452676629,
100
+ "num_tokens": 982803.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.5182110907137394,
105
+ "epoch": 1.1689408706952567,
106
+ "grad_norm": 2.7830824851989746,
107
+ "learning_rate": 0.0003805611725593471,
108
+ "loss": 1.9833453369140626,
109
+ "mean_token_accuracy": 0.8656822636723518,
110
+ "num_tokens": 1105926.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.5260789206624031,
115
+ "epoch": 1.2988953866146848,
116
+ "grad_norm": 1.7993361949920654,
117
+ "learning_rate": 0.0003798653399371568,
118
+ "loss": 2.006897430419922,
119
+ "mean_token_accuracy": 0.8631055191159248,
120
+ "num_tokens": 1229857.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.5327546864748001,
125
+ "epoch": 1.428849902534113,
126
+ "grad_norm": 1.7606678009033203,
127
+ "learning_rate": 0.0003787802874228295,
128
+ "loss": 2.020283050537109,
129
+ "mean_token_accuracy": 0.8638329988718033,
130
+ "num_tokens": 1352330.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.5285360223054886,
135
+ "epoch": 1.5588044184535412,
136
+ "grad_norm": 4.76006555557251,
137
+ "learning_rate": 0.00037730824452755275,
138
+ "loss": 1.9987391662597656,
139
+ "mean_token_accuracy": 0.8644696187973022,
140
+ "num_tokens": 1474790.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.5134804363548756,
145
+ "epoch": 1.6887589343729694,
146
+ "grad_norm": 1.8447264432907104,
147
+ "learning_rate": 0.000375452235930833,
148
+ "loss": 1.9669386291503905,
149
+ "mean_token_accuracy": 0.8659948265552521,
150
+ "num_tokens": 1600381.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.5371069309115409,
155
+ "epoch": 1.8187134502923976,
156
+ "grad_norm": 1.6537392139434814,
157
+ "learning_rate": 0.00037321607526553675,
158
+ "loss": 2.0411550903320315,
159
+ "mean_token_accuracy": 0.8624854254722595,
160
+ "num_tokens": 1716827.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "entropy": 0.5270501750707627,
165
+ "epoch": 1.9486679662118258,
166
+ "grad_norm": 2.6990911960601807,
167
+ "learning_rate": 0.00037060435728183,
168
+ "loss": 2.015792236328125,
169
+ "mean_token_accuracy": 0.8631013777852058,
170
+ "num_tokens": 1842798.0,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 2.0,
175
+ "eval_entropy": 0.5477195472384875,
176
+ "eval_loss": 0.5585702657699585,
177
+ "eval_mean_token_accuracy": 0.8486175815073344,
178
+ "eval_num_tokens": 1889564.0,
179
+ "eval_runtime": 90.2194,
180
+ "eval_samples_per_second": 18.366,
181
+ "eval_steps_per_second": 2.305,
182
+ "step": 770
183
+ },
184
+ {
185
+ "entropy": 0.4782189565088282,
186
+ "epoch": 2.077972709551657,
187
+ "grad_norm": 2.041952610015869,
188
+ "learning_rate": 0.0003676224484061175,
189
+ "loss": 1.7843829345703126,
190
+ "mean_token_accuracy": 0.8739750406250881,
191
+ "num_tokens": 1959778.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.4443667846918106,
196
+ "epoch": 2.207927225471085,
197
+ "grad_norm": 16.27313804626465,
198
+ "learning_rate": 0.00036427647571437996,
199
+ "loss": 1.6559255981445313,
200
+ "mean_token_accuracy": 0.8808386281132699,
201
+ "num_tokens": 2087384.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.44861202985048293,
206
+ "epoch": 2.3378817413905133,
207
+ "grad_norm": 1.648870587348938,
208
+ "learning_rate": 0.0003605733143425679,
209
+ "loss": 1.677943878173828,
210
+ "mean_token_accuracy": 0.879555520415306,
211
+ "num_tokens": 2211962.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.4568726105988026,
216
+ "epoch": 2.4678362573099415,
217
+ "grad_norm": 1.7573126554489136,
218
+ "learning_rate": 0.00035652057335991866,
219
+ "loss": 1.6760734558105468,
220
+ "mean_token_accuracy": 0.8791913360357284,
221
+ "num_tokens": 2334838.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.44863338857889173,
226
+ "epoch": 2.5977907732293697,
227
+ "grad_norm": 1.8639047145843506,
228
+ "learning_rate": 0.00035212658013422465,
229
+ "loss": 1.6799411010742187,
230
+ "mean_token_accuracy": 0.8790675121545791,
231
+ "num_tokens": 2461732.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.4585830120742321,
236
+ "epoch": 2.727745289148798,
237
+ "grad_norm": 1.9825985431671143,
238
+ "learning_rate": 0.0003474003632211781,
239
+ "loss": 1.7172026062011718,
240
+ "mean_token_accuracy": 0.8782495930790901,
241
+ "num_tokens": 2580026.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.45422692246735097,
246
+ "epoch": 2.857699805068226,
247
+ "grad_norm": 1.7149962186813354,
248
+ "learning_rate": 0.00034235163381294995,
249
+ "loss": 1.679084014892578,
250
+ "mean_token_accuracy": 0.8795321774482727,
251
+ "num_tokens": 2705600.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "entropy": 0.47297614574432373,
256
+ "epoch": 2.9876543209876543,
257
+ "grad_norm": 1.7435617446899414,
258
+ "learning_rate": 0.0003369907657841221,
259
+ "loss": 1.7386201477050782,
260
+ "mean_token_accuracy": 0.8779115182161331,
261
+ "num_tokens": 2822808.0,
262
+ "step": 1150
263
+ },
264
+ {
265
+ "epoch": 3.0,
266
+ "eval_entropy": 0.5031588454372607,
267
+ "eval_loss": 0.5551120638847351,
268
+ "eval_mean_token_accuracy": 0.8531603300227568,
269
+ "eval_num_tokens": 2834346.0,
270
+ "eval_runtime": 90.2397,
271
+ "eval_samples_per_second": 18.362,
272
+ "eval_steps_per_second": 2.305,
273
+ "step": 1155
274
+ },
275
+ {
276
+ "entropy": 0.37655152073457615,
277
+ "epoch": 3.116959064327485,
278
+ "grad_norm": 1.504384160041809,
279
+ "learning_rate": 0.0003313287743759729,
280
+ "loss": 1.3653451538085937,
281
+ "mean_token_accuracy": 0.8971295344769655,
282
+ "num_tokens": 2939773.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.37069276951253416,
287
+ "epoch": 3.246913580246914,
288
+ "grad_norm": 1.9665946960449219,
289
+ "learning_rate": 0.0003253772935629151,
290
+ "loss": 1.3458108520507812,
291
+ "mean_token_accuracy": 0.8982205548882485,
292
+ "num_tokens": 3063617.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.37295883789658546,
297
+ "epoch": 3.3768680961663415,
298
+ "grad_norm": 1.7501362562179565,
299
+ "learning_rate": 0.00031914855214759165,
300
+ "loss": 1.357562255859375,
301
+ "mean_token_accuracy": 0.8977113124728203,
302
+ "num_tokens": 3189800.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.3805788069963455,
307
+ "epoch": 3.50682261208577,
308
+ "grad_norm": 1.7277154922485352,
309
+ "learning_rate": 0.00031265534863374894,
310
+ "loss": 1.3735618591308594,
311
+ "mean_token_accuracy": 0.8962143072485924,
312
+ "num_tokens": 3311908.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.3840580120682716,
317
+ "epoch": 3.636777128005198,
318
+ "grad_norm": 2.2338802814483643,
319
+ "learning_rate": 0.0003059110249285165,
320
+ "loss": 1.3903216552734374,
321
+ "mean_token_accuracy": 0.8958476388454437,
322
+ "num_tokens": 3432934.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.37621145449578763,
327
+ "epoch": 3.7667316439246266,
328
+ "grad_norm": 1.9029661417007446,
329
+ "learning_rate": 0.00029892943892812944,
330
+ "loss": 1.3776657104492187,
331
+ "mean_token_accuracy": 0.8964926180243492,
332
+ "num_tokens": 3561408.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "entropy": 0.3784803995490074,
337
+ "epoch": 3.8966861598440543,
338
+ "grad_norm": 2.089708089828491,
339
+ "learning_rate": 0.00029172493604342163,
340
+ "loss": 1.3816807556152344,
341
+ "mean_token_accuracy": 0.8962833172082901,
342
+ "num_tokens": 3684624.0,
343
+ "step": 1500
344
+ },
345
+ {
346
+ "epoch": 4.0,
347
+ "eval_entropy": 0.4351254403591156,
348
+ "eval_loss": 0.5814722180366516,
349
+ "eval_mean_token_accuracy": 0.8530604747625498,
350
+ "eval_num_tokens": 3779128.0,
351
+ "eval_runtime": 90.2232,
352
+ "eval_samples_per_second": 18.366,
353
+ "eval_steps_per_second": 2.305,
354
+ "step": 1540
355
+ },
356
+ {
357
+ "entropy": 0.36326556409423677,
358
+ "epoch": 4.025990903183885,
359
+ "grad_norm": 2.1354947090148926,
360
+ "learning_rate": 0.0002843123197235993,
361
+ "loss": 1.3295362854003907,
362
+ "mean_token_accuracy": 0.8993093811686913,
363
+ "num_tokens": 3804993.0,
364
+ "step": 1550
365
+ },
366
+ {
367
+ "entropy": 0.2879397062957287,
368
+ "epoch": 4.155945419103314,
369
+ "grad_norm": 2.201097011566162,
370
+ "learning_rate": 0.0002767068210388601,
371
+ "loss": 1.0272974395751953,
372
+ "mean_token_accuracy": 0.9182627710700035,
373
+ "num_tokens": 3928162.0,
374
+ "step": 1600
375
+ },
376
+ {
377
+ "entropy": 0.2848948486149311,
378
+ "epoch": 4.2858999350227425,
379
+ "grad_norm": 2.01479172706604,
380
+ "learning_rate": 0.000268924067384358,
381
+ "loss": 1.0278727722167968,
382
+ "mean_token_accuracy": 0.9194766515493393,
383
+ "num_tokens": 4049012.0,
384
+ "step": 1650
385
+ },
386
+ {
387
+ "entropy": 0.2940504560619593,
388
+ "epoch": 4.41585445094217,
389
+ "grad_norm": 2.0893027782440186,
390
+ "learning_rate": 0.00026098005036982003,
391
+ "loss": 1.0586751556396485,
392
+ "mean_token_accuracy": 0.9167885810136795,
393
+ "num_tokens": 4167845.0,
394
+ "step": 1700
395
+ },
396
+ {
397
+ "entropy": 0.293505182415247,
398
+ "epoch": 4.545808966861598,
399
+ "grad_norm": 1.6346389055252075,
400
+ "learning_rate": 0.0002528910929607928,
401
+ "loss": 1.0669570922851563,
402
+ "mean_token_accuracy": 0.9160876458883286,
403
+ "num_tokens": 4287505.0,
404
+ "step": 1750
405
+ },
406
+ {
407
+ "entropy": 0.2898535231500864,
408
+ "epoch": 4.675763482781027,
409
+ "grad_norm": 1.6645033359527588,
410
+ "learning_rate": 0.0002446738159390364,
411
+ "loss": 1.0582612609863282,
412
+ "mean_token_accuracy": 0.9177632886171341,
413
+ "num_tokens": 4412221.0,
414
+ "step": 1800
415
+ },
416
+ {
417
+ "entropy": 0.2842763290554285,
418
+ "epoch": 4.805717998700455,
419
+ "grad_norm": 2.4594268798828125,
420
+ "learning_rate": 0.0002363451037509798,
421
+ "loss": 1.0467537689208983,
422
+ "mean_token_accuracy": 0.9177608361840248,
423
+ "num_tokens": 4537178.0,
424
+ "step": 1850
425
+ },
426
+ {
427
+ "entropy": 0.284430123642087,
428
+ "epoch": 4.935672514619883,
429
+ "grad_norm": 2.1724514961242676,
430
+ "learning_rate": 0.00022792206981441223,
431
+ "loss": 1.0753899383544923,
432
+ "mean_token_accuracy": 0.915192686021328,
433
+ "num_tokens": 4664196.0,
434
+ "step": 1900
435
+ },
436
+ {
437
+ "epoch": 5.0,
438
+ "eval_entropy": 0.3632780872285366,
439
+ "eval_loss": 0.6438126564025879,
440
+ "eval_mean_token_accuracy": 0.8511462942338907,
441
+ "eval_num_tokens": 4723910.0,
442
+ "eval_runtime": 90.1846,
443
+ "eval_samples_per_second": 18.373,
444
+ "eval_steps_per_second": 2.306,
445
+ "step": 1925
446
+ },
447
+ {
448
+ "entropy": 0.23515464736139355,
449
+ "epoch": 5.064977257959714,
450
+ "grad_norm": 1.651587724685669,
451
+ "learning_rate": 0.00021942202135469513,
452
+ "loss": 0.8597064971923828,
453
+ "mean_token_accuracy": 0.9324622603517082,
454
+ "num_tokens": 4789568.0,
455
+ "step": 1950
456
+ },
457
+ {
458
+ "entropy": 0.1958953895419836,
459
+ "epoch": 5.1949317738791425,
460
+ "grad_norm": 1.923292636871338,
461
+ "learning_rate": 0.0002108624238427481,
462
+ "loss": 0.7188112640380859,
463
+ "mean_token_accuracy": 0.9416415295004845,
464
+ "num_tokens": 4913407.0,
465
+ "step": 2000
466
+ },
467
+ {
468
+ "entropy": 0.21068542070686816,
469
+ "epoch": 5.32488628979857,
470
+ "grad_norm": 2.299356460571289,
471
+ "learning_rate": 0.0002022608651078804,
472
+ "loss": 0.7712985229492187,
473
+ "mean_token_accuracy": 0.9386440163850784,
474
+ "num_tokens": 5032951.0,
475
+ "step": 2050
476
+ },
477
+ {
478
+ "entropy": 0.21234643168747425,
479
+ "epoch": 5.454840805717999,
480
+ "grad_norm": 2.2119295597076416,
481
+ "learning_rate": 0.00019363501919920608,
482
+ "loss": 0.7650181579589844,
483
+ "mean_token_accuracy": 0.938471505343914,
484
+ "num_tokens": 5156908.0,
485
+ "step": 2100
486
+ },
487
+ {
488
+ "entropy": 0.21658269092440605,
489
+ "epoch": 5.584795321637427,
490
+ "grad_norm": 1.5394288301467896,
491
+ "learning_rate": 0.00018500261006989887,
492
+ "loss": 0.7784209442138672,
493
+ "mean_token_accuracy": 0.9371598136425018,
494
+ "num_tokens": 5276087.0,
495
+ "step": 2150
496
+ },
497
+ {
498
+ "entropy": 0.2045296123996377,
499
+ "epoch": 5.714749837556855,
500
+ "grad_norm": 1.913680076599121,
501
+ "learning_rate": 0.00017638137515890763,
502
+ "loss": 0.7638166046142578,
503
+ "mean_token_accuracy": 0.9378301629424095,
504
+ "num_tokens": 5398787.0,
505
+ "step": 2200
506
+ },
507
+ {
508
+ "entropy": 0.20917976945638656,
509
+ "epoch": 5.844704353476283,
510
+ "grad_norm": 2.0847299098968506,
511
+ "learning_rate": 0.00016778902894496063,
512
+ "loss": 0.7631703186035156,
513
+ "mean_token_accuracy": 0.9387557968497277,
514
+ "num_tokens": 5522332.0,
515
+ "step": 2250
516
+ },
517
+ {
518
+ "entropy": 0.22262076318264007,
519
+ "epoch": 5.974658869395712,
520
+ "grad_norm": 2.1597352027893066,
521
+ "learning_rate": 0.0001592432265477485,
522
+ "loss": 0.798133773803711,
523
+ "mean_token_accuracy": 0.936034984588623,
524
+ "num_tokens": 5642361.0,
525
+ "step": 2300
526
+ },
527
+ {
528
+ "epoch": 6.0,
529
+ "eval_entropy": 0.31502799331568754,
530
+ "eval_loss": 0.7417300343513489,
531
+ "eval_mean_token_accuracy": 0.8477253922476218,
532
+ "eval_num_tokens": 5668692.0,
533
+ "eval_runtime": 90.4252,
534
+ "eval_samples_per_second": 18.325,
535
+ "eval_steps_per_second": 2.3,
536
+ "step": 2310
537
+ },
538
+ {
539
+ "entropy": 0.16796037876725795,
540
+ "epoch": 6.1039636127355426,
541
+ "grad_norm": 2.2228569984436035,
542
+ "learning_rate": 0.00015076152745107442,
543
+ "loss": 0.5835284805297851,
544
+ "mean_token_accuracy": 0.9529892874123463,
545
+ "num_tokens": 5766129.0,
546
+ "step": 2350
547
+ },
548
+ {
549
+ "entropy": 0.14919219192117453,
550
+ "epoch": 6.23391812865497,
551
+ "grad_norm": 1.408840298652649,
552
+ "learning_rate": 0.00014236135942251215,
553
+ "loss": 0.5310631561279296,
554
+ "mean_token_accuracy": 0.9586454060673714,
555
+ "num_tokens": 5888746.0,
556
+ "step": 2400
557
+ },
558
+ {
559
+ "entropy": 0.1499051059409976,
560
+ "epoch": 6.363872644574399,
561
+ "grad_norm": 1.8611102104187012,
562
+ "learning_rate": 0.00013405998270370849,
563
+ "loss": 0.5127810668945313,
564
+ "mean_token_accuracy": 0.9591325157880783,
565
+ "num_tokens": 6014455.0,
566
+ "step": 2450
567
+ },
568
+ {
569
+ "entropy": 0.15334193099290133,
570
+ "epoch": 6.493827160493828,
571
+ "grad_norm": 1.6051015853881836,
572
+ "learning_rate": 0.00012587445454490892,
573
+ "loss": 0.5349758529663086,
574
+ "mean_token_accuracy": 0.9574431091547012,
575
+ "num_tokens": 6141229.0,
576
+ "step": 2500
577
+ },
578
+ {
579
+ "entropy": 0.15982334002852439,
580
+ "epoch": 6.623781676413255,
581
+ "grad_norm": 3.7065205574035645,
582
+ "learning_rate": 0.00011782159415658008,
583
+ "loss": 0.5602469253540039,
584
+ "mean_token_accuracy": 0.9555372184515,
585
+ "num_tokens": 6257983.0,
586
+ "step": 2550
587
+ },
588
+ {
589
+ "entropy": 0.16072992872446776,
590
+ "epoch": 6.753736192332683,
591
+ "grad_norm": 2.282320976257324,
592
+ "learning_rate": 0.00010991794815014401,
593
+ "loss": 0.5657939910888672,
594
+ "mean_token_accuracy": 0.9550630164146423,
595
+ "num_tokens": 6376198.0,
596
+ "step": 2600
597
+ },
598
+ {
599
+ "entropy": 0.1512781011685729,
600
+ "epoch": 6.883690708252112,
601
+ "grad_norm": 1.3716893196105957,
602
+ "learning_rate": 0.00010217975653883603,
603
+ "loss": 0.5340792465209961,
604
+ "mean_token_accuracy": 0.9578188157081604,
605
+ "num_tokens": 6502526.0,
606
+ "step": 2650
607
+ },
608
+ {
609
+ "epoch": 7.0,
610
+ "eval_entropy": 0.2444461930829745,
611
+ "eval_loss": 0.8798949718475342,
612
+ "eval_mean_token_accuracy": 0.8457763839799625,
613
+ "eval_num_tokens": 6613474.0,
614
+ "eval_runtime": 90.2868,
615
+ "eval_samples_per_second": 18.353,
616
+ "eval_steps_per_second": 2.304,
617
+ "step": 2695
618
+ }
619
+ ],
620
+ "logging_steps": 50,
621
+ "max_steps": 3850,
622
+ "num_input_tokens_seen": 0,
623
+ "num_train_epochs": 10,
624
+ "save_steps": 500,
625
+ "stateful_callbacks": {
626
+ "TrainerControl": {
627
+ "args": {
628
+ "should_epoch_stop": false,
629
+ "should_evaluate": false,
630
+ "should_log": false,
631
+ "should_save": true,
632
+ "should_training_stop": false
633
+ },
634
+ "attributes": {}
635
+ }
636
+ },
637
+ "total_flos": 2.31810912445653e+18,
638
+ "train_batch_size": 4,
639
+ "trial_name": null,
640
+ "trial_params": null
641
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.00985279561940916,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 16,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Estonian/gemma-4-31B_original_features_structural_train_original_features_structural_test2/checkpoint-3080/trainer_state.json ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 8.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3080,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.353258643448353,
14
+ "epoch": 0.1299545159194282,
15
+ "grad_norm": 3.010725975036621,
16
+ "learning_rate": 4.8475852375026876e-05,
17
+ "loss": 5.475971069335937,
18
+ "mean_token_accuracy": 0.7263440760970116,
19
+ "num_tokens": 128842.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.649170914888382,
24
+ "epoch": 0.2599090318388564,
25
+ "grad_norm": 1.9099390506744385,
26
+ "learning_rate": 9.794100785974817e-05,
27
+ "loss": 2.55168701171875,
28
+ "mean_token_accuracy": 0.8364580717682838,
29
+ "num_tokens": 255497.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5930788792669773,
34
+ "epoch": 0.3898635477582846,
35
+ "grad_norm": 2.1239051818847656,
36
+ "learning_rate": 0.0001474061633444695,
37
+ "loss": 2.3440716552734373,
38
+ "mean_token_accuracy": 0.8452290838956833,
39
+ "num_tokens": 372014.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.5564522063732147,
44
+ "epoch": 0.5198180636777128,
45
+ "grad_norm": 411.71807861328125,
46
+ "learning_rate": 0.00019687131882919077,
47
+ "loss": 2.2838446044921876,
48
+ "mean_token_accuracy": 0.8498487600684166,
49
+ "num_tokens": 500623.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.5539529167115689,
54
+ "epoch": 0.649772579597141,
55
+ "grad_norm": 2.1969902515411377,
56
+ "learning_rate": 0.0002463364743139121,
57
+ "loss": 2.675394287109375,
58
+ "mean_token_accuracy": 0.8430694487690925,
59
+ "num_tokens": 616223.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.5719467167556286,
64
+ "epoch": 0.7797270955165692,
65
+ "grad_norm": 1.98796546459198,
66
+ "learning_rate": 0.00029580162979863343,
67
+ "loss": 2.2434300231933593,
68
+ "mean_token_accuracy": 0.851241897046566,
69
+ "num_tokens": 737263.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.5502805083990097,
74
+ "epoch": 0.9096816114359974,
75
+ "grad_norm": 2.0211398601531982,
76
+ "learning_rate": 0.0003452667852833547,
77
+ "loss": 2.1729367065429686,
78
+ "mean_token_accuracy": 0.8554597494006156,
79
+ "num_tokens": 861477.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5580813550891784,
85
+ "eval_loss": 0.5830356478691101,
86
+ "eval_mean_token_accuracy": 0.8432669037809739,
87
+ "eval_num_tokens": 944782.0,
88
+ "eval_runtime": 90.3664,
89
+ "eval_samples_per_second": 18.336,
90
+ "eval_steps_per_second": 2.302,
91
+ "step": 385
92
+ },
93
+ {
94
+ "entropy": 0.5498402091725987,
95
+ "epoch": 1.0389863547758285,
96
+ "grad_norm": 3.8034188747406006,
97
+ "learning_rate": 0.000380866355527619,
98
+ "loss": 2.113946990966797,
99
+ "mean_token_accuracy": 0.8578129452676629,
100
+ "num_tokens": 982803.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.5182110907137394,
105
+ "epoch": 1.1689408706952567,
106
+ "grad_norm": 2.7830824851989746,
107
+ "learning_rate": 0.0003805611725593471,
108
+ "loss": 1.9833453369140626,
109
+ "mean_token_accuracy": 0.8656822636723518,
110
+ "num_tokens": 1105926.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.5260789206624031,
115
+ "epoch": 1.2988953866146848,
116
+ "grad_norm": 1.7993361949920654,
117
+ "learning_rate": 0.0003798653399371568,
118
+ "loss": 2.006897430419922,
119
+ "mean_token_accuracy": 0.8631055191159248,
120
+ "num_tokens": 1229857.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.5327546864748001,
125
+ "epoch": 1.428849902534113,
126
+ "grad_norm": 1.7606678009033203,
127
+ "learning_rate": 0.0003787802874228295,
128
+ "loss": 2.020283050537109,
129
+ "mean_token_accuracy": 0.8638329988718033,
130
+ "num_tokens": 1352330.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.5285360223054886,
135
+ "epoch": 1.5588044184535412,
136
+ "grad_norm": 4.76006555557251,
137
+ "learning_rate": 0.00037730824452755275,
138
+ "loss": 1.9987391662597656,
139
+ "mean_token_accuracy": 0.8644696187973022,
140
+ "num_tokens": 1474790.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.5134804363548756,
145
+ "epoch": 1.6887589343729694,
146
+ "grad_norm": 1.8447264432907104,
147
+ "learning_rate": 0.000375452235930833,
148
+ "loss": 1.9669386291503905,
149
+ "mean_token_accuracy": 0.8659948265552521,
150
+ "num_tokens": 1600381.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.5371069309115409,
155
+ "epoch": 1.8187134502923976,
156
+ "grad_norm": 1.6537392139434814,
157
+ "learning_rate": 0.00037321607526553675,
158
+ "loss": 2.0411550903320315,
159
+ "mean_token_accuracy": 0.8624854254722595,
160
+ "num_tokens": 1716827.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "entropy": 0.5270501750707627,
165
+ "epoch": 1.9486679662118258,
166
+ "grad_norm": 2.6990911960601807,
167
+ "learning_rate": 0.00037060435728183,
168
+ "loss": 2.015792236328125,
169
+ "mean_token_accuracy": 0.8631013777852058,
170
+ "num_tokens": 1842798.0,
171
+ "step": 750
172
+ },
173
+ {
174
+ "epoch": 2.0,
175
+ "eval_entropy": 0.5477195472384875,
176
+ "eval_loss": 0.5585702657699585,
177
+ "eval_mean_token_accuracy": 0.8486175815073344,
178
+ "eval_num_tokens": 1889564.0,
179
+ "eval_runtime": 90.2194,
180
+ "eval_samples_per_second": 18.366,
181
+ "eval_steps_per_second": 2.305,
182
+ "step": 770
183
+ },
184
+ {
185
+ "entropy": 0.4782189565088282,
186
+ "epoch": 2.077972709551657,
187
+ "grad_norm": 2.041952610015869,
188
+ "learning_rate": 0.0003676224484061175,
189
+ "loss": 1.7843829345703126,
190
+ "mean_token_accuracy": 0.8739750406250881,
191
+ "num_tokens": 1959778.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.4443667846918106,
196
+ "epoch": 2.207927225471085,
197
+ "grad_norm": 16.27313804626465,
198
+ "learning_rate": 0.00036427647571437996,
199
+ "loss": 1.6559255981445313,
200
+ "mean_token_accuracy": 0.8808386281132699,
201
+ "num_tokens": 2087384.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.44861202985048293,
206
+ "epoch": 2.3378817413905133,
207
+ "grad_norm": 1.648870587348938,
208
+ "learning_rate": 0.0003605733143425679,
209
+ "loss": 1.677943878173828,
210
+ "mean_token_accuracy": 0.879555520415306,
211
+ "num_tokens": 2211962.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.4568726105988026,
216
+ "epoch": 2.4678362573099415,
217
+ "grad_norm": 1.7573126554489136,
218
+ "learning_rate": 0.00035652057335991866,
219
+ "loss": 1.6760734558105468,
220
+ "mean_token_accuracy": 0.8791913360357284,
221
+ "num_tokens": 2334838.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.44863338857889173,
226
+ "epoch": 2.5977907732293697,
227
+ "grad_norm": 1.8639047145843506,
228
+ "learning_rate": 0.00035212658013422465,
229
+ "loss": 1.6799411010742187,
230
+ "mean_token_accuracy": 0.8790675121545791,
231
+ "num_tokens": 2461732.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.4585830120742321,
236
+ "epoch": 2.727745289148798,
237
+ "grad_norm": 1.9825985431671143,
238
+ "learning_rate": 0.0003474003632211781,
239
+ "loss": 1.7172026062011718,
240
+ "mean_token_accuracy": 0.8782495930790901,
241
+ "num_tokens": 2580026.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.45422692246735097,
246
+ "epoch": 2.857699805068226,
247
+ "grad_norm": 1.7149962186813354,
248
+ "learning_rate": 0.00034235163381294995,
249
+ "loss": 1.679084014892578,
250
+ "mean_token_accuracy": 0.8795321774482727,
251
+ "num_tokens": 2705600.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "entropy": 0.47297614574432373,
256
+ "epoch": 2.9876543209876543,
257
+ "grad_norm": 1.7435617446899414,
258
+ "learning_rate": 0.0003369907657841221,
259
+ "loss": 1.7386201477050782,
260
+ "mean_token_accuracy": 0.8779115182161331,
261
+ "num_tokens": 2822808.0,
262
+ "step": 1150
263
+ },
264
+ {
265
+ "epoch": 3.0,
266
+ "eval_entropy": 0.5031588454372607,
267
+ "eval_loss": 0.5551120638847351,
268
+ "eval_mean_token_accuracy": 0.8531603300227568,
269
+ "eval_num_tokens": 2834346.0,
270
+ "eval_runtime": 90.2397,
271
+ "eval_samples_per_second": 18.362,
272
+ "eval_steps_per_second": 2.305,
273
+ "step": 1155
274
+ },
275
+ {
276
+ "entropy": 0.37655152073457615,
277
+ "epoch": 3.116959064327485,
278
+ "grad_norm": 1.504384160041809,
279
+ "learning_rate": 0.0003313287743759729,
280
+ "loss": 1.3653451538085937,
281
+ "mean_token_accuracy": 0.8971295344769655,
282
+ "num_tokens": 2939773.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.37069276951253416,
287
+ "epoch": 3.246913580246914,
288
+ "grad_norm": 1.9665946960449219,
289
+ "learning_rate": 0.0003253772935629151,
290
+ "loss": 1.3458108520507812,
291
+ "mean_token_accuracy": 0.8982205548882485,
292
+ "num_tokens": 3063617.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.37295883789658546,
297
+ "epoch": 3.3768680961663415,
298
+ "grad_norm": 1.7501362562179565,
299
+ "learning_rate": 0.00031914855214759165,
300
+ "loss": 1.357562255859375,
301
+ "mean_token_accuracy": 0.8977113124728203,
302
+ "num_tokens": 3189800.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.3805788069963455,
307
+ "epoch": 3.50682261208577,
308
+ "grad_norm": 1.7277154922485352,
309
+ "learning_rate": 0.00031265534863374894,
310
+ "loss": 1.3735618591308594,
311
+ "mean_token_accuracy": 0.8962143072485924,
312
+ "num_tokens": 3311908.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.3840580120682716,
317
+ "epoch": 3.636777128005198,
318
+ "grad_norm": 2.2338802814483643,
319
+ "learning_rate": 0.0003059110249285165,
320
+ "loss": 1.3903216552734374,
321
+ "mean_token_accuracy": 0.8958476388454437,
322
+ "num_tokens": 3432934.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.37621145449578763,
327
+ "epoch": 3.7667316439246266,
328
+ "grad_norm": 1.9029661417007446,
329
+ "learning_rate": 0.00029892943892812944,
330
+ "loss": 1.3776657104492187,
331
+ "mean_token_accuracy": 0.8964926180243492,
332
+ "num_tokens": 3561408.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "entropy": 0.3784803995490074,
337
+ "epoch": 3.8966861598440543,
338
+ "grad_norm": 2.089708089828491,
339
+ "learning_rate": 0.00029172493604342163,
340
+ "loss": 1.3816807556152344,
341
+ "mean_token_accuracy": 0.8962833172082901,
342
+ "num_tokens": 3684624.0,
343
+ "step": 1500
344
+ },
345
+ {
346
+ "epoch": 4.0,
347
+ "eval_entropy": 0.4351254403591156,
348
+ "eval_loss": 0.5814722180366516,
349
+ "eval_mean_token_accuracy": 0.8530604747625498,
350
+ "eval_num_tokens": 3779128.0,
351
+ "eval_runtime": 90.2232,
352
+ "eval_samples_per_second": 18.366,
353
+ "eval_steps_per_second": 2.305,
354
+ "step": 1540
355
+ },
356
+ {
357
+ "entropy": 0.36326556409423677,
358
+ "epoch": 4.025990903183885,
359
+ "grad_norm": 2.1354947090148926,
360
+ "learning_rate": 0.0002843123197235993,
361
+ "loss": 1.3295362854003907,
362
+ "mean_token_accuracy": 0.8993093811686913,
363
+ "num_tokens": 3804993.0,
364
+ "step": 1550
365
+ },
366
+ {
367
+ "entropy": 0.2879397062957287,
368
+ "epoch": 4.155945419103314,
369
+ "grad_norm": 2.201097011566162,
370
+ "learning_rate": 0.0002767068210388601,
371
+ "loss": 1.0272974395751953,
372
+ "mean_token_accuracy": 0.9182627710700035,
373
+ "num_tokens": 3928162.0,
374
+ "step": 1600
375
+ },
376
+ {
377
+ "entropy": 0.2848948486149311,
378
+ "epoch": 4.2858999350227425,
379
+ "grad_norm": 2.01479172706604,
380
+ "learning_rate": 0.000268924067384358,
381
+ "loss": 1.0278727722167968,
382
+ "mean_token_accuracy": 0.9194766515493393,
383
+ "num_tokens": 4049012.0,
384
+ "step": 1650
385
+ },
386
+ {
387
+ "entropy": 0.2940504560619593,
388
+ "epoch": 4.41585445094217,
389
+ "grad_norm": 2.0893027782440186,
390
+ "learning_rate": 0.00026098005036982003,
391
+ "loss": 1.0586751556396485,
392
+ "mean_token_accuracy": 0.9167885810136795,
393
+ "num_tokens": 4167845.0,
394
+ "step": 1700
395
+ },
396
+ {
397
+ "entropy": 0.293505182415247,
398
+ "epoch": 4.545808966861598,
399
+ "grad_norm": 1.6346389055252075,
400
+ "learning_rate": 0.0002528910929607928,
401
+ "loss": 1.0669570922851563,
402
+ "mean_token_accuracy": 0.9160876458883286,
403
+ "num_tokens": 4287505.0,
404
+ "step": 1750
405
+ },
406
+ {
407
+ "entropy": 0.2898535231500864,
408
+ "epoch": 4.675763482781027,
409
+ "grad_norm": 1.6645033359527588,
410
+ "learning_rate": 0.0002446738159390364,
411
+ "loss": 1.0582612609863282,
412
+ "mean_token_accuracy": 0.9177632886171341,
413
+ "num_tokens": 4412221.0,
414
+ "step": 1800
415
+ },
416
+ {
417
+ "entropy": 0.2842763290554285,
418
+ "epoch": 4.805717998700455,
419
+ "grad_norm": 2.4594268798828125,
420
+ "learning_rate": 0.0002363451037509798,
421
+ "loss": 1.0467537689208983,
422
+ "mean_token_accuracy": 0.9177608361840248,
423
+ "num_tokens": 4537178.0,
424
+ "step": 1850
425
+ },
426
+ {
427
+ "entropy": 0.284430123642087,
428
+ "epoch": 4.935672514619883,
429
+ "grad_norm": 2.1724514961242676,
430
+ "learning_rate": 0.00022792206981441223,
431
+ "loss": 1.0753899383544923,
432
+ "mean_token_accuracy": 0.915192686021328,
433
+ "num_tokens": 4664196.0,
434
+ "step": 1900
435
+ },
436
+ {
437
+ "epoch": 5.0,
438
+ "eval_entropy": 0.3632780872285366,
439
+ "eval_loss": 0.6438126564025879,
440
+ "eval_mean_token_accuracy": 0.8511462942338907,
441
+ "eval_num_tokens": 4723910.0,
442
+ "eval_runtime": 90.1846,
443
+ "eval_samples_per_second": 18.373,
444
+ "eval_steps_per_second": 2.306,
445
+ "step": 1925
446
+ },
447
+ {
448
+ "entropy": 0.23515464736139355,
449
+ "epoch": 5.064977257959714,
450
+ "grad_norm": 1.651587724685669,
451
+ "learning_rate": 0.00021942202135469513,
452
+ "loss": 0.8597064971923828,
453
+ "mean_token_accuracy": 0.9324622603517082,
454
+ "num_tokens": 4789568.0,
455
+ "step": 1950
456
+ },
457
+ {
458
+ "entropy": 0.1958953895419836,
459
+ "epoch": 5.1949317738791425,
460
+ "grad_norm": 1.923292636871338,
461
+ "learning_rate": 0.0002108624238427481,
462
+ "loss": 0.7188112640380859,
463
+ "mean_token_accuracy": 0.9416415295004845,
464
+ "num_tokens": 4913407.0,
465
+ "step": 2000
466
+ },
467
+ {
468
+ "entropy": 0.21068542070686816,
469
+ "epoch": 5.32488628979857,
470
+ "grad_norm": 2.299356460571289,
471
+ "learning_rate": 0.0002022608651078804,
472
+ "loss": 0.7712985229492187,
473
+ "mean_token_accuracy": 0.9386440163850784,
474
+ "num_tokens": 5032951.0,
475
+ "step": 2050
476
+ },
477
+ {
478
+ "entropy": 0.21234643168747425,
479
+ "epoch": 5.454840805717999,
480
+ "grad_norm": 2.2119295597076416,
481
+ "learning_rate": 0.00019363501919920608,
482
+ "loss": 0.7650181579589844,
483
+ "mean_token_accuracy": 0.938471505343914,
484
+ "num_tokens": 5156908.0,
485
+ "step": 2100
486
+ },
487
+ {
488
+ "entropy": 0.21658269092440605,
489
+ "epoch": 5.584795321637427,
490
+ "grad_norm": 1.5394288301467896,
491
+ "learning_rate": 0.00018500261006989887,
492
+ "loss": 0.7784209442138672,
493
+ "mean_token_accuracy": 0.9371598136425018,
494
+ "num_tokens": 5276087.0,
495
+ "step": 2150
496
+ },
497
+ {
498
+ "entropy": 0.2045296123996377,
499
+ "epoch": 5.714749837556855,
500
+ "grad_norm": 1.913680076599121,
501
+ "learning_rate": 0.00017638137515890763,
502
+ "loss": 0.7638166046142578,
503
+ "mean_token_accuracy": 0.9378301629424095,
504
+ "num_tokens": 5398787.0,
505
+ "step": 2200
506
+ },
507
+ {
508
+ "entropy": 0.20917976945638656,
509
+ "epoch": 5.844704353476283,
510
+ "grad_norm": 2.0847299098968506,
511
+ "learning_rate": 0.00016778902894496063,
512
+ "loss": 0.7631703186035156,
513
+ "mean_token_accuracy": 0.9387557968497277,
514
+ "num_tokens": 5522332.0,
515
+ "step": 2250
516
+ },
517
+ {
518
+ "entropy": 0.22262076318264007,
519
+ "epoch": 5.974658869395712,
520
+ "grad_norm": 2.1597352027893066,
521
+ "learning_rate": 0.0001592432265477485,
522
+ "loss": 0.798133773803711,
523
+ "mean_token_accuracy": 0.936034984588623,
524
+ "num_tokens": 5642361.0,
525
+ "step": 2300
526
+ },
527
+ {
528
+ "epoch": 6.0,
529
+ "eval_entropy": 0.31502799331568754,
530
+ "eval_loss": 0.7417300343513489,
531
+ "eval_mean_token_accuracy": 0.8477253922476218,
532
+ "eval_num_tokens": 5668692.0,
533
+ "eval_runtime": 90.4252,
534
+ "eval_samples_per_second": 18.325,
535
+ "eval_steps_per_second": 2.3,
536
+ "step": 2310
537
+ },
538
+ {
539
+ "entropy": 0.16796037876725795,
540
+ "epoch": 6.1039636127355426,
541
+ "grad_norm": 2.2228569984436035,
542
+ "learning_rate": 0.00015076152745107442,
543
+ "loss": 0.5835284805297851,
544
+ "mean_token_accuracy": 0.9529892874123463,
545
+ "num_tokens": 5766129.0,
546
+ "step": 2350
547
+ },
548
+ {
549
+ "entropy": 0.14919219192117453,
550
+ "epoch": 6.23391812865497,
551
+ "grad_norm": 1.408840298652649,
552
+ "learning_rate": 0.00014236135942251215,
553
+ "loss": 0.5310631561279296,
554
+ "mean_token_accuracy": 0.9586454060673714,
555
+ "num_tokens": 5888746.0,
556
+ "step": 2400
557
+ },
558
+ {
559
+ "entropy": 0.1499051059409976,
560
+ "epoch": 6.363872644574399,
561
+ "grad_norm": 1.8611102104187012,
562
+ "learning_rate": 0.00013405998270370849,
563
+ "loss": 0.5127810668945313,
564
+ "mean_token_accuracy": 0.9591325157880783,
565
+ "num_tokens": 6014455.0,
566
+ "step": 2450
567
+ },
568
+ {
569
+ "entropy": 0.15334193099290133,
570
+ "epoch": 6.493827160493828,
571
+ "grad_norm": 1.6051015853881836,
572
+ "learning_rate": 0.00012587445454490892,
573
+ "loss": 0.5349758529663086,
574
+ "mean_token_accuracy": 0.9574431091547012,
575
+ "num_tokens": 6141229.0,
576
+ "step": 2500
577
+ },
578
+ {
579
+ "entropy": 0.15982334002852439,
580
+ "epoch": 6.623781676413255,
581
+ "grad_norm": 3.7065205574035645,
582
+ "learning_rate": 0.00011782159415658008,
583
+ "loss": 0.5602469253540039,
584
+ "mean_token_accuracy": 0.9555372184515,
585
+ "num_tokens": 6257983.0,
586
+ "step": 2550
587
+ },
588
+ {
589
+ "entropy": 0.16072992872446776,
590
+ "epoch": 6.753736192332683,
591
+ "grad_norm": 2.282320976257324,
592
+ "learning_rate": 0.00010991794815014401,
593
+ "loss": 0.5657939910888672,
594
+ "mean_token_accuracy": 0.9550630164146423,
595
+ "num_tokens": 6376198.0,
596
+ "step": 2600
597
+ },
598
+ {
599
+ "entropy": 0.1512781011685729,
600
+ "epoch": 6.883690708252112,
601
+ "grad_norm": 1.3716893196105957,
602
+ "learning_rate": 0.00010217975653883603,
603
+ "loss": 0.5340792465209961,
604
+ "mean_token_accuracy": 0.9578188157081604,
605
+ "num_tokens": 6502526.0,
606
+ "step": 2650
607
+ },
608
+ {
609
+ "epoch": 7.0,
610
+ "eval_entropy": 0.2444461930829745,
611
+ "eval_loss": 0.8798949718475342,
612
+ "eval_mean_token_accuracy": 0.8457763839799625,
613
+ "eval_num_tokens": 6613474.0,
614
+ "eval_runtime": 90.2868,
615
+ "eval_samples_per_second": 18.353,
616
+ "eval_steps_per_second": 2.304,
617
+ "step": 2695
618
+ },
619
+ {
620
+ "entropy": 0.1444593005668578,
621
+ "epoch": 7.012995451591943,
622
+ "grad_norm": 1.0965569019317627,
623
+ "learning_rate": 9.462291936854386e-05,
624
+ "loss": 0.511833839416504,
625
+ "mean_token_accuracy": 0.9595773016388093,
626
+ "num_tokens": 6626464.0,
627
+ "step": 2700
628
+ },
629
+ {
630
+ "entropy": 0.10985541097819805,
631
+ "epoch": 7.142949967511371,
632
+ "grad_norm": 1.8079149723052979,
633
+ "learning_rate": 8.726296404719584e-05,
634
+ "loss": 0.3876673126220703,
635
+ "mean_token_accuracy": 0.9704919803142548,
636
+ "num_tokens": 6746276.0,
637
+ "step": 2750
638
+ },
639
+ {
640
+ "entropy": 0.11304264679551125,
641
+ "epoch": 7.272904483430799,
642
+ "grad_norm": 1.5228444337844849,
643
+ "learning_rate": 8.01150134398253e-05,
644
+ "loss": 0.39335052490234373,
645
+ "mean_token_accuracy": 0.9695766788721084,
646
+ "num_tokens": 6868131.0,
647
+ "step": 2800
648
+ },
649
+ {
650
+ "entropy": 0.11066193280741572,
651
+ "epoch": 7.402858999350228,
652
+ "grad_norm": 2.265174388885498,
653
+ "learning_rate": 7.319375479487112e-05,
654
+ "loss": 0.38289966583251955,
655
+ "mean_token_accuracy": 0.9707033503055572,
656
+ "num_tokens": 6993803.0,
657
+ "step": 2850
658
+ },
659
+ {
660
+ "entropy": 0.12022399662062526,
661
+ "epoch": 7.532813515269655,
662
+ "grad_norm": 1.0657345056533813,
663
+ "learning_rate": 6.65134095655596e-05,
664
+ "loss": 0.4089087677001953,
665
+ "mean_token_accuracy": 0.9689779531955719,
666
+ "num_tokens": 7113063.0,
667
+ "step": 2900
668
+ },
669
+ {
670
+ "entropy": 0.11429863104596734,
671
+ "epoch": 7.662768031189084,
672
+ "grad_norm": 1.3440358638763428,
673
+ "learning_rate": 6.008770418837973e-05,
674
+ "loss": 0.3935198593139648,
675
+ "mean_token_accuracy": 0.9698223957419395,
676
+ "num_tokens": 7237174.0,
677
+ "step": 2950
678
+ },
679
+ {
680
+ "entropy": 0.11748226622119545,
681
+ "epoch": 7.792722547108512,
682
+ "grad_norm": 1.4607034921646118,
683
+ "learning_rate": 5.3929841878693804e-05,
684
+ "loss": 0.40399799346923826,
685
+ "mean_token_accuracy": 0.9695871344208717,
686
+ "num_tokens": 7357301.0,
687
+ "step": 3000
688
+ },
689
+ {
690
+ "entropy": 0.11790506653487683,
691
+ "epoch": 7.92267706302794,
692
+ "grad_norm": 1.4574708938598633,
693
+ "learning_rate": 4.805247550143646e-05,
694
+ "loss": 0.4049314880371094,
695
+ "mean_token_accuracy": 0.9693469110131264,
696
+ "num_tokens": 7482431.0,
697
+ "step": 3050
698
+ },
699
+ {
700
+ "epoch": 8.0,
701
+ "eval_entropy": 0.2104659411483086,
702
+ "eval_loss": 0.9939886927604675,
703
+ "eval_mean_token_accuracy": 0.8444042455118436,
704
+ "eval_num_tokens": 7558256.0,
705
+ "eval_runtime": 90.3118,
706
+ "eval_samples_per_second": 18.348,
707
+ "eval_steps_per_second": 2.303,
708
+ "step": 3080
709
+ }
710
+ ],
711
+ "logging_steps": 50,
712
+ "max_steps": 3850,
713
+ "num_input_tokens_seen": 0,
714
+ "num_train_epochs": 10,
715
+ "save_steps": 500,
716
+ "stateful_callbacks": {
717
+ "TrainerControl": {
718
+ "args": {
719
+ "should_epoch_stop": false,
720
+ "should_evaluate": false,
721
+ "should_log": false,
722
+ "should_save": true,
723
+ "should_training_stop": false
724
+ },
725
+ "attributes": {}
726
+ }
727
+ },
728
+ "total_flos": 2.648642717750723e+18,
729
+ "train_batch_size": 4,
730
+ "trial_name": null,
731
+ "trial_params": null
732
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: transformers
4
+ model_name: gemma-4-31B_original_features_structural_train_original_features_structural_test1
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for gemma-4-31B_original_features_structural_train_original_features_structural_test1
13
+
14
+ This model is a fine-tuned version of [google/gemma-4-31B](https://huggingface.co/google/gemma-4-31B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="None", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/katriin-kukk/Cross_lingual_morphological_generalization/runs/rfqns0wc)
31
+
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 0.29.0
39
+ - Transformers: 5.5.4
40
+ - Pytorch: 2.10.0
41
+ - Datasets: 4.6.1
42
+ - Tokenizers: 0.22.2
43
+
44
+ ## Citations
45
+
46
+
47
+
48
+ Cite TRL as:
49
+
50
+ ```bibtex
51
+ @software{vonwerra2020trl,
52
+ title = {{TRL: Transformers Reinforcement Learning}},
53
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
54
+ license = {Apache-2.0},
55
+ url = {https://github.com/huggingface/trl},
56
+ year = {2020}
57
+ }
58
+ ```
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.015034304668777832,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 64,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1122/trainer_state.json ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1122,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.3355020767450332,
14
+ "epoch": 0.13386880856760375,
15
+ "grad_norm": 3.2956597805023193,
16
+ "learning_rate": 1.628530639938585e-05,
17
+ "loss": 5.349910278320312,
18
+ "mean_token_accuracy": 0.7383818039298058,
19
+ "num_tokens": 116199.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.5958842460811138,
24
+ "epoch": 0.2677376171352075,
25
+ "grad_norm": 2.5947492122650146,
26
+ "learning_rate": 3.290296599059591e-05,
27
+ "loss": 2.312855072021484,
28
+ "mean_token_accuracy": 0.8520967712998391,
29
+ "num_tokens": 232864.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5190362003445625,
34
+ "epoch": 0.40160642570281124,
35
+ "grad_norm": 1.5038394927978516,
36
+ "learning_rate": 4.9520625581805955e-05,
37
+ "loss": 2.0574468994140624,
38
+ "mean_token_accuracy": 0.8657039344310761,
39
+ "num_tokens": 352382.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.4922871346771717,
44
+ "epoch": 0.535475234270415,
45
+ "grad_norm": 1.645923137664795,
46
+ "learning_rate": 6.613828517301602e-05,
47
+ "loss": 1.916438446044922,
48
+ "mean_token_accuracy": 0.8717759534716606,
49
+ "num_tokens": 474532.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.491110111027956,
54
+ "epoch": 0.6693440428380187,
55
+ "grad_norm": 1.866817593574524,
56
+ "learning_rate": 8.275594476422607e-05,
57
+ "loss": 1.9421713256835937,
58
+ "mean_token_accuracy": 0.8710730043053627,
59
+ "num_tokens": 589198.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.47134352535009383,
64
+ "epoch": 0.8032128514056225,
65
+ "grad_norm": 117.62409210205078,
66
+ "learning_rate": 9.937360435543611e-05,
67
+ "loss": 1.9768324279785157,
68
+ "mean_token_accuracy": 0.8741078078746796,
69
+ "num_tokens": 707057.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.4820582258701325,
74
+ "epoch": 0.9370816599732262,
75
+ "grad_norm": 2.3274827003479004,
76
+ "learning_rate": 0.00011599126394664616,
77
+ "loss": 2.2025875854492187,
78
+ "mean_token_accuracy": 0.8697148504853248,
79
+ "num_tokens": 822888.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5010400542616844,
85
+ "eval_loss": 0.5114277601242065,
86
+ "eval_mean_token_accuracy": 0.8587275749444961,
87
+ "eval_num_tokens": 872247.0,
88
+ "eval_runtime": 96.5515,
89
+ "eval_samples_per_second": 16.561,
90
+ "eval_steps_per_second": 2.071,
91
+ "step": 374
92
+ },
93
+ {
94
+ "entropy": 0.4708875769918615,
95
+ "epoch": 1.069611780455154,
96
+ "grad_norm": 3.3712940216064453,
97
+ "learning_rate": 0.00012428317596508976,
98
+ "loss": 1.83294189453125,
99
+ "mean_token_accuracy": 0.8772370366737096,
100
+ "num_tokens": 929365.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.44804590195417404,
105
+ "epoch": 1.2034805890227578,
106
+ "grad_norm": 1.4833389520645142,
107
+ "learning_rate": 0.00012414788900475706,
108
+ "loss": 1.7768891906738282,
109
+ "mean_token_accuracy": 0.8791097947955131,
110
+ "num_tokens": 1046629.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.4510513086616993,
115
+ "epoch": 1.3373493975903614,
116
+ "grad_norm": 2.814790964126587,
117
+ "learning_rate": 0.00012387760965418496,
118
+ "loss": 1.7745071411132813,
119
+ "mean_token_accuracy": 0.8813075706362724,
120
+ "num_tokens": 1165744.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.4479117552936077,
125
+ "epoch": 1.4712182061579653,
126
+ "grad_norm": 1.855610728263855,
127
+ "learning_rate": 0.00012347292641217135,
128
+ "loss": 1.7583291625976563,
129
+ "mean_token_accuracy": 0.8815277495980263,
130
+ "num_tokens": 1284843.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.4380264139175415,
135
+ "epoch": 1.605087014725569,
136
+ "grad_norm": 1.383190631866455,
137
+ "learning_rate": 0.00012293472042483757,
138
+ "loss": 1.7229583740234375,
139
+ "mean_token_accuracy": 0.8832098203897476,
140
+ "num_tokens": 1406485.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.4342571949958801,
145
+ "epoch": 1.7389558232931726,
146
+ "grad_norm": 1.4977834224700928,
147
+ "learning_rate": 0.00012226416356704526,
148
+ "loss": 1.7174737548828125,
149
+ "mean_token_accuracy": 0.8834967383742333,
150
+ "num_tokens": 1525460.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.42700962007045745,
155
+ "epoch": 1.8728246318607764,
156
+ "grad_norm": 1.6156537532806396,
157
+ "learning_rate": 0.00012146271589078838,
158
+ "loss": 1.682061767578125,
159
+ "mean_token_accuracy": 0.8858474844694137,
160
+ "num_tokens": 1638984.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "epoch": 2.0,
165
+ "eval_entropy": 0.4838937771320343,
166
+ "eval_loss": 0.4826815128326416,
167
+ "eval_mean_token_accuracy": 0.8682844692468643,
168
+ "eval_num_tokens": 1744494.0,
169
+ "eval_runtime": 96.5071,
170
+ "eval_samples_per_second": 16.569,
171
+ "eval_steps_per_second": 2.072,
172
+ "step": 748
173
+ },
174
+ {
175
+ "entropy": 0.4378527848407476,
176
+ "epoch": 2.005354752342704,
177
+ "grad_norm": 1.400229573249817,
178
+ "learning_rate": 0.0001205321224461161,
179
+ "loss": 1.7096096801757812,
180
+ "mean_token_accuracy": 0.8838462468349573,
181
+ "num_tokens": 1749755.0,
182
+ "step": 750
183
+ },
184
+ {
185
+ "entropy": 0.3559799794852734,
186
+ "epoch": 2.139223560910308,
187
+ "grad_norm": 1.7168083190917969,
188
+ "learning_rate": 0.0001194744094815093,
189
+ "loss": 1.3893603515625,
190
+ "mean_token_accuracy": 0.9004731178283691,
191
+ "num_tokens": 1868231.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.3671448823064566,
196
+ "epoch": 2.2730923694779115,
197
+ "grad_norm": 1.9720135927200317,
198
+ "learning_rate": 0.00011829188003198282,
199
+ "loss": 1.429988555908203,
200
+ "mean_token_accuracy": 0.8970818132162094,
201
+ "num_tokens": 1979116.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.3597494306415319,
206
+ "epoch": 2.4069611780455156,
207
+ "grad_norm": 1.4947372674942017,
208
+ "learning_rate": 0.00011698710890452068,
209
+ "loss": 1.418173828125,
210
+ "mean_token_accuracy": 0.8994651186466217,
211
+ "num_tokens": 2094539.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.36254502907395364,
216
+ "epoch": 2.540829986613119,
217
+ "grad_norm": 1.6768454313278198,
218
+ "learning_rate": 0.00011556293707176242,
219
+ "loss": 1.4158590698242188,
220
+ "mean_token_accuracy": 0.8995477721095085,
221
+ "num_tokens": 2209415.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.36290778368711474,
226
+ "epoch": 2.674698795180723,
227
+ "grad_norm": 1.6033697128295898,
228
+ "learning_rate": 0.00011402246548614765,
229
+ "loss": 1.4300469970703125,
230
+ "mean_token_accuracy": 0.8986452376842499,
231
+ "num_tokens": 2324269.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.3635872249305248,
236
+ "epoch": 2.8085676037483265,
237
+ "grad_norm": 1.546893835067749,
238
+ "learning_rate": 0.00011236904832798785,
239
+ "loss": 1.42587646484375,
240
+ "mean_token_accuracy": 0.9003903394937516,
241
+ "num_tokens": 2447336.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.36871150620281695,
246
+ "epoch": 2.9424364123159306,
247
+ "grad_norm": 1.2951405048370361,
248
+ "learning_rate": 0.0001106062857021667,
249
+ "loss": 1.448046875,
250
+ "mean_token_accuracy": 0.8967258337140084,
251
+ "num_tokens": 2565837.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 3.0,
256
+ "eval_entropy": 0.4225208269059658,
257
+ "eval_loss": 0.489418089389801,
258
+ "eval_mean_token_accuracy": 0.8697815361618996,
259
+ "eval_num_tokens": 2616741.0,
260
+ "eval_runtime": 96.4058,
261
+ "eval_samples_per_second": 16.586,
262
+ "eval_steps_per_second": 2.075,
263
+ "step": 1122
264
+ }
265
+ ],
266
+ "logging_steps": 50,
267
+ "max_steps": 3740,
268
+ "num_input_tokens_seen": 0,
269
+ "num_train_epochs": 10,
270
+ "save_steps": 500,
271
+ "stateful_callbacks": {
272
+ "TrainerControl": {
273
+ "args": {
274
+ "should_epoch_stop": false,
275
+ "should_evaluate": false,
276
+ "should_log": false,
277
+ "should_save": true,
278
+ "should_training_stop": false
279
+ },
280
+ "attributes": {}
281
+ }
282
+ },
283
+ "total_flos": 8.979346498185751e+17,
284
+ "train_batch_size": 4,
285
+ "trial_name": null,
286
+ "trial_params": null
287
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.015034304668777832,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 64,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1496/trainer_state.json ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1496,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.3355020767450332,
14
+ "epoch": 0.13386880856760375,
15
+ "grad_norm": 3.2956597805023193,
16
+ "learning_rate": 1.628530639938585e-05,
17
+ "loss": 5.349910278320312,
18
+ "mean_token_accuracy": 0.7383818039298058,
19
+ "num_tokens": 116199.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.5958842460811138,
24
+ "epoch": 0.2677376171352075,
25
+ "grad_norm": 2.5947492122650146,
26
+ "learning_rate": 3.290296599059591e-05,
27
+ "loss": 2.312855072021484,
28
+ "mean_token_accuracy": 0.8520967712998391,
29
+ "num_tokens": 232864.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5190362003445625,
34
+ "epoch": 0.40160642570281124,
35
+ "grad_norm": 1.5038394927978516,
36
+ "learning_rate": 4.9520625581805955e-05,
37
+ "loss": 2.0574468994140624,
38
+ "mean_token_accuracy": 0.8657039344310761,
39
+ "num_tokens": 352382.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.4922871346771717,
44
+ "epoch": 0.535475234270415,
45
+ "grad_norm": 1.645923137664795,
46
+ "learning_rate": 6.613828517301602e-05,
47
+ "loss": 1.916438446044922,
48
+ "mean_token_accuracy": 0.8717759534716606,
49
+ "num_tokens": 474532.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.491110111027956,
54
+ "epoch": 0.6693440428380187,
55
+ "grad_norm": 1.866817593574524,
56
+ "learning_rate": 8.275594476422607e-05,
57
+ "loss": 1.9421713256835937,
58
+ "mean_token_accuracy": 0.8710730043053627,
59
+ "num_tokens": 589198.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.47134352535009383,
64
+ "epoch": 0.8032128514056225,
65
+ "grad_norm": 117.62409210205078,
66
+ "learning_rate": 9.937360435543611e-05,
67
+ "loss": 1.9768324279785157,
68
+ "mean_token_accuracy": 0.8741078078746796,
69
+ "num_tokens": 707057.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.4820582258701325,
74
+ "epoch": 0.9370816599732262,
75
+ "grad_norm": 2.3274827003479004,
76
+ "learning_rate": 0.00011599126394664616,
77
+ "loss": 2.2025875854492187,
78
+ "mean_token_accuracy": 0.8697148504853248,
79
+ "num_tokens": 822888.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5010400542616844,
85
+ "eval_loss": 0.5114277601242065,
86
+ "eval_mean_token_accuracy": 0.8587275749444961,
87
+ "eval_num_tokens": 872247.0,
88
+ "eval_runtime": 96.5515,
89
+ "eval_samples_per_second": 16.561,
90
+ "eval_steps_per_second": 2.071,
91
+ "step": 374
92
+ },
93
+ {
94
+ "entropy": 0.4708875769918615,
95
+ "epoch": 1.069611780455154,
96
+ "grad_norm": 3.3712940216064453,
97
+ "learning_rate": 0.00012428317596508976,
98
+ "loss": 1.83294189453125,
99
+ "mean_token_accuracy": 0.8772370366737096,
100
+ "num_tokens": 929365.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.44804590195417404,
105
+ "epoch": 1.2034805890227578,
106
+ "grad_norm": 1.4833389520645142,
107
+ "learning_rate": 0.00012414788900475706,
108
+ "loss": 1.7768891906738282,
109
+ "mean_token_accuracy": 0.8791097947955131,
110
+ "num_tokens": 1046629.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.4510513086616993,
115
+ "epoch": 1.3373493975903614,
116
+ "grad_norm": 2.814790964126587,
117
+ "learning_rate": 0.00012387760965418496,
118
+ "loss": 1.7745071411132813,
119
+ "mean_token_accuracy": 0.8813075706362724,
120
+ "num_tokens": 1165744.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.4479117552936077,
125
+ "epoch": 1.4712182061579653,
126
+ "grad_norm": 1.855610728263855,
127
+ "learning_rate": 0.00012347292641217135,
128
+ "loss": 1.7583291625976563,
129
+ "mean_token_accuracy": 0.8815277495980263,
130
+ "num_tokens": 1284843.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.4380264139175415,
135
+ "epoch": 1.605087014725569,
136
+ "grad_norm": 1.383190631866455,
137
+ "learning_rate": 0.00012293472042483757,
138
+ "loss": 1.7229583740234375,
139
+ "mean_token_accuracy": 0.8832098203897476,
140
+ "num_tokens": 1406485.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.4342571949958801,
145
+ "epoch": 1.7389558232931726,
146
+ "grad_norm": 1.4977834224700928,
147
+ "learning_rate": 0.00012226416356704526,
148
+ "loss": 1.7174737548828125,
149
+ "mean_token_accuracy": 0.8834967383742333,
150
+ "num_tokens": 1525460.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.42700962007045745,
155
+ "epoch": 1.8728246318607764,
156
+ "grad_norm": 1.6156537532806396,
157
+ "learning_rate": 0.00012146271589078838,
158
+ "loss": 1.682061767578125,
159
+ "mean_token_accuracy": 0.8858474844694137,
160
+ "num_tokens": 1638984.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "epoch": 2.0,
165
+ "eval_entropy": 0.4838937771320343,
166
+ "eval_loss": 0.4826815128326416,
167
+ "eval_mean_token_accuracy": 0.8682844692468643,
168
+ "eval_num_tokens": 1744494.0,
169
+ "eval_runtime": 96.5071,
170
+ "eval_samples_per_second": 16.569,
171
+ "eval_steps_per_second": 2.072,
172
+ "step": 748
173
+ },
174
+ {
175
+ "entropy": 0.4378527848407476,
176
+ "epoch": 2.005354752342704,
177
+ "grad_norm": 1.400229573249817,
178
+ "learning_rate": 0.0001205321224461161,
179
+ "loss": 1.7096096801757812,
180
+ "mean_token_accuracy": 0.8838462468349573,
181
+ "num_tokens": 1749755.0,
182
+ "step": 750
183
+ },
184
+ {
185
+ "entropy": 0.3559799794852734,
186
+ "epoch": 2.139223560910308,
187
+ "grad_norm": 1.7168083190917969,
188
+ "learning_rate": 0.0001194744094815093,
189
+ "loss": 1.3893603515625,
190
+ "mean_token_accuracy": 0.9004731178283691,
191
+ "num_tokens": 1868231.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.3671448823064566,
196
+ "epoch": 2.2730923694779115,
197
+ "grad_norm": 1.9720135927200317,
198
+ "learning_rate": 0.00011829188003198282,
199
+ "loss": 1.429988555908203,
200
+ "mean_token_accuracy": 0.8970818132162094,
201
+ "num_tokens": 1979116.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.3597494306415319,
206
+ "epoch": 2.4069611780455156,
207
+ "grad_norm": 1.4947372674942017,
208
+ "learning_rate": 0.00011698710890452068,
209
+ "loss": 1.418173828125,
210
+ "mean_token_accuracy": 0.8994651186466217,
211
+ "num_tokens": 2094539.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.36254502907395364,
216
+ "epoch": 2.540829986613119,
217
+ "grad_norm": 1.6768454313278198,
218
+ "learning_rate": 0.00011556293707176242,
219
+ "loss": 1.4158590698242188,
220
+ "mean_token_accuracy": 0.8995477721095085,
221
+ "num_tokens": 2209415.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.36290778368711474,
226
+ "epoch": 2.674698795180723,
227
+ "grad_norm": 1.6033697128295898,
228
+ "learning_rate": 0.00011402246548614765,
229
+ "loss": 1.4300469970703125,
230
+ "mean_token_accuracy": 0.8986452376842499,
231
+ "num_tokens": 2324269.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.3635872249305248,
236
+ "epoch": 2.8085676037483265,
237
+ "grad_norm": 1.546893835067749,
238
+ "learning_rate": 0.00011236904832798785,
239
+ "loss": 1.42587646484375,
240
+ "mean_token_accuracy": 0.9003903394937516,
241
+ "num_tokens": 2447336.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.36871150620281695,
246
+ "epoch": 2.9424364123159306,
247
+ "grad_norm": 1.2951405048370361,
248
+ "learning_rate": 0.0001106062857021667,
249
+ "loss": 1.448046875,
250
+ "mean_token_accuracy": 0.8967258337140084,
251
+ "num_tokens": 2565837.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 3.0,
256
+ "eval_entropy": 0.4225208269059658,
257
+ "eval_loss": 0.489418089389801,
258
+ "eval_mean_token_accuracy": 0.8697815361618996,
259
+ "eval_num_tokens": 2616741.0,
260
+ "eval_runtime": 96.4058,
261
+ "eval_samples_per_second": 16.586,
262
+ "eval_steps_per_second": 2.075,
263
+ "step": 1122
264
+ },
265
+ {
266
+ "entropy": 0.3120347365285411,
267
+ "epoch": 3.074966532797858,
268
+ "grad_norm": 1.639520287513733,
269
+ "learning_rate": 0.00010873801579937106,
270
+ "loss": 1.1941973876953125,
271
+ "mean_token_accuracy": 0.9117801315856703,
272
+ "num_tokens": 2685975.0,
273
+ "step": 1150
274
+ },
275
+ {
276
+ "entropy": 0.28257040068507194,
277
+ "epoch": 3.208835341365462,
278
+ "grad_norm": 1.7459681034088135,
279
+ "learning_rate": 0.00010676830653892058,
280
+ "loss": 1.0850601196289062,
281
+ "mean_token_accuracy": 0.9177472350001336,
282
+ "num_tokens": 2798277.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.27802520349621773,
287
+ "epoch": 3.3427041499330654,
288
+ "grad_norm": 1.5176103115081787,
289
+ "learning_rate": 0.00010470144671139238,
290
+ "loss": 1.0840838623046876,
291
+ "mean_token_accuracy": 0.9179763168096542,
292
+ "num_tokens": 2918973.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.280417420566082,
297
+ "epoch": 3.4765729585006695,
298
+ "grad_norm": 1.3774974346160889,
299
+ "learning_rate": 0.00010254193664032686,
300
+ "loss": 1.0911756896972655,
301
+ "mean_token_accuracy": 0.9162956389784813,
302
+ "num_tokens": 3039073.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.2834589210152626,
307
+ "epoch": 3.610441767068273,
308
+ "grad_norm": 1.5929396152496338,
309
+ "learning_rate": 0.00010029447838334742,
310
+ "loss": 1.0985262298583984,
311
+ "mean_token_accuracy": 0.9174074530601501,
312
+ "num_tokens": 3153710.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.282296127229929,
317
+ "epoch": 3.7443105756358768,
318
+ "grad_norm": 1.50350022315979,
319
+ "learning_rate": 9.796396549403e-05,
320
+ "loss": 1.101386260986328,
321
+ "mean_token_accuracy": 0.9168545073270797,
322
+ "num_tokens": 3263594.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.279728781580925,
327
+ "epoch": 3.878179384203481,
328
+ "grad_norm": 1.4728187322616577,
329
+ "learning_rate": 9.555547236681456e-05,
330
+ "loss": 1.0859880065917968,
331
+ "mean_token_accuracy": 0.9178367125988006,
332
+ "num_tokens": 3386033.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "epoch": 4.0,
337
+ "eval_entropy": 0.34304031178355215,
338
+ "eval_loss": 0.5295785665512085,
339
+ "eval_mean_token_accuracy": 0.8698753178119659,
340
+ "eval_num_tokens": 3488988.0,
341
+ "eval_runtime": 96.3616,
342
+ "eval_samples_per_second": 16.594,
343
+ "eval_steps_per_second": 2.076,
344
+ "step": 1496
345
+ }
346
+ ],
347
+ "logging_steps": 50,
348
+ "max_steps": 3740,
349
+ "num_input_tokens_seen": 0,
350
+ "num_train_epochs": 10,
351
+ "save_steps": 500,
352
+ "stateful_callbacks": {
353
+ "TrainerControl": {
354
+ "args": {
355
+ "should_epoch_stop": false,
356
+ "should_evaluate": false,
357
+ "should_log": false,
358
+ "should_save": true,
359
+ "should_training_stop": false
360
+ },
361
+ "attributes": {}
362
+ }
363
+ },
364
+ "total_flos": 1.1971161045794035e+18,
365
+ "train_batch_size": 4,
366
+ "trial_name": null,
367
+ "trial_params": null
368
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.015034304668777832,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 64,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-1870/trainer_state.json ADDED
@@ -0,0 +1,459 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1870,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.3355020767450332,
14
+ "epoch": 0.13386880856760375,
15
+ "grad_norm": 3.2956597805023193,
16
+ "learning_rate": 1.628530639938585e-05,
17
+ "loss": 5.349910278320312,
18
+ "mean_token_accuracy": 0.7383818039298058,
19
+ "num_tokens": 116199.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.5958842460811138,
24
+ "epoch": 0.2677376171352075,
25
+ "grad_norm": 2.5947492122650146,
26
+ "learning_rate": 3.290296599059591e-05,
27
+ "loss": 2.312855072021484,
28
+ "mean_token_accuracy": 0.8520967712998391,
29
+ "num_tokens": 232864.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5190362003445625,
34
+ "epoch": 0.40160642570281124,
35
+ "grad_norm": 1.5038394927978516,
36
+ "learning_rate": 4.9520625581805955e-05,
37
+ "loss": 2.0574468994140624,
38
+ "mean_token_accuracy": 0.8657039344310761,
39
+ "num_tokens": 352382.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.4922871346771717,
44
+ "epoch": 0.535475234270415,
45
+ "grad_norm": 1.645923137664795,
46
+ "learning_rate": 6.613828517301602e-05,
47
+ "loss": 1.916438446044922,
48
+ "mean_token_accuracy": 0.8717759534716606,
49
+ "num_tokens": 474532.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.491110111027956,
54
+ "epoch": 0.6693440428380187,
55
+ "grad_norm": 1.866817593574524,
56
+ "learning_rate": 8.275594476422607e-05,
57
+ "loss": 1.9421713256835937,
58
+ "mean_token_accuracy": 0.8710730043053627,
59
+ "num_tokens": 589198.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.47134352535009383,
64
+ "epoch": 0.8032128514056225,
65
+ "grad_norm": 117.62409210205078,
66
+ "learning_rate": 9.937360435543611e-05,
67
+ "loss": 1.9768324279785157,
68
+ "mean_token_accuracy": 0.8741078078746796,
69
+ "num_tokens": 707057.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.4820582258701325,
74
+ "epoch": 0.9370816599732262,
75
+ "grad_norm": 2.3274827003479004,
76
+ "learning_rate": 0.00011599126394664616,
77
+ "loss": 2.2025875854492187,
78
+ "mean_token_accuracy": 0.8697148504853248,
79
+ "num_tokens": 822888.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5010400542616844,
85
+ "eval_loss": 0.5114277601242065,
86
+ "eval_mean_token_accuracy": 0.8587275749444961,
87
+ "eval_num_tokens": 872247.0,
88
+ "eval_runtime": 96.5515,
89
+ "eval_samples_per_second": 16.561,
90
+ "eval_steps_per_second": 2.071,
91
+ "step": 374
92
+ },
93
+ {
94
+ "entropy": 0.4708875769918615,
95
+ "epoch": 1.069611780455154,
96
+ "grad_norm": 3.3712940216064453,
97
+ "learning_rate": 0.00012428317596508976,
98
+ "loss": 1.83294189453125,
99
+ "mean_token_accuracy": 0.8772370366737096,
100
+ "num_tokens": 929365.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.44804590195417404,
105
+ "epoch": 1.2034805890227578,
106
+ "grad_norm": 1.4833389520645142,
107
+ "learning_rate": 0.00012414788900475706,
108
+ "loss": 1.7768891906738282,
109
+ "mean_token_accuracy": 0.8791097947955131,
110
+ "num_tokens": 1046629.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.4510513086616993,
115
+ "epoch": 1.3373493975903614,
116
+ "grad_norm": 2.814790964126587,
117
+ "learning_rate": 0.00012387760965418496,
118
+ "loss": 1.7745071411132813,
119
+ "mean_token_accuracy": 0.8813075706362724,
120
+ "num_tokens": 1165744.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.4479117552936077,
125
+ "epoch": 1.4712182061579653,
126
+ "grad_norm": 1.855610728263855,
127
+ "learning_rate": 0.00012347292641217135,
128
+ "loss": 1.7583291625976563,
129
+ "mean_token_accuracy": 0.8815277495980263,
130
+ "num_tokens": 1284843.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.4380264139175415,
135
+ "epoch": 1.605087014725569,
136
+ "grad_norm": 1.383190631866455,
137
+ "learning_rate": 0.00012293472042483757,
138
+ "loss": 1.7229583740234375,
139
+ "mean_token_accuracy": 0.8832098203897476,
140
+ "num_tokens": 1406485.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.4342571949958801,
145
+ "epoch": 1.7389558232931726,
146
+ "grad_norm": 1.4977834224700928,
147
+ "learning_rate": 0.00012226416356704526,
148
+ "loss": 1.7174737548828125,
149
+ "mean_token_accuracy": 0.8834967383742333,
150
+ "num_tokens": 1525460.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.42700962007045745,
155
+ "epoch": 1.8728246318607764,
156
+ "grad_norm": 1.6156537532806396,
157
+ "learning_rate": 0.00012146271589078838,
158
+ "loss": 1.682061767578125,
159
+ "mean_token_accuracy": 0.8858474844694137,
160
+ "num_tokens": 1638984.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "epoch": 2.0,
165
+ "eval_entropy": 0.4838937771320343,
166
+ "eval_loss": 0.4826815128326416,
167
+ "eval_mean_token_accuracy": 0.8682844692468643,
168
+ "eval_num_tokens": 1744494.0,
169
+ "eval_runtime": 96.5071,
170
+ "eval_samples_per_second": 16.569,
171
+ "eval_steps_per_second": 2.072,
172
+ "step": 748
173
+ },
174
+ {
175
+ "entropy": 0.4378527848407476,
176
+ "epoch": 2.005354752342704,
177
+ "grad_norm": 1.400229573249817,
178
+ "learning_rate": 0.0001205321224461161,
179
+ "loss": 1.7096096801757812,
180
+ "mean_token_accuracy": 0.8838462468349573,
181
+ "num_tokens": 1749755.0,
182
+ "step": 750
183
+ },
184
+ {
185
+ "entropy": 0.3559799794852734,
186
+ "epoch": 2.139223560910308,
187
+ "grad_norm": 1.7168083190917969,
188
+ "learning_rate": 0.0001194744094815093,
189
+ "loss": 1.3893603515625,
190
+ "mean_token_accuracy": 0.9004731178283691,
191
+ "num_tokens": 1868231.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.3671448823064566,
196
+ "epoch": 2.2730923694779115,
197
+ "grad_norm": 1.9720135927200317,
198
+ "learning_rate": 0.00011829188003198282,
199
+ "loss": 1.429988555908203,
200
+ "mean_token_accuracy": 0.8970818132162094,
201
+ "num_tokens": 1979116.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.3597494306415319,
206
+ "epoch": 2.4069611780455156,
207
+ "grad_norm": 1.4947372674942017,
208
+ "learning_rate": 0.00011698710890452068,
209
+ "loss": 1.418173828125,
210
+ "mean_token_accuracy": 0.8994651186466217,
211
+ "num_tokens": 2094539.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.36254502907395364,
216
+ "epoch": 2.540829986613119,
217
+ "grad_norm": 1.6768454313278198,
218
+ "learning_rate": 0.00011556293707176242,
219
+ "loss": 1.4158590698242188,
220
+ "mean_token_accuracy": 0.8995477721095085,
221
+ "num_tokens": 2209415.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.36290778368711474,
226
+ "epoch": 2.674698795180723,
227
+ "grad_norm": 1.6033697128295898,
228
+ "learning_rate": 0.00011402246548614765,
229
+ "loss": 1.4300469970703125,
230
+ "mean_token_accuracy": 0.8986452376842499,
231
+ "num_tokens": 2324269.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.3635872249305248,
236
+ "epoch": 2.8085676037483265,
237
+ "grad_norm": 1.546893835067749,
238
+ "learning_rate": 0.00011236904832798785,
239
+ "loss": 1.42587646484375,
240
+ "mean_token_accuracy": 0.9003903394937516,
241
+ "num_tokens": 2447336.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.36871150620281695,
246
+ "epoch": 2.9424364123159306,
247
+ "grad_norm": 1.2951405048370361,
248
+ "learning_rate": 0.0001106062857021667,
249
+ "loss": 1.448046875,
250
+ "mean_token_accuracy": 0.8967258337140084,
251
+ "num_tokens": 2565837.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 3.0,
256
+ "eval_entropy": 0.4225208269059658,
257
+ "eval_loss": 0.489418089389801,
258
+ "eval_mean_token_accuracy": 0.8697815361618996,
259
+ "eval_num_tokens": 2616741.0,
260
+ "eval_runtime": 96.4058,
261
+ "eval_samples_per_second": 16.586,
262
+ "eval_steps_per_second": 2.075,
263
+ "step": 1122
264
+ },
265
+ {
266
+ "entropy": 0.3120347365285411,
267
+ "epoch": 3.074966532797858,
268
+ "grad_norm": 1.639520287513733,
269
+ "learning_rate": 0.00010873801579937106,
270
+ "loss": 1.1941973876953125,
271
+ "mean_token_accuracy": 0.9117801315856703,
272
+ "num_tokens": 2685975.0,
273
+ "step": 1150
274
+ },
275
+ {
276
+ "entropy": 0.28257040068507194,
277
+ "epoch": 3.208835341365462,
278
+ "grad_norm": 1.7459681034088135,
279
+ "learning_rate": 0.00010676830653892058,
280
+ "loss": 1.0850601196289062,
281
+ "mean_token_accuracy": 0.9177472350001336,
282
+ "num_tokens": 2798277.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.27802520349621773,
287
+ "epoch": 3.3427041499330654,
288
+ "grad_norm": 1.5176103115081787,
289
+ "learning_rate": 0.00010470144671139238,
290
+ "loss": 1.0840838623046876,
291
+ "mean_token_accuracy": 0.9179763168096542,
292
+ "num_tokens": 2918973.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.280417420566082,
297
+ "epoch": 3.4765729585006695,
298
+ "grad_norm": 1.3774974346160889,
299
+ "learning_rate": 0.00010254193664032686,
300
+ "loss": 1.0911756896972655,
301
+ "mean_token_accuracy": 0.9162956389784813,
302
+ "num_tokens": 3039073.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.2834589210152626,
307
+ "epoch": 3.610441767068273,
308
+ "grad_norm": 1.5929396152496338,
309
+ "learning_rate": 0.00010029447838334742,
310
+ "loss": 1.0985262298583984,
311
+ "mean_token_accuracy": 0.9174074530601501,
312
+ "num_tokens": 3153710.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.282296127229929,
317
+ "epoch": 3.7443105756358768,
318
+ "grad_norm": 1.50350022315979,
319
+ "learning_rate": 9.796396549403e-05,
320
+ "loss": 1.101386260986328,
321
+ "mean_token_accuracy": 0.9168545073270797,
322
+ "num_tokens": 3263594.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.279728781580925,
327
+ "epoch": 3.878179384203481,
328
+ "grad_norm": 1.4728187322616577,
329
+ "learning_rate": 9.555547236681456e-05,
330
+ "loss": 1.0859880065917968,
331
+ "mean_token_accuracy": 0.9178367125988006,
332
+ "num_tokens": 3386033.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "epoch": 4.0,
337
+ "eval_entropy": 0.34304031178355215,
338
+ "eval_loss": 0.5295785665512085,
339
+ "eval_mean_token_accuracy": 0.8698753178119659,
340
+ "eval_num_tokens": 3488988.0,
341
+ "eval_runtime": 96.3616,
342
+ "eval_samples_per_second": 16.594,
343
+ "eval_steps_per_second": 2.076,
344
+ "step": 1496
345
+ },
346
+ {
347
+ "entropy": 0.27893446536377225,
348
+ "epoch": 4.010709504685408,
349
+ "grad_norm": 1.545491337776184,
350
+ "learning_rate": 9.30742431881587e-05,
351
+ "loss": 1.0577442169189453,
352
+ "mean_token_accuracy": 0.9191552999645772,
353
+ "num_tokens": 3498406.0,
354
+ "step": 1500
355
+ },
356
+ {
357
+ "entropy": 0.19769302535802125,
358
+ "epoch": 4.144578313253012,
359
+ "grad_norm": 2.10296893119812,
360
+ "learning_rate": 9.052568051799083e-05,
361
+ "loss": 0.7461458587646485,
362
+ "mean_token_accuracy": 0.9415343621373177,
363
+ "num_tokens": 3614301.0,
364
+ "step": 1550
365
+ },
366
+ {
367
+ "entropy": 0.1981763695180416,
368
+ "epoch": 4.278447121820616,
369
+ "grad_norm": 2.067410945892334,
370
+ "learning_rate": 8.791533352632524e-05,
371
+ "loss": 0.7580889892578125,
372
+ "mean_token_accuracy": 0.9396374526619912,
373
+ "num_tokens": 3735705.0,
374
+ "step": 1600
375
+ },
376
+ {
377
+ "entropy": 0.19850988369435071,
378
+ "epoch": 4.412315930388219,
379
+ "grad_norm": 1.9034850597381592,
380
+ "learning_rate": 8.524888591065258e-05,
381
+ "loss": 0.7526986694335938,
382
+ "mean_token_accuracy": 0.9402479353547096,
383
+ "num_tokens": 3854287.0,
384
+ "step": 1650
385
+ },
386
+ {
387
+ "entropy": 0.19905407220125199,
388
+ "epoch": 4.546184738955823,
389
+ "grad_norm": 2.1477949619293213,
390
+ "learning_rate": 8.253214352041379e-05,
391
+ "loss": 0.7603612518310547,
392
+ "mean_token_accuracy": 0.9396576225757599,
393
+ "num_tokens": 3967362.0,
394
+ "step": 1700
395
+ },
396
+ {
397
+ "entropy": 0.20251497332006693,
398
+ "epoch": 4.680053547523427,
399
+ "grad_norm": 1.5489246845245361,
400
+ "learning_rate": 7.97710217155036e-05,
401
+ "loss": 0.7711930084228515,
402
+ "mean_token_accuracy": 0.9400961664319039,
403
+ "num_tokens": 4081441.0,
404
+ "step": 1750
405
+ },
406
+ {
407
+ "entropy": 0.1991352306306362,
408
+ "epoch": 4.813922356091031,
409
+ "grad_norm": 1.969994068145752,
410
+ "learning_rate": 7.697153248632946e-05,
411
+ "loss": 0.7681967163085938,
412
+ "mean_token_accuracy": 0.9399621617794037,
413
+ "num_tokens": 4197604.0,
414
+ "step": 1800
415
+ },
416
+ {
417
+ "entropy": 0.20229352474212647,
418
+ "epoch": 4.947791164658635,
419
+ "grad_norm": 2.2329719066619873,
420
+ "learning_rate": 7.41397713634694e-05,
421
+ "loss": 0.7733911895751953,
422
+ "mean_token_accuracy": 0.9396535342931748,
423
+ "num_tokens": 4318894.0,
424
+ "step": 1850
425
+ },
426
+ {
427
+ "epoch": 5.0,
428
+ "eval_entropy": 0.270584502145648,
429
+ "eval_loss": 0.6255385875701904,
430
+ "eval_mean_token_accuracy": 0.8687835082411766,
431
+ "eval_num_tokens": 4361235.0,
432
+ "eval_runtime": 96.6331,
433
+ "eval_samples_per_second": 16.547,
434
+ "eval_steps_per_second": 2.07,
435
+ "step": 1870
436
+ }
437
+ ],
438
+ "logging_steps": 50,
439
+ "max_steps": 3740,
440
+ "num_input_tokens_seen": 0,
441
+ "num_train_epochs": 10,
442
+ "save_steps": 500,
443
+ "stateful_callbacks": {
444
+ "TrainerControl": {
445
+ "args": {
446
+ "should_epoch_stop": false,
447
+ "should_evaluate": false,
448
+ "should_log": false,
449
+ "should_save": true,
450
+ "should_training_stop": false
451
+ },
452
+ "attributes": {}
453
+ }
454
+ },
455
+ "total_flos": 1.4947622783933181e+18,
456
+ "train_batch_size": 4,
457
+ "trial_name": null,
458
+ "trial_params": null
459
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.015034304668777832,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 64,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2244/trainer_state.json ADDED
@@ -0,0 +1,540 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 6.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2244,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.3355020767450332,
14
+ "epoch": 0.13386880856760375,
15
+ "grad_norm": 3.2956597805023193,
16
+ "learning_rate": 1.628530639938585e-05,
17
+ "loss": 5.349910278320312,
18
+ "mean_token_accuracy": 0.7383818039298058,
19
+ "num_tokens": 116199.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.5958842460811138,
24
+ "epoch": 0.2677376171352075,
25
+ "grad_norm": 2.5947492122650146,
26
+ "learning_rate": 3.290296599059591e-05,
27
+ "loss": 2.312855072021484,
28
+ "mean_token_accuracy": 0.8520967712998391,
29
+ "num_tokens": 232864.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5190362003445625,
34
+ "epoch": 0.40160642570281124,
35
+ "grad_norm": 1.5038394927978516,
36
+ "learning_rate": 4.9520625581805955e-05,
37
+ "loss": 2.0574468994140624,
38
+ "mean_token_accuracy": 0.8657039344310761,
39
+ "num_tokens": 352382.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.4922871346771717,
44
+ "epoch": 0.535475234270415,
45
+ "grad_norm": 1.645923137664795,
46
+ "learning_rate": 6.613828517301602e-05,
47
+ "loss": 1.916438446044922,
48
+ "mean_token_accuracy": 0.8717759534716606,
49
+ "num_tokens": 474532.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.491110111027956,
54
+ "epoch": 0.6693440428380187,
55
+ "grad_norm": 1.866817593574524,
56
+ "learning_rate": 8.275594476422607e-05,
57
+ "loss": 1.9421713256835937,
58
+ "mean_token_accuracy": 0.8710730043053627,
59
+ "num_tokens": 589198.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.47134352535009383,
64
+ "epoch": 0.8032128514056225,
65
+ "grad_norm": 117.62409210205078,
66
+ "learning_rate": 9.937360435543611e-05,
67
+ "loss": 1.9768324279785157,
68
+ "mean_token_accuracy": 0.8741078078746796,
69
+ "num_tokens": 707057.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.4820582258701325,
74
+ "epoch": 0.9370816599732262,
75
+ "grad_norm": 2.3274827003479004,
76
+ "learning_rate": 0.00011599126394664616,
77
+ "loss": 2.2025875854492187,
78
+ "mean_token_accuracy": 0.8697148504853248,
79
+ "num_tokens": 822888.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5010400542616844,
85
+ "eval_loss": 0.5114277601242065,
86
+ "eval_mean_token_accuracy": 0.8587275749444961,
87
+ "eval_num_tokens": 872247.0,
88
+ "eval_runtime": 96.5515,
89
+ "eval_samples_per_second": 16.561,
90
+ "eval_steps_per_second": 2.071,
91
+ "step": 374
92
+ },
93
+ {
94
+ "entropy": 0.4708875769918615,
95
+ "epoch": 1.069611780455154,
96
+ "grad_norm": 3.3712940216064453,
97
+ "learning_rate": 0.00012428317596508976,
98
+ "loss": 1.83294189453125,
99
+ "mean_token_accuracy": 0.8772370366737096,
100
+ "num_tokens": 929365.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.44804590195417404,
105
+ "epoch": 1.2034805890227578,
106
+ "grad_norm": 1.4833389520645142,
107
+ "learning_rate": 0.00012414788900475706,
108
+ "loss": 1.7768891906738282,
109
+ "mean_token_accuracy": 0.8791097947955131,
110
+ "num_tokens": 1046629.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.4510513086616993,
115
+ "epoch": 1.3373493975903614,
116
+ "grad_norm": 2.814790964126587,
117
+ "learning_rate": 0.00012387760965418496,
118
+ "loss": 1.7745071411132813,
119
+ "mean_token_accuracy": 0.8813075706362724,
120
+ "num_tokens": 1165744.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.4479117552936077,
125
+ "epoch": 1.4712182061579653,
126
+ "grad_norm": 1.855610728263855,
127
+ "learning_rate": 0.00012347292641217135,
128
+ "loss": 1.7583291625976563,
129
+ "mean_token_accuracy": 0.8815277495980263,
130
+ "num_tokens": 1284843.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.4380264139175415,
135
+ "epoch": 1.605087014725569,
136
+ "grad_norm": 1.383190631866455,
137
+ "learning_rate": 0.00012293472042483757,
138
+ "loss": 1.7229583740234375,
139
+ "mean_token_accuracy": 0.8832098203897476,
140
+ "num_tokens": 1406485.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.4342571949958801,
145
+ "epoch": 1.7389558232931726,
146
+ "grad_norm": 1.4977834224700928,
147
+ "learning_rate": 0.00012226416356704526,
148
+ "loss": 1.7174737548828125,
149
+ "mean_token_accuracy": 0.8834967383742333,
150
+ "num_tokens": 1525460.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.42700962007045745,
155
+ "epoch": 1.8728246318607764,
156
+ "grad_norm": 1.6156537532806396,
157
+ "learning_rate": 0.00012146271589078838,
158
+ "loss": 1.682061767578125,
159
+ "mean_token_accuracy": 0.8858474844694137,
160
+ "num_tokens": 1638984.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "epoch": 2.0,
165
+ "eval_entropy": 0.4838937771320343,
166
+ "eval_loss": 0.4826815128326416,
167
+ "eval_mean_token_accuracy": 0.8682844692468643,
168
+ "eval_num_tokens": 1744494.0,
169
+ "eval_runtime": 96.5071,
170
+ "eval_samples_per_second": 16.569,
171
+ "eval_steps_per_second": 2.072,
172
+ "step": 748
173
+ },
174
+ {
175
+ "entropy": 0.4378527848407476,
176
+ "epoch": 2.005354752342704,
177
+ "grad_norm": 1.400229573249817,
178
+ "learning_rate": 0.0001205321224461161,
179
+ "loss": 1.7096096801757812,
180
+ "mean_token_accuracy": 0.8838462468349573,
181
+ "num_tokens": 1749755.0,
182
+ "step": 750
183
+ },
184
+ {
185
+ "entropy": 0.3559799794852734,
186
+ "epoch": 2.139223560910308,
187
+ "grad_norm": 1.7168083190917969,
188
+ "learning_rate": 0.0001194744094815093,
189
+ "loss": 1.3893603515625,
190
+ "mean_token_accuracy": 0.9004731178283691,
191
+ "num_tokens": 1868231.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.3671448823064566,
196
+ "epoch": 2.2730923694779115,
197
+ "grad_norm": 1.9720135927200317,
198
+ "learning_rate": 0.00011829188003198282,
199
+ "loss": 1.429988555908203,
200
+ "mean_token_accuracy": 0.8970818132162094,
201
+ "num_tokens": 1979116.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.3597494306415319,
206
+ "epoch": 2.4069611780455156,
207
+ "grad_norm": 1.4947372674942017,
208
+ "learning_rate": 0.00011698710890452068,
209
+ "loss": 1.418173828125,
210
+ "mean_token_accuracy": 0.8994651186466217,
211
+ "num_tokens": 2094539.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.36254502907395364,
216
+ "epoch": 2.540829986613119,
217
+ "grad_norm": 1.6768454313278198,
218
+ "learning_rate": 0.00011556293707176242,
219
+ "loss": 1.4158590698242188,
220
+ "mean_token_accuracy": 0.8995477721095085,
221
+ "num_tokens": 2209415.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.36290778368711474,
226
+ "epoch": 2.674698795180723,
227
+ "grad_norm": 1.6033697128295898,
228
+ "learning_rate": 0.00011402246548614765,
229
+ "loss": 1.4300469970703125,
230
+ "mean_token_accuracy": 0.8986452376842499,
231
+ "num_tokens": 2324269.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.3635872249305248,
236
+ "epoch": 2.8085676037483265,
237
+ "grad_norm": 1.546893835067749,
238
+ "learning_rate": 0.00011236904832798785,
239
+ "loss": 1.42587646484375,
240
+ "mean_token_accuracy": 0.9003903394937516,
241
+ "num_tokens": 2447336.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.36871150620281695,
246
+ "epoch": 2.9424364123159306,
247
+ "grad_norm": 1.2951405048370361,
248
+ "learning_rate": 0.0001106062857021667,
249
+ "loss": 1.448046875,
250
+ "mean_token_accuracy": 0.8967258337140084,
251
+ "num_tokens": 2565837.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 3.0,
256
+ "eval_entropy": 0.4225208269059658,
257
+ "eval_loss": 0.489418089389801,
258
+ "eval_mean_token_accuracy": 0.8697815361618996,
259
+ "eval_num_tokens": 2616741.0,
260
+ "eval_runtime": 96.4058,
261
+ "eval_samples_per_second": 16.586,
262
+ "eval_steps_per_second": 2.075,
263
+ "step": 1122
264
+ },
265
+ {
266
+ "entropy": 0.3120347365285411,
267
+ "epoch": 3.074966532797858,
268
+ "grad_norm": 1.639520287513733,
269
+ "learning_rate": 0.00010873801579937106,
270
+ "loss": 1.1941973876953125,
271
+ "mean_token_accuracy": 0.9117801315856703,
272
+ "num_tokens": 2685975.0,
273
+ "step": 1150
274
+ },
275
+ {
276
+ "entropy": 0.28257040068507194,
277
+ "epoch": 3.208835341365462,
278
+ "grad_norm": 1.7459681034088135,
279
+ "learning_rate": 0.00010676830653892058,
280
+ "loss": 1.0850601196289062,
281
+ "mean_token_accuracy": 0.9177472350001336,
282
+ "num_tokens": 2798277.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.27802520349621773,
287
+ "epoch": 3.3427041499330654,
288
+ "grad_norm": 1.5176103115081787,
289
+ "learning_rate": 0.00010470144671139238,
290
+ "loss": 1.0840838623046876,
291
+ "mean_token_accuracy": 0.9179763168096542,
292
+ "num_tokens": 2918973.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.280417420566082,
297
+ "epoch": 3.4765729585006695,
298
+ "grad_norm": 1.3774974346160889,
299
+ "learning_rate": 0.00010254193664032686,
300
+ "loss": 1.0911756896972655,
301
+ "mean_token_accuracy": 0.9162956389784813,
302
+ "num_tokens": 3039073.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.2834589210152626,
307
+ "epoch": 3.610441767068273,
308
+ "grad_norm": 1.5929396152496338,
309
+ "learning_rate": 0.00010029447838334742,
310
+ "loss": 1.0985262298583984,
311
+ "mean_token_accuracy": 0.9174074530601501,
312
+ "num_tokens": 3153710.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.282296127229929,
317
+ "epoch": 3.7443105756358768,
318
+ "grad_norm": 1.50350022315979,
319
+ "learning_rate": 9.796396549403e-05,
320
+ "loss": 1.101386260986328,
321
+ "mean_token_accuracy": 0.9168545073270797,
322
+ "num_tokens": 3263594.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.279728781580925,
327
+ "epoch": 3.878179384203481,
328
+ "grad_norm": 1.4728187322616577,
329
+ "learning_rate": 9.555547236681456e-05,
330
+ "loss": 1.0859880065917968,
331
+ "mean_token_accuracy": 0.9178367125988006,
332
+ "num_tokens": 3386033.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "epoch": 4.0,
337
+ "eval_entropy": 0.34304031178355215,
338
+ "eval_loss": 0.5295785665512085,
339
+ "eval_mean_token_accuracy": 0.8698753178119659,
340
+ "eval_num_tokens": 3488988.0,
341
+ "eval_runtime": 96.3616,
342
+ "eval_samples_per_second": 16.594,
343
+ "eval_steps_per_second": 2.076,
344
+ "step": 1496
345
+ },
346
+ {
347
+ "entropy": 0.27893446536377225,
348
+ "epoch": 4.010709504685408,
349
+ "grad_norm": 1.545491337776184,
350
+ "learning_rate": 9.30742431881587e-05,
351
+ "loss": 1.0577442169189453,
352
+ "mean_token_accuracy": 0.9191552999645772,
353
+ "num_tokens": 3498406.0,
354
+ "step": 1500
355
+ },
356
+ {
357
+ "entropy": 0.19769302535802125,
358
+ "epoch": 4.144578313253012,
359
+ "grad_norm": 2.10296893119812,
360
+ "learning_rate": 9.052568051799083e-05,
361
+ "loss": 0.7461458587646485,
362
+ "mean_token_accuracy": 0.9415343621373177,
363
+ "num_tokens": 3614301.0,
364
+ "step": 1550
365
+ },
366
+ {
367
+ "entropy": 0.1981763695180416,
368
+ "epoch": 4.278447121820616,
369
+ "grad_norm": 2.067410945892334,
370
+ "learning_rate": 8.791533352632524e-05,
371
+ "loss": 0.7580889892578125,
372
+ "mean_token_accuracy": 0.9396374526619912,
373
+ "num_tokens": 3735705.0,
374
+ "step": 1600
375
+ },
376
+ {
377
+ "entropy": 0.19850988369435071,
378
+ "epoch": 4.412315930388219,
379
+ "grad_norm": 1.9034850597381592,
380
+ "learning_rate": 8.524888591065258e-05,
381
+ "loss": 0.7526986694335938,
382
+ "mean_token_accuracy": 0.9402479353547096,
383
+ "num_tokens": 3854287.0,
384
+ "step": 1650
385
+ },
386
+ {
387
+ "entropy": 0.19905407220125199,
388
+ "epoch": 4.546184738955823,
389
+ "grad_norm": 2.1477949619293213,
390
+ "learning_rate": 8.253214352041379e-05,
391
+ "loss": 0.7603612518310547,
392
+ "mean_token_accuracy": 0.9396576225757599,
393
+ "num_tokens": 3967362.0,
394
+ "step": 1700
395
+ },
396
+ {
397
+ "entropy": 0.20251497332006693,
398
+ "epoch": 4.680053547523427,
399
+ "grad_norm": 1.5489246845245361,
400
+ "learning_rate": 7.97710217155036e-05,
401
+ "loss": 0.7711930084228515,
402
+ "mean_token_accuracy": 0.9400961664319039,
403
+ "num_tokens": 4081441.0,
404
+ "step": 1750
405
+ },
406
+ {
407
+ "entropy": 0.1991352306306362,
408
+ "epoch": 4.813922356091031,
409
+ "grad_norm": 1.969994068145752,
410
+ "learning_rate": 7.697153248632946e-05,
411
+ "loss": 0.7681967163085938,
412
+ "mean_token_accuracy": 0.9399621617794037,
413
+ "num_tokens": 4197604.0,
414
+ "step": 1800
415
+ },
416
+ {
417
+ "entropy": 0.20229352474212647,
418
+ "epoch": 4.947791164658635,
419
+ "grad_norm": 2.2329719066619873,
420
+ "learning_rate": 7.41397713634694e-05,
421
+ "loss": 0.7733911895751953,
422
+ "mean_token_accuracy": 0.9396535342931748,
423
+ "num_tokens": 4318894.0,
424
+ "step": 1850
425
+ },
426
+ {
427
+ "epoch": 5.0,
428
+ "eval_entropy": 0.270584502145648,
429
+ "eval_loss": 0.6255385875701904,
430
+ "eval_mean_token_accuracy": 0.8687835082411766,
431
+ "eval_num_tokens": 4361235.0,
432
+ "eval_runtime": 96.6331,
433
+ "eval_samples_per_second": 16.547,
434
+ "eval_steps_per_second": 2.07,
435
+ "step": 1870
436
+ },
437
+ {
438
+ "entropy": 0.16372355209155517,
439
+ "epoch": 5.080321285140562,
440
+ "grad_norm": 8.029130935668945,
441
+ "learning_rate": 7.128190414543193e-05,
442
+ "loss": 0.6145073699951172,
443
+ "mean_token_accuracy": 0.9516371590922578,
444
+ "num_tokens": 4434412.0,
445
+ "step": 1900
446
+ },
447
+ {
448
+ "entropy": 0.14057113960385323,
449
+ "epoch": 5.214190093708166,
450
+ "grad_norm": 2.23626446723938,
451
+ "learning_rate": 6.840415347341672e-05,
452
+ "loss": 0.5295140075683594,
453
+ "mean_token_accuracy": 0.9593333688378334,
454
+ "num_tokens": 4548703.0,
455
+ "step": 1950
456
+ },
457
+ {
458
+ "entropy": 0.14139273861423135,
459
+ "epoch": 5.34805890227577,
460
+ "grad_norm": 2.0157318115234375,
461
+ "learning_rate": 6.551278528230729e-05,
462
+ "loss": 0.5296827697753906,
463
+ "mean_token_accuracy": 0.9590813705325126,
464
+ "num_tokens": 4665542.0,
465
+ "step": 2000
466
+ },
467
+ {
468
+ "entropy": 0.14537794288247824,
469
+ "epoch": 5.481927710843373,
470
+ "grad_norm": 1.5371013879776,
471
+ "learning_rate": 6.261409515739736e-05,
472
+ "loss": 0.5478645706176758,
473
+ "mean_token_accuracy": 0.9577724316716194,
474
+ "num_tokens": 4778075.0,
475
+ "step": 2050
476
+ },
477
+ {
478
+ "entropy": 0.14534839443862438,
479
+ "epoch": 5.615796519410977,
480
+ "grad_norm": 2.0134589672088623,
481
+ "learning_rate": 5.971439462655727e-05,
482
+ "loss": 0.5426230239868164,
483
+ "mean_token_accuracy": 0.9581041479110718,
484
+ "num_tokens": 4897453.0,
485
+ "step": 2100
486
+ },
487
+ {
488
+ "entropy": 0.14614912170916797,
489
+ "epoch": 5.749665327978581,
490
+ "grad_norm": 1.286437749862671,
491
+ "learning_rate": 5.6819997417687274e-05,
492
+ "loss": 0.5487421798706055,
493
+ "mean_token_accuracy": 0.9563529288768768,
494
+ "num_tokens": 5012767.0,
495
+ "step": 2150
496
+ },
497
+ {
498
+ "entropy": 0.13987606402486563,
499
+ "epoch": 5.883534136546185,
500
+ "grad_norm": 1.7586702108383179,
501
+ "learning_rate": 5.393720571138079e-05,
502
+ "loss": 0.5254617309570313,
503
+ "mean_token_accuracy": 0.9590577334165573,
504
+ "num_tokens": 5129878.0,
505
+ "step": 2200
506
+ },
507
+ {
508
+ "epoch": 6.0,
509
+ "eval_entropy": 0.2240281231701374,
510
+ "eval_loss": 0.7485206723213196,
511
+ "eval_mean_token_accuracy": 0.8668996468186378,
512
+ "eval_num_tokens": 5233482.0,
513
+ "eval_runtime": 96.4089,
514
+ "eval_samples_per_second": 16.586,
515
+ "eval_steps_per_second": 2.074,
516
+ "step": 2244
517
+ }
518
+ ],
519
+ "logging_steps": 50,
520
+ "max_steps": 3740,
521
+ "num_input_tokens_seen": 0,
522
+ "num_train_epochs": 10,
523
+ "save_steps": 500,
524
+ "stateful_callbacks": {
525
+ "TrainerControl": {
526
+ "args": {
527
+ "should_epoch_stop": false,
528
+ "should_evaluate": false,
529
+ "should_log": false,
530
+ "should_save": true,
531
+ "should_training_stop": false
532
+ },
533
+ "attributes": {}
534
+ }
535
+ },
536
+ "total_flos": 1.7914914724245857e+18,
537
+ "train_batch_size": 4,
538
+ "trial_name": null,
539
+ "trial_params": null
540
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.015034304668777832,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 64,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2618/trainer_state.json ADDED
@@ -0,0 +1,631 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 7.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2618,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 1.3355020767450332,
14
+ "epoch": 0.13386880856760375,
15
+ "grad_norm": 3.2956597805023193,
16
+ "learning_rate": 1.628530639938585e-05,
17
+ "loss": 5.349910278320312,
18
+ "mean_token_accuracy": 0.7383818039298058,
19
+ "num_tokens": 116199.0,
20
+ "step": 50
21
+ },
22
+ {
23
+ "entropy": 0.5958842460811138,
24
+ "epoch": 0.2677376171352075,
25
+ "grad_norm": 2.5947492122650146,
26
+ "learning_rate": 3.290296599059591e-05,
27
+ "loss": 2.312855072021484,
28
+ "mean_token_accuracy": 0.8520967712998391,
29
+ "num_tokens": 232864.0,
30
+ "step": 100
31
+ },
32
+ {
33
+ "entropy": 0.5190362003445625,
34
+ "epoch": 0.40160642570281124,
35
+ "grad_norm": 1.5038394927978516,
36
+ "learning_rate": 4.9520625581805955e-05,
37
+ "loss": 2.0574468994140624,
38
+ "mean_token_accuracy": 0.8657039344310761,
39
+ "num_tokens": 352382.0,
40
+ "step": 150
41
+ },
42
+ {
43
+ "entropy": 0.4922871346771717,
44
+ "epoch": 0.535475234270415,
45
+ "grad_norm": 1.645923137664795,
46
+ "learning_rate": 6.613828517301602e-05,
47
+ "loss": 1.916438446044922,
48
+ "mean_token_accuracy": 0.8717759534716606,
49
+ "num_tokens": 474532.0,
50
+ "step": 200
51
+ },
52
+ {
53
+ "entropy": 0.491110111027956,
54
+ "epoch": 0.6693440428380187,
55
+ "grad_norm": 1.866817593574524,
56
+ "learning_rate": 8.275594476422607e-05,
57
+ "loss": 1.9421713256835937,
58
+ "mean_token_accuracy": 0.8710730043053627,
59
+ "num_tokens": 589198.0,
60
+ "step": 250
61
+ },
62
+ {
63
+ "entropy": 0.47134352535009383,
64
+ "epoch": 0.8032128514056225,
65
+ "grad_norm": 117.62409210205078,
66
+ "learning_rate": 9.937360435543611e-05,
67
+ "loss": 1.9768324279785157,
68
+ "mean_token_accuracy": 0.8741078078746796,
69
+ "num_tokens": 707057.0,
70
+ "step": 300
71
+ },
72
+ {
73
+ "entropy": 0.4820582258701325,
74
+ "epoch": 0.9370816599732262,
75
+ "grad_norm": 2.3274827003479004,
76
+ "learning_rate": 0.00011599126394664616,
77
+ "loss": 2.2025875854492187,
78
+ "mean_token_accuracy": 0.8697148504853248,
79
+ "num_tokens": 822888.0,
80
+ "step": 350
81
+ },
82
+ {
83
+ "epoch": 1.0,
84
+ "eval_entropy": 0.5010400542616844,
85
+ "eval_loss": 0.5114277601242065,
86
+ "eval_mean_token_accuracy": 0.8587275749444961,
87
+ "eval_num_tokens": 872247.0,
88
+ "eval_runtime": 96.5515,
89
+ "eval_samples_per_second": 16.561,
90
+ "eval_steps_per_second": 2.071,
91
+ "step": 374
92
+ },
93
+ {
94
+ "entropy": 0.4708875769918615,
95
+ "epoch": 1.069611780455154,
96
+ "grad_norm": 3.3712940216064453,
97
+ "learning_rate": 0.00012428317596508976,
98
+ "loss": 1.83294189453125,
99
+ "mean_token_accuracy": 0.8772370366737096,
100
+ "num_tokens": 929365.0,
101
+ "step": 400
102
+ },
103
+ {
104
+ "entropy": 0.44804590195417404,
105
+ "epoch": 1.2034805890227578,
106
+ "grad_norm": 1.4833389520645142,
107
+ "learning_rate": 0.00012414788900475706,
108
+ "loss": 1.7768891906738282,
109
+ "mean_token_accuracy": 0.8791097947955131,
110
+ "num_tokens": 1046629.0,
111
+ "step": 450
112
+ },
113
+ {
114
+ "entropy": 0.4510513086616993,
115
+ "epoch": 1.3373493975903614,
116
+ "grad_norm": 2.814790964126587,
117
+ "learning_rate": 0.00012387760965418496,
118
+ "loss": 1.7745071411132813,
119
+ "mean_token_accuracy": 0.8813075706362724,
120
+ "num_tokens": 1165744.0,
121
+ "step": 500
122
+ },
123
+ {
124
+ "entropy": 0.4479117552936077,
125
+ "epoch": 1.4712182061579653,
126
+ "grad_norm": 1.855610728263855,
127
+ "learning_rate": 0.00012347292641217135,
128
+ "loss": 1.7583291625976563,
129
+ "mean_token_accuracy": 0.8815277495980263,
130
+ "num_tokens": 1284843.0,
131
+ "step": 550
132
+ },
133
+ {
134
+ "entropy": 0.4380264139175415,
135
+ "epoch": 1.605087014725569,
136
+ "grad_norm": 1.383190631866455,
137
+ "learning_rate": 0.00012293472042483757,
138
+ "loss": 1.7229583740234375,
139
+ "mean_token_accuracy": 0.8832098203897476,
140
+ "num_tokens": 1406485.0,
141
+ "step": 600
142
+ },
143
+ {
144
+ "entropy": 0.4342571949958801,
145
+ "epoch": 1.7389558232931726,
146
+ "grad_norm": 1.4977834224700928,
147
+ "learning_rate": 0.00012226416356704526,
148
+ "loss": 1.7174737548828125,
149
+ "mean_token_accuracy": 0.8834967383742333,
150
+ "num_tokens": 1525460.0,
151
+ "step": 650
152
+ },
153
+ {
154
+ "entropy": 0.42700962007045745,
155
+ "epoch": 1.8728246318607764,
156
+ "grad_norm": 1.6156537532806396,
157
+ "learning_rate": 0.00012146271589078838,
158
+ "loss": 1.682061767578125,
159
+ "mean_token_accuracy": 0.8858474844694137,
160
+ "num_tokens": 1638984.0,
161
+ "step": 700
162
+ },
163
+ {
164
+ "epoch": 2.0,
165
+ "eval_entropy": 0.4838937771320343,
166
+ "eval_loss": 0.4826815128326416,
167
+ "eval_mean_token_accuracy": 0.8682844692468643,
168
+ "eval_num_tokens": 1744494.0,
169
+ "eval_runtime": 96.5071,
170
+ "eval_samples_per_second": 16.569,
171
+ "eval_steps_per_second": 2.072,
172
+ "step": 748
173
+ },
174
+ {
175
+ "entropy": 0.4378527848407476,
176
+ "epoch": 2.005354752342704,
177
+ "grad_norm": 1.400229573249817,
178
+ "learning_rate": 0.0001205321224461161,
179
+ "loss": 1.7096096801757812,
180
+ "mean_token_accuracy": 0.8838462468349573,
181
+ "num_tokens": 1749755.0,
182
+ "step": 750
183
+ },
184
+ {
185
+ "entropy": 0.3559799794852734,
186
+ "epoch": 2.139223560910308,
187
+ "grad_norm": 1.7168083190917969,
188
+ "learning_rate": 0.0001194744094815093,
189
+ "loss": 1.3893603515625,
190
+ "mean_token_accuracy": 0.9004731178283691,
191
+ "num_tokens": 1868231.0,
192
+ "step": 800
193
+ },
194
+ {
195
+ "entropy": 0.3671448823064566,
196
+ "epoch": 2.2730923694779115,
197
+ "grad_norm": 1.9720135927200317,
198
+ "learning_rate": 0.00011829188003198282,
199
+ "loss": 1.429988555908203,
200
+ "mean_token_accuracy": 0.8970818132162094,
201
+ "num_tokens": 1979116.0,
202
+ "step": 850
203
+ },
204
+ {
205
+ "entropy": 0.3597494306415319,
206
+ "epoch": 2.4069611780455156,
207
+ "grad_norm": 1.4947372674942017,
208
+ "learning_rate": 0.00011698710890452068,
209
+ "loss": 1.418173828125,
210
+ "mean_token_accuracy": 0.8994651186466217,
211
+ "num_tokens": 2094539.0,
212
+ "step": 900
213
+ },
214
+ {
215
+ "entropy": 0.36254502907395364,
216
+ "epoch": 2.540829986613119,
217
+ "grad_norm": 1.6768454313278198,
218
+ "learning_rate": 0.00011556293707176242,
219
+ "loss": 1.4158590698242188,
220
+ "mean_token_accuracy": 0.8995477721095085,
221
+ "num_tokens": 2209415.0,
222
+ "step": 950
223
+ },
224
+ {
225
+ "entropy": 0.36290778368711474,
226
+ "epoch": 2.674698795180723,
227
+ "grad_norm": 1.6033697128295898,
228
+ "learning_rate": 0.00011402246548614765,
229
+ "loss": 1.4300469970703125,
230
+ "mean_token_accuracy": 0.8986452376842499,
231
+ "num_tokens": 2324269.0,
232
+ "step": 1000
233
+ },
234
+ {
235
+ "entropy": 0.3635872249305248,
236
+ "epoch": 2.8085676037483265,
237
+ "grad_norm": 1.546893835067749,
238
+ "learning_rate": 0.00011236904832798785,
239
+ "loss": 1.42587646484375,
240
+ "mean_token_accuracy": 0.9003903394937516,
241
+ "num_tokens": 2447336.0,
242
+ "step": 1050
243
+ },
244
+ {
245
+ "entropy": 0.36871150620281695,
246
+ "epoch": 2.9424364123159306,
247
+ "grad_norm": 1.2951405048370361,
248
+ "learning_rate": 0.0001106062857021667,
249
+ "loss": 1.448046875,
250
+ "mean_token_accuracy": 0.8967258337140084,
251
+ "num_tokens": 2565837.0,
252
+ "step": 1100
253
+ },
254
+ {
255
+ "epoch": 3.0,
256
+ "eval_entropy": 0.4225208269059658,
257
+ "eval_loss": 0.489418089389801,
258
+ "eval_mean_token_accuracy": 0.8697815361618996,
259
+ "eval_num_tokens": 2616741.0,
260
+ "eval_runtime": 96.4058,
261
+ "eval_samples_per_second": 16.586,
262
+ "eval_steps_per_second": 2.075,
263
+ "step": 1122
264
+ },
265
+ {
266
+ "entropy": 0.3120347365285411,
267
+ "epoch": 3.074966532797858,
268
+ "grad_norm": 1.639520287513733,
269
+ "learning_rate": 0.00010873801579937106,
270
+ "loss": 1.1941973876953125,
271
+ "mean_token_accuracy": 0.9117801315856703,
272
+ "num_tokens": 2685975.0,
273
+ "step": 1150
274
+ },
275
+ {
276
+ "entropy": 0.28257040068507194,
277
+ "epoch": 3.208835341365462,
278
+ "grad_norm": 1.7459681034088135,
279
+ "learning_rate": 0.00010676830653892058,
280
+ "loss": 1.0850601196289062,
281
+ "mean_token_accuracy": 0.9177472350001336,
282
+ "num_tokens": 2798277.0,
283
+ "step": 1200
284
+ },
285
+ {
286
+ "entropy": 0.27802520349621773,
287
+ "epoch": 3.3427041499330654,
288
+ "grad_norm": 1.5176103115081787,
289
+ "learning_rate": 0.00010470144671139238,
290
+ "loss": 1.0840838623046876,
291
+ "mean_token_accuracy": 0.9179763168096542,
292
+ "num_tokens": 2918973.0,
293
+ "step": 1250
294
+ },
295
+ {
296
+ "entropy": 0.280417420566082,
297
+ "epoch": 3.4765729585006695,
298
+ "grad_norm": 1.3774974346160889,
299
+ "learning_rate": 0.00010254193664032686,
300
+ "loss": 1.0911756896972655,
301
+ "mean_token_accuracy": 0.9162956389784813,
302
+ "num_tokens": 3039073.0,
303
+ "step": 1300
304
+ },
305
+ {
306
+ "entropy": 0.2834589210152626,
307
+ "epoch": 3.610441767068273,
308
+ "grad_norm": 1.5929396152496338,
309
+ "learning_rate": 0.00010029447838334742,
310
+ "loss": 1.0985262298583984,
311
+ "mean_token_accuracy": 0.9174074530601501,
312
+ "num_tokens": 3153710.0,
313
+ "step": 1350
314
+ },
315
+ {
316
+ "entropy": 0.282296127229929,
317
+ "epoch": 3.7443105756358768,
318
+ "grad_norm": 1.50350022315979,
319
+ "learning_rate": 9.796396549403e-05,
320
+ "loss": 1.101386260986328,
321
+ "mean_token_accuracy": 0.9168545073270797,
322
+ "num_tokens": 3263594.0,
323
+ "step": 1400
324
+ },
325
+ {
326
+ "entropy": 0.279728781580925,
327
+ "epoch": 3.878179384203481,
328
+ "grad_norm": 1.4728187322616577,
329
+ "learning_rate": 9.555547236681456e-05,
330
+ "loss": 1.0859880065917968,
331
+ "mean_token_accuracy": 0.9178367125988006,
332
+ "num_tokens": 3386033.0,
333
+ "step": 1450
334
+ },
335
+ {
336
+ "epoch": 4.0,
337
+ "eval_entropy": 0.34304031178355215,
338
+ "eval_loss": 0.5295785665512085,
339
+ "eval_mean_token_accuracy": 0.8698753178119659,
340
+ "eval_num_tokens": 3488988.0,
341
+ "eval_runtime": 96.3616,
342
+ "eval_samples_per_second": 16.594,
343
+ "eval_steps_per_second": 2.076,
344
+ "step": 1496
345
+ },
346
+ {
347
+ "entropy": 0.27893446536377225,
348
+ "epoch": 4.010709504685408,
349
+ "grad_norm": 1.545491337776184,
350
+ "learning_rate": 9.30742431881587e-05,
351
+ "loss": 1.0577442169189453,
352
+ "mean_token_accuracy": 0.9191552999645772,
353
+ "num_tokens": 3498406.0,
354
+ "step": 1500
355
+ },
356
+ {
357
+ "entropy": 0.19769302535802125,
358
+ "epoch": 4.144578313253012,
359
+ "grad_norm": 2.10296893119812,
360
+ "learning_rate": 9.052568051799083e-05,
361
+ "loss": 0.7461458587646485,
362
+ "mean_token_accuracy": 0.9415343621373177,
363
+ "num_tokens": 3614301.0,
364
+ "step": 1550
365
+ },
366
+ {
367
+ "entropy": 0.1981763695180416,
368
+ "epoch": 4.278447121820616,
369
+ "grad_norm": 2.067410945892334,
370
+ "learning_rate": 8.791533352632524e-05,
371
+ "loss": 0.7580889892578125,
372
+ "mean_token_accuracy": 0.9396374526619912,
373
+ "num_tokens": 3735705.0,
374
+ "step": 1600
375
+ },
376
+ {
377
+ "entropy": 0.19850988369435071,
378
+ "epoch": 4.412315930388219,
379
+ "grad_norm": 1.9034850597381592,
380
+ "learning_rate": 8.524888591065258e-05,
381
+ "loss": 0.7526986694335938,
382
+ "mean_token_accuracy": 0.9402479353547096,
383
+ "num_tokens": 3854287.0,
384
+ "step": 1650
385
+ },
386
+ {
387
+ "entropy": 0.19905407220125199,
388
+ "epoch": 4.546184738955823,
389
+ "grad_norm": 2.1477949619293213,
390
+ "learning_rate": 8.253214352041379e-05,
391
+ "loss": 0.7603612518310547,
392
+ "mean_token_accuracy": 0.9396576225757599,
393
+ "num_tokens": 3967362.0,
394
+ "step": 1700
395
+ },
396
+ {
397
+ "entropy": 0.20251497332006693,
398
+ "epoch": 4.680053547523427,
399
+ "grad_norm": 1.5489246845245361,
400
+ "learning_rate": 7.97710217155036e-05,
401
+ "loss": 0.7711930084228515,
402
+ "mean_token_accuracy": 0.9400961664319039,
403
+ "num_tokens": 4081441.0,
404
+ "step": 1750
405
+ },
406
+ {
407
+ "entropy": 0.1991352306306362,
408
+ "epoch": 4.813922356091031,
409
+ "grad_norm": 1.969994068145752,
410
+ "learning_rate": 7.697153248632946e-05,
411
+ "loss": 0.7681967163085938,
412
+ "mean_token_accuracy": 0.9399621617794037,
413
+ "num_tokens": 4197604.0,
414
+ "step": 1800
415
+ },
416
+ {
417
+ "entropy": 0.20229352474212647,
418
+ "epoch": 4.947791164658635,
419
+ "grad_norm": 2.2329719066619873,
420
+ "learning_rate": 7.41397713634694e-05,
421
+ "loss": 0.7733911895751953,
422
+ "mean_token_accuracy": 0.9396535342931748,
423
+ "num_tokens": 4318894.0,
424
+ "step": 1850
425
+ },
426
+ {
427
+ "epoch": 5.0,
428
+ "eval_entropy": 0.270584502145648,
429
+ "eval_loss": 0.6255385875701904,
430
+ "eval_mean_token_accuracy": 0.8687835082411766,
431
+ "eval_num_tokens": 4361235.0,
432
+ "eval_runtime": 96.6331,
433
+ "eval_samples_per_second": 16.547,
434
+ "eval_steps_per_second": 2.07,
435
+ "step": 1870
436
+ },
437
+ {
438
+ "entropy": 0.16372355209155517,
439
+ "epoch": 5.080321285140562,
440
+ "grad_norm": 8.029130935668945,
441
+ "learning_rate": 7.128190414543193e-05,
442
+ "loss": 0.6145073699951172,
443
+ "mean_token_accuracy": 0.9516371590922578,
444
+ "num_tokens": 4434412.0,
445
+ "step": 1900
446
+ },
447
+ {
448
+ "entropy": 0.14057113960385323,
449
+ "epoch": 5.214190093708166,
450
+ "grad_norm": 2.23626446723938,
451
+ "learning_rate": 6.840415347341672e-05,
452
+ "loss": 0.5295140075683594,
453
+ "mean_token_accuracy": 0.9593333688378334,
454
+ "num_tokens": 4548703.0,
455
+ "step": 1950
456
+ },
457
+ {
458
+ "entropy": 0.14139273861423135,
459
+ "epoch": 5.34805890227577,
460
+ "grad_norm": 2.0157318115234375,
461
+ "learning_rate": 6.551278528230729e-05,
462
+ "loss": 0.5296827697753906,
463
+ "mean_token_accuracy": 0.9590813705325126,
464
+ "num_tokens": 4665542.0,
465
+ "step": 2000
466
+ },
467
+ {
468
+ "entropy": 0.14537794288247824,
469
+ "epoch": 5.481927710843373,
470
+ "grad_norm": 1.5371013879776,
471
+ "learning_rate": 6.261409515739736e-05,
472
+ "loss": 0.5478645706176758,
473
+ "mean_token_accuracy": 0.9577724316716194,
474
+ "num_tokens": 4778075.0,
475
+ "step": 2050
476
+ },
477
+ {
478
+ "entropy": 0.14534839443862438,
479
+ "epoch": 5.615796519410977,
480
+ "grad_norm": 2.0134589672088623,
481
+ "learning_rate": 5.971439462655727e-05,
482
+ "loss": 0.5426230239868164,
483
+ "mean_token_accuracy": 0.9581041479110718,
484
+ "num_tokens": 4897453.0,
485
+ "step": 2100
486
+ },
487
+ {
488
+ "entropy": 0.14614912170916797,
489
+ "epoch": 5.749665327978581,
490
+ "grad_norm": 1.286437749862671,
491
+ "learning_rate": 5.6819997417687274e-05,
492
+ "loss": 0.5487421798706055,
493
+ "mean_token_accuracy": 0.9563529288768768,
494
+ "num_tokens": 5012767.0,
495
+ "step": 2150
496
+ },
497
+ {
498
+ "entropy": 0.13987606402486563,
499
+ "epoch": 5.883534136546185,
500
+ "grad_norm": 1.7586702108383179,
501
+ "learning_rate": 5.393720571138079e-05,
502
+ "loss": 0.5254617309570313,
503
+ "mean_token_accuracy": 0.9590577334165573,
504
+ "num_tokens": 5129878.0,
505
+ "step": 2200
506
+ },
507
+ {
508
+ "epoch": 6.0,
509
+ "eval_entropy": 0.2240281231701374,
510
+ "eval_loss": 0.7485206723213196,
511
+ "eval_mean_token_accuracy": 0.8668996468186378,
512
+ "eval_num_tokens": 5233482.0,
513
+ "eval_runtime": 96.4089,
514
+ "eval_samples_per_second": 16.586,
515
+ "eval_steps_per_second": 2.074,
516
+ "step": 2244
517
+ },
518
+ {
519
+ "entropy": 0.1413771447283451,
520
+ "epoch": 6.016064257028113,
521
+ "grad_norm": 1.2926467657089233,
522
+ "learning_rate": 5.1072296418730254e-05,
523
+ "loss": 0.5202234649658203,
524
+ "mean_token_accuracy": 0.9594009392189257,
525
+ "num_tokens": 5246734.0,
526
+ "step": 2250
527
+ },
528
+ {
529
+ "entropy": 0.1042403375543654,
530
+ "epoch": 6.149933065595716,
531
+ "grad_norm": 1.9540276527404785,
532
+ "learning_rate": 4.8231507514154216e-05,
533
+ "loss": 0.39597846984863283,
534
+ "mean_token_accuracy": 0.9706364983320236,
535
+ "num_tokens": 5366334.0,
536
+ "step": 2300
537
+ },
538
+ {
539
+ "entropy": 0.10351455600932241,
540
+ "epoch": 6.28380187416332,
541
+ "grad_norm": 2.139054775238037,
542
+ "learning_rate": 4.542102445300397e-05,
543
+ "loss": 0.38731266021728517,
544
+ "mean_token_accuracy": 0.9703371664881706,
545
+ "num_tokens": 5487013.0,
546
+ "step": 2350
547
+ },
548
+ {
549
+ "entropy": 0.11232182893902064,
550
+ "epoch": 6.417670682730924,
551
+ "grad_norm": 1.6526401042938232,
552
+ "learning_rate": 4.264696670352381e-05,
553
+ "loss": 0.42091716766357423,
554
+ "mean_token_accuracy": 0.9684987756609916,
555
+ "num_tokens": 5599415.0,
556
+ "step": 2400
557
+ },
558
+ {
559
+ "entropy": 0.10796859875321388,
560
+ "epoch": 6.551539491298527,
561
+ "grad_norm": 1.297956109046936,
562
+ "learning_rate": 3.9915374422489785e-05,
563
+ "loss": 0.40640792846679685,
564
+ "mean_token_accuracy": 0.9703203043341637,
565
+ "num_tokens": 5718099.0,
566
+ "step": 2450
567
+ },
568
+ {
569
+ "entropy": 0.10999857917428017,
570
+ "epoch": 6.685408299866131,
571
+ "grad_norm": 1.5105161666870117,
572
+ "learning_rate": 3.723219530353909e-05,
573
+ "loss": 0.4118352508544922,
574
+ "mean_token_accuracy": 0.9697986772656441,
575
+ "num_tokens": 5833902.0,
576
+ "step": 2500
577
+ },
578
+ {
579
+ "entropy": 0.11099046738818288,
580
+ "epoch": 6.8192771084337345,
581
+ "grad_norm": 1.8809560537338257,
582
+ "learning_rate": 3.460327162682602e-05,
583
+ "loss": 0.41624794006347654,
584
+ "mean_token_accuracy": 0.9690032437443733,
585
+ "num_tokens": 5948132.0,
586
+ "step": 2550
587
+ },
588
+ {
589
+ "entropy": 0.11062245365232229,
590
+ "epoch": 6.953145917001339,
591
+ "grad_norm": 1.0219827890396118,
592
+ "learning_rate": 3.2034327538202464e-05,
593
+ "loss": 0.41484325408935546,
594
+ "mean_token_accuracy": 0.9690453514456749,
595
+ "num_tokens": 6066224.0,
596
+ "step": 2600
597
+ },
598
+ {
599
+ "epoch": 7.0,
600
+ "eval_entropy": 0.18908375523984433,
601
+ "eval_loss": 0.8491571545600891,
602
+ "eval_mean_token_accuracy": 0.8642131051421166,
603
+ "eval_num_tokens": 6105729.0,
604
+ "eval_runtime": 96.4633,
605
+ "eval_samples_per_second": 16.576,
606
+ "eval_steps_per_second": 2.073,
607
+ "step": 2618
608
+ }
609
+ ],
610
+ "logging_steps": 50,
611
+ "max_steps": 3740,
612
+ "num_input_tokens_seen": 0,
613
+ "num_train_epochs": 10,
614
+ "save_steps": 500,
615
+ "stateful_callbacks": {
616
+ "TrainerControl": {
617
+ "args": {
618
+ "should_epoch_stop": false,
619
+ "should_evaluate": false,
620
+ "should_log": false,
621
+ "should_save": true,
622
+ "should_training_stop": false
623
+ },
624
+ "attributes": {}
625
+ }
626
+ },
627
+ "total_flos": 2.0923154774653926e+18,
628
+ "train_batch_size": 4,
629
+ "trial_name": null,
630
+ "trial_params": null
631
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-4-31B
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-4-31B
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.19.1
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-4-31B",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.015034304668777832,
22
+ "lora_ga_config": null,
23
+ "megatron_config": null,
24
+ "megatron_core": "megatron.core",
25
+ "modules_to_save": null,
26
+ "peft_type": "LORA",
27
+ "peft_version": "0.19.1",
28
+ "qalora_group_size": 16,
29
+ "r": 64,
30
+ "rank_pattern": {},
31
+ "revision": null,
32
+ "target_modules": ".*language_model.*\\.(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)$",
33
+ "target_parameters": null,
34
+ "task_type": "CAUSAL_LM",
35
+ "trainable_token_indices": null,
36
+ "use_bdlora": null,
37
+ "use_dora": false,
38
+ "use_qalora": false,
39
+ "use_rslora": false
40
+ }
DBCA_original_Swedish/gemma-4-31B_original_features_structural_train_original_features_structural_test1/checkpoint-2992/tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<eos>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": false,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "left",
45
+ "processor_class": "Gemma4Processor",
46
+ "soc_token": "<|channel>",
47
+ "sot_token": "<|turn>",
48
+ "stc_token": "<|tool_call>",
49
+ "std_token": "<|tool>",
50
+ "str_token": "<|tool_response>",
51
+ "think_token": "<|think|>",
52
+ "tokenizer_class": "GemmaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }