modrill committed on
Commit
4adcd3e
·
verified ·
1 Parent(s): 1290a65

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. README.md +54 -9
  3. adapter_config.json +46 -0
  4. adapter_model.safetensors +3 -0
  5. all_results.json +8 -0
  6. chat_template.jinja +47 -0
  7. checkpoint-1400/README.md +208 -0
  8. checkpoint-1400/adapter_config.json +46 -0
  9. checkpoint-1400/chat_template.jinja +47 -0
  10. checkpoint-1400/rng_state.pth +3 -0
  11. checkpoint-1400/scheduler.pt +3 -0
  12. checkpoint-1400/tokenizer_config.json +26 -0
  13. checkpoint-1400/trainer_state.json +1014 -0
  14. checkpoint-1400/training_args.bin +3 -0
  15. checkpoint-1500/README.md +208 -0
  16. checkpoint-1500/adapter_config.json +46 -0
  17. checkpoint-1500/chat_template.jinja +47 -0
  18. checkpoint-1500/optimizer.pt +3 -0
  19. checkpoint-1500/scheduler.pt +3 -0
  20. checkpoint-1500/tokenizer_config.json +26 -0
  21. checkpoint-1500/trainer_state.json +1084 -0
  22. checkpoint-1600/README.md +208 -0
  23. checkpoint-1600/adapter_config.json +46 -0
  24. checkpoint-1600/chat_template.jinja +47 -0
  25. checkpoint-1600/optimizer.pt +3 -0
  26. checkpoint-1600/rng_state.pth +3 -0
  27. checkpoint-1600/scheduler.pt +3 -0
  28. checkpoint-1600/tokenizer_config.json +26 -0
  29. checkpoint-1600/trainer_state.json +1154 -0
  30. checkpoint-1600/training_args.bin +3 -0
  31. checkpoint-1700/README.md +208 -0
  32. checkpoint-1700/adapter_config.json +46 -0
  33. checkpoint-1700/chat_template.jinja +47 -0
  34. checkpoint-1700/optimizer.pt +3 -0
  35. checkpoint-1700/tokenizer_config.json +26 -0
  36. checkpoint-1700/trainer_state.json +1224 -0
  37. checkpoint-1782/README.md +208 -0
  38. checkpoint-1782/adapter_config.json +46 -0
  39. checkpoint-1782/adapter_model.safetensors +3 -0
  40. checkpoint-1782/chat_template.jinja +47 -0
  41. checkpoint-1782/optimizer.pt +3 -0
  42. checkpoint-1782/tokenizer.json +3 -0
  43. checkpoint-1782/tokenizer_config.json +26 -0
  44. checkpoint-1782/trainer_state.json +1280 -0
  45. tokenizer.json +3 -0
  46. tokenizer_config.json +26 -0
  47. train_results.json +8 -0
  48. trainer_log.jsonl +179 -0
  49. trainer_state.json +1289 -0
  50. training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-1782/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,16 +1,61 @@
1
  ---
2
- license: cc-by-nc-4.0
 
 
3
  tags:
4
- - qwen3
5
- - mhm
6
- - text-generation
7
- library_name: transformers
 
 
 
 
8
  ---
9
 
 
 
 
10
  # opencodeinst_5k_sft
11
 
12
- Auto-uploaded by watcher.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- - Source path: `LlamaFactory/models/opencodeinst_5k_sft`
15
- - Uploaded at: `2026-05-20T06:02:49.311145`
16
- - Visibility: `public`
 
 
 
1
  ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: google/gemma-3-1b-it
5
  tags:
6
+ - base_model:adapter:google/gemma-3-1b-it
7
+ - llama-factory
8
+ - lora
9
+ - transformers
10
+ pipeline_tag: text-generation
11
+ model-index:
12
+ - name: opencodeinst_5k_sft
13
+ results: []
14
  ---
15
 
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
  # opencodeinst_5k_sft
20
 
21
+ This model is a fine-tuned version of [google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it) on the opencodeinst_5k dataset.
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 1e-06
41
+ - train_batch_size: 1
42
+ - eval_batch_size: 4
43
+ - seed: 42
44
+ - gradient_accumulation_steps: 8
45
+ - total_train_batch_size: 8
46
+ - optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
47
+ - lr_scheduler_type: cosine
48
+ - lr_scheduler_warmup_steps: 0.1
49
+ - num_epochs: 3.0
50
+
51
+ ### Training results
52
+
53
+
54
+
55
+ ### Framework versions
56
 
57
+ - PEFT 0.18.1
58
+ - Transformers 5.2.0
59
+ - Pytorch 2.11.0+cu130
60
+ - Datasets 4.0.0
61
+ - Tokenizers 0.22.2
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-1b-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "up_proj",
33
+ "v_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "gate_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2fccca914644fd456afb68abc1f7307cc09f76cd552cb49bc0a910ee900a73e
3
+ size 26139264
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 3.258875338498253e+16,
4
+ "train_loss": 0.45725877370630985,
5
+ "train_runtime": 3399.7742,
6
+ "train_samples_per_second": 4.191,
7
+ "train_steps_per_second": 0.524
8
+ }
chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-1400/README.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-1b-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-3-1b-it
7
+ - llama-factory
8
+ - lora
9
+ - transformers
10
+ ---
11
+
12
+ # Model Card for Model ID
13
+
14
+ <!-- Provide a quick summary of what the model is/does. -->
15
+
16
+
17
+
18
+ ## Model Details
19
+
20
+ ### Model Description
21
+
22
+ <!-- Provide a longer summary of what this model is. -->
23
+
24
+
25
+
26
+ - **Developed by:** [More Information Needed]
27
+ - **Funded by [optional]:** [More Information Needed]
28
+ - **Shared by [optional]:** [More Information Needed]
29
+ - **Model type:** [More Information Needed]
30
+ - **Language(s) (NLP):** [More Information Needed]
31
+ - **License:** [More Information Needed]
32
+ - **Finetuned from model [optional]:** [More Information Needed]
33
+
34
+ ### Model Sources [optional]
35
+
36
+ <!-- Provide the basic links for the model. -->
37
+
38
+ - **Repository:** [More Information Needed]
39
+ - **Paper [optional]:** [More Information Needed]
40
+ - **Demo [optional]:** [More Information Needed]
41
+
42
+ ## Uses
43
+
44
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
45
+
46
+ ### Direct Use
47
+
48
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Downstream Use [optional]
53
+
54
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
55
+
56
+ [More Information Needed]
57
+
58
+ ### Out-of-Scope Use
59
+
60
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ## Bias, Risks, and Limitations
65
+
66
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
67
+
68
+ [More Information Needed]
69
+
70
+ ### Recommendations
71
+
72
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
73
+
74
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
75
+
76
+ ## How to Get Started with the Model
77
+
78
+ Use the code below to get started with the model.
79
+
80
+ [More Information Needed]
81
+
82
+ ## Training Details
83
+
84
+ ### Training Data
85
+
86
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
87
+
88
+ [More Information Needed]
89
+
90
+ ### Training Procedure
91
+
92
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
93
+
94
+ #### Preprocessing [optional]
95
+
96
+ [More Information Needed]
97
+
98
+
99
+ #### Training Hyperparameters
100
+
101
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
102
+
103
+ #### Speeds, Sizes, Times [optional]
104
+
105
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
106
+
107
+ [More Information Needed]
108
+
109
+ ## Evaluation
110
+
111
+ <!-- This section describes the evaluation protocols and provides the results. -->
112
+
113
+ ### Testing Data, Factors & Metrics
114
+
115
+ #### Testing Data
116
+
117
+ <!-- This should link to a Dataset Card if possible. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Factors
122
+
123
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
124
+
125
+ [More Information Needed]
126
+
127
+ #### Metrics
128
+
129
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
130
+
131
+ [More Information Needed]
132
+
133
+ ### Results
134
+
135
+ [More Information Needed]
136
+
137
+ #### Summary
138
+
139
+
140
+
141
+ ## Model Examination [optional]
142
+
143
+ <!-- Relevant interpretability work for the model goes here -->
144
+
145
+ [More Information Needed]
146
+
147
+ ## Environmental Impact
148
+
149
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
150
+
151
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
152
+
153
+ - **Hardware Type:** [More Information Needed]
154
+ - **Hours used:** [More Information Needed]
155
+ - **Cloud Provider:** [More Information Needed]
156
+ - **Compute Region:** [More Information Needed]
157
+ - **Carbon Emitted:** [More Information Needed]
158
+
159
+ ## Technical Specifications [optional]
160
+
161
+ ### Model Architecture and Objective
162
+
163
+ [More Information Needed]
164
+
165
+ ### Compute Infrastructure
166
+
167
+ [More Information Needed]
168
+
169
+ #### Hardware
170
+
171
+ [More Information Needed]
172
+
173
+ #### Software
174
+
175
+ [More Information Needed]
176
+
177
+ ## Citation [optional]
178
+
179
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
180
+
181
+ **BibTeX:**
182
+
183
+ [More Information Needed]
184
+
185
+ **APA:**
186
+
187
+ [More Information Needed]
188
+
189
+ ## Glossary [optional]
190
+
191
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
192
+
193
+ [More Information Needed]
194
+
195
+ ## More Information [optional]
196
+
197
+ [More Information Needed]
198
+
199
+ ## Model Card Authors [optional]
200
+
201
+ [More Information Needed]
202
+
203
+ ## Model Card Contact
204
+
205
+ [More Information Needed]
206
+ ### Framework versions
207
+
208
+ - PEFT 0.18.1
checkpoint-1400/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-1b-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "up_proj",
33
+ "v_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "gate_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1400/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-1400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5563bee0e6340c81024f327b435c7f4b69444288abbfb077a5d9ac163cb64bd4
3
+ size 14645
checkpoint-1400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50427b778ba4958d014cebb411d4b1ae082125569b84cc46d15eb28aa365f6a6
3
+ size 1465
checkpoint-1400/tokenizer_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<end_of_turn>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "mask_token": "<mask>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "model_specific_special_tokens": {
13
+ "boi_token": "<start_of_image>",
14
+ "eoi_token": "<end_of_image>",
15
+ "image_token": "<image_soft_token>"
16
+ },
17
+ "pad_token": "<pad>",
18
+ "padding_side": "right",
19
+ "processor_class": "Gemma3Processor",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "split_special_tokens": false,
23
+ "tokenizer_class": "GemmaTokenizer",
24
+ "unk_token": "<unk>",
25
+ "use_default_system_prompt": false
26
+ }
checkpoint-1400/trainer_state.json ADDED
@@ -0,0 +1,1014 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.3570526315789473,
6
+ "eval_steps": 100,
7
+ "global_step": 1400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016842105263157894,
14
+ "grad_norm": 0.21757784485816956,
15
+ "learning_rate": 5.027932960893855e-08,
16
+ "loss": 0.7252199172973632,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.03368421052631579,
21
+ "grad_norm": 0.2456846386194229,
22
+ "learning_rate": 1.0614525139664805e-07,
23
+ "loss": 0.6507451057434082,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.05052631578947368,
28
+ "grad_norm": 0.20819272100925446,
29
+ "learning_rate": 1.6201117318435754e-07,
30
+ "loss": 0.7381344795227051,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.06736842105263158,
35
+ "grad_norm": 0.26373574137687683,
36
+ "learning_rate": 2.17877094972067e-07,
37
+ "loss": 0.7012194156646728,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.08421052631578947,
42
+ "grad_norm": 0.2081507444381714,
43
+ "learning_rate": 2.7374301675977653e-07,
44
+ "loss": 0.6083873748779297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.10105263157894737,
49
+ "grad_norm": 0.2091236114501953,
50
+ "learning_rate": 3.29608938547486e-07,
51
+ "loss": 0.6980491638183594,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.11789473684210526,
56
+ "grad_norm": 0.20970331132411957,
57
+ "learning_rate": 3.8547486033519547e-07,
58
+ "loss": 0.708641767501831,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.13473684210526315,
63
+ "grad_norm": 0.18810197710990906,
64
+ "learning_rate": 4.41340782122905e-07,
65
+ "loss": 0.6742453098297119,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.15157894736842106,
70
+ "grad_norm": 0.20251069962978363,
71
+ "learning_rate": 4.972067039106145e-07,
72
+ "loss": 0.6590609550476074,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.16842105263157894,
77
+ "grad_norm": 0.2644217908382416,
78
+ "learning_rate": 5.53072625698324e-07,
79
+ "loss": 0.704926872253418,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.18526315789473685,
84
+ "grad_norm": 0.23766489326953888,
85
+ "learning_rate": 6.089385474860335e-07,
86
+ "loss": 0.7445036888122558,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.20210526315789473,
91
+ "grad_norm": 0.27427056431770325,
92
+ "learning_rate": 6.64804469273743e-07,
93
+ "loss": 0.7476531028747558,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.21894736842105264,
98
+ "grad_norm": 0.3208928406238556,
99
+ "learning_rate": 7.206703910614524e-07,
100
+ "loss": 0.7291872501373291,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.23578947368421052,
105
+ "grad_norm": 0.3123615086078644,
106
+ "learning_rate": 7.76536312849162e-07,
107
+ "loss": 0.721175241470337,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.25263157894736843,
112
+ "grad_norm": 0.26158222556114197,
113
+ "learning_rate": 8.324022346368714e-07,
114
+ "loss": 0.7556095600128174,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.2694736842105263,
119
+ "grad_norm": 0.2592650353908539,
120
+ "learning_rate": 8.88268156424581e-07,
121
+ "loss": 0.7328392505645752,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.2863157894736842,
126
+ "grad_norm": 0.24533776938915253,
127
+ "learning_rate": 9.441340782122904e-07,
128
+ "loss": 0.6990129470825195,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.3031578947368421,
133
+ "grad_norm": 0.23409004509449005,
134
+ "learning_rate": 1e-06,
135
+ "loss": 0.6694639205932618,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.32,
140
+ "grad_norm": 0.3267499506473541,
141
+ "learning_rate": 9.999039806396227e-07,
142
+ "loss": 0.7123252868652343,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.3368421052631579,
147
+ "grad_norm": 0.2115064263343811,
148
+ "learning_rate": 9.996159594373611e-07,
149
+ "loss": 0.6858412742614746,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.35368421052631577,
154
+ "grad_norm": 0.26226580142974854,
155
+ "learning_rate": 9.991360470156615e-07,
156
+ "loss": 0.6541069507598877,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.3705263157894737,
161
+ "grad_norm": 0.24552594125270844,
162
+ "learning_rate": 9.984644276980594e-07,
163
+ "loss": 0.6506116390228271,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.3873684210526316,
168
+ "grad_norm": 0.25084301829338074,
169
+ "learning_rate": 9.976013594383835e-07,
170
+ "loss": 0.6540626049041748,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.40421052631578946,
175
+ "grad_norm": 0.34244054555892944,
176
+ "learning_rate": 9.965471737216833e-07,
177
+ "loss": 0.6737770557403564,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.42105263157894735,
182
+ "grad_norm": 0.34752583503723145,
183
+ "learning_rate": 9.953022754369114e-07,
184
+ "loss": 0.6755708217620849,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.4378947368421053,
189
+ "grad_norm": 0.31017956137657166,
190
+ "learning_rate": 9.938671427214158e-07,
191
+ "loss": 0.6578442573547363,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.45473684210526316,
196
+ "grad_norm": 0.21509627997875214,
197
+ "learning_rate": 9.922423267772986e-07,
198
+ "loss": 0.639409875869751,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.47157894736842104,
203
+ "grad_norm": 0.3022947609424591,
204
+ "learning_rate": 9.904284516597102e-07,
205
+ "loss": 0.5995691776275635,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.4884210526315789,
210
+ "grad_norm": 0.3367304801940918,
211
+ "learning_rate": 9.884262140371648e-07,
212
+ "loss": 0.5898309707641601,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.5052631578947369,
217
+ "grad_norm": 0.294842928647995,
218
+ "learning_rate": 9.862363829239662e-07,
219
+ "loss": 0.6371779441833496,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.5221052631578947,
224
+ "grad_norm": 0.25171560049057007,
225
+ "learning_rate": 9.838597993848456e-07,
226
+ "loss": 0.5795581817626954,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.5389473684210526,
231
+ "grad_norm": 0.2818540036678314,
232
+ "learning_rate": 9.81297376211928e-07,
233
+ "loss": 0.5668415546417236,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.5557894736842105,
238
+ "grad_norm": 0.32951900362968445,
239
+ "learning_rate": 9.785500975741498e-07,
240
+ "loss": 0.5933257102966308,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.5726315789473684,
245
+ "grad_norm": 0.2763514518737793,
246
+ "learning_rate": 9.756190186392615e-07,
247
+ "loss": 0.5574678897857666,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.5894736842105263,
252
+ "grad_norm": 0.3070182204246521,
253
+ "learning_rate": 9.725052651685612e-07,
254
+ "loss": 0.5532425880432129,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.6063157894736843,
259
+ "grad_norm": 0.2079988420009613,
260
+ "learning_rate": 9.692100330845153e-07,
261
+ "loss": 0.5613389492034913,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.6231578947368421,
266
+ "grad_norm": 0.282924622297287,
267
+ "learning_rate": 9.657345880114318e-07,
268
+ "loss": 0.5131485939025879,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.64,
273
+ "grad_norm": 0.20901450514793396,
274
+ "learning_rate": 9.620802647893623e-07,
275
+ "loss": 0.6279027462005615,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.6568421052631579,
280
+ "grad_norm": 0.2637634575366974,
281
+ "learning_rate": 9.58248466961421e-07,
282
+ "loss": 0.5403085231781006,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.6736842105263158,
287
+ "grad_norm": 0.29078468680381775,
288
+ "learning_rate": 9.542406662347137e-07,
289
+ "loss": 0.5678809642791748,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.6905263157894737,
294
+ "grad_norm": 0.2865101397037506,
295
+ "learning_rate": 9.500584019150895e-07,
296
+ "loss": 0.5479135036468505,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.7073684210526315,
301
+ "grad_norm": 0.22857311367988586,
302
+ "learning_rate": 9.45703280315928e-07,
303
+ "loss": 0.5604462623596191,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.7242105263157895,
308
+ "grad_norm": 0.23971959948539734,
309
+ "learning_rate": 9.411769741411903e-07,
310
+ "loss": 0.4704423427581787,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.7410526315789474,
315
+ "grad_norm": 0.29793378710746765,
316
+ "learning_rate": 9.364812218429721e-07,
317
+ "loss": 0.560968017578125,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.7578947368421053,
322
+ "grad_norm": 0.2236040234565735,
323
+ "learning_rate": 9.316178269538014e-07,
324
+ "loss": 0.5088452816009521,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.7747368421052632,
329
+ "grad_norm": 0.22047854959964752,
330
+ "learning_rate": 9.265886573939446e-07,
331
+ "loss": 0.5030550956726074,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.791578947368421,
336
+ "grad_norm": 0.2273361086845398,
337
+ "learning_rate": 9.213956447539792e-07,
338
+ "loss": 0.46353440284729003,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.8084210526315789,
343
+ "grad_norm": 0.2170158326625824,
344
+ "learning_rate": 9.160407835529136e-07,
345
+ "loss": 0.49871411323547366,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.8252631578947368,
350
+ "grad_norm": 0.19333498179912567,
351
+ "learning_rate": 9.105261304721375e-07,
352
+ "loss": 0.4416178226470947,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.8421052631578947,
357
+ "grad_norm": 0.18490085005760193,
358
+ "learning_rate": 9.048538035654969e-07,
359
+ "loss": 0.39783194065093996,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.8589473684210527,
364
+ "grad_norm": 0.22122648358345032,
365
+ "learning_rate": 8.990259814457977e-07,
366
+ "loss": 0.4318229198455811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.8757894736842106,
371
+ "grad_norm": 0.17448943853378296,
372
+ "learning_rate": 8.930449024480491e-07,
373
+ "loss": 0.42445807456970214,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.8926315789473684,
378
+ "grad_norm": 0.18165165185928345,
379
+ "learning_rate": 8.8691286376977e-07,
380
+ "loss": 0.46429901123046874,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.9094736842105263,
385
+ "grad_norm": 0.16785287857055664,
386
+ "learning_rate": 8.806322205886873e-07,
387
+ "loss": 0.3975703239440918,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.9263157894736842,
392
+ "grad_norm": 0.1613738089799881,
393
+ "learning_rate": 8.74205385158165e-07,
394
+ "loss": 0.4458911418914795,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.9431578947368421,
399
+ "grad_norm": 0.15376177430152893,
400
+ "learning_rate": 8.676348258807121e-07,
401
+ "loss": 0.45571184158325195,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.96,
406
+ "grad_norm": 0.14966322481632233,
407
+ "learning_rate": 8.609230663599254e-07,
408
+ "loss": 0.4039600372314453,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.9768421052631578,
413
+ "grad_norm": 0.16819055378437042,
414
+ "learning_rate": 8.540726844312294e-07,
415
+ "loss": 0.4382494926452637,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.9936842105263158,
420
+ "grad_norm": 0.16405776143074036,
421
+ "learning_rate": 8.470863111717889e-07,
422
+ "loss": 0.4306180477142334,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 1.0101052631578948,
427
+ "grad_norm": 0.18503950536251068,
428
+ "learning_rate": 8.399666298899706e-07,
429
+ "loss": 0.39806089401245115,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 1.0269473684210526,
434
+ "grad_norm": 0.14375492930412292,
435
+ "learning_rate": 8.327163750947457e-07,
436
+ "loss": 0.4271697044372559,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 1.0437894736842106,
441
+ "grad_norm": 0.1412728875875473,
442
+ "learning_rate": 8.253383314454263e-07,
443
+ "loss": 0.3939049243927002,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 1.0606315789473684,
448
+ "grad_norm": 0.20121850073337555,
449
+ "learning_rate": 8.178353326821404e-07,
450
+ "loss": 0.43197131156921387,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 1.0774736842105264,
455
+ "grad_norm": 0.17767728865146637,
456
+ "learning_rate": 8.102102605374566e-07,
457
+ "loss": 0.437807559967041,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 1.0943157894736841,
462
+ "grad_norm": 0.1498359888792038,
463
+ "learning_rate": 8.024660436295759e-07,
464
+ "loss": 0.38409013748168946,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 1.1111578947368421,
469
+ "grad_norm": 0.15958793461322784,
470
+ "learning_rate": 7.946056563375145e-07,
471
+ "loss": 0.4204962730407715,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 1.1280000000000001,
476
+ "grad_norm": 0.157291978597641,
477
+ "learning_rate": 7.866321176587128e-07,
478
+ "loss": 0.42113161087036133,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 1.1448421052631579,
483
+ "grad_norm": 0.14119838178157806,
484
+ "learning_rate": 7.785484900495065e-07,
485
+ "loss": 0.4151731491088867,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 1.1616842105263159,
490
+ "grad_norm": 0.1296525001525879,
491
+ "learning_rate": 7.703578782489058e-07,
492
+ "loss": 0.38312902450561526,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 1.1785263157894736,
497
+ "grad_norm": 0.13671696186065674,
498
+ "learning_rate": 7.620634280861351e-07,
499
+ "loss": 0.42612557411193847,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 1.1953684210526316,
504
+ "grad_norm": 0.15196114778518677,
505
+ "learning_rate": 7.536683252723923e-07,
506
+ "loss": 0.4306772708892822,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 1.2122105263157894,
511
+ "grad_norm": 0.1136903315782547,
512
+ "learning_rate": 7.451757941772868e-07,
513
+ "loss": 0.38483757972717286,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 1.2290526315789474,
518
+ "grad_norm": 0.12378744781017303,
519
+ "learning_rate": 7.365890965904337e-07,
520
+ "loss": 0.4030342102050781,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 1.2458947368421052,
525
+ "grad_norm": 0.1265542209148407,
526
+ "learning_rate": 7.279115304686733e-07,
527
+ "loss": 0.4091166973114014,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 1.2627368421052632,
532
+ "grad_norm": 0.11647409200668335,
533
+ "learning_rate": 7.191464286694e-07,
534
+ "loss": 0.41426806449890136,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 1.279578947368421,
539
+ "grad_norm": 0.11192695051431656,
540
+ "learning_rate": 7.102971576704875e-07,
541
+ "loss": 0.38181486129760744,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 1.296421052631579,
546
+ "grad_norm": 0.14947861433029175,
547
+ "learning_rate": 7.013671162773003e-07,
548
+ "loss": 0.39824953079223635,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 1.313263157894737,
553
+ "grad_norm": 0.11269424855709076,
554
+ "learning_rate": 6.923597343172891e-07,
555
+ "loss": 0.40348024368286134,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 1.3301052631578947,
560
+ "grad_norm": 0.3742346167564392,
561
+ "learning_rate": 6.83278471322672e-07,
562
+ "loss": 0.38022048473358155,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 1.3469473684210527,
567
+ "grad_norm": 0.1310902237892151,
568
+ "learning_rate": 6.741268152017057e-07,
569
+ "loss": 0.42791285514831545,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 1.3637894736842107,
574
+ "grad_norm": 0.1692703813314438,
575
+ "learning_rate": 6.649082808990585e-07,
576
+ "loss": 0.4263493061065674,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 1.3806315789473684,
581
+ "grad_norm": 0.1279117316007614,
582
+ "learning_rate": 6.556264090457998e-07,
583
+ "loss": 0.37379777431488037,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 1.3974736842105262,
588
+ "grad_norm": 0.12949039041996002,
589
+ "learning_rate": 6.462847645995237e-07,
590
+ "loss": 0.38636391162872313,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 1.4143157894736842,
595
+ "grad_norm": 0.10221126675605774,
596
+ "learning_rate": 6.368869354751284e-07,
597
+ "loss": 0.408221435546875,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 1.4311578947368422,
602
+ "grad_norm": 0.11505889147520065,
603
+ "learning_rate": 6.274365311667797e-07,
604
+ "loss": 0.3951406717300415,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 1.448,
609
+ "grad_norm": 0.11054962873458862,
610
+ "learning_rate": 6.179371813615859e-07,
611
+ "loss": 0.3732129096984863,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 1.464842105263158,
616
+ "grad_norm": 0.10150120407342911,
617
+ "learning_rate": 6.083925345455158e-07,
618
+ "loss": 0.38601529598236084,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 1.4816842105263157,
623
+ "grad_norm": 0.12239400297403336,
624
+ "learning_rate": 5.988062566020986e-07,
625
+ "loss": 0.3859985828399658,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 1.4985263157894737,
630
+ "grad_norm": 0.15801067650318146,
631
+ "learning_rate": 5.891820294044408e-07,
632
+ "loss": 0.3983951807022095,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 1.5153684210526315,
637
+ "grad_norm": 0.10104545950889587,
638
+ "learning_rate": 5.795235494011007e-07,
639
+ "loss": 0.41107850074768065,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 1.5322105263157895,
644
+ "grad_norm": 0.1378099024295807,
645
+ "learning_rate": 5.698345261963668e-07,
646
+ "loss": 0.3708331823348999,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 1.5490526315789475,
651
+ "grad_norm": 0.12936057150363922,
652
+ "learning_rate": 5.601186811254825e-07,
653
+ "loss": 0.387884521484375,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 1.5658947368421052,
658
+ "grad_norm": 0.12379129230976105,
659
+ "learning_rate": 5.503797458253646e-07,
660
+ "loss": 0.43808717727661134,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 1.582736842105263,
665
+ "grad_norm": 0.12017743289470673,
666
+ "learning_rate": 5.406214608013662e-07,
667
+ "loss": 0.41345391273498533,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 1.5995789473684212,
672
+ "grad_norm": 0.1095535159111023,
673
+ "learning_rate": 5.308475739906328e-07,
674
+ "loss": 0.40022664070129393,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 1.616421052631579,
679
+ "grad_norm": 0.13831396400928497,
680
+ "learning_rate": 5.210618393226045e-07,
681
+ "loss": 0.3909924983978271,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 1.6332631578947368,
686
+ "grad_norm": 0.10449163615703583,
687
+ "learning_rate": 5.112680152772156e-07,
688
+ "loss": 0.37143146991729736,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 1.6501052631578947,
693
+ "grad_norm": 0.11249610036611557,
694
+ "learning_rate": 5.01469863441348e-07,
695
+ "loss": 0.38103113174438474,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 1.6669473684210527,
700
+ "grad_norm": 0.13718819618225098,
701
+ "learning_rate": 4.916711470640907e-07,
702
+ "loss": 0.4071629524230957,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 1.6837894736842105,
707
+ "grad_norm": 0.10473571717739105,
708
+ "learning_rate": 4.818756296113595e-07,
709
+ "loss": 0.417419958114624,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 1.7006315789473683,
714
+ "grad_norm": 0.10846224427223206,
715
+ "learning_rate": 4.7208707332043623e-07,
716
+ "loss": 0.3998772859573364,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 1.7174736842105263,
721
+ "grad_norm": 0.10248563438653946,
722
+ "learning_rate": 4.6230923775497714e-07,
723
+ "loss": 0.38056583404541017,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 1.7343157894736843,
728
+ "grad_norm": 0.12221980094909668,
729
+ "learning_rate": 4.5254587836104964e-07,
730
+ "loss": 0.39371190071105955,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 1.751157894736842,
735
+ "grad_norm": 0.10641586035490036,
736
+ "learning_rate": 4.4280074502475017e-07,
737
+ "loss": 0.4280440330505371,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 1.768,
742
+ "grad_norm": 0.12907131016254425,
743
+ "learning_rate": 4.3307758063195796e-07,
744
+ "loss": 0.3791615962982178,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 1.784842105263158,
749
+ "grad_norm": 0.12383506447076797,
750
+ "learning_rate": 4.233801196307762e-07,
751
+ "loss": 0.347782301902771,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 1.8016842105263158,
756
+ "grad_norm": 0.12547679245471954,
757
+ "learning_rate": 4.1371208659721536e-07,
758
+ "loss": 0.38370628356933595,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 1.8185263157894735,
763
+ "grad_norm": 0.10580642521381378,
764
+ "learning_rate": 4.0407719480466736e-07,
765
+ "loss": 0.40404376983642576,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 1.8353684210526315,
770
+ "grad_norm": 0.1055402085185051,
771
+ "learning_rate": 3.944791447977213e-07,
772
+ "loss": 0.4167450428009033,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 1.8522105263157895,
777
+ "grad_norm": 0.11053823679685593,
778
+ "learning_rate": 3.849216229708671e-07,
779
+ "loss": 0.4046513080596924,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 1.8690526315789473,
784
+ "grad_norm": 0.10185246914625168,
785
+ "learning_rate": 3.7540830015263526e-07,
786
+ "loss": 0.39672977924346925,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 1.8858947368421053,
791
+ "grad_norm": 0.08342823386192322,
792
+ "learning_rate": 3.6594283019571416e-07,
793
+ "loss": 0.39356396198272703,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 1.9027368421052633,
798
+ "grad_norm": 0.11821646988391876,
799
+ "learning_rate": 3.565288485735874e-07,
800
+ "loss": 0.42082643508911133,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 1.919578947368421,
805
+ "grad_norm": 0.1106327474117279,
806
+ "learning_rate": 3.4716997098423085e-07,
807
+ "loss": 0.34105117321014405,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 1.9364210526315788,
812
+ "grad_norm": 0.11533800512552261,
813
+ "learning_rate": 3.378697919614045e-07,
814
+ "loss": 0.3924069404602051,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 1.9532631578947368,
819
+ "grad_norm": 0.1431114822626114,
820
+ "learning_rate": 3.286318834940729e-07,
821
+ "loss": 0.3922377586364746,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 1.9701052631578948,
826
+ "grad_norm": 0.16050194203853607,
827
+ "learning_rate": 3.1945979365448517e-07,
828
+ "loss": 0.3745201587677002,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 1.9869473684210526,
833
+ "grad_norm": 0.11921833455562592,
834
+ "learning_rate": 3.103570452354402e-07,
835
+ "loss": 0.40110602378845217,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 2.0033684210526315,
840
+ "grad_norm": 0.0832003727555275,
841
+ "learning_rate": 3.013271343972613e-07,
842
+ "loss": 0.3981154918670654,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 2.0202105263157897,
847
+ "grad_norm": 0.09975888580083847,
848
+ "learning_rate": 2.9237352932500046e-07,
849
+ "loss": 0.3726134061813354,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 2.0370526315789474,
854
+ "grad_norm": 0.14600081741809845,
855
+ "learning_rate": 2.8349966889638615e-07,
856
+ "loss": 0.42558698654174804,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 2.053894736842105,
861
+ "grad_norm": 0.10875770449638367,
862
+ "learning_rate": 2.747089613610278e-07,
863
+ "loss": 0.3682931184768677,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 2.070736842105263,
868
+ "grad_norm": 0.10050549358129501,
869
+ "learning_rate": 2.66004783031385e-07,
870
+ "loss": 0.3756644487380981,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 2.087578947368421,
875
+ "grad_norm": 0.08914914727210999,
876
+ "learning_rate": 2.573904769860009e-07,
877
+ "loss": 0.3804330825805664,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 2.104421052631579,
882
+ "grad_norm": 0.08296852558851242,
883
+ "learning_rate": 2.488693517855016e-07,
884
+ "loss": 0.3978404521942139,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 2.1212631578947367,
889
+ "grad_norm": 0.13885149359703064,
890
+ "learning_rate": 2.404446802018533e-07,
891
+ "loss": 0.3935218334197998,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 2.138105263157895,
896
+ "grad_norm": 0.13195137679576874,
897
+ "learning_rate": 2.3211969796136305e-07,
898
+ "loss": 0.42966952323913576,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 2.1549473684210527,
903
+ "grad_norm": 0.13367892801761627,
904
+ "learning_rate": 2.2389760250191038e-07,
905
+ "loss": 0.3679579019546509,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 2.1717894736842105,
910
+ "grad_norm": 0.1288345605134964,
911
+ "learning_rate": 2.1578155174488343e-07,
912
+ "loss": 0.41324810981750487,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 2.1886315789473683,
917
+ "grad_norm": 0.09626021236181259,
918
+ "learning_rate": 2.0777466288229205e-07,
919
+ "loss": 0.40120248794555663,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 2.2054736842105265,
924
+ "grad_norm": 0.10264381766319275,
925
+ "learning_rate": 1.9988001117952485e-07,
926
+ "loss": 0.3501007080078125,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 2.2223157894736842,
931
+ "grad_norm": 0.09031466394662857,
932
+ "learning_rate": 1.9210062879420973e-07,
933
+ "loss": 0.3839429378509521,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 2.239157894736842,
938
+ "grad_norm": 0.12686079740524292,
939
+ "learning_rate": 1.8443950361162957e-07,
940
+ "loss": 0.4338528156280518,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 2.2560000000000002,
945
+ "grad_norm": 0.12199016660451889,
946
+ "learning_rate": 1.7689957809714346e-07,
947
+ "loss": 0.39229888916015626,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 2.272842105263158,
952
+ "grad_norm": 0.12029567360877991,
953
+ "learning_rate": 1.694837481660525e-07,
954
+ "loss": 0.38006880283355715,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 2.2896842105263158,
959
+ "grad_norm": 0.08686309307813644,
960
+ "learning_rate": 1.6219486207134313e-07,
961
+ "loss": 0.3808159589767456,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 2.3065263157894735,
966
+ "grad_norm": 0.10810462385416031,
967
+ "learning_rate": 1.5503571930973785e-07,
968
+ "loss": 0.401824426651001,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 2.3233684210526317,
973
+ "grad_norm": 0.10281873494386673,
974
+ "learning_rate": 1.480090695464723e-07,
975
+ "loss": 0.40149493217468263,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 2.3402105263157895,
980
+ "grad_norm": 0.09503985196352005,
981
+ "learning_rate": 1.4111761155920975e-07,
982
+ "loss": 0.38567726612091063,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 2.3570526315789473,
987
+ "grad_norm": 0.10420782119035721,
988
+ "learning_rate": 1.3436399220150212e-07,
989
+ "loss": 0.3759742736816406,
990
+ "step": 1400
991
+ }
992
+ ],
993
+ "logging_steps": 10,
994
+ "max_steps": 1782,
995
+ "num_input_tokens_seen": 0,
996
+ "num_train_epochs": 3,
997
+ "save_steps": 100,
998
+ "stateful_callbacks": {
999
+ "TrainerControl": {
1000
+ "args": {
1001
+ "should_epoch_stop": false,
1002
+ "should_evaluate": false,
1003
+ "should_log": false,
1004
+ "should_save": true,
1005
+ "should_training_stop": false
1006
+ },
1007
+ "attributes": {}
1008
+ }
1009
+ },
1010
+ "total_flos": 2.559039062380339e+16,
1011
+ "train_batch_size": 1,
1012
+ "trial_name": null,
1013
+ "trial_params": null
1014
+ }
checkpoint-1400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e59b6aeffb8563cc09210642a0d080410d061efe9a32399cd1e4a1e0abccb0a
3
+ size 5585
checkpoint-1500/README.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-1b-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-3-1b-it
7
+ - llama-factory
8
+ - lora
9
+ - transformers
10
+ ---
11
+
12
+ # Model Card for Model ID
13
+
14
+ <!-- Provide a quick summary of what the model is/does. -->
15
+
16
+
17
+
18
+ ## Model Details
19
+
20
+ ### Model Description
21
+
22
+ <!-- Provide a longer summary of what this model is. -->
23
+
24
+
25
+
26
+ - **Developed by:** [More Information Needed]
27
+ - **Funded by [optional]:** [More Information Needed]
28
+ - **Shared by [optional]:** [More Information Needed]
29
+ - **Model type:** [More Information Needed]
30
+ - **Language(s) (NLP):** [More Information Needed]
31
+ - **License:** [More Information Needed]
32
+ - **Finetuned from model [optional]:** [More Information Needed]
33
+
34
+ ### Model Sources [optional]
35
+
36
+ <!-- Provide the basic links for the model. -->
37
+
38
+ - **Repository:** [More Information Needed]
39
+ - **Paper [optional]:** [More Information Needed]
40
+ - **Demo [optional]:** [More Information Needed]
41
+
42
+ ## Uses
43
+
44
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
45
+
46
+ ### Direct Use
47
+
48
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Downstream Use [optional]
53
+
54
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
55
+
56
+ [More Information Needed]
57
+
58
+ ### Out-of-Scope Use
59
+
60
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ## Bias, Risks, and Limitations
65
+
66
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
67
+
68
+ [More Information Needed]
69
+
70
+ ### Recommendations
71
+
72
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
73
+
74
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
75
+
76
+ ## How to Get Started with the Model
77
+
78
+ Use the code below to get started with the model.
79
+
80
+ [More Information Needed]
81
+
82
+ ## Training Details
83
+
84
+ ### Training Data
85
+
86
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
87
+
88
+ [More Information Needed]
89
+
90
+ ### Training Procedure
91
+
92
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
93
+
94
+ #### Preprocessing [optional]
95
+
96
+ [More Information Needed]
97
+
98
+
99
+ #### Training Hyperparameters
100
+
101
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
102
+
103
+ #### Speeds, Sizes, Times [optional]
104
+
105
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
106
+
107
+ [More Information Needed]
108
+
109
+ ## Evaluation
110
+
111
+ <!-- This section describes the evaluation protocols and provides the results. -->
112
+
113
+ ### Testing Data, Factors & Metrics
114
+
115
+ #### Testing Data
116
+
117
+ <!-- This should link to a Dataset Card if possible. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Factors
122
+
123
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
124
+
125
+ [More Information Needed]
126
+
127
+ #### Metrics
128
+
129
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
130
+
131
+ [More Information Needed]
132
+
133
+ ### Results
134
+
135
+ [More Information Needed]
136
+
137
+ #### Summary
138
+
139
+
140
+
141
+ ## Model Examination [optional]
142
+
143
+ <!-- Relevant interpretability work for the model goes here -->
144
+
145
+ [More Information Needed]
146
+
147
+ ## Environmental Impact
148
+
149
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
150
+
151
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
152
+
153
+ - **Hardware Type:** [More Information Needed]
154
+ - **Hours used:** [More Information Needed]
155
+ - **Cloud Provider:** [More Information Needed]
156
+ - **Compute Region:** [More Information Needed]
157
+ - **Carbon Emitted:** [More Information Needed]
158
+
159
+ ## Technical Specifications [optional]
160
+
161
+ ### Model Architecture and Objective
162
+
163
+ [More Information Needed]
164
+
165
+ ### Compute Infrastructure
166
+
167
+ [More Information Needed]
168
+
169
+ #### Hardware
170
+
171
+ [More Information Needed]
172
+
173
+ #### Software
174
+
175
+ [More Information Needed]
176
+
177
+ ## Citation [optional]
178
+
179
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
180
+
181
+ **BibTeX:**
182
+
183
+ [More Information Needed]
184
+
185
+ **APA:**
186
+
187
+ [More Information Needed]
188
+
189
+ ## Glossary [optional]
190
+
191
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
192
+
193
+ [More Information Needed]
194
+
195
+ ## More Information [optional]
196
+
197
+ [More Information Needed]
198
+
199
+ ## Model Card Authors [optional]
200
+
201
+ [More Information Needed]
202
+
203
+ ## Model Card Contact
204
+
205
+ [More Information Needed]
206
+ ### Framework versions
207
+
208
+ - PEFT 0.18.1
checkpoint-1500/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-1b-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "up_proj",
33
+ "v_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "gate_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1500/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd92b20e79099890a866712c75560516d2183b31fabb44bdeefd400ffa79d907
3
+ size 52494119
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22054b632e5063c85ccdc7cea2230764df35e2e45f2f169a54122de5c531ca3b
3
+ size 1465
checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<end_of_turn>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "mask_token": "<mask>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "model_specific_special_tokens": {
13
+ "boi_token": "<start_of_image>",
14
+ "eoi_token": "<end_of_image>",
15
+ "image_token": "<image_soft_token>"
16
+ },
17
+ "pad_token": "<pad>",
18
+ "padding_side": "right",
19
+ "processor_class": "Gemma3Processor",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "split_special_tokens": false,
23
+ "tokenizer_class": "GemmaTokenizer",
24
+ "unk_token": "<unk>",
25
+ "use_default_system_prompt": false
26
+ }
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.5254736842105263,
6
+ "eval_steps": 100,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016842105263157894,
14
+ "grad_norm": 0.21757784485816956,
15
+ "learning_rate": 5.027932960893855e-08,
16
+ "loss": 0.7252199172973632,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.03368421052631579,
21
+ "grad_norm": 0.2456846386194229,
22
+ "learning_rate": 1.0614525139664805e-07,
23
+ "loss": 0.6507451057434082,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.05052631578947368,
28
+ "grad_norm": 0.20819272100925446,
29
+ "learning_rate": 1.6201117318435754e-07,
30
+ "loss": 0.7381344795227051,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.06736842105263158,
35
+ "grad_norm": 0.26373574137687683,
36
+ "learning_rate": 2.17877094972067e-07,
37
+ "loss": 0.7012194156646728,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.08421052631578947,
42
+ "grad_norm": 0.2081507444381714,
43
+ "learning_rate": 2.7374301675977653e-07,
44
+ "loss": 0.6083873748779297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.10105263157894737,
49
+ "grad_norm": 0.2091236114501953,
50
+ "learning_rate": 3.29608938547486e-07,
51
+ "loss": 0.6980491638183594,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.11789473684210526,
56
+ "grad_norm": 0.20970331132411957,
57
+ "learning_rate": 3.8547486033519547e-07,
58
+ "loss": 0.708641767501831,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.13473684210526315,
63
+ "grad_norm": 0.18810197710990906,
64
+ "learning_rate": 4.41340782122905e-07,
65
+ "loss": 0.6742453098297119,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.15157894736842106,
70
+ "grad_norm": 0.20251069962978363,
71
+ "learning_rate": 4.972067039106145e-07,
72
+ "loss": 0.6590609550476074,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.16842105263157894,
77
+ "grad_norm": 0.2644217908382416,
78
+ "learning_rate": 5.53072625698324e-07,
79
+ "loss": 0.704926872253418,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.18526315789473685,
84
+ "grad_norm": 0.23766489326953888,
85
+ "learning_rate": 6.089385474860335e-07,
86
+ "loss": 0.7445036888122558,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.20210526315789473,
91
+ "grad_norm": 0.27427056431770325,
92
+ "learning_rate": 6.64804469273743e-07,
93
+ "loss": 0.7476531028747558,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.21894736842105264,
98
+ "grad_norm": 0.3208928406238556,
99
+ "learning_rate": 7.206703910614524e-07,
100
+ "loss": 0.7291872501373291,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.23578947368421052,
105
+ "grad_norm": 0.3123615086078644,
106
+ "learning_rate": 7.76536312849162e-07,
107
+ "loss": 0.721175241470337,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.25263157894736843,
112
+ "grad_norm": 0.26158222556114197,
113
+ "learning_rate": 8.324022346368714e-07,
114
+ "loss": 0.7556095600128174,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.2694736842105263,
119
+ "grad_norm": 0.2592650353908539,
120
+ "learning_rate": 8.88268156424581e-07,
121
+ "loss": 0.7328392505645752,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.2863157894736842,
126
+ "grad_norm": 0.24533776938915253,
127
+ "learning_rate": 9.441340782122904e-07,
128
+ "loss": 0.6990129470825195,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.3031578947368421,
133
+ "grad_norm": 0.23409004509449005,
134
+ "learning_rate": 1e-06,
135
+ "loss": 0.6694639205932618,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.32,
140
+ "grad_norm": 0.3267499506473541,
141
+ "learning_rate": 9.999039806396227e-07,
142
+ "loss": 0.7123252868652343,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.3368421052631579,
147
+ "grad_norm": 0.2115064263343811,
148
+ "learning_rate": 9.996159594373611e-07,
149
+ "loss": 0.6858412742614746,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.35368421052631577,
154
+ "grad_norm": 0.26226580142974854,
155
+ "learning_rate": 9.991360470156615e-07,
156
+ "loss": 0.6541069507598877,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.3705263157894737,
161
+ "grad_norm": 0.24552594125270844,
162
+ "learning_rate": 9.984644276980594e-07,
163
+ "loss": 0.6506116390228271,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.3873684210526316,
168
+ "grad_norm": 0.25084301829338074,
169
+ "learning_rate": 9.976013594383835e-07,
170
+ "loss": 0.6540626049041748,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.40421052631578946,
175
+ "grad_norm": 0.34244054555892944,
176
+ "learning_rate": 9.965471737216833e-07,
177
+ "loss": 0.6737770557403564,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.42105263157894735,
182
+ "grad_norm": 0.34752583503723145,
183
+ "learning_rate": 9.953022754369114e-07,
184
+ "loss": 0.6755708217620849,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.4378947368421053,
189
+ "grad_norm": 0.31017956137657166,
190
+ "learning_rate": 9.938671427214158e-07,
191
+ "loss": 0.6578442573547363,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.45473684210526316,
196
+ "grad_norm": 0.21509627997875214,
197
+ "learning_rate": 9.922423267772986e-07,
198
+ "loss": 0.639409875869751,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.47157894736842104,
203
+ "grad_norm": 0.3022947609424591,
204
+ "learning_rate": 9.904284516597102e-07,
205
+ "loss": 0.5995691776275635,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.4884210526315789,
210
+ "grad_norm": 0.3367304801940918,
211
+ "learning_rate": 9.884262140371648e-07,
212
+ "loss": 0.5898309707641601,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.5052631578947369,
217
+ "grad_norm": 0.294842928647995,
218
+ "learning_rate": 9.862363829239662e-07,
219
+ "loss": 0.6371779441833496,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.5221052631578947,
224
+ "grad_norm": 0.25171560049057007,
225
+ "learning_rate": 9.838597993848456e-07,
226
+ "loss": 0.5795581817626954,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.5389473684210526,
231
+ "grad_norm": 0.2818540036678314,
232
+ "learning_rate": 9.81297376211928e-07,
233
+ "loss": 0.5668415546417236,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.5557894736842105,
238
+ "grad_norm": 0.32951900362968445,
239
+ "learning_rate": 9.785500975741498e-07,
240
+ "loss": 0.5933257102966308,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.5726315789473684,
245
+ "grad_norm": 0.2763514518737793,
246
+ "learning_rate": 9.756190186392615e-07,
247
+ "loss": 0.5574678897857666,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.5894736842105263,
252
+ "grad_norm": 0.3070182204246521,
253
+ "learning_rate": 9.725052651685612e-07,
254
+ "loss": 0.5532425880432129,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.6063157894736843,
259
+ "grad_norm": 0.2079988420009613,
260
+ "learning_rate": 9.692100330845153e-07,
261
+ "loss": 0.5613389492034913,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.6231578947368421,
266
+ "grad_norm": 0.282924622297287,
267
+ "learning_rate": 9.657345880114318e-07,
268
+ "loss": 0.5131485939025879,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.64,
273
+ "grad_norm": 0.20901450514793396,
274
+ "learning_rate": 9.620802647893623e-07,
275
+ "loss": 0.6279027462005615,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.6568421052631579,
280
+ "grad_norm": 0.2637634575366974,
281
+ "learning_rate": 9.58248466961421e-07,
282
+ "loss": 0.5403085231781006,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.6736842105263158,
287
+ "grad_norm": 0.29078468680381775,
288
+ "learning_rate": 9.542406662347137e-07,
289
+ "loss": 0.5678809642791748,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.6905263157894737,
294
+ "grad_norm": 0.2865101397037506,
295
+ "learning_rate": 9.500584019150895e-07,
296
+ "loss": 0.5479135036468505,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.7073684210526315,
301
+ "grad_norm": 0.22857311367988586,
302
+ "learning_rate": 9.45703280315928e-07,
303
+ "loss": 0.5604462623596191,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.7242105263157895,
308
+ "grad_norm": 0.23971959948539734,
309
+ "learning_rate": 9.411769741411903e-07,
310
+ "loss": 0.4704423427581787,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.7410526315789474,
315
+ "grad_norm": 0.29793378710746765,
316
+ "learning_rate": 9.364812218429721e-07,
317
+ "loss": 0.560968017578125,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.7578947368421053,
322
+ "grad_norm": 0.2236040234565735,
323
+ "learning_rate": 9.316178269538014e-07,
324
+ "loss": 0.5088452816009521,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.7747368421052632,
329
+ "grad_norm": 0.22047854959964752,
330
+ "learning_rate": 9.265886573939446e-07,
331
+ "loss": 0.5030550956726074,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.791578947368421,
336
+ "grad_norm": 0.2273361086845398,
337
+ "learning_rate": 9.213956447539792e-07,
338
+ "loss": 0.46353440284729003,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.8084210526315789,
343
+ "grad_norm": 0.2170158326625824,
344
+ "learning_rate": 9.160407835529136e-07,
345
+ "loss": 0.49871411323547366,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.8252631578947368,
350
+ "grad_norm": 0.19333498179912567,
351
+ "learning_rate": 9.105261304721375e-07,
352
+ "loss": 0.4416178226470947,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.8421052631578947,
357
+ "grad_norm": 0.18490085005760193,
358
+ "learning_rate": 9.048538035654969e-07,
359
+ "loss": 0.39783194065093996,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.8589473684210527,
364
+ "grad_norm": 0.22122648358345032,
365
+ "learning_rate": 8.990259814457977e-07,
366
+ "loss": 0.4318229198455811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.8757894736842106,
371
+ "grad_norm": 0.17448943853378296,
372
+ "learning_rate": 8.930449024480491e-07,
373
+ "loss": 0.42445807456970214,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.8926315789473684,
378
+ "grad_norm": 0.18165165185928345,
379
+ "learning_rate": 8.8691286376977e-07,
380
+ "loss": 0.46429901123046874,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.9094736842105263,
385
+ "grad_norm": 0.16785287857055664,
386
+ "learning_rate": 8.806322205886873e-07,
387
+ "loss": 0.3975703239440918,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.9263157894736842,
392
+ "grad_norm": 0.1613738089799881,
393
+ "learning_rate": 8.74205385158165e-07,
394
+ "loss": 0.4458911418914795,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.9431578947368421,
399
+ "grad_norm": 0.15376177430152893,
400
+ "learning_rate": 8.676348258807121e-07,
401
+ "loss": 0.45571184158325195,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.96,
406
+ "grad_norm": 0.14966322481632233,
407
+ "learning_rate": 8.609230663599254e-07,
408
+ "loss": 0.4039600372314453,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.9768421052631578,
413
+ "grad_norm": 0.16819055378437042,
414
+ "learning_rate": 8.540726844312294e-07,
415
+ "loss": 0.4382494926452637,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.9936842105263158,
420
+ "grad_norm": 0.16405776143074036,
421
+ "learning_rate": 8.470863111717889e-07,
422
+ "loss": 0.4306180477142334,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 1.0101052631578948,
427
+ "grad_norm": 0.18503950536251068,
428
+ "learning_rate": 8.399666298899706e-07,
429
+ "loss": 0.39806089401245115,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 1.0269473684210526,
434
+ "grad_norm": 0.14375492930412292,
435
+ "learning_rate": 8.327163750947457e-07,
436
+ "loss": 0.4271697044372559,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 1.0437894736842106,
441
+ "grad_norm": 0.1412728875875473,
442
+ "learning_rate": 8.253383314454263e-07,
443
+ "loss": 0.3939049243927002,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 1.0606315789473684,
448
+ "grad_norm": 0.20121850073337555,
449
+ "learning_rate": 8.178353326821404e-07,
450
+ "loss": 0.43197131156921387,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 1.0774736842105264,
455
+ "grad_norm": 0.17767728865146637,
456
+ "learning_rate": 8.102102605374566e-07,
457
+ "loss": 0.437807559967041,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 1.0943157894736841,
462
+ "grad_norm": 0.1498359888792038,
463
+ "learning_rate": 8.024660436295759e-07,
464
+ "loss": 0.38409013748168946,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 1.1111578947368421,
469
+ "grad_norm": 0.15958793461322784,
470
+ "learning_rate": 7.946056563375145e-07,
471
+ "loss": 0.4204962730407715,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 1.1280000000000001,
476
+ "grad_norm": 0.157291978597641,
477
+ "learning_rate": 7.866321176587128e-07,
478
+ "loss": 0.42113161087036133,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 1.1448421052631579,
483
+ "grad_norm": 0.14119838178157806,
484
+ "learning_rate": 7.785484900495065e-07,
485
+ "loss": 0.4151731491088867,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 1.1616842105263159,
490
+ "grad_norm": 0.1296525001525879,
491
+ "learning_rate": 7.703578782489058e-07,
492
+ "loss": 0.38312902450561526,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 1.1785263157894736,
497
+ "grad_norm": 0.13671696186065674,
498
+ "learning_rate": 7.620634280861351e-07,
499
+ "loss": 0.42612557411193847,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 1.1953684210526316,
504
+ "grad_norm": 0.15196114778518677,
505
+ "learning_rate": 7.536683252723923e-07,
506
+ "loss": 0.4306772708892822,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 1.2122105263157894,
511
+ "grad_norm": 0.1136903315782547,
512
+ "learning_rate": 7.451757941772868e-07,
513
+ "loss": 0.38483757972717286,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 1.2290526315789474,
518
+ "grad_norm": 0.12378744781017303,
519
+ "learning_rate": 7.365890965904337e-07,
520
+ "loss": 0.4030342102050781,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 1.2458947368421052,
525
+ "grad_norm": 0.1265542209148407,
526
+ "learning_rate": 7.279115304686733e-07,
527
+ "loss": 0.4091166973114014,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 1.2627368421052632,
532
+ "grad_norm": 0.11647409200668335,
533
+ "learning_rate": 7.191464286694e-07,
534
+ "loss": 0.41426806449890136,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 1.279578947368421,
539
+ "grad_norm": 0.11192695051431656,
540
+ "learning_rate": 7.102971576704875e-07,
541
+ "loss": 0.38181486129760744,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 1.296421052631579,
546
+ "grad_norm": 0.14947861433029175,
547
+ "learning_rate": 7.013671162773003e-07,
548
+ "loss": 0.39824953079223635,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 1.313263157894737,
553
+ "grad_norm": 0.11269424855709076,
554
+ "learning_rate": 6.923597343172891e-07,
555
+ "loss": 0.40348024368286134,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 1.3301052631578947,
560
+ "grad_norm": 0.3742346167564392,
561
+ "learning_rate": 6.83278471322672e-07,
562
+ "loss": 0.38022048473358155,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 1.3469473684210527,
567
+ "grad_norm": 0.1310902237892151,
568
+ "learning_rate": 6.741268152017057e-07,
569
+ "loss": 0.42791285514831545,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 1.3637894736842107,
574
+ "grad_norm": 0.1692703813314438,
575
+ "learning_rate": 6.649082808990585e-07,
576
+ "loss": 0.4263493061065674,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 1.3806315789473684,
581
+ "grad_norm": 0.1279117316007614,
582
+ "learning_rate": 6.556264090457998e-07,
583
+ "loss": 0.37379777431488037,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 1.3974736842105262,
588
+ "grad_norm": 0.12949039041996002,
589
+ "learning_rate": 6.462847645995237e-07,
590
+ "loss": 0.38636391162872313,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 1.4143157894736842,
595
+ "grad_norm": 0.10221126675605774,
596
+ "learning_rate": 6.368869354751284e-07,
597
+ "loss": 0.408221435546875,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 1.4311578947368422,
602
+ "grad_norm": 0.11505889147520065,
603
+ "learning_rate": 6.274365311667797e-07,
604
+ "loss": 0.3951406717300415,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 1.448,
609
+ "grad_norm": 0.11054962873458862,
610
+ "learning_rate": 6.179371813615859e-07,
611
+ "loss": 0.3732129096984863,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 1.464842105263158,
616
+ "grad_norm": 0.10150120407342911,
617
+ "learning_rate": 6.083925345455158e-07,
618
+ "loss": 0.38601529598236084,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 1.4816842105263157,
623
+ "grad_norm": 0.12239400297403336,
624
+ "learning_rate": 5.988062566020986e-07,
625
+ "loss": 0.3859985828399658,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 1.4985263157894737,
630
+ "grad_norm": 0.15801067650318146,
631
+ "learning_rate": 5.891820294044408e-07,
632
+ "loss": 0.3983951807022095,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 1.5153684210526315,
637
+ "grad_norm": 0.10104545950889587,
638
+ "learning_rate": 5.795235494011007e-07,
639
+ "loss": 0.41107850074768065,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 1.5322105263157895,
644
+ "grad_norm": 0.1378099024295807,
645
+ "learning_rate": 5.698345261963668e-07,
646
+ "loss": 0.3708331823348999,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 1.5490526315789475,
651
+ "grad_norm": 0.12936057150363922,
652
+ "learning_rate": 5.601186811254825e-07,
653
+ "loss": 0.387884521484375,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 1.5658947368421052,
658
+ "grad_norm": 0.12379129230976105,
659
+ "learning_rate": 5.503797458253646e-07,
660
+ "loss": 0.43808717727661134,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 1.582736842105263,
665
+ "grad_norm": 0.12017743289470673,
666
+ "learning_rate": 5.406214608013662e-07,
667
+ "loss": 0.41345391273498533,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 1.5995789473684212,
672
+ "grad_norm": 0.1095535159111023,
673
+ "learning_rate": 5.308475739906328e-07,
674
+ "loss": 0.40022664070129393,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 1.616421052631579,
679
+ "grad_norm": 0.13831396400928497,
680
+ "learning_rate": 5.210618393226045e-07,
681
+ "loss": 0.3909924983978271,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 1.6332631578947368,
686
+ "grad_norm": 0.10449163615703583,
687
+ "learning_rate": 5.112680152772156e-07,
688
+ "loss": 0.37143146991729736,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 1.6501052631578947,
693
+ "grad_norm": 0.11249610036611557,
694
+ "learning_rate": 5.01469863441348e-07,
695
+ "loss": 0.38103113174438474,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 1.6669473684210527,
700
+ "grad_norm": 0.13718819618225098,
701
+ "learning_rate": 4.916711470640907e-07,
702
+ "loss": 0.4071629524230957,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 1.6837894736842105,
707
+ "grad_norm": 0.10473571717739105,
708
+ "learning_rate": 4.818756296113595e-07,
709
+ "loss": 0.417419958114624,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 1.7006315789473683,
714
+ "grad_norm": 0.10846224427223206,
715
+ "learning_rate": 4.7208707332043623e-07,
716
+ "loss": 0.3998772859573364,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 1.7174736842105263,
721
+ "grad_norm": 0.10248563438653946,
722
+ "learning_rate": 4.6230923775497714e-07,
723
+ "loss": 0.38056583404541017,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 1.7343157894736843,
728
+ "grad_norm": 0.12221980094909668,
729
+ "learning_rate": 4.5254587836104964e-07,
730
+ "loss": 0.39371190071105955,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 1.751157894736842,
735
+ "grad_norm": 0.10641586035490036,
736
+ "learning_rate": 4.4280074502475017e-07,
737
+ "loss": 0.4280440330505371,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 1.768,
742
+ "grad_norm": 0.12907131016254425,
743
+ "learning_rate": 4.3307758063195796e-07,
744
+ "loss": 0.3791615962982178,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 1.784842105263158,
749
+ "grad_norm": 0.12383506447076797,
750
+ "learning_rate": 4.233801196307762e-07,
751
+ "loss": 0.347782301902771,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 1.8016842105263158,
756
+ "grad_norm": 0.12547679245471954,
757
+ "learning_rate": 4.1371208659721536e-07,
758
+ "loss": 0.38370628356933595,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 1.8185263157894735,
763
+ "grad_norm": 0.10580642521381378,
764
+ "learning_rate": 4.0407719480466736e-07,
765
+ "loss": 0.40404376983642576,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 1.8353684210526315,
770
+ "grad_norm": 0.1055402085185051,
771
+ "learning_rate": 3.944791447977213e-07,
772
+ "loss": 0.4167450428009033,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 1.8522105263157895,
777
+ "grad_norm": 0.11053823679685593,
778
+ "learning_rate": 3.849216229708671e-07,
779
+ "loss": 0.4046513080596924,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 1.8690526315789473,
784
+ "grad_norm": 0.10185246914625168,
785
+ "learning_rate": 3.7540830015263526e-07,
786
+ "loss": 0.39672977924346925,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 1.8858947368421053,
791
+ "grad_norm": 0.08342823386192322,
792
+ "learning_rate": 3.6594283019571416e-07,
793
+ "loss": 0.39356396198272703,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 1.9027368421052633,
798
+ "grad_norm": 0.11821646988391876,
799
+ "learning_rate": 3.565288485735874e-07,
800
+ "loss": 0.42082643508911133,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 1.919578947368421,
805
+ "grad_norm": 0.1106327474117279,
806
+ "learning_rate": 3.4716997098423085e-07,
807
+ "loss": 0.34105117321014405,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 1.9364210526315788,
812
+ "grad_norm": 0.11533800512552261,
813
+ "learning_rate": 3.378697919614045e-07,
814
+ "loss": 0.3924069404602051,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 1.9532631578947368,
819
+ "grad_norm": 0.1431114822626114,
820
+ "learning_rate": 3.286318834940729e-07,
821
+ "loss": 0.3922377586364746,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 1.9701052631578948,
826
+ "grad_norm": 0.16050194203853607,
827
+ "learning_rate": 3.1945979365448517e-07,
828
+ "loss": 0.3745201587677002,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 1.9869473684210526,
833
+ "grad_norm": 0.11921833455562592,
834
+ "learning_rate": 3.103570452354402e-07,
835
+ "loss": 0.40110602378845217,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 2.0033684210526315,
840
+ "grad_norm": 0.0832003727555275,
841
+ "learning_rate": 3.013271343972613e-07,
842
+ "loss": 0.3981154918670654,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 2.0202105263157897,
847
+ "grad_norm": 0.09975888580083847,
848
+ "learning_rate": 2.9237352932500046e-07,
849
+ "loss": 0.3726134061813354,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 2.0370526315789474,
854
+ "grad_norm": 0.14600081741809845,
855
+ "learning_rate": 2.8349966889638615e-07,
856
+ "loss": 0.42558698654174804,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 2.053894736842105,
861
+ "grad_norm": 0.10875770449638367,
862
+ "learning_rate": 2.747089613610278e-07,
863
+ "loss": 0.3682931184768677,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 2.070736842105263,
868
+ "grad_norm": 0.10050549358129501,
869
+ "learning_rate": 2.66004783031385e-07,
870
+ "loss": 0.3756644487380981,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 2.087578947368421,
875
+ "grad_norm": 0.08914914727210999,
876
+ "learning_rate": 2.573904769860009e-07,
877
+ "loss": 0.3804330825805664,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 2.104421052631579,
882
+ "grad_norm": 0.08296852558851242,
883
+ "learning_rate": 2.488693517855016e-07,
884
+ "loss": 0.3978404521942139,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 2.1212631578947367,
889
+ "grad_norm": 0.13885149359703064,
890
+ "learning_rate": 2.404446802018533e-07,
891
+ "loss": 0.3935218334197998,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 2.138105263157895,
896
+ "grad_norm": 0.13195137679576874,
897
+ "learning_rate": 2.3211969796136305e-07,
898
+ "loss": 0.42966952323913576,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 2.1549473684210527,
903
+ "grad_norm": 0.13367892801761627,
904
+ "learning_rate": 2.2389760250191038e-07,
905
+ "loss": 0.3679579019546509,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 2.1717894736842105,
910
+ "grad_norm": 0.1288345605134964,
911
+ "learning_rate": 2.1578155174488343e-07,
912
+ "loss": 0.41324810981750487,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 2.1886315789473683,
917
+ "grad_norm": 0.09626021236181259,
918
+ "learning_rate": 2.0777466288229205e-07,
919
+ "loss": 0.40120248794555663,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 2.2054736842105265,
924
+ "grad_norm": 0.10264381766319275,
925
+ "learning_rate": 1.9988001117952485e-07,
926
+ "loss": 0.3501007080078125,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 2.2223157894736842,
931
+ "grad_norm": 0.09031466394662857,
932
+ "learning_rate": 1.9210062879420973e-07,
933
+ "loss": 0.3839429378509521,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 2.239157894736842,
938
+ "grad_norm": 0.12686079740524292,
939
+ "learning_rate": 1.8443950361162957e-07,
940
+ "loss": 0.4338528156280518,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 2.2560000000000002,
945
+ "grad_norm": 0.12199016660451889,
946
+ "learning_rate": 1.7689957809714346e-07,
947
+ "loss": 0.39229888916015626,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 2.272842105263158,
952
+ "grad_norm": 0.12029567360877991,
953
+ "learning_rate": 1.694837481660525e-07,
954
+ "loss": 0.38006880283355715,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 2.2896842105263158,
959
+ "grad_norm": 0.08686309307813644,
960
+ "learning_rate": 1.6219486207134313e-07,
961
+ "loss": 0.3808159589767456,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 2.3065263157894735,
966
+ "grad_norm": 0.10810462385416031,
967
+ "learning_rate": 1.5503571930973785e-07,
968
+ "loss": 0.401824426651001,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 2.3233684210526317,
973
+ "grad_norm": 0.10281873494386673,
974
+ "learning_rate": 1.480090695464723e-07,
975
+ "loss": 0.40149493217468263,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 2.3402105263157895,
980
+ "grad_norm": 0.09503985196352005,
981
+ "learning_rate": 1.4111761155920975e-07,
982
+ "loss": 0.38567726612091063,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 2.3570526315789473,
987
+ "grad_norm": 0.10420782119035721,
988
+ "learning_rate": 1.3436399220150212e-07,
989
+ "loss": 0.3759742736816406,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 2.3738947368421055,
994
+ "grad_norm": 0.10681115835905075,
995
+ "learning_rate": 1.2775080538619347e-07,
996
+ "loss": 0.3913698196411133,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 2.3907368421052633,
1001
+ "grad_norm": 0.10323983430862427,
1002
+ "learning_rate": 1.2128059108915595e-07,
1003
+ "loss": 0.39077584743499755,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 2.407578947368421,
1008
+ "grad_norm": 0.09566064178943634,
1009
+ "learning_rate": 1.1495583437374263e-07,
1010
+ "loss": 0.39895172119140626,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 2.424421052631579,
1015
+ "grad_norm": 0.13018426299095154,
1016
+ "learning_rate": 1.0877896443633117e-07,
1017
+ "loss": 0.38982129096984863,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 2.441263157894737,
1022
+ "grad_norm": 0.10760781168937683,
1023
+ "learning_rate": 1.0275235367332347e-07,
1024
+ "loss": 0.3756714344024658,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 2.458105263157895,
1029
+ "grad_norm": 0.11606904864311218,
1030
+ "learning_rate": 9.687831676996238e-08,
1031
+ "loss": 0.37858171463012696,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 2.4749473684210526,
1036
+ "grad_norm": 0.12957172095775604,
1037
+ "learning_rate": 9.115910981131336e-08,
1038
+ "loss": 0.40050196647644043,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 2.4917894736842103,
1043
+ "grad_norm": 0.11186131089925766,
1044
+ "learning_rate": 8.559692941575231e-08,
1045
+ "loss": 0.3684133291244507,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 2.5086315789473685,
1050
+ "grad_norm": 0.13279542326927185,
1051
+ "learning_rate": 8.019391189129466e-08,
1052
+ "loss": 0.3452518224716187,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 2.5254736842105263,
1057
+ "grad_norm": 0.09041756391525269,
1058
+ "learning_rate": 7.495213241508786e-08,
1059
+ "loss": 0.36301617622375487,
1060
+ "step": 1500
1061
+ }
1062
+ ],
1063
+ "logging_steps": 10,
1064
+ "max_steps": 1782,
1065
+ "num_input_tokens_seen": 0,
1066
+ "num_train_epochs": 3,
1067
+ "save_steps": 100,
1068
+ "stateful_callbacks": {
1069
+ "TrainerControl": {
1070
+ "args": {
1071
+ "should_epoch_stop": false,
1072
+ "should_evaluate": false,
1073
+ "should_log": false,
1074
+ "should_save": true,
1075
+ "should_training_stop": false
1076
+ },
1077
+ "attributes": {}
1078
+ }
1079
+ },
1080
+ "total_flos": 2.742094440984576e+16,
1081
+ "train_batch_size": 1,
1082
+ "trial_name": null,
1083
+ "trial_params": null
1084
+ }
checkpoint-1600/README.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-1b-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-3-1b-it
7
+ - llama-factory
8
+ - lora
9
+ - transformers
10
+ ---
11
+
12
+ # Model Card for Model ID
13
+
14
+ <!-- Provide a quick summary of what the model is/does. -->
15
+
16
+
17
+
18
+ ## Model Details
19
+
20
+ ### Model Description
21
+
22
+ <!-- Provide a longer summary of what this model is. -->
23
+
24
+
25
+
26
+ - **Developed by:** [More Information Needed]
27
+ - **Funded by [optional]:** [More Information Needed]
28
+ - **Shared by [optional]:** [More Information Needed]
29
+ - **Model type:** [More Information Needed]
30
+ - **Language(s) (NLP):** [More Information Needed]
31
+ - **License:** [More Information Needed]
32
+ - **Finetuned from model [optional]:** [More Information Needed]
33
+
34
+ ### Model Sources [optional]
35
+
36
+ <!-- Provide the basic links for the model. -->
37
+
38
+ - **Repository:** [More Information Needed]
39
+ - **Paper [optional]:** [More Information Needed]
40
+ - **Demo [optional]:** [More Information Needed]
41
+
42
+ ## Uses
43
+
44
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
45
+
46
+ ### Direct Use
47
+
48
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Downstream Use [optional]
53
+
54
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
55
+
56
+ [More Information Needed]
57
+
58
+ ### Out-of-Scope Use
59
+
60
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ## Bias, Risks, and Limitations
65
+
66
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
67
+
68
+ [More Information Needed]
69
+
70
+ ### Recommendations
71
+
72
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
73
+
74
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
75
+
76
+ ## How to Get Started with the Model
77
+
78
+ Use the code below to get started with the model.
79
+
80
+ [More Information Needed]
81
+
82
+ ## Training Details
83
+
84
+ ### Training Data
85
+
86
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
87
+
88
+ [More Information Needed]
89
+
90
+ ### Training Procedure
91
+
92
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
93
+
94
+ #### Preprocessing [optional]
95
+
96
+ [More Information Needed]
97
+
98
+
99
+ #### Training Hyperparameters
100
+
101
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
102
+
103
+ #### Speeds, Sizes, Times [optional]
104
+
105
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
106
+
107
+ [More Information Needed]
108
+
109
+ ## Evaluation
110
+
111
+ <!-- This section describes the evaluation protocols and provides the results. -->
112
+
113
+ ### Testing Data, Factors & Metrics
114
+
115
+ #### Testing Data
116
+
117
+ <!-- This should link to a Dataset Card if possible. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Factors
122
+
123
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
124
+
125
+ [More Information Needed]
126
+
127
+ #### Metrics
128
+
129
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
130
+
131
+ [More Information Needed]
132
+
133
+ ### Results
134
+
135
+ [More Information Needed]
136
+
137
+ #### Summary
138
+
139
+
140
+
141
+ ## Model Examination [optional]
142
+
143
+ <!-- Relevant interpretability work for the model goes here -->
144
+
145
+ [More Information Needed]
146
+
147
+ ## Environmental Impact
148
+
149
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
150
+
151
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
152
+
153
+ - **Hardware Type:** [More Information Needed]
154
+ - **Hours used:** [More Information Needed]
155
+ - **Cloud Provider:** [More Information Needed]
156
+ - **Compute Region:** [More Information Needed]
157
+ - **Carbon Emitted:** [More Information Needed]
158
+
159
+ ## Technical Specifications [optional]
160
+
161
+ ### Model Architecture and Objective
162
+
163
+ [More Information Needed]
164
+
165
+ ### Compute Infrastructure
166
+
167
+ [More Information Needed]
168
+
169
+ #### Hardware
170
+
171
+ [More Information Needed]
172
+
173
+ #### Software
174
+
175
+ [More Information Needed]
176
+
177
+ ## Citation [optional]
178
+
179
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
180
+
181
+ **BibTeX:**
182
+
183
+ [More Information Needed]
184
+
185
+ **APA:**
186
+
187
+ [More Information Needed]
188
+
189
+ ## Glossary [optional]
190
+
191
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
192
+
193
+ [More Information Needed]
194
+
195
+ ## More Information [optional]
196
+
197
+ [More Information Needed]
198
+
199
+ ## Model Card Authors [optional]
200
+
201
+ [More Information Needed]
202
+
203
+ ## Model Card Contact
204
+
205
+ [More Information Needed]
206
+ ### Framework versions
207
+
208
+ - PEFT 0.18.1
checkpoint-1600/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-1b-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "up_proj",
33
+ "v_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "gate_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1600/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-1600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00e4e0c0ee17f488ebea76d4be58523084a7d82aa17ad91dc55431395752764d
3
+ size 52494119
checkpoint-1600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:926b6d8daef5503279afe9a92b1410c52f82ece90d2b637e440d4677105ea8ef
3
+ size 14645
checkpoint-1600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09e8b402b77d901f345c2fba01c7ef96ce60173c9b1b1245be32661c8e0cee5a
3
+ size 1465
checkpoint-1600/tokenizer_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<end_of_turn>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "mask_token": "<mask>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "model_specific_special_tokens": {
13
+ "boi_token": "<start_of_image>",
14
+ "eoi_token": "<end_of_image>",
15
+ "image_token": "<image_soft_token>"
16
+ },
17
+ "pad_token": "<pad>",
18
+ "padding_side": "right",
19
+ "processor_class": "Gemma3Processor",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "split_special_tokens": false,
23
+ "tokenizer_class": "GemmaTokenizer",
24
+ "unk_token": "<unk>",
25
+ "use_default_system_prompt": false
26
+ }
checkpoint-1600/trainer_state.json ADDED
@@ -0,0 +1,1154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.6938947368421053,
6
+ "eval_steps": 100,
7
+ "global_step": 1600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016842105263157894,
14
+ "grad_norm": 0.21757784485816956,
15
+ "learning_rate": 5.027932960893855e-08,
16
+ "loss": 0.7252199172973632,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.03368421052631579,
21
+ "grad_norm": 0.2456846386194229,
22
+ "learning_rate": 1.0614525139664805e-07,
23
+ "loss": 0.6507451057434082,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.05052631578947368,
28
+ "grad_norm": 0.20819272100925446,
29
+ "learning_rate": 1.6201117318435754e-07,
30
+ "loss": 0.7381344795227051,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.06736842105263158,
35
+ "grad_norm": 0.26373574137687683,
36
+ "learning_rate": 2.17877094972067e-07,
37
+ "loss": 0.7012194156646728,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.08421052631578947,
42
+ "grad_norm": 0.2081507444381714,
43
+ "learning_rate": 2.7374301675977653e-07,
44
+ "loss": 0.6083873748779297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.10105263157894737,
49
+ "grad_norm": 0.2091236114501953,
50
+ "learning_rate": 3.29608938547486e-07,
51
+ "loss": 0.6980491638183594,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.11789473684210526,
56
+ "grad_norm": 0.20970331132411957,
57
+ "learning_rate": 3.8547486033519547e-07,
58
+ "loss": 0.708641767501831,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.13473684210526315,
63
+ "grad_norm": 0.18810197710990906,
64
+ "learning_rate": 4.41340782122905e-07,
65
+ "loss": 0.6742453098297119,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.15157894736842106,
70
+ "grad_norm": 0.20251069962978363,
71
+ "learning_rate": 4.972067039106145e-07,
72
+ "loss": 0.6590609550476074,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.16842105263157894,
77
+ "grad_norm": 0.2644217908382416,
78
+ "learning_rate": 5.53072625698324e-07,
79
+ "loss": 0.704926872253418,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.18526315789473685,
84
+ "grad_norm": 0.23766489326953888,
85
+ "learning_rate": 6.089385474860335e-07,
86
+ "loss": 0.7445036888122558,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.20210526315789473,
91
+ "grad_norm": 0.27427056431770325,
92
+ "learning_rate": 6.64804469273743e-07,
93
+ "loss": 0.7476531028747558,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.21894736842105264,
98
+ "grad_norm": 0.3208928406238556,
99
+ "learning_rate": 7.206703910614524e-07,
100
+ "loss": 0.7291872501373291,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.23578947368421052,
105
+ "grad_norm": 0.3123615086078644,
106
+ "learning_rate": 7.76536312849162e-07,
107
+ "loss": 0.721175241470337,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.25263157894736843,
112
+ "grad_norm": 0.26158222556114197,
113
+ "learning_rate": 8.324022346368714e-07,
114
+ "loss": 0.7556095600128174,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.2694736842105263,
119
+ "grad_norm": 0.2592650353908539,
120
+ "learning_rate": 8.88268156424581e-07,
121
+ "loss": 0.7328392505645752,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.2863157894736842,
126
+ "grad_norm": 0.24533776938915253,
127
+ "learning_rate": 9.441340782122904e-07,
128
+ "loss": 0.6990129470825195,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.3031578947368421,
133
+ "grad_norm": 0.23409004509449005,
134
+ "learning_rate": 1e-06,
135
+ "loss": 0.6694639205932618,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.32,
140
+ "grad_norm": 0.3267499506473541,
141
+ "learning_rate": 9.999039806396227e-07,
142
+ "loss": 0.7123252868652343,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.3368421052631579,
147
+ "grad_norm": 0.2115064263343811,
148
+ "learning_rate": 9.996159594373611e-07,
149
+ "loss": 0.6858412742614746,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.35368421052631577,
154
+ "grad_norm": 0.26226580142974854,
155
+ "learning_rate": 9.991360470156615e-07,
156
+ "loss": 0.6541069507598877,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.3705263157894737,
161
+ "grad_norm": 0.24552594125270844,
162
+ "learning_rate": 9.984644276980594e-07,
163
+ "loss": 0.6506116390228271,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.3873684210526316,
168
+ "grad_norm": 0.25084301829338074,
169
+ "learning_rate": 9.976013594383835e-07,
170
+ "loss": 0.6540626049041748,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.40421052631578946,
175
+ "grad_norm": 0.34244054555892944,
176
+ "learning_rate": 9.965471737216833e-07,
177
+ "loss": 0.6737770557403564,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.42105263157894735,
182
+ "grad_norm": 0.34752583503723145,
183
+ "learning_rate": 9.953022754369114e-07,
184
+ "loss": 0.6755708217620849,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.4378947368421053,
189
+ "grad_norm": 0.31017956137657166,
190
+ "learning_rate": 9.938671427214158e-07,
191
+ "loss": 0.6578442573547363,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.45473684210526316,
196
+ "grad_norm": 0.21509627997875214,
197
+ "learning_rate": 9.922423267772986e-07,
198
+ "loss": 0.639409875869751,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.47157894736842104,
203
+ "grad_norm": 0.3022947609424591,
204
+ "learning_rate": 9.904284516597102e-07,
205
+ "loss": 0.5995691776275635,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.4884210526315789,
210
+ "grad_norm": 0.3367304801940918,
211
+ "learning_rate": 9.884262140371648e-07,
212
+ "loss": 0.5898309707641601,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.5052631578947369,
217
+ "grad_norm": 0.294842928647995,
218
+ "learning_rate": 9.862363829239662e-07,
219
+ "loss": 0.6371779441833496,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.5221052631578947,
224
+ "grad_norm": 0.25171560049057007,
225
+ "learning_rate": 9.838597993848456e-07,
226
+ "loss": 0.5795581817626954,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.5389473684210526,
231
+ "grad_norm": 0.2818540036678314,
232
+ "learning_rate": 9.81297376211928e-07,
233
+ "loss": 0.5668415546417236,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.5557894736842105,
238
+ "grad_norm": 0.32951900362968445,
239
+ "learning_rate": 9.785500975741498e-07,
240
+ "loss": 0.5933257102966308,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.5726315789473684,
245
+ "grad_norm": 0.2763514518737793,
246
+ "learning_rate": 9.756190186392615e-07,
247
+ "loss": 0.5574678897857666,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.5894736842105263,
252
+ "grad_norm": 0.3070182204246521,
253
+ "learning_rate": 9.725052651685612e-07,
254
+ "loss": 0.5532425880432129,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.6063157894736843,
259
+ "grad_norm": 0.2079988420009613,
260
+ "learning_rate": 9.692100330845153e-07,
261
+ "loss": 0.5613389492034913,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.6231578947368421,
266
+ "grad_norm": 0.282924622297287,
267
+ "learning_rate": 9.657345880114318e-07,
268
+ "loss": 0.5131485939025879,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.64,
273
+ "grad_norm": 0.20901450514793396,
274
+ "learning_rate": 9.620802647893623e-07,
275
+ "loss": 0.6279027462005615,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.6568421052631579,
280
+ "grad_norm": 0.2637634575366974,
281
+ "learning_rate": 9.58248466961421e-07,
282
+ "loss": 0.5403085231781006,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.6736842105263158,
287
+ "grad_norm": 0.29078468680381775,
288
+ "learning_rate": 9.542406662347137e-07,
289
+ "loss": 0.5678809642791748,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.6905263157894737,
294
+ "grad_norm": 0.2865101397037506,
295
+ "learning_rate": 9.500584019150895e-07,
296
+ "loss": 0.5479135036468505,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.7073684210526315,
301
+ "grad_norm": 0.22857311367988586,
302
+ "learning_rate": 9.45703280315928e-07,
303
+ "loss": 0.5604462623596191,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.7242105263157895,
308
+ "grad_norm": 0.23971959948539734,
309
+ "learning_rate": 9.411769741411903e-07,
310
+ "loss": 0.4704423427581787,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.7410526315789474,
315
+ "grad_norm": 0.29793378710746765,
316
+ "learning_rate": 9.364812218429721e-07,
317
+ "loss": 0.560968017578125,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.7578947368421053,
322
+ "grad_norm": 0.2236040234565735,
323
+ "learning_rate": 9.316178269538014e-07,
324
+ "loss": 0.5088452816009521,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.7747368421052632,
329
+ "grad_norm": 0.22047854959964752,
330
+ "learning_rate": 9.265886573939446e-07,
331
+ "loss": 0.5030550956726074,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.791578947368421,
336
+ "grad_norm": 0.2273361086845398,
337
+ "learning_rate": 9.213956447539792e-07,
338
+ "loss": 0.46353440284729003,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.8084210526315789,
343
+ "grad_norm": 0.2170158326625824,
344
+ "learning_rate": 9.160407835529136e-07,
345
+ "loss": 0.49871411323547366,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.8252631578947368,
350
+ "grad_norm": 0.19333498179912567,
351
+ "learning_rate": 9.105261304721375e-07,
352
+ "loss": 0.4416178226470947,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.8421052631578947,
357
+ "grad_norm": 0.18490085005760193,
358
+ "learning_rate": 9.048538035654969e-07,
359
+ "loss": 0.39783194065093996,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.8589473684210527,
364
+ "grad_norm": 0.22122648358345032,
365
+ "learning_rate": 8.990259814457977e-07,
366
+ "loss": 0.4318229198455811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.8757894736842106,
371
+ "grad_norm": 0.17448943853378296,
372
+ "learning_rate": 8.930449024480491e-07,
373
+ "loss": 0.42445807456970214,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.8926315789473684,
378
+ "grad_norm": 0.18165165185928345,
379
+ "learning_rate": 8.8691286376977e-07,
380
+ "loss": 0.46429901123046874,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.9094736842105263,
385
+ "grad_norm": 0.16785287857055664,
386
+ "learning_rate": 8.806322205886873e-07,
387
+ "loss": 0.3975703239440918,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.9263157894736842,
392
+ "grad_norm": 0.1613738089799881,
393
+ "learning_rate": 8.74205385158165e-07,
394
+ "loss": 0.4458911418914795,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.9431578947368421,
399
+ "grad_norm": 0.15376177430152893,
400
+ "learning_rate": 8.676348258807121e-07,
401
+ "loss": 0.45571184158325195,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.96,
406
+ "grad_norm": 0.14966322481632233,
407
+ "learning_rate": 8.609230663599254e-07,
408
+ "loss": 0.4039600372314453,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.9768421052631578,
413
+ "grad_norm": 0.16819055378437042,
414
+ "learning_rate": 8.540726844312294e-07,
415
+ "loss": 0.4382494926452637,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.9936842105263158,
420
+ "grad_norm": 0.16405776143074036,
421
+ "learning_rate": 8.470863111717889e-07,
422
+ "loss": 0.4306180477142334,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 1.0101052631578948,
427
+ "grad_norm": 0.18503950536251068,
428
+ "learning_rate": 8.399666298899706e-07,
429
+ "loss": 0.39806089401245115,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 1.0269473684210526,
434
+ "grad_norm": 0.14375492930412292,
435
+ "learning_rate": 8.327163750947457e-07,
436
+ "loss": 0.4271697044372559,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 1.0437894736842106,
441
+ "grad_norm": 0.1412728875875473,
442
+ "learning_rate": 8.253383314454263e-07,
443
+ "loss": 0.3939049243927002,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 1.0606315789473684,
448
+ "grad_norm": 0.20121850073337555,
449
+ "learning_rate": 8.178353326821404e-07,
450
+ "loss": 0.43197131156921387,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 1.0774736842105264,
455
+ "grad_norm": 0.17767728865146637,
456
+ "learning_rate": 8.102102605374566e-07,
457
+ "loss": 0.437807559967041,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 1.0943157894736841,
462
+ "grad_norm": 0.1498359888792038,
463
+ "learning_rate": 8.024660436295759e-07,
464
+ "loss": 0.38409013748168946,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 1.1111578947368421,
469
+ "grad_norm": 0.15958793461322784,
470
+ "learning_rate": 7.946056563375145e-07,
471
+ "loss": 0.4204962730407715,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 1.1280000000000001,
476
+ "grad_norm": 0.157291978597641,
477
+ "learning_rate": 7.866321176587128e-07,
478
+ "loss": 0.42113161087036133,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 1.1448421052631579,
483
+ "grad_norm": 0.14119838178157806,
484
+ "learning_rate": 7.785484900495065e-07,
485
+ "loss": 0.4151731491088867,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 1.1616842105263159,
490
+ "grad_norm": 0.1296525001525879,
491
+ "learning_rate": 7.703578782489058e-07,
492
+ "loss": 0.38312902450561526,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 1.1785263157894736,
497
+ "grad_norm": 0.13671696186065674,
498
+ "learning_rate": 7.620634280861351e-07,
499
+ "loss": 0.42612557411193847,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 1.1953684210526316,
504
+ "grad_norm": 0.15196114778518677,
505
+ "learning_rate": 7.536683252723923e-07,
506
+ "loss": 0.4306772708892822,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 1.2122105263157894,
511
+ "grad_norm": 0.1136903315782547,
512
+ "learning_rate": 7.451757941772868e-07,
513
+ "loss": 0.38483757972717286,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 1.2290526315789474,
518
+ "grad_norm": 0.12378744781017303,
519
+ "learning_rate": 7.365890965904337e-07,
520
+ "loss": 0.4030342102050781,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 1.2458947368421052,
525
+ "grad_norm": 0.1265542209148407,
526
+ "learning_rate": 7.279115304686733e-07,
527
+ "loss": 0.4091166973114014,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 1.2627368421052632,
532
+ "grad_norm": 0.11647409200668335,
533
+ "learning_rate": 7.191464286694e-07,
534
+ "loss": 0.41426806449890136,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 1.279578947368421,
539
+ "grad_norm": 0.11192695051431656,
540
+ "learning_rate": 7.102971576704875e-07,
541
+ "loss": 0.38181486129760744,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 1.296421052631579,
546
+ "grad_norm": 0.14947861433029175,
547
+ "learning_rate": 7.013671162773003e-07,
548
+ "loss": 0.39824953079223635,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 1.313263157894737,
553
+ "grad_norm": 0.11269424855709076,
554
+ "learning_rate": 6.923597343172891e-07,
555
+ "loss": 0.40348024368286134,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 1.3301052631578947,
560
+ "grad_norm": 0.3742346167564392,
561
+ "learning_rate": 6.83278471322672e-07,
562
+ "loss": 0.38022048473358155,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 1.3469473684210527,
567
+ "grad_norm": 0.1310902237892151,
568
+ "learning_rate": 6.741268152017057e-07,
569
+ "loss": 0.42791285514831545,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 1.3637894736842107,
574
+ "grad_norm": 0.1692703813314438,
575
+ "learning_rate": 6.649082808990585e-07,
576
+ "loss": 0.4263493061065674,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 1.3806315789473684,
581
+ "grad_norm": 0.1279117316007614,
582
+ "learning_rate": 6.556264090457998e-07,
583
+ "loss": 0.37379777431488037,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 1.3974736842105262,
588
+ "grad_norm": 0.12949039041996002,
589
+ "learning_rate": 6.462847645995237e-07,
590
+ "loss": 0.38636391162872313,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 1.4143157894736842,
595
+ "grad_norm": 0.10221126675605774,
596
+ "learning_rate": 6.368869354751284e-07,
597
+ "loss": 0.408221435546875,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 1.4311578947368422,
602
+ "grad_norm": 0.11505889147520065,
603
+ "learning_rate": 6.274365311667797e-07,
604
+ "loss": 0.3951406717300415,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 1.448,
609
+ "grad_norm": 0.11054962873458862,
610
+ "learning_rate": 6.179371813615859e-07,
611
+ "loss": 0.3732129096984863,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 1.464842105263158,
616
+ "grad_norm": 0.10150120407342911,
617
+ "learning_rate": 6.083925345455158e-07,
618
+ "loss": 0.38601529598236084,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 1.4816842105263157,
623
+ "grad_norm": 0.12239400297403336,
624
+ "learning_rate": 5.988062566020986e-07,
625
+ "loss": 0.3859985828399658,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 1.4985263157894737,
630
+ "grad_norm": 0.15801067650318146,
631
+ "learning_rate": 5.891820294044408e-07,
632
+ "loss": 0.3983951807022095,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 1.5153684210526315,
637
+ "grad_norm": 0.10104545950889587,
638
+ "learning_rate": 5.795235494011007e-07,
639
+ "loss": 0.41107850074768065,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 1.5322105263157895,
644
+ "grad_norm": 0.1378099024295807,
645
+ "learning_rate": 5.698345261963668e-07,
646
+ "loss": 0.3708331823348999,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 1.5490526315789475,
651
+ "grad_norm": 0.12936057150363922,
652
+ "learning_rate": 5.601186811254825e-07,
653
+ "loss": 0.387884521484375,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 1.5658947368421052,
658
+ "grad_norm": 0.12379129230976105,
659
+ "learning_rate": 5.503797458253646e-07,
660
+ "loss": 0.43808717727661134,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 1.582736842105263,
665
+ "grad_norm": 0.12017743289470673,
666
+ "learning_rate": 5.406214608013662e-07,
667
+ "loss": 0.41345391273498533,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 1.5995789473684212,
672
+ "grad_norm": 0.1095535159111023,
673
+ "learning_rate": 5.308475739906328e-07,
674
+ "loss": 0.40022664070129393,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 1.616421052631579,
679
+ "grad_norm": 0.13831396400928497,
680
+ "learning_rate": 5.210618393226045e-07,
681
+ "loss": 0.3909924983978271,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 1.6332631578947368,
686
+ "grad_norm": 0.10449163615703583,
687
+ "learning_rate": 5.112680152772156e-07,
688
+ "loss": 0.37143146991729736,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 1.6501052631578947,
693
+ "grad_norm": 0.11249610036611557,
694
+ "learning_rate": 5.01469863441348e-07,
695
+ "loss": 0.38103113174438474,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 1.6669473684210527,
700
+ "grad_norm": 0.13718819618225098,
701
+ "learning_rate": 4.916711470640907e-07,
702
+ "loss": 0.4071629524230957,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 1.6837894736842105,
707
+ "grad_norm": 0.10473571717739105,
708
+ "learning_rate": 4.818756296113595e-07,
709
+ "loss": 0.417419958114624,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 1.7006315789473683,
714
+ "grad_norm": 0.10846224427223206,
715
+ "learning_rate": 4.7208707332043623e-07,
716
+ "loss": 0.3998772859573364,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 1.7174736842105263,
721
+ "grad_norm": 0.10248563438653946,
722
+ "learning_rate": 4.6230923775497714e-07,
723
+ "loss": 0.38056583404541017,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 1.7343157894736843,
728
+ "grad_norm": 0.12221980094909668,
729
+ "learning_rate": 4.5254587836104964e-07,
730
+ "loss": 0.39371190071105955,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 1.751157894736842,
735
+ "grad_norm": 0.10641586035490036,
736
+ "learning_rate": 4.4280074502475017e-07,
737
+ "loss": 0.4280440330505371,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 1.768,
742
+ "grad_norm": 0.12907131016254425,
743
+ "learning_rate": 4.3307758063195796e-07,
744
+ "loss": 0.3791615962982178,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 1.784842105263158,
749
+ "grad_norm": 0.12383506447076797,
750
+ "learning_rate": 4.233801196307762e-07,
751
+ "loss": 0.347782301902771,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 1.8016842105263158,
756
+ "grad_norm": 0.12547679245471954,
757
+ "learning_rate": 4.1371208659721536e-07,
758
+ "loss": 0.38370628356933595,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 1.8185263157894735,
763
+ "grad_norm": 0.10580642521381378,
764
+ "learning_rate": 4.0407719480466736e-07,
765
+ "loss": 0.40404376983642576,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 1.8353684210526315,
770
+ "grad_norm": 0.1055402085185051,
771
+ "learning_rate": 3.944791447977213e-07,
772
+ "loss": 0.4167450428009033,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 1.8522105263157895,
777
+ "grad_norm": 0.11053823679685593,
778
+ "learning_rate": 3.849216229708671e-07,
779
+ "loss": 0.4046513080596924,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 1.8690526315789473,
784
+ "grad_norm": 0.10185246914625168,
785
+ "learning_rate": 3.7540830015263526e-07,
786
+ "loss": 0.39672977924346925,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 1.8858947368421053,
791
+ "grad_norm": 0.08342823386192322,
792
+ "learning_rate": 3.6594283019571416e-07,
793
+ "loss": 0.39356396198272703,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 1.9027368421052633,
798
+ "grad_norm": 0.11821646988391876,
799
+ "learning_rate": 3.565288485735874e-07,
800
+ "loss": 0.42082643508911133,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 1.919578947368421,
805
+ "grad_norm": 0.1106327474117279,
806
+ "learning_rate": 3.4716997098423085e-07,
807
+ "loss": 0.34105117321014405,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 1.9364210526315788,
812
+ "grad_norm": 0.11533800512552261,
813
+ "learning_rate": 3.378697919614045e-07,
814
+ "loss": 0.3924069404602051,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 1.9532631578947368,
819
+ "grad_norm": 0.1431114822626114,
820
+ "learning_rate": 3.286318834940729e-07,
821
+ "loss": 0.3922377586364746,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 1.9701052631578948,
826
+ "grad_norm": 0.16050194203853607,
827
+ "learning_rate": 3.1945979365448517e-07,
828
+ "loss": 0.3745201587677002,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 1.9869473684210526,
833
+ "grad_norm": 0.11921833455562592,
834
+ "learning_rate": 3.103570452354402e-07,
835
+ "loss": 0.40110602378845217,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 2.0033684210526315,
840
+ "grad_norm": 0.0832003727555275,
841
+ "learning_rate": 3.013271343972613e-07,
842
+ "loss": 0.3981154918670654,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 2.0202105263157897,
847
+ "grad_norm": 0.09975888580083847,
848
+ "learning_rate": 2.9237352932500046e-07,
849
+ "loss": 0.3726134061813354,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 2.0370526315789474,
854
+ "grad_norm": 0.14600081741809845,
855
+ "learning_rate": 2.8349966889638615e-07,
856
+ "loss": 0.42558698654174804,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 2.053894736842105,
861
+ "grad_norm": 0.10875770449638367,
862
+ "learning_rate": 2.747089613610278e-07,
863
+ "loss": 0.3682931184768677,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 2.070736842105263,
868
+ "grad_norm": 0.10050549358129501,
869
+ "learning_rate": 2.66004783031385e-07,
870
+ "loss": 0.3756644487380981,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 2.087578947368421,
875
+ "grad_norm": 0.08914914727210999,
876
+ "learning_rate": 2.573904769860009e-07,
877
+ "loss": 0.3804330825805664,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 2.104421052631579,
882
+ "grad_norm": 0.08296852558851242,
883
+ "learning_rate": 2.488693517855016e-07,
884
+ "loss": 0.3978404521942139,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 2.1212631578947367,
889
+ "grad_norm": 0.13885149359703064,
890
+ "learning_rate": 2.404446802018533e-07,
891
+ "loss": 0.3935218334197998,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 2.138105263157895,
896
+ "grad_norm": 0.13195137679576874,
897
+ "learning_rate": 2.3211969796136305e-07,
898
+ "loss": 0.42966952323913576,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 2.1549473684210527,
903
+ "grad_norm": 0.13367892801761627,
904
+ "learning_rate": 2.2389760250191038e-07,
905
+ "loss": 0.3679579019546509,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 2.1717894736842105,
910
+ "grad_norm": 0.1288345605134964,
911
+ "learning_rate": 2.1578155174488343e-07,
912
+ "loss": 0.41324810981750487,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 2.1886315789473683,
917
+ "grad_norm": 0.09626021236181259,
918
+ "learning_rate": 2.0777466288229205e-07,
919
+ "loss": 0.40120248794555663,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 2.2054736842105265,
924
+ "grad_norm": 0.10264381766319275,
925
+ "learning_rate": 1.9988001117952485e-07,
926
+ "loss": 0.3501007080078125,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 2.2223157894736842,
931
+ "grad_norm": 0.09031466394662857,
932
+ "learning_rate": 1.9210062879420973e-07,
933
+ "loss": 0.3839429378509521,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 2.239157894736842,
938
+ "grad_norm": 0.12686079740524292,
939
+ "learning_rate": 1.8443950361162957e-07,
940
+ "loss": 0.4338528156280518,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 2.2560000000000002,
945
+ "grad_norm": 0.12199016660451889,
946
+ "learning_rate": 1.7689957809714346e-07,
947
+ "loss": 0.39229888916015626,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 2.272842105263158,
952
+ "grad_norm": 0.12029567360877991,
953
+ "learning_rate": 1.694837481660525e-07,
954
+ "loss": 0.38006880283355715,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 2.2896842105263158,
959
+ "grad_norm": 0.08686309307813644,
960
+ "learning_rate": 1.6219486207134313e-07,
961
+ "loss": 0.3808159589767456,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 2.3065263157894735,
966
+ "grad_norm": 0.10810462385416031,
967
+ "learning_rate": 1.5503571930973785e-07,
968
+ "loss": 0.401824426651001,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 2.3233684210526317,
973
+ "grad_norm": 0.10281873494386673,
974
+ "learning_rate": 1.480090695464723e-07,
975
+ "loss": 0.40149493217468263,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 2.3402105263157895,
980
+ "grad_norm": 0.09503985196352005,
981
+ "learning_rate": 1.4111761155920975e-07,
982
+ "loss": 0.38567726612091063,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 2.3570526315789473,
987
+ "grad_norm": 0.10420782119035721,
988
+ "learning_rate": 1.3436399220150212e-07,
989
+ "loss": 0.3759742736816406,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 2.3738947368421055,
994
+ "grad_norm": 0.10681115835905075,
995
+ "learning_rate": 1.2775080538619347e-07,
996
+ "loss": 0.3913698196411133,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 2.3907368421052633,
1001
+ "grad_norm": 0.10323983430862427,
1002
+ "learning_rate": 1.2128059108915595e-07,
1003
+ "loss": 0.39077584743499755,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 2.407578947368421,
1008
+ "grad_norm": 0.09566064178943634,
1009
+ "learning_rate": 1.1495583437374263e-07,
1010
+ "loss": 0.39895172119140626,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 2.424421052631579,
1015
+ "grad_norm": 0.13018426299095154,
1016
+ "learning_rate": 1.0877896443633117e-07,
1017
+ "loss": 0.38982129096984863,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 2.441263157894737,
1022
+ "grad_norm": 0.10760781168937683,
1023
+ "learning_rate": 1.0275235367332347e-07,
1024
+ "loss": 0.3756714344024658,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 2.458105263157895,
1029
+ "grad_norm": 0.11606904864311218,
1030
+ "learning_rate": 9.687831676996238e-08,
1031
+ "loss": 0.37858171463012696,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 2.4749473684210526,
1036
+ "grad_norm": 0.12957172095775604,
1037
+ "learning_rate": 9.115910981131336e-08,
1038
+ "loss": 0.40050196647644043,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 2.4917894736842103,
1043
+ "grad_norm": 0.11186131089925766,
1044
+ "learning_rate": 8.559692941575231e-08,
1045
+ "loss": 0.3684133291244507,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 2.5086315789473685,
1050
+ "grad_norm": 0.13279542326927185,
1051
+ "learning_rate": 8.019391189129466e-08,
1052
+ "loss": 0.3452518224716187,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 2.5254736842105263,
1057
+ "grad_norm": 0.09041756391525269,
1058
+ "learning_rate": 7.495213241508786e-08,
1059
+ "loss": 0.36301617622375487,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 2.542315789473684,
1064
+ "grad_norm": 0.10033190995454788,
1065
+ "learning_rate": 6.987360423638205e-08,
1066
+ "loss": 0.3706004858016968,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 2.559157894736842,
1071
+ "grad_norm": 0.10681814700365067,
1072
+ "learning_rate": 6.49602779032865e-08,
1073
+ "loss": 0.36011199951171874,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 2.576,
1078
+ "grad_norm": 0.1008416935801506,
1079
+ "learning_rate": 6.02140405136089e-08,
1080
+ "loss": 0.37473766803741454,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 2.592842105263158,
1085
+ "grad_norm": 0.11559010297060013,
1086
+ "learning_rate": 5.5636714990062393e-08,
1087
+ "loss": 0.39232525825500486,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 2.609684210526316,
1092
+ "grad_norm": 0.10601615905761719,
1093
+ "learning_rate": 5.1230059380123034e-08,
1094
+ "loss": 0.34370343685150145,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 2.626526315789474,
1099
+ "grad_norm": 0.11516924202442169,
1100
+ "learning_rate": 4.699576618080331e-08,
1101
+ "loss": 0.39509878158569334,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 2.6433684210526316,
1106
+ "grad_norm": 0.11444627493619919,
1107
+ "learning_rate": 4.293546168860163e-08,
1108
+ "loss": 0.3881126165390015,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 2.6602105263157894,
1113
+ "grad_norm": 0.09985339641571045,
1114
+ "learning_rate": 3.9050705374879086e-08,
1115
+ "loss": 0.34624040126800537,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 2.677052631578947,
1120
+ "grad_norm": 0.10439962148666382,
1121
+ "learning_rate": 3.534298928690166e-08,
1122
+ "loss": 0.35141232013702395,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 2.6938947368421053,
1127
+ "grad_norm": 0.12180087715387344,
1128
+ "learning_rate": 3.181373747477822e-08,
1129
+ "loss": 0.39980330467224123,
1130
+ "step": 1600
1131
+ }
1132
+ ],
1133
+ "logging_steps": 10,
1134
+ "max_steps": 1782,
1135
+ "num_input_tokens_seen": 0,
1136
+ "num_train_epochs": 3,
1137
+ "save_steps": 100,
1138
+ "stateful_callbacks": {
1139
+ "TrainerControl": {
1140
+ "args": {
1141
+ "should_epoch_stop": false,
1142
+ "should_evaluate": false,
1143
+ "should_log": false,
1144
+ "should_save": true,
1145
+ "should_training_stop": false
1146
+ },
1147
+ "attributes": {}
1148
+ }
1149
+ },
1150
+ "total_flos": 2.925778724822016e+16,
1151
+ "train_batch_size": 1,
1152
+ "trial_name": null,
1153
+ "trial_params": null
1154
+ }
checkpoint-1600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e59b6aeffb8563cc09210642a0d080410d061efe9a32399cd1e4a1e0abccb0a
3
+ size 5585
checkpoint-1700/README.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-1b-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-3-1b-it
7
+ - llama-factory
8
+ - lora
9
+ - transformers
10
+ ---
11
+
12
+ # Model Card for Model ID
13
+
14
+ <!-- Provide a quick summary of what the model is/does. -->
15
+
16
+
17
+
18
+ ## Model Details
19
+
20
+ ### Model Description
21
+
22
+ <!-- Provide a longer summary of what this model is. -->
23
+
24
+
25
+
26
+ - **Developed by:** [More Information Needed]
27
+ - **Funded by [optional]:** [More Information Needed]
28
+ - **Shared by [optional]:** [More Information Needed]
29
+ - **Model type:** [More Information Needed]
30
+ - **Language(s) (NLP):** [More Information Needed]
31
+ - **License:** [More Information Needed]
32
+ - **Finetuned from model [optional]:** [More Information Needed]
33
+
34
+ ### Model Sources [optional]
35
+
36
+ <!-- Provide the basic links for the model. -->
37
+
38
+ - **Repository:** [More Information Needed]
39
+ - **Paper [optional]:** [More Information Needed]
40
+ - **Demo [optional]:** [More Information Needed]
41
+
42
+ ## Uses
43
+
44
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
45
+
46
+ ### Direct Use
47
+
48
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Downstream Use [optional]
53
+
54
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
55
+
56
+ [More Information Needed]
57
+
58
+ ### Out-of-Scope Use
59
+
60
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ## Bias, Risks, and Limitations
65
+
66
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
67
+
68
+ [More Information Needed]
69
+
70
+ ### Recommendations
71
+
72
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
73
+
74
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
75
+
76
+ ## How to Get Started with the Model
77
+
78
+ Use the code below to get started with the model.
79
+
80
+ [More Information Needed]
81
+
82
+ ## Training Details
83
+
84
+ ### Training Data
85
+
86
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
87
+
88
+ [More Information Needed]
89
+
90
+ ### Training Procedure
91
+
92
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
93
+
94
+ #### Preprocessing [optional]
95
+
96
+ [More Information Needed]
97
+
98
+
99
+ #### Training Hyperparameters
100
+
101
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
102
+
103
+ #### Speeds, Sizes, Times [optional]
104
+
105
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
106
+
107
+ [More Information Needed]
108
+
109
+ ## Evaluation
110
+
111
+ <!-- This section describes the evaluation protocols and provides the results. -->
112
+
113
+ ### Testing Data, Factors & Metrics
114
+
115
+ #### Testing Data
116
+
117
+ <!-- This should link to a Dataset Card if possible. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Factors
122
+
123
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
124
+
125
+ [More Information Needed]
126
+
127
+ #### Metrics
128
+
129
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
130
+
131
+ [More Information Needed]
132
+
133
+ ### Results
134
+
135
+ [More Information Needed]
136
+
137
+ #### Summary
138
+
139
+
140
+
141
+ ## Model Examination [optional]
142
+
143
+ <!-- Relevant interpretability work for the model goes here -->
144
+
145
+ [More Information Needed]
146
+
147
+ ## Environmental Impact
148
+
149
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
150
+
151
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
152
+
153
+ - **Hardware Type:** [More Information Needed]
154
+ - **Hours used:** [More Information Needed]
155
+ - **Cloud Provider:** [More Information Needed]
156
+ - **Compute Region:** [More Information Needed]
157
+ - **Carbon Emitted:** [More Information Needed]
158
+
159
+ ## Technical Specifications [optional]
160
+
161
+ ### Model Architecture and Objective
162
+
163
+ [More Information Needed]
164
+
165
+ ### Compute Infrastructure
166
+
167
+ [More Information Needed]
168
+
169
+ #### Hardware
170
+
171
+ [More Information Needed]
172
+
173
+ #### Software
174
+
175
+ [More Information Needed]
176
+
177
+ ## Citation [optional]
178
+
179
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
180
+
181
+ **BibTeX:**
182
+
183
+ [More Information Needed]
184
+
185
+ **APA:**
186
+
187
+ [More Information Needed]
188
+
189
+ ## Glossary [optional]
190
+
191
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
192
+
193
+ [More Information Needed]
194
+
195
+ ## More Information [optional]
196
+
197
+ [More Information Needed]
198
+
199
+ ## Model Card Authors [optional]
200
+
201
+ [More Information Needed]
202
+
203
+ ## Model Card Contact
204
+
205
+ [More Information Needed]
206
+ ### Framework versions
207
+
208
+ - PEFT 0.18.1
checkpoint-1700/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-1b-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "up_proj",
33
+ "v_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "gate_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1700/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-1700/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c0bcdfe6b227477259d00ef78e6870fb90e1f42b41a69b2cc4d605d31c60db6
3
+ size 52494119
checkpoint-1700/tokenizer_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<end_of_turn>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "mask_token": "<mask>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "model_specific_special_tokens": {
13
+ "boi_token": "<start_of_image>",
14
+ "eoi_token": "<end_of_image>",
15
+ "image_token": "<image_soft_token>"
16
+ },
17
+ "pad_token": "<pad>",
18
+ "padding_side": "right",
19
+ "processor_class": "Gemma3Processor",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "split_special_tokens": false,
23
+ "tokenizer_class": "GemmaTokenizer",
24
+ "unk_token": "<unk>",
25
+ "use_default_system_prompt": false
26
+ }
checkpoint-1700/trainer_state.json ADDED
@@ -0,0 +1,1224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.8623157894736844,
6
+ "eval_steps": 100,
7
+ "global_step": 1700,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016842105263157894,
14
+ "grad_norm": 0.21757784485816956,
15
+ "learning_rate": 5.027932960893855e-08,
16
+ "loss": 0.7252199172973632,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.03368421052631579,
21
+ "grad_norm": 0.2456846386194229,
22
+ "learning_rate": 1.0614525139664805e-07,
23
+ "loss": 0.6507451057434082,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.05052631578947368,
28
+ "grad_norm": 0.20819272100925446,
29
+ "learning_rate": 1.6201117318435754e-07,
30
+ "loss": 0.7381344795227051,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.06736842105263158,
35
+ "grad_norm": 0.26373574137687683,
36
+ "learning_rate": 2.17877094972067e-07,
37
+ "loss": 0.7012194156646728,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.08421052631578947,
42
+ "grad_norm": 0.2081507444381714,
43
+ "learning_rate": 2.7374301675977653e-07,
44
+ "loss": 0.6083873748779297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.10105263157894737,
49
+ "grad_norm": 0.2091236114501953,
50
+ "learning_rate": 3.29608938547486e-07,
51
+ "loss": 0.6980491638183594,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.11789473684210526,
56
+ "grad_norm": 0.20970331132411957,
57
+ "learning_rate": 3.8547486033519547e-07,
58
+ "loss": 0.708641767501831,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.13473684210526315,
63
+ "grad_norm": 0.18810197710990906,
64
+ "learning_rate": 4.41340782122905e-07,
65
+ "loss": 0.6742453098297119,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.15157894736842106,
70
+ "grad_norm": 0.20251069962978363,
71
+ "learning_rate": 4.972067039106145e-07,
72
+ "loss": 0.6590609550476074,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.16842105263157894,
77
+ "grad_norm": 0.2644217908382416,
78
+ "learning_rate": 5.53072625698324e-07,
79
+ "loss": 0.704926872253418,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.18526315789473685,
84
+ "grad_norm": 0.23766489326953888,
85
+ "learning_rate": 6.089385474860335e-07,
86
+ "loss": 0.7445036888122558,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.20210526315789473,
91
+ "grad_norm": 0.27427056431770325,
92
+ "learning_rate": 6.64804469273743e-07,
93
+ "loss": 0.7476531028747558,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.21894736842105264,
98
+ "grad_norm": 0.3208928406238556,
99
+ "learning_rate": 7.206703910614524e-07,
100
+ "loss": 0.7291872501373291,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.23578947368421052,
105
+ "grad_norm": 0.3123615086078644,
106
+ "learning_rate": 7.76536312849162e-07,
107
+ "loss": 0.721175241470337,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.25263157894736843,
112
+ "grad_norm": 0.26158222556114197,
113
+ "learning_rate": 8.324022346368714e-07,
114
+ "loss": 0.7556095600128174,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.2694736842105263,
119
+ "grad_norm": 0.2592650353908539,
120
+ "learning_rate": 8.88268156424581e-07,
121
+ "loss": 0.7328392505645752,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.2863157894736842,
126
+ "grad_norm": 0.24533776938915253,
127
+ "learning_rate": 9.441340782122904e-07,
128
+ "loss": 0.6990129470825195,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.3031578947368421,
133
+ "grad_norm": 0.23409004509449005,
134
+ "learning_rate": 1e-06,
135
+ "loss": 0.6694639205932618,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.32,
140
+ "grad_norm": 0.3267499506473541,
141
+ "learning_rate": 9.999039806396227e-07,
142
+ "loss": 0.7123252868652343,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.3368421052631579,
147
+ "grad_norm": 0.2115064263343811,
148
+ "learning_rate": 9.996159594373611e-07,
149
+ "loss": 0.6858412742614746,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.35368421052631577,
154
+ "grad_norm": 0.26226580142974854,
155
+ "learning_rate": 9.991360470156615e-07,
156
+ "loss": 0.6541069507598877,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.3705263157894737,
161
+ "grad_norm": 0.24552594125270844,
162
+ "learning_rate": 9.984644276980594e-07,
163
+ "loss": 0.6506116390228271,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.3873684210526316,
168
+ "grad_norm": 0.25084301829338074,
169
+ "learning_rate": 9.976013594383835e-07,
170
+ "loss": 0.6540626049041748,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.40421052631578946,
175
+ "grad_norm": 0.34244054555892944,
176
+ "learning_rate": 9.965471737216833e-07,
177
+ "loss": 0.6737770557403564,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.42105263157894735,
182
+ "grad_norm": 0.34752583503723145,
183
+ "learning_rate": 9.953022754369114e-07,
184
+ "loss": 0.6755708217620849,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.4378947368421053,
189
+ "grad_norm": 0.31017956137657166,
190
+ "learning_rate": 9.938671427214158e-07,
191
+ "loss": 0.6578442573547363,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.45473684210526316,
196
+ "grad_norm": 0.21509627997875214,
197
+ "learning_rate": 9.922423267772986e-07,
198
+ "loss": 0.639409875869751,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.47157894736842104,
203
+ "grad_norm": 0.3022947609424591,
204
+ "learning_rate": 9.904284516597102e-07,
205
+ "loss": 0.5995691776275635,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.4884210526315789,
210
+ "grad_norm": 0.3367304801940918,
211
+ "learning_rate": 9.884262140371648e-07,
212
+ "loss": 0.5898309707641601,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.5052631578947369,
217
+ "grad_norm": 0.294842928647995,
218
+ "learning_rate": 9.862363829239662e-07,
219
+ "loss": 0.6371779441833496,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.5221052631578947,
224
+ "grad_norm": 0.25171560049057007,
225
+ "learning_rate": 9.838597993848456e-07,
226
+ "loss": 0.5795581817626954,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.5389473684210526,
231
+ "grad_norm": 0.2818540036678314,
232
+ "learning_rate": 9.81297376211928e-07,
233
+ "loss": 0.5668415546417236,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.5557894736842105,
238
+ "grad_norm": 0.32951900362968445,
239
+ "learning_rate": 9.785500975741498e-07,
240
+ "loss": 0.5933257102966308,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.5726315789473684,
245
+ "grad_norm": 0.2763514518737793,
246
+ "learning_rate": 9.756190186392615e-07,
247
+ "loss": 0.5574678897857666,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.5894736842105263,
252
+ "grad_norm": 0.3070182204246521,
253
+ "learning_rate": 9.725052651685612e-07,
254
+ "loss": 0.5532425880432129,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.6063157894736843,
259
+ "grad_norm": 0.2079988420009613,
260
+ "learning_rate": 9.692100330845153e-07,
261
+ "loss": 0.5613389492034913,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.6231578947368421,
266
+ "grad_norm": 0.282924622297287,
267
+ "learning_rate": 9.657345880114318e-07,
268
+ "loss": 0.5131485939025879,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.64,
273
+ "grad_norm": 0.20901450514793396,
274
+ "learning_rate": 9.620802647893623e-07,
275
+ "loss": 0.6279027462005615,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.6568421052631579,
280
+ "grad_norm": 0.2637634575366974,
281
+ "learning_rate": 9.58248466961421e-07,
282
+ "loss": 0.5403085231781006,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.6736842105263158,
287
+ "grad_norm": 0.29078468680381775,
288
+ "learning_rate": 9.542406662347137e-07,
289
+ "loss": 0.5678809642791748,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.6905263157894737,
294
+ "grad_norm": 0.2865101397037506,
295
+ "learning_rate": 9.500584019150895e-07,
296
+ "loss": 0.5479135036468505,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.7073684210526315,
301
+ "grad_norm": 0.22857311367988586,
302
+ "learning_rate": 9.45703280315928e-07,
303
+ "loss": 0.5604462623596191,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.7242105263157895,
308
+ "grad_norm": 0.23971959948539734,
309
+ "learning_rate": 9.411769741411903e-07,
310
+ "loss": 0.4704423427581787,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.7410526315789474,
315
+ "grad_norm": 0.29793378710746765,
316
+ "learning_rate": 9.364812218429721e-07,
317
+ "loss": 0.560968017578125,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.7578947368421053,
322
+ "grad_norm": 0.2236040234565735,
323
+ "learning_rate": 9.316178269538014e-07,
324
+ "loss": 0.5088452816009521,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.7747368421052632,
329
+ "grad_norm": 0.22047854959964752,
330
+ "learning_rate": 9.265886573939446e-07,
331
+ "loss": 0.5030550956726074,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.791578947368421,
336
+ "grad_norm": 0.2273361086845398,
337
+ "learning_rate": 9.213956447539792e-07,
338
+ "loss": 0.46353440284729003,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.8084210526315789,
343
+ "grad_norm": 0.2170158326625824,
344
+ "learning_rate": 9.160407835529136e-07,
345
+ "loss": 0.49871411323547366,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.8252631578947368,
350
+ "grad_norm": 0.19333498179912567,
351
+ "learning_rate": 9.105261304721375e-07,
352
+ "loss": 0.4416178226470947,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.8421052631578947,
357
+ "grad_norm": 0.18490085005760193,
358
+ "learning_rate": 9.048538035654969e-07,
359
+ "loss": 0.39783194065093996,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.8589473684210527,
364
+ "grad_norm": 0.22122648358345032,
365
+ "learning_rate": 8.990259814457977e-07,
366
+ "loss": 0.4318229198455811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.8757894736842106,
371
+ "grad_norm": 0.17448943853378296,
372
+ "learning_rate": 8.930449024480491e-07,
373
+ "loss": 0.42445807456970214,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.8926315789473684,
378
+ "grad_norm": 0.18165165185928345,
379
+ "learning_rate": 8.8691286376977e-07,
380
+ "loss": 0.46429901123046874,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.9094736842105263,
385
+ "grad_norm": 0.16785287857055664,
386
+ "learning_rate": 8.806322205886873e-07,
387
+ "loss": 0.3975703239440918,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.9263157894736842,
392
+ "grad_norm": 0.1613738089799881,
393
+ "learning_rate": 8.74205385158165e-07,
394
+ "loss": 0.4458911418914795,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.9431578947368421,
399
+ "grad_norm": 0.15376177430152893,
400
+ "learning_rate": 8.676348258807121e-07,
401
+ "loss": 0.45571184158325195,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.96,
406
+ "grad_norm": 0.14966322481632233,
407
+ "learning_rate": 8.609230663599254e-07,
408
+ "loss": 0.4039600372314453,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.9768421052631578,
413
+ "grad_norm": 0.16819055378437042,
414
+ "learning_rate": 8.540726844312294e-07,
415
+ "loss": 0.4382494926452637,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.9936842105263158,
420
+ "grad_norm": 0.16405776143074036,
421
+ "learning_rate": 8.470863111717889e-07,
422
+ "loss": 0.4306180477142334,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 1.0101052631578948,
427
+ "grad_norm": 0.18503950536251068,
428
+ "learning_rate": 8.399666298899706e-07,
429
+ "loss": 0.39806089401245115,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 1.0269473684210526,
434
+ "grad_norm": 0.14375492930412292,
435
+ "learning_rate": 8.327163750947457e-07,
436
+ "loss": 0.4271697044372559,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 1.0437894736842106,
441
+ "grad_norm": 0.1412728875875473,
442
+ "learning_rate": 8.253383314454263e-07,
443
+ "loss": 0.3939049243927002,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 1.0606315789473684,
448
+ "grad_norm": 0.20121850073337555,
449
+ "learning_rate": 8.178353326821404e-07,
450
+ "loss": 0.43197131156921387,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 1.0774736842105264,
455
+ "grad_norm": 0.17767728865146637,
456
+ "learning_rate": 8.102102605374566e-07,
457
+ "loss": 0.437807559967041,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 1.0943157894736841,
462
+ "grad_norm": 0.1498359888792038,
463
+ "learning_rate": 8.024660436295759e-07,
464
+ "loss": 0.38409013748168946,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 1.1111578947368421,
469
+ "grad_norm": 0.15958793461322784,
470
+ "learning_rate": 7.946056563375145e-07,
471
+ "loss": 0.4204962730407715,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 1.1280000000000001,
476
+ "grad_norm": 0.157291978597641,
477
+ "learning_rate": 7.866321176587128e-07,
478
+ "loss": 0.42113161087036133,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 1.1448421052631579,
483
+ "grad_norm": 0.14119838178157806,
484
+ "learning_rate": 7.785484900495065e-07,
485
+ "loss": 0.4151731491088867,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 1.1616842105263159,
490
+ "grad_norm": 0.1296525001525879,
491
+ "learning_rate": 7.703578782489058e-07,
492
+ "loss": 0.38312902450561526,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 1.1785263157894736,
497
+ "grad_norm": 0.13671696186065674,
498
+ "learning_rate": 7.620634280861351e-07,
499
+ "loss": 0.42612557411193847,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 1.1953684210526316,
504
+ "grad_norm": 0.15196114778518677,
505
+ "learning_rate": 7.536683252723923e-07,
506
+ "loss": 0.4306772708892822,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 1.2122105263157894,
511
+ "grad_norm": 0.1136903315782547,
512
+ "learning_rate": 7.451757941772868e-07,
513
+ "loss": 0.38483757972717286,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 1.2290526315789474,
518
+ "grad_norm": 0.12378744781017303,
519
+ "learning_rate": 7.365890965904337e-07,
520
+ "loss": 0.4030342102050781,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 1.2458947368421052,
525
+ "grad_norm": 0.1265542209148407,
526
+ "learning_rate": 7.279115304686733e-07,
527
+ "loss": 0.4091166973114014,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 1.2627368421052632,
532
+ "grad_norm": 0.11647409200668335,
533
+ "learning_rate": 7.191464286694e-07,
534
+ "loss": 0.41426806449890136,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 1.279578947368421,
539
+ "grad_norm": 0.11192695051431656,
540
+ "learning_rate": 7.102971576704875e-07,
541
+ "loss": 0.38181486129760744,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 1.296421052631579,
546
+ "grad_norm": 0.14947861433029175,
547
+ "learning_rate": 7.013671162773003e-07,
548
+ "loss": 0.39824953079223635,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 1.313263157894737,
553
+ "grad_norm": 0.11269424855709076,
554
+ "learning_rate": 6.923597343172891e-07,
555
+ "loss": 0.40348024368286134,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 1.3301052631578947,
560
+ "grad_norm": 0.3742346167564392,
561
+ "learning_rate": 6.83278471322672e-07,
562
+ "loss": 0.38022048473358155,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 1.3469473684210527,
567
+ "grad_norm": 0.1310902237892151,
568
+ "learning_rate": 6.741268152017057e-07,
569
+ "loss": 0.42791285514831545,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 1.3637894736842107,
574
+ "grad_norm": 0.1692703813314438,
575
+ "learning_rate": 6.649082808990585e-07,
576
+ "loss": 0.4263493061065674,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 1.3806315789473684,
581
+ "grad_norm": 0.1279117316007614,
582
+ "learning_rate": 6.556264090457998e-07,
583
+ "loss": 0.37379777431488037,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 1.3974736842105262,
588
+ "grad_norm": 0.12949039041996002,
589
+ "learning_rate": 6.462847645995237e-07,
590
+ "loss": 0.38636391162872313,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 1.4143157894736842,
595
+ "grad_norm": 0.10221126675605774,
596
+ "learning_rate": 6.368869354751284e-07,
597
+ "loss": 0.408221435546875,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 1.4311578947368422,
602
+ "grad_norm": 0.11505889147520065,
603
+ "learning_rate": 6.274365311667797e-07,
604
+ "loss": 0.3951406717300415,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 1.448,
609
+ "grad_norm": 0.11054962873458862,
610
+ "learning_rate": 6.179371813615859e-07,
611
+ "loss": 0.3732129096984863,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 1.464842105263158,
616
+ "grad_norm": 0.10150120407342911,
617
+ "learning_rate": 6.083925345455158e-07,
618
+ "loss": 0.38601529598236084,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 1.4816842105263157,
623
+ "grad_norm": 0.12239400297403336,
624
+ "learning_rate": 5.988062566020986e-07,
625
+ "loss": 0.3859985828399658,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 1.4985263157894737,
630
+ "grad_norm": 0.15801067650318146,
631
+ "learning_rate": 5.891820294044408e-07,
632
+ "loss": 0.3983951807022095,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 1.5153684210526315,
637
+ "grad_norm": 0.10104545950889587,
638
+ "learning_rate": 5.795235494011007e-07,
639
+ "loss": 0.41107850074768065,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 1.5322105263157895,
644
+ "grad_norm": 0.1378099024295807,
645
+ "learning_rate": 5.698345261963668e-07,
646
+ "loss": 0.3708331823348999,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 1.5490526315789475,
651
+ "grad_norm": 0.12936057150363922,
652
+ "learning_rate": 5.601186811254825e-07,
653
+ "loss": 0.387884521484375,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 1.5658947368421052,
658
+ "grad_norm": 0.12379129230976105,
659
+ "learning_rate": 5.503797458253646e-07,
660
+ "loss": 0.43808717727661134,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 1.582736842105263,
665
+ "grad_norm": 0.12017743289470673,
666
+ "learning_rate": 5.406214608013662e-07,
667
+ "loss": 0.41345391273498533,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 1.5995789473684212,
672
+ "grad_norm": 0.1095535159111023,
673
+ "learning_rate": 5.308475739906328e-07,
674
+ "loss": 0.40022664070129393,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 1.616421052631579,
679
+ "grad_norm": 0.13831396400928497,
680
+ "learning_rate": 5.210618393226045e-07,
681
+ "loss": 0.3909924983978271,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 1.6332631578947368,
686
+ "grad_norm": 0.10449163615703583,
687
+ "learning_rate": 5.112680152772156e-07,
688
+ "loss": 0.37143146991729736,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 1.6501052631578947,
693
+ "grad_norm": 0.11249610036611557,
694
+ "learning_rate": 5.01469863441348e-07,
695
+ "loss": 0.38103113174438474,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 1.6669473684210527,
700
+ "grad_norm": 0.13718819618225098,
701
+ "learning_rate": 4.916711470640907e-07,
702
+ "loss": 0.4071629524230957,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 1.6837894736842105,
707
+ "grad_norm": 0.10473571717739105,
708
+ "learning_rate": 4.818756296113595e-07,
709
+ "loss": 0.417419958114624,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 1.7006315789473683,
714
+ "grad_norm": 0.10846224427223206,
715
+ "learning_rate": 4.7208707332043623e-07,
716
+ "loss": 0.3998772859573364,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 1.7174736842105263,
721
+ "grad_norm": 0.10248563438653946,
722
+ "learning_rate": 4.6230923775497714e-07,
723
+ "loss": 0.38056583404541017,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 1.7343157894736843,
728
+ "grad_norm": 0.12221980094909668,
729
+ "learning_rate": 4.5254587836104964e-07,
730
+ "loss": 0.39371190071105955,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 1.751157894736842,
735
+ "grad_norm": 0.10641586035490036,
736
+ "learning_rate": 4.4280074502475017e-07,
737
+ "loss": 0.4280440330505371,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 1.768,
742
+ "grad_norm": 0.12907131016254425,
743
+ "learning_rate": 4.3307758063195796e-07,
744
+ "loss": 0.3791615962982178,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 1.784842105263158,
749
+ "grad_norm": 0.12383506447076797,
750
+ "learning_rate": 4.233801196307762e-07,
751
+ "loss": 0.347782301902771,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 1.8016842105263158,
756
+ "grad_norm": 0.12547679245471954,
757
+ "learning_rate": 4.1371208659721536e-07,
758
+ "loss": 0.38370628356933595,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 1.8185263157894735,
763
+ "grad_norm": 0.10580642521381378,
764
+ "learning_rate": 4.0407719480466736e-07,
765
+ "loss": 0.40404376983642576,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 1.8353684210526315,
770
+ "grad_norm": 0.1055402085185051,
771
+ "learning_rate": 3.944791447977213e-07,
772
+ "loss": 0.4167450428009033,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 1.8522105263157895,
777
+ "grad_norm": 0.11053823679685593,
778
+ "learning_rate": 3.849216229708671e-07,
779
+ "loss": 0.4046513080596924,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 1.8690526315789473,
784
+ "grad_norm": 0.10185246914625168,
785
+ "learning_rate": 3.7540830015263526e-07,
786
+ "loss": 0.39672977924346925,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 1.8858947368421053,
791
+ "grad_norm": 0.08342823386192322,
792
+ "learning_rate": 3.6594283019571416e-07,
793
+ "loss": 0.39356396198272703,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 1.9027368421052633,
798
+ "grad_norm": 0.11821646988391876,
799
+ "learning_rate": 3.565288485735874e-07,
800
+ "loss": 0.42082643508911133,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 1.919578947368421,
805
+ "grad_norm": 0.1106327474117279,
806
+ "learning_rate": 3.4716997098423085e-07,
807
+ "loss": 0.34105117321014405,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 1.9364210526315788,
812
+ "grad_norm": 0.11533800512552261,
813
+ "learning_rate": 3.378697919614045e-07,
814
+ "loss": 0.3924069404602051,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 1.9532631578947368,
819
+ "grad_norm": 0.1431114822626114,
820
+ "learning_rate": 3.286318834940729e-07,
821
+ "loss": 0.3922377586364746,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 1.9701052631578948,
826
+ "grad_norm": 0.16050194203853607,
827
+ "learning_rate": 3.1945979365448517e-07,
828
+ "loss": 0.3745201587677002,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 1.9869473684210526,
833
+ "grad_norm": 0.11921833455562592,
834
+ "learning_rate": 3.103570452354402e-07,
835
+ "loss": 0.40110602378845217,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 2.0033684210526315,
840
+ "grad_norm": 0.0832003727555275,
841
+ "learning_rate": 3.013271343972613e-07,
842
+ "loss": 0.3981154918670654,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 2.0202105263157897,
847
+ "grad_norm": 0.09975888580083847,
848
+ "learning_rate": 2.9237352932500046e-07,
849
+ "loss": 0.3726134061813354,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 2.0370526315789474,
854
+ "grad_norm": 0.14600081741809845,
855
+ "learning_rate": 2.8349966889638615e-07,
856
+ "loss": 0.42558698654174804,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 2.053894736842105,
861
+ "grad_norm": 0.10875770449638367,
862
+ "learning_rate": 2.747089613610278e-07,
863
+ "loss": 0.3682931184768677,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 2.070736842105263,
868
+ "grad_norm": 0.10050549358129501,
869
+ "learning_rate": 2.66004783031385e-07,
870
+ "loss": 0.3756644487380981,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 2.087578947368421,
875
+ "grad_norm": 0.08914914727210999,
876
+ "learning_rate": 2.573904769860009e-07,
877
+ "loss": 0.3804330825805664,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 2.104421052631579,
882
+ "grad_norm": 0.08296852558851242,
883
+ "learning_rate": 2.488693517855016e-07,
884
+ "loss": 0.3978404521942139,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 2.1212631578947367,
889
+ "grad_norm": 0.13885149359703064,
890
+ "learning_rate": 2.404446802018533e-07,
891
+ "loss": 0.3935218334197998,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 2.138105263157895,
896
+ "grad_norm": 0.13195137679576874,
897
+ "learning_rate": 2.3211969796136305e-07,
898
+ "loss": 0.42966952323913576,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 2.1549473684210527,
903
+ "grad_norm": 0.13367892801761627,
904
+ "learning_rate": 2.2389760250191038e-07,
905
+ "loss": 0.3679579019546509,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 2.1717894736842105,
910
+ "grad_norm": 0.1288345605134964,
911
+ "learning_rate": 2.1578155174488343e-07,
912
+ "loss": 0.41324810981750487,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 2.1886315789473683,
917
+ "grad_norm": 0.09626021236181259,
918
+ "learning_rate": 2.0777466288229205e-07,
919
+ "loss": 0.40120248794555663,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 2.2054736842105265,
924
+ "grad_norm": 0.10264381766319275,
925
+ "learning_rate": 1.9988001117952485e-07,
926
+ "loss": 0.3501007080078125,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 2.2223157894736842,
931
+ "grad_norm": 0.09031466394662857,
932
+ "learning_rate": 1.9210062879420973e-07,
933
+ "loss": 0.3839429378509521,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 2.239157894736842,
938
+ "grad_norm": 0.12686079740524292,
939
+ "learning_rate": 1.8443950361162957e-07,
940
+ "loss": 0.4338528156280518,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 2.2560000000000002,
945
+ "grad_norm": 0.12199016660451889,
946
+ "learning_rate": 1.7689957809714346e-07,
947
+ "loss": 0.39229888916015626,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 2.272842105263158,
952
+ "grad_norm": 0.12029567360877991,
953
+ "learning_rate": 1.694837481660525e-07,
954
+ "loss": 0.38006880283355715,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 2.2896842105263158,
959
+ "grad_norm": 0.08686309307813644,
960
+ "learning_rate": 1.6219486207134313e-07,
961
+ "loss": 0.3808159589767456,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 2.3065263157894735,
966
+ "grad_norm": 0.10810462385416031,
967
+ "learning_rate": 1.5503571930973785e-07,
968
+ "loss": 0.401824426651001,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 2.3233684210526317,
973
+ "grad_norm": 0.10281873494386673,
974
+ "learning_rate": 1.480090695464723e-07,
975
+ "loss": 0.40149493217468263,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 2.3402105263157895,
980
+ "grad_norm": 0.09503985196352005,
981
+ "learning_rate": 1.4111761155920975e-07,
982
+ "loss": 0.38567726612091063,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 2.3570526315789473,
987
+ "grad_norm": 0.10420782119035721,
988
+ "learning_rate": 1.3436399220150212e-07,
989
+ "loss": 0.3759742736816406,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 2.3738947368421055,
994
+ "grad_norm": 0.10681115835905075,
995
+ "learning_rate": 1.2775080538619347e-07,
996
+ "loss": 0.3913698196411133,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 2.3907368421052633,
1001
+ "grad_norm": 0.10323983430862427,
1002
+ "learning_rate": 1.2128059108915595e-07,
1003
+ "loss": 0.39077584743499755,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 2.407578947368421,
1008
+ "grad_norm": 0.09566064178943634,
1009
+ "learning_rate": 1.1495583437374263e-07,
1010
+ "loss": 0.39895172119140626,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 2.424421052631579,
1015
+ "grad_norm": 0.13018426299095154,
1016
+ "learning_rate": 1.0877896443633117e-07,
1017
+ "loss": 0.38982129096984863,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 2.441263157894737,
1022
+ "grad_norm": 0.10760781168937683,
1023
+ "learning_rate": 1.0275235367332347e-07,
1024
+ "loss": 0.3756714344024658,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 2.458105263157895,
1029
+ "grad_norm": 0.11606904864311218,
1030
+ "learning_rate": 9.687831676996238e-08,
1031
+ "loss": 0.37858171463012696,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 2.4749473684210526,
1036
+ "grad_norm": 0.12957172095775604,
1037
+ "learning_rate": 9.115910981131336e-08,
1038
+ "loss": 0.40050196647644043,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 2.4917894736842103,
1043
+ "grad_norm": 0.11186131089925766,
1044
+ "learning_rate": 8.559692941575231e-08,
1045
+ "loss": 0.3684133291244507,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 2.5086315789473685,
1050
+ "grad_norm": 0.13279542326927185,
1051
+ "learning_rate": 8.019391189129466e-08,
1052
+ "loss": 0.3452518224716187,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 2.5254736842105263,
1057
+ "grad_norm": 0.09041756391525269,
1058
+ "learning_rate": 7.495213241508786e-08,
1059
+ "loss": 0.36301617622375487,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 2.542315789473684,
1064
+ "grad_norm": 0.10033190995454788,
1065
+ "learning_rate": 6.987360423638205e-08,
1066
+ "loss": 0.3706004858016968,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 2.559157894736842,
1071
+ "grad_norm": 0.10681814700365067,
1072
+ "learning_rate": 6.49602779032865e-08,
1073
+ "loss": 0.36011199951171874,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 2.576,
1078
+ "grad_norm": 0.1008416935801506,
1079
+ "learning_rate": 6.02140405136089e-08,
1080
+ "loss": 0.37473766803741454,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 2.592842105263158,
1085
+ "grad_norm": 0.11559010297060013,
1086
+ "learning_rate": 5.5636714990062393e-08,
1087
+ "loss": 0.39232525825500486,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 2.609684210526316,
1092
+ "grad_norm": 0.10601615905761719,
1093
+ "learning_rate": 5.1230059380123034e-08,
1094
+ "loss": 0.34370343685150145,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 2.626526315789474,
1099
+ "grad_norm": 0.11516924202442169,
1100
+ "learning_rate": 4.699576618080331e-08,
1101
+ "loss": 0.39509878158569334,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 2.6433684210526316,
1106
+ "grad_norm": 0.11444627493619919,
1107
+ "learning_rate": 4.293546168860163e-08,
1108
+ "loss": 0.3881126165390015,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 2.6602105263157894,
1113
+ "grad_norm": 0.09985339641571045,
1114
+ "learning_rate": 3.9050705374879086e-08,
1115
+ "loss": 0.34624040126800537,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 2.677052631578947,
1120
+ "grad_norm": 0.10439962148666382,
1121
+ "learning_rate": 3.534298928690166e-08,
1122
+ "loss": 0.35141232013702395,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 2.6938947368421053,
1127
+ "grad_norm": 0.12180087715387344,
1128
+ "learning_rate": 3.181373747477822e-08,
1129
+ "loss": 0.39980330467224123,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 2.710736842105263,
1134
+ "grad_norm": 0.12150874733924866,
1135
+ "learning_rate": 2.8464305444515112e-08,
1136
+ "loss": 0.3560852766036987,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 2.7275789473684213,
1141
+ "grad_norm": 0.11374734342098236,
1142
+ "learning_rate": 2.5295979637397213e-08,
1143
+ "loss": 0.39339067935943606,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 2.744421052631579,
1148
+ "grad_norm": 0.11705286055803299,
1149
+ "learning_rate": 2.2309976935894203e-08,
1150
+ "loss": 0.38021705150604246,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 2.761263157894737,
1155
+ "grad_norm": 0.09300459921360016,
1156
+ "learning_rate": 1.9507444196284195e-08,
1157
+ "loss": 0.3467890739440918,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 2.7781052631578946,
1162
+ "grad_norm": 0.08709974586963654,
1163
+ "learning_rate": 1.688945780817147e-08,
1164
+ "loss": 0.38266596794128416,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 2.7949473684210524,
1169
+ "grad_norm": 0.12327069044113159,
1170
+ "learning_rate": 1.445702328106979e-08,
1171
+ "loss": 0.34164865016937257,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 2.8117894736842106,
1176
+ "grad_norm": 0.12361253052949905,
1177
+ "learning_rate": 1.2211074858209103e-08,
1178
+ "loss": 0.3876492977142334,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 2.8286315789473684,
1183
+ "grad_norm": 0.120187908411026,
1184
+ "learning_rate": 1.0152475157713392e-08,
1185
+ "loss": 0.3703944206237793,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 2.845473684210526,
1190
+ "grad_norm": 0.10741006582975388,
1191
+ "learning_rate": 8.282014841288653e-09,
1192
+ "loss": 0.3735771656036377,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 2.8623157894736844,
1197
+ "grad_norm": 0.10451704263687134,
1198
+ "learning_rate": 6.600412310547754e-09,
1199
+ "loss": 0.3807518005371094,
1200
+ "step": 1700
1201
+ }
1202
+ ],
1203
+ "logging_steps": 10,
1204
+ "max_steps": 1782,
1205
+ "num_input_tokens_seen": 0,
1206
+ "num_train_epochs": 3,
1207
+ "save_steps": 100,
1208
+ "stateful_callbacks": {
1209
+ "TrainerControl": {
1210
+ "args": {
1211
+ "should_epoch_stop": false,
1212
+ "should_evaluate": false,
1213
+ "should_log": false,
1214
+ "should_save": true,
1215
+ "should_training_stop": false
1216
+ },
1217
+ "attributes": {}
1218
+ }
1219
+ },
1220
+ "total_flos": 3.1088780591683584e+16,
1221
+ "train_batch_size": 1,
1222
+ "trial_name": null,
1223
+ "trial_params": null
1224
+ }
checkpoint-1782/README.md ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-1b-it
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:google/gemma-3-1b-it
7
+ - llama-factory
8
+ - lora
9
+ - transformers
10
+ ---
11
+
12
+ # Model Card for Model ID
13
+
14
+ <!-- Provide a quick summary of what the model is/does. -->
15
+
16
+
17
+
18
+ ## Model Details
19
+
20
+ ### Model Description
21
+
22
+ <!-- Provide a longer summary of what this model is. -->
23
+
24
+
25
+
26
+ - **Developed by:** [More Information Needed]
27
+ - **Funded by [optional]:** [More Information Needed]
28
+ - **Shared by [optional]:** [More Information Needed]
29
+ - **Model type:** [More Information Needed]
30
+ - **Language(s) (NLP):** [More Information Needed]
31
+ - **License:** [More Information Needed]
32
+ - **Finetuned from model [optional]:** [More Information Needed]
33
+
34
+ ### Model Sources [optional]
35
+
36
+ <!-- Provide the basic links for the model. -->
37
+
38
+ - **Repository:** [More Information Needed]
39
+ - **Paper [optional]:** [More Information Needed]
40
+ - **Demo [optional]:** [More Information Needed]
41
+
42
+ ## Uses
43
+
44
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
45
+
46
+ ### Direct Use
47
+
48
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Downstream Use [optional]
53
+
54
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
55
+
56
+ [More Information Needed]
57
+
58
+ ### Out-of-Scope Use
59
+
60
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ## Bias, Risks, and Limitations
65
+
66
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
67
+
68
+ [More Information Needed]
69
+
70
+ ### Recommendations
71
+
72
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
73
+
74
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
75
+
76
+ ## How to Get Started with the Model
77
+
78
+ Use the code below to get started with the model.
79
+
80
+ [More Information Needed]
81
+
82
+ ## Training Details
83
+
84
+ ### Training Data
85
+
86
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
87
+
88
+ [More Information Needed]
89
+
90
+ ### Training Procedure
91
+
92
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
93
+
94
+ #### Preprocessing [optional]
95
+
96
+ [More Information Needed]
97
+
98
+
99
+ #### Training Hyperparameters
100
+
101
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
102
+
103
+ #### Speeds, Sizes, Times [optional]
104
+
105
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
106
+
107
+ [More Information Needed]
108
+
109
+ ## Evaluation
110
+
111
+ <!-- This section describes the evaluation protocols and provides the results. -->
112
+
113
+ ### Testing Data, Factors & Metrics
114
+
115
+ #### Testing Data
116
+
117
+ <!-- This should link to a Dataset Card if possible. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Factors
122
+
123
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
124
+
125
+ [More Information Needed]
126
+
127
+ #### Metrics
128
+
129
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
130
+
131
+ [More Information Needed]
132
+
133
+ ### Results
134
+
135
+ [More Information Needed]
136
+
137
+ #### Summary
138
+
139
+
140
+
141
+ ## Model Examination [optional]
142
+
143
+ <!-- Relevant interpretability work for the model goes here -->
144
+
145
+ [More Information Needed]
146
+
147
+ ## Environmental Impact
148
+
149
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
150
+
151
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
152
+
153
+ - **Hardware Type:** [More Information Needed]
154
+ - **Hours used:** [More Information Needed]
155
+ - **Cloud Provider:** [More Information Needed]
156
+ - **Compute Region:** [More Information Needed]
157
+ - **Carbon Emitted:** [More Information Needed]
158
+
159
+ ## Technical Specifications [optional]
160
+
161
+ ### Model Architecture and Objective
162
+
163
+ [More Information Needed]
164
+
165
+ ### Compute Infrastructure
166
+
167
+ [More Information Needed]
168
+
169
+ #### Hardware
170
+
171
+ [More Information Needed]
172
+
173
+ #### Software
174
+
175
+ [More Information Needed]
176
+
177
+ ## Citation [optional]
178
+
179
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
180
+
181
+ **BibTeX:**
182
+
183
+ [More Information Needed]
184
+
185
+ **APA:**
186
+
187
+ [More Information Needed]
188
+
189
+ ## Glossary [optional]
190
+
191
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
192
+
193
+ [More Information Needed]
194
+
195
+ ## More Information [optional]
196
+
197
+ [More Information Needed]
198
+
199
+ ## Model Card Authors [optional]
200
+
201
+ [More Information Needed]
202
+
203
+ ## Model Card Contact
204
+
205
+ [More Information Needed]
206
+ ### Framework versions
207
+
208
+ - PEFT 0.18.1
checkpoint-1782/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "google/gemma-3-1b-it",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 16,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 8,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "up_proj",
33
+ "v_proj",
34
+ "down_proj",
35
+ "o_proj",
36
+ "k_proj",
37
+ "gate_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-1782/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2fccca914644fd456afb68abc1f7307cc09f76cd552cb49bc0a910ee900a73e
3
+ size 26139264
checkpoint-1782/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
checkpoint-1782/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6755df68ed122f1f13f9150cae71db9987413a5e7f10ce12895cbd2aad495e84
3
+ size 52494119
checkpoint-1782/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74aefb1dc1340a25f29ab8370384b9ed24b2d921d7749ece7bbcfcfdf00d497
3
+ size 33384443
checkpoint-1782/tokenizer_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<end_of_turn>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "mask_token": "<mask>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "model_specific_special_tokens": {
13
+ "boi_token": "<start_of_image>",
14
+ "eoi_token": "<end_of_image>",
15
+ "image_token": "<image_soft_token>"
16
+ },
17
+ "pad_token": "<pad>",
18
+ "padding_side": "right",
19
+ "processor_class": "Gemma3Processor",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "split_special_tokens": false,
23
+ "tokenizer_class": "GemmaTokenizer",
24
+ "unk_token": "<unk>",
25
+ "use_default_system_prompt": false
26
+ }
checkpoint-1782/trainer_state.json ADDED
@@ -0,0 +1,1280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 100,
7
+ "global_step": 1782,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016842105263157894,
14
+ "grad_norm": 0.21757784485816956,
15
+ "learning_rate": 5.027932960893855e-08,
16
+ "loss": 0.7252199172973632,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.03368421052631579,
21
+ "grad_norm": 0.2456846386194229,
22
+ "learning_rate": 1.0614525139664805e-07,
23
+ "loss": 0.6507451057434082,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.05052631578947368,
28
+ "grad_norm": 0.20819272100925446,
29
+ "learning_rate": 1.6201117318435754e-07,
30
+ "loss": 0.7381344795227051,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.06736842105263158,
35
+ "grad_norm": 0.26373574137687683,
36
+ "learning_rate": 2.17877094972067e-07,
37
+ "loss": 0.7012194156646728,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.08421052631578947,
42
+ "grad_norm": 0.2081507444381714,
43
+ "learning_rate": 2.7374301675977653e-07,
44
+ "loss": 0.6083873748779297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.10105263157894737,
49
+ "grad_norm": 0.2091236114501953,
50
+ "learning_rate": 3.29608938547486e-07,
51
+ "loss": 0.6980491638183594,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.11789473684210526,
56
+ "grad_norm": 0.20970331132411957,
57
+ "learning_rate": 3.8547486033519547e-07,
58
+ "loss": 0.708641767501831,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.13473684210526315,
63
+ "grad_norm": 0.18810197710990906,
64
+ "learning_rate": 4.41340782122905e-07,
65
+ "loss": 0.6742453098297119,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.15157894736842106,
70
+ "grad_norm": 0.20251069962978363,
71
+ "learning_rate": 4.972067039106145e-07,
72
+ "loss": 0.6590609550476074,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.16842105263157894,
77
+ "grad_norm": 0.2644217908382416,
78
+ "learning_rate": 5.53072625698324e-07,
79
+ "loss": 0.704926872253418,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.18526315789473685,
84
+ "grad_norm": 0.23766489326953888,
85
+ "learning_rate": 6.089385474860335e-07,
86
+ "loss": 0.7445036888122558,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.20210526315789473,
91
+ "grad_norm": 0.27427056431770325,
92
+ "learning_rate": 6.64804469273743e-07,
93
+ "loss": 0.7476531028747558,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.21894736842105264,
98
+ "grad_norm": 0.3208928406238556,
99
+ "learning_rate": 7.206703910614524e-07,
100
+ "loss": 0.7291872501373291,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.23578947368421052,
105
+ "grad_norm": 0.3123615086078644,
106
+ "learning_rate": 7.76536312849162e-07,
107
+ "loss": 0.721175241470337,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.25263157894736843,
112
+ "grad_norm": 0.26158222556114197,
113
+ "learning_rate": 8.324022346368714e-07,
114
+ "loss": 0.7556095600128174,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.2694736842105263,
119
+ "grad_norm": 0.2592650353908539,
120
+ "learning_rate": 8.88268156424581e-07,
121
+ "loss": 0.7328392505645752,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.2863157894736842,
126
+ "grad_norm": 0.24533776938915253,
127
+ "learning_rate": 9.441340782122904e-07,
128
+ "loss": 0.6990129470825195,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.3031578947368421,
133
+ "grad_norm": 0.23409004509449005,
134
+ "learning_rate": 1e-06,
135
+ "loss": 0.6694639205932618,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.32,
140
+ "grad_norm": 0.3267499506473541,
141
+ "learning_rate": 9.999039806396227e-07,
142
+ "loss": 0.7123252868652343,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.3368421052631579,
147
+ "grad_norm": 0.2115064263343811,
148
+ "learning_rate": 9.996159594373611e-07,
149
+ "loss": 0.6858412742614746,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.35368421052631577,
154
+ "grad_norm": 0.26226580142974854,
155
+ "learning_rate": 9.991360470156615e-07,
156
+ "loss": 0.6541069507598877,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.3705263157894737,
161
+ "grad_norm": 0.24552594125270844,
162
+ "learning_rate": 9.984644276980594e-07,
163
+ "loss": 0.6506116390228271,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.3873684210526316,
168
+ "grad_norm": 0.25084301829338074,
169
+ "learning_rate": 9.976013594383835e-07,
170
+ "loss": 0.6540626049041748,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.40421052631578946,
175
+ "grad_norm": 0.34244054555892944,
176
+ "learning_rate": 9.965471737216833e-07,
177
+ "loss": 0.6737770557403564,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.42105263157894735,
182
+ "grad_norm": 0.34752583503723145,
183
+ "learning_rate": 9.953022754369114e-07,
184
+ "loss": 0.6755708217620849,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.4378947368421053,
189
+ "grad_norm": 0.31017956137657166,
190
+ "learning_rate": 9.938671427214158e-07,
191
+ "loss": 0.6578442573547363,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.45473684210526316,
196
+ "grad_norm": 0.21509627997875214,
197
+ "learning_rate": 9.922423267772986e-07,
198
+ "loss": 0.639409875869751,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.47157894736842104,
203
+ "grad_norm": 0.3022947609424591,
204
+ "learning_rate": 9.904284516597102e-07,
205
+ "loss": 0.5995691776275635,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.4884210526315789,
210
+ "grad_norm": 0.3367304801940918,
211
+ "learning_rate": 9.884262140371648e-07,
212
+ "loss": 0.5898309707641601,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.5052631578947369,
217
+ "grad_norm": 0.294842928647995,
218
+ "learning_rate": 9.862363829239662e-07,
219
+ "loss": 0.6371779441833496,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.5221052631578947,
224
+ "grad_norm": 0.25171560049057007,
225
+ "learning_rate": 9.838597993848456e-07,
226
+ "loss": 0.5795581817626954,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.5389473684210526,
231
+ "grad_norm": 0.2818540036678314,
232
+ "learning_rate": 9.81297376211928e-07,
233
+ "loss": 0.5668415546417236,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.5557894736842105,
238
+ "grad_norm": 0.32951900362968445,
239
+ "learning_rate": 9.785500975741498e-07,
240
+ "loss": 0.5933257102966308,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.5726315789473684,
245
+ "grad_norm": 0.2763514518737793,
246
+ "learning_rate": 9.756190186392615e-07,
247
+ "loss": 0.5574678897857666,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.5894736842105263,
252
+ "grad_norm": 0.3070182204246521,
253
+ "learning_rate": 9.725052651685612e-07,
254
+ "loss": 0.5532425880432129,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.6063157894736843,
259
+ "grad_norm": 0.2079988420009613,
260
+ "learning_rate": 9.692100330845153e-07,
261
+ "loss": 0.5613389492034913,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.6231578947368421,
266
+ "grad_norm": 0.282924622297287,
267
+ "learning_rate": 9.657345880114318e-07,
268
+ "loss": 0.5131485939025879,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.64,
273
+ "grad_norm": 0.20901450514793396,
274
+ "learning_rate": 9.620802647893623e-07,
275
+ "loss": 0.6279027462005615,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.6568421052631579,
280
+ "grad_norm": 0.2637634575366974,
281
+ "learning_rate": 9.58248466961421e-07,
282
+ "loss": 0.5403085231781006,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.6736842105263158,
287
+ "grad_norm": 0.29078468680381775,
288
+ "learning_rate": 9.542406662347137e-07,
289
+ "loss": 0.5678809642791748,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.6905263157894737,
294
+ "grad_norm": 0.2865101397037506,
295
+ "learning_rate": 9.500584019150895e-07,
296
+ "loss": 0.5479135036468505,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.7073684210526315,
301
+ "grad_norm": 0.22857311367988586,
302
+ "learning_rate": 9.45703280315928e-07,
303
+ "loss": 0.5604462623596191,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.7242105263157895,
308
+ "grad_norm": 0.23971959948539734,
309
+ "learning_rate": 9.411769741411903e-07,
310
+ "loss": 0.4704423427581787,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.7410526315789474,
315
+ "grad_norm": 0.29793378710746765,
316
+ "learning_rate": 9.364812218429721e-07,
317
+ "loss": 0.560968017578125,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.7578947368421053,
322
+ "grad_norm": 0.2236040234565735,
323
+ "learning_rate": 9.316178269538014e-07,
324
+ "loss": 0.5088452816009521,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.7747368421052632,
329
+ "grad_norm": 0.22047854959964752,
330
+ "learning_rate": 9.265886573939446e-07,
331
+ "loss": 0.5030550956726074,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.791578947368421,
336
+ "grad_norm": 0.2273361086845398,
337
+ "learning_rate": 9.213956447539792e-07,
338
+ "loss": 0.46353440284729003,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.8084210526315789,
343
+ "grad_norm": 0.2170158326625824,
344
+ "learning_rate": 9.160407835529136e-07,
345
+ "loss": 0.49871411323547366,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.8252631578947368,
350
+ "grad_norm": 0.19333498179912567,
351
+ "learning_rate": 9.105261304721375e-07,
352
+ "loss": 0.4416178226470947,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.8421052631578947,
357
+ "grad_norm": 0.18490085005760193,
358
+ "learning_rate": 9.048538035654969e-07,
359
+ "loss": 0.39783194065093996,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.8589473684210527,
364
+ "grad_norm": 0.22122648358345032,
365
+ "learning_rate": 8.990259814457977e-07,
366
+ "loss": 0.4318229198455811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.8757894736842106,
371
+ "grad_norm": 0.17448943853378296,
372
+ "learning_rate": 8.930449024480491e-07,
373
+ "loss": 0.42445807456970214,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.8926315789473684,
378
+ "grad_norm": 0.18165165185928345,
379
+ "learning_rate": 8.8691286376977e-07,
380
+ "loss": 0.46429901123046874,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.9094736842105263,
385
+ "grad_norm": 0.16785287857055664,
386
+ "learning_rate": 8.806322205886873e-07,
387
+ "loss": 0.3975703239440918,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.9263157894736842,
392
+ "grad_norm": 0.1613738089799881,
393
+ "learning_rate": 8.74205385158165e-07,
394
+ "loss": 0.4458911418914795,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.9431578947368421,
399
+ "grad_norm": 0.15376177430152893,
400
+ "learning_rate": 8.676348258807121e-07,
401
+ "loss": 0.45571184158325195,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.96,
406
+ "grad_norm": 0.14966322481632233,
407
+ "learning_rate": 8.609230663599254e-07,
408
+ "loss": 0.4039600372314453,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.9768421052631578,
413
+ "grad_norm": 0.16819055378437042,
414
+ "learning_rate": 8.540726844312294e-07,
415
+ "loss": 0.4382494926452637,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.9936842105263158,
420
+ "grad_norm": 0.16405776143074036,
421
+ "learning_rate": 8.470863111717889e-07,
422
+ "loss": 0.4306180477142334,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 1.0101052631578948,
427
+ "grad_norm": 0.18503950536251068,
428
+ "learning_rate": 8.399666298899706e-07,
429
+ "loss": 0.39806089401245115,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 1.0269473684210526,
434
+ "grad_norm": 0.14375492930412292,
435
+ "learning_rate": 8.327163750947457e-07,
436
+ "loss": 0.4271697044372559,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 1.0437894736842106,
441
+ "grad_norm": 0.1412728875875473,
442
+ "learning_rate": 8.253383314454263e-07,
443
+ "loss": 0.3939049243927002,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 1.0606315789473684,
448
+ "grad_norm": 0.20121850073337555,
449
+ "learning_rate": 8.178353326821404e-07,
450
+ "loss": 0.43197131156921387,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 1.0774736842105264,
455
+ "grad_norm": 0.17767728865146637,
456
+ "learning_rate": 8.102102605374566e-07,
457
+ "loss": 0.437807559967041,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 1.0943157894736841,
462
+ "grad_norm": 0.1498359888792038,
463
+ "learning_rate": 8.024660436295759e-07,
464
+ "loss": 0.38409013748168946,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 1.1111578947368421,
469
+ "grad_norm": 0.15958793461322784,
470
+ "learning_rate": 7.946056563375145e-07,
471
+ "loss": 0.4204962730407715,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 1.1280000000000001,
476
+ "grad_norm": 0.157291978597641,
477
+ "learning_rate": 7.866321176587128e-07,
478
+ "loss": 0.42113161087036133,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 1.1448421052631579,
483
+ "grad_norm": 0.14119838178157806,
484
+ "learning_rate": 7.785484900495065e-07,
485
+ "loss": 0.4151731491088867,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 1.1616842105263159,
490
+ "grad_norm": 0.1296525001525879,
491
+ "learning_rate": 7.703578782489058e-07,
492
+ "loss": 0.38312902450561526,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 1.1785263157894736,
497
+ "grad_norm": 0.13671696186065674,
498
+ "learning_rate": 7.620634280861351e-07,
499
+ "loss": 0.42612557411193847,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 1.1953684210526316,
504
+ "grad_norm": 0.15196114778518677,
505
+ "learning_rate": 7.536683252723923e-07,
506
+ "loss": 0.4306772708892822,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 1.2122105263157894,
511
+ "grad_norm": 0.1136903315782547,
512
+ "learning_rate": 7.451757941772868e-07,
513
+ "loss": 0.38483757972717286,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 1.2290526315789474,
518
+ "grad_norm": 0.12378744781017303,
519
+ "learning_rate": 7.365890965904337e-07,
520
+ "loss": 0.4030342102050781,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 1.2458947368421052,
525
+ "grad_norm": 0.1265542209148407,
526
+ "learning_rate": 7.279115304686733e-07,
527
+ "loss": 0.4091166973114014,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 1.2627368421052632,
532
+ "grad_norm": 0.11647409200668335,
533
+ "learning_rate": 7.191464286694e-07,
534
+ "loss": 0.41426806449890136,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 1.279578947368421,
539
+ "grad_norm": 0.11192695051431656,
540
+ "learning_rate": 7.102971576704875e-07,
541
+ "loss": 0.38181486129760744,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 1.296421052631579,
546
+ "grad_norm": 0.14947861433029175,
547
+ "learning_rate": 7.013671162773003e-07,
548
+ "loss": 0.39824953079223635,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 1.313263157894737,
553
+ "grad_norm": 0.11269424855709076,
554
+ "learning_rate": 6.923597343172891e-07,
555
+ "loss": 0.40348024368286134,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 1.3301052631578947,
560
+ "grad_norm": 0.3742346167564392,
561
+ "learning_rate": 6.83278471322672e-07,
562
+ "loss": 0.38022048473358155,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 1.3469473684210527,
567
+ "grad_norm": 0.1310902237892151,
568
+ "learning_rate": 6.741268152017057e-07,
569
+ "loss": 0.42791285514831545,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 1.3637894736842107,
574
+ "grad_norm": 0.1692703813314438,
575
+ "learning_rate": 6.649082808990585e-07,
576
+ "loss": 0.4263493061065674,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 1.3806315789473684,
581
+ "grad_norm": 0.1279117316007614,
582
+ "learning_rate": 6.556264090457998e-07,
583
+ "loss": 0.37379777431488037,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 1.3974736842105262,
588
+ "grad_norm": 0.12949039041996002,
589
+ "learning_rate": 6.462847645995237e-07,
590
+ "loss": 0.38636391162872313,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 1.4143157894736842,
595
+ "grad_norm": 0.10221126675605774,
596
+ "learning_rate": 6.368869354751284e-07,
597
+ "loss": 0.408221435546875,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 1.4311578947368422,
602
+ "grad_norm": 0.11505889147520065,
603
+ "learning_rate": 6.274365311667797e-07,
604
+ "loss": 0.3951406717300415,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 1.448,
609
+ "grad_norm": 0.11054962873458862,
610
+ "learning_rate": 6.179371813615859e-07,
611
+ "loss": 0.3732129096984863,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 1.464842105263158,
616
+ "grad_norm": 0.10150120407342911,
617
+ "learning_rate": 6.083925345455158e-07,
618
+ "loss": 0.38601529598236084,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 1.4816842105263157,
623
+ "grad_norm": 0.12239400297403336,
624
+ "learning_rate": 5.988062566020986e-07,
625
+ "loss": 0.3859985828399658,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 1.4985263157894737,
630
+ "grad_norm": 0.15801067650318146,
631
+ "learning_rate": 5.891820294044408e-07,
632
+ "loss": 0.3983951807022095,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 1.5153684210526315,
637
+ "grad_norm": 0.10104545950889587,
638
+ "learning_rate": 5.795235494011007e-07,
639
+ "loss": 0.41107850074768065,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 1.5322105263157895,
644
+ "grad_norm": 0.1378099024295807,
645
+ "learning_rate": 5.698345261963668e-07,
646
+ "loss": 0.3708331823348999,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 1.5490526315789475,
651
+ "grad_norm": 0.12936057150363922,
652
+ "learning_rate": 5.601186811254825e-07,
653
+ "loss": 0.387884521484375,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 1.5658947368421052,
658
+ "grad_norm": 0.12379129230976105,
659
+ "learning_rate": 5.503797458253646e-07,
660
+ "loss": 0.43808717727661134,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 1.582736842105263,
665
+ "grad_norm": 0.12017743289470673,
666
+ "learning_rate": 5.406214608013662e-07,
667
+ "loss": 0.41345391273498533,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 1.5995789473684212,
672
+ "grad_norm": 0.1095535159111023,
673
+ "learning_rate": 5.308475739906328e-07,
674
+ "loss": 0.40022664070129393,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 1.616421052631579,
679
+ "grad_norm": 0.13831396400928497,
680
+ "learning_rate": 5.210618393226045e-07,
681
+ "loss": 0.3909924983978271,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 1.6332631578947368,
686
+ "grad_norm": 0.10449163615703583,
687
+ "learning_rate": 5.112680152772156e-07,
688
+ "loss": 0.37143146991729736,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 1.6501052631578947,
693
+ "grad_norm": 0.11249610036611557,
694
+ "learning_rate": 5.01469863441348e-07,
695
+ "loss": 0.38103113174438474,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 1.6669473684210527,
700
+ "grad_norm": 0.13718819618225098,
701
+ "learning_rate": 4.916711470640907e-07,
702
+ "loss": 0.4071629524230957,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 1.6837894736842105,
707
+ "grad_norm": 0.10473571717739105,
708
+ "learning_rate": 4.818756296113595e-07,
709
+ "loss": 0.417419958114624,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 1.7006315789473683,
714
+ "grad_norm": 0.10846224427223206,
715
+ "learning_rate": 4.7208707332043623e-07,
716
+ "loss": 0.3998772859573364,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 1.7174736842105263,
721
+ "grad_norm": 0.10248563438653946,
722
+ "learning_rate": 4.6230923775497714e-07,
723
+ "loss": 0.38056583404541017,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 1.7343157894736843,
728
+ "grad_norm": 0.12221980094909668,
729
+ "learning_rate": 4.5254587836104964e-07,
730
+ "loss": 0.39371190071105955,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 1.751157894736842,
735
+ "grad_norm": 0.10641586035490036,
736
+ "learning_rate": 4.4280074502475017e-07,
737
+ "loss": 0.4280440330505371,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 1.768,
742
+ "grad_norm": 0.12907131016254425,
743
+ "learning_rate": 4.3307758063195796e-07,
744
+ "loss": 0.3791615962982178,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 1.784842105263158,
749
+ "grad_norm": 0.12383506447076797,
750
+ "learning_rate": 4.233801196307762e-07,
751
+ "loss": 0.347782301902771,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 1.8016842105263158,
756
+ "grad_norm": 0.12547679245471954,
757
+ "learning_rate": 4.1371208659721536e-07,
758
+ "loss": 0.38370628356933595,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 1.8185263157894735,
763
+ "grad_norm": 0.10580642521381378,
764
+ "learning_rate": 4.0407719480466736e-07,
765
+ "loss": 0.40404376983642576,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 1.8353684210526315,
770
+ "grad_norm": 0.1055402085185051,
771
+ "learning_rate": 3.944791447977213e-07,
772
+ "loss": 0.4167450428009033,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 1.8522105263157895,
777
+ "grad_norm": 0.11053823679685593,
778
+ "learning_rate": 3.849216229708671e-07,
779
+ "loss": 0.4046513080596924,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 1.8690526315789473,
784
+ "grad_norm": 0.10185246914625168,
785
+ "learning_rate": 3.7540830015263526e-07,
786
+ "loss": 0.39672977924346925,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 1.8858947368421053,
791
+ "grad_norm": 0.08342823386192322,
792
+ "learning_rate": 3.6594283019571416e-07,
793
+ "loss": 0.39356396198272703,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 1.9027368421052633,
798
+ "grad_norm": 0.11821646988391876,
799
+ "learning_rate": 3.565288485735874e-07,
800
+ "loss": 0.42082643508911133,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 1.919578947368421,
805
+ "grad_norm": 0.1106327474117279,
806
+ "learning_rate": 3.4716997098423085e-07,
807
+ "loss": 0.34105117321014405,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 1.9364210526315788,
812
+ "grad_norm": 0.11533800512552261,
813
+ "learning_rate": 3.378697919614045e-07,
814
+ "loss": 0.3924069404602051,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 1.9532631578947368,
819
+ "grad_norm": 0.1431114822626114,
820
+ "learning_rate": 3.286318834940729e-07,
821
+ "loss": 0.3922377586364746,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 1.9701052631578948,
826
+ "grad_norm": 0.16050194203853607,
827
+ "learning_rate": 3.1945979365448517e-07,
828
+ "loss": 0.3745201587677002,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 1.9869473684210526,
833
+ "grad_norm": 0.11921833455562592,
834
+ "learning_rate": 3.103570452354402e-07,
835
+ "loss": 0.40110602378845217,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 2.0033684210526315,
840
+ "grad_norm": 0.0832003727555275,
841
+ "learning_rate": 3.013271343972613e-07,
842
+ "loss": 0.3981154918670654,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 2.0202105263157897,
847
+ "grad_norm": 0.09975888580083847,
848
+ "learning_rate": 2.9237352932500046e-07,
849
+ "loss": 0.3726134061813354,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 2.0370526315789474,
854
+ "grad_norm": 0.14600081741809845,
855
+ "learning_rate": 2.8349966889638615e-07,
856
+ "loss": 0.42558698654174804,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 2.053894736842105,
861
+ "grad_norm": 0.10875770449638367,
862
+ "learning_rate": 2.747089613610278e-07,
863
+ "loss": 0.3682931184768677,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 2.070736842105263,
868
+ "grad_norm": 0.10050549358129501,
869
+ "learning_rate": 2.66004783031385e-07,
870
+ "loss": 0.3756644487380981,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 2.087578947368421,
875
+ "grad_norm": 0.08914914727210999,
876
+ "learning_rate": 2.573904769860009e-07,
877
+ "loss": 0.3804330825805664,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 2.104421052631579,
882
+ "grad_norm": 0.08296852558851242,
883
+ "learning_rate": 2.488693517855016e-07,
884
+ "loss": 0.3978404521942139,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 2.1212631578947367,
889
+ "grad_norm": 0.13885149359703064,
890
+ "learning_rate": 2.404446802018533e-07,
891
+ "loss": 0.3935218334197998,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 2.138105263157895,
896
+ "grad_norm": 0.13195137679576874,
897
+ "learning_rate": 2.3211969796136305e-07,
898
+ "loss": 0.42966952323913576,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 2.1549473684210527,
903
+ "grad_norm": 0.13367892801761627,
904
+ "learning_rate": 2.2389760250191038e-07,
905
+ "loss": 0.3679579019546509,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 2.1717894736842105,
910
+ "grad_norm": 0.1288345605134964,
911
+ "learning_rate": 2.1578155174488343e-07,
912
+ "loss": 0.41324810981750487,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 2.1886315789473683,
917
+ "grad_norm": 0.09626021236181259,
918
+ "learning_rate": 2.0777466288229205e-07,
919
+ "loss": 0.40120248794555663,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 2.2054736842105265,
924
+ "grad_norm": 0.10264381766319275,
925
+ "learning_rate": 1.9988001117952485e-07,
926
+ "loss": 0.3501007080078125,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 2.2223157894736842,
931
+ "grad_norm": 0.09031466394662857,
932
+ "learning_rate": 1.9210062879420973e-07,
933
+ "loss": 0.3839429378509521,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 2.239157894736842,
938
+ "grad_norm": 0.12686079740524292,
939
+ "learning_rate": 1.8443950361162957e-07,
940
+ "loss": 0.4338528156280518,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 2.2560000000000002,
945
+ "grad_norm": 0.12199016660451889,
946
+ "learning_rate": 1.7689957809714346e-07,
947
+ "loss": 0.39229888916015626,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 2.272842105263158,
952
+ "grad_norm": 0.12029567360877991,
953
+ "learning_rate": 1.694837481660525e-07,
954
+ "loss": 0.38006880283355715,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 2.2896842105263158,
959
+ "grad_norm": 0.08686309307813644,
960
+ "learning_rate": 1.6219486207134313e-07,
961
+ "loss": 0.3808159589767456,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 2.3065263157894735,
966
+ "grad_norm": 0.10810462385416031,
967
+ "learning_rate": 1.5503571930973785e-07,
968
+ "loss": 0.401824426651001,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 2.3233684210526317,
973
+ "grad_norm": 0.10281873494386673,
974
+ "learning_rate": 1.480090695464723e-07,
975
+ "loss": 0.40149493217468263,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 2.3402105263157895,
980
+ "grad_norm": 0.09503985196352005,
981
+ "learning_rate": 1.4111761155920975e-07,
982
+ "loss": 0.38567726612091063,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 2.3570526315789473,
987
+ "grad_norm": 0.10420782119035721,
988
+ "learning_rate": 1.3436399220150212e-07,
989
+ "loss": 0.3759742736816406,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 2.3738947368421055,
994
+ "grad_norm": 0.10681115835905075,
995
+ "learning_rate": 1.2775080538619347e-07,
996
+ "loss": 0.3913698196411133,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 2.3907368421052633,
1001
+ "grad_norm": 0.10323983430862427,
1002
+ "learning_rate": 1.2128059108915595e-07,
1003
+ "loss": 0.39077584743499755,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 2.407578947368421,
1008
+ "grad_norm": 0.09566064178943634,
1009
+ "learning_rate": 1.1495583437374263e-07,
1010
+ "loss": 0.39895172119140626,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 2.424421052631579,
1015
+ "grad_norm": 0.13018426299095154,
1016
+ "learning_rate": 1.0877896443633117e-07,
1017
+ "loss": 0.38982129096984863,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 2.441263157894737,
1022
+ "grad_norm": 0.10760781168937683,
1023
+ "learning_rate": 1.0275235367332347e-07,
1024
+ "loss": 0.3756714344024658,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 2.458105263157895,
1029
+ "grad_norm": 0.11606904864311218,
1030
+ "learning_rate": 9.687831676996238e-08,
1031
+ "loss": 0.37858171463012696,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 2.4749473684210526,
1036
+ "grad_norm": 0.12957172095775604,
1037
+ "learning_rate": 9.115910981131336e-08,
1038
+ "loss": 0.40050196647644043,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 2.4917894736842103,
1043
+ "grad_norm": 0.11186131089925766,
1044
+ "learning_rate": 8.559692941575231e-08,
1045
+ "loss": 0.3684133291244507,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 2.5086315789473685,
1050
+ "grad_norm": 0.13279542326927185,
1051
+ "learning_rate": 8.019391189129466e-08,
1052
+ "loss": 0.3452518224716187,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 2.5254736842105263,
1057
+ "grad_norm": 0.09041756391525269,
1058
+ "learning_rate": 7.495213241508786e-08,
1059
+ "loss": 0.36301617622375487,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 2.542315789473684,
1064
+ "grad_norm": 0.10033190995454788,
1065
+ "learning_rate": 6.987360423638205e-08,
1066
+ "loss": 0.3706004858016968,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 2.559157894736842,
1071
+ "grad_norm": 0.10681814700365067,
1072
+ "learning_rate": 6.49602779032865e-08,
1073
+ "loss": 0.36011199951171874,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 2.576,
1078
+ "grad_norm": 0.1008416935801506,
1079
+ "learning_rate": 6.02140405136089e-08,
1080
+ "loss": 0.37473766803741454,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 2.592842105263158,
1085
+ "grad_norm": 0.11559010297060013,
1086
+ "learning_rate": 5.5636714990062393e-08,
1087
+ "loss": 0.39232525825500486,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 2.609684210526316,
1092
+ "grad_norm": 0.10601615905761719,
1093
+ "learning_rate": 5.1230059380123034e-08,
1094
+ "loss": 0.34370343685150145,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 2.626526315789474,
1099
+ "grad_norm": 0.11516924202442169,
1100
+ "learning_rate": 4.699576618080331e-08,
1101
+ "loss": 0.39509878158569334,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 2.6433684210526316,
1106
+ "grad_norm": 0.11444627493619919,
1107
+ "learning_rate": 4.293546168860163e-08,
1108
+ "loss": 0.3881126165390015,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 2.6602105263157894,
1113
+ "grad_norm": 0.09985339641571045,
1114
+ "learning_rate": 3.9050705374879086e-08,
1115
+ "loss": 0.34624040126800537,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 2.677052631578947,
1120
+ "grad_norm": 0.10439962148666382,
1121
+ "learning_rate": 3.534298928690166e-08,
1122
+ "loss": 0.35141232013702395,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 2.6938947368421053,
1127
+ "grad_norm": 0.12180087715387344,
1128
+ "learning_rate": 3.181373747477822e-08,
1129
+ "loss": 0.39980330467224123,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 2.710736842105263,
1134
+ "grad_norm": 0.12150874733924866,
1135
+ "learning_rate": 2.8464305444515112e-08,
1136
+ "loss": 0.3560852766036987,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 2.7275789473684213,
1141
+ "grad_norm": 0.11374734342098236,
1142
+ "learning_rate": 2.5295979637397213e-08,
1143
+ "loss": 0.39339067935943606,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 2.744421052631579,
1148
+ "grad_norm": 0.11705286055803299,
1149
+ "learning_rate": 2.2309976935894203e-08,
1150
+ "loss": 0.38021705150604246,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 2.761263157894737,
1155
+ "grad_norm": 0.09300459921360016,
1156
+ "learning_rate": 1.9507444196284195e-08,
1157
+ "loss": 0.3467890739440918,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 2.7781052631578946,
1162
+ "grad_norm": 0.08709974586963654,
1163
+ "learning_rate": 1.688945780817147e-08,
1164
+ "loss": 0.38266596794128416,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 2.7949473684210524,
1169
+ "grad_norm": 0.12327069044113159,
1170
+ "learning_rate": 1.445702328106979e-08,
1171
+ "loss": 0.34164865016937257,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 2.8117894736842106,
1176
+ "grad_norm": 0.12361253052949905,
1177
+ "learning_rate": 1.2211074858209103e-08,
1178
+ "loss": 0.3876492977142334,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 2.8286315789473684,
1183
+ "grad_norm": 0.120187908411026,
1184
+ "learning_rate": 1.0152475157713392e-08,
1185
+ "loss": 0.3703944206237793,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 2.845473684210526,
1190
+ "grad_norm": 0.10741006582975388,
1191
+ "learning_rate": 8.282014841288653e-09,
1192
+ "loss": 0.3735771656036377,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 2.8623157894736844,
1197
+ "grad_norm": 0.10451704263687134,
1198
+ "learning_rate": 6.600412310547754e-09,
1199
+ "loss": 0.3807518005371094,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 2.879157894736842,
1204
+ "grad_norm": 0.10978730767965317,
1205
+ "learning_rate": 5.1083134310882515e-09,
1206
+ "loss": 0.3799649000167847,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 2.896,
1211
+ "grad_norm": 0.09521365165710449,
1212
+ "learning_rate": 3.806291284430274e-09,
1213
+ "loss": 0.3580306053161621,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 2.9128421052631577,
1218
+ "grad_norm": 0.11443010717630386,
1219
+ "learning_rate": 2.6948459479087526e-09,
1220
+ "loss": 0.4088387966156006,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 2.929684210526316,
1225
+ "grad_norm": 0.10477516055107117,
1226
+ "learning_rate": 1.7744043026048372e-09,
1227
+ "loss": 0.3971900463104248,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 2.9465263157894737,
1232
+ "grad_norm": 0.11358557641506195,
1233
+ "learning_rate": 1.0453198693907706e-09,
1234
+ "loss": 0.35434761047363283,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 2.9633684210526314,
1239
+ "grad_norm": 0.10436985641717911,
1240
+ "learning_rate": 5.07872673150278e-10,
1241
+ "loss": 0.40241618156433107,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 2.9802105263157896,
1246
+ "grad_norm": 0.10179495811462402,
1247
+ "learning_rate": 1.6226913522743302e-10,
1248
+ "loss": 0.34090123176574705,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 2.9970526315789474,
1253
+ "grad_norm": 0.1047094464302063,
1254
+ "learning_rate": 8.641994144853448e-12,
1255
+ "loss": 0.33390347957611083,
1256
+ "step": 1780
1257
+ }
1258
+ ],
1259
+ "logging_steps": 10,
1260
+ "max_steps": 1782,
1261
+ "num_input_tokens_seen": 0,
1262
+ "num_train_epochs": 3,
1263
+ "save_steps": 100,
1264
+ "stateful_callbacks": {
1265
+ "TrainerControl": {
1266
+ "args": {
1267
+ "should_epoch_stop": false,
1268
+ "should_evaluate": false,
1269
+ "should_log": false,
1270
+ "should_save": true,
1271
+ "should_training_stop": true
1272
+ },
1273
+ "attributes": {}
1274
+ }
1275
+ },
1276
+ "total_flos": 3.258875338498253e+16,
1277
+ "train_batch_size": 1,
1278
+ "trial_name": null,
1279
+ "trial_params": null
1280
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74aefb1dc1340a25f29ab8370384b9ed24b2d921d7749ece7bbcfcfdf00d497
3
+ size 33384443
tokenizer_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<end_of_turn>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": false,
10
+ "mask_token": "<mask>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "model_specific_special_tokens": {
13
+ "boi_token": "<start_of_image>",
14
+ "eoi_token": "<end_of_image>",
15
+ "image_token": "<image_soft_token>"
16
+ },
17
+ "pad_token": "<pad>",
18
+ "padding_side": "right",
19
+ "processor_class": "Gemma3Processor",
20
+ "sp_model_kwargs": null,
21
+ "spaces_between_special_tokens": false,
22
+ "split_special_tokens": false,
23
+ "tokenizer_class": "GemmaTokenizer",
24
+ "unk_token": "<unk>",
25
+ "use_default_system_prompt": false
26
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 3.258875338498253e+16,
4
+ "train_loss": 0.45725877370630985,
5
+ "train_runtime": 3399.7742,
6
+ "train_samples_per_second": 4.191,
7
+ "train_steps_per_second": 0.524
8
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 10, "total_steps": 1782, "loss": 0.7252199172973632, "lr": 5.027932960893855e-08, "epoch": 0.016842105263157894, "percentage": 0.56, "elapsed_time": "0:00:25", "remaining_time": "1:14:15"}
2
+ {"current_steps": 20, "total_steps": 1782, "loss": 0.6507451057434082, "lr": 1.0614525139664805e-07, "epoch": 0.03368421052631579, "percentage": 1.12, "elapsed_time": "0:00:46", "remaining_time": "1:08:40"}
3
+ {"current_steps": 30, "total_steps": 1782, "loss": 0.7381344795227051, "lr": 1.6201117318435754e-07, "epoch": 0.05052631578947368, "percentage": 1.68, "elapsed_time": "0:01:07", "remaining_time": "1:05:28"}
4
+ {"current_steps": 40, "total_steps": 1782, "loss": 0.7012194156646728, "lr": 2.17877094972067e-07, "epoch": 0.06736842105263158, "percentage": 2.24, "elapsed_time": "0:01:26", "remaining_time": "1:03:02"}
5
+ {"current_steps": 50, "total_steps": 1782, "loss": 0.6083873748779297, "lr": 2.7374301675977653e-07, "epoch": 0.08421052631578947, "percentage": 2.81, "elapsed_time": "0:01:45", "remaining_time": "1:01:10"}
6
+ {"current_steps": 60, "total_steps": 1782, "loss": 0.6980491638183594, "lr": 3.29608938547486e-07, "epoch": 0.10105263157894737, "percentage": 3.37, "elapsed_time": "0:02:04", "remaining_time": "0:59:38"}
7
+ {"current_steps": 70, "total_steps": 1782, "loss": 0.708641767501831, "lr": 3.8547486033519547e-07, "epoch": 0.11789473684210526, "percentage": 3.93, "elapsed_time": "0:02:24", "remaining_time": "0:58:54"}
8
+ {"current_steps": 80, "total_steps": 1782, "loss": 0.6742453098297119, "lr": 4.41340782122905e-07, "epoch": 0.13473684210526315, "percentage": 4.49, "elapsed_time": "0:02:43", "remaining_time": "0:58:03"}
9
+ {"current_steps": 90, "total_steps": 1782, "loss": 0.6590609550476074, "lr": 4.972067039106145e-07, "epoch": 0.15157894736842106, "percentage": 5.05, "elapsed_time": "0:03:02", "remaining_time": "0:57:10"}
10
+ {"current_steps": 100, "total_steps": 1782, "loss": 0.704926872253418, "lr": 5.53072625698324e-07, "epoch": 0.16842105263157894, "percentage": 5.61, "elapsed_time": "0:03:20", "remaining_time": "0:56:16"}
11
+ {"current_steps": 110, "total_steps": 1782, "loss": 0.7445036888122558, "lr": 6.089385474860335e-07, "epoch": 0.18526315789473685, "percentage": 6.17, "elapsed_time": "0:03:40", "remaining_time": "0:55:52"}
12
+ {"current_steps": 120, "total_steps": 1782, "loss": 0.7476531028747558, "lr": 6.64804469273743e-07, "epoch": 0.20210526315789473, "percentage": 6.73, "elapsed_time": "0:03:59", "remaining_time": "0:55:13"}
13
+ {"current_steps": 130, "total_steps": 1782, "loss": 0.7291872501373291, "lr": 7.206703910614524e-07, "epoch": 0.21894736842105264, "percentage": 7.3, "elapsed_time": "0:04:17", "remaining_time": "0:54:37"}
14
+ {"current_steps": 140, "total_steps": 1782, "loss": 0.721175241470337, "lr": 7.76536312849162e-07, "epoch": 0.23578947368421052, "percentage": 7.86, "elapsed_time": "0:04:37", "remaining_time": "0:54:09"}
15
+ {"current_steps": 150, "total_steps": 1782, "loss": 0.7556095600128174, "lr": 8.324022346368714e-07, "epoch": 0.25263157894736843, "percentage": 8.42, "elapsed_time": "0:04:55", "remaining_time": "0:53:35"}
16
+ {"current_steps": 160, "total_steps": 1782, "loss": 0.7328392505645752, "lr": 8.88268156424581e-07, "epoch": 0.2694736842105263, "percentage": 8.98, "elapsed_time": "0:05:14", "remaining_time": "0:53:05"}
17
+ {"current_steps": 170, "total_steps": 1782, "loss": 0.6990129470825195, "lr": 9.441340782122904e-07, "epoch": 0.2863157894736842, "percentage": 9.54, "elapsed_time": "0:05:32", "remaining_time": "0:52:35"}
18
+ {"current_steps": 180, "total_steps": 1782, "loss": 0.6694639205932618, "lr": 1e-06, "epoch": 0.3031578947368421, "percentage": 10.1, "elapsed_time": "0:05:51", "remaining_time": "0:52:06"}
19
+ {"current_steps": 190, "total_steps": 1782, "loss": 0.7123252868652343, "lr": 9.999039806396227e-07, "epoch": 0.32, "percentage": 10.66, "elapsed_time": "0:06:09", "remaining_time": "0:51:37"}
20
+ {"current_steps": 200, "total_steps": 1782, "loss": 0.6858412742614746, "lr": 9.996159594373611e-07, "epoch": 0.3368421052631579, "percentage": 11.22, "elapsed_time": "0:06:28", "remaining_time": "0:51:10"}
21
+ {"current_steps": 210, "total_steps": 1782, "loss": 0.6541069507598877, "lr": 9.991360470156615e-07, "epoch": 0.35368421052631577, "percentage": 11.78, "elapsed_time": "0:06:47", "remaining_time": "0:50:48"}
22
+ {"current_steps": 220, "total_steps": 1782, "loss": 0.6506116390228271, "lr": 9.984644276980594e-07, "epoch": 0.3705263157894737, "percentage": 12.35, "elapsed_time": "0:07:06", "remaining_time": "0:50:25"}
23
+ {"current_steps": 230, "total_steps": 1782, "loss": 0.6540626049041748, "lr": 9.976013594383835e-07, "epoch": 0.3873684210526316, "percentage": 12.91, "elapsed_time": "0:07:24", "remaining_time": "0:50:02"}
24
+ {"current_steps": 240, "total_steps": 1782, "loss": 0.6737770557403564, "lr": 9.965471737216833e-07, "epoch": 0.40421052631578946, "percentage": 13.47, "elapsed_time": "0:07:43", "remaining_time": "0:49:36"}
25
+ {"current_steps": 250, "total_steps": 1782, "loss": 0.6755708217620849, "lr": 9.953022754369114e-07, "epoch": 0.42105263157894735, "percentage": 14.03, "elapsed_time": "0:08:01", "remaining_time": "0:49:12"}
26
+ {"current_steps": 260, "total_steps": 1782, "loss": 0.6578442573547363, "lr": 9.938671427214158e-07, "epoch": 0.4378947368421053, "percentage": 14.59, "elapsed_time": "0:08:20", "remaining_time": "0:48:48"}
27
+ {"current_steps": 270, "total_steps": 1782, "loss": 0.639409875869751, "lr": 9.922423267772986e-07, "epoch": 0.45473684210526316, "percentage": 15.15, "elapsed_time": "0:08:38", "remaining_time": "0:48:24"}
28
+ {"current_steps": 280, "total_steps": 1782, "loss": 0.5995691776275635, "lr": 9.904284516597102e-07, "epoch": 0.47157894736842104, "percentage": 15.71, "elapsed_time": "0:08:56", "remaining_time": "0:47:59"}
29
+ {"current_steps": 290, "total_steps": 1782, "loss": 0.5898309707641601, "lr": 9.884262140371648e-07, "epoch": 0.4884210526315789, "percentage": 16.27, "elapsed_time": "0:09:15", "remaining_time": "0:47:36"}
30
+ {"current_steps": 300, "total_steps": 1782, "loss": 0.6371779441833496, "lr": 9.862363829239662e-07, "epoch": 0.5052631578947369, "percentage": 16.84, "elapsed_time": "0:09:33", "remaining_time": "0:47:14"}
31
+ {"current_steps": 310, "total_steps": 1782, "loss": 0.5795581817626954, "lr": 9.838597993848456e-07, "epoch": 0.5221052631578947, "percentage": 17.4, "elapsed_time": "0:09:53", "remaining_time": "0:46:55"}
32
+ {"current_steps": 320, "total_steps": 1782, "loss": 0.5668415546417236, "lr": 9.81297376211928e-07, "epoch": 0.5389473684210526, "percentage": 17.96, "elapsed_time": "0:10:11", "remaining_time": "0:46:33"}
33
+ {"current_steps": 330, "total_steps": 1782, "loss": 0.5933257102966308, "lr": 9.785500975741498e-07, "epoch": 0.5557894736842105, "percentage": 18.52, "elapsed_time": "0:10:34", "remaining_time": "0:46:30"}
34
+ {"current_steps": 340, "total_steps": 1782, "loss": 0.5574678897857666, "lr": 9.756190186392615e-07, "epoch": 0.5726315789473684, "percentage": 19.08, "elapsed_time": "0:10:57", "remaining_time": "0:46:29"}
35
+ {"current_steps": 350, "total_steps": 1782, "loss": 0.5532425880432129, "lr": 9.725052651685612e-07, "epoch": 0.5894736842105263, "percentage": 19.64, "elapsed_time": "0:11:20", "remaining_time": "0:46:23"}
36
+ {"current_steps": 360, "total_steps": 1782, "loss": 0.5613389492034913, "lr": 9.692100330845153e-07, "epoch": 0.6063157894736843, "percentage": 20.2, "elapsed_time": "0:11:40", "remaining_time": "0:46:08"}
37
+ {"current_steps": 370, "total_steps": 1782, "loss": 0.5131485939025879, "lr": 9.657345880114318e-07, "epoch": 0.6231578947368421, "percentage": 20.76, "elapsed_time": "0:12:00", "remaining_time": "0:45:47"}
38
+ {"current_steps": 380, "total_steps": 1782, "loss": 0.6279027462005615, "lr": 9.620802647893623e-07, "epoch": 0.64, "percentage": 21.32, "elapsed_time": "0:12:20", "remaining_time": "0:45:31"}
39
+ {"current_steps": 390, "total_steps": 1782, "loss": 0.5403085231781006, "lr": 9.58248466961421e-07, "epoch": 0.6568421052631579, "percentage": 21.89, "elapsed_time": "0:12:41", "remaining_time": "0:45:18"}
40
+ {"current_steps": 400, "total_steps": 1782, "loss": 0.5678809642791748, "lr": 9.542406662347137e-07, "epoch": 0.6736842105263158, "percentage": 22.45, "elapsed_time": "0:13:03", "remaining_time": "0:45:05"}
41
+ {"current_steps": 410, "total_steps": 1782, "loss": 0.5479135036468505, "lr": 9.500584019150895e-07, "epoch": 0.6905263157894737, "percentage": 23.01, "elapsed_time": "0:13:24", "remaining_time": "0:44:53"}
42
+ {"current_steps": 420, "total_steps": 1782, "loss": 0.5604462623596191, "lr": 9.45703280315928e-07, "epoch": 0.7073684210526315, "percentage": 23.57, "elapsed_time": "0:13:45", "remaining_time": "0:44:38"}
43
+ {"current_steps": 430, "total_steps": 1782, "loss": 0.4704423427581787, "lr": 9.411769741411903e-07, "epoch": 0.7242105263157895, "percentage": 24.13, "elapsed_time": "0:14:06", "remaining_time": "0:44:23"}
44
+ {"current_steps": 440, "total_steps": 1782, "loss": 0.560968017578125, "lr": 9.364812218429721e-07, "epoch": 0.7410526315789474, "percentage": 24.69, "elapsed_time": "0:14:27", "remaining_time": "0:44:04"}
45
+ {"current_steps": 450, "total_steps": 1782, "loss": 0.5088452816009521, "lr": 9.316178269538014e-07, "epoch": 0.7578947368421053, "percentage": 25.25, "elapsed_time": "0:14:48", "remaining_time": "0:43:49"}
46
+ {"current_steps": 460, "total_steps": 1782, "loss": 0.5030550956726074, "lr": 9.265886573939446e-07, "epoch": 0.7747368421052632, "percentage": 25.81, "elapsed_time": "0:15:10", "remaining_time": "0:43:35"}
47
+ {"current_steps": 470, "total_steps": 1782, "loss": 0.46353440284729003, "lr": 9.213956447539792e-07, "epoch": 0.791578947368421, "percentage": 26.37, "elapsed_time": "0:15:31", "remaining_time": "0:43:20"}
48
+ {"current_steps": 480, "total_steps": 1782, "loss": 0.49871411323547366, "lr": 9.160407835529136e-07, "epoch": 0.8084210526315789, "percentage": 26.94, "elapsed_time": "0:15:52", "remaining_time": "0:43:03"}
49
+ {"current_steps": 490, "total_steps": 1782, "loss": 0.4416178226470947, "lr": 9.105261304721375e-07, "epoch": 0.8252631578947368, "percentage": 27.5, "elapsed_time": "0:16:13", "remaining_time": "0:42:47"}
50
+ {"current_steps": 500, "total_steps": 1782, "loss": 0.39783194065093996, "lr": 9.048538035654969e-07, "epoch": 0.8421052631578947, "percentage": 28.06, "elapsed_time": "0:16:34", "remaining_time": "0:42:30"}
51
+ {"current_steps": 510, "total_steps": 1782, "loss": 0.4318229198455811, "lr": 8.990259814457977e-07, "epoch": 0.8589473684210527, "percentage": 28.62, "elapsed_time": "0:16:58", "remaining_time": "0:42:19"}
52
+ {"current_steps": 520, "total_steps": 1782, "loss": 0.42445807456970214, "lr": 8.930449024480491e-07, "epoch": 0.8757894736842106, "percentage": 29.18, "elapsed_time": "0:17:21", "remaining_time": "0:42:07"}
53
+ {"current_steps": 530, "total_steps": 1782, "loss": 0.46429901123046874, "lr": 8.8691286376977e-07, "epoch": 0.8926315789473684, "percentage": 29.74, "elapsed_time": "0:17:43", "remaining_time": "0:41:51"}
54
+ {"current_steps": 540, "total_steps": 1782, "loss": 0.3975703239440918, "lr": 8.806322205886873e-07, "epoch": 0.9094736842105263, "percentage": 30.3, "elapsed_time": "0:18:04", "remaining_time": "0:41:34"}
55
+ {"current_steps": 550, "total_steps": 1782, "loss": 0.4458911418914795, "lr": 8.74205385158165e-07, "epoch": 0.9263157894736842, "percentage": 30.86, "elapsed_time": "0:18:25", "remaining_time": "0:41:16"}
56
+ {"current_steps": 560, "total_steps": 1782, "loss": 0.45571184158325195, "lr": 8.676348258807121e-07, "epoch": 0.9431578947368421, "percentage": 31.43, "elapsed_time": "0:18:46", "remaining_time": "0:40:58"}
57
+ {"current_steps": 570, "total_steps": 1782, "loss": 0.4039600372314453, "lr": 8.609230663599254e-07, "epoch": 0.96, "percentage": 31.99, "elapsed_time": "0:19:07", "remaining_time": "0:40:38"}
58
+ {"current_steps": 580, "total_steps": 1782, "loss": 0.4382494926452637, "lr": 8.540726844312294e-07, "epoch": 0.9768421052631578, "percentage": 32.55, "elapsed_time": "0:19:26", "remaining_time": "0:40:16"}
59
+ {"current_steps": 590, "total_steps": 1782, "loss": 0.4306180477142334, "lr": 8.470863111717889e-07, "epoch": 0.9936842105263158, "percentage": 33.11, "elapsed_time": "0:19:45", "remaining_time": "0:39:54"}
60
+ {"current_steps": 600, "total_steps": 1782, "loss": 0.39806089401245115, "lr": 8.399666298899706e-07, "epoch": 1.0101052631578948, "percentage": 33.67, "elapsed_time": "0:20:04", "remaining_time": "0:39:32"}
61
+ {"current_steps": 610, "total_steps": 1782, "loss": 0.4271697044372559, "lr": 8.327163750947457e-07, "epoch": 1.0269473684210526, "percentage": 34.23, "elapsed_time": "0:20:23", "remaining_time": "0:39:11"}
62
+ {"current_steps": 620, "total_steps": 1782, "loss": 0.3939049243927002, "lr": 8.253383314454263e-07, "epoch": 1.0437894736842106, "percentage": 34.79, "elapsed_time": "0:20:42", "remaining_time": "0:38:48"}
63
+ {"current_steps": 630, "total_steps": 1782, "loss": 0.43197131156921387, "lr": 8.178353326821404e-07, "epoch": 1.0606315789473684, "percentage": 35.35, "elapsed_time": "0:21:01", "remaining_time": "0:38:25"}
64
+ {"current_steps": 640, "total_steps": 1782, "loss": 0.437807559967041, "lr": 8.102102605374566e-07, "epoch": 1.0774736842105264, "percentage": 35.91, "elapsed_time": "0:21:19", "remaining_time": "0:38:03"}
65
+ {"current_steps": 650, "total_steps": 1782, "loss": 0.38409013748168946, "lr": 8.024660436295759e-07, "epoch": 1.0943157894736841, "percentage": 36.48, "elapsed_time": "0:21:37", "remaining_time": "0:37:40"}
66
+ {"current_steps": 660, "total_steps": 1782, "loss": 0.4204962730407715, "lr": 7.946056563375145e-07, "epoch": 1.1111578947368421, "percentage": 37.04, "elapsed_time": "0:21:56", "remaining_time": "0:37:17"}
67
+ {"current_steps": 670, "total_steps": 1782, "loss": 0.42113161087036133, "lr": 7.866321176587128e-07, "epoch": 1.1280000000000001, "percentage": 37.6, "elapsed_time": "0:22:14", "remaining_time": "0:36:55"}
68
+ {"current_steps": 680, "total_steps": 1782, "loss": 0.4151731491088867, "lr": 7.785484900495065e-07, "epoch": 1.1448421052631579, "percentage": 38.16, "elapsed_time": "0:22:33", "remaining_time": "0:36:32"}
69
+ {"current_steps": 690, "total_steps": 1782, "loss": 0.38312902450561526, "lr": 7.703578782489058e-07, "epoch": 1.1616842105263159, "percentage": 38.72, "elapsed_time": "0:22:51", "remaining_time": "0:36:10"}
70
+ {"current_steps": 700, "total_steps": 1782, "loss": 0.42612557411193847, "lr": 7.620634280861351e-07, "epoch": 1.1785263157894736, "percentage": 39.28, "elapsed_time": "0:23:09", "remaining_time": "0:35:48"}
71
+ {"current_steps": 710, "total_steps": 1782, "loss": 0.4306772708892822, "lr": 7.536683252723923e-07, "epoch": 1.1953684210526316, "percentage": 39.84, "elapsed_time": "0:23:28", "remaining_time": "0:35:27"}
72
+ {"current_steps": 720, "total_steps": 1782, "loss": 0.38483757972717286, "lr": 7.451757941772868e-07, "epoch": 1.2122105263157894, "percentage": 40.4, "elapsed_time": "0:23:47", "remaining_time": "0:35:05"}
73
+ {"current_steps": 730, "total_steps": 1782, "loss": 0.4030342102050781, "lr": 7.365890965904337e-07, "epoch": 1.2290526315789474, "percentage": 40.97, "elapsed_time": "0:24:05", "remaining_time": "0:34:43"}
74
+ {"current_steps": 740, "total_steps": 1782, "loss": 0.4091166973114014, "lr": 7.279115304686733e-07, "epoch": 1.2458947368421052, "percentage": 41.53, "elapsed_time": "0:24:24", "remaining_time": "0:34:22"}
75
+ {"current_steps": 750, "total_steps": 1782, "loss": 0.41426806449890136, "lr": 7.191464286694e-07, "epoch": 1.2627368421052632, "percentage": 42.09, "elapsed_time": "0:24:43", "remaining_time": "0:34:00"}
76
+ {"current_steps": 760, "total_steps": 1782, "loss": 0.38181486129760744, "lr": 7.102971576704875e-07, "epoch": 1.279578947368421, "percentage": 42.65, "elapsed_time": "0:25:01", "remaining_time": "0:33:39"}
77
+ {"current_steps": 770, "total_steps": 1782, "loss": 0.39824953079223635, "lr": 7.013671162773003e-07, "epoch": 1.296421052631579, "percentage": 43.21, "elapsed_time": "0:25:19", "remaining_time": "0:33:17"}
78
+ {"current_steps": 780, "total_steps": 1782, "loss": 0.40348024368286134, "lr": 6.923597343172891e-07, "epoch": 1.313263157894737, "percentage": 43.77, "elapsed_time": "0:25:39", "remaining_time": "0:32:57"}
79
+ {"current_steps": 790, "total_steps": 1782, "loss": 0.38022048473358155, "lr": 6.83278471322672e-07, "epoch": 1.3301052631578947, "percentage": 44.33, "elapsed_time": "0:25:57", "remaining_time": "0:32:35"}
80
+ {"current_steps": 800, "total_steps": 1782, "loss": 0.42791285514831545, "lr": 6.741268152017057e-07, "epoch": 1.3469473684210527, "percentage": 44.89, "elapsed_time": "0:26:16", "remaining_time": "0:32:14"}
81
+ {"current_steps": 810, "total_steps": 1782, "loss": 0.4263493061065674, "lr": 6.649082808990585e-07, "epoch": 1.3637894736842107, "percentage": 45.45, "elapsed_time": "0:26:35", "remaining_time": "0:31:54"}
82
+ {"current_steps": 820, "total_steps": 1782, "loss": 0.37379777431488037, "lr": 6.556264090457998e-07, "epoch": 1.3806315789473684, "percentage": 46.02, "elapsed_time": "0:26:53", "remaining_time": "0:31:33"}
83
+ {"current_steps": 830, "total_steps": 1782, "loss": 0.38636391162872313, "lr": 6.462847645995237e-07, "epoch": 1.3974736842105262, "percentage": 46.58, "elapsed_time": "0:27:12", "remaining_time": "0:31:11"}
84
+ {"current_steps": 840, "total_steps": 1782, "loss": 0.408221435546875, "lr": 6.368869354751284e-07, "epoch": 1.4143157894736842, "percentage": 47.14, "elapsed_time": "0:27:30", "remaining_time": "0:30:51"}
85
+ {"current_steps": 850, "total_steps": 1782, "loss": 0.3951406717300415, "lr": 6.274365311667797e-07, "epoch": 1.4311578947368422, "percentage": 47.7, "elapsed_time": "0:27:49", "remaining_time": "0:30:30"}
86
+ {"current_steps": 860, "total_steps": 1782, "loss": 0.3732129096984863, "lr": 6.179371813615859e-07, "epoch": 1.448, "percentage": 48.26, "elapsed_time": "0:28:07", "remaining_time": "0:30:09"}
87
+ {"current_steps": 870, "total_steps": 1782, "loss": 0.38601529598236084, "lr": 6.083925345455158e-07, "epoch": 1.464842105263158, "percentage": 48.82, "elapsed_time": "0:28:26", "remaining_time": "0:29:49"}
88
+ {"current_steps": 880, "total_steps": 1782, "loss": 0.3859985828399658, "lr": 5.988062566020986e-07, "epoch": 1.4816842105263157, "percentage": 49.38, "elapsed_time": "0:28:45", "remaining_time": "0:29:28"}
89
+ {"current_steps": 890, "total_steps": 1782, "loss": 0.3983951807022095, "lr": 5.891820294044408e-07, "epoch": 1.4985263157894737, "percentage": 49.94, "elapsed_time": "0:29:04", "remaining_time": "0:29:08"}
90
+ {"current_steps": 900, "total_steps": 1782, "loss": 0.41107850074768065, "lr": 5.795235494011007e-07, "epoch": 1.5153684210526315, "percentage": 50.51, "elapsed_time": "0:29:22", "remaining_time": "0:28:47"}
91
+ {"current_steps": 910, "total_steps": 1782, "loss": 0.3708331823348999, "lr": 5.698345261963668e-07, "epoch": 1.5322105263157895, "percentage": 51.07, "elapsed_time": "0:29:42", "remaining_time": "0:28:27"}
92
+ {"current_steps": 920, "total_steps": 1782, "loss": 0.387884521484375, "lr": 5.601186811254825e-07, "epoch": 1.5490526315789475, "percentage": 51.63, "elapsed_time": "0:30:00", "remaining_time": "0:28:06"}
93
+ {"current_steps": 930, "total_steps": 1782, "loss": 0.43808717727661134, "lr": 5.503797458253646e-07, "epoch": 1.5658947368421052, "percentage": 52.19, "elapsed_time": "0:30:18", "remaining_time": "0:27:46"}
94
+ {"current_steps": 940, "total_steps": 1782, "loss": 0.41345391273498533, "lr": 5.406214608013662e-07, "epoch": 1.582736842105263, "percentage": 52.75, "elapsed_time": "0:30:37", "remaining_time": "0:27:25"}
95
+ {"current_steps": 950, "total_steps": 1782, "loss": 0.40022664070129393, "lr": 5.308475739906328e-07, "epoch": 1.5995789473684212, "percentage": 53.31, "elapsed_time": "0:30:55", "remaining_time": "0:27:05"}
96
+ {"current_steps": 960, "total_steps": 1782, "loss": 0.3909924983978271, "lr": 5.210618393226045e-07, "epoch": 1.616421052631579, "percentage": 53.87, "elapsed_time": "0:31:14", "remaining_time": "0:26:44"}
97
+ {"current_steps": 970, "total_steps": 1782, "loss": 0.37143146991729736, "lr": 5.112680152772156e-07, "epoch": 1.6332631578947368, "percentage": 54.43, "elapsed_time": "0:31:32", "remaining_time": "0:26:24"}
98
+ {"current_steps": 980, "total_steps": 1782, "loss": 0.38103113174438474, "lr": 5.01469863441348e-07, "epoch": 1.6501052631578947, "percentage": 54.99, "elapsed_time": "0:31:51", "remaining_time": "0:26:04"}
99
+ {"current_steps": 990, "total_steps": 1782, "loss": 0.4071629524230957, "lr": 4.916711470640907e-07, "epoch": 1.6669473684210527, "percentage": 55.56, "elapsed_time": "0:32:10", "remaining_time": "0:25:44"}
100
+ {"current_steps": 1000, "total_steps": 1782, "loss": 0.417419958114624, "lr": 4.818756296113595e-07, "epoch": 1.6837894736842105, "percentage": 56.12, "elapsed_time": "0:32:28", "remaining_time": "0:25:23"}
101
+ {"current_steps": 1010, "total_steps": 1782, "loss": 0.3998772859573364, "lr": 4.7208707332043623e-07, "epoch": 1.7006315789473683, "percentage": 56.68, "elapsed_time": "0:32:48", "remaining_time": "0:25:04"}
102
+ {"current_steps": 1020, "total_steps": 1782, "loss": 0.38056583404541017, "lr": 4.6230923775497714e-07, "epoch": 1.7174736842105263, "percentage": 57.24, "elapsed_time": "0:33:06", "remaining_time": "0:24:44"}
103
+ {"current_steps": 1030, "total_steps": 1782, "loss": 0.39371190071105955, "lr": 4.5254587836104964e-07, "epoch": 1.7343157894736843, "percentage": 57.8, "elapsed_time": "0:33:25", "remaining_time": "0:24:23"}
104
+ {"current_steps": 1040, "total_steps": 1782, "loss": 0.4280440330505371, "lr": 4.4280074502475017e-07, "epoch": 1.751157894736842, "percentage": 58.36, "elapsed_time": "0:33:43", "remaining_time": "0:24:03"}
105
+ {"current_steps": 1050, "total_steps": 1782, "loss": 0.3791615962982178, "lr": 4.3307758063195796e-07, "epoch": 1.768, "percentage": 58.92, "elapsed_time": "0:34:02", "remaining_time": "0:23:43"}
106
+ {"current_steps": 1060, "total_steps": 1782, "loss": 0.347782301902771, "lr": 4.233801196307762e-07, "epoch": 1.784842105263158, "percentage": 59.48, "elapsed_time": "0:34:20", "remaining_time": "0:23:23"}
107
+ {"current_steps": 1070, "total_steps": 1782, "loss": 0.38370628356933595, "lr": 4.1371208659721536e-07, "epoch": 1.8016842105263158, "percentage": 60.04, "elapsed_time": "0:34:39", "remaining_time": "0:23:03"}
108
+ {"current_steps": 1080, "total_steps": 1782, "loss": 0.40404376983642576, "lr": 4.0407719480466736e-07, "epoch": 1.8185263157894735, "percentage": 60.61, "elapsed_time": "0:34:57", "remaining_time": "0:22:43"}
109
+ {"current_steps": 1090, "total_steps": 1782, "loss": 0.4167450428009033, "lr": 3.944791447977213e-07, "epoch": 1.8353684210526315, "percentage": 61.17, "elapsed_time": "0:35:16", "remaining_time": "0:22:23"}
110
+ {"current_steps": 1100, "total_steps": 1782, "loss": 0.4046513080596924, "lr": 3.849216229708671e-07, "epoch": 1.8522105263157895, "percentage": 61.73, "elapsed_time": "0:35:34", "remaining_time": "0:22:03"}
111
+ {"current_steps": 1110, "total_steps": 1782, "loss": 0.39672977924346925, "lr": 3.7540830015263526e-07, "epoch": 1.8690526315789473, "percentage": 62.29, "elapsed_time": "0:35:53", "remaining_time": "0:21:43"}
112
+ {"current_steps": 1120, "total_steps": 1782, "loss": 0.39356396198272703, "lr": 3.6594283019571416e-07, "epoch": 1.8858947368421053, "percentage": 62.85, "elapsed_time": "0:36:11", "remaining_time": "0:21:23"}
113
+ {"current_steps": 1130, "total_steps": 1782, "loss": 0.42082643508911133, "lr": 3.565288485735874e-07, "epoch": 1.9027368421052633, "percentage": 63.41, "elapsed_time": "0:36:30", "remaining_time": "0:21:03"}
114
+ {"current_steps": 1140, "total_steps": 1782, "loss": 0.34105117321014405, "lr": 3.4716997098423085e-07, "epoch": 1.919578947368421, "percentage": 63.97, "elapsed_time": "0:36:48", "remaining_time": "0:20:43"}
115
+ {"current_steps": 1150, "total_steps": 1782, "loss": 0.3924069404602051, "lr": 3.378697919614045e-07, "epoch": 1.9364210526315788, "percentage": 64.53, "elapsed_time": "0:37:06", "remaining_time": "0:20:23"}
116
+ {"current_steps": 1160, "total_steps": 1782, "loss": 0.3922377586364746, "lr": 3.286318834940729e-07, "epoch": 1.9532631578947368, "percentage": 65.1, "elapsed_time": "0:37:25", "remaining_time": "0:20:03"}
117
+ {"current_steps": 1170, "total_steps": 1782, "loss": 0.3745201587677002, "lr": 3.1945979365448517e-07, "epoch": 1.9701052631578948, "percentage": 65.66, "elapsed_time": "0:37:43", "remaining_time": "0:19:44"}
118
+ {"current_steps": 1180, "total_steps": 1782, "loss": 0.40110602378845217, "lr": 3.103570452354402e-07, "epoch": 1.9869473684210526, "percentage": 66.22, "elapsed_time": "0:38:02", "remaining_time": "0:19:24"}
119
+ {"current_steps": 1190, "total_steps": 1782, "loss": 0.3981154918670654, "lr": 3.013271343972613e-07, "epoch": 2.0033684210526315, "percentage": 66.78, "elapsed_time": "0:38:20", "remaining_time": "0:19:04"}
120
+ {"current_steps": 1200, "total_steps": 1782, "loss": 0.3726134061813354, "lr": 2.9237352932500046e-07, "epoch": 2.0202105263157897, "percentage": 67.34, "elapsed_time": "0:38:38", "remaining_time": "0:18:44"}
121
+ {"current_steps": 1210, "total_steps": 1782, "loss": 0.42558698654174804, "lr": 2.8349966889638615e-07, "epoch": 2.0370526315789474, "percentage": 67.9, "elapsed_time": "0:38:58", "remaining_time": "0:18:25"}
122
+ {"current_steps": 1220, "total_steps": 1782, "loss": 0.3682931184768677, "lr": 2.747089613610278e-07, "epoch": 2.053894736842105, "percentage": 68.46, "elapsed_time": "0:39:16", "remaining_time": "0:18:05"}
123
+ {"current_steps": 1230, "total_steps": 1782, "loss": 0.3756644487380981, "lr": 2.66004783031385e-07, "epoch": 2.070736842105263, "percentage": 69.02, "elapsed_time": "0:39:34", "remaining_time": "0:17:45"}
124
+ {"current_steps": 1240, "total_steps": 1782, "loss": 0.3804330825805664, "lr": 2.573904769860009e-07, "epoch": 2.087578947368421, "percentage": 69.58, "elapsed_time": "0:39:53", "remaining_time": "0:17:26"}
125
+ {"current_steps": 1250, "total_steps": 1782, "loss": 0.3978404521942139, "lr": 2.488693517855016e-07, "epoch": 2.104421052631579, "percentage": 70.15, "elapsed_time": "0:40:11", "remaining_time": "0:17:06"}
126
+ {"current_steps": 1260, "total_steps": 1782, "loss": 0.3935218334197998, "lr": 2.404446802018533e-07, "epoch": 2.1212631578947367, "percentage": 70.71, "elapsed_time": "0:40:30", "remaining_time": "0:16:46"}
127
+ {"current_steps": 1270, "total_steps": 1782, "loss": 0.42966952323913576, "lr": 2.3211969796136305e-07, "epoch": 2.138105263157895, "percentage": 71.27, "elapsed_time": "0:40:48", "remaining_time": "0:16:27"}
128
+ {"current_steps": 1280, "total_steps": 1782, "loss": 0.3679579019546509, "lr": 2.2389760250191038e-07, "epoch": 2.1549473684210527, "percentage": 71.83, "elapsed_time": "0:41:06", "remaining_time": "0:16:07"}
129
+ {"current_steps": 1290, "total_steps": 1782, "loss": 0.41324810981750487, "lr": 2.1578155174488343e-07, "epoch": 2.1717894736842105, "percentage": 72.39, "elapsed_time": "0:41:25", "remaining_time": "0:15:47"}
130
+ {"current_steps": 1300, "total_steps": 1782, "loss": 0.40120248794555663, "lr": 2.0777466288229205e-07, "epoch": 2.1886315789473683, "percentage": 72.95, "elapsed_time": "0:41:44", "remaining_time": "0:15:28"}
131
+ {"current_steps": 1310, "total_steps": 1782, "loss": 0.3501007080078125, "lr": 1.9988001117952485e-07, "epoch": 2.2054736842105265, "percentage": 73.51, "elapsed_time": "0:42:03", "remaining_time": "0:15:09"}
132
+ {"current_steps": 1320, "total_steps": 1782, "loss": 0.3839429378509521, "lr": 1.9210062879420973e-07, "epoch": 2.2223157894736842, "percentage": 74.07, "elapsed_time": "0:42:22", "remaining_time": "0:14:49"}
133
+ {"current_steps": 1330, "total_steps": 1782, "loss": 0.4338528156280518, "lr": 1.8443950361162957e-07, "epoch": 2.239157894736842, "percentage": 74.64, "elapsed_time": "0:42:40", "remaining_time": "0:14:30"}
134
+ {"current_steps": 1340, "total_steps": 1782, "loss": 0.39229888916015626, "lr": 1.7689957809714346e-07, "epoch": 2.2560000000000002, "percentage": 75.2, "elapsed_time": "0:42:59", "remaining_time": "0:14:10"}
135
+ {"current_steps": 1350, "total_steps": 1782, "loss": 0.38006880283355715, "lr": 1.694837481660525e-07, "epoch": 2.272842105263158, "percentage": 75.76, "elapsed_time": "0:43:17", "remaining_time": "0:13:51"}
136
+ {"current_steps": 1360, "total_steps": 1782, "loss": 0.3808159589767456, "lr": 1.6219486207134313e-07, "epoch": 2.2896842105263158, "percentage": 76.32, "elapsed_time": "0:43:36", "remaining_time": "0:13:31"}
137
+ {"current_steps": 1370, "total_steps": 1782, "loss": 0.401824426651001, "lr": 1.5503571930973785e-07, "epoch": 2.3065263157894735, "percentage": 76.88, "elapsed_time": "0:43:54", "remaining_time": "0:13:12"}
138
+ {"current_steps": 1380, "total_steps": 1782, "loss": 0.40149493217468263, "lr": 1.480090695464723e-07, "epoch": 2.3233684210526317, "percentage": 77.44, "elapsed_time": "0:44:13", "remaining_time": "0:12:52"}
139
+ {"current_steps": 1390, "total_steps": 1782, "loss": 0.38567726612091063, "lr": 1.4111761155920975e-07, "epoch": 2.3402105263157895, "percentage": 78.0, "elapsed_time": "0:44:31", "remaining_time": "0:12:33"}
140
+ {"current_steps": 1400, "total_steps": 1782, "loss": 0.3759742736816406, "lr": 1.3436399220150212e-07, "epoch": 2.3570526315789473, "percentage": 78.56, "elapsed_time": "0:44:50", "remaining_time": "0:12:14"}
141
+ {"current_steps": 1410, "total_steps": 1782, "loss": 0.3913698196411133, "lr": 1.2775080538619347e-07, "epoch": 2.3738947368421055, "percentage": 79.12, "elapsed_time": "0:45:09", "remaining_time": "0:11:54"}
142
+ {"current_steps": 1420, "total_steps": 1782, "loss": 0.39077584743499755, "lr": 1.2128059108915595e-07, "epoch": 2.3907368421052633, "percentage": 79.69, "elapsed_time": "0:45:28", "remaining_time": "0:11:35"}
143
+ {"current_steps": 1430, "total_steps": 1782, "loss": 0.39895172119140626, "lr": 1.1495583437374263e-07, "epoch": 2.407578947368421, "percentage": 80.25, "elapsed_time": "0:45:47", "remaining_time": "0:11:16"}
144
+ {"current_steps": 1440, "total_steps": 1782, "loss": 0.38982129096984863, "lr": 1.0877896443633117e-07, "epoch": 2.424421052631579, "percentage": 80.81, "elapsed_time": "0:46:05", "remaining_time": "0:10:56"}
145
+ {"current_steps": 1450, "total_steps": 1782, "loss": 0.3756714344024658, "lr": 1.0275235367332347e-07, "epoch": 2.441263157894737, "percentage": 81.37, "elapsed_time": "0:46:24", "remaining_time": "0:10:37"}
146
+ {"current_steps": 1460, "total_steps": 1782, "loss": 0.37858171463012696, "lr": 9.687831676996238e-08, "epoch": 2.458105263157895, "percentage": 81.93, "elapsed_time": "0:46:42", "remaining_time": "0:10:18"}
147
+ {"current_steps": 1470, "total_steps": 1782, "loss": 0.40050196647644043, "lr": 9.115910981131336e-08, "epoch": 2.4749473684210526, "percentage": 82.49, "elapsed_time": "0:47:01", "remaining_time": "0:09:58"}
148
+ {"current_steps": 1480, "total_steps": 1782, "loss": 0.3684133291244507, "lr": 8.559692941575231e-08, "epoch": 2.4917894736842103, "percentage": 83.05, "elapsed_time": "0:47:19", "remaining_time": "0:09:39"}
149
+ {"current_steps": 1490, "total_steps": 1782, "loss": 0.3452518224716187, "lr": 8.019391189129466e-08, "epoch": 2.5086315789473685, "percentage": 83.61, "elapsed_time": "0:47:38", "remaining_time": "0:09:20"}
150
+ {"current_steps": 1500, "total_steps": 1782, "loss": 0.36301617622375487, "lr": 7.495213241508786e-08, "epoch": 2.5254736842105263, "percentage": 84.18, "elapsed_time": "0:47:56", "remaining_time": "0:09:00"}
151
+ {"current_steps": 1510, "total_steps": 1782, "loss": 0.3706004858016968, "lr": 6.987360423638205e-08, "epoch": 2.542315789473684, "percentage": 84.74, "elapsed_time": "0:48:16", "remaining_time": "0:08:41"}
152
+ {"current_steps": 1520, "total_steps": 1782, "loss": 0.36011199951171874, "lr": 6.49602779032865e-08, "epoch": 2.559157894736842, "percentage": 85.3, "elapsed_time": "0:48:34", "remaining_time": "0:08:22"}
153
+ {"current_steps": 1530, "total_steps": 1782, "loss": 0.37473766803741454, "lr": 6.02140405136089e-08, "epoch": 2.576, "percentage": 85.86, "elapsed_time": "0:48:53", "remaining_time": "0:08:03"}
154
+ {"current_steps": 1540, "total_steps": 1782, "loss": 0.39232525825500486, "lr": 5.5636714990062393e-08, "epoch": 2.592842105263158, "percentage": 86.42, "elapsed_time": "0:49:11", "remaining_time": "0:07:43"}
155
+ {"current_steps": 1550, "total_steps": 1782, "loss": 0.34370343685150145, "lr": 5.1230059380123034e-08, "epoch": 2.609684210526316, "percentage": 86.98, "elapsed_time": "0:49:30", "remaining_time": "0:07:24"}
156
+ {"current_steps": 1560, "total_steps": 1782, "loss": 0.39509878158569334, "lr": 4.699576618080331e-08, "epoch": 2.626526315789474, "percentage": 87.54, "elapsed_time": "0:49:48", "remaining_time": "0:07:05"}
157
+ {"current_steps": 1570, "total_steps": 1782, "loss": 0.3881126165390015, "lr": 4.293546168860163e-08, "epoch": 2.6433684210526316, "percentage": 88.1, "elapsed_time": "0:50:06", "remaining_time": "0:06:46"}
158
+ {"current_steps": 1580, "total_steps": 1782, "loss": 0.34624040126800537, "lr": 3.9050705374879086e-08, "epoch": 2.6602105263157894, "percentage": 88.66, "elapsed_time": "0:50:25", "remaining_time": "0:06:26"}
159
+ {"current_steps": 1590, "total_steps": 1782, "loss": 0.35141232013702395, "lr": 3.534298928690166e-08, "epoch": 2.677052631578947, "percentage": 89.23, "elapsed_time": "0:50:43", "remaining_time": "0:06:07"}
160
+ {"current_steps": 1600, "total_steps": 1782, "loss": 0.39980330467224123, "lr": 3.181373747477822e-08, "epoch": 2.6938947368421053, "percentage": 89.79, "elapsed_time": "0:51:02", "remaining_time": "0:05:48"}
161
+ {"current_steps": 1610, "total_steps": 1782, "loss": 0.3560852766036987, "lr": 2.8464305444515112e-08, "epoch": 2.710736842105263, "percentage": 90.35, "elapsed_time": "0:51:21", "remaining_time": "0:05:29"}
162
+ {"current_steps": 1620, "total_steps": 1782, "loss": 0.39339067935943606, "lr": 2.5295979637397213e-08, "epoch": 2.7275789473684213, "percentage": 90.91, "elapsed_time": "0:51:40", "remaining_time": "0:05:10"}
163
+ {"current_steps": 1630, "total_steps": 1782, "loss": 0.38021705150604246, "lr": 2.2309976935894203e-08, "epoch": 2.744421052631579, "percentage": 91.47, "elapsed_time": "0:51:58", "remaining_time": "0:04:50"}
164
+ {"current_steps": 1640, "total_steps": 1782, "loss": 0.3467890739440918, "lr": 1.9507444196284195e-08, "epoch": 2.761263157894737, "percentage": 92.03, "elapsed_time": "0:52:17", "remaining_time": "0:04:31"}
165
+ {"current_steps": 1650, "total_steps": 1782, "loss": 0.38266596794128416, "lr": 1.688945780817147e-08, "epoch": 2.7781052631578946, "percentage": 92.59, "elapsed_time": "0:52:35", "remaining_time": "0:04:12"}
166
+ {"current_steps": 1660, "total_steps": 1782, "loss": 0.34164865016937257, "lr": 1.445702328106979e-08, "epoch": 2.7949473684210524, "percentage": 93.15, "elapsed_time": "0:52:53", "remaining_time": "0:03:53"}
167
+ {"current_steps": 1670, "total_steps": 1782, "loss": 0.3876492977142334, "lr": 1.2211074858209103e-08, "epoch": 2.8117894736842106, "percentage": 93.71, "elapsed_time": "0:53:12", "remaining_time": "0:03:34"}
168
+ {"current_steps": 1680, "total_steps": 1782, "loss": 0.3703944206237793, "lr": 1.0152475157713392e-08, "epoch": 2.8286315789473684, "percentage": 94.28, "elapsed_time": "0:53:30", "remaining_time": "0:03:14"}
169
+ {"current_steps": 1690, "total_steps": 1782, "loss": 0.3735771656036377, "lr": 8.282014841288653e-09, "epoch": 2.845473684210526, "percentage": 94.84, "elapsed_time": "0:53:49", "remaining_time": "0:02:55"}
170
+ {"current_steps": 1700, "total_steps": 1782, "loss": 0.3807518005371094, "lr": 6.600412310547754e-09, "epoch": 2.8623157894736844, "percentage": 95.4, "elapsed_time": "0:54:07", "remaining_time": "0:02:36"}
171
+ {"current_steps": 1710, "total_steps": 1782, "loss": 0.3799649000167847, "lr": 5.1083134310882515e-09, "epoch": 2.879157894736842, "percentage": 95.96, "elapsed_time": "0:54:27", "remaining_time": "0:02:17"}
172
+ {"current_steps": 1720, "total_steps": 1782, "loss": 0.3580306053161621, "lr": 3.806291284430274e-09, "epoch": 2.896, "percentage": 96.52, "elapsed_time": "0:54:45", "remaining_time": "0:01:58"}
173
+ {"current_steps": 1730, "total_steps": 1782, "loss": 0.4088387966156006, "lr": 2.6948459479087526e-09, "epoch": 2.9128421052631577, "percentage": 97.08, "elapsed_time": "0:55:04", "remaining_time": "0:01:39"}
174
+ {"current_steps": 1740, "total_steps": 1782, "loss": 0.3971900463104248, "lr": 1.7744043026048372e-09, "epoch": 2.929684210526316, "percentage": 97.64, "elapsed_time": "0:55:22", "remaining_time": "0:01:20"}
175
+ {"current_steps": 1750, "total_steps": 1782, "loss": 0.35434761047363283, "lr": 1.0453198693907706e-09, "epoch": 2.9465263157894737, "percentage": 98.2, "elapsed_time": "0:55:40", "remaining_time": "0:01:01"}
176
+ {"current_steps": 1760, "total_steps": 1782, "loss": 0.40241618156433107, "lr": 5.07872673150278e-10, "epoch": 2.9633684210526314, "percentage": 98.77, "elapsed_time": "0:55:59", "remaining_time": "0:00:41"}
177
+ {"current_steps": 1770, "total_steps": 1782, "loss": 0.34090123176574705, "lr": 1.6226913522743302e-10, "epoch": 2.9802105263157896, "percentage": 99.33, "elapsed_time": "0:56:17", "remaining_time": "0:00:22"}
178
+ {"current_steps": 1780, "total_steps": 1782, "loss": 0.33390347957611083, "lr": 8.641994144853448e-12, "epoch": 2.9970526315789474, "percentage": 99.89, "elapsed_time": "0:56:35", "remaining_time": "0:00:03"}
179
+ {"current_steps": 1782, "total_steps": 1782, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "0:56:39", "remaining_time": "0:00:00"}
trainer_state.json ADDED
@@ -0,0 +1,1289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 100,
7
+ "global_step": 1782,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016842105263157894,
14
+ "grad_norm": 0.21757784485816956,
15
+ "learning_rate": 5.027932960893855e-08,
16
+ "loss": 0.7252199172973632,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.03368421052631579,
21
+ "grad_norm": 0.2456846386194229,
22
+ "learning_rate": 1.0614525139664805e-07,
23
+ "loss": 0.6507451057434082,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.05052631578947368,
28
+ "grad_norm": 0.20819272100925446,
29
+ "learning_rate": 1.6201117318435754e-07,
30
+ "loss": 0.7381344795227051,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.06736842105263158,
35
+ "grad_norm": 0.26373574137687683,
36
+ "learning_rate": 2.17877094972067e-07,
37
+ "loss": 0.7012194156646728,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.08421052631578947,
42
+ "grad_norm": 0.2081507444381714,
43
+ "learning_rate": 2.7374301675977653e-07,
44
+ "loss": 0.6083873748779297,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.10105263157894737,
49
+ "grad_norm": 0.2091236114501953,
50
+ "learning_rate": 3.29608938547486e-07,
51
+ "loss": 0.6980491638183594,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.11789473684210526,
56
+ "grad_norm": 0.20970331132411957,
57
+ "learning_rate": 3.8547486033519547e-07,
58
+ "loss": 0.708641767501831,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.13473684210526315,
63
+ "grad_norm": 0.18810197710990906,
64
+ "learning_rate": 4.41340782122905e-07,
65
+ "loss": 0.6742453098297119,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.15157894736842106,
70
+ "grad_norm": 0.20251069962978363,
71
+ "learning_rate": 4.972067039106145e-07,
72
+ "loss": 0.6590609550476074,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.16842105263157894,
77
+ "grad_norm": 0.2644217908382416,
78
+ "learning_rate": 5.53072625698324e-07,
79
+ "loss": 0.704926872253418,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.18526315789473685,
84
+ "grad_norm": 0.23766489326953888,
85
+ "learning_rate": 6.089385474860335e-07,
86
+ "loss": 0.7445036888122558,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.20210526315789473,
91
+ "grad_norm": 0.27427056431770325,
92
+ "learning_rate": 6.64804469273743e-07,
93
+ "loss": 0.7476531028747558,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.21894736842105264,
98
+ "grad_norm": 0.3208928406238556,
99
+ "learning_rate": 7.206703910614524e-07,
100
+ "loss": 0.7291872501373291,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.23578947368421052,
105
+ "grad_norm": 0.3123615086078644,
106
+ "learning_rate": 7.76536312849162e-07,
107
+ "loss": 0.721175241470337,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.25263157894736843,
112
+ "grad_norm": 0.26158222556114197,
113
+ "learning_rate": 8.324022346368714e-07,
114
+ "loss": 0.7556095600128174,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.2694736842105263,
119
+ "grad_norm": 0.2592650353908539,
120
+ "learning_rate": 8.88268156424581e-07,
121
+ "loss": 0.7328392505645752,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.2863157894736842,
126
+ "grad_norm": 0.24533776938915253,
127
+ "learning_rate": 9.441340782122904e-07,
128
+ "loss": 0.6990129470825195,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.3031578947368421,
133
+ "grad_norm": 0.23409004509449005,
134
+ "learning_rate": 1e-06,
135
+ "loss": 0.6694639205932618,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.32,
140
+ "grad_norm": 0.3267499506473541,
141
+ "learning_rate": 9.999039806396227e-07,
142
+ "loss": 0.7123252868652343,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.3368421052631579,
147
+ "grad_norm": 0.2115064263343811,
148
+ "learning_rate": 9.996159594373611e-07,
149
+ "loss": 0.6858412742614746,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.35368421052631577,
154
+ "grad_norm": 0.26226580142974854,
155
+ "learning_rate": 9.991360470156615e-07,
156
+ "loss": 0.6541069507598877,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.3705263157894737,
161
+ "grad_norm": 0.24552594125270844,
162
+ "learning_rate": 9.984644276980594e-07,
163
+ "loss": 0.6506116390228271,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.3873684210526316,
168
+ "grad_norm": 0.25084301829338074,
169
+ "learning_rate": 9.976013594383835e-07,
170
+ "loss": 0.6540626049041748,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.40421052631578946,
175
+ "grad_norm": 0.34244054555892944,
176
+ "learning_rate": 9.965471737216833e-07,
177
+ "loss": 0.6737770557403564,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.42105263157894735,
182
+ "grad_norm": 0.34752583503723145,
183
+ "learning_rate": 9.953022754369114e-07,
184
+ "loss": 0.6755708217620849,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.4378947368421053,
189
+ "grad_norm": 0.31017956137657166,
190
+ "learning_rate": 9.938671427214158e-07,
191
+ "loss": 0.6578442573547363,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.45473684210526316,
196
+ "grad_norm": 0.21509627997875214,
197
+ "learning_rate": 9.922423267772986e-07,
198
+ "loss": 0.639409875869751,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.47157894736842104,
203
+ "grad_norm": 0.3022947609424591,
204
+ "learning_rate": 9.904284516597102e-07,
205
+ "loss": 0.5995691776275635,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.4884210526315789,
210
+ "grad_norm": 0.3367304801940918,
211
+ "learning_rate": 9.884262140371648e-07,
212
+ "loss": 0.5898309707641601,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.5052631578947369,
217
+ "grad_norm": 0.294842928647995,
218
+ "learning_rate": 9.862363829239662e-07,
219
+ "loss": 0.6371779441833496,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.5221052631578947,
224
+ "grad_norm": 0.25171560049057007,
225
+ "learning_rate": 9.838597993848456e-07,
226
+ "loss": 0.5795581817626954,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.5389473684210526,
231
+ "grad_norm": 0.2818540036678314,
232
+ "learning_rate": 9.81297376211928e-07,
233
+ "loss": 0.5668415546417236,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.5557894736842105,
238
+ "grad_norm": 0.32951900362968445,
239
+ "learning_rate": 9.785500975741498e-07,
240
+ "loss": 0.5933257102966308,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.5726315789473684,
245
+ "grad_norm": 0.2763514518737793,
246
+ "learning_rate": 9.756190186392615e-07,
247
+ "loss": 0.5574678897857666,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.5894736842105263,
252
+ "grad_norm": 0.3070182204246521,
253
+ "learning_rate": 9.725052651685612e-07,
254
+ "loss": 0.5532425880432129,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.6063157894736843,
259
+ "grad_norm": 0.2079988420009613,
260
+ "learning_rate": 9.692100330845153e-07,
261
+ "loss": 0.5613389492034913,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.6231578947368421,
266
+ "grad_norm": 0.282924622297287,
267
+ "learning_rate": 9.657345880114318e-07,
268
+ "loss": 0.5131485939025879,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.64,
273
+ "grad_norm": 0.20901450514793396,
274
+ "learning_rate": 9.620802647893623e-07,
275
+ "loss": 0.6279027462005615,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.6568421052631579,
280
+ "grad_norm": 0.2637634575366974,
281
+ "learning_rate": 9.58248466961421e-07,
282
+ "loss": 0.5403085231781006,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.6736842105263158,
287
+ "grad_norm": 0.29078468680381775,
288
+ "learning_rate": 9.542406662347137e-07,
289
+ "loss": 0.5678809642791748,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.6905263157894737,
294
+ "grad_norm": 0.2865101397037506,
295
+ "learning_rate": 9.500584019150895e-07,
296
+ "loss": 0.5479135036468505,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.7073684210526315,
301
+ "grad_norm": 0.22857311367988586,
302
+ "learning_rate": 9.45703280315928e-07,
303
+ "loss": 0.5604462623596191,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.7242105263157895,
308
+ "grad_norm": 0.23971959948539734,
309
+ "learning_rate": 9.411769741411903e-07,
310
+ "loss": 0.4704423427581787,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.7410526315789474,
315
+ "grad_norm": 0.29793378710746765,
316
+ "learning_rate": 9.364812218429721e-07,
317
+ "loss": 0.560968017578125,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.7578947368421053,
322
+ "grad_norm": 0.2236040234565735,
323
+ "learning_rate": 9.316178269538014e-07,
324
+ "loss": 0.5088452816009521,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.7747368421052632,
329
+ "grad_norm": 0.22047854959964752,
330
+ "learning_rate": 9.265886573939446e-07,
331
+ "loss": 0.5030550956726074,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.791578947368421,
336
+ "grad_norm": 0.2273361086845398,
337
+ "learning_rate": 9.213956447539792e-07,
338
+ "loss": 0.46353440284729003,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.8084210526315789,
343
+ "grad_norm": 0.2170158326625824,
344
+ "learning_rate": 9.160407835529136e-07,
345
+ "loss": 0.49871411323547366,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.8252631578947368,
350
+ "grad_norm": 0.19333498179912567,
351
+ "learning_rate": 9.105261304721375e-07,
352
+ "loss": 0.4416178226470947,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.8421052631578947,
357
+ "grad_norm": 0.18490085005760193,
358
+ "learning_rate": 9.048538035654969e-07,
359
+ "loss": 0.39783194065093996,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.8589473684210527,
364
+ "grad_norm": 0.22122648358345032,
365
+ "learning_rate": 8.990259814457977e-07,
366
+ "loss": 0.4318229198455811,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.8757894736842106,
371
+ "grad_norm": 0.17448943853378296,
372
+ "learning_rate": 8.930449024480491e-07,
373
+ "loss": 0.42445807456970214,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.8926315789473684,
378
+ "grad_norm": 0.18165165185928345,
379
+ "learning_rate": 8.8691286376977e-07,
380
+ "loss": 0.46429901123046874,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.9094736842105263,
385
+ "grad_norm": 0.16785287857055664,
386
+ "learning_rate": 8.806322205886873e-07,
387
+ "loss": 0.3975703239440918,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.9263157894736842,
392
+ "grad_norm": 0.1613738089799881,
393
+ "learning_rate": 8.74205385158165e-07,
394
+ "loss": 0.4458911418914795,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.9431578947368421,
399
+ "grad_norm": 0.15376177430152893,
400
+ "learning_rate": 8.676348258807121e-07,
401
+ "loss": 0.45571184158325195,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.96,
406
+ "grad_norm": 0.14966322481632233,
407
+ "learning_rate": 8.609230663599254e-07,
408
+ "loss": 0.4039600372314453,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.9768421052631578,
413
+ "grad_norm": 0.16819055378437042,
414
+ "learning_rate": 8.540726844312294e-07,
415
+ "loss": 0.4382494926452637,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.9936842105263158,
420
+ "grad_norm": 0.16405776143074036,
421
+ "learning_rate": 8.470863111717889e-07,
422
+ "loss": 0.4306180477142334,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 1.0101052631578948,
427
+ "grad_norm": 0.18503950536251068,
428
+ "learning_rate": 8.399666298899706e-07,
429
+ "loss": 0.39806089401245115,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 1.0269473684210526,
434
+ "grad_norm": 0.14375492930412292,
435
+ "learning_rate": 8.327163750947457e-07,
436
+ "loss": 0.4271697044372559,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 1.0437894736842106,
441
+ "grad_norm": 0.1412728875875473,
442
+ "learning_rate": 8.253383314454263e-07,
443
+ "loss": 0.3939049243927002,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 1.0606315789473684,
448
+ "grad_norm": 0.20121850073337555,
449
+ "learning_rate": 8.178353326821404e-07,
450
+ "loss": 0.43197131156921387,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 1.0774736842105264,
455
+ "grad_norm": 0.17767728865146637,
456
+ "learning_rate": 8.102102605374566e-07,
457
+ "loss": 0.437807559967041,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 1.0943157894736841,
462
+ "grad_norm": 0.1498359888792038,
463
+ "learning_rate": 8.024660436295759e-07,
464
+ "loss": 0.38409013748168946,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 1.1111578947368421,
469
+ "grad_norm": 0.15958793461322784,
470
+ "learning_rate": 7.946056563375145e-07,
471
+ "loss": 0.4204962730407715,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 1.1280000000000001,
476
+ "grad_norm": 0.157291978597641,
477
+ "learning_rate": 7.866321176587128e-07,
478
+ "loss": 0.42113161087036133,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 1.1448421052631579,
483
+ "grad_norm": 0.14119838178157806,
484
+ "learning_rate": 7.785484900495065e-07,
485
+ "loss": 0.4151731491088867,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 1.1616842105263159,
490
+ "grad_norm": 0.1296525001525879,
491
+ "learning_rate": 7.703578782489058e-07,
492
+ "loss": 0.38312902450561526,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 1.1785263157894736,
497
+ "grad_norm": 0.13671696186065674,
498
+ "learning_rate": 7.620634280861351e-07,
499
+ "loss": 0.42612557411193847,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 1.1953684210526316,
504
+ "grad_norm": 0.15196114778518677,
505
+ "learning_rate": 7.536683252723923e-07,
506
+ "loss": 0.4306772708892822,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 1.2122105263157894,
511
+ "grad_norm": 0.1136903315782547,
512
+ "learning_rate": 7.451757941772868e-07,
513
+ "loss": 0.38483757972717286,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 1.2290526315789474,
518
+ "grad_norm": 0.12378744781017303,
519
+ "learning_rate": 7.365890965904337e-07,
520
+ "loss": 0.4030342102050781,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 1.2458947368421052,
525
+ "grad_norm": 0.1265542209148407,
526
+ "learning_rate": 7.279115304686733e-07,
527
+ "loss": 0.4091166973114014,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 1.2627368421052632,
532
+ "grad_norm": 0.11647409200668335,
533
+ "learning_rate": 7.191464286694e-07,
534
+ "loss": 0.41426806449890136,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 1.279578947368421,
539
+ "grad_norm": 0.11192695051431656,
540
+ "learning_rate": 7.102971576704875e-07,
541
+ "loss": 0.38181486129760744,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 1.296421052631579,
546
+ "grad_norm": 0.14947861433029175,
547
+ "learning_rate": 7.013671162773003e-07,
548
+ "loss": 0.39824953079223635,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 1.313263157894737,
553
+ "grad_norm": 0.11269424855709076,
554
+ "learning_rate": 6.923597343172891e-07,
555
+ "loss": 0.40348024368286134,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 1.3301052631578947,
560
+ "grad_norm": 0.3742346167564392,
561
+ "learning_rate": 6.83278471322672e-07,
562
+ "loss": 0.38022048473358155,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 1.3469473684210527,
567
+ "grad_norm": 0.1310902237892151,
568
+ "learning_rate": 6.741268152017057e-07,
569
+ "loss": 0.42791285514831545,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 1.3637894736842107,
574
+ "grad_norm": 0.1692703813314438,
575
+ "learning_rate": 6.649082808990585e-07,
576
+ "loss": 0.4263493061065674,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 1.3806315789473684,
581
+ "grad_norm": 0.1279117316007614,
582
+ "learning_rate": 6.556264090457998e-07,
583
+ "loss": 0.37379777431488037,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 1.3974736842105262,
588
+ "grad_norm": 0.12949039041996002,
589
+ "learning_rate": 6.462847645995237e-07,
590
+ "loss": 0.38636391162872313,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 1.4143157894736842,
595
+ "grad_norm": 0.10221126675605774,
596
+ "learning_rate": 6.368869354751284e-07,
597
+ "loss": 0.408221435546875,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 1.4311578947368422,
602
+ "grad_norm": 0.11505889147520065,
603
+ "learning_rate": 6.274365311667797e-07,
604
+ "loss": 0.3951406717300415,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 1.448,
609
+ "grad_norm": 0.11054962873458862,
610
+ "learning_rate": 6.179371813615859e-07,
611
+ "loss": 0.3732129096984863,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 1.464842105263158,
616
+ "grad_norm": 0.10150120407342911,
617
+ "learning_rate": 6.083925345455158e-07,
618
+ "loss": 0.38601529598236084,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 1.4816842105263157,
623
+ "grad_norm": 0.12239400297403336,
624
+ "learning_rate": 5.988062566020986e-07,
625
+ "loss": 0.3859985828399658,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 1.4985263157894737,
630
+ "grad_norm": 0.15801067650318146,
631
+ "learning_rate": 5.891820294044408e-07,
632
+ "loss": 0.3983951807022095,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 1.5153684210526315,
637
+ "grad_norm": 0.10104545950889587,
638
+ "learning_rate": 5.795235494011007e-07,
639
+ "loss": 0.41107850074768065,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 1.5322105263157895,
644
+ "grad_norm": 0.1378099024295807,
645
+ "learning_rate": 5.698345261963668e-07,
646
+ "loss": 0.3708331823348999,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 1.5490526315789475,
651
+ "grad_norm": 0.12936057150363922,
652
+ "learning_rate": 5.601186811254825e-07,
653
+ "loss": 0.387884521484375,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 1.5658947368421052,
658
+ "grad_norm": 0.12379129230976105,
659
+ "learning_rate": 5.503797458253646e-07,
660
+ "loss": 0.43808717727661134,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 1.582736842105263,
665
+ "grad_norm": 0.12017743289470673,
666
+ "learning_rate": 5.406214608013662e-07,
667
+ "loss": 0.41345391273498533,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 1.5995789473684212,
672
+ "grad_norm": 0.1095535159111023,
673
+ "learning_rate": 5.308475739906328e-07,
674
+ "loss": 0.40022664070129393,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 1.616421052631579,
679
+ "grad_norm": 0.13831396400928497,
680
+ "learning_rate": 5.210618393226045e-07,
681
+ "loss": 0.3909924983978271,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 1.6332631578947368,
686
+ "grad_norm": 0.10449163615703583,
687
+ "learning_rate": 5.112680152772156e-07,
688
+ "loss": 0.37143146991729736,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 1.6501052631578947,
693
+ "grad_norm": 0.11249610036611557,
694
+ "learning_rate": 5.01469863441348e-07,
695
+ "loss": 0.38103113174438474,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 1.6669473684210527,
700
+ "grad_norm": 0.13718819618225098,
701
+ "learning_rate": 4.916711470640907e-07,
702
+ "loss": 0.4071629524230957,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 1.6837894736842105,
707
+ "grad_norm": 0.10473571717739105,
708
+ "learning_rate": 4.818756296113595e-07,
709
+ "loss": 0.417419958114624,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 1.7006315789473683,
714
+ "grad_norm": 0.10846224427223206,
715
+ "learning_rate": 4.7208707332043623e-07,
716
+ "loss": 0.3998772859573364,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 1.7174736842105263,
721
+ "grad_norm": 0.10248563438653946,
722
+ "learning_rate": 4.6230923775497714e-07,
723
+ "loss": 0.38056583404541017,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 1.7343157894736843,
728
+ "grad_norm": 0.12221980094909668,
729
+ "learning_rate": 4.5254587836104964e-07,
730
+ "loss": 0.39371190071105955,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 1.751157894736842,
735
+ "grad_norm": 0.10641586035490036,
736
+ "learning_rate": 4.4280074502475017e-07,
737
+ "loss": 0.4280440330505371,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 1.768,
742
+ "grad_norm": 0.12907131016254425,
743
+ "learning_rate": 4.3307758063195796e-07,
744
+ "loss": 0.3791615962982178,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 1.784842105263158,
749
+ "grad_norm": 0.12383506447076797,
750
+ "learning_rate": 4.233801196307762e-07,
751
+ "loss": 0.347782301902771,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 1.8016842105263158,
756
+ "grad_norm": 0.12547679245471954,
757
+ "learning_rate": 4.1371208659721536e-07,
758
+ "loss": 0.38370628356933595,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 1.8185263157894735,
763
+ "grad_norm": 0.10580642521381378,
764
+ "learning_rate": 4.0407719480466736e-07,
765
+ "loss": 0.40404376983642576,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 1.8353684210526315,
770
+ "grad_norm": 0.1055402085185051,
771
+ "learning_rate": 3.944791447977213e-07,
772
+ "loss": 0.4167450428009033,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 1.8522105263157895,
777
+ "grad_norm": 0.11053823679685593,
778
+ "learning_rate": 3.849216229708671e-07,
779
+ "loss": 0.4046513080596924,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 1.8690526315789473,
784
+ "grad_norm": 0.10185246914625168,
785
+ "learning_rate": 3.7540830015263526e-07,
786
+ "loss": 0.39672977924346925,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 1.8858947368421053,
791
+ "grad_norm": 0.08342823386192322,
792
+ "learning_rate": 3.6594283019571416e-07,
793
+ "loss": 0.39356396198272703,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 1.9027368421052633,
798
+ "grad_norm": 0.11821646988391876,
799
+ "learning_rate": 3.565288485735874e-07,
800
+ "loss": 0.42082643508911133,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 1.919578947368421,
805
+ "grad_norm": 0.1106327474117279,
806
+ "learning_rate": 3.4716997098423085e-07,
807
+ "loss": 0.34105117321014405,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 1.9364210526315788,
812
+ "grad_norm": 0.11533800512552261,
813
+ "learning_rate": 3.378697919614045e-07,
814
+ "loss": 0.3924069404602051,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 1.9532631578947368,
819
+ "grad_norm": 0.1431114822626114,
820
+ "learning_rate": 3.286318834940729e-07,
821
+ "loss": 0.3922377586364746,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 1.9701052631578948,
826
+ "grad_norm": 0.16050194203853607,
827
+ "learning_rate": 3.1945979365448517e-07,
828
+ "loss": 0.3745201587677002,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 1.9869473684210526,
833
+ "grad_norm": 0.11921833455562592,
834
+ "learning_rate": 3.103570452354402e-07,
835
+ "loss": 0.40110602378845217,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 2.0033684210526315,
840
+ "grad_norm": 0.0832003727555275,
841
+ "learning_rate": 3.013271343972613e-07,
842
+ "loss": 0.3981154918670654,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 2.0202105263157897,
847
+ "grad_norm": 0.09975888580083847,
848
+ "learning_rate": 2.9237352932500046e-07,
849
+ "loss": 0.3726134061813354,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 2.0370526315789474,
854
+ "grad_norm": 0.14600081741809845,
855
+ "learning_rate": 2.8349966889638615e-07,
856
+ "loss": 0.42558698654174804,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 2.053894736842105,
861
+ "grad_norm": 0.10875770449638367,
862
+ "learning_rate": 2.747089613610278e-07,
863
+ "loss": 0.3682931184768677,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 2.070736842105263,
868
+ "grad_norm": 0.10050549358129501,
869
+ "learning_rate": 2.66004783031385e-07,
870
+ "loss": 0.3756644487380981,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 2.087578947368421,
875
+ "grad_norm": 0.08914914727210999,
876
+ "learning_rate": 2.573904769860009e-07,
877
+ "loss": 0.3804330825805664,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 2.104421052631579,
882
+ "grad_norm": 0.08296852558851242,
883
+ "learning_rate": 2.488693517855016e-07,
884
+ "loss": 0.3978404521942139,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 2.1212631578947367,
889
+ "grad_norm": 0.13885149359703064,
890
+ "learning_rate": 2.404446802018533e-07,
891
+ "loss": 0.3935218334197998,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 2.138105263157895,
896
+ "grad_norm": 0.13195137679576874,
897
+ "learning_rate": 2.3211969796136305e-07,
898
+ "loss": 0.42966952323913576,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 2.1549473684210527,
903
+ "grad_norm": 0.13367892801761627,
904
+ "learning_rate": 2.2389760250191038e-07,
905
+ "loss": 0.3679579019546509,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 2.1717894736842105,
910
+ "grad_norm": 0.1288345605134964,
911
+ "learning_rate": 2.1578155174488343e-07,
912
+ "loss": 0.41324810981750487,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 2.1886315789473683,
917
+ "grad_norm": 0.09626021236181259,
918
+ "learning_rate": 2.0777466288229205e-07,
919
+ "loss": 0.40120248794555663,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 2.2054736842105265,
924
+ "grad_norm": 0.10264381766319275,
925
+ "learning_rate": 1.9988001117952485e-07,
926
+ "loss": 0.3501007080078125,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 2.2223157894736842,
931
+ "grad_norm": 0.09031466394662857,
932
+ "learning_rate": 1.9210062879420973e-07,
933
+ "loss": 0.3839429378509521,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 2.239157894736842,
938
+ "grad_norm": 0.12686079740524292,
939
+ "learning_rate": 1.8443950361162957e-07,
940
+ "loss": 0.4338528156280518,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 2.2560000000000002,
945
+ "grad_norm": 0.12199016660451889,
946
+ "learning_rate": 1.7689957809714346e-07,
947
+ "loss": 0.39229888916015626,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 2.272842105263158,
952
+ "grad_norm": 0.12029567360877991,
953
+ "learning_rate": 1.694837481660525e-07,
954
+ "loss": 0.38006880283355715,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 2.2896842105263158,
959
+ "grad_norm": 0.08686309307813644,
960
+ "learning_rate": 1.6219486207134313e-07,
961
+ "loss": 0.3808159589767456,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 2.3065263157894735,
966
+ "grad_norm": 0.10810462385416031,
967
+ "learning_rate": 1.5503571930973785e-07,
968
+ "loss": 0.401824426651001,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 2.3233684210526317,
973
+ "grad_norm": 0.10281873494386673,
974
+ "learning_rate": 1.480090695464723e-07,
975
+ "loss": 0.40149493217468263,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 2.3402105263157895,
980
+ "grad_norm": 0.09503985196352005,
981
+ "learning_rate": 1.4111761155920975e-07,
982
+ "loss": 0.38567726612091063,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 2.3570526315789473,
987
+ "grad_norm": 0.10420782119035721,
988
+ "learning_rate": 1.3436399220150212e-07,
989
+ "loss": 0.3759742736816406,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 2.3738947368421055,
994
+ "grad_norm": 0.10681115835905075,
995
+ "learning_rate": 1.2775080538619347e-07,
996
+ "loss": 0.3913698196411133,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 2.3907368421052633,
1001
+ "grad_norm": 0.10323983430862427,
1002
+ "learning_rate": 1.2128059108915595e-07,
1003
+ "loss": 0.39077584743499755,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 2.407578947368421,
1008
+ "grad_norm": 0.09566064178943634,
1009
+ "learning_rate": 1.1495583437374263e-07,
1010
+ "loss": 0.39895172119140626,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 2.424421052631579,
1015
+ "grad_norm": 0.13018426299095154,
1016
+ "learning_rate": 1.0877896443633117e-07,
1017
+ "loss": 0.38982129096984863,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 2.441263157894737,
1022
+ "grad_norm": 0.10760781168937683,
1023
+ "learning_rate": 1.0275235367332347e-07,
1024
+ "loss": 0.3756714344024658,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 2.458105263157895,
1029
+ "grad_norm": 0.11606904864311218,
1030
+ "learning_rate": 9.687831676996238e-08,
1031
+ "loss": 0.37858171463012696,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 2.4749473684210526,
1036
+ "grad_norm": 0.12957172095775604,
1037
+ "learning_rate": 9.115910981131336e-08,
1038
+ "loss": 0.40050196647644043,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 2.4917894736842103,
1043
+ "grad_norm": 0.11186131089925766,
1044
+ "learning_rate": 8.559692941575231e-08,
1045
+ "loss": 0.3684133291244507,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 2.5086315789473685,
1050
+ "grad_norm": 0.13279542326927185,
1051
+ "learning_rate": 8.019391189129466e-08,
1052
+ "loss": 0.3452518224716187,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 2.5254736842105263,
1057
+ "grad_norm": 0.09041756391525269,
1058
+ "learning_rate": 7.495213241508786e-08,
1059
+ "loss": 0.36301617622375487,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 2.542315789473684,
1064
+ "grad_norm": 0.10033190995454788,
1065
+ "learning_rate": 6.987360423638205e-08,
1066
+ "loss": 0.3706004858016968,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 2.559157894736842,
1071
+ "grad_norm": 0.10681814700365067,
1072
+ "learning_rate": 6.49602779032865e-08,
1073
+ "loss": 0.36011199951171874,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 2.576,
1078
+ "grad_norm": 0.1008416935801506,
1079
+ "learning_rate": 6.02140405136089e-08,
1080
+ "loss": 0.37473766803741454,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 2.592842105263158,
1085
+ "grad_norm": 0.11559010297060013,
1086
+ "learning_rate": 5.5636714990062393e-08,
1087
+ "loss": 0.39232525825500486,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 2.609684210526316,
1092
+ "grad_norm": 0.10601615905761719,
1093
+ "learning_rate": 5.1230059380123034e-08,
1094
+ "loss": 0.34370343685150145,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 2.626526315789474,
1099
+ "grad_norm": 0.11516924202442169,
1100
+ "learning_rate": 4.699576618080331e-08,
1101
+ "loss": 0.39509878158569334,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 2.6433684210526316,
1106
+ "grad_norm": 0.11444627493619919,
1107
+ "learning_rate": 4.293546168860163e-08,
1108
+ "loss": 0.3881126165390015,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 2.6602105263157894,
1113
+ "grad_norm": 0.09985339641571045,
1114
+ "learning_rate": 3.9050705374879086e-08,
1115
+ "loss": 0.34624040126800537,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 2.677052631578947,
1120
+ "grad_norm": 0.10439962148666382,
1121
+ "learning_rate": 3.534298928690166e-08,
1122
+ "loss": 0.35141232013702395,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 2.6938947368421053,
1127
+ "grad_norm": 0.12180087715387344,
1128
+ "learning_rate": 3.181373747477822e-08,
1129
+ "loss": 0.39980330467224123,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 2.710736842105263,
1134
+ "grad_norm": 0.12150874733924866,
1135
+ "learning_rate": 2.8464305444515112e-08,
1136
+ "loss": 0.3560852766036987,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 2.7275789473684213,
1141
+ "grad_norm": 0.11374734342098236,
1142
+ "learning_rate": 2.5295979637397213e-08,
1143
+ "loss": 0.39339067935943606,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 2.744421052631579,
1148
+ "grad_norm": 0.11705286055803299,
1149
+ "learning_rate": 2.2309976935894203e-08,
1150
+ "loss": 0.38021705150604246,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 2.761263157894737,
1155
+ "grad_norm": 0.09300459921360016,
1156
+ "learning_rate": 1.9507444196284195e-08,
1157
+ "loss": 0.3467890739440918,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 2.7781052631578946,
1162
+ "grad_norm": 0.08709974586963654,
1163
+ "learning_rate": 1.688945780817147e-08,
1164
+ "loss": 0.38266596794128416,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 2.7949473684210524,
1169
+ "grad_norm": 0.12327069044113159,
1170
+ "learning_rate": 1.445702328106979e-08,
1171
+ "loss": 0.34164865016937257,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 2.8117894736842106,
1176
+ "grad_norm": 0.12361253052949905,
1177
+ "learning_rate": 1.2211074858209103e-08,
1178
+ "loss": 0.3876492977142334,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 2.8286315789473684,
1183
+ "grad_norm": 0.120187908411026,
1184
+ "learning_rate": 1.0152475157713392e-08,
1185
+ "loss": 0.3703944206237793,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 2.845473684210526,
1190
+ "grad_norm": 0.10741006582975388,
1191
+ "learning_rate": 8.282014841288653e-09,
1192
+ "loss": 0.3735771656036377,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 2.8623157894736844,
1197
+ "grad_norm": 0.10451704263687134,
1198
+ "learning_rate": 6.600412310547754e-09,
1199
+ "loss": 0.3807518005371094,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 2.879157894736842,
1204
+ "grad_norm": 0.10978730767965317,
1205
+ "learning_rate": 5.1083134310882515e-09,
1206
+ "loss": 0.3799649000167847,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 2.896,
1211
+ "grad_norm": 0.09521365165710449,
1212
+ "learning_rate": 3.806291284430274e-09,
1213
+ "loss": 0.3580306053161621,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 2.9128421052631577,
1218
+ "grad_norm": 0.11443010717630386,
1219
+ "learning_rate": 2.6948459479087526e-09,
1220
+ "loss": 0.4088387966156006,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 2.929684210526316,
1225
+ "grad_norm": 0.10477516055107117,
1226
+ "learning_rate": 1.7744043026048372e-09,
1227
+ "loss": 0.3971900463104248,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 2.9465263157894737,
1232
+ "grad_norm": 0.11358557641506195,
1233
+ "learning_rate": 1.0453198693907706e-09,
1234
+ "loss": 0.35434761047363283,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 2.9633684210526314,
1239
+ "grad_norm": 0.10436985641717911,
1240
+ "learning_rate": 5.07872673150278e-10,
1241
+ "loss": 0.40241618156433107,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 2.9802105263157896,
1246
+ "grad_norm": 0.10179495811462402,
1247
+ "learning_rate": 1.6226913522743302e-10,
1248
+ "loss": 0.34090123176574705,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 2.9970526315789474,
1253
+ "grad_norm": 0.1047094464302063,
1254
+ "learning_rate": 8.641994144853448e-12,
1255
+ "loss": 0.33390347957611083,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 3.0,
1260
+ "step": 1782,
1261
+ "total_flos": 3.258875338498253e+16,
1262
+ "train_loss": 0.45725877370630985,
1263
+ "train_runtime": 3399.7742,
1264
+ "train_samples_per_second": 4.191,
1265
+ "train_steps_per_second": 0.524
1266
+ }
1267
+ ],
1268
+ "logging_steps": 10,
1269
+ "max_steps": 1782,
1270
+ "num_input_tokens_seen": 0,
1271
+ "num_train_epochs": 3,
1272
+ "save_steps": 100,
1273
+ "stateful_callbacks": {
1274
+ "TrainerControl": {
1275
+ "args": {
1276
+ "should_epoch_stop": false,
1277
+ "should_evaluate": false,
1278
+ "should_log": false,
1279
+ "should_save": true,
1280
+ "should_training_stop": true
1281
+ },
1282
+ "attributes": {}
1283
+ }
1284
+ },
1285
+ "total_flos": 3.258875338498253e+16,
1286
+ "train_batch_size": 1,
1287
+ "trial_name": null,
1288
+ "trial_params": null
1289
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e59b6aeffb8563cc09210642a0d080410d061efe9a32399cd1e4a1e0abccb0a
3
+ size 5585