uaritm commited on
Commit
865a603
·
verified ·
1 Parent(s): 1da3322

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. lora_checkpoints/README.md +209 -0
  2. lora_checkpoints/adapter_config.json +42 -0
  3. lora_checkpoints/adapter_model.safetensors +3 -0
  4. lora_checkpoints/added_tokens.json +3 -0
  5. lora_checkpoints/chat_template.jinja +47 -0
  6. lora_checkpoints/checkpoint-4400/README.md +209 -0
  7. lora_checkpoints/checkpoint-4400/adapter_config.json +42 -0
  8. lora_checkpoints/checkpoint-4400/adapter_model.safetensors +3 -0
  9. lora_checkpoints/checkpoint-4400/added_tokens.json +3 -0
  10. lora_checkpoints/checkpoint-4400/chat_template.jinja +47 -0
  11. lora_checkpoints/checkpoint-4400/optimizer.pt +3 -0
  12. lora_checkpoints/checkpoint-4400/rng_state.pth +3 -0
  13. lora_checkpoints/checkpoint-4400/scaler.pt +3 -0
  14. lora_checkpoints/checkpoint-4400/scheduler.pt +3 -0
  15. lora_checkpoints/checkpoint-4400/special_tokens_map.json +33 -0
  16. lora_checkpoints/checkpoint-4400/tokenizer.model +3 -0
  17. lora_checkpoints/checkpoint-4400/tokenizer_config.json +0 -0
  18. lora_checkpoints/checkpoint-4400/trainer_state.json +3114 -0
  19. lora_checkpoints/checkpoint-4400/training_args.bin +3 -0
  20. lora_checkpoints/checkpoint-4600/README.md +209 -0
  21. lora_checkpoints/checkpoint-4600/adapter_config.json +42 -0
  22. lora_checkpoints/checkpoint-4600/adapter_model.safetensors +3 -0
  23. lora_checkpoints/checkpoint-4600/added_tokens.json +3 -0
  24. lora_checkpoints/checkpoint-4600/chat_template.jinja +47 -0
  25. lora_checkpoints/checkpoint-4600/optimizer.pt +3 -0
  26. lora_checkpoints/checkpoint-4600/rng_state.pth +3 -0
  27. lora_checkpoints/checkpoint-4600/scaler.pt +3 -0
  28. lora_checkpoints/checkpoint-4600/scheduler.pt +3 -0
  29. lora_checkpoints/checkpoint-4600/special_tokens_map.json +33 -0
  30. lora_checkpoints/checkpoint-4600/tokenizer.model +3 -0
  31. lora_checkpoints/checkpoint-4600/tokenizer_config.json +0 -0
  32. lora_checkpoints/checkpoint-4600/trainer_state.json +3254 -0
  33. lora_checkpoints/checkpoint-4600/training_args.bin +3 -0
  34. lora_checkpoints/checkpoint-4800/README.md +209 -0
  35. lora_checkpoints/checkpoint-4800/adapter_config.json +42 -0
  36. lora_checkpoints/checkpoint-4800/adapter_model.safetensors +3 -0
  37. lora_checkpoints/checkpoint-4800/added_tokens.json +3 -0
  38. lora_checkpoints/checkpoint-4800/chat_template.jinja +47 -0
  39. lora_checkpoints/checkpoint-4800/optimizer.pt +3 -0
  40. lora_checkpoints/checkpoint-4800/rng_state.pth +3 -0
  41. lora_checkpoints/checkpoint-4800/scaler.pt +3 -0
  42. lora_checkpoints/checkpoint-4800/scheduler.pt +3 -0
  43. lora_checkpoints/checkpoint-4800/special_tokens_map.json +33 -0
  44. lora_checkpoints/checkpoint-4800/tokenizer.model +3 -0
  45. lora_checkpoints/checkpoint-4800/tokenizer_config.json +0 -0
  46. lora_checkpoints/checkpoint-4800/trainer_state.json +3394 -0
  47. lora_checkpoints/checkpoint-4800/training_args.bin +3 -0
  48. lora_checkpoints/special_tokens_map.json +33 -0
  49. lora_checkpoints/tokenizer.model +3 -0
  50. lora_checkpoints/tokenizer_config.json +0 -0
lora_checkpoints/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: uaritm/gemma3_1b_med_qa_ru
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:uaritm/gemma3_1b_med_qa_ru
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.1
lora_checkpoints/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "k_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
lora_checkpoints/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9b4ff0908196780a01d5777bf7ee02d871b7bb99dcb4c70e6b808d975af1aad
3
+ size 52231312
lora_checkpoints/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
lora_checkpoints/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
lora_checkpoints/checkpoint-4400/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: uaritm/gemma3_1b_med_qa_ru
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:uaritm/gemma3_1b_med_qa_ru
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.1
lora_checkpoints/checkpoint-4400/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "k_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
lora_checkpoints/checkpoint-4400/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9050b82063245ef7ac87dfa8d36e9f9bcbf4db241b1ac2d995a8b3122f0c1bab
3
+ size 52231312
lora_checkpoints/checkpoint-4400/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
lora_checkpoints/checkpoint-4400/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
lora_checkpoints/checkpoint-4400/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b106776532c3a8d2ca91a43cb4885171a035baa1d90d48ae73b409db5e0e0f5
3
+ size 104671958
lora_checkpoints/checkpoint-4400/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d810a67992b58b715a7d9b25a5acdf9e0b832de9c82ec2c6816584522da09bb
3
+ size 14244
lora_checkpoints/checkpoint-4400/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f52741266f98091dd23c9f0b9999f8607a62de5e710d64dca525fe9ba02fe51
3
+ size 988
lora_checkpoints/checkpoint-4400/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8fb657393d96492d81b7388ad08bb8c9c79899a2d54086f176059c5daff895c7
3
+ size 1064
lora_checkpoints/checkpoint-4400/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
lora_checkpoints/checkpoint-4400/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
lora_checkpoints/checkpoint-4400/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
lora_checkpoints/checkpoint-4400/trainer_state.json ADDED
@@ -0,0 +1,3114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9024484040507628,
6
+ "eval_steps": 500,
7
+ "global_step": 4400,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00205101910011537,
14
+ "grad_norm": 1.9277215003967285,
15
+ "learning_rate": 3.6885245901639347e-06,
16
+ "loss": 1.4306,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.00410203820023074,
21
+ "grad_norm": 0.3513035476207733,
22
+ "learning_rate": 7.78688524590164e-06,
23
+ "loss": 1.3524,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.006153057300346109,
28
+ "grad_norm": 0.3364648222923279,
29
+ "learning_rate": 1.1885245901639344e-05,
30
+ "loss": 1.3188,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.00820407640046148,
35
+ "grad_norm": 0.3382512927055359,
36
+ "learning_rate": 1.598360655737705e-05,
37
+ "loss": 1.3418,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.01025509550057685,
42
+ "grad_norm": 0.360334575176239,
43
+ "learning_rate": 2.0081967213114755e-05,
44
+ "loss": 1.3381,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.012306114600692218,
49
+ "grad_norm": 0.3408481180667877,
50
+ "learning_rate": 2.418032786885246e-05,
51
+ "loss": 1.3365,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.014357133700807588,
56
+ "grad_norm": 0.36211535334587097,
57
+ "learning_rate": 2.8278688524590162e-05,
58
+ "loss": 1.3314,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.01640815280092296,
63
+ "grad_norm": 0.38704580068588257,
64
+ "learning_rate": 3.237704918032787e-05,
65
+ "loss": 1.3108,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.018459171901038327,
70
+ "grad_norm": 0.44303640723228455,
71
+ "learning_rate": 3.6475409836065576e-05,
72
+ "loss": 1.3073,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.0205101910011537,
77
+ "grad_norm": 0.4073602557182312,
78
+ "learning_rate": 4.057377049180328e-05,
79
+ "loss": 1.2993,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.022561210101269068,
84
+ "grad_norm": 0.4478100538253784,
85
+ "learning_rate": 4.467213114754098e-05,
86
+ "loss": 1.3413,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.024612229201384436,
91
+ "grad_norm": 0.39146170020103455,
92
+ "learning_rate": 4.8770491803278687e-05,
93
+ "loss": 1.3168,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.026663248301499808,
98
+ "grad_norm": 0.3786431849002838,
99
+ "learning_rate": 5.28688524590164e-05,
100
+ "loss": 1.2774,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.028714267401615177,
105
+ "grad_norm": 0.4014948904514313,
106
+ "learning_rate": 5.69672131147541e-05,
107
+ "loss": 1.346,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.03076528650173055,
112
+ "grad_norm": 0.3987842798233032,
113
+ "learning_rate": 6.10655737704918e-05,
114
+ "loss": 1.2816,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.03281630560184592,
119
+ "grad_norm": 0.3897082507610321,
120
+ "learning_rate": 6.516393442622951e-05,
121
+ "loss": 1.3485,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.034867324701961286,
126
+ "grad_norm": 0.373279333114624,
127
+ "learning_rate": 6.926229508196722e-05,
128
+ "loss": 1.3185,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.036918343802076654,
133
+ "grad_norm": 0.3812575340270996,
134
+ "learning_rate": 7.336065573770491e-05,
135
+ "loss": 1.3394,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.03896936290219203,
140
+ "grad_norm": 0.35926997661590576,
141
+ "learning_rate": 7.745901639344263e-05,
142
+ "loss": 1.2821,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.0410203820023074,
147
+ "grad_norm": 0.3649434745311737,
148
+ "learning_rate": 8.155737704918032e-05,
149
+ "loss": 1.33,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.04307140110242277,
154
+ "grad_norm": 0.345662921667099,
155
+ "learning_rate": 8.565573770491803e-05,
156
+ "loss": 1.3107,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.045122420202538135,
161
+ "grad_norm": 0.37169769406318665,
162
+ "learning_rate": 8.975409836065574e-05,
163
+ "loss": 1.309,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.047173439302653504,
168
+ "grad_norm": 0.37920281291007996,
169
+ "learning_rate": 9.385245901639344e-05,
170
+ "loss": 1.3352,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.04922445840276887,
175
+ "grad_norm": 0.35772770643234253,
176
+ "learning_rate": 9.795081967213115e-05,
177
+ "loss": 1.2402,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.05127547750288425,
182
+ "grad_norm": 0.38790181279182434,
183
+ "learning_rate": 9.989205526770294e-05,
184
+ "loss": 1.326,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.053326496602999617,
189
+ "grad_norm": 0.3545536696910858,
190
+ "learning_rate": 9.967616580310882e-05,
191
+ "loss": 1.3173,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.055377515703114985,
196
+ "grad_norm": 0.3845142722129822,
197
+ "learning_rate": 9.946027633851469e-05,
198
+ "loss": 1.2949,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.057428534803230354,
203
+ "grad_norm": 0.38621339201927185,
204
+ "learning_rate": 9.924438687392055e-05,
205
+ "loss": 1.2773,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.05947955390334572,
210
+ "grad_norm": 0.38091301918029785,
211
+ "learning_rate": 9.902849740932643e-05,
212
+ "loss": 1.3282,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.0615305730034611,
217
+ "grad_norm": 0.37546730041503906,
218
+ "learning_rate": 9.88126079447323e-05,
219
+ "loss": 1.2862,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.06358159210357646,
224
+ "grad_norm": 0.3515011966228485,
225
+ "learning_rate": 9.859671848013817e-05,
226
+ "loss": 1.2937,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.06563261120369183,
231
+ "grad_norm": 0.3863738775253296,
232
+ "learning_rate": 9.838082901554406e-05,
233
+ "loss": 1.3056,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.06768363030380721,
238
+ "grad_norm": 0.36615240573883057,
239
+ "learning_rate": 9.816493955094992e-05,
240
+ "loss": 1.3062,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.06973464940392257,
245
+ "grad_norm": 0.37741243839263916,
246
+ "learning_rate": 9.794905008635579e-05,
247
+ "loss": 1.3094,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.07178566850403795,
252
+ "grad_norm": 0.38626739382743835,
253
+ "learning_rate": 9.773316062176167e-05,
254
+ "loss": 1.2947,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.07383668760415331,
259
+ "grad_norm": 0.38667401671409607,
260
+ "learning_rate": 9.751727115716753e-05,
261
+ "loss": 1.2976,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.07588770670426868,
266
+ "grad_norm": 0.36084800958633423,
267
+ "learning_rate": 9.730138169257342e-05,
268
+ "loss": 1.27,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.07793872580438406,
273
+ "grad_norm": 0.3754425346851349,
274
+ "learning_rate": 9.708549222797928e-05,
275
+ "loss": 1.3243,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.07998974490449942,
280
+ "grad_norm": 0.39857473969459534,
281
+ "learning_rate": 9.686960276338515e-05,
282
+ "loss": 1.3077,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.0820407640046148,
287
+ "grad_norm": 0.3919648230075836,
288
+ "learning_rate": 9.665371329879103e-05,
289
+ "loss": 1.2985,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.08409178310473016,
294
+ "grad_norm": 0.3675483465194702,
295
+ "learning_rate": 9.643782383419689e-05,
296
+ "loss": 1.2946,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.08614280220484553,
301
+ "grad_norm": 0.3898465633392334,
302
+ "learning_rate": 9.622193436960277e-05,
303
+ "loss": 1.333,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.08819382130496091,
308
+ "grad_norm": 0.3681259751319885,
309
+ "learning_rate": 9.600604490500864e-05,
310
+ "loss": 1.2968,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.09024484040507627,
315
+ "grad_norm": 0.36453816294670105,
316
+ "learning_rate": 9.57901554404145e-05,
317
+ "loss": 1.272,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.09229585950519165,
322
+ "grad_norm": 0.34828147292137146,
323
+ "learning_rate": 9.557426597582039e-05,
324
+ "loss": 1.3245,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.09434687860530701,
329
+ "grad_norm": 0.3570501208305359,
330
+ "learning_rate": 9.535837651122625e-05,
331
+ "loss": 1.313,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.09639789770542238,
336
+ "grad_norm": 0.36692506074905396,
337
+ "learning_rate": 9.514248704663213e-05,
338
+ "loss": 1.2915,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.09844891680553775,
343
+ "grad_norm": 0.39161381125450134,
344
+ "learning_rate": 9.4926597582038e-05,
345
+ "loss": 1.3101,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.10049993590565312,
350
+ "grad_norm": 0.3808858394622803,
351
+ "learning_rate": 9.471070811744387e-05,
352
+ "loss": 1.3099,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.1025509550057685,
357
+ "grad_norm": 0.3541582524776459,
358
+ "learning_rate": 9.449481865284975e-05,
359
+ "loss": 1.2772,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.10460197410588386,
364
+ "grad_norm": 0.379190593957901,
365
+ "learning_rate": 9.427892918825562e-05,
366
+ "loss": 1.2914,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.10665299320599923,
371
+ "grad_norm": 0.37727421522140503,
372
+ "learning_rate": 9.406303972366149e-05,
373
+ "loss": 1.2888,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1087040123061146,
378
+ "grad_norm": 0.3787306845188141,
379
+ "learning_rate": 9.384715025906737e-05,
380
+ "loss": 1.3049,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.11075503140622997,
385
+ "grad_norm": 0.3831459581851959,
386
+ "learning_rate": 9.363126079447323e-05,
387
+ "loss": 1.2631,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.11280605050634535,
392
+ "grad_norm": 0.37274929881095886,
393
+ "learning_rate": 9.34153713298791e-05,
394
+ "loss": 1.3313,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.11485706960646071,
399
+ "grad_norm": 0.3683277368545532,
400
+ "learning_rate": 9.319948186528498e-05,
401
+ "loss": 1.2528,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.11690808870657608,
406
+ "grad_norm": 0.39554840326309204,
407
+ "learning_rate": 9.298359240069085e-05,
408
+ "loss": 1.2737,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.11895910780669144,
413
+ "grad_norm": 0.39166760444641113,
414
+ "learning_rate": 9.276770293609673e-05,
415
+ "loss": 1.271,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.12101012690680682,
420
+ "grad_norm": 0.384085476398468,
421
+ "learning_rate": 9.255181347150259e-05,
422
+ "loss": 1.2921,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.1230611460069222,
427
+ "grad_norm": 0.3704201281070709,
428
+ "learning_rate": 9.233592400690847e-05,
429
+ "loss": 1.2776,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.12511216510703757,
434
+ "grad_norm": 0.3844301998615265,
435
+ "learning_rate": 9.212003454231434e-05,
436
+ "loss": 1.3067,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.12716318420715292,
441
+ "grad_norm": 0.3971571922302246,
442
+ "learning_rate": 9.190414507772022e-05,
443
+ "loss": 1.2792,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.1292142033072683,
448
+ "grad_norm": 0.40666353702545166,
449
+ "learning_rate": 9.168825561312608e-05,
450
+ "loss": 1.2964,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.13126522240738367,
455
+ "grad_norm": 0.38252532482147217,
456
+ "learning_rate": 9.147236614853195e-05,
457
+ "loss": 1.2815,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.13331624150749904,
462
+ "grad_norm": 0.37795621156692505,
463
+ "learning_rate": 9.125647668393783e-05,
464
+ "loss": 1.283,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.13536726060761442,
469
+ "grad_norm": 0.4035683572292328,
470
+ "learning_rate": 9.10405872193437e-05,
471
+ "loss": 1.288,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.13741827970772977,
476
+ "grad_norm": 0.410669207572937,
477
+ "learning_rate": 9.082469775474958e-05,
478
+ "loss": 1.2659,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.13946929880784514,
483
+ "grad_norm": 0.3809865713119507,
484
+ "learning_rate": 9.060880829015544e-05,
485
+ "loss": 1.3133,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.14152031790796052,
490
+ "grad_norm": 0.3748447597026825,
491
+ "learning_rate": 9.039291882556131e-05,
492
+ "loss": 1.2643,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.1435713370080759,
497
+ "grad_norm": 0.39292991161346436,
498
+ "learning_rate": 9.017702936096719e-05,
499
+ "loss": 1.2855,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.14562235610819127,
504
+ "grad_norm": 0.4399755001068115,
505
+ "learning_rate": 8.996113989637307e-05,
506
+ "loss": 1.286,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.14767337520830662,
511
+ "grad_norm": 0.42447429895401,
512
+ "learning_rate": 8.974525043177894e-05,
513
+ "loss": 1.2736,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.149724394308422,
518
+ "grad_norm": 0.37248438596725464,
519
+ "learning_rate": 8.95293609671848e-05,
520
+ "loss": 1.2652,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.15177541340853737,
525
+ "grad_norm": 0.39122238755226135,
526
+ "learning_rate": 8.931347150259068e-05,
527
+ "loss": 1.2814,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.15382643250865274,
532
+ "grad_norm": 0.3697800040245056,
533
+ "learning_rate": 8.909758203799655e-05,
534
+ "loss": 1.2462,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.15587745160876812,
539
+ "grad_norm": 0.3901929259300232,
540
+ "learning_rate": 8.888169257340241e-05,
541
+ "loss": 1.2742,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.15792847070888347,
546
+ "grad_norm": 0.3833727538585663,
547
+ "learning_rate": 8.86658031088083e-05,
548
+ "loss": 1.3015,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.15997948980899884,
553
+ "grad_norm": 0.4028802216053009,
554
+ "learning_rate": 8.844991364421416e-05,
555
+ "loss": 1.2631,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.16203050890911422,
560
+ "grad_norm": 0.39087918400764465,
561
+ "learning_rate": 8.823402417962004e-05,
562
+ "loss": 1.2993,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.1640815280092296,
567
+ "grad_norm": 0.39453235268592834,
568
+ "learning_rate": 8.801813471502591e-05,
569
+ "loss": 1.2544,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.16613254710934497,
574
+ "grad_norm": 0.42142602801322937,
575
+ "learning_rate": 8.780224525043178e-05,
576
+ "loss": 1.2676,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.16818356620946032,
581
+ "grad_norm": 0.36646899580955505,
582
+ "learning_rate": 8.758635578583767e-05,
583
+ "loss": 1.2765,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.1702345853095757,
588
+ "grad_norm": 0.4253019094467163,
589
+ "learning_rate": 8.737046632124353e-05,
590
+ "loss": 1.3003,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.17228560440969107,
595
+ "grad_norm": 0.41490674018859863,
596
+ "learning_rate": 8.715457685664939e-05,
597
+ "loss": 1.2731,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.17433662350980644,
602
+ "grad_norm": 0.405460387468338,
603
+ "learning_rate": 8.693868739205528e-05,
604
+ "loss": 1.2122,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.17638764260992182,
609
+ "grad_norm": 0.4028235375881195,
610
+ "learning_rate": 8.672279792746114e-05,
611
+ "loss": 1.3238,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.17843866171003717,
616
+ "grad_norm": 0.38994792103767395,
617
+ "learning_rate": 8.650690846286701e-05,
618
+ "loss": 1.2875,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.18048968081015254,
623
+ "grad_norm": 0.4099538326263428,
624
+ "learning_rate": 8.629101899827289e-05,
625
+ "loss": 1.2807,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.18254069991026792,
630
+ "grad_norm": 0.40470021963119507,
631
+ "learning_rate": 8.607512953367875e-05,
632
+ "loss": 1.2802,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.1845917190103833,
637
+ "grad_norm": 0.4066854417324066,
638
+ "learning_rate": 8.585924006908464e-05,
639
+ "loss": 1.2464,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.18664273811049864,
644
+ "grad_norm": 0.38739994168281555,
645
+ "learning_rate": 8.56433506044905e-05,
646
+ "loss": 1.2831,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.18869375721061402,
651
+ "grad_norm": 0.4257420301437378,
652
+ "learning_rate": 8.542746113989638e-05,
653
+ "loss": 1.2679,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.1907447763107294,
658
+ "grad_norm": 0.41571488976478577,
659
+ "learning_rate": 8.521157167530225e-05,
660
+ "loss": 1.2501,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.19279579541084477,
665
+ "grad_norm": 0.4178495407104492,
666
+ "learning_rate": 8.499568221070811e-05,
667
+ "loss": 1.2657,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.19484681451096014,
672
+ "grad_norm": 0.4083455801010132,
673
+ "learning_rate": 8.477979274611399e-05,
674
+ "loss": 1.2781,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.1968978336110755,
679
+ "grad_norm": 0.4067554175853729,
680
+ "learning_rate": 8.456390328151986e-05,
681
+ "loss": 1.2582,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.19894885271119087,
686
+ "grad_norm": 0.4067447781562805,
687
+ "learning_rate": 8.434801381692574e-05,
688
+ "loss": 1.2948,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.20099987181130624,
693
+ "grad_norm": 0.44283562898635864,
694
+ "learning_rate": 8.413212435233161e-05,
695
+ "loss": 1.3011,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.20305089091142162,
700
+ "grad_norm": 0.41568294167518616,
701
+ "learning_rate": 8.391623488773748e-05,
702
+ "loss": 1.2804,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.205101910011537,
707
+ "grad_norm": 0.4183642864227295,
708
+ "learning_rate": 8.370034542314335e-05,
709
+ "loss": 1.2228,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.20715292911165234,
714
+ "grad_norm": 0.4311917722225189,
715
+ "learning_rate": 8.348445595854923e-05,
716
+ "loss": 1.2714,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.20920394821176772,
721
+ "grad_norm": 0.41575828194618225,
722
+ "learning_rate": 8.32685664939551e-05,
723
+ "loss": 1.2783,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.2112549673118831,
728
+ "grad_norm": 0.3958878815174103,
729
+ "learning_rate": 8.305267702936098e-05,
730
+ "loss": 1.2558,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.21330598641199847,
735
+ "grad_norm": 0.43759557604789734,
736
+ "learning_rate": 8.283678756476684e-05,
737
+ "loss": 1.2557,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.21535700551211384,
742
+ "grad_norm": 0.41460636258125305,
743
+ "learning_rate": 8.262089810017271e-05,
744
+ "loss": 1.2851,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.2174080246122292,
749
+ "grad_norm": 0.4114689826965332,
750
+ "learning_rate": 8.240500863557859e-05,
751
+ "loss": 1.3076,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.21945904371234456,
756
+ "grad_norm": 0.42222094535827637,
757
+ "learning_rate": 8.218911917098446e-05,
758
+ "loss": 1.2263,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.22151006281245994,
763
+ "grad_norm": 0.4098639488220215,
764
+ "learning_rate": 8.197322970639033e-05,
765
+ "loss": 1.2779,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.22356108191257532,
770
+ "grad_norm": 0.4205043315887451,
771
+ "learning_rate": 8.175734024179621e-05,
772
+ "loss": 1.2177,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.2256121010126907,
777
+ "grad_norm": 0.4501648247241974,
778
+ "learning_rate": 8.154145077720208e-05,
779
+ "loss": 1.3227,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.22766312011280604,
784
+ "grad_norm": 0.41510599851608276,
785
+ "learning_rate": 8.132556131260795e-05,
786
+ "loss": 1.3177,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.22971413921292141,
791
+ "grad_norm": 0.41567444801330566,
792
+ "learning_rate": 8.110967184801383e-05,
793
+ "loss": 1.2506,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.2317651583130368,
798
+ "grad_norm": 0.4262779653072357,
799
+ "learning_rate": 8.089378238341969e-05,
800
+ "loss": 1.2506,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.23381617741315217,
805
+ "grad_norm": 0.4220465421676636,
806
+ "learning_rate": 8.067789291882558e-05,
807
+ "loss": 1.2514,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.23586719651326754,
812
+ "grad_norm": 0.4169275462627411,
813
+ "learning_rate": 8.046200345423144e-05,
814
+ "loss": 1.2693,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.2379182156133829,
819
+ "grad_norm": 0.43145328760147095,
820
+ "learning_rate": 8.02461139896373e-05,
821
+ "loss": 1.2394,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.23996923471349826,
826
+ "grad_norm": 0.42889878153800964,
827
+ "learning_rate": 8.003022452504319e-05,
828
+ "loss": 1.248,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.24202025381361364,
833
+ "grad_norm": 0.41731464862823486,
834
+ "learning_rate": 7.981433506044905e-05,
835
+ "loss": 1.2498,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.24407127291372901,
840
+ "grad_norm": 0.4326362609863281,
841
+ "learning_rate": 7.959844559585493e-05,
842
+ "loss": 1.265,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.2461222920138444,
847
+ "grad_norm": 0.4242352843284607,
848
+ "learning_rate": 7.93825561312608e-05,
849
+ "loss": 1.2672,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.24817331111395974,
854
+ "grad_norm": 0.4441153407096863,
855
+ "learning_rate": 7.916666666666666e-05,
856
+ "loss": 1.2944,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.25022433021407514,
861
+ "grad_norm": 0.40912818908691406,
862
+ "learning_rate": 7.895077720207255e-05,
863
+ "loss": 1.2702,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.2522753493141905,
868
+ "grad_norm": 0.44539037346839905,
869
+ "learning_rate": 7.873488773747841e-05,
870
+ "loss": 1.2228,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.25432636841430584,
875
+ "grad_norm": 0.4299303889274597,
876
+ "learning_rate": 7.851899827288429e-05,
877
+ "loss": 1.2328,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.25637738751442124,
882
+ "grad_norm": 0.4408973455429077,
883
+ "learning_rate": 7.830310880829016e-05,
884
+ "loss": 1.2358,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.2584284066145366,
889
+ "grad_norm": 0.4100968837738037,
890
+ "learning_rate": 7.808721934369602e-05,
891
+ "loss": 1.2458,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.260479425714652,
896
+ "grad_norm": 0.4401489198207855,
897
+ "learning_rate": 7.787132987910191e-05,
898
+ "loss": 1.2593,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.26253044481476734,
903
+ "grad_norm": 0.4514229893684387,
904
+ "learning_rate": 7.765544041450777e-05,
905
+ "loss": 1.2632,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.2645814639148827,
910
+ "grad_norm": 0.38684791326522827,
911
+ "learning_rate": 7.743955094991365e-05,
912
+ "loss": 1.2424,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.2666324830149981,
917
+ "grad_norm": 0.46148189902305603,
918
+ "learning_rate": 7.722366148531953e-05,
919
+ "loss": 1.2445,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.26868350211511344,
924
+ "grad_norm": 0.4319213628768921,
925
+ "learning_rate": 7.700777202072539e-05,
926
+ "loss": 1.2253,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.27073452121522884,
931
+ "grad_norm": 0.4195545017719269,
932
+ "learning_rate": 7.679188255613126e-05,
933
+ "loss": 1.2578,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.2727855403153442,
938
+ "grad_norm": 0.43690159916877747,
939
+ "learning_rate": 7.657599309153714e-05,
940
+ "loss": 1.2573,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.27483655941545954,
945
+ "grad_norm": 0.44571492075920105,
946
+ "learning_rate": 7.636010362694301e-05,
947
+ "loss": 1.2607,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.27688757851557494,
952
+ "grad_norm": 0.43295958638191223,
953
+ "learning_rate": 7.614421416234889e-05,
954
+ "loss": 1.2278,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.2789385976156903,
959
+ "grad_norm": 0.44495707750320435,
960
+ "learning_rate": 7.592832469775475e-05,
961
+ "loss": 1.2798,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.2809896167158057,
966
+ "grad_norm": 0.4412330985069275,
967
+ "learning_rate": 7.571243523316062e-05,
968
+ "loss": 1.2501,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.28304063581592104,
973
+ "grad_norm": 0.44599953293800354,
974
+ "learning_rate": 7.54965457685665e-05,
975
+ "loss": 1.2396,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.2850916549160364,
980
+ "grad_norm": 0.447109580039978,
981
+ "learning_rate": 7.528065630397237e-05,
982
+ "loss": 1.2767,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.2871426740161518,
987
+ "grad_norm": 0.44506722688674927,
988
+ "learning_rate": 7.506476683937824e-05,
989
+ "loss": 1.2546,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.28919369311626714,
994
+ "grad_norm": 0.44061776995658875,
995
+ "learning_rate": 7.484887737478411e-05,
996
+ "loss": 1.2413,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.29124471221638254,
1001
+ "grad_norm": 0.45085111260414124,
1002
+ "learning_rate": 7.463298791018999e-05,
1003
+ "loss": 1.2483,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.2932957313164979,
1008
+ "grad_norm": 0.4437837600708008,
1009
+ "learning_rate": 7.441709844559586e-05,
1010
+ "loss": 1.252,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.29534675041661324,
1015
+ "grad_norm": 0.4294221103191376,
1016
+ "learning_rate": 7.420120898100174e-05,
1017
+ "loss": 1.2386,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.29739776951672864,
1022
+ "grad_norm": 0.4780830144882202,
1023
+ "learning_rate": 7.39853195164076e-05,
1024
+ "loss": 1.2639,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.299448788616844,
1029
+ "grad_norm": 0.44152942299842834,
1030
+ "learning_rate": 7.376943005181347e-05,
1031
+ "loss": 1.2756,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.3014998077169594,
1036
+ "grad_norm": 0.41989192366600037,
1037
+ "learning_rate": 7.355354058721935e-05,
1038
+ "loss": 1.2614,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.30355082681707474,
1043
+ "grad_norm": 0.5871754884719849,
1044
+ "learning_rate": 7.333765112262521e-05,
1045
+ "loss": 1.2615,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.3056018459171901,
1050
+ "grad_norm": 0.4467261731624603,
1051
+ "learning_rate": 7.31217616580311e-05,
1052
+ "loss": 1.2624,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.3076528650173055,
1057
+ "grad_norm": 0.49219033122062683,
1058
+ "learning_rate": 7.290587219343696e-05,
1059
+ "loss": 1.289,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.30970388411742084,
1064
+ "grad_norm": 0.4700734317302704,
1065
+ "learning_rate": 7.268998272884284e-05,
1066
+ "loss": 1.242,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.31175490321753624,
1071
+ "grad_norm": 0.4607170820236206,
1072
+ "learning_rate": 7.247409326424871e-05,
1073
+ "loss": 1.2554,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.3138059223176516,
1078
+ "grad_norm": 0.4335988759994507,
1079
+ "learning_rate": 7.225820379965457e-05,
1080
+ "loss": 1.2423,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.31585694141776693,
1085
+ "grad_norm": 0.4366897940635681,
1086
+ "learning_rate": 7.204231433506046e-05,
1087
+ "loss": 1.2219,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.31790796051788234,
1092
+ "grad_norm": 0.45856085419654846,
1093
+ "learning_rate": 7.182642487046632e-05,
1094
+ "loss": 1.2189,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.3199589796179977,
1099
+ "grad_norm": 0.4563063085079193,
1100
+ "learning_rate": 7.16105354058722e-05,
1101
+ "loss": 1.2696,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.3220099987181131,
1106
+ "grad_norm": 0.4276934862136841,
1107
+ "learning_rate": 7.139464594127807e-05,
1108
+ "loss": 1.2659,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.32406101781822844,
1113
+ "grad_norm": 0.46200886368751526,
1114
+ "learning_rate": 7.117875647668394e-05,
1115
+ "loss": 1.2261,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.3261120369183438,
1120
+ "grad_norm": 0.4863358736038208,
1121
+ "learning_rate": 7.096286701208982e-05,
1122
+ "loss": 1.2292,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.3281630560184592,
1127
+ "grad_norm": 0.4537160098552704,
1128
+ "learning_rate": 7.074697754749569e-05,
1129
+ "loss": 1.2453,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.33021407511857453,
1134
+ "grad_norm": 0.4507627487182617,
1135
+ "learning_rate": 7.053108808290155e-05,
1136
+ "loss": 1.2081,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.33226509421868994,
1141
+ "grad_norm": 0.43197301030158997,
1142
+ "learning_rate": 7.031519861830744e-05,
1143
+ "loss": 1.2757,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.3343161133188053,
1148
+ "grad_norm": 0.4551820456981659,
1149
+ "learning_rate": 7.00993091537133e-05,
1150
+ "loss": 1.2751,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.33636713241892063,
1155
+ "grad_norm": 0.45099398493766785,
1156
+ "learning_rate": 6.988341968911917e-05,
1157
+ "loss": 1.2583,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.33841815151903604,
1162
+ "grad_norm": 0.46787434816360474,
1163
+ "learning_rate": 6.966753022452505e-05,
1164
+ "loss": 1.2448,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.3404691706191514,
1169
+ "grad_norm": 0.45500054955482483,
1170
+ "learning_rate": 6.945164075993091e-05,
1171
+ "loss": 1.2394,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.3425201897192668,
1176
+ "grad_norm": 0.4682730436325073,
1177
+ "learning_rate": 6.92357512953368e-05,
1178
+ "loss": 1.2287,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.34457120881938214,
1183
+ "grad_norm": 0.4615074396133423,
1184
+ "learning_rate": 6.901986183074266e-05,
1185
+ "loss": 1.2042,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.3466222279194975,
1190
+ "grad_norm": 0.4548027217388153,
1191
+ "learning_rate": 6.880397236614854e-05,
1192
+ "loss": 1.2671,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.3486732470196129,
1197
+ "grad_norm": 0.4783169627189636,
1198
+ "learning_rate": 6.858808290155441e-05,
1199
+ "loss": 1.2533,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.35072426611972823,
1204
+ "grad_norm": 0.46452414989471436,
1205
+ "learning_rate": 6.837219343696027e-05,
1206
+ "loss": 1.2681,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.35277528521984364,
1211
+ "grad_norm": 0.4663463532924652,
1212
+ "learning_rate": 6.815630397236615e-05,
1213
+ "loss": 1.2561,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.354826304319959,
1218
+ "grad_norm": 0.46744370460510254,
1219
+ "learning_rate": 6.794041450777202e-05,
1220
+ "loss": 1.2453,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.35687732342007433,
1225
+ "grad_norm": 0.471835732460022,
1226
+ "learning_rate": 6.77245250431779e-05,
1227
+ "loss": 1.2472,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.35892834252018974,
1232
+ "grad_norm": 0.4618450701236725,
1233
+ "learning_rate": 6.750863557858377e-05,
1234
+ "loss": 1.2547,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.3609793616203051,
1239
+ "grad_norm": 0.4651658833026886,
1240
+ "learning_rate": 6.729274611398963e-05,
1241
+ "loss": 1.2623,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.36303038072042043,
1246
+ "grad_norm": 0.46842116117477417,
1247
+ "learning_rate": 6.707685664939551e-05,
1248
+ "loss": 1.2391,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.36508139982053583,
1253
+ "grad_norm": 0.45604613423347473,
1254
+ "learning_rate": 6.686096718480138e-05,
1255
+ "loss": 1.2884,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.3671324189206512,
1260
+ "grad_norm": 0.4306802451610565,
1261
+ "learning_rate": 6.664507772020726e-05,
1262
+ "loss": 1.2252,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.3691834380207666,
1267
+ "grad_norm": 0.4549136757850647,
1268
+ "learning_rate": 6.642918825561312e-05,
1269
+ "loss": 1.2496,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.37123445712088193,
1274
+ "grad_norm": 0.47443437576293945,
1275
+ "learning_rate": 6.6213298791019e-05,
1276
+ "loss": 1.2655,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.3732854762209973,
1281
+ "grad_norm": 0.46772050857543945,
1282
+ "learning_rate": 6.599740932642487e-05,
1283
+ "loss": 1.2366,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.3753364953211127,
1288
+ "grad_norm": 0.4691794216632843,
1289
+ "learning_rate": 6.578151986183075e-05,
1290
+ "loss": 1.2152,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.37738751442122803,
1295
+ "grad_norm": 0.43691304326057434,
1296
+ "learning_rate": 6.556563039723662e-05,
1297
+ "loss": 1.2511,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.37943853352134344,
1302
+ "grad_norm": 0.4595348536968231,
1303
+ "learning_rate": 6.534974093264248e-05,
1304
+ "loss": 1.2635,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.3814895526214588,
1309
+ "grad_norm": 0.44760558009147644,
1310
+ "learning_rate": 6.513385146804836e-05,
1311
+ "loss": 1.2342,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.38354057172157413,
1316
+ "grad_norm": 0.4559841454029083,
1317
+ "learning_rate": 6.491796200345423e-05,
1318
+ "loss": 1.2432,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.38559159082168953,
1323
+ "grad_norm": 0.4497215449810028,
1324
+ "learning_rate": 6.470207253886011e-05,
1325
+ "loss": 1.2267,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.3876426099218049,
1330
+ "grad_norm": 0.4863613247871399,
1331
+ "learning_rate": 6.448618307426598e-05,
1332
+ "loss": 1.254,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.3896936290219203,
1337
+ "grad_norm": 0.4500603675842285,
1338
+ "learning_rate": 6.427029360967185e-05,
1339
+ "loss": 1.2214,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.39174464812203563,
1344
+ "grad_norm": 0.4400598704814911,
1345
+ "learning_rate": 6.405440414507774e-05,
1346
+ "loss": 1.2352,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.393795667222151,
1351
+ "grad_norm": 0.46070367097854614,
1352
+ "learning_rate": 6.38385146804836e-05,
1353
+ "loss": 1.2468,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.3958466863222664,
1358
+ "grad_norm": 0.44312766194343567,
1359
+ "learning_rate": 6.362262521588946e-05,
1360
+ "loss": 1.1923,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.39789770542238173,
1365
+ "grad_norm": 0.5013573169708252,
1366
+ "learning_rate": 6.340673575129535e-05,
1367
+ "loss": 1.2361,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.39994872452249713,
1372
+ "grad_norm": 0.4884537160396576,
1373
+ "learning_rate": 6.319084628670121e-05,
1374
+ "loss": 1.2434,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.4019997436226125,
1379
+ "grad_norm": 0.46138620376586914,
1380
+ "learning_rate": 6.297495682210708e-05,
1381
+ "loss": 1.257,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.40405076272272783,
1386
+ "grad_norm": 0.4941729009151459,
1387
+ "learning_rate": 6.275906735751296e-05,
1388
+ "loss": 1.2347,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.40610178182284323,
1393
+ "grad_norm": 0.4675595760345459,
1394
+ "learning_rate": 6.254317789291882e-05,
1395
+ "loss": 1.2353,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.4081528009229586,
1400
+ "grad_norm": 0.47944632172584534,
1401
+ "learning_rate": 6.232728842832471e-05,
1402
+ "loss": 1.2643,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.410203820023074,
1407
+ "grad_norm": 0.4476461112499237,
1408
+ "learning_rate": 6.211139896373057e-05,
1409
+ "loss": 1.2558,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.41225483912318933,
1414
+ "grad_norm": 0.4706653654575348,
1415
+ "learning_rate": 6.189550949913645e-05,
1416
+ "loss": 1.227,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.4143058582233047,
1421
+ "grad_norm": 0.48062801361083984,
1422
+ "learning_rate": 6.167962003454232e-05,
1423
+ "loss": 1.2273,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.4163568773234201,
1428
+ "grad_norm": 0.46771204471588135,
1429
+ "learning_rate": 6.146373056994818e-05,
1430
+ "loss": 1.2268,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.41840789642353543,
1435
+ "grad_norm": 0.4725424647331238,
1436
+ "learning_rate": 6.124784110535406e-05,
1437
+ "loss": 1.2009,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.42045891552365083,
1442
+ "grad_norm": 0.47520384192466736,
1443
+ "learning_rate": 6.1031951640759934e-05,
1444
+ "loss": 1.2511,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.4225099346237662,
1449
+ "grad_norm": 0.44635480642318726,
1450
+ "learning_rate": 6.081606217616581e-05,
1451
+ "loss": 1.21,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.42456095372388153,
1456
+ "grad_norm": 0.47436651587486267,
1457
+ "learning_rate": 6.060017271157168e-05,
1458
+ "loss": 1.2116,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.42661197282399693,
1463
+ "grad_norm": 0.5115741491317749,
1464
+ "learning_rate": 6.0384283246977546e-05,
1465
+ "loss": 1.2778,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.4286629919241123,
1470
+ "grad_norm": 0.4488040506839752,
1471
+ "learning_rate": 6.016839378238343e-05,
1472
+ "loss": 1.2242,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.4307140110242277,
1477
+ "grad_norm": 0.4834796190261841,
1478
+ "learning_rate": 5.9952504317789296e-05,
1479
+ "loss": 1.2357,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.43276503012434303,
1484
+ "grad_norm": 0.45478227734565735,
1485
+ "learning_rate": 5.973661485319517e-05,
1486
+ "loss": 1.2233,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.4348160492244584,
1491
+ "grad_norm": 0.4539099633693695,
1492
+ "learning_rate": 5.952072538860104e-05,
1493
+ "loss": 1.2527,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.4368670683245738,
1498
+ "grad_norm": 0.47722533345222473,
1499
+ "learning_rate": 5.930483592400691e-05,
1500
+ "loss": 1.2015,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.43891808742468913,
1505
+ "grad_norm": 0.472023069858551,
1506
+ "learning_rate": 5.908894645941278e-05,
1507
+ "loss": 1.2222,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.44096910652480453,
1512
+ "grad_norm": 0.4648214876651764,
1513
+ "learning_rate": 5.887305699481865e-05,
1514
+ "loss": 1.2112,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.4430201256249199,
1519
+ "grad_norm": 0.48654377460479736,
1520
+ "learning_rate": 5.8657167530224534e-05,
1521
+ "loss": 1.227,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.44507114472503523,
1526
+ "grad_norm": 0.4997814893722534,
1527
+ "learning_rate": 5.84412780656304e-05,
1528
+ "loss": 1.2721,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.44712216382515063,
1533
+ "grad_norm": 0.47997352480888367,
1534
+ "learning_rate": 5.822538860103627e-05,
1535
+ "loss": 1.2018,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.449173182925266,
1540
+ "grad_norm": 0.4899247884750366,
1541
+ "learning_rate": 5.8009499136442146e-05,
1542
+ "loss": 1.2599,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.4512242020253814,
1547
+ "grad_norm": 0.4752749800682068,
1548
+ "learning_rate": 5.7793609671848014e-05,
1549
+ "loss": 1.2171,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.45327522112549673,
1554
+ "grad_norm": 0.4801314175128937,
1555
+ "learning_rate": 5.7577720207253896e-05,
1556
+ "loss": 1.2234,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.4553262402256121,
1561
+ "grad_norm": 0.4591893255710602,
1562
+ "learning_rate": 5.7361830742659764e-05,
1563
+ "loss": 1.2242,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.4573772593257275,
1568
+ "grad_norm": 0.46896713972091675,
1569
+ "learning_rate": 5.7145941278065626e-05,
1570
+ "loss": 1.2117,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.45942827842584283,
1575
+ "grad_norm": 0.4853857755661011,
1576
+ "learning_rate": 5.693005181347151e-05,
1577
+ "loss": 1.2218,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.46147929752595823,
1582
+ "grad_norm": 0.4648151993751526,
1583
+ "learning_rate": 5.6714162348877376e-05,
1584
+ "loss": 1.2401,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.4635303166260736,
1589
+ "grad_norm": 0.4839739501476288,
1590
+ "learning_rate": 5.649827288428325e-05,
1591
+ "loss": 1.1976,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.4655813357261889,
1596
+ "grad_norm": 0.4986715018749237,
1597
+ "learning_rate": 5.628238341968912e-05,
1598
+ "loss": 1.2274,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.46763235482630433,
1603
+ "grad_norm": 0.4636840522289276,
1604
+ "learning_rate": 5.606649395509499e-05,
1605
+ "loss": 1.236,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.4696833739264197,
1610
+ "grad_norm": 0.5011271834373474,
1611
+ "learning_rate": 5.585060449050087e-05,
1612
+ "loss": 1.2275,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.4717343930265351,
1617
+ "grad_norm": 0.4648337662220001,
1618
+ "learning_rate": 5.563471502590674e-05,
1619
+ "loss": 1.2457,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.47378541212665043,
1624
+ "grad_norm": 0.47708699107170105,
1625
+ "learning_rate": 5.5418825561312614e-05,
1626
+ "loss": 1.2316,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.4758364312267658,
1631
+ "grad_norm": 0.4954835772514343,
1632
+ "learning_rate": 5.520293609671848e-05,
1633
+ "loss": 1.229,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.4778874503268812,
1638
+ "grad_norm": 0.4701727330684662,
1639
+ "learning_rate": 5.498704663212435e-05,
1640
+ "loss": 1.248,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.47993846942699653,
1645
+ "grad_norm": 0.4796009957790375,
1646
+ "learning_rate": 5.477115716753023e-05,
1647
+ "loss": 1.2248,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.48198948852711193,
1652
+ "grad_norm": 0.4906330406665802,
1653
+ "learning_rate": 5.4555267702936094e-05,
1654
+ "loss": 1.2628,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.4840405076272273,
1659
+ "grad_norm": 0.47203144431114197,
1660
+ "learning_rate": 5.4339378238341976e-05,
1661
+ "loss": 1.2067,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.4860915267273426,
1666
+ "grad_norm": 0.503813624382019,
1667
+ "learning_rate": 5.4123488773747845e-05,
1668
+ "loss": 1.2006,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.48814254582745803,
1673
+ "grad_norm": 0.4918235242366791,
1674
+ "learning_rate": 5.390759930915371e-05,
1675
+ "loss": 1.1887,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.4901935649275734,
1680
+ "grad_norm": 0.4799112379550934,
1681
+ "learning_rate": 5.369170984455959e-05,
1682
+ "loss": 1.2079,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.4922445840276888,
1687
+ "grad_norm": 0.4769650101661682,
1688
+ "learning_rate": 5.347582037996546e-05,
1689
+ "loss": 1.1945,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.49429560312780413,
1694
+ "grad_norm": 0.5079638957977295,
1695
+ "learning_rate": 5.325993091537134e-05,
1696
+ "loss": 1.2294,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.4963466222279195,
1701
+ "grad_norm": 0.520418643951416,
1702
+ "learning_rate": 5.304404145077721e-05,
1703
+ "loss": 1.2308,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.4983976413280349,
1708
+ "grad_norm": 0.4546453058719635,
1709
+ "learning_rate": 5.2828151986183075e-05,
1710
+ "loss": 1.2206,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.5004486604281503,
1715
+ "grad_norm": 0.47760534286499023,
1716
+ "learning_rate": 5.261226252158895e-05,
1717
+ "loss": 1.208,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.5024996795282656,
1722
+ "grad_norm": 0.5267066955566406,
1723
+ "learning_rate": 5.239637305699482e-05,
1724
+ "loss": 1.2123,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.504550698628381,
1729
+ "grad_norm": 0.45763811469078064,
1730
+ "learning_rate": 5.2180483592400694e-05,
1731
+ "loss": 1.2159,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.5066017177284964,
1736
+ "grad_norm": 0.4922376871109009,
1737
+ "learning_rate": 5.196459412780656e-05,
1738
+ "loss": 1.2456,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.5086527368286117,
1743
+ "grad_norm": 0.47043368220329285,
1744
+ "learning_rate": 5.174870466321243e-05,
1745
+ "loss": 1.2052,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.5107037559287271,
1750
+ "grad_norm": 0.5082889795303345,
1751
+ "learning_rate": 5.153281519861831e-05,
1752
+ "loss": 1.2393,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.5127547750288425,
1757
+ "grad_norm": 0.4955206513404846,
1758
+ "learning_rate": 5.131692573402418e-05,
1759
+ "loss": 1.2323,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.5148057941289578,
1764
+ "grad_norm": 0.48625460267066956,
1765
+ "learning_rate": 5.1101036269430057e-05,
1766
+ "loss": 1.206,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.5168568132290732,
1771
+ "grad_norm": 0.49060237407684326,
1772
+ "learning_rate": 5.0885146804835925e-05,
1773
+ "loss": 1.2353,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.5189078323291886,
1778
+ "grad_norm": 0.46809640526771545,
1779
+ "learning_rate": 5.0669257340241793e-05,
1780
+ "loss": 1.2287,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.520958851429304,
1785
+ "grad_norm": 0.4944596290588379,
1786
+ "learning_rate": 5.0453367875647675e-05,
1787
+ "loss": 1.2413,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.5230098705294193,
1792
+ "grad_norm": 0.46914994716644287,
1793
+ "learning_rate": 5.023747841105354e-05,
1794
+ "loss": 1.22,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.5250608896295347,
1799
+ "grad_norm": 0.4888727366924286,
1800
+ "learning_rate": 5.002158894645942e-05,
1801
+ "loss": 1.2343,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.5271119087296501,
1806
+ "grad_norm": 0.4785778522491455,
1807
+ "learning_rate": 4.980569948186529e-05,
1808
+ "loss": 1.187,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.5291629278297654,
1813
+ "grad_norm": 0.4947550594806671,
1814
+ "learning_rate": 4.958981001727116e-05,
1815
+ "loss": 1.2288,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.5312139469298808,
1820
+ "grad_norm": 0.5263291597366333,
1821
+ "learning_rate": 4.937392055267703e-05,
1822
+ "loss": 1.2044,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.5332649660299962,
1827
+ "grad_norm": 0.49239382147789,
1828
+ "learning_rate": 4.9158031088082906e-05,
1829
+ "loss": 1.1865,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.5353159851301115,
1834
+ "grad_norm": 0.48874983191490173,
1835
+ "learning_rate": 4.8942141623488775e-05,
1836
+ "loss": 1.2672,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.5373670042302269,
1841
+ "grad_norm": 0.48474863171577454,
1842
+ "learning_rate": 4.872625215889465e-05,
1843
+ "loss": 1.2359,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.5394180233303423,
1848
+ "grad_norm": 0.4978977143764496,
1849
+ "learning_rate": 4.851036269430052e-05,
1850
+ "loss": 1.2139,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.5414690424304577,
1855
+ "grad_norm": 0.5144924521446228,
1856
+ "learning_rate": 4.829447322970639e-05,
1857
+ "loss": 1.221,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.543520061530573,
1862
+ "grad_norm": 0.5082759857177734,
1863
+ "learning_rate": 4.807858376511227e-05,
1864
+ "loss": 1.2209,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.5455710806306884,
1869
+ "grad_norm": 0.4933965504169464,
1870
+ "learning_rate": 4.786269430051814e-05,
1871
+ "loss": 1.207,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.5476220997308038,
1876
+ "grad_norm": 0.49464166164398193,
1877
+ "learning_rate": 4.7646804835924005e-05,
1878
+ "loss": 1.2398,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.5496731188309191,
1883
+ "grad_norm": 0.49377110600471497,
1884
+ "learning_rate": 4.743091537132988e-05,
1885
+ "loss": 1.2451,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.5517241379310345,
1890
+ "grad_norm": 0.5111104846000671,
1891
+ "learning_rate": 4.7215025906735756e-05,
1892
+ "loss": 1.2197,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.5537751570311499,
1897
+ "grad_norm": 0.47716042399406433,
1898
+ "learning_rate": 4.699913644214163e-05,
1899
+ "loss": 1.1891,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.5558261761312652,
1904
+ "grad_norm": 0.5081655383110046,
1905
+ "learning_rate": 4.678324697754749e-05,
1906
+ "loss": 1.2507,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.5578771952313806,
1911
+ "grad_norm": 0.49036547541618347,
1912
+ "learning_rate": 4.656735751295337e-05,
1913
+ "loss": 1.1805,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.559928214331496,
1918
+ "grad_norm": 0.5139365792274475,
1919
+ "learning_rate": 4.635146804835924e-05,
1920
+ "loss": 1.2361,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.5619792334316114,
1925
+ "grad_norm": 0.5098669528961182,
1926
+ "learning_rate": 4.613557858376512e-05,
1927
+ "loss": 1.2409,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.5640302525317267,
1932
+ "grad_norm": 0.4786950349807739,
1933
+ "learning_rate": 4.5919689119170986e-05,
1934
+ "loss": 1.2067,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.5660812716318421,
1939
+ "grad_norm": 0.5063204169273376,
1940
+ "learning_rate": 4.5703799654576855e-05,
1941
+ "loss": 1.1942,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.5681322907319575,
1946
+ "grad_norm": 0.511663556098938,
1947
+ "learning_rate": 4.548791018998273e-05,
1948
+ "loss": 1.2017,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.5701833098320728,
1953
+ "grad_norm": 0.48765748739242554,
1954
+ "learning_rate": 4.5272020725388605e-05,
1955
+ "loss": 1.222,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.5722343289321882,
1960
+ "grad_norm": 0.49707624316215515,
1961
+ "learning_rate": 4.5056131260794474e-05,
1962
+ "loss": 1.2075,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.5742853480323036,
1967
+ "grad_norm": 0.5067517757415771,
1968
+ "learning_rate": 4.484024179620035e-05,
1969
+ "loss": 1.211,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.5763363671324189,
1974
+ "grad_norm": 0.4615229368209839,
1975
+ "learning_rate": 4.462435233160622e-05,
1976
+ "loss": 1.2303,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.5783873862325343,
1981
+ "grad_norm": 0.4948524236679077,
1982
+ "learning_rate": 4.440846286701209e-05,
1983
+ "loss": 1.2024,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.5804384053326497,
1988
+ "grad_norm": 0.5140314102172852,
1989
+ "learning_rate": 4.419257340241796e-05,
1990
+ "loss": 1.2217,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.5824894244327651,
1995
+ "grad_norm": 0.5108122825622559,
1996
+ "learning_rate": 4.3976683937823836e-05,
1997
+ "loss": 1.1838,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.5845404435328804,
2002
+ "grad_norm": 0.5021159052848816,
2003
+ "learning_rate": 4.376079447322971e-05,
2004
+ "loss": 1.2418,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.5865914626329958,
2009
+ "grad_norm": 0.5086933374404907,
2010
+ "learning_rate": 4.354490500863558e-05,
2011
+ "loss": 1.2321,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.5886424817331112,
2016
+ "grad_norm": 0.5083547830581665,
2017
+ "learning_rate": 4.332901554404145e-05,
2018
+ "loss": 1.2035,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.5906935008332265,
2023
+ "grad_norm": 0.4828626215457916,
2024
+ "learning_rate": 4.311312607944732e-05,
2025
+ "loss": 1.2302,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.5927445199333419,
2030
+ "grad_norm": 0.5140969157218933,
2031
+ "learning_rate": 4.28972366148532e-05,
2032
+ "loss": 1.2058,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.5947955390334573,
2037
+ "grad_norm": 0.497364342212677,
2038
+ "learning_rate": 4.2681347150259074e-05,
2039
+ "loss": 1.2382,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.5968465581335726,
2044
+ "grad_norm": 0.49104997515678406,
2045
+ "learning_rate": 4.246545768566494e-05,
2046
+ "loss": 1.2322,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.598897577233688,
2051
+ "grad_norm": 0.521659255027771,
2052
+ "learning_rate": 4.224956822107081e-05,
2053
+ "loss": 1.1868,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.6009485963338034,
2058
+ "grad_norm": 0.5175550580024719,
2059
+ "learning_rate": 4.2033678756476686e-05,
2060
+ "loss": 1.2169,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.6029996154339188,
2065
+ "grad_norm": 0.4998300075531006,
2066
+ "learning_rate": 4.181778929188256e-05,
2067
+ "loss": 1.2227,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.6050506345340341,
2072
+ "grad_norm": 0.4932349622249603,
2073
+ "learning_rate": 4.160189982728843e-05,
2074
+ "loss": 1.2371,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.6071016536341495,
2079
+ "grad_norm": 0.5610498189926147,
2080
+ "learning_rate": 4.1386010362694304e-05,
2081
+ "loss": 1.2105,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.6091526727342649,
2086
+ "grad_norm": 0.4975990355014801,
2087
+ "learning_rate": 4.117012089810017e-05,
2088
+ "loss": 1.2511,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.6112036918343802,
2093
+ "grad_norm": 0.5154693722724915,
2094
+ "learning_rate": 4.095423143350605e-05,
2095
+ "loss": 1.2399,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.6132547109344956,
2100
+ "grad_norm": 0.4968002736568451,
2101
+ "learning_rate": 4.0738341968911916e-05,
2102
+ "loss": 1.2041,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.615305730034611,
2107
+ "grad_norm": 0.4866868555545807,
2108
+ "learning_rate": 4.052245250431779e-05,
2109
+ "loss": 1.1965,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.6173567491347263,
2114
+ "grad_norm": 0.5152925848960876,
2115
+ "learning_rate": 4.030656303972367e-05,
2116
+ "loss": 1.2298,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.6194077682348417,
2121
+ "grad_norm": 0.513058602809906,
2122
+ "learning_rate": 4.0090673575129535e-05,
2123
+ "loss": 1.2414,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.6214587873349571,
2128
+ "grad_norm": 0.5031930208206177,
2129
+ "learning_rate": 3.987478411053541e-05,
2130
+ "loss": 1.1766,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.6235098064350725,
2135
+ "grad_norm": 0.5087730288505554,
2136
+ "learning_rate": 3.965889464594128e-05,
2137
+ "loss": 1.229,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.6255608255351878,
2142
+ "grad_norm": 0.4878797233104706,
2143
+ "learning_rate": 3.9443005181347154e-05,
2144
+ "loss": 1.2018,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.6276118446353032,
2149
+ "grad_norm": 0.5124858617782593,
2150
+ "learning_rate": 3.922711571675303e-05,
2151
+ "loss": 1.1848,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.6296628637354186,
2156
+ "grad_norm": 0.49720969796180725,
2157
+ "learning_rate": 3.90112262521589e-05,
2158
+ "loss": 1.1892,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.6317138828355339,
2163
+ "grad_norm": 0.49900123476982117,
2164
+ "learning_rate": 3.8795336787564766e-05,
2165
+ "loss": 1.2027,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.6337649019356493,
2170
+ "grad_norm": 0.5007952451705933,
2171
+ "learning_rate": 3.857944732297064e-05,
2172
+ "loss": 1.2373,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.6358159210357647,
2177
+ "grad_norm": 0.49481576681137085,
2178
+ "learning_rate": 3.8363557858376516e-05,
2179
+ "loss": 1.2294,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.63786694013588,
2184
+ "grad_norm": 0.4979318082332611,
2185
+ "learning_rate": 3.8147668393782385e-05,
2186
+ "loss": 1.2312,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.6399179592359954,
2191
+ "grad_norm": 0.49939480423927307,
2192
+ "learning_rate": 3.793177892918825e-05,
2193
+ "loss": 1.2394,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.6419689783361108,
2198
+ "grad_norm": 0.5186517834663391,
2199
+ "learning_rate": 3.771588946459413e-05,
2200
+ "loss": 1.199,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.6440199974362262,
2205
+ "grad_norm": 0.5386569499969482,
2206
+ "learning_rate": 3.7500000000000003e-05,
2207
+ "loss": 1.1801,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.6460710165363415,
2212
+ "grad_norm": 0.5134577751159668,
2213
+ "learning_rate": 3.728411053540587e-05,
2214
+ "loss": 1.2286,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.6481220356364569,
2219
+ "grad_norm": 0.5191785097122192,
2220
+ "learning_rate": 3.706822107081175e-05,
2221
+ "loss": 1.2068,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.6501730547365723,
2226
+ "grad_norm": 0.4857168197631836,
2227
+ "learning_rate": 3.6852331606217615e-05,
2228
+ "loss": 1.2116,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.6522240738366876,
2233
+ "grad_norm": 0.5283413529396057,
2234
+ "learning_rate": 3.663644214162349e-05,
2235
+ "loss": 1.1792,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.654275092936803,
2240
+ "grad_norm": 0.528938353061676,
2241
+ "learning_rate": 3.6420552677029366e-05,
2242
+ "loss": 1.1963,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.6563261120369184,
2247
+ "grad_norm": 0.5067134499549866,
2248
+ "learning_rate": 3.6204663212435234e-05,
2249
+ "loss": 1.2476,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 0.6583771311370337,
2254
+ "grad_norm": 0.4993511736392975,
2255
+ "learning_rate": 3.598877374784111e-05,
2256
+ "loss": 1.2273,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 0.6604281502371491,
2261
+ "grad_norm": 0.5275943279266357,
2262
+ "learning_rate": 3.577288428324698e-05,
2263
+ "loss": 1.2287,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 0.6624791693372645,
2268
+ "grad_norm": 0.49331194162368774,
2269
+ "learning_rate": 3.555699481865285e-05,
2270
+ "loss": 1.1794,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 0.6645301884373799,
2275
+ "grad_norm": 0.5065453052520752,
2276
+ "learning_rate": 3.534110535405872e-05,
2277
+ "loss": 1.2342,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 0.6665812075374952,
2282
+ "grad_norm": 0.5334459543228149,
2283
+ "learning_rate": 3.51252158894646e-05,
2284
+ "loss": 1.1782,
2285
+ "step": 3250
2286
+ },
2287
+ {
2288
+ "epoch": 0.6686322266376106,
2289
+ "grad_norm": 0.535772979259491,
2290
+ "learning_rate": 3.490932642487047e-05,
2291
+ "loss": 1.2108,
2292
+ "step": 3260
2293
+ },
2294
+ {
2295
+ "epoch": 0.670683245737726,
2296
+ "grad_norm": 0.5377807021141052,
2297
+ "learning_rate": 3.469343696027634e-05,
2298
+ "loss": 1.1903,
2299
+ "step": 3270
2300
+ },
2301
+ {
2302
+ "epoch": 0.6727342648378413,
2303
+ "grad_norm": 0.5266278386116028,
2304
+ "learning_rate": 3.447754749568221e-05,
2305
+ "loss": 1.2183,
2306
+ "step": 3280
2307
+ },
2308
+ {
2309
+ "epoch": 0.6747852839379567,
2310
+ "grad_norm": 0.4987232983112335,
2311
+ "learning_rate": 3.4261658031088084e-05,
2312
+ "loss": 1.1915,
2313
+ "step": 3290
2314
+ },
2315
+ {
2316
+ "epoch": 0.6768363030380721,
2317
+ "grad_norm": 0.5178554058074951,
2318
+ "learning_rate": 3.404576856649396e-05,
2319
+ "loss": 1.179,
2320
+ "step": 3300
2321
+ },
2322
+ {
2323
+ "epoch": 0.6788873221381874,
2324
+ "grad_norm": 0.5086014270782471,
2325
+ "learning_rate": 3.382987910189983e-05,
2326
+ "loss": 1.2298,
2327
+ "step": 3310
2328
+ },
2329
+ {
2330
+ "epoch": 0.6809383412383028,
2331
+ "grad_norm": 0.5420427918434143,
2332
+ "learning_rate": 3.3613989637305696e-05,
2333
+ "loss": 1.2072,
2334
+ "step": 3320
2335
+ },
2336
+ {
2337
+ "epoch": 0.6829893603384182,
2338
+ "grad_norm": 0.5170331001281738,
2339
+ "learning_rate": 3.339810017271157e-05,
2340
+ "loss": 1.2252,
2341
+ "step": 3330
2342
+ },
2343
+ {
2344
+ "epoch": 0.6850403794385336,
2345
+ "grad_norm": 0.48680609464645386,
2346
+ "learning_rate": 3.3182210708117446e-05,
2347
+ "loss": 1.2059,
2348
+ "step": 3340
2349
+ },
2350
+ {
2351
+ "epoch": 0.6870913985386489,
2352
+ "grad_norm": 0.5035340189933777,
2353
+ "learning_rate": 3.296632124352332e-05,
2354
+ "loss": 1.2009,
2355
+ "step": 3350
2356
+ },
2357
+ {
2358
+ "epoch": 0.6891424176387643,
2359
+ "grad_norm": 0.513165295124054,
2360
+ "learning_rate": 3.275043177892919e-05,
2361
+ "loss": 1.1844,
2362
+ "step": 3360
2363
+ },
2364
+ {
2365
+ "epoch": 0.6911934367388797,
2366
+ "grad_norm": 0.5243003368377686,
2367
+ "learning_rate": 3.2534542314335065e-05,
2368
+ "loss": 1.2009,
2369
+ "step": 3370
2370
+ },
2371
+ {
2372
+ "epoch": 0.693244455838995,
2373
+ "grad_norm": 0.5219825506210327,
2374
+ "learning_rate": 3.2318652849740933e-05,
2375
+ "loss": 1.2039,
2376
+ "step": 3380
2377
+ },
2378
+ {
2379
+ "epoch": 0.6952954749391104,
2380
+ "grad_norm": 0.5202507972717285,
2381
+ "learning_rate": 3.210276338514681e-05,
2382
+ "loss": 1.225,
2383
+ "step": 3390
2384
+ },
2385
+ {
2386
+ "epoch": 0.6973464940392258,
2387
+ "grad_norm": 0.5152229070663452,
2388
+ "learning_rate": 3.188687392055268e-05,
2389
+ "loss": 1.1886,
2390
+ "step": 3400
2391
+ },
2392
+ {
2393
+ "epoch": 0.6993975131393411,
2394
+ "grad_norm": 0.5382890701293945,
2395
+ "learning_rate": 3.167098445595855e-05,
2396
+ "loss": 1.2113,
2397
+ "step": 3410
2398
+ },
2399
+ {
2400
+ "epoch": 0.7014485322394565,
2401
+ "grad_norm": 0.5525237917900085,
2402
+ "learning_rate": 3.145509499136443e-05,
2403
+ "loss": 1.2283,
2404
+ "step": 3420
2405
+ },
2406
+ {
2407
+ "epoch": 0.7034995513395719,
2408
+ "grad_norm": 0.5308887958526611,
2409
+ "learning_rate": 3.1239205526770296e-05,
2410
+ "loss": 1.2311,
2411
+ "step": 3430
2412
+ },
2413
+ {
2414
+ "epoch": 0.7055505704396873,
2415
+ "grad_norm": 0.5247687697410583,
2416
+ "learning_rate": 3.1023316062176164e-05,
2417
+ "loss": 1.1946,
2418
+ "step": 3440
2419
+ },
2420
+ {
2421
+ "epoch": 0.7076015895398026,
2422
+ "grad_norm": 0.5322206616401672,
2423
+ "learning_rate": 3.080742659758204e-05,
2424
+ "loss": 1.2198,
2425
+ "step": 3450
2426
+ },
2427
+ {
2428
+ "epoch": 0.709652608639918,
2429
+ "grad_norm": 0.5104162693023682,
2430
+ "learning_rate": 3.0591537132987915e-05,
2431
+ "loss": 1.2105,
2432
+ "step": 3460
2433
+ },
2434
+ {
2435
+ "epoch": 0.7117036277400334,
2436
+ "grad_norm": 0.4890803098678589,
2437
+ "learning_rate": 3.0375647668393786e-05,
2438
+ "loss": 1.2074,
2439
+ "step": 3470
2440
+ },
2441
+ {
2442
+ "epoch": 0.7137546468401487,
2443
+ "grad_norm": 0.529225766658783,
2444
+ "learning_rate": 3.0159758203799655e-05,
2445
+ "loss": 1.2321,
2446
+ "step": 3480
2447
+ },
2448
+ {
2449
+ "epoch": 0.7158056659402641,
2450
+ "grad_norm": 0.5252069234848022,
2451
+ "learning_rate": 2.9943868739205527e-05,
2452
+ "loss": 1.1995,
2453
+ "step": 3490
2454
+ },
2455
+ {
2456
+ "epoch": 0.7178566850403795,
2457
+ "grad_norm": 0.5369967818260193,
2458
+ "learning_rate": 2.9727979274611402e-05,
2459
+ "loss": 1.2234,
2460
+ "step": 3500
2461
+ },
2462
+ {
2463
+ "epoch": 0.7199077041404948,
2464
+ "grad_norm": 0.5053485631942749,
2465
+ "learning_rate": 2.9512089810017274e-05,
2466
+ "loss": 1.2035,
2467
+ "step": 3510
2468
+ },
2469
+ {
2470
+ "epoch": 0.7219587232406102,
2471
+ "grad_norm": 0.5131696462631226,
2472
+ "learning_rate": 2.929620034542315e-05,
2473
+ "loss": 1.2681,
2474
+ "step": 3520
2475
+ },
2476
+ {
2477
+ "epoch": 0.7240097423407256,
2478
+ "grad_norm": 0.5332499742507935,
2479
+ "learning_rate": 2.9080310880829014e-05,
2480
+ "loss": 1.2039,
2481
+ "step": 3530
2482
+ },
2483
+ {
2484
+ "epoch": 0.7260607614408409,
2485
+ "grad_norm": 0.5105617046356201,
2486
+ "learning_rate": 2.886442141623489e-05,
2487
+ "loss": 1.2,
2488
+ "step": 3540
2489
+ },
2490
+ {
2491
+ "epoch": 0.7281117805409563,
2492
+ "grad_norm": 0.5197264552116394,
2493
+ "learning_rate": 2.864853195164076e-05,
2494
+ "loss": 1.1821,
2495
+ "step": 3550
2496
+ },
2497
+ {
2498
+ "epoch": 0.7301627996410717,
2499
+ "grad_norm": 0.505455493927002,
2500
+ "learning_rate": 2.8432642487046636e-05,
2501
+ "loss": 1.2158,
2502
+ "step": 3560
2503
+ },
2504
+ {
2505
+ "epoch": 0.7322138187411871,
2506
+ "grad_norm": 0.5290804505348206,
2507
+ "learning_rate": 2.8216753022452508e-05,
2508
+ "loss": 1.174,
2509
+ "step": 3570
2510
+ },
2511
+ {
2512
+ "epoch": 0.7342648378413024,
2513
+ "grad_norm": 0.5349313020706177,
2514
+ "learning_rate": 2.8000863557858376e-05,
2515
+ "loss": 1.2301,
2516
+ "step": 3580
2517
+ },
2518
+ {
2519
+ "epoch": 0.7363158569414178,
2520
+ "grad_norm": 0.4875812530517578,
2521
+ "learning_rate": 2.7784974093264248e-05,
2522
+ "loss": 1.2015,
2523
+ "step": 3590
2524
+ },
2525
+ {
2526
+ "epoch": 0.7383668760415332,
2527
+ "grad_norm": 0.5164597630500793,
2528
+ "learning_rate": 2.7569084628670123e-05,
2529
+ "loss": 1.2294,
2530
+ "step": 3600
2531
+ },
2532
+ {
2533
+ "epoch": 0.7404178951416485,
2534
+ "grad_norm": 0.5129172801971436,
2535
+ "learning_rate": 2.7353195164075995e-05,
2536
+ "loss": 1.2122,
2537
+ "step": 3610
2538
+ },
2539
+ {
2540
+ "epoch": 0.7424689142417639,
2541
+ "grad_norm": 0.5218586921691895,
2542
+ "learning_rate": 2.713730569948187e-05,
2543
+ "loss": 1.2002,
2544
+ "step": 3620
2545
+ },
2546
+ {
2547
+ "epoch": 0.7445199333418793,
2548
+ "grad_norm": 0.5423296093940735,
2549
+ "learning_rate": 2.6921416234887735e-05,
2550
+ "loss": 1.1685,
2551
+ "step": 3630
2552
+ },
2553
+ {
2554
+ "epoch": 0.7465709524419946,
2555
+ "grad_norm": 0.5151218771934509,
2556
+ "learning_rate": 2.670552677029361e-05,
2557
+ "loss": 1.2167,
2558
+ "step": 3640
2559
+ },
2560
+ {
2561
+ "epoch": 0.74862197154211,
2562
+ "grad_norm": 0.5160235166549683,
2563
+ "learning_rate": 2.6489637305699482e-05,
2564
+ "loss": 1.2269,
2565
+ "step": 3650
2566
+ },
2567
+ {
2568
+ "epoch": 0.7506729906422254,
2569
+ "grad_norm": 0.5056514143943787,
2570
+ "learning_rate": 2.6273747841105357e-05,
2571
+ "loss": 1.2467,
2572
+ "step": 3660
2573
+ },
2574
+ {
2575
+ "epoch": 0.7527240097423408,
2576
+ "grad_norm": 0.52911776304245,
2577
+ "learning_rate": 2.605785837651123e-05,
2578
+ "loss": 1.2182,
2579
+ "step": 3670
2580
+ },
2581
+ {
2582
+ "epoch": 0.7547750288424561,
2583
+ "grad_norm": 0.5172019600868225,
2584
+ "learning_rate": 2.5841968911917097e-05,
2585
+ "loss": 1.1888,
2586
+ "step": 3680
2587
+ },
2588
+ {
2589
+ "epoch": 0.7568260479425715,
2590
+ "grad_norm": 0.5043123960494995,
2591
+ "learning_rate": 2.562607944732297e-05,
2592
+ "loss": 1.2004,
2593
+ "step": 3690
2594
+ },
2595
+ {
2596
+ "epoch": 0.7588770670426869,
2597
+ "grad_norm": 0.5103533267974854,
2598
+ "learning_rate": 2.5410189982728844e-05,
2599
+ "loss": 1.1627,
2600
+ "step": 3700
2601
+ },
2602
+ {
2603
+ "epoch": 0.7609280861428022,
2604
+ "grad_norm": 0.5295760631561279,
2605
+ "learning_rate": 2.5194300518134716e-05,
2606
+ "loss": 1.1604,
2607
+ "step": 3710
2608
+ },
2609
+ {
2610
+ "epoch": 0.7629791052429176,
2611
+ "grad_norm": 0.5427724719047546,
2612
+ "learning_rate": 2.4978411053540588e-05,
2613
+ "loss": 1.1781,
2614
+ "step": 3720
2615
+ },
2616
+ {
2617
+ "epoch": 0.765030124343033,
2618
+ "grad_norm": 0.5164818167686462,
2619
+ "learning_rate": 2.476252158894646e-05,
2620
+ "loss": 1.2208,
2621
+ "step": 3730
2622
+ },
2623
+ {
2624
+ "epoch": 0.7670811434431483,
2625
+ "grad_norm": 0.5196744799613953,
2626
+ "learning_rate": 2.4546632124352335e-05,
2627
+ "loss": 1.1971,
2628
+ "step": 3740
2629
+ },
2630
+ {
2631
+ "epoch": 0.7691321625432637,
2632
+ "grad_norm": 0.5128475427627563,
2633
+ "learning_rate": 2.4330742659758203e-05,
2634
+ "loss": 1.1909,
2635
+ "step": 3750
2636
+ },
2637
+ {
2638
+ "epoch": 0.7711831816433791,
2639
+ "grad_norm": 0.49743902683258057,
2640
+ "learning_rate": 2.411485319516408e-05,
2641
+ "loss": 1.2109,
2642
+ "step": 3760
2643
+ },
2644
+ {
2645
+ "epoch": 0.7732342007434945,
2646
+ "grad_norm": 0.5152381658554077,
2647
+ "learning_rate": 2.3898963730569947e-05,
2648
+ "loss": 1.2228,
2649
+ "step": 3770
2650
+ },
2651
+ {
2652
+ "epoch": 0.7752852198436098,
2653
+ "grad_norm": 0.5446299910545349,
2654
+ "learning_rate": 2.3683074265975822e-05,
2655
+ "loss": 1.1953,
2656
+ "step": 3780
2657
+ },
2658
+ {
2659
+ "epoch": 0.7773362389437252,
2660
+ "grad_norm": 0.5300847291946411,
2661
+ "learning_rate": 2.3467184801381694e-05,
2662
+ "loss": 1.1843,
2663
+ "step": 3790
2664
+ },
2665
+ {
2666
+ "epoch": 0.7793872580438406,
2667
+ "grad_norm": 0.5129801630973816,
2668
+ "learning_rate": 2.3251295336787566e-05,
2669
+ "loss": 1.1809,
2670
+ "step": 3800
2671
+ },
2672
+ {
2673
+ "epoch": 0.7814382771439559,
2674
+ "grad_norm": 0.549198567867279,
2675
+ "learning_rate": 2.3035405872193438e-05,
2676
+ "loss": 1.2099,
2677
+ "step": 3810
2678
+ },
2679
+ {
2680
+ "epoch": 0.7834892962440713,
2681
+ "grad_norm": 0.5118544101715088,
2682
+ "learning_rate": 2.281951640759931e-05,
2683
+ "loss": 1.2149,
2684
+ "step": 3820
2685
+ },
2686
+ {
2687
+ "epoch": 0.7855403153441867,
2688
+ "grad_norm": 0.5479713082313538,
2689
+ "learning_rate": 2.260362694300518e-05,
2690
+ "loss": 1.1771,
2691
+ "step": 3830
2692
+ },
2693
+ {
2694
+ "epoch": 0.787591334444302,
2695
+ "grad_norm": 0.541350245475769,
2696
+ "learning_rate": 2.2387737478411056e-05,
2697
+ "loss": 1.1737,
2698
+ "step": 3840
2699
+ },
2700
+ {
2701
+ "epoch": 0.7896423535444174,
2702
+ "grad_norm": 0.5543351769447327,
2703
+ "learning_rate": 2.2171848013816925e-05,
2704
+ "loss": 1.2233,
2705
+ "step": 3850
2706
+ },
2707
+ {
2708
+ "epoch": 0.7916933726445328,
2709
+ "grad_norm": 0.5010188817977905,
2710
+ "learning_rate": 2.19559585492228e-05,
2711
+ "loss": 1.1938,
2712
+ "step": 3860
2713
+ },
2714
+ {
2715
+ "epoch": 0.7937443917446482,
2716
+ "grad_norm": 0.5245205760002136,
2717
+ "learning_rate": 2.1740069084628672e-05,
2718
+ "loss": 1.2015,
2719
+ "step": 3870
2720
+ },
2721
+ {
2722
+ "epoch": 0.7957954108447635,
2723
+ "grad_norm": 0.5324139595031738,
2724
+ "learning_rate": 2.1524179620034544e-05,
2725
+ "loss": 1.2248,
2726
+ "step": 3880
2727
+ },
2728
+ {
2729
+ "epoch": 0.7978464299448789,
2730
+ "grad_norm": 0.5172831416130066,
2731
+ "learning_rate": 2.1308290155440415e-05,
2732
+ "loss": 1.1992,
2733
+ "step": 3890
2734
+ },
2735
+ {
2736
+ "epoch": 0.7998974490449943,
2737
+ "grad_norm": 0.5434138178825378,
2738
+ "learning_rate": 2.1092400690846287e-05,
2739
+ "loss": 1.1813,
2740
+ "step": 3900
2741
+ },
2742
+ {
2743
+ "epoch": 0.8019484681451096,
2744
+ "grad_norm": 0.5221844911575317,
2745
+ "learning_rate": 2.087651122625216e-05,
2746
+ "loss": 1.1625,
2747
+ "step": 3910
2748
+ },
2749
+ {
2750
+ "epoch": 0.803999487245225,
2751
+ "grad_norm": 0.5027469992637634,
2752
+ "learning_rate": 2.0660621761658034e-05,
2753
+ "loss": 1.181,
2754
+ "step": 3920
2755
+ },
2756
+ {
2757
+ "epoch": 0.8060505063453404,
2758
+ "grad_norm": 0.5298044085502625,
2759
+ "learning_rate": 2.0444732297063903e-05,
2760
+ "loss": 1.2079,
2761
+ "step": 3930
2762
+ },
2763
+ {
2764
+ "epoch": 0.8081015254454557,
2765
+ "grad_norm": 0.5463908910751343,
2766
+ "learning_rate": 2.0228842832469778e-05,
2767
+ "loss": 1.2009,
2768
+ "step": 3940
2769
+ },
2770
+ {
2771
+ "epoch": 0.8101525445455711,
2772
+ "grad_norm": 0.5394027233123779,
2773
+ "learning_rate": 2.0012953367875646e-05,
2774
+ "loss": 1.1931,
2775
+ "step": 3950
2776
+ },
2777
+ {
2778
+ "epoch": 0.8122035636456865,
2779
+ "grad_norm": 0.5041294097900391,
2780
+ "learning_rate": 1.979706390328152e-05,
2781
+ "loss": 1.2107,
2782
+ "step": 3960
2783
+ },
2784
+ {
2785
+ "epoch": 0.8142545827458019,
2786
+ "grad_norm": 0.5223291516304016,
2787
+ "learning_rate": 1.9581174438687393e-05,
2788
+ "loss": 1.1775,
2789
+ "step": 3970
2790
+ },
2791
+ {
2792
+ "epoch": 0.8163056018459172,
2793
+ "grad_norm": 0.5221052169799805,
2794
+ "learning_rate": 1.9365284974093265e-05,
2795
+ "loss": 1.2052,
2796
+ "step": 3980
2797
+ },
2798
+ {
2799
+ "epoch": 0.8183566209460326,
2800
+ "grad_norm": 0.5229529738426208,
2801
+ "learning_rate": 1.9149395509499137e-05,
2802
+ "loss": 1.1922,
2803
+ "step": 3990
2804
+ },
2805
+ {
2806
+ "epoch": 0.820407640046148,
2807
+ "grad_norm": 0.5651980042457581,
2808
+ "learning_rate": 1.893350604490501e-05,
2809
+ "loss": 1.2043,
2810
+ "step": 4000
2811
+ },
2812
+ {
2813
+ "epoch": 0.8224586591462633,
2814
+ "grad_norm": 0.5169751644134521,
2815
+ "learning_rate": 1.871761658031088e-05,
2816
+ "loss": 1.2157,
2817
+ "step": 4010
2818
+ },
2819
+ {
2820
+ "epoch": 0.8245096782463787,
2821
+ "grad_norm": 0.5741276144981384,
2822
+ "learning_rate": 1.8501727115716755e-05,
2823
+ "loss": 1.2112,
2824
+ "step": 4020
2825
+ },
2826
+ {
2827
+ "epoch": 0.8265606973464941,
2828
+ "grad_norm": 0.530596137046814,
2829
+ "learning_rate": 1.8285837651122624e-05,
2830
+ "loss": 1.2535,
2831
+ "step": 4030
2832
+ },
2833
+ {
2834
+ "epoch": 0.8286117164466094,
2835
+ "grad_norm": 0.5436383485794067,
2836
+ "learning_rate": 1.80699481865285e-05,
2837
+ "loss": 1.1789,
2838
+ "step": 4040
2839
+ },
2840
+ {
2841
+ "epoch": 0.8306627355467248,
2842
+ "grad_norm": 0.5238965749740601,
2843
+ "learning_rate": 1.7854058721934368e-05,
2844
+ "loss": 1.1645,
2845
+ "step": 4050
2846
+ },
2847
+ {
2848
+ "epoch": 0.8327137546468402,
2849
+ "grad_norm": 0.5226778388023376,
2850
+ "learning_rate": 1.7638169257340243e-05,
2851
+ "loss": 1.2238,
2852
+ "step": 4060
2853
+ },
2854
+ {
2855
+ "epoch": 0.8347647737469556,
2856
+ "grad_norm": 0.5810254812240601,
2857
+ "learning_rate": 1.7422279792746114e-05,
2858
+ "loss": 1.2212,
2859
+ "step": 4070
2860
+ },
2861
+ {
2862
+ "epoch": 0.8368157928470709,
2863
+ "grad_norm": 0.5228540301322937,
2864
+ "learning_rate": 1.7206390328151986e-05,
2865
+ "loss": 1.2025,
2866
+ "step": 4080
2867
+ },
2868
+ {
2869
+ "epoch": 0.8388668119471863,
2870
+ "grad_norm": 0.5112829804420471,
2871
+ "learning_rate": 1.6990500863557858e-05,
2872
+ "loss": 1.1838,
2873
+ "step": 4090
2874
+ },
2875
+ {
2876
+ "epoch": 0.8409178310473017,
2877
+ "grad_norm": 0.5092179775238037,
2878
+ "learning_rate": 1.6774611398963733e-05,
2879
+ "loss": 1.1981,
2880
+ "step": 4100
2881
+ },
2882
+ {
2883
+ "epoch": 0.842968850147417,
2884
+ "grad_norm": 0.5236721634864807,
2885
+ "learning_rate": 1.65587219343696e-05,
2886
+ "loss": 1.1994,
2887
+ "step": 4110
2888
+ },
2889
+ {
2890
+ "epoch": 0.8450198692475324,
2891
+ "grad_norm": 0.5067551732063293,
2892
+ "learning_rate": 1.6342832469775477e-05,
2893
+ "loss": 1.1758,
2894
+ "step": 4120
2895
+ },
2896
+ {
2897
+ "epoch": 0.8470708883476478,
2898
+ "grad_norm": 0.5471055507659912,
2899
+ "learning_rate": 1.6126943005181345e-05,
2900
+ "loss": 1.2315,
2901
+ "step": 4130
2902
+ },
2903
+ {
2904
+ "epoch": 0.8491219074477631,
2905
+ "grad_norm": 0.514798641204834,
2906
+ "learning_rate": 1.591105354058722e-05,
2907
+ "loss": 1.183,
2908
+ "step": 4140
2909
+ },
2910
+ {
2911
+ "epoch": 0.8511729265478785,
2912
+ "grad_norm": 0.5316623449325562,
2913
+ "learning_rate": 1.5695164075993092e-05,
2914
+ "loss": 1.1997,
2915
+ "step": 4150
2916
+ },
2917
+ {
2918
+ "epoch": 0.8532239456479939,
2919
+ "grad_norm": 0.531896710395813,
2920
+ "learning_rate": 1.5479274611398964e-05,
2921
+ "loss": 1.1967,
2922
+ "step": 4160
2923
+ },
2924
+ {
2925
+ "epoch": 0.8552749647481093,
2926
+ "grad_norm": 0.5044012665748596,
2927
+ "learning_rate": 1.5263385146804836e-05,
2928
+ "loss": 1.2061,
2929
+ "step": 4170
2930
+ },
2931
+ {
2932
+ "epoch": 0.8573259838482246,
2933
+ "grad_norm": 0.547264039516449,
2934
+ "learning_rate": 1.5047495682210708e-05,
2935
+ "loss": 1.1975,
2936
+ "step": 4180
2937
+ },
2938
+ {
2939
+ "epoch": 0.85937700294834,
2940
+ "grad_norm": 0.5514972805976868,
2941
+ "learning_rate": 1.4831606217616581e-05,
2942
+ "loss": 1.2044,
2943
+ "step": 4190
2944
+ },
2945
+ {
2946
+ "epoch": 0.8614280220484554,
2947
+ "grad_norm": 0.5322652459144592,
2948
+ "learning_rate": 1.4615716753022455e-05,
2949
+ "loss": 1.2044,
2950
+ "step": 4200
2951
+ },
2952
+ {
2953
+ "epoch": 0.8634790411485707,
2954
+ "grad_norm": 0.5309359431266785,
2955
+ "learning_rate": 1.4399827288428325e-05,
2956
+ "loss": 1.2066,
2957
+ "step": 4210
2958
+ },
2959
+ {
2960
+ "epoch": 0.8655300602486861,
2961
+ "grad_norm": 0.5314792394638062,
2962
+ "learning_rate": 1.4183937823834198e-05,
2963
+ "loss": 1.2006,
2964
+ "step": 4220
2965
+ },
2966
+ {
2967
+ "epoch": 0.8675810793488015,
2968
+ "grad_norm": 0.5549922585487366,
2969
+ "learning_rate": 1.3968048359240068e-05,
2970
+ "loss": 1.2058,
2971
+ "step": 4230
2972
+ },
2973
+ {
2974
+ "epoch": 0.8696320984489168,
2975
+ "grad_norm": 0.5373049378395081,
2976
+ "learning_rate": 1.3752158894645942e-05,
2977
+ "loss": 1.2002,
2978
+ "step": 4240
2979
+ },
2980
+ {
2981
+ "epoch": 0.8716831175490322,
2982
+ "grad_norm": 0.5322666764259338,
2983
+ "learning_rate": 1.3536269430051815e-05,
2984
+ "loss": 1.215,
2985
+ "step": 4250
2986
+ },
2987
+ {
2988
+ "epoch": 0.8737341366491476,
2989
+ "grad_norm": 0.5549564957618713,
2990
+ "learning_rate": 1.3320379965457685e-05,
2991
+ "loss": 1.2131,
2992
+ "step": 4260
2993
+ },
2994
+ {
2995
+ "epoch": 0.875785155749263,
2996
+ "grad_norm": 0.5308319926261902,
2997
+ "learning_rate": 1.3104490500863559e-05,
2998
+ "loss": 1.2203,
2999
+ "step": 4270
3000
+ },
3001
+ {
3002
+ "epoch": 0.8778361748493783,
3003
+ "grad_norm": 0.5089017152786255,
3004
+ "learning_rate": 1.2888601036269432e-05,
3005
+ "loss": 1.1801,
3006
+ "step": 4280
3007
+ },
3008
+ {
3009
+ "epoch": 0.8798871939494937,
3010
+ "grad_norm": 0.5377966165542603,
3011
+ "learning_rate": 1.2672711571675302e-05,
3012
+ "loss": 1.189,
3013
+ "step": 4290
3014
+ },
3015
+ {
3016
+ "epoch": 0.8819382130496091,
3017
+ "grad_norm": 0.5528485178947449,
3018
+ "learning_rate": 1.2456822107081174e-05,
3019
+ "loss": 1.2197,
3020
+ "step": 4300
3021
+ },
3022
+ {
3023
+ "epoch": 0.8839892321497244,
3024
+ "grad_norm": 0.5241679549217224,
3025
+ "learning_rate": 1.2240932642487048e-05,
3026
+ "loss": 1.1652,
3027
+ "step": 4310
3028
+ },
3029
+ {
3030
+ "epoch": 0.8860402512498398,
3031
+ "grad_norm": 0.5626764893531799,
3032
+ "learning_rate": 1.202504317789292e-05,
3033
+ "loss": 1.1805,
3034
+ "step": 4320
3035
+ },
3036
+ {
3037
+ "epoch": 0.8880912703499552,
3038
+ "grad_norm": 0.5248028635978699,
3039
+ "learning_rate": 1.1809153713298791e-05,
3040
+ "loss": 1.1652,
3041
+ "step": 4330
3042
+ },
3043
+ {
3044
+ "epoch": 0.8901422894500705,
3045
+ "grad_norm": 0.5452848672866821,
3046
+ "learning_rate": 1.1593264248704663e-05,
3047
+ "loss": 1.2171,
3048
+ "step": 4340
3049
+ },
3050
+ {
3051
+ "epoch": 0.8921933085501859,
3052
+ "grad_norm": 0.5505712628364563,
3053
+ "learning_rate": 1.1377374784110537e-05,
3054
+ "loss": 1.1967,
3055
+ "step": 4350
3056
+ },
3057
+ {
3058
+ "epoch": 0.8942443276503013,
3059
+ "grad_norm": 0.5437038540840149,
3060
+ "learning_rate": 1.1161485319516408e-05,
3061
+ "loss": 1.2216,
3062
+ "step": 4360
3063
+ },
3064
+ {
3065
+ "epoch": 0.8962953467504167,
3066
+ "grad_norm": 0.5138014554977417,
3067
+ "learning_rate": 1.094559585492228e-05,
3068
+ "loss": 1.193,
3069
+ "step": 4370
3070
+ },
3071
+ {
3072
+ "epoch": 0.898346365850532,
3073
+ "grad_norm": 0.542080283164978,
3074
+ "learning_rate": 1.0729706390328152e-05,
3075
+ "loss": 1.1677,
3076
+ "step": 4380
3077
+ },
3078
+ {
3079
+ "epoch": 0.9003973849506474,
3080
+ "grad_norm": 0.5166792273521423,
3081
+ "learning_rate": 1.0513816925734024e-05,
3082
+ "loss": 1.2147,
3083
+ "step": 4390
3084
+ },
3085
+ {
3086
+ "epoch": 0.9024484040507628,
3087
+ "grad_norm": 0.536491334438324,
3088
+ "learning_rate": 1.0297927461139897e-05,
3089
+ "loss": 1.2077,
3090
+ "step": 4400
3091
+ }
3092
+ ],
3093
+ "logging_steps": 10,
3094
+ "max_steps": 4876,
3095
+ "num_input_tokens_seen": 0,
3096
+ "num_train_epochs": 1,
3097
+ "save_steps": 200,
3098
+ "stateful_callbacks": {
3099
+ "TrainerControl": {
3100
+ "args": {
3101
+ "should_epoch_stop": false,
3102
+ "should_evaluate": false,
3103
+ "should_log": false,
3104
+ "should_save": true,
3105
+ "should_training_stop": false
3106
+ },
3107
+ "attributes": {}
3108
+ }
3109
+ },
3110
+ "total_flos": 1.1227995226985472e+17,
3111
+ "train_batch_size": 4,
3112
+ "trial_name": null,
3113
+ "trial_params": null
3114
+ }
lora_checkpoints/checkpoint-4400/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8f6520f47933838e96dca56ee883040325b73481aff07afcabf963674a84fe
3
+ size 5624
lora_checkpoints/checkpoint-4600/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: uaritm/gemma3_1b_med_qa_ru
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:uaritm/gemma3_1b_med_qa_ru
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.1
lora_checkpoints/checkpoint-4600/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "k_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
lora_checkpoints/checkpoint-4600/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71ffe8eb65fbb93105908f266c3615953eb7f27d7b126202c9cbe9696d56a76a
3
+ size 52231312
lora_checkpoints/checkpoint-4600/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
lora_checkpoints/checkpoint-4600/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
lora_checkpoints/checkpoint-4600/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fe9000272a4d0f562595a138a32bd4c6248f15178ca24507e8825653d6547da
3
+ size 104671958
lora_checkpoints/checkpoint-4600/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf68f7d2db510e0dece06020a6c1f3e492c8d7f7df52a36128ef0f01be2e4ddf
3
+ size 14244
lora_checkpoints/checkpoint-4600/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15d60713e239aa461be03135bdbca99b2bfe14ea1d561b3ce05394a2a8b3b9e7
3
+ size 988
lora_checkpoints/checkpoint-4600/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a249874e4ec66b917b0e2b9932feaf87c6ddf577951744ab797e8c3252e36163
3
+ size 1064
lora_checkpoints/checkpoint-4600/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
lora_checkpoints/checkpoint-4600/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
lora_checkpoints/checkpoint-4600/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
lora_checkpoints/checkpoint-4600/trainer_state.json ADDED
@@ -0,0 +1,3254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9434687860530702,
6
+ "eval_steps": 500,
7
+ "global_step": 4600,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00205101910011537,
14
+ "grad_norm": 1.9277215003967285,
15
+ "learning_rate": 3.6885245901639347e-06,
16
+ "loss": 1.4306,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.00410203820023074,
21
+ "grad_norm": 0.3513035476207733,
22
+ "learning_rate": 7.78688524590164e-06,
23
+ "loss": 1.3524,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.006153057300346109,
28
+ "grad_norm": 0.3364648222923279,
29
+ "learning_rate": 1.1885245901639344e-05,
30
+ "loss": 1.3188,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.00820407640046148,
35
+ "grad_norm": 0.3382512927055359,
36
+ "learning_rate": 1.598360655737705e-05,
37
+ "loss": 1.3418,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.01025509550057685,
42
+ "grad_norm": 0.360334575176239,
43
+ "learning_rate": 2.0081967213114755e-05,
44
+ "loss": 1.3381,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.012306114600692218,
49
+ "grad_norm": 0.3408481180667877,
50
+ "learning_rate": 2.418032786885246e-05,
51
+ "loss": 1.3365,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.014357133700807588,
56
+ "grad_norm": 0.36211535334587097,
57
+ "learning_rate": 2.8278688524590162e-05,
58
+ "loss": 1.3314,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.01640815280092296,
63
+ "grad_norm": 0.38704580068588257,
64
+ "learning_rate": 3.237704918032787e-05,
65
+ "loss": 1.3108,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.018459171901038327,
70
+ "grad_norm": 0.44303640723228455,
71
+ "learning_rate": 3.6475409836065576e-05,
72
+ "loss": 1.3073,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.0205101910011537,
77
+ "grad_norm": 0.4073602557182312,
78
+ "learning_rate": 4.057377049180328e-05,
79
+ "loss": 1.2993,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.022561210101269068,
84
+ "grad_norm": 0.4478100538253784,
85
+ "learning_rate": 4.467213114754098e-05,
86
+ "loss": 1.3413,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.024612229201384436,
91
+ "grad_norm": 0.39146170020103455,
92
+ "learning_rate": 4.8770491803278687e-05,
93
+ "loss": 1.3168,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.026663248301499808,
98
+ "grad_norm": 0.3786431849002838,
99
+ "learning_rate": 5.28688524590164e-05,
100
+ "loss": 1.2774,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.028714267401615177,
105
+ "grad_norm": 0.4014948904514313,
106
+ "learning_rate": 5.69672131147541e-05,
107
+ "loss": 1.346,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.03076528650173055,
112
+ "grad_norm": 0.3987842798233032,
113
+ "learning_rate": 6.10655737704918e-05,
114
+ "loss": 1.2816,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.03281630560184592,
119
+ "grad_norm": 0.3897082507610321,
120
+ "learning_rate": 6.516393442622951e-05,
121
+ "loss": 1.3485,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.034867324701961286,
126
+ "grad_norm": 0.373279333114624,
127
+ "learning_rate": 6.926229508196722e-05,
128
+ "loss": 1.3185,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.036918343802076654,
133
+ "grad_norm": 0.3812575340270996,
134
+ "learning_rate": 7.336065573770491e-05,
135
+ "loss": 1.3394,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.03896936290219203,
140
+ "grad_norm": 0.35926997661590576,
141
+ "learning_rate": 7.745901639344263e-05,
142
+ "loss": 1.2821,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.0410203820023074,
147
+ "grad_norm": 0.3649434745311737,
148
+ "learning_rate": 8.155737704918032e-05,
149
+ "loss": 1.33,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.04307140110242277,
154
+ "grad_norm": 0.345662921667099,
155
+ "learning_rate": 8.565573770491803e-05,
156
+ "loss": 1.3107,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.045122420202538135,
161
+ "grad_norm": 0.37169769406318665,
162
+ "learning_rate": 8.975409836065574e-05,
163
+ "loss": 1.309,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.047173439302653504,
168
+ "grad_norm": 0.37920281291007996,
169
+ "learning_rate": 9.385245901639344e-05,
170
+ "loss": 1.3352,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.04922445840276887,
175
+ "grad_norm": 0.35772770643234253,
176
+ "learning_rate": 9.795081967213115e-05,
177
+ "loss": 1.2402,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.05127547750288425,
182
+ "grad_norm": 0.38790181279182434,
183
+ "learning_rate": 9.989205526770294e-05,
184
+ "loss": 1.326,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.053326496602999617,
189
+ "grad_norm": 0.3545536696910858,
190
+ "learning_rate": 9.967616580310882e-05,
191
+ "loss": 1.3173,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.055377515703114985,
196
+ "grad_norm": 0.3845142722129822,
197
+ "learning_rate": 9.946027633851469e-05,
198
+ "loss": 1.2949,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.057428534803230354,
203
+ "grad_norm": 0.38621339201927185,
204
+ "learning_rate": 9.924438687392055e-05,
205
+ "loss": 1.2773,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.05947955390334572,
210
+ "grad_norm": 0.38091301918029785,
211
+ "learning_rate": 9.902849740932643e-05,
212
+ "loss": 1.3282,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.0615305730034611,
217
+ "grad_norm": 0.37546730041503906,
218
+ "learning_rate": 9.88126079447323e-05,
219
+ "loss": 1.2862,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.06358159210357646,
224
+ "grad_norm": 0.3515011966228485,
225
+ "learning_rate": 9.859671848013817e-05,
226
+ "loss": 1.2937,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.06563261120369183,
231
+ "grad_norm": 0.3863738775253296,
232
+ "learning_rate": 9.838082901554406e-05,
233
+ "loss": 1.3056,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.06768363030380721,
238
+ "grad_norm": 0.36615240573883057,
239
+ "learning_rate": 9.816493955094992e-05,
240
+ "loss": 1.3062,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.06973464940392257,
245
+ "grad_norm": 0.37741243839263916,
246
+ "learning_rate": 9.794905008635579e-05,
247
+ "loss": 1.3094,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.07178566850403795,
252
+ "grad_norm": 0.38626739382743835,
253
+ "learning_rate": 9.773316062176167e-05,
254
+ "loss": 1.2947,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.07383668760415331,
259
+ "grad_norm": 0.38667401671409607,
260
+ "learning_rate": 9.751727115716753e-05,
261
+ "loss": 1.2976,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.07588770670426868,
266
+ "grad_norm": 0.36084800958633423,
267
+ "learning_rate": 9.730138169257342e-05,
268
+ "loss": 1.27,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.07793872580438406,
273
+ "grad_norm": 0.3754425346851349,
274
+ "learning_rate": 9.708549222797928e-05,
275
+ "loss": 1.3243,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.07998974490449942,
280
+ "grad_norm": 0.39857473969459534,
281
+ "learning_rate": 9.686960276338515e-05,
282
+ "loss": 1.3077,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.0820407640046148,
287
+ "grad_norm": 0.3919648230075836,
288
+ "learning_rate": 9.665371329879103e-05,
289
+ "loss": 1.2985,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.08409178310473016,
294
+ "grad_norm": 0.3675483465194702,
295
+ "learning_rate": 9.643782383419689e-05,
296
+ "loss": 1.2946,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.08614280220484553,
301
+ "grad_norm": 0.3898465633392334,
302
+ "learning_rate": 9.622193436960277e-05,
303
+ "loss": 1.333,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.08819382130496091,
308
+ "grad_norm": 0.3681259751319885,
309
+ "learning_rate": 9.600604490500864e-05,
310
+ "loss": 1.2968,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.09024484040507627,
315
+ "grad_norm": 0.36453816294670105,
316
+ "learning_rate": 9.57901554404145e-05,
317
+ "loss": 1.272,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.09229585950519165,
322
+ "grad_norm": 0.34828147292137146,
323
+ "learning_rate": 9.557426597582039e-05,
324
+ "loss": 1.3245,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.09434687860530701,
329
+ "grad_norm": 0.3570501208305359,
330
+ "learning_rate": 9.535837651122625e-05,
331
+ "loss": 1.313,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.09639789770542238,
336
+ "grad_norm": 0.36692506074905396,
337
+ "learning_rate": 9.514248704663213e-05,
338
+ "loss": 1.2915,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.09844891680553775,
343
+ "grad_norm": 0.39161381125450134,
344
+ "learning_rate": 9.4926597582038e-05,
345
+ "loss": 1.3101,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.10049993590565312,
350
+ "grad_norm": 0.3808858394622803,
351
+ "learning_rate": 9.471070811744387e-05,
352
+ "loss": 1.3099,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.1025509550057685,
357
+ "grad_norm": 0.3541582524776459,
358
+ "learning_rate": 9.449481865284975e-05,
359
+ "loss": 1.2772,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.10460197410588386,
364
+ "grad_norm": 0.379190593957901,
365
+ "learning_rate": 9.427892918825562e-05,
366
+ "loss": 1.2914,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.10665299320599923,
371
+ "grad_norm": 0.37727421522140503,
372
+ "learning_rate": 9.406303972366149e-05,
373
+ "loss": 1.2888,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1087040123061146,
378
+ "grad_norm": 0.3787306845188141,
379
+ "learning_rate": 9.384715025906737e-05,
380
+ "loss": 1.3049,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.11075503140622997,
385
+ "grad_norm": 0.3831459581851959,
386
+ "learning_rate": 9.363126079447323e-05,
387
+ "loss": 1.2631,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.11280605050634535,
392
+ "grad_norm": 0.37274929881095886,
393
+ "learning_rate": 9.34153713298791e-05,
394
+ "loss": 1.3313,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.11485706960646071,
399
+ "grad_norm": 0.3683277368545532,
400
+ "learning_rate": 9.319948186528498e-05,
401
+ "loss": 1.2528,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.11690808870657608,
406
+ "grad_norm": 0.39554840326309204,
407
+ "learning_rate": 9.298359240069085e-05,
408
+ "loss": 1.2737,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.11895910780669144,
413
+ "grad_norm": 0.39166760444641113,
414
+ "learning_rate": 9.276770293609673e-05,
415
+ "loss": 1.271,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.12101012690680682,
420
+ "grad_norm": 0.384085476398468,
421
+ "learning_rate": 9.255181347150259e-05,
422
+ "loss": 1.2921,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.1230611460069222,
427
+ "grad_norm": 0.3704201281070709,
428
+ "learning_rate": 9.233592400690847e-05,
429
+ "loss": 1.2776,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.12511216510703757,
434
+ "grad_norm": 0.3844301998615265,
435
+ "learning_rate": 9.212003454231434e-05,
436
+ "loss": 1.3067,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.12716318420715292,
441
+ "grad_norm": 0.3971571922302246,
442
+ "learning_rate": 9.190414507772022e-05,
443
+ "loss": 1.2792,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.1292142033072683,
448
+ "grad_norm": 0.40666353702545166,
449
+ "learning_rate": 9.168825561312608e-05,
450
+ "loss": 1.2964,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.13126522240738367,
455
+ "grad_norm": 0.38252532482147217,
456
+ "learning_rate": 9.147236614853195e-05,
457
+ "loss": 1.2815,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.13331624150749904,
462
+ "grad_norm": 0.37795621156692505,
463
+ "learning_rate": 9.125647668393783e-05,
464
+ "loss": 1.283,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.13536726060761442,
469
+ "grad_norm": 0.4035683572292328,
470
+ "learning_rate": 9.10405872193437e-05,
471
+ "loss": 1.288,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.13741827970772977,
476
+ "grad_norm": 0.410669207572937,
477
+ "learning_rate": 9.082469775474958e-05,
478
+ "loss": 1.2659,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.13946929880784514,
483
+ "grad_norm": 0.3809865713119507,
484
+ "learning_rate": 9.060880829015544e-05,
485
+ "loss": 1.3133,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.14152031790796052,
490
+ "grad_norm": 0.3748447597026825,
491
+ "learning_rate": 9.039291882556131e-05,
492
+ "loss": 1.2643,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.1435713370080759,
497
+ "grad_norm": 0.39292991161346436,
498
+ "learning_rate": 9.017702936096719e-05,
499
+ "loss": 1.2855,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.14562235610819127,
504
+ "grad_norm": 0.4399755001068115,
505
+ "learning_rate": 8.996113989637307e-05,
506
+ "loss": 1.286,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.14767337520830662,
511
+ "grad_norm": 0.42447429895401,
512
+ "learning_rate": 8.974525043177894e-05,
513
+ "loss": 1.2736,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.149724394308422,
518
+ "grad_norm": 0.37248438596725464,
519
+ "learning_rate": 8.95293609671848e-05,
520
+ "loss": 1.2652,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.15177541340853737,
525
+ "grad_norm": 0.39122238755226135,
526
+ "learning_rate": 8.931347150259068e-05,
527
+ "loss": 1.2814,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.15382643250865274,
532
+ "grad_norm": 0.3697800040245056,
533
+ "learning_rate": 8.909758203799655e-05,
534
+ "loss": 1.2462,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.15587745160876812,
539
+ "grad_norm": 0.3901929259300232,
540
+ "learning_rate": 8.888169257340241e-05,
541
+ "loss": 1.2742,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.15792847070888347,
546
+ "grad_norm": 0.3833727538585663,
547
+ "learning_rate": 8.86658031088083e-05,
548
+ "loss": 1.3015,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.15997948980899884,
553
+ "grad_norm": 0.4028802216053009,
554
+ "learning_rate": 8.844991364421416e-05,
555
+ "loss": 1.2631,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.16203050890911422,
560
+ "grad_norm": 0.39087918400764465,
561
+ "learning_rate": 8.823402417962004e-05,
562
+ "loss": 1.2993,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.1640815280092296,
567
+ "grad_norm": 0.39453235268592834,
568
+ "learning_rate": 8.801813471502591e-05,
569
+ "loss": 1.2544,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.16613254710934497,
574
+ "grad_norm": 0.42142602801322937,
575
+ "learning_rate": 8.780224525043178e-05,
576
+ "loss": 1.2676,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.16818356620946032,
581
+ "grad_norm": 0.36646899580955505,
582
+ "learning_rate": 8.758635578583767e-05,
583
+ "loss": 1.2765,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.1702345853095757,
588
+ "grad_norm": 0.4253019094467163,
589
+ "learning_rate": 8.737046632124353e-05,
590
+ "loss": 1.3003,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.17228560440969107,
595
+ "grad_norm": 0.41490674018859863,
596
+ "learning_rate": 8.715457685664939e-05,
597
+ "loss": 1.2731,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.17433662350980644,
602
+ "grad_norm": 0.405460387468338,
603
+ "learning_rate": 8.693868739205528e-05,
604
+ "loss": 1.2122,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.17638764260992182,
609
+ "grad_norm": 0.4028235375881195,
610
+ "learning_rate": 8.672279792746114e-05,
611
+ "loss": 1.3238,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.17843866171003717,
616
+ "grad_norm": 0.38994792103767395,
617
+ "learning_rate": 8.650690846286701e-05,
618
+ "loss": 1.2875,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.18048968081015254,
623
+ "grad_norm": 0.4099538326263428,
624
+ "learning_rate": 8.629101899827289e-05,
625
+ "loss": 1.2807,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.18254069991026792,
630
+ "grad_norm": 0.40470021963119507,
631
+ "learning_rate": 8.607512953367875e-05,
632
+ "loss": 1.2802,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.1845917190103833,
637
+ "grad_norm": 0.4066854417324066,
638
+ "learning_rate": 8.585924006908464e-05,
639
+ "loss": 1.2464,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.18664273811049864,
644
+ "grad_norm": 0.38739994168281555,
645
+ "learning_rate": 8.56433506044905e-05,
646
+ "loss": 1.2831,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.18869375721061402,
651
+ "grad_norm": 0.4257420301437378,
652
+ "learning_rate": 8.542746113989638e-05,
653
+ "loss": 1.2679,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.1907447763107294,
658
+ "grad_norm": 0.41571488976478577,
659
+ "learning_rate": 8.521157167530225e-05,
660
+ "loss": 1.2501,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.19279579541084477,
665
+ "grad_norm": 0.4178495407104492,
666
+ "learning_rate": 8.499568221070811e-05,
667
+ "loss": 1.2657,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.19484681451096014,
672
+ "grad_norm": 0.4083455801010132,
673
+ "learning_rate": 8.477979274611399e-05,
674
+ "loss": 1.2781,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.1968978336110755,
679
+ "grad_norm": 0.4067554175853729,
680
+ "learning_rate": 8.456390328151986e-05,
681
+ "loss": 1.2582,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.19894885271119087,
686
+ "grad_norm": 0.4067447781562805,
687
+ "learning_rate": 8.434801381692574e-05,
688
+ "loss": 1.2948,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.20099987181130624,
693
+ "grad_norm": 0.44283562898635864,
694
+ "learning_rate": 8.413212435233161e-05,
695
+ "loss": 1.3011,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.20305089091142162,
700
+ "grad_norm": 0.41568294167518616,
701
+ "learning_rate": 8.391623488773748e-05,
702
+ "loss": 1.2804,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.205101910011537,
707
+ "grad_norm": 0.4183642864227295,
708
+ "learning_rate": 8.370034542314335e-05,
709
+ "loss": 1.2228,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.20715292911165234,
714
+ "grad_norm": 0.4311917722225189,
715
+ "learning_rate": 8.348445595854923e-05,
716
+ "loss": 1.2714,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.20920394821176772,
721
+ "grad_norm": 0.41575828194618225,
722
+ "learning_rate": 8.32685664939551e-05,
723
+ "loss": 1.2783,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.2112549673118831,
728
+ "grad_norm": 0.3958878815174103,
729
+ "learning_rate": 8.305267702936098e-05,
730
+ "loss": 1.2558,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.21330598641199847,
735
+ "grad_norm": 0.43759557604789734,
736
+ "learning_rate": 8.283678756476684e-05,
737
+ "loss": 1.2557,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.21535700551211384,
742
+ "grad_norm": 0.41460636258125305,
743
+ "learning_rate": 8.262089810017271e-05,
744
+ "loss": 1.2851,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.2174080246122292,
749
+ "grad_norm": 0.4114689826965332,
750
+ "learning_rate": 8.240500863557859e-05,
751
+ "loss": 1.3076,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.21945904371234456,
756
+ "grad_norm": 0.42222094535827637,
757
+ "learning_rate": 8.218911917098446e-05,
758
+ "loss": 1.2263,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.22151006281245994,
763
+ "grad_norm": 0.4098639488220215,
764
+ "learning_rate": 8.197322970639033e-05,
765
+ "loss": 1.2779,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.22356108191257532,
770
+ "grad_norm": 0.4205043315887451,
771
+ "learning_rate": 8.175734024179621e-05,
772
+ "loss": 1.2177,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.2256121010126907,
777
+ "grad_norm": 0.4501648247241974,
778
+ "learning_rate": 8.154145077720208e-05,
779
+ "loss": 1.3227,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.22766312011280604,
784
+ "grad_norm": 0.41510599851608276,
785
+ "learning_rate": 8.132556131260795e-05,
786
+ "loss": 1.3177,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.22971413921292141,
791
+ "grad_norm": 0.41567444801330566,
792
+ "learning_rate": 8.110967184801383e-05,
793
+ "loss": 1.2506,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.2317651583130368,
798
+ "grad_norm": 0.4262779653072357,
799
+ "learning_rate": 8.089378238341969e-05,
800
+ "loss": 1.2506,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.23381617741315217,
805
+ "grad_norm": 0.4220465421676636,
806
+ "learning_rate": 8.067789291882558e-05,
807
+ "loss": 1.2514,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.23586719651326754,
812
+ "grad_norm": 0.4169275462627411,
813
+ "learning_rate": 8.046200345423144e-05,
814
+ "loss": 1.2693,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.2379182156133829,
819
+ "grad_norm": 0.43145328760147095,
820
+ "learning_rate": 8.02461139896373e-05,
821
+ "loss": 1.2394,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.23996923471349826,
826
+ "grad_norm": 0.42889878153800964,
827
+ "learning_rate": 8.003022452504319e-05,
828
+ "loss": 1.248,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.24202025381361364,
833
+ "grad_norm": 0.41731464862823486,
834
+ "learning_rate": 7.981433506044905e-05,
835
+ "loss": 1.2498,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.24407127291372901,
840
+ "grad_norm": 0.4326362609863281,
841
+ "learning_rate": 7.959844559585493e-05,
842
+ "loss": 1.265,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.2461222920138444,
847
+ "grad_norm": 0.4242352843284607,
848
+ "learning_rate": 7.93825561312608e-05,
849
+ "loss": 1.2672,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.24817331111395974,
854
+ "grad_norm": 0.4441153407096863,
855
+ "learning_rate": 7.916666666666666e-05,
856
+ "loss": 1.2944,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.25022433021407514,
861
+ "grad_norm": 0.40912818908691406,
862
+ "learning_rate": 7.895077720207255e-05,
863
+ "loss": 1.2702,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.2522753493141905,
868
+ "grad_norm": 0.44539037346839905,
869
+ "learning_rate": 7.873488773747841e-05,
870
+ "loss": 1.2228,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.25432636841430584,
875
+ "grad_norm": 0.4299303889274597,
876
+ "learning_rate": 7.851899827288429e-05,
877
+ "loss": 1.2328,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.25637738751442124,
882
+ "grad_norm": 0.4408973455429077,
883
+ "learning_rate": 7.830310880829016e-05,
884
+ "loss": 1.2358,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.2584284066145366,
889
+ "grad_norm": 0.4100968837738037,
890
+ "learning_rate": 7.808721934369602e-05,
891
+ "loss": 1.2458,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.260479425714652,
896
+ "grad_norm": 0.4401489198207855,
897
+ "learning_rate": 7.787132987910191e-05,
898
+ "loss": 1.2593,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.26253044481476734,
903
+ "grad_norm": 0.4514229893684387,
904
+ "learning_rate": 7.765544041450777e-05,
905
+ "loss": 1.2632,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.2645814639148827,
910
+ "grad_norm": 0.38684791326522827,
911
+ "learning_rate": 7.743955094991365e-05,
912
+ "loss": 1.2424,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.2666324830149981,
917
+ "grad_norm": 0.46148189902305603,
918
+ "learning_rate": 7.722366148531953e-05,
919
+ "loss": 1.2445,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.26868350211511344,
924
+ "grad_norm": 0.4319213628768921,
925
+ "learning_rate": 7.700777202072539e-05,
926
+ "loss": 1.2253,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.27073452121522884,
931
+ "grad_norm": 0.4195545017719269,
932
+ "learning_rate": 7.679188255613126e-05,
933
+ "loss": 1.2578,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.2727855403153442,
938
+ "grad_norm": 0.43690159916877747,
939
+ "learning_rate": 7.657599309153714e-05,
940
+ "loss": 1.2573,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.27483655941545954,
945
+ "grad_norm": 0.44571492075920105,
946
+ "learning_rate": 7.636010362694301e-05,
947
+ "loss": 1.2607,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.27688757851557494,
952
+ "grad_norm": 0.43295958638191223,
953
+ "learning_rate": 7.614421416234889e-05,
954
+ "loss": 1.2278,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.2789385976156903,
959
+ "grad_norm": 0.44495707750320435,
960
+ "learning_rate": 7.592832469775475e-05,
961
+ "loss": 1.2798,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.2809896167158057,
966
+ "grad_norm": 0.4412330985069275,
967
+ "learning_rate": 7.571243523316062e-05,
968
+ "loss": 1.2501,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.28304063581592104,
973
+ "grad_norm": 0.44599953293800354,
974
+ "learning_rate": 7.54965457685665e-05,
975
+ "loss": 1.2396,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.2850916549160364,
980
+ "grad_norm": 0.447109580039978,
981
+ "learning_rate": 7.528065630397237e-05,
982
+ "loss": 1.2767,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.2871426740161518,
987
+ "grad_norm": 0.44506722688674927,
988
+ "learning_rate": 7.506476683937824e-05,
989
+ "loss": 1.2546,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.28919369311626714,
994
+ "grad_norm": 0.44061776995658875,
995
+ "learning_rate": 7.484887737478411e-05,
996
+ "loss": 1.2413,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.29124471221638254,
1001
+ "grad_norm": 0.45085111260414124,
1002
+ "learning_rate": 7.463298791018999e-05,
1003
+ "loss": 1.2483,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.2932957313164979,
1008
+ "grad_norm": 0.4437837600708008,
1009
+ "learning_rate": 7.441709844559586e-05,
1010
+ "loss": 1.252,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.29534675041661324,
1015
+ "grad_norm": 0.4294221103191376,
1016
+ "learning_rate": 7.420120898100174e-05,
1017
+ "loss": 1.2386,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.29739776951672864,
1022
+ "grad_norm": 0.4780830144882202,
1023
+ "learning_rate": 7.39853195164076e-05,
1024
+ "loss": 1.2639,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.299448788616844,
1029
+ "grad_norm": 0.44152942299842834,
1030
+ "learning_rate": 7.376943005181347e-05,
1031
+ "loss": 1.2756,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.3014998077169594,
1036
+ "grad_norm": 0.41989192366600037,
1037
+ "learning_rate": 7.355354058721935e-05,
1038
+ "loss": 1.2614,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.30355082681707474,
1043
+ "grad_norm": 0.5871754884719849,
1044
+ "learning_rate": 7.333765112262521e-05,
1045
+ "loss": 1.2615,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.3056018459171901,
1050
+ "grad_norm": 0.4467261731624603,
1051
+ "learning_rate": 7.31217616580311e-05,
1052
+ "loss": 1.2624,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.3076528650173055,
1057
+ "grad_norm": 0.49219033122062683,
1058
+ "learning_rate": 7.290587219343696e-05,
1059
+ "loss": 1.289,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.30970388411742084,
1064
+ "grad_norm": 0.4700734317302704,
1065
+ "learning_rate": 7.268998272884284e-05,
1066
+ "loss": 1.242,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.31175490321753624,
1071
+ "grad_norm": 0.4607170820236206,
1072
+ "learning_rate": 7.247409326424871e-05,
1073
+ "loss": 1.2554,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.3138059223176516,
1078
+ "grad_norm": 0.4335988759994507,
1079
+ "learning_rate": 7.225820379965457e-05,
1080
+ "loss": 1.2423,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.31585694141776693,
1085
+ "grad_norm": 0.4366897940635681,
1086
+ "learning_rate": 7.204231433506046e-05,
1087
+ "loss": 1.2219,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.31790796051788234,
1092
+ "grad_norm": 0.45856085419654846,
1093
+ "learning_rate": 7.182642487046632e-05,
1094
+ "loss": 1.2189,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.3199589796179977,
1099
+ "grad_norm": 0.4563063085079193,
1100
+ "learning_rate": 7.16105354058722e-05,
1101
+ "loss": 1.2696,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.3220099987181131,
1106
+ "grad_norm": 0.4276934862136841,
1107
+ "learning_rate": 7.139464594127807e-05,
1108
+ "loss": 1.2659,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.32406101781822844,
1113
+ "grad_norm": 0.46200886368751526,
1114
+ "learning_rate": 7.117875647668394e-05,
1115
+ "loss": 1.2261,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.3261120369183438,
1120
+ "grad_norm": 0.4863358736038208,
1121
+ "learning_rate": 7.096286701208982e-05,
1122
+ "loss": 1.2292,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.3281630560184592,
1127
+ "grad_norm": 0.4537160098552704,
1128
+ "learning_rate": 7.074697754749569e-05,
1129
+ "loss": 1.2453,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.33021407511857453,
1134
+ "grad_norm": 0.4507627487182617,
1135
+ "learning_rate": 7.053108808290155e-05,
1136
+ "loss": 1.2081,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.33226509421868994,
1141
+ "grad_norm": 0.43197301030158997,
1142
+ "learning_rate": 7.031519861830744e-05,
1143
+ "loss": 1.2757,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.3343161133188053,
1148
+ "grad_norm": 0.4551820456981659,
1149
+ "learning_rate": 7.00993091537133e-05,
1150
+ "loss": 1.2751,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.33636713241892063,
1155
+ "grad_norm": 0.45099398493766785,
1156
+ "learning_rate": 6.988341968911917e-05,
1157
+ "loss": 1.2583,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.33841815151903604,
1162
+ "grad_norm": 0.46787434816360474,
1163
+ "learning_rate": 6.966753022452505e-05,
1164
+ "loss": 1.2448,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.3404691706191514,
1169
+ "grad_norm": 0.45500054955482483,
1170
+ "learning_rate": 6.945164075993091e-05,
1171
+ "loss": 1.2394,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.3425201897192668,
1176
+ "grad_norm": 0.4682730436325073,
1177
+ "learning_rate": 6.92357512953368e-05,
1178
+ "loss": 1.2287,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.34457120881938214,
1183
+ "grad_norm": 0.4615074396133423,
1184
+ "learning_rate": 6.901986183074266e-05,
1185
+ "loss": 1.2042,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.3466222279194975,
1190
+ "grad_norm": 0.4548027217388153,
1191
+ "learning_rate": 6.880397236614854e-05,
1192
+ "loss": 1.2671,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.3486732470196129,
1197
+ "grad_norm": 0.4783169627189636,
1198
+ "learning_rate": 6.858808290155441e-05,
1199
+ "loss": 1.2533,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.35072426611972823,
1204
+ "grad_norm": 0.46452414989471436,
1205
+ "learning_rate": 6.837219343696027e-05,
1206
+ "loss": 1.2681,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.35277528521984364,
1211
+ "grad_norm": 0.4663463532924652,
1212
+ "learning_rate": 6.815630397236615e-05,
1213
+ "loss": 1.2561,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.354826304319959,
1218
+ "grad_norm": 0.46744370460510254,
1219
+ "learning_rate": 6.794041450777202e-05,
1220
+ "loss": 1.2453,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.35687732342007433,
1225
+ "grad_norm": 0.471835732460022,
1226
+ "learning_rate": 6.77245250431779e-05,
1227
+ "loss": 1.2472,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.35892834252018974,
1232
+ "grad_norm": 0.4618450701236725,
1233
+ "learning_rate": 6.750863557858377e-05,
1234
+ "loss": 1.2547,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.3609793616203051,
1239
+ "grad_norm": 0.4651658833026886,
1240
+ "learning_rate": 6.729274611398963e-05,
1241
+ "loss": 1.2623,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.36303038072042043,
1246
+ "grad_norm": 0.46842116117477417,
1247
+ "learning_rate": 6.707685664939551e-05,
1248
+ "loss": 1.2391,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.36508139982053583,
1253
+ "grad_norm": 0.45604613423347473,
1254
+ "learning_rate": 6.686096718480138e-05,
1255
+ "loss": 1.2884,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.3671324189206512,
1260
+ "grad_norm": 0.4306802451610565,
1261
+ "learning_rate": 6.664507772020726e-05,
1262
+ "loss": 1.2252,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.3691834380207666,
1267
+ "grad_norm": 0.4549136757850647,
1268
+ "learning_rate": 6.642918825561312e-05,
1269
+ "loss": 1.2496,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.37123445712088193,
1274
+ "grad_norm": 0.47443437576293945,
1275
+ "learning_rate": 6.6213298791019e-05,
1276
+ "loss": 1.2655,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.3732854762209973,
1281
+ "grad_norm": 0.46772050857543945,
1282
+ "learning_rate": 6.599740932642487e-05,
1283
+ "loss": 1.2366,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.3753364953211127,
1288
+ "grad_norm": 0.4691794216632843,
1289
+ "learning_rate": 6.578151986183075e-05,
1290
+ "loss": 1.2152,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.37738751442122803,
1295
+ "grad_norm": 0.43691304326057434,
1296
+ "learning_rate": 6.556563039723662e-05,
1297
+ "loss": 1.2511,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.37943853352134344,
1302
+ "grad_norm": 0.4595348536968231,
1303
+ "learning_rate": 6.534974093264248e-05,
1304
+ "loss": 1.2635,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.3814895526214588,
1309
+ "grad_norm": 0.44760558009147644,
1310
+ "learning_rate": 6.513385146804836e-05,
1311
+ "loss": 1.2342,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.38354057172157413,
1316
+ "grad_norm": 0.4559841454029083,
1317
+ "learning_rate": 6.491796200345423e-05,
1318
+ "loss": 1.2432,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.38559159082168953,
1323
+ "grad_norm": 0.4497215449810028,
1324
+ "learning_rate": 6.470207253886011e-05,
1325
+ "loss": 1.2267,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.3876426099218049,
1330
+ "grad_norm": 0.4863613247871399,
1331
+ "learning_rate": 6.448618307426598e-05,
1332
+ "loss": 1.254,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.3896936290219203,
1337
+ "grad_norm": 0.4500603675842285,
1338
+ "learning_rate": 6.427029360967185e-05,
1339
+ "loss": 1.2214,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.39174464812203563,
1344
+ "grad_norm": 0.4400598704814911,
1345
+ "learning_rate": 6.405440414507774e-05,
1346
+ "loss": 1.2352,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.393795667222151,
1351
+ "grad_norm": 0.46070367097854614,
1352
+ "learning_rate": 6.38385146804836e-05,
1353
+ "loss": 1.2468,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.3958466863222664,
1358
+ "grad_norm": 0.44312766194343567,
1359
+ "learning_rate": 6.362262521588946e-05,
1360
+ "loss": 1.1923,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.39789770542238173,
1365
+ "grad_norm": 0.5013573169708252,
1366
+ "learning_rate": 6.340673575129535e-05,
1367
+ "loss": 1.2361,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.39994872452249713,
1372
+ "grad_norm": 0.4884537160396576,
1373
+ "learning_rate": 6.319084628670121e-05,
1374
+ "loss": 1.2434,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.4019997436226125,
1379
+ "grad_norm": 0.46138620376586914,
1380
+ "learning_rate": 6.297495682210708e-05,
1381
+ "loss": 1.257,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.40405076272272783,
1386
+ "grad_norm": 0.4941729009151459,
1387
+ "learning_rate": 6.275906735751296e-05,
1388
+ "loss": 1.2347,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.40610178182284323,
1393
+ "grad_norm": 0.4675595760345459,
1394
+ "learning_rate": 6.254317789291882e-05,
1395
+ "loss": 1.2353,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.4081528009229586,
1400
+ "grad_norm": 0.47944632172584534,
1401
+ "learning_rate": 6.232728842832471e-05,
1402
+ "loss": 1.2643,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.410203820023074,
1407
+ "grad_norm": 0.4476461112499237,
1408
+ "learning_rate": 6.211139896373057e-05,
1409
+ "loss": 1.2558,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.41225483912318933,
1414
+ "grad_norm": 0.4706653654575348,
1415
+ "learning_rate": 6.189550949913645e-05,
1416
+ "loss": 1.227,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.4143058582233047,
1421
+ "grad_norm": 0.48062801361083984,
1422
+ "learning_rate": 6.167962003454232e-05,
1423
+ "loss": 1.2273,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.4163568773234201,
1428
+ "grad_norm": 0.46771204471588135,
1429
+ "learning_rate": 6.146373056994818e-05,
1430
+ "loss": 1.2268,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.41840789642353543,
1435
+ "grad_norm": 0.4725424647331238,
1436
+ "learning_rate": 6.124784110535406e-05,
1437
+ "loss": 1.2009,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.42045891552365083,
1442
+ "grad_norm": 0.47520384192466736,
1443
+ "learning_rate": 6.1031951640759934e-05,
1444
+ "loss": 1.2511,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.4225099346237662,
1449
+ "grad_norm": 0.44635480642318726,
1450
+ "learning_rate": 6.081606217616581e-05,
1451
+ "loss": 1.21,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.42456095372388153,
1456
+ "grad_norm": 0.47436651587486267,
1457
+ "learning_rate": 6.060017271157168e-05,
1458
+ "loss": 1.2116,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.42661197282399693,
1463
+ "grad_norm": 0.5115741491317749,
1464
+ "learning_rate": 6.0384283246977546e-05,
1465
+ "loss": 1.2778,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.4286629919241123,
1470
+ "grad_norm": 0.4488040506839752,
1471
+ "learning_rate": 6.016839378238343e-05,
1472
+ "loss": 1.2242,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.4307140110242277,
1477
+ "grad_norm": 0.4834796190261841,
1478
+ "learning_rate": 5.9952504317789296e-05,
1479
+ "loss": 1.2357,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.43276503012434303,
1484
+ "grad_norm": 0.45478227734565735,
1485
+ "learning_rate": 5.973661485319517e-05,
1486
+ "loss": 1.2233,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.4348160492244584,
1491
+ "grad_norm": 0.4539099633693695,
1492
+ "learning_rate": 5.952072538860104e-05,
1493
+ "loss": 1.2527,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.4368670683245738,
1498
+ "grad_norm": 0.47722533345222473,
1499
+ "learning_rate": 5.930483592400691e-05,
1500
+ "loss": 1.2015,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.43891808742468913,
1505
+ "grad_norm": 0.472023069858551,
1506
+ "learning_rate": 5.908894645941278e-05,
1507
+ "loss": 1.2222,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.44096910652480453,
1512
+ "grad_norm": 0.4648214876651764,
1513
+ "learning_rate": 5.887305699481865e-05,
1514
+ "loss": 1.2112,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.4430201256249199,
1519
+ "grad_norm": 0.48654377460479736,
1520
+ "learning_rate": 5.8657167530224534e-05,
1521
+ "loss": 1.227,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.44507114472503523,
1526
+ "grad_norm": 0.4997814893722534,
1527
+ "learning_rate": 5.84412780656304e-05,
1528
+ "loss": 1.2721,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.44712216382515063,
1533
+ "grad_norm": 0.47997352480888367,
1534
+ "learning_rate": 5.822538860103627e-05,
1535
+ "loss": 1.2018,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.449173182925266,
1540
+ "grad_norm": 0.4899247884750366,
1541
+ "learning_rate": 5.8009499136442146e-05,
1542
+ "loss": 1.2599,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.4512242020253814,
1547
+ "grad_norm": 0.4752749800682068,
1548
+ "learning_rate": 5.7793609671848014e-05,
1549
+ "loss": 1.2171,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.45327522112549673,
1554
+ "grad_norm": 0.4801314175128937,
1555
+ "learning_rate": 5.7577720207253896e-05,
1556
+ "loss": 1.2234,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.4553262402256121,
1561
+ "grad_norm": 0.4591893255710602,
1562
+ "learning_rate": 5.7361830742659764e-05,
1563
+ "loss": 1.2242,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.4573772593257275,
1568
+ "grad_norm": 0.46896713972091675,
1569
+ "learning_rate": 5.7145941278065626e-05,
1570
+ "loss": 1.2117,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.45942827842584283,
1575
+ "grad_norm": 0.4853857755661011,
1576
+ "learning_rate": 5.693005181347151e-05,
1577
+ "loss": 1.2218,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.46147929752595823,
1582
+ "grad_norm": 0.4648151993751526,
1583
+ "learning_rate": 5.6714162348877376e-05,
1584
+ "loss": 1.2401,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.4635303166260736,
1589
+ "grad_norm": 0.4839739501476288,
1590
+ "learning_rate": 5.649827288428325e-05,
1591
+ "loss": 1.1976,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.4655813357261889,
1596
+ "grad_norm": 0.4986715018749237,
1597
+ "learning_rate": 5.628238341968912e-05,
1598
+ "loss": 1.2274,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.46763235482630433,
1603
+ "grad_norm": 0.4636840522289276,
1604
+ "learning_rate": 5.606649395509499e-05,
1605
+ "loss": 1.236,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.4696833739264197,
1610
+ "grad_norm": 0.5011271834373474,
1611
+ "learning_rate": 5.585060449050087e-05,
1612
+ "loss": 1.2275,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.4717343930265351,
1617
+ "grad_norm": 0.4648337662220001,
1618
+ "learning_rate": 5.563471502590674e-05,
1619
+ "loss": 1.2457,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.47378541212665043,
1624
+ "grad_norm": 0.47708699107170105,
1625
+ "learning_rate": 5.5418825561312614e-05,
1626
+ "loss": 1.2316,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.4758364312267658,
1631
+ "grad_norm": 0.4954835772514343,
1632
+ "learning_rate": 5.520293609671848e-05,
1633
+ "loss": 1.229,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.4778874503268812,
1638
+ "grad_norm": 0.4701727330684662,
1639
+ "learning_rate": 5.498704663212435e-05,
1640
+ "loss": 1.248,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.47993846942699653,
1645
+ "grad_norm": 0.4796009957790375,
1646
+ "learning_rate": 5.477115716753023e-05,
1647
+ "loss": 1.2248,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.48198948852711193,
1652
+ "grad_norm": 0.4906330406665802,
1653
+ "learning_rate": 5.4555267702936094e-05,
1654
+ "loss": 1.2628,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.4840405076272273,
1659
+ "grad_norm": 0.47203144431114197,
1660
+ "learning_rate": 5.4339378238341976e-05,
1661
+ "loss": 1.2067,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.4860915267273426,
1666
+ "grad_norm": 0.503813624382019,
1667
+ "learning_rate": 5.4123488773747845e-05,
1668
+ "loss": 1.2006,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.48814254582745803,
1673
+ "grad_norm": 0.4918235242366791,
1674
+ "learning_rate": 5.390759930915371e-05,
1675
+ "loss": 1.1887,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.4901935649275734,
1680
+ "grad_norm": 0.4799112379550934,
1681
+ "learning_rate": 5.369170984455959e-05,
1682
+ "loss": 1.2079,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.4922445840276888,
1687
+ "grad_norm": 0.4769650101661682,
1688
+ "learning_rate": 5.347582037996546e-05,
1689
+ "loss": 1.1945,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.49429560312780413,
1694
+ "grad_norm": 0.5079638957977295,
1695
+ "learning_rate": 5.325993091537134e-05,
1696
+ "loss": 1.2294,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.4963466222279195,
1701
+ "grad_norm": 0.520418643951416,
1702
+ "learning_rate": 5.304404145077721e-05,
1703
+ "loss": 1.2308,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.4983976413280349,
1708
+ "grad_norm": 0.4546453058719635,
1709
+ "learning_rate": 5.2828151986183075e-05,
1710
+ "loss": 1.2206,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.5004486604281503,
1715
+ "grad_norm": 0.47760534286499023,
1716
+ "learning_rate": 5.261226252158895e-05,
1717
+ "loss": 1.208,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.5024996795282656,
1722
+ "grad_norm": 0.5267066955566406,
1723
+ "learning_rate": 5.239637305699482e-05,
1724
+ "loss": 1.2123,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.504550698628381,
1729
+ "grad_norm": 0.45763811469078064,
1730
+ "learning_rate": 5.2180483592400694e-05,
1731
+ "loss": 1.2159,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.5066017177284964,
1736
+ "grad_norm": 0.4922376871109009,
1737
+ "learning_rate": 5.196459412780656e-05,
1738
+ "loss": 1.2456,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.5086527368286117,
1743
+ "grad_norm": 0.47043368220329285,
1744
+ "learning_rate": 5.174870466321243e-05,
1745
+ "loss": 1.2052,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.5107037559287271,
1750
+ "grad_norm": 0.5082889795303345,
1751
+ "learning_rate": 5.153281519861831e-05,
1752
+ "loss": 1.2393,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.5127547750288425,
1757
+ "grad_norm": 0.4955206513404846,
1758
+ "learning_rate": 5.131692573402418e-05,
1759
+ "loss": 1.2323,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.5148057941289578,
1764
+ "grad_norm": 0.48625460267066956,
1765
+ "learning_rate": 5.1101036269430057e-05,
1766
+ "loss": 1.206,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.5168568132290732,
1771
+ "grad_norm": 0.49060237407684326,
1772
+ "learning_rate": 5.0885146804835925e-05,
1773
+ "loss": 1.2353,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.5189078323291886,
1778
+ "grad_norm": 0.46809640526771545,
1779
+ "learning_rate": 5.0669257340241793e-05,
1780
+ "loss": 1.2287,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.520958851429304,
1785
+ "grad_norm": 0.4944596290588379,
1786
+ "learning_rate": 5.0453367875647675e-05,
1787
+ "loss": 1.2413,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.5230098705294193,
1792
+ "grad_norm": 0.46914994716644287,
1793
+ "learning_rate": 5.023747841105354e-05,
1794
+ "loss": 1.22,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.5250608896295347,
1799
+ "grad_norm": 0.4888727366924286,
1800
+ "learning_rate": 5.002158894645942e-05,
1801
+ "loss": 1.2343,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.5271119087296501,
1806
+ "grad_norm": 0.4785778522491455,
1807
+ "learning_rate": 4.980569948186529e-05,
1808
+ "loss": 1.187,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.5291629278297654,
1813
+ "grad_norm": 0.4947550594806671,
1814
+ "learning_rate": 4.958981001727116e-05,
1815
+ "loss": 1.2288,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.5312139469298808,
1820
+ "grad_norm": 0.5263291597366333,
1821
+ "learning_rate": 4.937392055267703e-05,
1822
+ "loss": 1.2044,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.5332649660299962,
1827
+ "grad_norm": 0.49239382147789,
1828
+ "learning_rate": 4.9158031088082906e-05,
1829
+ "loss": 1.1865,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.5353159851301115,
1834
+ "grad_norm": 0.48874983191490173,
1835
+ "learning_rate": 4.8942141623488775e-05,
1836
+ "loss": 1.2672,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.5373670042302269,
1841
+ "grad_norm": 0.48474863171577454,
1842
+ "learning_rate": 4.872625215889465e-05,
1843
+ "loss": 1.2359,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.5394180233303423,
1848
+ "grad_norm": 0.4978977143764496,
1849
+ "learning_rate": 4.851036269430052e-05,
1850
+ "loss": 1.2139,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.5414690424304577,
1855
+ "grad_norm": 0.5144924521446228,
1856
+ "learning_rate": 4.829447322970639e-05,
1857
+ "loss": 1.221,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.543520061530573,
1862
+ "grad_norm": 0.5082759857177734,
1863
+ "learning_rate": 4.807858376511227e-05,
1864
+ "loss": 1.2209,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.5455710806306884,
1869
+ "grad_norm": 0.4933965504169464,
1870
+ "learning_rate": 4.786269430051814e-05,
1871
+ "loss": 1.207,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.5476220997308038,
1876
+ "grad_norm": 0.49464166164398193,
1877
+ "learning_rate": 4.7646804835924005e-05,
1878
+ "loss": 1.2398,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.5496731188309191,
1883
+ "grad_norm": 0.49377110600471497,
1884
+ "learning_rate": 4.743091537132988e-05,
1885
+ "loss": 1.2451,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.5517241379310345,
1890
+ "grad_norm": 0.5111104846000671,
1891
+ "learning_rate": 4.7215025906735756e-05,
1892
+ "loss": 1.2197,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.5537751570311499,
1897
+ "grad_norm": 0.47716042399406433,
1898
+ "learning_rate": 4.699913644214163e-05,
1899
+ "loss": 1.1891,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.5558261761312652,
1904
+ "grad_norm": 0.5081655383110046,
1905
+ "learning_rate": 4.678324697754749e-05,
1906
+ "loss": 1.2507,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.5578771952313806,
1911
+ "grad_norm": 0.49036547541618347,
1912
+ "learning_rate": 4.656735751295337e-05,
1913
+ "loss": 1.1805,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.559928214331496,
1918
+ "grad_norm": 0.5139365792274475,
1919
+ "learning_rate": 4.635146804835924e-05,
1920
+ "loss": 1.2361,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.5619792334316114,
1925
+ "grad_norm": 0.5098669528961182,
1926
+ "learning_rate": 4.613557858376512e-05,
1927
+ "loss": 1.2409,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.5640302525317267,
1932
+ "grad_norm": 0.4786950349807739,
1933
+ "learning_rate": 4.5919689119170986e-05,
1934
+ "loss": 1.2067,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.5660812716318421,
1939
+ "grad_norm": 0.5063204169273376,
1940
+ "learning_rate": 4.5703799654576855e-05,
1941
+ "loss": 1.1942,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.5681322907319575,
1946
+ "grad_norm": 0.511663556098938,
1947
+ "learning_rate": 4.548791018998273e-05,
1948
+ "loss": 1.2017,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.5701833098320728,
1953
+ "grad_norm": 0.48765748739242554,
1954
+ "learning_rate": 4.5272020725388605e-05,
1955
+ "loss": 1.222,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.5722343289321882,
1960
+ "grad_norm": 0.49707624316215515,
1961
+ "learning_rate": 4.5056131260794474e-05,
1962
+ "loss": 1.2075,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.5742853480323036,
1967
+ "grad_norm": 0.5067517757415771,
1968
+ "learning_rate": 4.484024179620035e-05,
1969
+ "loss": 1.211,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.5763363671324189,
1974
+ "grad_norm": 0.4615229368209839,
1975
+ "learning_rate": 4.462435233160622e-05,
1976
+ "loss": 1.2303,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.5783873862325343,
1981
+ "grad_norm": 0.4948524236679077,
1982
+ "learning_rate": 4.440846286701209e-05,
1983
+ "loss": 1.2024,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.5804384053326497,
1988
+ "grad_norm": 0.5140314102172852,
1989
+ "learning_rate": 4.419257340241796e-05,
1990
+ "loss": 1.2217,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.5824894244327651,
1995
+ "grad_norm": 0.5108122825622559,
1996
+ "learning_rate": 4.3976683937823836e-05,
1997
+ "loss": 1.1838,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.5845404435328804,
2002
+ "grad_norm": 0.5021159052848816,
2003
+ "learning_rate": 4.376079447322971e-05,
2004
+ "loss": 1.2418,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.5865914626329958,
2009
+ "grad_norm": 0.5086933374404907,
2010
+ "learning_rate": 4.354490500863558e-05,
2011
+ "loss": 1.2321,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.5886424817331112,
2016
+ "grad_norm": 0.5083547830581665,
2017
+ "learning_rate": 4.332901554404145e-05,
2018
+ "loss": 1.2035,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.5906935008332265,
2023
+ "grad_norm": 0.4828626215457916,
2024
+ "learning_rate": 4.311312607944732e-05,
2025
+ "loss": 1.2302,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.5927445199333419,
2030
+ "grad_norm": 0.5140969157218933,
2031
+ "learning_rate": 4.28972366148532e-05,
2032
+ "loss": 1.2058,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.5947955390334573,
2037
+ "grad_norm": 0.497364342212677,
2038
+ "learning_rate": 4.2681347150259074e-05,
2039
+ "loss": 1.2382,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.5968465581335726,
2044
+ "grad_norm": 0.49104997515678406,
2045
+ "learning_rate": 4.246545768566494e-05,
2046
+ "loss": 1.2322,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.598897577233688,
2051
+ "grad_norm": 0.521659255027771,
2052
+ "learning_rate": 4.224956822107081e-05,
2053
+ "loss": 1.1868,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.6009485963338034,
2058
+ "grad_norm": 0.5175550580024719,
2059
+ "learning_rate": 4.2033678756476686e-05,
2060
+ "loss": 1.2169,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.6029996154339188,
2065
+ "grad_norm": 0.4998300075531006,
2066
+ "learning_rate": 4.181778929188256e-05,
2067
+ "loss": 1.2227,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.6050506345340341,
2072
+ "grad_norm": 0.4932349622249603,
2073
+ "learning_rate": 4.160189982728843e-05,
2074
+ "loss": 1.2371,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.6071016536341495,
2079
+ "grad_norm": 0.5610498189926147,
2080
+ "learning_rate": 4.1386010362694304e-05,
2081
+ "loss": 1.2105,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.6091526727342649,
2086
+ "grad_norm": 0.4975990355014801,
2087
+ "learning_rate": 4.117012089810017e-05,
2088
+ "loss": 1.2511,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.6112036918343802,
2093
+ "grad_norm": 0.5154693722724915,
2094
+ "learning_rate": 4.095423143350605e-05,
2095
+ "loss": 1.2399,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.6132547109344956,
2100
+ "grad_norm": 0.4968002736568451,
2101
+ "learning_rate": 4.0738341968911916e-05,
2102
+ "loss": 1.2041,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.615305730034611,
2107
+ "grad_norm": 0.4866868555545807,
2108
+ "learning_rate": 4.052245250431779e-05,
2109
+ "loss": 1.1965,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.6173567491347263,
2114
+ "grad_norm": 0.5152925848960876,
2115
+ "learning_rate": 4.030656303972367e-05,
2116
+ "loss": 1.2298,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.6194077682348417,
2121
+ "grad_norm": 0.513058602809906,
2122
+ "learning_rate": 4.0090673575129535e-05,
2123
+ "loss": 1.2414,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.6214587873349571,
2128
+ "grad_norm": 0.5031930208206177,
2129
+ "learning_rate": 3.987478411053541e-05,
2130
+ "loss": 1.1766,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.6235098064350725,
2135
+ "grad_norm": 0.5087730288505554,
2136
+ "learning_rate": 3.965889464594128e-05,
2137
+ "loss": 1.229,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.6255608255351878,
2142
+ "grad_norm": 0.4878797233104706,
2143
+ "learning_rate": 3.9443005181347154e-05,
2144
+ "loss": 1.2018,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.6276118446353032,
2149
+ "grad_norm": 0.5124858617782593,
2150
+ "learning_rate": 3.922711571675303e-05,
2151
+ "loss": 1.1848,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.6296628637354186,
2156
+ "grad_norm": 0.49720969796180725,
2157
+ "learning_rate": 3.90112262521589e-05,
2158
+ "loss": 1.1892,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.6317138828355339,
2163
+ "grad_norm": 0.49900123476982117,
2164
+ "learning_rate": 3.8795336787564766e-05,
2165
+ "loss": 1.2027,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.6337649019356493,
2170
+ "grad_norm": 0.5007952451705933,
2171
+ "learning_rate": 3.857944732297064e-05,
2172
+ "loss": 1.2373,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.6358159210357647,
2177
+ "grad_norm": 0.49481576681137085,
2178
+ "learning_rate": 3.8363557858376516e-05,
2179
+ "loss": 1.2294,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.63786694013588,
2184
+ "grad_norm": 0.4979318082332611,
2185
+ "learning_rate": 3.8147668393782385e-05,
2186
+ "loss": 1.2312,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.6399179592359954,
2191
+ "grad_norm": 0.49939480423927307,
2192
+ "learning_rate": 3.793177892918825e-05,
2193
+ "loss": 1.2394,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.6419689783361108,
2198
+ "grad_norm": 0.5186517834663391,
2199
+ "learning_rate": 3.771588946459413e-05,
2200
+ "loss": 1.199,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.6440199974362262,
2205
+ "grad_norm": 0.5386569499969482,
2206
+ "learning_rate": 3.7500000000000003e-05,
2207
+ "loss": 1.1801,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.6460710165363415,
2212
+ "grad_norm": 0.5134577751159668,
2213
+ "learning_rate": 3.728411053540587e-05,
2214
+ "loss": 1.2286,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.6481220356364569,
2219
+ "grad_norm": 0.5191785097122192,
2220
+ "learning_rate": 3.706822107081175e-05,
2221
+ "loss": 1.2068,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.6501730547365723,
2226
+ "grad_norm": 0.4857168197631836,
2227
+ "learning_rate": 3.6852331606217615e-05,
2228
+ "loss": 1.2116,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.6522240738366876,
2233
+ "grad_norm": 0.5283413529396057,
2234
+ "learning_rate": 3.663644214162349e-05,
2235
+ "loss": 1.1792,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.654275092936803,
2240
+ "grad_norm": 0.528938353061676,
2241
+ "learning_rate": 3.6420552677029366e-05,
2242
+ "loss": 1.1963,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.6563261120369184,
2247
+ "grad_norm": 0.5067134499549866,
2248
+ "learning_rate": 3.6204663212435234e-05,
2249
+ "loss": 1.2476,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 0.6583771311370337,
2254
+ "grad_norm": 0.4993511736392975,
2255
+ "learning_rate": 3.598877374784111e-05,
2256
+ "loss": 1.2273,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 0.6604281502371491,
2261
+ "grad_norm": 0.5275943279266357,
2262
+ "learning_rate": 3.577288428324698e-05,
2263
+ "loss": 1.2287,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 0.6624791693372645,
2268
+ "grad_norm": 0.49331194162368774,
2269
+ "learning_rate": 3.555699481865285e-05,
2270
+ "loss": 1.1794,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 0.6645301884373799,
2275
+ "grad_norm": 0.5065453052520752,
2276
+ "learning_rate": 3.534110535405872e-05,
2277
+ "loss": 1.2342,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 0.6665812075374952,
2282
+ "grad_norm": 0.5334459543228149,
2283
+ "learning_rate": 3.51252158894646e-05,
2284
+ "loss": 1.1782,
2285
+ "step": 3250
2286
+ },
2287
+ {
2288
+ "epoch": 0.6686322266376106,
2289
+ "grad_norm": 0.535772979259491,
2290
+ "learning_rate": 3.490932642487047e-05,
2291
+ "loss": 1.2108,
2292
+ "step": 3260
2293
+ },
2294
+ {
2295
+ "epoch": 0.670683245737726,
2296
+ "grad_norm": 0.5377807021141052,
2297
+ "learning_rate": 3.469343696027634e-05,
2298
+ "loss": 1.1903,
2299
+ "step": 3270
2300
+ },
2301
+ {
2302
+ "epoch": 0.6727342648378413,
2303
+ "grad_norm": 0.5266278386116028,
2304
+ "learning_rate": 3.447754749568221e-05,
2305
+ "loss": 1.2183,
2306
+ "step": 3280
2307
+ },
2308
+ {
2309
+ "epoch": 0.6747852839379567,
2310
+ "grad_norm": 0.4987232983112335,
2311
+ "learning_rate": 3.4261658031088084e-05,
2312
+ "loss": 1.1915,
2313
+ "step": 3290
2314
+ },
2315
+ {
2316
+ "epoch": 0.6768363030380721,
2317
+ "grad_norm": 0.5178554058074951,
2318
+ "learning_rate": 3.404576856649396e-05,
2319
+ "loss": 1.179,
2320
+ "step": 3300
2321
+ },
2322
+ {
2323
+ "epoch": 0.6788873221381874,
2324
+ "grad_norm": 0.5086014270782471,
2325
+ "learning_rate": 3.382987910189983e-05,
2326
+ "loss": 1.2298,
2327
+ "step": 3310
2328
+ },
2329
+ {
2330
+ "epoch": 0.6809383412383028,
2331
+ "grad_norm": 0.5420427918434143,
2332
+ "learning_rate": 3.3613989637305696e-05,
2333
+ "loss": 1.2072,
2334
+ "step": 3320
2335
+ },
2336
+ {
2337
+ "epoch": 0.6829893603384182,
2338
+ "grad_norm": 0.5170331001281738,
2339
+ "learning_rate": 3.339810017271157e-05,
2340
+ "loss": 1.2252,
2341
+ "step": 3330
2342
+ },
2343
+ {
2344
+ "epoch": 0.6850403794385336,
2345
+ "grad_norm": 0.48680609464645386,
2346
+ "learning_rate": 3.3182210708117446e-05,
2347
+ "loss": 1.2059,
2348
+ "step": 3340
2349
+ },
2350
+ {
2351
+ "epoch": 0.6870913985386489,
2352
+ "grad_norm": 0.5035340189933777,
2353
+ "learning_rate": 3.296632124352332e-05,
2354
+ "loss": 1.2009,
2355
+ "step": 3350
2356
+ },
2357
+ {
2358
+ "epoch": 0.6891424176387643,
2359
+ "grad_norm": 0.513165295124054,
2360
+ "learning_rate": 3.275043177892919e-05,
2361
+ "loss": 1.1844,
2362
+ "step": 3360
2363
+ },
2364
+ {
2365
+ "epoch": 0.6911934367388797,
2366
+ "grad_norm": 0.5243003368377686,
2367
+ "learning_rate": 3.2534542314335065e-05,
2368
+ "loss": 1.2009,
2369
+ "step": 3370
2370
+ },
2371
+ {
2372
+ "epoch": 0.693244455838995,
2373
+ "grad_norm": 0.5219825506210327,
2374
+ "learning_rate": 3.2318652849740933e-05,
2375
+ "loss": 1.2039,
2376
+ "step": 3380
2377
+ },
2378
+ {
2379
+ "epoch": 0.6952954749391104,
2380
+ "grad_norm": 0.5202507972717285,
2381
+ "learning_rate": 3.210276338514681e-05,
2382
+ "loss": 1.225,
2383
+ "step": 3390
2384
+ },
2385
+ {
2386
+ "epoch": 0.6973464940392258,
2387
+ "grad_norm": 0.5152229070663452,
2388
+ "learning_rate": 3.188687392055268e-05,
2389
+ "loss": 1.1886,
2390
+ "step": 3400
2391
+ },
2392
+ {
2393
+ "epoch": 0.6993975131393411,
2394
+ "grad_norm": 0.5382890701293945,
2395
+ "learning_rate": 3.167098445595855e-05,
2396
+ "loss": 1.2113,
2397
+ "step": 3410
2398
+ },
2399
+ {
2400
+ "epoch": 0.7014485322394565,
2401
+ "grad_norm": 0.5525237917900085,
2402
+ "learning_rate": 3.145509499136443e-05,
2403
+ "loss": 1.2283,
2404
+ "step": 3420
2405
+ },
2406
+ {
2407
+ "epoch": 0.7034995513395719,
2408
+ "grad_norm": 0.5308887958526611,
2409
+ "learning_rate": 3.1239205526770296e-05,
2410
+ "loss": 1.2311,
2411
+ "step": 3430
2412
+ },
2413
+ {
2414
+ "epoch": 0.7055505704396873,
2415
+ "grad_norm": 0.5247687697410583,
2416
+ "learning_rate": 3.1023316062176164e-05,
2417
+ "loss": 1.1946,
2418
+ "step": 3440
2419
+ },
2420
+ {
2421
+ "epoch": 0.7076015895398026,
2422
+ "grad_norm": 0.5322206616401672,
2423
+ "learning_rate": 3.080742659758204e-05,
2424
+ "loss": 1.2198,
2425
+ "step": 3450
2426
+ },
2427
+ {
2428
+ "epoch": 0.709652608639918,
2429
+ "grad_norm": 0.5104162693023682,
2430
+ "learning_rate": 3.0591537132987915e-05,
2431
+ "loss": 1.2105,
2432
+ "step": 3460
2433
+ },
2434
+ {
2435
+ "epoch": 0.7117036277400334,
2436
+ "grad_norm": 0.4890803098678589,
2437
+ "learning_rate": 3.0375647668393786e-05,
2438
+ "loss": 1.2074,
2439
+ "step": 3470
2440
+ },
2441
+ {
2442
+ "epoch": 0.7137546468401487,
2443
+ "grad_norm": 0.529225766658783,
2444
+ "learning_rate": 3.0159758203799655e-05,
2445
+ "loss": 1.2321,
2446
+ "step": 3480
2447
+ },
2448
+ {
2449
+ "epoch": 0.7158056659402641,
2450
+ "grad_norm": 0.5252069234848022,
2451
+ "learning_rate": 2.9943868739205527e-05,
2452
+ "loss": 1.1995,
2453
+ "step": 3490
2454
+ },
2455
+ {
2456
+ "epoch": 0.7178566850403795,
2457
+ "grad_norm": 0.5369967818260193,
2458
+ "learning_rate": 2.9727979274611402e-05,
2459
+ "loss": 1.2234,
2460
+ "step": 3500
2461
+ },
2462
+ {
2463
+ "epoch": 0.7199077041404948,
2464
+ "grad_norm": 0.5053485631942749,
2465
+ "learning_rate": 2.9512089810017274e-05,
2466
+ "loss": 1.2035,
2467
+ "step": 3510
2468
+ },
2469
+ {
2470
+ "epoch": 0.7219587232406102,
2471
+ "grad_norm": 0.5131696462631226,
2472
+ "learning_rate": 2.929620034542315e-05,
2473
+ "loss": 1.2681,
2474
+ "step": 3520
2475
+ },
2476
+ {
2477
+ "epoch": 0.7240097423407256,
2478
+ "grad_norm": 0.5332499742507935,
2479
+ "learning_rate": 2.9080310880829014e-05,
2480
+ "loss": 1.2039,
2481
+ "step": 3530
2482
+ },
2483
+ {
2484
+ "epoch": 0.7260607614408409,
2485
+ "grad_norm": 0.5105617046356201,
2486
+ "learning_rate": 2.886442141623489e-05,
2487
+ "loss": 1.2,
2488
+ "step": 3540
2489
+ },
2490
+ {
2491
+ "epoch": 0.7281117805409563,
2492
+ "grad_norm": 0.5197264552116394,
2493
+ "learning_rate": 2.864853195164076e-05,
2494
+ "loss": 1.1821,
2495
+ "step": 3550
2496
+ },
2497
+ {
2498
+ "epoch": 0.7301627996410717,
2499
+ "grad_norm": 0.505455493927002,
2500
+ "learning_rate": 2.8432642487046636e-05,
2501
+ "loss": 1.2158,
2502
+ "step": 3560
2503
+ },
2504
+ {
2505
+ "epoch": 0.7322138187411871,
2506
+ "grad_norm": 0.5290804505348206,
2507
+ "learning_rate": 2.8216753022452508e-05,
2508
+ "loss": 1.174,
2509
+ "step": 3570
2510
+ },
2511
+ {
2512
+ "epoch": 0.7342648378413024,
2513
+ "grad_norm": 0.5349313020706177,
2514
+ "learning_rate": 2.8000863557858376e-05,
2515
+ "loss": 1.2301,
2516
+ "step": 3580
2517
+ },
2518
+ {
2519
+ "epoch": 0.7363158569414178,
2520
+ "grad_norm": 0.4875812530517578,
2521
+ "learning_rate": 2.7784974093264248e-05,
2522
+ "loss": 1.2015,
2523
+ "step": 3590
2524
+ },
2525
+ {
2526
+ "epoch": 0.7383668760415332,
2527
+ "grad_norm": 0.5164597630500793,
2528
+ "learning_rate": 2.7569084628670123e-05,
2529
+ "loss": 1.2294,
2530
+ "step": 3600
2531
+ },
2532
+ {
2533
+ "epoch": 0.7404178951416485,
2534
+ "grad_norm": 0.5129172801971436,
2535
+ "learning_rate": 2.7353195164075995e-05,
2536
+ "loss": 1.2122,
2537
+ "step": 3610
2538
+ },
2539
+ {
2540
+ "epoch": 0.7424689142417639,
2541
+ "grad_norm": 0.5218586921691895,
2542
+ "learning_rate": 2.713730569948187e-05,
2543
+ "loss": 1.2002,
2544
+ "step": 3620
2545
+ },
2546
+ {
2547
+ "epoch": 0.7445199333418793,
2548
+ "grad_norm": 0.5423296093940735,
2549
+ "learning_rate": 2.6921416234887735e-05,
2550
+ "loss": 1.1685,
2551
+ "step": 3630
2552
+ },
2553
+ {
2554
+ "epoch": 0.7465709524419946,
2555
+ "grad_norm": 0.5151218771934509,
2556
+ "learning_rate": 2.670552677029361e-05,
2557
+ "loss": 1.2167,
2558
+ "step": 3640
2559
+ },
2560
+ {
2561
+ "epoch": 0.74862197154211,
2562
+ "grad_norm": 0.5160235166549683,
2563
+ "learning_rate": 2.6489637305699482e-05,
2564
+ "loss": 1.2269,
2565
+ "step": 3650
2566
+ },
2567
+ {
2568
+ "epoch": 0.7506729906422254,
2569
+ "grad_norm": 0.5056514143943787,
2570
+ "learning_rate": 2.6273747841105357e-05,
2571
+ "loss": 1.2467,
2572
+ "step": 3660
2573
+ },
2574
+ {
2575
+ "epoch": 0.7527240097423408,
2576
+ "grad_norm": 0.52911776304245,
2577
+ "learning_rate": 2.605785837651123e-05,
2578
+ "loss": 1.2182,
2579
+ "step": 3670
2580
+ },
2581
+ {
2582
+ "epoch": 0.7547750288424561,
2583
+ "grad_norm": 0.5172019600868225,
2584
+ "learning_rate": 2.5841968911917097e-05,
2585
+ "loss": 1.1888,
2586
+ "step": 3680
2587
+ },
2588
+ {
2589
+ "epoch": 0.7568260479425715,
2590
+ "grad_norm": 0.5043123960494995,
2591
+ "learning_rate": 2.562607944732297e-05,
2592
+ "loss": 1.2004,
2593
+ "step": 3690
2594
+ },
2595
+ {
2596
+ "epoch": 0.7588770670426869,
2597
+ "grad_norm": 0.5103533267974854,
2598
+ "learning_rate": 2.5410189982728844e-05,
2599
+ "loss": 1.1627,
2600
+ "step": 3700
2601
+ },
2602
+ {
2603
+ "epoch": 0.7609280861428022,
2604
+ "grad_norm": 0.5295760631561279,
2605
+ "learning_rate": 2.5194300518134716e-05,
2606
+ "loss": 1.1604,
2607
+ "step": 3710
2608
+ },
2609
+ {
2610
+ "epoch": 0.7629791052429176,
2611
+ "grad_norm": 0.5427724719047546,
2612
+ "learning_rate": 2.4978411053540588e-05,
2613
+ "loss": 1.1781,
2614
+ "step": 3720
2615
+ },
2616
+ {
2617
+ "epoch": 0.765030124343033,
2618
+ "grad_norm": 0.5164818167686462,
2619
+ "learning_rate": 2.476252158894646e-05,
2620
+ "loss": 1.2208,
2621
+ "step": 3730
2622
+ },
2623
+ {
2624
+ "epoch": 0.7670811434431483,
2625
+ "grad_norm": 0.5196744799613953,
2626
+ "learning_rate": 2.4546632124352335e-05,
2627
+ "loss": 1.1971,
2628
+ "step": 3740
2629
+ },
2630
+ {
2631
+ "epoch": 0.7691321625432637,
2632
+ "grad_norm": 0.5128475427627563,
2633
+ "learning_rate": 2.4330742659758203e-05,
2634
+ "loss": 1.1909,
2635
+ "step": 3750
2636
+ },
2637
+ {
2638
+ "epoch": 0.7711831816433791,
2639
+ "grad_norm": 0.49743902683258057,
2640
+ "learning_rate": 2.411485319516408e-05,
2641
+ "loss": 1.2109,
2642
+ "step": 3760
2643
+ },
2644
+ {
2645
+ "epoch": 0.7732342007434945,
2646
+ "grad_norm": 0.5152381658554077,
2647
+ "learning_rate": 2.3898963730569947e-05,
2648
+ "loss": 1.2228,
2649
+ "step": 3770
2650
+ },
2651
+ {
2652
+ "epoch": 0.7752852198436098,
2653
+ "grad_norm": 0.5446299910545349,
2654
+ "learning_rate": 2.3683074265975822e-05,
2655
+ "loss": 1.1953,
2656
+ "step": 3780
2657
+ },
2658
+ {
2659
+ "epoch": 0.7773362389437252,
2660
+ "grad_norm": 0.5300847291946411,
2661
+ "learning_rate": 2.3467184801381694e-05,
2662
+ "loss": 1.1843,
2663
+ "step": 3790
2664
+ },
2665
+ {
2666
+ "epoch": 0.7793872580438406,
2667
+ "grad_norm": 0.5129801630973816,
2668
+ "learning_rate": 2.3251295336787566e-05,
2669
+ "loss": 1.1809,
2670
+ "step": 3800
2671
+ },
2672
+ {
2673
+ "epoch": 0.7814382771439559,
2674
+ "grad_norm": 0.549198567867279,
2675
+ "learning_rate": 2.3035405872193438e-05,
2676
+ "loss": 1.2099,
2677
+ "step": 3810
2678
+ },
2679
+ {
2680
+ "epoch": 0.7834892962440713,
2681
+ "grad_norm": 0.5118544101715088,
2682
+ "learning_rate": 2.281951640759931e-05,
2683
+ "loss": 1.2149,
2684
+ "step": 3820
2685
+ },
2686
+ {
2687
+ "epoch": 0.7855403153441867,
2688
+ "grad_norm": 0.5479713082313538,
2689
+ "learning_rate": 2.260362694300518e-05,
2690
+ "loss": 1.1771,
2691
+ "step": 3830
2692
+ },
2693
+ {
2694
+ "epoch": 0.787591334444302,
2695
+ "grad_norm": 0.541350245475769,
2696
+ "learning_rate": 2.2387737478411056e-05,
2697
+ "loss": 1.1737,
2698
+ "step": 3840
2699
+ },
2700
+ {
2701
+ "epoch": 0.7896423535444174,
2702
+ "grad_norm": 0.5543351769447327,
2703
+ "learning_rate": 2.2171848013816925e-05,
2704
+ "loss": 1.2233,
2705
+ "step": 3850
2706
+ },
2707
+ {
2708
+ "epoch": 0.7916933726445328,
2709
+ "grad_norm": 0.5010188817977905,
2710
+ "learning_rate": 2.19559585492228e-05,
2711
+ "loss": 1.1938,
2712
+ "step": 3860
2713
+ },
2714
+ {
2715
+ "epoch": 0.7937443917446482,
2716
+ "grad_norm": 0.5245205760002136,
2717
+ "learning_rate": 2.1740069084628672e-05,
2718
+ "loss": 1.2015,
2719
+ "step": 3870
2720
+ },
2721
+ {
2722
+ "epoch": 0.7957954108447635,
2723
+ "grad_norm": 0.5324139595031738,
2724
+ "learning_rate": 2.1524179620034544e-05,
2725
+ "loss": 1.2248,
2726
+ "step": 3880
2727
+ },
2728
+ {
2729
+ "epoch": 0.7978464299448789,
2730
+ "grad_norm": 0.5172831416130066,
2731
+ "learning_rate": 2.1308290155440415e-05,
2732
+ "loss": 1.1992,
2733
+ "step": 3890
2734
+ },
2735
+ {
2736
+ "epoch": 0.7998974490449943,
2737
+ "grad_norm": 0.5434138178825378,
2738
+ "learning_rate": 2.1092400690846287e-05,
2739
+ "loss": 1.1813,
2740
+ "step": 3900
2741
+ },
2742
+ {
2743
+ "epoch": 0.8019484681451096,
2744
+ "grad_norm": 0.5221844911575317,
2745
+ "learning_rate": 2.087651122625216e-05,
2746
+ "loss": 1.1625,
2747
+ "step": 3910
2748
+ },
2749
+ {
2750
+ "epoch": 0.803999487245225,
2751
+ "grad_norm": 0.5027469992637634,
2752
+ "learning_rate": 2.0660621761658034e-05,
2753
+ "loss": 1.181,
2754
+ "step": 3920
2755
+ },
2756
+ {
2757
+ "epoch": 0.8060505063453404,
2758
+ "grad_norm": 0.5298044085502625,
2759
+ "learning_rate": 2.0444732297063903e-05,
2760
+ "loss": 1.2079,
2761
+ "step": 3930
2762
+ },
2763
+ {
2764
+ "epoch": 0.8081015254454557,
2765
+ "grad_norm": 0.5463908910751343,
2766
+ "learning_rate": 2.0228842832469778e-05,
2767
+ "loss": 1.2009,
2768
+ "step": 3940
2769
+ },
2770
+ {
2771
+ "epoch": 0.8101525445455711,
2772
+ "grad_norm": 0.5394027233123779,
2773
+ "learning_rate": 2.0012953367875646e-05,
2774
+ "loss": 1.1931,
2775
+ "step": 3950
2776
+ },
2777
+ {
2778
+ "epoch": 0.8122035636456865,
2779
+ "grad_norm": 0.5041294097900391,
2780
+ "learning_rate": 1.979706390328152e-05,
2781
+ "loss": 1.2107,
2782
+ "step": 3960
2783
+ },
2784
+ {
2785
+ "epoch": 0.8142545827458019,
2786
+ "grad_norm": 0.5223291516304016,
2787
+ "learning_rate": 1.9581174438687393e-05,
2788
+ "loss": 1.1775,
2789
+ "step": 3970
2790
+ },
2791
+ {
2792
+ "epoch": 0.8163056018459172,
2793
+ "grad_norm": 0.5221052169799805,
2794
+ "learning_rate": 1.9365284974093265e-05,
2795
+ "loss": 1.2052,
2796
+ "step": 3980
2797
+ },
2798
+ {
2799
+ "epoch": 0.8183566209460326,
2800
+ "grad_norm": 0.5229529738426208,
2801
+ "learning_rate": 1.9149395509499137e-05,
2802
+ "loss": 1.1922,
2803
+ "step": 3990
2804
+ },
2805
+ {
2806
+ "epoch": 0.820407640046148,
2807
+ "grad_norm": 0.5651980042457581,
2808
+ "learning_rate": 1.893350604490501e-05,
2809
+ "loss": 1.2043,
2810
+ "step": 4000
2811
+ },
2812
+ {
2813
+ "epoch": 0.8224586591462633,
2814
+ "grad_norm": 0.5169751644134521,
2815
+ "learning_rate": 1.871761658031088e-05,
2816
+ "loss": 1.2157,
2817
+ "step": 4010
2818
+ },
2819
+ {
2820
+ "epoch": 0.8245096782463787,
2821
+ "grad_norm": 0.5741276144981384,
2822
+ "learning_rate": 1.8501727115716755e-05,
2823
+ "loss": 1.2112,
2824
+ "step": 4020
2825
+ },
2826
+ {
2827
+ "epoch": 0.8265606973464941,
2828
+ "grad_norm": 0.530596137046814,
2829
+ "learning_rate": 1.8285837651122624e-05,
2830
+ "loss": 1.2535,
2831
+ "step": 4030
2832
+ },
2833
+ {
2834
+ "epoch": 0.8286117164466094,
2835
+ "grad_norm": 0.5436383485794067,
2836
+ "learning_rate": 1.80699481865285e-05,
2837
+ "loss": 1.1789,
2838
+ "step": 4040
2839
+ },
2840
+ {
2841
+ "epoch": 0.8306627355467248,
2842
+ "grad_norm": 0.5238965749740601,
2843
+ "learning_rate": 1.7854058721934368e-05,
2844
+ "loss": 1.1645,
2845
+ "step": 4050
2846
+ },
2847
+ {
2848
+ "epoch": 0.8327137546468402,
2849
+ "grad_norm": 0.5226778388023376,
2850
+ "learning_rate": 1.7638169257340243e-05,
2851
+ "loss": 1.2238,
2852
+ "step": 4060
2853
+ },
2854
+ {
2855
+ "epoch": 0.8347647737469556,
2856
+ "grad_norm": 0.5810254812240601,
2857
+ "learning_rate": 1.7422279792746114e-05,
2858
+ "loss": 1.2212,
2859
+ "step": 4070
2860
+ },
2861
+ {
2862
+ "epoch": 0.8368157928470709,
2863
+ "grad_norm": 0.5228540301322937,
2864
+ "learning_rate": 1.7206390328151986e-05,
2865
+ "loss": 1.2025,
2866
+ "step": 4080
2867
+ },
2868
+ {
2869
+ "epoch": 0.8388668119471863,
2870
+ "grad_norm": 0.5112829804420471,
2871
+ "learning_rate": 1.6990500863557858e-05,
2872
+ "loss": 1.1838,
2873
+ "step": 4090
2874
+ },
2875
+ {
2876
+ "epoch": 0.8409178310473017,
2877
+ "grad_norm": 0.5092179775238037,
2878
+ "learning_rate": 1.6774611398963733e-05,
2879
+ "loss": 1.1981,
2880
+ "step": 4100
2881
+ },
2882
+ {
2883
+ "epoch": 0.842968850147417,
2884
+ "grad_norm": 0.5236721634864807,
2885
+ "learning_rate": 1.65587219343696e-05,
2886
+ "loss": 1.1994,
2887
+ "step": 4110
2888
+ },
2889
+ {
2890
+ "epoch": 0.8450198692475324,
2891
+ "grad_norm": 0.5067551732063293,
2892
+ "learning_rate": 1.6342832469775477e-05,
2893
+ "loss": 1.1758,
2894
+ "step": 4120
2895
+ },
2896
+ {
2897
+ "epoch": 0.8470708883476478,
2898
+ "grad_norm": 0.5471055507659912,
2899
+ "learning_rate": 1.6126943005181345e-05,
2900
+ "loss": 1.2315,
2901
+ "step": 4130
2902
+ },
2903
+ {
2904
+ "epoch": 0.8491219074477631,
2905
+ "grad_norm": 0.514798641204834,
2906
+ "learning_rate": 1.591105354058722e-05,
2907
+ "loss": 1.183,
2908
+ "step": 4140
2909
+ },
2910
+ {
2911
+ "epoch": 0.8511729265478785,
2912
+ "grad_norm": 0.5316623449325562,
2913
+ "learning_rate": 1.5695164075993092e-05,
2914
+ "loss": 1.1997,
2915
+ "step": 4150
2916
+ },
2917
+ {
2918
+ "epoch": 0.8532239456479939,
2919
+ "grad_norm": 0.531896710395813,
2920
+ "learning_rate": 1.5479274611398964e-05,
2921
+ "loss": 1.1967,
2922
+ "step": 4160
2923
+ },
2924
+ {
2925
+ "epoch": 0.8552749647481093,
2926
+ "grad_norm": 0.5044012665748596,
2927
+ "learning_rate": 1.5263385146804836e-05,
2928
+ "loss": 1.2061,
2929
+ "step": 4170
2930
+ },
2931
+ {
2932
+ "epoch": 0.8573259838482246,
2933
+ "grad_norm": 0.547264039516449,
2934
+ "learning_rate": 1.5047495682210708e-05,
2935
+ "loss": 1.1975,
2936
+ "step": 4180
2937
+ },
2938
+ {
2939
+ "epoch": 0.85937700294834,
2940
+ "grad_norm": 0.5514972805976868,
2941
+ "learning_rate": 1.4831606217616581e-05,
2942
+ "loss": 1.2044,
2943
+ "step": 4190
2944
+ },
2945
+ {
2946
+ "epoch": 0.8614280220484554,
2947
+ "grad_norm": 0.5322652459144592,
2948
+ "learning_rate": 1.4615716753022455e-05,
2949
+ "loss": 1.2044,
2950
+ "step": 4200
2951
+ },
2952
+ {
2953
+ "epoch": 0.8634790411485707,
2954
+ "grad_norm": 0.5309359431266785,
2955
+ "learning_rate": 1.4399827288428325e-05,
2956
+ "loss": 1.2066,
2957
+ "step": 4210
2958
+ },
2959
+ {
2960
+ "epoch": 0.8655300602486861,
2961
+ "grad_norm": 0.5314792394638062,
2962
+ "learning_rate": 1.4183937823834198e-05,
2963
+ "loss": 1.2006,
2964
+ "step": 4220
2965
+ },
2966
+ {
2967
+ "epoch": 0.8675810793488015,
2968
+ "grad_norm": 0.5549922585487366,
2969
+ "learning_rate": 1.3968048359240068e-05,
2970
+ "loss": 1.2058,
2971
+ "step": 4230
2972
+ },
2973
+ {
2974
+ "epoch": 0.8696320984489168,
2975
+ "grad_norm": 0.5373049378395081,
2976
+ "learning_rate": 1.3752158894645942e-05,
2977
+ "loss": 1.2002,
2978
+ "step": 4240
2979
+ },
2980
+ {
2981
+ "epoch": 0.8716831175490322,
2982
+ "grad_norm": 0.5322666764259338,
2983
+ "learning_rate": 1.3536269430051815e-05,
2984
+ "loss": 1.215,
2985
+ "step": 4250
2986
+ },
2987
+ {
2988
+ "epoch": 0.8737341366491476,
2989
+ "grad_norm": 0.5549564957618713,
2990
+ "learning_rate": 1.3320379965457685e-05,
2991
+ "loss": 1.2131,
2992
+ "step": 4260
2993
+ },
2994
+ {
2995
+ "epoch": 0.875785155749263,
2996
+ "grad_norm": 0.5308319926261902,
2997
+ "learning_rate": 1.3104490500863559e-05,
2998
+ "loss": 1.2203,
2999
+ "step": 4270
3000
+ },
3001
+ {
3002
+ "epoch": 0.8778361748493783,
3003
+ "grad_norm": 0.5089017152786255,
3004
+ "learning_rate": 1.2888601036269432e-05,
3005
+ "loss": 1.1801,
3006
+ "step": 4280
3007
+ },
3008
+ {
3009
+ "epoch": 0.8798871939494937,
3010
+ "grad_norm": 0.5377966165542603,
3011
+ "learning_rate": 1.2672711571675302e-05,
3012
+ "loss": 1.189,
3013
+ "step": 4290
3014
+ },
3015
+ {
3016
+ "epoch": 0.8819382130496091,
3017
+ "grad_norm": 0.5528485178947449,
3018
+ "learning_rate": 1.2456822107081174e-05,
3019
+ "loss": 1.2197,
3020
+ "step": 4300
3021
+ },
3022
+ {
3023
+ "epoch": 0.8839892321497244,
3024
+ "grad_norm": 0.5241679549217224,
3025
+ "learning_rate": 1.2240932642487048e-05,
3026
+ "loss": 1.1652,
3027
+ "step": 4310
3028
+ },
3029
+ {
3030
+ "epoch": 0.8860402512498398,
3031
+ "grad_norm": 0.5626764893531799,
3032
+ "learning_rate": 1.202504317789292e-05,
3033
+ "loss": 1.1805,
3034
+ "step": 4320
3035
+ },
3036
+ {
3037
+ "epoch": 0.8880912703499552,
3038
+ "grad_norm": 0.5248028635978699,
3039
+ "learning_rate": 1.1809153713298791e-05,
3040
+ "loss": 1.1652,
3041
+ "step": 4330
3042
+ },
3043
+ {
3044
+ "epoch": 0.8901422894500705,
3045
+ "grad_norm": 0.5452848672866821,
3046
+ "learning_rate": 1.1593264248704663e-05,
3047
+ "loss": 1.2171,
3048
+ "step": 4340
3049
+ },
3050
+ {
3051
+ "epoch": 0.8921933085501859,
3052
+ "grad_norm": 0.5505712628364563,
3053
+ "learning_rate": 1.1377374784110537e-05,
3054
+ "loss": 1.1967,
3055
+ "step": 4350
3056
+ },
3057
+ {
3058
+ "epoch": 0.8942443276503013,
3059
+ "grad_norm": 0.5437038540840149,
3060
+ "learning_rate": 1.1161485319516408e-05,
3061
+ "loss": 1.2216,
3062
+ "step": 4360
3063
+ },
3064
+ {
3065
+ "epoch": 0.8962953467504167,
3066
+ "grad_norm": 0.5138014554977417,
3067
+ "learning_rate": 1.094559585492228e-05,
3068
+ "loss": 1.193,
3069
+ "step": 4370
3070
+ },
3071
+ {
3072
+ "epoch": 0.898346365850532,
3073
+ "grad_norm": 0.542080283164978,
3074
+ "learning_rate": 1.0729706390328152e-05,
3075
+ "loss": 1.1677,
3076
+ "step": 4380
3077
+ },
3078
+ {
3079
+ "epoch": 0.9003973849506474,
3080
+ "grad_norm": 0.5166792273521423,
3081
+ "learning_rate": 1.0513816925734024e-05,
3082
+ "loss": 1.2147,
3083
+ "step": 4390
3084
+ },
3085
+ {
3086
+ "epoch": 0.9024484040507628,
3087
+ "grad_norm": 0.536491334438324,
3088
+ "learning_rate": 1.0297927461139897e-05,
3089
+ "loss": 1.2077,
3090
+ "step": 4400
3091
+ },
3092
+ {
3093
+ "epoch": 0.9044994231508781,
3094
+ "grad_norm": 0.5504462718963623,
3095
+ "learning_rate": 1.0082037996545769e-05,
3096
+ "loss": 1.1913,
3097
+ "step": 4410
3098
+ },
3099
+ {
3100
+ "epoch": 0.9065504422509935,
3101
+ "grad_norm": 0.5299994945526123,
3102
+ "learning_rate": 9.866148531951641e-06,
3103
+ "loss": 1.1987,
3104
+ "step": 4420
3105
+ },
3106
+ {
3107
+ "epoch": 0.9086014613511089,
3108
+ "grad_norm": 0.5432473421096802,
3109
+ "learning_rate": 9.650259067357513e-06,
3110
+ "loss": 1.199,
3111
+ "step": 4430
3112
+ },
3113
+ {
3114
+ "epoch": 0.9106524804512242,
3115
+ "grad_norm": 0.529331386089325,
3116
+ "learning_rate": 9.434369602763386e-06,
3117
+ "loss": 1.214,
3118
+ "step": 4440
3119
+ },
3120
+ {
3121
+ "epoch": 0.9127034995513396,
3122
+ "grad_norm": 0.49785298109054565,
3123
+ "learning_rate": 9.218480138169258e-06,
3124
+ "loss": 1.202,
3125
+ "step": 4450
3126
+ },
3127
+ {
3128
+ "epoch": 0.914754518651455,
3129
+ "grad_norm": 0.5281327962875366,
3130
+ "learning_rate": 9.00259067357513e-06,
3131
+ "loss": 1.1904,
3132
+ "step": 4460
3133
+ },
3134
+ {
3135
+ "epoch": 0.9168055377515704,
3136
+ "grad_norm": 0.5474033951759338,
3137
+ "learning_rate": 8.786701208981002e-06,
3138
+ "loss": 1.1972,
3139
+ "step": 4470
3140
+ },
3141
+ {
3142
+ "epoch": 0.9188565568516857,
3143
+ "grad_norm": 0.5412236452102661,
3144
+ "learning_rate": 8.570811744386873e-06,
3145
+ "loss": 1.1797,
3146
+ "step": 4480
3147
+ },
3148
+ {
3149
+ "epoch": 0.9209075759518011,
3150
+ "grad_norm": 0.5599170923233032,
3151
+ "learning_rate": 8.354922279792747e-06,
3152
+ "loss": 1.176,
3153
+ "step": 4490
3154
+ },
3155
+ {
3156
+ "epoch": 0.9229585950519165,
3157
+ "grad_norm": 0.5590323805809021,
3158
+ "learning_rate": 8.139032815198619e-06,
3159
+ "loss": 1.1863,
3160
+ "step": 4500
3161
+ },
3162
+ {
3163
+ "epoch": 0.9250096141520318,
3164
+ "grad_norm": 0.566150426864624,
3165
+ "learning_rate": 7.92314335060449e-06,
3166
+ "loss": 1.2217,
3167
+ "step": 4510
3168
+ },
3169
+ {
3170
+ "epoch": 0.9270606332521472,
3171
+ "grad_norm": 0.5459644794464111,
3172
+ "learning_rate": 7.707253886010362e-06,
3173
+ "loss": 1.1903,
3174
+ "step": 4520
3175
+ },
3176
+ {
3177
+ "epoch": 0.9291116523522626,
3178
+ "grad_norm": 0.5333088636398315,
3179
+ "learning_rate": 7.491364421416235e-06,
3180
+ "loss": 1.2076,
3181
+ "step": 4530
3182
+ },
3183
+ {
3184
+ "epoch": 0.9311626714523779,
3185
+ "grad_norm": 0.5921478271484375,
3186
+ "learning_rate": 7.2754749568221076e-06,
3187
+ "loss": 1.191,
3188
+ "step": 4540
3189
+ },
3190
+ {
3191
+ "epoch": 0.9332136905524933,
3192
+ "grad_norm": 0.5061055421829224,
3193
+ "learning_rate": 7.059585492227979e-06,
3194
+ "loss": 1.1787,
3195
+ "step": 4550
3196
+ },
3197
+ {
3198
+ "epoch": 0.9352647096526087,
3199
+ "grad_norm": 0.5804794430732727,
3200
+ "learning_rate": 6.843696027633852e-06,
3201
+ "loss": 1.2096,
3202
+ "step": 4560
3203
+ },
3204
+ {
3205
+ "epoch": 0.9373157287527241,
3206
+ "grad_norm": 0.5328559875488281,
3207
+ "learning_rate": 6.627806563039724e-06,
3208
+ "loss": 1.2072,
3209
+ "step": 4570
3210
+ },
3211
+ {
3212
+ "epoch": 0.9393667478528394,
3213
+ "grad_norm": 0.518925130367279,
3214
+ "learning_rate": 6.4119170984455965e-06,
3215
+ "loss": 1.2119,
3216
+ "step": 4580
3217
+ },
3218
+ {
3219
+ "epoch": 0.9414177669529548,
3220
+ "grad_norm": 0.5092957019805908,
3221
+ "learning_rate": 6.196027633851468e-06,
3222
+ "loss": 1.2137,
3223
+ "step": 4590
3224
+ },
3225
+ {
3226
+ "epoch": 0.9434687860530702,
3227
+ "grad_norm": 0.5156581401824951,
3228
+ "learning_rate": 5.980138169257341e-06,
3229
+ "loss": 1.2059,
3230
+ "step": 4600
3231
+ }
3232
+ ],
3233
+ "logging_steps": 10,
3234
+ "max_steps": 4876,
3235
+ "num_input_tokens_seen": 0,
3236
+ "num_train_epochs": 1,
3237
+ "save_steps": 200,
3238
+ "stateful_callbacks": {
3239
+ "TrainerControl": {
3240
+ "args": {
3241
+ "should_epoch_stop": false,
3242
+ "should_evaluate": false,
3243
+ "should_log": false,
3244
+ "should_save": true,
3245
+ "should_training_stop": false
3246
+ },
3247
+ "attributes": {}
3248
+ }
3249
+ },
3250
+ "total_flos": 1.173859535124265e+17,
3251
+ "train_batch_size": 4,
3252
+ "trial_name": null,
3253
+ "trial_params": null
3254
+ }
lora_checkpoints/checkpoint-4600/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8f6520f47933838e96dca56ee883040325b73481aff07afcabf963674a84fe
3
+ size 5624
lora_checkpoints/checkpoint-4800/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: uaritm/gemma3_1b_med_qa_ru
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:uaritm/gemma3_1b_med_qa_ru
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.17.1
lora_checkpoints/checkpoint-4800/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "uaritm/gemma3_1b_med_qa_ru",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.05,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 16,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "v_proj",
29
+ "k_proj",
30
+ "o_proj",
31
+ "q_proj",
32
+ "gate_proj",
33
+ "down_proj",
34
+ "up_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
lora_checkpoints/checkpoint-4800/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:838cb10282d32f7525e548df00df831364d0a56716513f0f666bb587cf01bb04
3
+ size 52231312
lora_checkpoints/checkpoint-4800/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
lora_checkpoints/checkpoint-4800/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
lora_checkpoints/checkpoint-4800/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:541c17967fd1cb6da978fa5ce8a107f2b33a846b96278b868f133dabbd3e18c2
3
+ size 104671958
lora_checkpoints/checkpoint-4800/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c56340152e52ff384fdecd489c87b5947889c784ecc63c969b82f3f6c043c7b1
3
+ size 14244
lora_checkpoints/checkpoint-4800/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5da4ea0c1bcacc6c536f51e41f20fb1c9301dc84cb8e04333e56f06168b8cb83
3
+ size 988
lora_checkpoints/checkpoint-4800/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cd087ae287cc95f1935ac195b488fa11a3ff4d8db1628daeadd0359c3d8cd18
3
+ size 1064
lora_checkpoints/checkpoint-4800/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
lora_checkpoints/checkpoint-4800/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
lora_checkpoints/checkpoint-4800/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
lora_checkpoints/checkpoint-4800/trainer_state.json ADDED
@@ -0,0 +1,3394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9844891680553776,
6
+ "eval_steps": 500,
7
+ "global_step": 4800,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00205101910011537,
14
+ "grad_norm": 1.9277215003967285,
15
+ "learning_rate": 3.6885245901639347e-06,
16
+ "loss": 1.4306,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.00410203820023074,
21
+ "grad_norm": 0.3513035476207733,
22
+ "learning_rate": 7.78688524590164e-06,
23
+ "loss": 1.3524,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.006153057300346109,
28
+ "grad_norm": 0.3364648222923279,
29
+ "learning_rate": 1.1885245901639344e-05,
30
+ "loss": 1.3188,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.00820407640046148,
35
+ "grad_norm": 0.3382512927055359,
36
+ "learning_rate": 1.598360655737705e-05,
37
+ "loss": 1.3418,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.01025509550057685,
42
+ "grad_norm": 0.360334575176239,
43
+ "learning_rate": 2.0081967213114755e-05,
44
+ "loss": 1.3381,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.012306114600692218,
49
+ "grad_norm": 0.3408481180667877,
50
+ "learning_rate": 2.418032786885246e-05,
51
+ "loss": 1.3365,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.014357133700807588,
56
+ "grad_norm": 0.36211535334587097,
57
+ "learning_rate": 2.8278688524590162e-05,
58
+ "loss": 1.3314,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.01640815280092296,
63
+ "grad_norm": 0.38704580068588257,
64
+ "learning_rate": 3.237704918032787e-05,
65
+ "loss": 1.3108,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.018459171901038327,
70
+ "grad_norm": 0.44303640723228455,
71
+ "learning_rate": 3.6475409836065576e-05,
72
+ "loss": 1.3073,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.0205101910011537,
77
+ "grad_norm": 0.4073602557182312,
78
+ "learning_rate": 4.057377049180328e-05,
79
+ "loss": 1.2993,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.022561210101269068,
84
+ "grad_norm": 0.4478100538253784,
85
+ "learning_rate": 4.467213114754098e-05,
86
+ "loss": 1.3413,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.024612229201384436,
91
+ "grad_norm": 0.39146170020103455,
92
+ "learning_rate": 4.8770491803278687e-05,
93
+ "loss": 1.3168,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.026663248301499808,
98
+ "grad_norm": 0.3786431849002838,
99
+ "learning_rate": 5.28688524590164e-05,
100
+ "loss": 1.2774,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.028714267401615177,
105
+ "grad_norm": 0.4014948904514313,
106
+ "learning_rate": 5.69672131147541e-05,
107
+ "loss": 1.346,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.03076528650173055,
112
+ "grad_norm": 0.3987842798233032,
113
+ "learning_rate": 6.10655737704918e-05,
114
+ "loss": 1.2816,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.03281630560184592,
119
+ "grad_norm": 0.3897082507610321,
120
+ "learning_rate": 6.516393442622951e-05,
121
+ "loss": 1.3485,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.034867324701961286,
126
+ "grad_norm": 0.373279333114624,
127
+ "learning_rate": 6.926229508196722e-05,
128
+ "loss": 1.3185,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.036918343802076654,
133
+ "grad_norm": 0.3812575340270996,
134
+ "learning_rate": 7.336065573770491e-05,
135
+ "loss": 1.3394,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.03896936290219203,
140
+ "grad_norm": 0.35926997661590576,
141
+ "learning_rate": 7.745901639344263e-05,
142
+ "loss": 1.2821,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.0410203820023074,
147
+ "grad_norm": 0.3649434745311737,
148
+ "learning_rate": 8.155737704918032e-05,
149
+ "loss": 1.33,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.04307140110242277,
154
+ "grad_norm": 0.345662921667099,
155
+ "learning_rate": 8.565573770491803e-05,
156
+ "loss": 1.3107,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.045122420202538135,
161
+ "grad_norm": 0.37169769406318665,
162
+ "learning_rate": 8.975409836065574e-05,
163
+ "loss": 1.309,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.047173439302653504,
168
+ "grad_norm": 0.37920281291007996,
169
+ "learning_rate": 9.385245901639344e-05,
170
+ "loss": 1.3352,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.04922445840276887,
175
+ "grad_norm": 0.35772770643234253,
176
+ "learning_rate": 9.795081967213115e-05,
177
+ "loss": 1.2402,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.05127547750288425,
182
+ "grad_norm": 0.38790181279182434,
183
+ "learning_rate": 9.989205526770294e-05,
184
+ "loss": 1.326,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.053326496602999617,
189
+ "grad_norm": 0.3545536696910858,
190
+ "learning_rate": 9.967616580310882e-05,
191
+ "loss": 1.3173,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.055377515703114985,
196
+ "grad_norm": 0.3845142722129822,
197
+ "learning_rate": 9.946027633851469e-05,
198
+ "loss": 1.2949,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.057428534803230354,
203
+ "grad_norm": 0.38621339201927185,
204
+ "learning_rate": 9.924438687392055e-05,
205
+ "loss": 1.2773,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.05947955390334572,
210
+ "grad_norm": 0.38091301918029785,
211
+ "learning_rate": 9.902849740932643e-05,
212
+ "loss": 1.3282,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.0615305730034611,
217
+ "grad_norm": 0.37546730041503906,
218
+ "learning_rate": 9.88126079447323e-05,
219
+ "loss": 1.2862,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.06358159210357646,
224
+ "grad_norm": 0.3515011966228485,
225
+ "learning_rate": 9.859671848013817e-05,
226
+ "loss": 1.2937,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.06563261120369183,
231
+ "grad_norm": 0.3863738775253296,
232
+ "learning_rate": 9.838082901554406e-05,
233
+ "loss": 1.3056,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.06768363030380721,
238
+ "grad_norm": 0.36615240573883057,
239
+ "learning_rate": 9.816493955094992e-05,
240
+ "loss": 1.3062,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.06973464940392257,
245
+ "grad_norm": 0.37741243839263916,
246
+ "learning_rate": 9.794905008635579e-05,
247
+ "loss": 1.3094,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.07178566850403795,
252
+ "grad_norm": 0.38626739382743835,
253
+ "learning_rate": 9.773316062176167e-05,
254
+ "loss": 1.2947,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.07383668760415331,
259
+ "grad_norm": 0.38667401671409607,
260
+ "learning_rate": 9.751727115716753e-05,
261
+ "loss": 1.2976,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.07588770670426868,
266
+ "grad_norm": 0.36084800958633423,
267
+ "learning_rate": 9.730138169257342e-05,
268
+ "loss": 1.27,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.07793872580438406,
273
+ "grad_norm": 0.3754425346851349,
274
+ "learning_rate": 9.708549222797928e-05,
275
+ "loss": 1.3243,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.07998974490449942,
280
+ "grad_norm": 0.39857473969459534,
281
+ "learning_rate": 9.686960276338515e-05,
282
+ "loss": 1.3077,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.0820407640046148,
287
+ "grad_norm": 0.3919648230075836,
288
+ "learning_rate": 9.665371329879103e-05,
289
+ "loss": 1.2985,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.08409178310473016,
294
+ "grad_norm": 0.3675483465194702,
295
+ "learning_rate": 9.643782383419689e-05,
296
+ "loss": 1.2946,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.08614280220484553,
301
+ "grad_norm": 0.3898465633392334,
302
+ "learning_rate": 9.622193436960277e-05,
303
+ "loss": 1.333,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.08819382130496091,
308
+ "grad_norm": 0.3681259751319885,
309
+ "learning_rate": 9.600604490500864e-05,
310
+ "loss": 1.2968,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.09024484040507627,
315
+ "grad_norm": 0.36453816294670105,
316
+ "learning_rate": 9.57901554404145e-05,
317
+ "loss": 1.272,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.09229585950519165,
322
+ "grad_norm": 0.34828147292137146,
323
+ "learning_rate": 9.557426597582039e-05,
324
+ "loss": 1.3245,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.09434687860530701,
329
+ "grad_norm": 0.3570501208305359,
330
+ "learning_rate": 9.535837651122625e-05,
331
+ "loss": 1.313,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.09639789770542238,
336
+ "grad_norm": 0.36692506074905396,
337
+ "learning_rate": 9.514248704663213e-05,
338
+ "loss": 1.2915,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.09844891680553775,
343
+ "grad_norm": 0.39161381125450134,
344
+ "learning_rate": 9.4926597582038e-05,
345
+ "loss": 1.3101,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.10049993590565312,
350
+ "grad_norm": 0.3808858394622803,
351
+ "learning_rate": 9.471070811744387e-05,
352
+ "loss": 1.3099,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.1025509550057685,
357
+ "grad_norm": 0.3541582524776459,
358
+ "learning_rate": 9.449481865284975e-05,
359
+ "loss": 1.2772,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.10460197410588386,
364
+ "grad_norm": 0.379190593957901,
365
+ "learning_rate": 9.427892918825562e-05,
366
+ "loss": 1.2914,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.10665299320599923,
371
+ "grad_norm": 0.37727421522140503,
372
+ "learning_rate": 9.406303972366149e-05,
373
+ "loss": 1.2888,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1087040123061146,
378
+ "grad_norm": 0.3787306845188141,
379
+ "learning_rate": 9.384715025906737e-05,
380
+ "loss": 1.3049,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.11075503140622997,
385
+ "grad_norm": 0.3831459581851959,
386
+ "learning_rate": 9.363126079447323e-05,
387
+ "loss": 1.2631,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.11280605050634535,
392
+ "grad_norm": 0.37274929881095886,
393
+ "learning_rate": 9.34153713298791e-05,
394
+ "loss": 1.3313,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.11485706960646071,
399
+ "grad_norm": 0.3683277368545532,
400
+ "learning_rate": 9.319948186528498e-05,
401
+ "loss": 1.2528,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.11690808870657608,
406
+ "grad_norm": 0.39554840326309204,
407
+ "learning_rate": 9.298359240069085e-05,
408
+ "loss": 1.2737,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.11895910780669144,
413
+ "grad_norm": 0.39166760444641113,
414
+ "learning_rate": 9.276770293609673e-05,
415
+ "loss": 1.271,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.12101012690680682,
420
+ "grad_norm": 0.384085476398468,
421
+ "learning_rate": 9.255181347150259e-05,
422
+ "loss": 1.2921,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.1230611460069222,
427
+ "grad_norm": 0.3704201281070709,
428
+ "learning_rate": 9.233592400690847e-05,
429
+ "loss": 1.2776,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.12511216510703757,
434
+ "grad_norm": 0.3844301998615265,
435
+ "learning_rate": 9.212003454231434e-05,
436
+ "loss": 1.3067,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.12716318420715292,
441
+ "grad_norm": 0.3971571922302246,
442
+ "learning_rate": 9.190414507772022e-05,
443
+ "loss": 1.2792,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.1292142033072683,
448
+ "grad_norm": 0.40666353702545166,
449
+ "learning_rate": 9.168825561312608e-05,
450
+ "loss": 1.2964,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.13126522240738367,
455
+ "grad_norm": 0.38252532482147217,
456
+ "learning_rate": 9.147236614853195e-05,
457
+ "loss": 1.2815,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.13331624150749904,
462
+ "grad_norm": 0.37795621156692505,
463
+ "learning_rate": 9.125647668393783e-05,
464
+ "loss": 1.283,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.13536726060761442,
469
+ "grad_norm": 0.4035683572292328,
470
+ "learning_rate": 9.10405872193437e-05,
471
+ "loss": 1.288,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.13741827970772977,
476
+ "grad_norm": 0.410669207572937,
477
+ "learning_rate": 9.082469775474958e-05,
478
+ "loss": 1.2659,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.13946929880784514,
483
+ "grad_norm": 0.3809865713119507,
484
+ "learning_rate": 9.060880829015544e-05,
485
+ "loss": 1.3133,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.14152031790796052,
490
+ "grad_norm": 0.3748447597026825,
491
+ "learning_rate": 9.039291882556131e-05,
492
+ "loss": 1.2643,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.1435713370080759,
497
+ "grad_norm": 0.39292991161346436,
498
+ "learning_rate": 9.017702936096719e-05,
499
+ "loss": 1.2855,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.14562235610819127,
504
+ "grad_norm": 0.4399755001068115,
505
+ "learning_rate": 8.996113989637307e-05,
506
+ "loss": 1.286,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.14767337520830662,
511
+ "grad_norm": 0.42447429895401,
512
+ "learning_rate": 8.974525043177894e-05,
513
+ "loss": 1.2736,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.149724394308422,
518
+ "grad_norm": 0.37248438596725464,
519
+ "learning_rate": 8.95293609671848e-05,
520
+ "loss": 1.2652,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.15177541340853737,
525
+ "grad_norm": 0.39122238755226135,
526
+ "learning_rate": 8.931347150259068e-05,
527
+ "loss": 1.2814,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.15382643250865274,
532
+ "grad_norm": 0.3697800040245056,
533
+ "learning_rate": 8.909758203799655e-05,
534
+ "loss": 1.2462,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.15587745160876812,
539
+ "grad_norm": 0.3901929259300232,
540
+ "learning_rate": 8.888169257340241e-05,
541
+ "loss": 1.2742,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.15792847070888347,
546
+ "grad_norm": 0.3833727538585663,
547
+ "learning_rate": 8.86658031088083e-05,
548
+ "loss": 1.3015,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.15997948980899884,
553
+ "grad_norm": 0.4028802216053009,
554
+ "learning_rate": 8.844991364421416e-05,
555
+ "loss": 1.2631,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.16203050890911422,
560
+ "grad_norm": 0.39087918400764465,
561
+ "learning_rate": 8.823402417962004e-05,
562
+ "loss": 1.2993,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.1640815280092296,
567
+ "grad_norm": 0.39453235268592834,
568
+ "learning_rate": 8.801813471502591e-05,
569
+ "loss": 1.2544,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.16613254710934497,
574
+ "grad_norm": 0.42142602801322937,
575
+ "learning_rate": 8.780224525043178e-05,
576
+ "loss": 1.2676,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.16818356620946032,
581
+ "grad_norm": 0.36646899580955505,
582
+ "learning_rate": 8.758635578583767e-05,
583
+ "loss": 1.2765,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.1702345853095757,
588
+ "grad_norm": 0.4253019094467163,
589
+ "learning_rate": 8.737046632124353e-05,
590
+ "loss": 1.3003,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.17228560440969107,
595
+ "grad_norm": 0.41490674018859863,
596
+ "learning_rate": 8.715457685664939e-05,
597
+ "loss": 1.2731,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.17433662350980644,
602
+ "grad_norm": 0.405460387468338,
603
+ "learning_rate": 8.693868739205528e-05,
604
+ "loss": 1.2122,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.17638764260992182,
609
+ "grad_norm": 0.4028235375881195,
610
+ "learning_rate": 8.672279792746114e-05,
611
+ "loss": 1.3238,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.17843866171003717,
616
+ "grad_norm": 0.38994792103767395,
617
+ "learning_rate": 8.650690846286701e-05,
618
+ "loss": 1.2875,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.18048968081015254,
623
+ "grad_norm": 0.4099538326263428,
624
+ "learning_rate": 8.629101899827289e-05,
625
+ "loss": 1.2807,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.18254069991026792,
630
+ "grad_norm": 0.40470021963119507,
631
+ "learning_rate": 8.607512953367875e-05,
632
+ "loss": 1.2802,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.1845917190103833,
637
+ "grad_norm": 0.4066854417324066,
638
+ "learning_rate": 8.585924006908464e-05,
639
+ "loss": 1.2464,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.18664273811049864,
644
+ "grad_norm": 0.38739994168281555,
645
+ "learning_rate": 8.56433506044905e-05,
646
+ "loss": 1.2831,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.18869375721061402,
651
+ "grad_norm": 0.4257420301437378,
652
+ "learning_rate": 8.542746113989638e-05,
653
+ "loss": 1.2679,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.1907447763107294,
658
+ "grad_norm": 0.41571488976478577,
659
+ "learning_rate": 8.521157167530225e-05,
660
+ "loss": 1.2501,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.19279579541084477,
665
+ "grad_norm": 0.4178495407104492,
666
+ "learning_rate": 8.499568221070811e-05,
667
+ "loss": 1.2657,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.19484681451096014,
672
+ "grad_norm": 0.4083455801010132,
673
+ "learning_rate": 8.477979274611399e-05,
674
+ "loss": 1.2781,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.1968978336110755,
679
+ "grad_norm": 0.4067554175853729,
680
+ "learning_rate": 8.456390328151986e-05,
681
+ "loss": 1.2582,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.19894885271119087,
686
+ "grad_norm": 0.4067447781562805,
687
+ "learning_rate": 8.434801381692574e-05,
688
+ "loss": 1.2948,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.20099987181130624,
693
+ "grad_norm": 0.44283562898635864,
694
+ "learning_rate": 8.413212435233161e-05,
695
+ "loss": 1.3011,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.20305089091142162,
700
+ "grad_norm": 0.41568294167518616,
701
+ "learning_rate": 8.391623488773748e-05,
702
+ "loss": 1.2804,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.205101910011537,
707
+ "grad_norm": 0.4183642864227295,
708
+ "learning_rate": 8.370034542314335e-05,
709
+ "loss": 1.2228,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.20715292911165234,
714
+ "grad_norm": 0.4311917722225189,
715
+ "learning_rate": 8.348445595854923e-05,
716
+ "loss": 1.2714,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.20920394821176772,
721
+ "grad_norm": 0.41575828194618225,
722
+ "learning_rate": 8.32685664939551e-05,
723
+ "loss": 1.2783,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.2112549673118831,
728
+ "grad_norm": 0.3958878815174103,
729
+ "learning_rate": 8.305267702936098e-05,
730
+ "loss": 1.2558,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.21330598641199847,
735
+ "grad_norm": 0.43759557604789734,
736
+ "learning_rate": 8.283678756476684e-05,
737
+ "loss": 1.2557,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.21535700551211384,
742
+ "grad_norm": 0.41460636258125305,
743
+ "learning_rate": 8.262089810017271e-05,
744
+ "loss": 1.2851,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.2174080246122292,
749
+ "grad_norm": 0.4114689826965332,
750
+ "learning_rate": 8.240500863557859e-05,
751
+ "loss": 1.3076,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.21945904371234456,
756
+ "grad_norm": 0.42222094535827637,
757
+ "learning_rate": 8.218911917098446e-05,
758
+ "loss": 1.2263,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.22151006281245994,
763
+ "grad_norm": 0.4098639488220215,
764
+ "learning_rate": 8.197322970639033e-05,
765
+ "loss": 1.2779,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.22356108191257532,
770
+ "grad_norm": 0.4205043315887451,
771
+ "learning_rate": 8.175734024179621e-05,
772
+ "loss": 1.2177,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.2256121010126907,
777
+ "grad_norm": 0.4501648247241974,
778
+ "learning_rate": 8.154145077720208e-05,
779
+ "loss": 1.3227,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.22766312011280604,
784
+ "grad_norm": 0.41510599851608276,
785
+ "learning_rate": 8.132556131260795e-05,
786
+ "loss": 1.3177,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.22971413921292141,
791
+ "grad_norm": 0.41567444801330566,
792
+ "learning_rate": 8.110967184801383e-05,
793
+ "loss": 1.2506,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.2317651583130368,
798
+ "grad_norm": 0.4262779653072357,
799
+ "learning_rate": 8.089378238341969e-05,
800
+ "loss": 1.2506,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.23381617741315217,
805
+ "grad_norm": 0.4220465421676636,
806
+ "learning_rate": 8.067789291882558e-05,
807
+ "loss": 1.2514,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.23586719651326754,
812
+ "grad_norm": 0.4169275462627411,
813
+ "learning_rate": 8.046200345423144e-05,
814
+ "loss": 1.2693,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.2379182156133829,
819
+ "grad_norm": 0.43145328760147095,
820
+ "learning_rate": 8.02461139896373e-05,
821
+ "loss": 1.2394,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.23996923471349826,
826
+ "grad_norm": 0.42889878153800964,
827
+ "learning_rate": 8.003022452504319e-05,
828
+ "loss": 1.248,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.24202025381361364,
833
+ "grad_norm": 0.41731464862823486,
834
+ "learning_rate": 7.981433506044905e-05,
835
+ "loss": 1.2498,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.24407127291372901,
840
+ "grad_norm": 0.4326362609863281,
841
+ "learning_rate": 7.959844559585493e-05,
842
+ "loss": 1.265,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.2461222920138444,
847
+ "grad_norm": 0.4242352843284607,
848
+ "learning_rate": 7.93825561312608e-05,
849
+ "loss": 1.2672,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.24817331111395974,
854
+ "grad_norm": 0.4441153407096863,
855
+ "learning_rate": 7.916666666666666e-05,
856
+ "loss": 1.2944,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.25022433021407514,
861
+ "grad_norm": 0.40912818908691406,
862
+ "learning_rate": 7.895077720207255e-05,
863
+ "loss": 1.2702,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.2522753493141905,
868
+ "grad_norm": 0.44539037346839905,
869
+ "learning_rate": 7.873488773747841e-05,
870
+ "loss": 1.2228,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.25432636841430584,
875
+ "grad_norm": 0.4299303889274597,
876
+ "learning_rate": 7.851899827288429e-05,
877
+ "loss": 1.2328,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.25637738751442124,
882
+ "grad_norm": 0.4408973455429077,
883
+ "learning_rate": 7.830310880829016e-05,
884
+ "loss": 1.2358,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.2584284066145366,
889
+ "grad_norm": 0.4100968837738037,
890
+ "learning_rate": 7.808721934369602e-05,
891
+ "loss": 1.2458,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.260479425714652,
896
+ "grad_norm": 0.4401489198207855,
897
+ "learning_rate": 7.787132987910191e-05,
898
+ "loss": 1.2593,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.26253044481476734,
903
+ "grad_norm": 0.4514229893684387,
904
+ "learning_rate": 7.765544041450777e-05,
905
+ "loss": 1.2632,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.2645814639148827,
910
+ "grad_norm": 0.38684791326522827,
911
+ "learning_rate": 7.743955094991365e-05,
912
+ "loss": 1.2424,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.2666324830149981,
917
+ "grad_norm": 0.46148189902305603,
918
+ "learning_rate": 7.722366148531953e-05,
919
+ "loss": 1.2445,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.26868350211511344,
924
+ "grad_norm": 0.4319213628768921,
925
+ "learning_rate": 7.700777202072539e-05,
926
+ "loss": 1.2253,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.27073452121522884,
931
+ "grad_norm": 0.4195545017719269,
932
+ "learning_rate": 7.679188255613126e-05,
933
+ "loss": 1.2578,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.2727855403153442,
938
+ "grad_norm": 0.43690159916877747,
939
+ "learning_rate": 7.657599309153714e-05,
940
+ "loss": 1.2573,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.27483655941545954,
945
+ "grad_norm": 0.44571492075920105,
946
+ "learning_rate": 7.636010362694301e-05,
947
+ "loss": 1.2607,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.27688757851557494,
952
+ "grad_norm": 0.43295958638191223,
953
+ "learning_rate": 7.614421416234889e-05,
954
+ "loss": 1.2278,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.2789385976156903,
959
+ "grad_norm": 0.44495707750320435,
960
+ "learning_rate": 7.592832469775475e-05,
961
+ "loss": 1.2798,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.2809896167158057,
966
+ "grad_norm": 0.4412330985069275,
967
+ "learning_rate": 7.571243523316062e-05,
968
+ "loss": 1.2501,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.28304063581592104,
973
+ "grad_norm": 0.44599953293800354,
974
+ "learning_rate": 7.54965457685665e-05,
975
+ "loss": 1.2396,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.2850916549160364,
980
+ "grad_norm": 0.447109580039978,
981
+ "learning_rate": 7.528065630397237e-05,
982
+ "loss": 1.2767,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.2871426740161518,
987
+ "grad_norm": 0.44506722688674927,
988
+ "learning_rate": 7.506476683937824e-05,
989
+ "loss": 1.2546,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.28919369311626714,
994
+ "grad_norm": 0.44061776995658875,
995
+ "learning_rate": 7.484887737478411e-05,
996
+ "loss": 1.2413,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.29124471221638254,
1001
+ "grad_norm": 0.45085111260414124,
1002
+ "learning_rate": 7.463298791018999e-05,
1003
+ "loss": 1.2483,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.2932957313164979,
1008
+ "grad_norm": 0.4437837600708008,
1009
+ "learning_rate": 7.441709844559586e-05,
1010
+ "loss": 1.252,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.29534675041661324,
1015
+ "grad_norm": 0.4294221103191376,
1016
+ "learning_rate": 7.420120898100174e-05,
1017
+ "loss": 1.2386,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.29739776951672864,
1022
+ "grad_norm": 0.4780830144882202,
1023
+ "learning_rate": 7.39853195164076e-05,
1024
+ "loss": 1.2639,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.299448788616844,
1029
+ "grad_norm": 0.44152942299842834,
1030
+ "learning_rate": 7.376943005181347e-05,
1031
+ "loss": 1.2756,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.3014998077169594,
1036
+ "grad_norm": 0.41989192366600037,
1037
+ "learning_rate": 7.355354058721935e-05,
1038
+ "loss": 1.2614,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.30355082681707474,
1043
+ "grad_norm": 0.5871754884719849,
1044
+ "learning_rate": 7.333765112262521e-05,
1045
+ "loss": 1.2615,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.3056018459171901,
1050
+ "grad_norm": 0.4467261731624603,
1051
+ "learning_rate": 7.31217616580311e-05,
1052
+ "loss": 1.2624,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.3076528650173055,
1057
+ "grad_norm": 0.49219033122062683,
1058
+ "learning_rate": 7.290587219343696e-05,
1059
+ "loss": 1.289,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.30970388411742084,
1064
+ "grad_norm": 0.4700734317302704,
1065
+ "learning_rate": 7.268998272884284e-05,
1066
+ "loss": 1.242,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.31175490321753624,
1071
+ "grad_norm": 0.4607170820236206,
1072
+ "learning_rate": 7.247409326424871e-05,
1073
+ "loss": 1.2554,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.3138059223176516,
1078
+ "grad_norm": 0.4335988759994507,
1079
+ "learning_rate": 7.225820379965457e-05,
1080
+ "loss": 1.2423,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.31585694141776693,
1085
+ "grad_norm": 0.4366897940635681,
1086
+ "learning_rate": 7.204231433506046e-05,
1087
+ "loss": 1.2219,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.31790796051788234,
1092
+ "grad_norm": 0.45856085419654846,
1093
+ "learning_rate": 7.182642487046632e-05,
1094
+ "loss": 1.2189,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.3199589796179977,
1099
+ "grad_norm": 0.4563063085079193,
1100
+ "learning_rate": 7.16105354058722e-05,
1101
+ "loss": 1.2696,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.3220099987181131,
1106
+ "grad_norm": 0.4276934862136841,
1107
+ "learning_rate": 7.139464594127807e-05,
1108
+ "loss": 1.2659,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.32406101781822844,
1113
+ "grad_norm": 0.46200886368751526,
1114
+ "learning_rate": 7.117875647668394e-05,
1115
+ "loss": 1.2261,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.3261120369183438,
1120
+ "grad_norm": 0.4863358736038208,
1121
+ "learning_rate": 7.096286701208982e-05,
1122
+ "loss": 1.2292,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.3281630560184592,
1127
+ "grad_norm": 0.4537160098552704,
1128
+ "learning_rate": 7.074697754749569e-05,
1129
+ "loss": 1.2453,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.33021407511857453,
1134
+ "grad_norm": 0.4507627487182617,
1135
+ "learning_rate": 7.053108808290155e-05,
1136
+ "loss": 1.2081,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.33226509421868994,
1141
+ "grad_norm": 0.43197301030158997,
1142
+ "learning_rate": 7.031519861830744e-05,
1143
+ "loss": 1.2757,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.3343161133188053,
1148
+ "grad_norm": 0.4551820456981659,
1149
+ "learning_rate": 7.00993091537133e-05,
1150
+ "loss": 1.2751,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.33636713241892063,
1155
+ "grad_norm": 0.45099398493766785,
1156
+ "learning_rate": 6.988341968911917e-05,
1157
+ "loss": 1.2583,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.33841815151903604,
1162
+ "grad_norm": 0.46787434816360474,
1163
+ "learning_rate": 6.966753022452505e-05,
1164
+ "loss": 1.2448,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.3404691706191514,
1169
+ "grad_norm": 0.45500054955482483,
1170
+ "learning_rate": 6.945164075993091e-05,
1171
+ "loss": 1.2394,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.3425201897192668,
1176
+ "grad_norm": 0.4682730436325073,
1177
+ "learning_rate": 6.92357512953368e-05,
1178
+ "loss": 1.2287,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.34457120881938214,
1183
+ "grad_norm": 0.4615074396133423,
1184
+ "learning_rate": 6.901986183074266e-05,
1185
+ "loss": 1.2042,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.3466222279194975,
1190
+ "grad_norm": 0.4548027217388153,
1191
+ "learning_rate": 6.880397236614854e-05,
1192
+ "loss": 1.2671,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.3486732470196129,
1197
+ "grad_norm": 0.4783169627189636,
1198
+ "learning_rate": 6.858808290155441e-05,
1199
+ "loss": 1.2533,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.35072426611972823,
1204
+ "grad_norm": 0.46452414989471436,
1205
+ "learning_rate": 6.837219343696027e-05,
1206
+ "loss": 1.2681,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.35277528521984364,
1211
+ "grad_norm": 0.4663463532924652,
1212
+ "learning_rate": 6.815630397236615e-05,
1213
+ "loss": 1.2561,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.354826304319959,
1218
+ "grad_norm": 0.46744370460510254,
1219
+ "learning_rate": 6.794041450777202e-05,
1220
+ "loss": 1.2453,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.35687732342007433,
1225
+ "grad_norm": 0.471835732460022,
1226
+ "learning_rate": 6.77245250431779e-05,
1227
+ "loss": 1.2472,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.35892834252018974,
1232
+ "grad_norm": 0.4618450701236725,
1233
+ "learning_rate": 6.750863557858377e-05,
1234
+ "loss": 1.2547,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.3609793616203051,
1239
+ "grad_norm": 0.4651658833026886,
1240
+ "learning_rate": 6.729274611398963e-05,
1241
+ "loss": 1.2623,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.36303038072042043,
1246
+ "grad_norm": 0.46842116117477417,
1247
+ "learning_rate": 6.707685664939551e-05,
1248
+ "loss": 1.2391,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.36508139982053583,
1253
+ "grad_norm": 0.45604613423347473,
1254
+ "learning_rate": 6.686096718480138e-05,
1255
+ "loss": 1.2884,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.3671324189206512,
1260
+ "grad_norm": 0.4306802451610565,
1261
+ "learning_rate": 6.664507772020726e-05,
1262
+ "loss": 1.2252,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.3691834380207666,
1267
+ "grad_norm": 0.4549136757850647,
1268
+ "learning_rate": 6.642918825561312e-05,
1269
+ "loss": 1.2496,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.37123445712088193,
1274
+ "grad_norm": 0.47443437576293945,
1275
+ "learning_rate": 6.6213298791019e-05,
1276
+ "loss": 1.2655,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.3732854762209973,
1281
+ "grad_norm": 0.46772050857543945,
1282
+ "learning_rate": 6.599740932642487e-05,
1283
+ "loss": 1.2366,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.3753364953211127,
1288
+ "grad_norm": 0.4691794216632843,
1289
+ "learning_rate": 6.578151986183075e-05,
1290
+ "loss": 1.2152,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.37738751442122803,
1295
+ "grad_norm": 0.43691304326057434,
1296
+ "learning_rate": 6.556563039723662e-05,
1297
+ "loss": 1.2511,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.37943853352134344,
1302
+ "grad_norm": 0.4595348536968231,
1303
+ "learning_rate": 6.534974093264248e-05,
1304
+ "loss": 1.2635,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.3814895526214588,
1309
+ "grad_norm": 0.44760558009147644,
1310
+ "learning_rate": 6.513385146804836e-05,
1311
+ "loss": 1.2342,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.38354057172157413,
1316
+ "grad_norm": 0.4559841454029083,
1317
+ "learning_rate": 6.491796200345423e-05,
1318
+ "loss": 1.2432,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.38559159082168953,
1323
+ "grad_norm": 0.4497215449810028,
1324
+ "learning_rate": 6.470207253886011e-05,
1325
+ "loss": 1.2267,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.3876426099218049,
1330
+ "grad_norm": 0.4863613247871399,
1331
+ "learning_rate": 6.448618307426598e-05,
1332
+ "loss": 1.254,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.3896936290219203,
1337
+ "grad_norm": 0.4500603675842285,
1338
+ "learning_rate": 6.427029360967185e-05,
1339
+ "loss": 1.2214,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.39174464812203563,
1344
+ "grad_norm": 0.4400598704814911,
1345
+ "learning_rate": 6.405440414507774e-05,
1346
+ "loss": 1.2352,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.393795667222151,
1351
+ "grad_norm": 0.46070367097854614,
1352
+ "learning_rate": 6.38385146804836e-05,
1353
+ "loss": 1.2468,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.3958466863222664,
1358
+ "grad_norm": 0.44312766194343567,
1359
+ "learning_rate": 6.362262521588946e-05,
1360
+ "loss": 1.1923,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.39789770542238173,
1365
+ "grad_norm": 0.5013573169708252,
1366
+ "learning_rate": 6.340673575129535e-05,
1367
+ "loss": 1.2361,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.39994872452249713,
1372
+ "grad_norm": 0.4884537160396576,
1373
+ "learning_rate": 6.319084628670121e-05,
1374
+ "loss": 1.2434,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.4019997436226125,
1379
+ "grad_norm": 0.46138620376586914,
1380
+ "learning_rate": 6.297495682210708e-05,
1381
+ "loss": 1.257,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.40405076272272783,
1386
+ "grad_norm": 0.4941729009151459,
1387
+ "learning_rate": 6.275906735751296e-05,
1388
+ "loss": 1.2347,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.40610178182284323,
1393
+ "grad_norm": 0.4675595760345459,
1394
+ "learning_rate": 6.254317789291882e-05,
1395
+ "loss": 1.2353,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.4081528009229586,
1400
+ "grad_norm": 0.47944632172584534,
1401
+ "learning_rate": 6.232728842832471e-05,
1402
+ "loss": 1.2643,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.410203820023074,
1407
+ "grad_norm": 0.4476461112499237,
1408
+ "learning_rate": 6.211139896373057e-05,
1409
+ "loss": 1.2558,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.41225483912318933,
1414
+ "grad_norm": 0.4706653654575348,
1415
+ "learning_rate": 6.189550949913645e-05,
1416
+ "loss": 1.227,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.4143058582233047,
1421
+ "grad_norm": 0.48062801361083984,
1422
+ "learning_rate": 6.167962003454232e-05,
1423
+ "loss": 1.2273,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.4163568773234201,
1428
+ "grad_norm": 0.46771204471588135,
1429
+ "learning_rate": 6.146373056994818e-05,
1430
+ "loss": 1.2268,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.41840789642353543,
1435
+ "grad_norm": 0.4725424647331238,
1436
+ "learning_rate": 6.124784110535406e-05,
1437
+ "loss": 1.2009,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.42045891552365083,
1442
+ "grad_norm": 0.47520384192466736,
1443
+ "learning_rate": 6.1031951640759934e-05,
1444
+ "loss": 1.2511,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.4225099346237662,
1449
+ "grad_norm": 0.44635480642318726,
1450
+ "learning_rate": 6.081606217616581e-05,
1451
+ "loss": 1.21,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.42456095372388153,
1456
+ "grad_norm": 0.47436651587486267,
1457
+ "learning_rate": 6.060017271157168e-05,
1458
+ "loss": 1.2116,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.42661197282399693,
1463
+ "grad_norm": 0.5115741491317749,
1464
+ "learning_rate": 6.0384283246977546e-05,
1465
+ "loss": 1.2778,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.4286629919241123,
1470
+ "grad_norm": 0.4488040506839752,
1471
+ "learning_rate": 6.016839378238343e-05,
1472
+ "loss": 1.2242,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.4307140110242277,
1477
+ "grad_norm": 0.4834796190261841,
1478
+ "learning_rate": 5.9952504317789296e-05,
1479
+ "loss": 1.2357,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.43276503012434303,
1484
+ "grad_norm": 0.45478227734565735,
1485
+ "learning_rate": 5.973661485319517e-05,
1486
+ "loss": 1.2233,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.4348160492244584,
1491
+ "grad_norm": 0.4539099633693695,
1492
+ "learning_rate": 5.952072538860104e-05,
1493
+ "loss": 1.2527,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.4368670683245738,
1498
+ "grad_norm": 0.47722533345222473,
1499
+ "learning_rate": 5.930483592400691e-05,
1500
+ "loss": 1.2015,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.43891808742468913,
1505
+ "grad_norm": 0.472023069858551,
1506
+ "learning_rate": 5.908894645941278e-05,
1507
+ "loss": 1.2222,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.44096910652480453,
1512
+ "grad_norm": 0.4648214876651764,
1513
+ "learning_rate": 5.887305699481865e-05,
1514
+ "loss": 1.2112,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.4430201256249199,
1519
+ "grad_norm": 0.48654377460479736,
1520
+ "learning_rate": 5.8657167530224534e-05,
1521
+ "loss": 1.227,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.44507114472503523,
1526
+ "grad_norm": 0.4997814893722534,
1527
+ "learning_rate": 5.84412780656304e-05,
1528
+ "loss": 1.2721,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.44712216382515063,
1533
+ "grad_norm": 0.47997352480888367,
1534
+ "learning_rate": 5.822538860103627e-05,
1535
+ "loss": 1.2018,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.449173182925266,
1540
+ "grad_norm": 0.4899247884750366,
1541
+ "learning_rate": 5.8009499136442146e-05,
1542
+ "loss": 1.2599,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.4512242020253814,
1547
+ "grad_norm": 0.4752749800682068,
1548
+ "learning_rate": 5.7793609671848014e-05,
1549
+ "loss": 1.2171,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.45327522112549673,
1554
+ "grad_norm": 0.4801314175128937,
1555
+ "learning_rate": 5.7577720207253896e-05,
1556
+ "loss": 1.2234,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.4553262402256121,
1561
+ "grad_norm": 0.4591893255710602,
1562
+ "learning_rate": 5.7361830742659764e-05,
1563
+ "loss": 1.2242,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.4573772593257275,
1568
+ "grad_norm": 0.46896713972091675,
1569
+ "learning_rate": 5.7145941278065626e-05,
1570
+ "loss": 1.2117,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.45942827842584283,
1575
+ "grad_norm": 0.4853857755661011,
1576
+ "learning_rate": 5.693005181347151e-05,
1577
+ "loss": 1.2218,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.46147929752595823,
1582
+ "grad_norm": 0.4648151993751526,
1583
+ "learning_rate": 5.6714162348877376e-05,
1584
+ "loss": 1.2401,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.4635303166260736,
1589
+ "grad_norm": 0.4839739501476288,
1590
+ "learning_rate": 5.649827288428325e-05,
1591
+ "loss": 1.1976,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.4655813357261889,
1596
+ "grad_norm": 0.4986715018749237,
1597
+ "learning_rate": 5.628238341968912e-05,
1598
+ "loss": 1.2274,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.46763235482630433,
1603
+ "grad_norm": 0.4636840522289276,
1604
+ "learning_rate": 5.606649395509499e-05,
1605
+ "loss": 1.236,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.4696833739264197,
1610
+ "grad_norm": 0.5011271834373474,
1611
+ "learning_rate": 5.585060449050087e-05,
1612
+ "loss": 1.2275,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.4717343930265351,
1617
+ "grad_norm": 0.4648337662220001,
1618
+ "learning_rate": 5.563471502590674e-05,
1619
+ "loss": 1.2457,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.47378541212665043,
1624
+ "grad_norm": 0.47708699107170105,
1625
+ "learning_rate": 5.5418825561312614e-05,
1626
+ "loss": 1.2316,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.4758364312267658,
1631
+ "grad_norm": 0.4954835772514343,
1632
+ "learning_rate": 5.520293609671848e-05,
1633
+ "loss": 1.229,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.4778874503268812,
1638
+ "grad_norm": 0.4701727330684662,
1639
+ "learning_rate": 5.498704663212435e-05,
1640
+ "loss": 1.248,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.47993846942699653,
1645
+ "grad_norm": 0.4796009957790375,
1646
+ "learning_rate": 5.477115716753023e-05,
1647
+ "loss": 1.2248,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.48198948852711193,
1652
+ "grad_norm": 0.4906330406665802,
1653
+ "learning_rate": 5.4555267702936094e-05,
1654
+ "loss": 1.2628,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.4840405076272273,
1659
+ "grad_norm": 0.47203144431114197,
1660
+ "learning_rate": 5.4339378238341976e-05,
1661
+ "loss": 1.2067,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.4860915267273426,
1666
+ "grad_norm": 0.503813624382019,
1667
+ "learning_rate": 5.4123488773747845e-05,
1668
+ "loss": 1.2006,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.48814254582745803,
1673
+ "grad_norm": 0.4918235242366791,
1674
+ "learning_rate": 5.390759930915371e-05,
1675
+ "loss": 1.1887,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.4901935649275734,
1680
+ "grad_norm": 0.4799112379550934,
1681
+ "learning_rate": 5.369170984455959e-05,
1682
+ "loss": 1.2079,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.4922445840276888,
1687
+ "grad_norm": 0.4769650101661682,
1688
+ "learning_rate": 5.347582037996546e-05,
1689
+ "loss": 1.1945,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.49429560312780413,
1694
+ "grad_norm": 0.5079638957977295,
1695
+ "learning_rate": 5.325993091537134e-05,
1696
+ "loss": 1.2294,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.4963466222279195,
1701
+ "grad_norm": 0.520418643951416,
1702
+ "learning_rate": 5.304404145077721e-05,
1703
+ "loss": 1.2308,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.4983976413280349,
1708
+ "grad_norm": 0.4546453058719635,
1709
+ "learning_rate": 5.2828151986183075e-05,
1710
+ "loss": 1.2206,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.5004486604281503,
1715
+ "grad_norm": 0.47760534286499023,
1716
+ "learning_rate": 5.261226252158895e-05,
1717
+ "loss": 1.208,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.5024996795282656,
1722
+ "grad_norm": 0.5267066955566406,
1723
+ "learning_rate": 5.239637305699482e-05,
1724
+ "loss": 1.2123,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.504550698628381,
1729
+ "grad_norm": 0.45763811469078064,
1730
+ "learning_rate": 5.2180483592400694e-05,
1731
+ "loss": 1.2159,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.5066017177284964,
1736
+ "grad_norm": 0.4922376871109009,
1737
+ "learning_rate": 5.196459412780656e-05,
1738
+ "loss": 1.2456,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.5086527368286117,
1743
+ "grad_norm": 0.47043368220329285,
1744
+ "learning_rate": 5.174870466321243e-05,
1745
+ "loss": 1.2052,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.5107037559287271,
1750
+ "grad_norm": 0.5082889795303345,
1751
+ "learning_rate": 5.153281519861831e-05,
1752
+ "loss": 1.2393,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.5127547750288425,
1757
+ "grad_norm": 0.4955206513404846,
1758
+ "learning_rate": 5.131692573402418e-05,
1759
+ "loss": 1.2323,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.5148057941289578,
1764
+ "grad_norm": 0.48625460267066956,
1765
+ "learning_rate": 5.1101036269430057e-05,
1766
+ "loss": 1.206,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.5168568132290732,
1771
+ "grad_norm": 0.49060237407684326,
1772
+ "learning_rate": 5.0885146804835925e-05,
1773
+ "loss": 1.2353,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.5189078323291886,
1778
+ "grad_norm": 0.46809640526771545,
1779
+ "learning_rate": 5.0669257340241793e-05,
1780
+ "loss": 1.2287,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.520958851429304,
1785
+ "grad_norm": 0.4944596290588379,
1786
+ "learning_rate": 5.0453367875647675e-05,
1787
+ "loss": 1.2413,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.5230098705294193,
1792
+ "grad_norm": 0.46914994716644287,
1793
+ "learning_rate": 5.023747841105354e-05,
1794
+ "loss": 1.22,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.5250608896295347,
1799
+ "grad_norm": 0.4888727366924286,
1800
+ "learning_rate": 5.002158894645942e-05,
1801
+ "loss": 1.2343,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.5271119087296501,
1806
+ "grad_norm": 0.4785778522491455,
1807
+ "learning_rate": 4.980569948186529e-05,
1808
+ "loss": 1.187,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.5291629278297654,
1813
+ "grad_norm": 0.4947550594806671,
1814
+ "learning_rate": 4.958981001727116e-05,
1815
+ "loss": 1.2288,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.5312139469298808,
1820
+ "grad_norm": 0.5263291597366333,
1821
+ "learning_rate": 4.937392055267703e-05,
1822
+ "loss": 1.2044,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.5332649660299962,
1827
+ "grad_norm": 0.49239382147789,
1828
+ "learning_rate": 4.9158031088082906e-05,
1829
+ "loss": 1.1865,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.5353159851301115,
1834
+ "grad_norm": 0.48874983191490173,
1835
+ "learning_rate": 4.8942141623488775e-05,
1836
+ "loss": 1.2672,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.5373670042302269,
1841
+ "grad_norm": 0.48474863171577454,
1842
+ "learning_rate": 4.872625215889465e-05,
1843
+ "loss": 1.2359,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.5394180233303423,
1848
+ "grad_norm": 0.4978977143764496,
1849
+ "learning_rate": 4.851036269430052e-05,
1850
+ "loss": 1.2139,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.5414690424304577,
1855
+ "grad_norm": 0.5144924521446228,
1856
+ "learning_rate": 4.829447322970639e-05,
1857
+ "loss": 1.221,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.543520061530573,
1862
+ "grad_norm": 0.5082759857177734,
1863
+ "learning_rate": 4.807858376511227e-05,
1864
+ "loss": 1.2209,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.5455710806306884,
1869
+ "grad_norm": 0.4933965504169464,
1870
+ "learning_rate": 4.786269430051814e-05,
1871
+ "loss": 1.207,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.5476220997308038,
1876
+ "grad_norm": 0.49464166164398193,
1877
+ "learning_rate": 4.7646804835924005e-05,
1878
+ "loss": 1.2398,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.5496731188309191,
1883
+ "grad_norm": 0.49377110600471497,
1884
+ "learning_rate": 4.743091537132988e-05,
1885
+ "loss": 1.2451,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.5517241379310345,
1890
+ "grad_norm": 0.5111104846000671,
1891
+ "learning_rate": 4.7215025906735756e-05,
1892
+ "loss": 1.2197,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.5537751570311499,
1897
+ "grad_norm": 0.47716042399406433,
1898
+ "learning_rate": 4.699913644214163e-05,
1899
+ "loss": 1.1891,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.5558261761312652,
1904
+ "grad_norm": 0.5081655383110046,
1905
+ "learning_rate": 4.678324697754749e-05,
1906
+ "loss": 1.2507,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.5578771952313806,
1911
+ "grad_norm": 0.49036547541618347,
1912
+ "learning_rate": 4.656735751295337e-05,
1913
+ "loss": 1.1805,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.559928214331496,
1918
+ "grad_norm": 0.5139365792274475,
1919
+ "learning_rate": 4.635146804835924e-05,
1920
+ "loss": 1.2361,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.5619792334316114,
1925
+ "grad_norm": 0.5098669528961182,
1926
+ "learning_rate": 4.613557858376512e-05,
1927
+ "loss": 1.2409,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.5640302525317267,
1932
+ "grad_norm": 0.4786950349807739,
1933
+ "learning_rate": 4.5919689119170986e-05,
1934
+ "loss": 1.2067,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.5660812716318421,
1939
+ "grad_norm": 0.5063204169273376,
1940
+ "learning_rate": 4.5703799654576855e-05,
1941
+ "loss": 1.1942,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.5681322907319575,
1946
+ "grad_norm": 0.511663556098938,
1947
+ "learning_rate": 4.548791018998273e-05,
1948
+ "loss": 1.2017,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.5701833098320728,
1953
+ "grad_norm": 0.48765748739242554,
1954
+ "learning_rate": 4.5272020725388605e-05,
1955
+ "loss": 1.222,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.5722343289321882,
1960
+ "grad_norm": 0.49707624316215515,
1961
+ "learning_rate": 4.5056131260794474e-05,
1962
+ "loss": 1.2075,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.5742853480323036,
1967
+ "grad_norm": 0.5067517757415771,
1968
+ "learning_rate": 4.484024179620035e-05,
1969
+ "loss": 1.211,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.5763363671324189,
1974
+ "grad_norm": 0.4615229368209839,
1975
+ "learning_rate": 4.462435233160622e-05,
1976
+ "loss": 1.2303,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.5783873862325343,
1981
+ "grad_norm": 0.4948524236679077,
1982
+ "learning_rate": 4.440846286701209e-05,
1983
+ "loss": 1.2024,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.5804384053326497,
1988
+ "grad_norm": 0.5140314102172852,
1989
+ "learning_rate": 4.419257340241796e-05,
1990
+ "loss": 1.2217,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.5824894244327651,
1995
+ "grad_norm": 0.5108122825622559,
1996
+ "learning_rate": 4.3976683937823836e-05,
1997
+ "loss": 1.1838,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.5845404435328804,
2002
+ "grad_norm": 0.5021159052848816,
2003
+ "learning_rate": 4.376079447322971e-05,
2004
+ "loss": 1.2418,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.5865914626329958,
2009
+ "grad_norm": 0.5086933374404907,
2010
+ "learning_rate": 4.354490500863558e-05,
2011
+ "loss": 1.2321,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.5886424817331112,
2016
+ "grad_norm": 0.5083547830581665,
2017
+ "learning_rate": 4.332901554404145e-05,
2018
+ "loss": 1.2035,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.5906935008332265,
2023
+ "grad_norm": 0.4828626215457916,
2024
+ "learning_rate": 4.311312607944732e-05,
2025
+ "loss": 1.2302,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.5927445199333419,
2030
+ "grad_norm": 0.5140969157218933,
2031
+ "learning_rate": 4.28972366148532e-05,
2032
+ "loss": 1.2058,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.5947955390334573,
2037
+ "grad_norm": 0.497364342212677,
2038
+ "learning_rate": 4.2681347150259074e-05,
2039
+ "loss": 1.2382,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.5968465581335726,
2044
+ "grad_norm": 0.49104997515678406,
2045
+ "learning_rate": 4.246545768566494e-05,
2046
+ "loss": 1.2322,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.598897577233688,
2051
+ "grad_norm": 0.521659255027771,
2052
+ "learning_rate": 4.224956822107081e-05,
2053
+ "loss": 1.1868,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.6009485963338034,
2058
+ "grad_norm": 0.5175550580024719,
2059
+ "learning_rate": 4.2033678756476686e-05,
2060
+ "loss": 1.2169,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.6029996154339188,
2065
+ "grad_norm": 0.4998300075531006,
2066
+ "learning_rate": 4.181778929188256e-05,
2067
+ "loss": 1.2227,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.6050506345340341,
2072
+ "grad_norm": 0.4932349622249603,
2073
+ "learning_rate": 4.160189982728843e-05,
2074
+ "loss": 1.2371,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.6071016536341495,
2079
+ "grad_norm": 0.5610498189926147,
2080
+ "learning_rate": 4.1386010362694304e-05,
2081
+ "loss": 1.2105,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.6091526727342649,
2086
+ "grad_norm": 0.4975990355014801,
2087
+ "learning_rate": 4.117012089810017e-05,
2088
+ "loss": 1.2511,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.6112036918343802,
2093
+ "grad_norm": 0.5154693722724915,
2094
+ "learning_rate": 4.095423143350605e-05,
2095
+ "loss": 1.2399,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.6132547109344956,
2100
+ "grad_norm": 0.4968002736568451,
2101
+ "learning_rate": 4.0738341968911916e-05,
2102
+ "loss": 1.2041,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.615305730034611,
2107
+ "grad_norm": 0.4866868555545807,
2108
+ "learning_rate": 4.052245250431779e-05,
2109
+ "loss": 1.1965,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.6173567491347263,
2114
+ "grad_norm": 0.5152925848960876,
2115
+ "learning_rate": 4.030656303972367e-05,
2116
+ "loss": 1.2298,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.6194077682348417,
2121
+ "grad_norm": 0.513058602809906,
2122
+ "learning_rate": 4.0090673575129535e-05,
2123
+ "loss": 1.2414,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.6214587873349571,
2128
+ "grad_norm": 0.5031930208206177,
2129
+ "learning_rate": 3.987478411053541e-05,
2130
+ "loss": 1.1766,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.6235098064350725,
2135
+ "grad_norm": 0.5087730288505554,
2136
+ "learning_rate": 3.965889464594128e-05,
2137
+ "loss": 1.229,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.6255608255351878,
2142
+ "grad_norm": 0.4878797233104706,
2143
+ "learning_rate": 3.9443005181347154e-05,
2144
+ "loss": 1.2018,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.6276118446353032,
2149
+ "grad_norm": 0.5124858617782593,
2150
+ "learning_rate": 3.922711571675303e-05,
2151
+ "loss": 1.1848,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.6296628637354186,
2156
+ "grad_norm": 0.49720969796180725,
2157
+ "learning_rate": 3.90112262521589e-05,
2158
+ "loss": 1.1892,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.6317138828355339,
2163
+ "grad_norm": 0.49900123476982117,
2164
+ "learning_rate": 3.8795336787564766e-05,
2165
+ "loss": 1.2027,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.6337649019356493,
2170
+ "grad_norm": 0.5007952451705933,
2171
+ "learning_rate": 3.857944732297064e-05,
2172
+ "loss": 1.2373,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.6358159210357647,
2177
+ "grad_norm": 0.49481576681137085,
2178
+ "learning_rate": 3.8363557858376516e-05,
2179
+ "loss": 1.2294,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.63786694013588,
2184
+ "grad_norm": 0.4979318082332611,
2185
+ "learning_rate": 3.8147668393782385e-05,
2186
+ "loss": 1.2312,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.6399179592359954,
2191
+ "grad_norm": 0.49939480423927307,
2192
+ "learning_rate": 3.793177892918825e-05,
2193
+ "loss": 1.2394,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.6419689783361108,
2198
+ "grad_norm": 0.5186517834663391,
2199
+ "learning_rate": 3.771588946459413e-05,
2200
+ "loss": 1.199,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.6440199974362262,
2205
+ "grad_norm": 0.5386569499969482,
2206
+ "learning_rate": 3.7500000000000003e-05,
2207
+ "loss": 1.1801,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.6460710165363415,
2212
+ "grad_norm": 0.5134577751159668,
2213
+ "learning_rate": 3.728411053540587e-05,
2214
+ "loss": 1.2286,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.6481220356364569,
2219
+ "grad_norm": 0.5191785097122192,
2220
+ "learning_rate": 3.706822107081175e-05,
2221
+ "loss": 1.2068,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.6501730547365723,
2226
+ "grad_norm": 0.4857168197631836,
2227
+ "learning_rate": 3.6852331606217615e-05,
2228
+ "loss": 1.2116,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.6522240738366876,
2233
+ "grad_norm": 0.5283413529396057,
2234
+ "learning_rate": 3.663644214162349e-05,
2235
+ "loss": 1.1792,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.654275092936803,
2240
+ "grad_norm": 0.528938353061676,
2241
+ "learning_rate": 3.6420552677029366e-05,
2242
+ "loss": 1.1963,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.6563261120369184,
2247
+ "grad_norm": 0.5067134499549866,
2248
+ "learning_rate": 3.6204663212435234e-05,
2249
+ "loss": 1.2476,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 0.6583771311370337,
2254
+ "grad_norm": 0.4993511736392975,
2255
+ "learning_rate": 3.598877374784111e-05,
2256
+ "loss": 1.2273,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 0.6604281502371491,
2261
+ "grad_norm": 0.5275943279266357,
2262
+ "learning_rate": 3.577288428324698e-05,
2263
+ "loss": 1.2287,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 0.6624791693372645,
2268
+ "grad_norm": 0.49331194162368774,
2269
+ "learning_rate": 3.555699481865285e-05,
2270
+ "loss": 1.1794,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 0.6645301884373799,
2275
+ "grad_norm": 0.5065453052520752,
2276
+ "learning_rate": 3.534110535405872e-05,
2277
+ "loss": 1.2342,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 0.6665812075374952,
2282
+ "grad_norm": 0.5334459543228149,
2283
+ "learning_rate": 3.51252158894646e-05,
2284
+ "loss": 1.1782,
2285
+ "step": 3250
2286
+ },
2287
+ {
2288
+ "epoch": 0.6686322266376106,
2289
+ "grad_norm": 0.535772979259491,
2290
+ "learning_rate": 3.490932642487047e-05,
2291
+ "loss": 1.2108,
2292
+ "step": 3260
2293
+ },
2294
+ {
2295
+ "epoch": 0.670683245737726,
2296
+ "grad_norm": 0.5377807021141052,
2297
+ "learning_rate": 3.469343696027634e-05,
2298
+ "loss": 1.1903,
2299
+ "step": 3270
2300
+ },
2301
+ {
2302
+ "epoch": 0.6727342648378413,
2303
+ "grad_norm": 0.5266278386116028,
2304
+ "learning_rate": 3.447754749568221e-05,
2305
+ "loss": 1.2183,
2306
+ "step": 3280
2307
+ },
2308
+ {
2309
+ "epoch": 0.6747852839379567,
2310
+ "grad_norm": 0.4987232983112335,
2311
+ "learning_rate": 3.4261658031088084e-05,
2312
+ "loss": 1.1915,
2313
+ "step": 3290
2314
+ },
2315
+ {
2316
+ "epoch": 0.6768363030380721,
2317
+ "grad_norm": 0.5178554058074951,
2318
+ "learning_rate": 3.404576856649396e-05,
2319
+ "loss": 1.179,
2320
+ "step": 3300
2321
+ },
2322
+ {
2323
+ "epoch": 0.6788873221381874,
2324
+ "grad_norm": 0.5086014270782471,
2325
+ "learning_rate": 3.382987910189983e-05,
2326
+ "loss": 1.2298,
2327
+ "step": 3310
2328
+ },
2329
+ {
2330
+ "epoch": 0.6809383412383028,
2331
+ "grad_norm": 0.5420427918434143,
2332
+ "learning_rate": 3.3613989637305696e-05,
2333
+ "loss": 1.2072,
2334
+ "step": 3320
2335
+ },
2336
+ {
2337
+ "epoch": 0.6829893603384182,
2338
+ "grad_norm": 0.5170331001281738,
2339
+ "learning_rate": 3.339810017271157e-05,
2340
+ "loss": 1.2252,
2341
+ "step": 3330
2342
+ },
2343
+ {
2344
+ "epoch": 0.6850403794385336,
2345
+ "grad_norm": 0.48680609464645386,
2346
+ "learning_rate": 3.3182210708117446e-05,
2347
+ "loss": 1.2059,
2348
+ "step": 3340
2349
+ },
2350
+ {
2351
+ "epoch": 0.6870913985386489,
2352
+ "grad_norm": 0.5035340189933777,
2353
+ "learning_rate": 3.296632124352332e-05,
2354
+ "loss": 1.2009,
2355
+ "step": 3350
2356
+ },
2357
+ {
2358
+ "epoch": 0.6891424176387643,
2359
+ "grad_norm": 0.513165295124054,
2360
+ "learning_rate": 3.275043177892919e-05,
2361
+ "loss": 1.1844,
2362
+ "step": 3360
2363
+ },
2364
+ {
2365
+ "epoch": 0.6911934367388797,
2366
+ "grad_norm": 0.5243003368377686,
2367
+ "learning_rate": 3.2534542314335065e-05,
2368
+ "loss": 1.2009,
2369
+ "step": 3370
2370
+ },
2371
+ {
2372
+ "epoch": 0.693244455838995,
2373
+ "grad_norm": 0.5219825506210327,
2374
+ "learning_rate": 3.2318652849740933e-05,
2375
+ "loss": 1.2039,
2376
+ "step": 3380
2377
+ },
2378
+ {
2379
+ "epoch": 0.6952954749391104,
2380
+ "grad_norm": 0.5202507972717285,
2381
+ "learning_rate": 3.210276338514681e-05,
2382
+ "loss": 1.225,
2383
+ "step": 3390
2384
+ },
2385
+ {
2386
+ "epoch": 0.6973464940392258,
2387
+ "grad_norm": 0.5152229070663452,
2388
+ "learning_rate": 3.188687392055268e-05,
2389
+ "loss": 1.1886,
2390
+ "step": 3400
2391
+ },
2392
+ {
2393
+ "epoch": 0.6993975131393411,
2394
+ "grad_norm": 0.5382890701293945,
2395
+ "learning_rate": 3.167098445595855e-05,
2396
+ "loss": 1.2113,
2397
+ "step": 3410
2398
+ },
2399
+ {
2400
+ "epoch": 0.7014485322394565,
2401
+ "grad_norm": 0.5525237917900085,
2402
+ "learning_rate": 3.145509499136443e-05,
2403
+ "loss": 1.2283,
2404
+ "step": 3420
2405
+ },
2406
+ {
2407
+ "epoch": 0.7034995513395719,
2408
+ "grad_norm": 0.5308887958526611,
2409
+ "learning_rate": 3.1239205526770296e-05,
2410
+ "loss": 1.2311,
2411
+ "step": 3430
2412
+ },
2413
+ {
2414
+ "epoch": 0.7055505704396873,
2415
+ "grad_norm": 0.5247687697410583,
2416
+ "learning_rate": 3.1023316062176164e-05,
2417
+ "loss": 1.1946,
2418
+ "step": 3440
2419
+ },
2420
+ {
2421
+ "epoch": 0.7076015895398026,
2422
+ "grad_norm": 0.5322206616401672,
2423
+ "learning_rate": 3.080742659758204e-05,
2424
+ "loss": 1.2198,
2425
+ "step": 3450
2426
+ },
2427
+ {
2428
+ "epoch": 0.709652608639918,
2429
+ "grad_norm": 0.5104162693023682,
2430
+ "learning_rate": 3.0591537132987915e-05,
2431
+ "loss": 1.2105,
2432
+ "step": 3460
2433
+ },
2434
+ {
2435
+ "epoch": 0.7117036277400334,
2436
+ "grad_norm": 0.4890803098678589,
2437
+ "learning_rate": 3.0375647668393786e-05,
2438
+ "loss": 1.2074,
2439
+ "step": 3470
2440
+ },
2441
+ {
2442
+ "epoch": 0.7137546468401487,
2443
+ "grad_norm": 0.529225766658783,
2444
+ "learning_rate": 3.0159758203799655e-05,
2445
+ "loss": 1.2321,
2446
+ "step": 3480
2447
+ },
2448
+ {
2449
+ "epoch": 0.7158056659402641,
2450
+ "grad_norm": 0.5252069234848022,
2451
+ "learning_rate": 2.9943868739205527e-05,
2452
+ "loss": 1.1995,
2453
+ "step": 3490
2454
+ },
2455
+ {
2456
+ "epoch": 0.7178566850403795,
2457
+ "grad_norm": 0.5369967818260193,
2458
+ "learning_rate": 2.9727979274611402e-05,
2459
+ "loss": 1.2234,
2460
+ "step": 3500
2461
+ },
2462
+ {
2463
+ "epoch": 0.7199077041404948,
2464
+ "grad_norm": 0.5053485631942749,
2465
+ "learning_rate": 2.9512089810017274e-05,
2466
+ "loss": 1.2035,
2467
+ "step": 3510
2468
+ },
2469
+ {
2470
+ "epoch": 0.7219587232406102,
2471
+ "grad_norm": 0.5131696462631226,
2472
+ "learning_rate": 2.929620034542315e-05,
2473
+ "loss": 1.2681,
2474
+ "step": 3520
2475
+ },
2476
+ {
2477
+ "epoch": 0.7240097423407256,
2478
+ "grad_norm": 0.5332499742507935,
2479
+ "learning_rate": 2.9080310880829014e-05,
2480
+ "loss": 1.2039,
2481
+ "step": 3530
2482
+ },
2483
+ {
2484
+ "epoch": 0.7260607614408409,
2485
+ "grad_norm": 0.5105617046356201,
2486
+ "learning_rate": 2.886442141623489e-05,
2487
+ "loss": 1.2,
2488
+ "step": 3540
2489
+ },
2490
+ {
2491
+ "epoch": 0.7281117805409563,
2492
+ "grad_norm": 0.5197264552116394,
2493
+ "learning_rate": 2.864853195164076e-05,
2494
+ "loss": 1.1821,
2495
+ "step": 3550
2496
+ },
2497
+ {
2498
+ "epoch": 0.7301627996410717,
2499
+ "grad_norm": 0.505455493927002,
2500
+ "learning_rate": 2.8432642487046636e-05,
2501
+ "loss": 1.2158,
2502
+ "step": 3560
2503
+ },
2504
+ {
2505
+ "epoch": 0.7322138187411871,
2506
+ "grad_norm": 0.5290804505348206,
2507
+ "learning_rate": 2.8216753022452508e-05,
2508
+ "loss": 1.174,
2509
+ "step": 3570
2510
+ },
2511
+ {
2512
+ "epoch": 0.7342648378413024,
2513
+ "grad_norm": 0.5349313020706177,
2514
+ "learning_rate": 2.8000863557858376e-05,
2515
+ "loss": 1.2301,
2516
+ "step": 3580
2517
+ },
2518
+ {
2519
+ "epoch": 0.7363158569414178,
2520
+ "grad_norm": 0.4875812530517578,
2521
+ "learning_rate": 2.7784974093264248e-05,
2522
+ "loss": 1.2015,
2523
+ "step": 3590
2524
+ },
2525
+ {
2526
+ "epoch": 0.7383668760415332,
2527
+ "grad_norm": 0.5164597630500793,
2528
+ "learning_rate": 2.7569084628670123e-05,
2529
+ "loss": 1.2294,
2530
+ "step": 3600
2531
+ },
2532
+ {
2533
+ "epoch": 0.7404178951416485,
2534
+ "grad_norm": 0.5129172801971436,
2535
+ "learning_rate": 2.7353195164075995e-05,
2536
+ "loss": 1.2122,
2537
+ "step": 3610
2538
+ },
2539
+ {
2540
+ "epoch": 0.7424689142417639,
2541
+ "grad_norm": 0.5218586921691895,
2542
+ "learning_rate": 2.713730569948187e-05,
2543
+ "loss": 1.2002,
2544
+ "step": 3620
2545
+ },
2546
+ {
2547
+ "epoch": 0.7445199333418793,
2548
+ "grad_norm": 0.5423296093940735,
2549
+ "learning_rate": 2.6921416234887735e-05,
2550
+ "loss": 1.1685,
2551
+ "step": 3630
2552
+ },
2553
+ {
2554
+ "epoch": 0.7465709524419946,
2555
+ "grad_norm": 0.5151218771934509,
2556
+ "learning_rate": 2.670552677029361e-05,
2557
+ "loss": 1.2167,
2558
+ "step": 3640
2559
+ },
2560
+ {
2561
+ "epoch": 0.74862197154211,
2562
+ "grad_norm": 0.5160235166549683,
2563
+ "learning_rate": 2.6489637305699482e-05,
2564
+ "loss": 1.2269,
2565
+ "step": 3650
2566
+ },
2567
+ {
2568
+ "epoch": 0.7506729906422254,
2569
+ "grad_norm": 0.5056514143943787,
2570
+ "learning_rate": 2.6273747841105357e-05,
2571
+ "loss": 1.2467,
2572
+ "step": 3660
2573
+ },
2574
+ {
2575
+ "epoch": 0.7527240097423408,
2576
+ "grad_norm": 0.52911776304245,
2577
+ "learning_rate": 2.605785837651123e-05,
2578
+ "loss": 1.2182,
2579
+ "step": 3670
2580
+ },
2581
+ {
2582
+ "epoch": 0.7547750288424561,
2583
+ "grad_norm": 0.5172019600868225,
2584
+ "learning_rate": 2.5841968911917097e-05,
2585
+ "loss": 1.1888,
2586
+ "step": 3680
2587
+ },
2588
+ {
2589
+ "epoch": 0.7568260479425715,
2590
+ "grad_norm": 0.5043123960494995,
2591
+ "learning_rate": 2.562607944732297e-05,
2592
+ "loss": 1.2004,
2593
+ "step": 3690
2594
+ },
2595
+ {
2596
+ "epoch": 0.7588770670426869,
2597
+ "grad_norm": 0.5103533267974854,
2598
+ "learning_rate": 2.5410189982728844e-05,
2599
+ "loss": 1.1627,
2600
+ "step": 3700
2601
+ },
2602
+ {
2603
+ "epoch": 0.7609280861428022,
2604
+ "grad_norm": 0.5295760631561279,
2605
+ "learning_rate": 2.5194300518134716e-05,
2606
+ "loss": 1.1604,
2607
+ "step": 3710
2608
+ },
2609
+ {
2610
+ "epoch": 0.7629791052429176,
2611
+ "grad_norm": 0.5427724719047546,
2612
+ "learning_rate": 2.4978411053540588e-05,
2613
+ "loss": 1.1781,
2614
+ "step": 3720
2615
+ },
2616
+ {
2617
+ "epoch": 0.765030124343033,
2618
+ "grad_norm": 0.5164818167686462,
2619
+ "learning_rate": 2.476252158894646e-05,
2620
+ "loss": 1.2208,
2621
+ "step": 3730
2622
+ },
2623
+ {
2624
+ "epoch": 0.7670811434431483,
2625
+ "grad_norm": 0.5196744799613953,
2626
+ "learning_rate": 2.4546632124352335e-05,
2627
+ "loss": 1.1971,
2628
+ "step": 3740
2629
+ },
2630
+ {
2631
+ "epoch": 0.7691321625432637,
2632
+ "grad_norm": 0.5128475427627563,
2633
+ "learning_rate": 2.4330742659758203e-05,
2634
+ "loss": 1.1909,
2635
+ "step": 3750
2636
+ },
2637
+ {
2638
+ "epoch": 0.7711831816433791,
2639
+ "grad_norm": 0.49743902683258057,
2640
+ "learning_rate": 2.411485319516408e-05,
2641
+ "loss": 1.2109,
2642
+ "step": 3760
2643
+ },
2644
+ {
2645
+ "epoch": 0.7732342007434945,
2646
+ "grad_norm": 0.5152381658554077,
2647
+ "learning_rate": 2.3898963730569947e-05,
2648
+ "loss": 1.2228,
2649
+ "step": 3770
2650
+ },
2651
+ {
2652
+ "epoch": 0.7752852198436098,
2653
+ "grad_norm": 0.5446299910545349,
2654
+ "learning_rate": 2.3683074265975822e-05,
2655
+ "loss": 1.1953,
2656
+ "step": 3780
2657
+ },
2658
+ {
2659
+ "epoch": 0.7773362389437252,
2660
+ "grad_norm": 0.5300847291946411,
2661
+ "learning_rate": 2.3467184801381694e-05,
2662
+ "loss": 1.1843,
2663
+ "step": 3790
2664
+ },
2665
+ {
2666
+ "epoch": 0.7793872580438406,
2667
+ "grad_norm": 0.5129801630973816,
2668
+ "learning_rate": 2.3251295336787566e-05,
2669
+ "loss": 1.1809,
2670
+ "step": 3800
2671
+ },
2672
+ {
2673
+ "epoch": 0.7814382771439559,
2674
+ "grad_norm": 0.549198567867279,
2675
+ "learning_rate": 2.3035405872193438e-05,
2676
+ "loss": 1.2099,
2677
+ "step": 3810
2678
+ },
2679
+ {
2680
+ "epoch": 0.7834892962440713,
2681
+ "grad_norm": 0.5118544101715088,
2682
+ "learning_rate": 2.281951640759931e-05,
2683
+ "loss": 1.2149,
2684
+ "step": 3820
2685
+ },
2686
+ {
2687
+ "epoch": 0.7855403153441867,
2688
+ "grad_norm": 0.5479713082313538,
2689
+ "learning_rate": 2.260362694300518e-05,
2690
+ "loss": 1.1771,
2691
+ "step": 3830
2692
+ },
2693
+ {
2694
+ "epoch": 0.787591334444302,
2695
+ "grad_norm": 0.541350245475769,
2696
+ "learning_rate": 2.2387737478411056e-05,
2697
+ "loss": 1.1737,
2698
+ "step": 3840
2699
+ },
2700
+ {
2701
+ "epoch": 0.7896423535444174,
2702
+ "grad_norm": 0.5543351769447327,
2703
+ "learning_rate": 2.2171848013816925e-05,
2704
+ "loss": 1.2233,
2705
+ "step": 3850
2706
+ },
2707
+ {
2708
+ "epoch": 0.7916933726445328,
2709
+ "grad_norm": 0.5010188817977905,
2710
+ "learning_rate": 2.19559585492228e-05,
2711
+ "loss": 1.1938,
2712
+ "step": 3860
2713
+ },
2714
+ {
2715
+ "epoch": 0.7937443917446482,
2716
+ "grad_norm": 0.5245205760002136,
2717
+ "learning_rate": 2.1740069084628672e-05,
2718
+ "loss": 1.2015,
2719
+ "step": 3870
2720
+ },
2721
+ {
2722
+ "epoch": 0.7957954108447635,
2723
+ "grad_norm": 0.5324139595031738,
2724
+ "learning_rate": 2.1524179620034544e-05,
2725
+ "loss": 1.2248,
2726
+ "step": 3880
2727
+ },
2728
+ {
2729
+ "epoch": 0.7978464299448789,
2730
+ "grad_norm": 0.5172831416130066,
2731
+ "learning_rate": 2.1308290155440415e-05,
2732
+ "loss": 1.1992,
2733
+ "step": 3890
2734
+ },
2735
+ {
2736
+ "epoch": 0.7998974490449943,
2737
+ "grad_norm": 0.5434138178825378,
2738
+ "learning_rate": 2.1092400690846287e-05,
2739
+ "loss": 1.1813,
2740
+ "step": 3900
2741
+ },
2742
+ {
2743
+ "epoch": 0.8019484681451096,
2744
+ "grad_norm": 0.5221844911575317,
2745
+ "learning_rate": 2.087651122625216e-05,
2746
+ "loss": 1.1625,
2747
+ "step": 3910
2748
+ },
2749
+ {
2750
+ "epoch": 0.803999487245225,
2751
+ "grad_norm": 0.5027469992637634,
2752
+ "learning_rate": 2.0660621761658034e-05,
2753
+ "loss": 1.181,
2754
+ "step": 3920
2755
+ },
2756
+ {
2757
+ "epoch": 0.8060505063453404,
2758
+ "grad_norm": 0.5298044085502625,
2759
+ "learning_rate": 2.0444732297063903e-05,
2760
+ "loss": 1.2079,
2761
+ "step": 3930
2762
+ },
2763
+ {
2764
+ "epoch": 0.8081015254454557,
2765
+ "grad_norm": 0.5463908910751343,
2766
+ "learning_rate": 2.0228842832469778e-05,
2767
+ "loss": 1.2009,
2768
+ "step": 3940
2769
+ },
2770
+ {
2771
+ "epoch": 0.8101525445455711,
2772
+ "grad_norm": 0.5394027233123779,
2773
+ "learning_rate": 2.0012953367875646e-05,
2774
+ "loss": 1.1931,
2775
+ "step": 3950
2776
+ },
2777
+ {
2778
+ "epoch": 0.8122035636456865,
2779
+ "grad_norm": 0.5041294097900391,
2780
+ "learning_rate": 1.979706390328152e-05,
2781
+ "loss": 1.2107,
2782
+ "step": 3960
2783
+ },
2784
+ {
2785
+ "epoch": 0.8142545827458019,
2786
+ "grad_norm": 0.5223291516304016,
2787
+ "learning_rate": 1.9581174438687393e-05,
2788
+ "loss": 1.1775,
2789
+ "step": 3970
2790
+ },
2791
+ {
2792
+ "epoch": 0.8163056018459172,
2793
+ "grad_norm": 0.5221052169799805,
2794
+ "learning_rate": 1.9365284974093265e-05,
2795
+ "loss": 1.2052,
2796
+ "step": 3980
2797
+ },
2798
+ {
2799
+ "epoch": 0.8183566209460326,
2800
+ "grad_norm": 0.5229529738426208,
2801
+ "learning_rate": 1.9149395509499137e-05,
2802
+ "loss": 1.1922,
2803
+ "step": 3990
2804
+ },
2805
+ {
2806
+ "epoch": 0.820407640046148,
2807
+ "grad_norm": 0.5651980042457581,
2808
+ "learning_rate": 1.893350604490501e-05,
2809
+ "loss": 1.2043,
2810
+ "step": 4000
2811
+ },
2812
+ {
2813
+ "epoch": 0.8224586591462633,
2814
+ "grad_norm": 0.5169751644134521,
2815
+ "learning_rate": 1.871761658031088e-05,
2816
+ "loss": 1.2157,
2817
+ "step": 4010
2818
+ },
2819
+ {
2820
+ "epoch": 0.8245096782463787,
2821
+ "grad_norm": 0.5741276144981384,
2822
+ "learning_rate": 1.8501727115716755e-05,
2823
+ "loss": 1.2112,
2824
+ "step": 4020
2825
+ },
2826
+ {
2827
+ "epoch": 0.8265606973464941,
2828
+ "grad_norm": 0.530596137046814,
2829
+ "learning_rate": 1.8285837651122624e-05,
2830
+ "loss": 1.2535,
2831
+ "step": 4030
2832
+ },
2833
+ {
2834
+ "epoch": 0.8286117164466094,
2835
+ "grad_norm": 0.5436383485794067,
2836
+ "learning_rate": 1.80699481865285e-05,
2837
+ "loss": 1.1789,
2838
+ "step": 4040
2839
+ },
2840
+ {
2841
+ "epoch": 0.8306627355467248,
2842
+ "grad_norm": 0.5238965749740601,
2843
+ "learning_rate": 1.7854058721934368e-05,
2844
+ "loss": 1.1645,
2845
+ "step": 4050
2846
+ },
2847
+ {
2848
+ "epoch": 0.8327137546468402,
2849
+ "grad_norm": 0.5226778388023376,
2850
+ "learning_rate": 1.7638169257340243e-05,
2851
+ "loss": 1.2238,
2852
+ "step": 4060
2853
+ },
2854
+ {
2855
+ "epoch": 0.8347647737469556,
2856
+ "grad_norm": 0.5810254812240601,
2857
+ "learning_rate": 1.7422279792746114e-05,
2858
+ "loss": 1.2212,
2859
+ "step": 4070
2860
+ },
2861
+ {
2862
+ "epoch": 0.8368157928470709,
2863
+ "grad_norm": 0.5228540301322937,
2864
+ "learning_rate": 1.7206390328151986e-05,
2865
+ "loss": 1.2025,
2866
+ "step": 4080
2867
+ },
2868
+ {
2869
+ "epoch": 0.8388668119471863,
2870
+ "grad_norm": 0.5112829804420471,
2871
+ "learning_rate": 1.6990500863557858e-05,
2872
+ "loss": 1.1838,
2873
+ "step": 4090
2874
+ },
2875
+ {
2876
+ "epoch": 0.8409178310473017,
2877
+ "grad_norm": 0.5092179775238037,
2878
+ "learning_rate": 1.6774611398963733e-05,
2879
+ "loss": 1.1981,
2880
+ "step": 4100
2881
+ },
2882
+ {
2883
+ "epoch": 0.842968850147417,
2884
+ "grad_norm": 0.5236721634864807,
2885
+ "learning_rate": 1.65587219343696e-05,
2886
+ "loss": 1.1994,
2887
+ "step": 4110
2888
+ },
2889
+ {
2890
+ "epoch": 0.8450198692475324,
2891
+ "grad_norm": 0.5067551732063293,
2892
+ "learning_rate": 1.6342832469775477e-05,
2893
+ "loss": 1.1758,
2894
+ "step": 4120
2895
+ },
2896
+ {
2897
+ "epoch": 0.8470708883476478,
2898
+ "grad_norm": 0.5471055507659912,
2899
+ "learning_rate": 1.6126943005181345e-05,
2900
+ "loss": 1.2315,
2901
+ "step": 4130
2902
+ },
2903
+ {
2904
+ "epoch": 0.8491219074477631,
2905
+ "grad_norm": 0.514798641204834,
2906
+ "learning_rate": 1.591105354058722e-05,
2907
+ "loss": 1.183,
2908
+ "step": 4140
2909
+ },
2910
+ {
2911
+ "epoch": 0.8511729265478785,
2912
+ "grad_norm": 0.5316623449325562,
2913
+ "learning_rate": 1.5695164075993092e-05,
2914
+ "loss": 1.1997,
2915
+ "step": 4150
2916
+ },
2917
+ {
2918
+ "epoch": 0.8532239456479939,
2919
+ "grad_norm": 0.531896710395813,
2920
+ "learning_rate": 1.5479274611398964e-05,
2921
+ "loss": 1.1967,
2922
+ "step": 4160
2923
+ },
2924
+ {
2925
+ "epoch": 0.8552749647481093,
2926
+ "grad_norm": 0.5044012665748596,
2927
+ "learning_rate": 1.5263385146804836e-05,
2928
+ "loss": 1.2061,
2929
+ "step": 4170
2930
+ },
2931
+ {
2932
+ "epoch": 0.8573259838482246,
2933
+ "grad_norm": 0.547264039516449,
2934
+ "learning_rate": 1.5047495682210708e-05,
2935
+ "loss": 1.1975,
2936
+ "step": 4180
2937
+ },
2938
+ {
2939
+ "epoch": 0.85937700294834,
2940
+ "grad_norm": 0.5514972805976868,
2941
+ "learning_rate": 1.4831606217616581e-05,
2942
+ "loss": 1.2044,
2943
+ "step": 4190
2944
+ },
2945
+ {
2946
+ "epoch": 0.8614280220484554,
2947
+ "grad_norm": 0.5322652459144592,
2948
+ "learning_rate": 1.4615716753022455e-05,
2949
+ "loss": 1.2044,
2950
+ "step": 4200
2951
+ },
2952
+ {
2953
+ "epoch": 0.8634790411485707,
2954
+ "grad_norm": 0.5309359431266785,
2955
+ "learning_rate": 1.4399827288428325e-05,
2956
+ "loss": 1.2066,
2957
+ "step": 4210
2958
+ },
2959
+ {
2960
+ "epoch": 0.8655300602486861,
2961
+ "grad_norm": 0.5314792394638062,
2962
+ "learning_rate": 1.4183937823834198e-05,
2963
+ "loss": 1.2006,
2964
+ "step": 4220
2965
+ },
2966
+ {
2967
+ "epoch": 0.8675810793488015,
2968
+ "grad_norm": 0.5549922585487366,
2969
+ "learning_rate": 1.3968048359240068e-05,
2970
+ "loss": 1.2058,
2971
+ "step": 4230
2972
+ },
2973
+ {
2974
+ "epoch": 0.8696320984489168,
2975
+ "grad_norm": 0.5373049378395081,
2976
+ "learning_rate": 1.3752158894645942e-05,
2977
+ "loss": 1.2002,
2978
+ "step": 4240
2979
+ },
2980
+ {
2981
+ "epoch": 0.8716831175490322,
2982
+ "grad_norm": 0.5322666764259338,
2983
+ "learning_rate": 1.3536269430051815e-05,
2984
+ "loss": 1.215,
2985
+ "step": 4250
2986
+ },
2987
+ {
2988
+ "epoch": 0.8737341366491476,
2989
+ "grad_norm": 0.5549564957618713,
2990
+ "learning_rate": 1.3320379965457685e-05,
2991
+ "loss": 1.2131,
2992
+ "step": 4260
2993
+ },
2994
+ {
2995
+ "epoch": 0.875785155749263,
2996
+ "grad_norm": 0.5308319926261902,
2997
+ "learning_rate": 1.3104490500863559e-05,
2998
+ "loss": 1.2203,
2999
+ "step": 4270
3000
+ },
3001
+ {
3002
+ "epoch": 0.8778361748493783,
3003
+ "grad_norm": 0.5089017152786255,
3004
+ "learning_rate": 1.2888601036269432e-05,
3005
+ "loss": 1.1801,
3006
+ "step": 4280
3007
+ },
3008
+ {
3009
+ "epoch": 0.8798871939494937,
3010
+ "grad_norm": 0.5377966165542603,
3011
+ "learning_rate": 1.2672711571675302e-05,
3012
+ "loss": 1.189,
3013
+ "step": 4290
3014
+ },
3015
+ {
3016
+ "epoch": 0.8819382130496091,
3017
+ "grad_norm": 0.5528485178947449,
3018
+ "learning_rate": 1.2456822107081174e-05,
3019
+ "loss": 1.2197,
3020
+ "step": 4300
3021
+ },
3022
+ {
3023
+ "epoch": 0.8839892321497244,
3024
+ "grad_norm": 0.5241679549217224,
3025
+ "learning_rate": 1.2240932642487048e-05,
3026
+ "loss": 1.1652,
3027
+ "step": 4310
3028
+ },
3029
+ {
3030
+ "epoch": 0.8860402512498398,
3031
+ "grad_norm": 0.5626764893531799,
3032
+ "learning_rate": 1.202504317789292e-05,
3033
+ "loss": 1.1805,
3034
+ "step": 4320
3035
+ },
3036
+ {
3037
+ "epoch": 0.8880912703499552,
3038
+ "grad_norm": 0.5248028635978699,
3039
+ "learning_rate": 1.1809153713298791e-05,
3040
+ "loss": 1.1652,
3041
+ "step": 4330
3042
+ },
3043
+ {
3044
+ "epoch": 0.8901422894500705,
3045
+ "grad_norm": 0.5452848672866821,
3046
+ "learning_rate": 1.1593264248704663e-05,
3047
+ "loss": 1.2171,
3048
+ "step": 4340
3049
+ },
3050
+ {
3051
+ "epoch": 0.8921933085501859,
3052
+ "grad_norm": 0.5505712628364563,
3053
+ "learning_rate": 1.1377374784110537e-05,
3054
+ "loss": 1.1967,
3055
+ "step": 4350
3056
+ },
3057
+ {
3058
+ "epoch": 0.8942443276503013,
3059
+ "grad_norm": 0.5437038540840149,
3060
+ "learning_rate": 1.1161485319516408e-05,
3061
+ "loss": 1.2216,
3062
+ "step": 4360
3063
+ },
3064
+ {
3065
+ "epoch": 0.8962953467504167,
3066
+ "grad_norm": 0.5138014554977417,
3067
+ "learning_rate": 1.094559585492228e-05,
3068
+ "loss": 1.193,
3069
+ "step": 4370
3070
+ },
3071
+ {
3072
+ "epoch": 0.898346365850532,
3073
+ "grad_norm": 0.542080283164978,
3074
+ "learning_rate": 1.0729706390328152e-05,
3075
+ "loss": 1.1677,
3076
+ "step": 4380
3077
+ },
3078
+ {
3079
+ "epoch": 0.9003973849506474,
3080
+ "grad_norm": 0.5166792273521423,
3081
+ "learning_rate": 1.0513816925734024e-05,
3082
+ "loss": 1.2147,
3083
+ "step": 4390
3084
+ },
3085
+ {
3086
+ "epoch": 0.9024484040507628,
3087
+ "grad_norm": 0.536491334438324,
3088
+ "learning_rate": 1.0297927461139897e-05,
3089
+ "loss": 1.2077,
3090
+ "step": 4400
3091
+ },
3092
+ {
3093
+ "epoch": 0.9044994231508781,
3094
+ "grad_norm": 0.5504462718963623,
3095
+ "learning_rate": 1.0082037996545769e-05,
3096
+ "loss": 1.1913,
3097
+ "step": 4410
3098
+ },
3099
+ {
3100
+ "epoch": 0.9065504422509935,
3101
+ "grad_norm": 0.5299994945526123,
3102
+ "learning_rate": 9.866148531951641e-06,
3103
+ "loss": 1.1987,
3104
+ "step": 4420
3105
+ },
3106
+ {
3107
+ "epoch": 0.9086014613511089,
3108
+ "grad_norm": 0.5432473421096802,
3109
+ "learning_rate": 9.650259067357513e-06,
3110
+ "loss": 1.199,
3111
+ "step": 4430
3112
+ },
3113
+ {
3114
+ "epoch": 0.9106524804512242,
3115
+ "grad_norm": 0.529331386089325,
3116
+ "learning_rate": 9.434369602763386e-06,
3117
+ "loss": 1.214,
3118
+ "step": 4440
3119
+ },
3120
+ {
3121
+ "epoch": 0.9127034995513396,
3122
+ "grad_norm": 0.49785298109054565,
3123
+ "learning_rate": 9.218480138169258e-06,
3124
+ "loss": 1.202,
3125
+ "step": 4450
3126
+ },
3127
+ {
3128
+ "epoch": 0.914754518651455,
3129
+ "grad_norm": 0.5281327962875366,
3130
+ "learning_rate": 9.00259067357513e-06,
3131
+ "loss": 1.1904,
3132
+ "step": 4460
3133
+ },
3134
+ {
3135
+ "epoch": 0.9168055377515704,
3136
+ "grad_norm": 0.5474033951759338,
3137
+ "learning_rate": 8.786701208981002e-06,
3138
+ "loss": 1.1972,
3139
+ "step": 4470
3140
+ },
3141
+ {
3142
+ "epoch": 0.9188565568516857,
3143
+ "grad_norm": 0.5412236452102661,
3144
+ "learning_rate": 8.570811744386873e-06,
3145
+ "loss": 1.1797,
3146
+ "step": 4480
3147
+ },
3148
+ {
3149
+ "epoch": 0.9209075759518011,
3150
+ "grad_norm": 0.5599170923233032,
3151
+ "learning_rate": 8.354922279792747e-06,
3152
+ "loss": 1.176,
3153
+ "step": 4490
3154
+ },
3155
+ {
3156
+ "epoch": 0.9229585950519165,
3157
+ "grad_norm": 0.5590323805809021,
3158
+ "learning_rate": 8.139032815198619e-06,
3159
+ "loss": 1.1863,
3160
+ "step": 4500
3161
+ },
3162
+ {
3163
+ "epoch": 0.9250096141520318,
3164
+ "grad_norm": 0.566150426864624,
3165
+ "learning_rate": 7.92314335060449e-06,
3166
+ "loss": 1.2217,
3167
+ "step": 4510
3168
+ },
3169
+ {
3170
+ "epoch": 0.9270606332521472,
3171
+ "grad_norm": 0.5459644794464111,
3172
+ "learning_rate": 7.707253886010362e-06,
3173
+ "loss": 1.1903,
3174
+ "step": 4520
3175
+ },
3176
+ {
3177
+ "epoch": 0.9291116523522626,
3178
+ "grad_norm": 0.5333088636398315,
3179
+ "learning_rate": 7.491364421416235e-06,
3180
+ "loss": 1.2076,
3181
+ "step": 4530
3182
+ },
3183
+ {
3184
+ "epoch": 0.9311626714523779,
3185
+ "grad_norm": 0.5921478271484375,
3186
+ "learning_rate": 7.2754749568221076e-06,
3187
+ "loss": 1.191,
3188
+ "step": 4540
3189
+ },
3190
+ {
3191
+ "epoch": 0.9332136905524933,
3192
+ "grad_norm": 0.5061055421829224,
3193
+ "learning_rate": 7.059585492227979e-06,
3194
+ "loss": 1.1787,
3195
+ "step": 4550
3196
+ },
3197
+ {
3198
+ "epoch": 0.9352647096526087,
3199
+ "grad_norm": 0.5804794430732727,
3200
+ "learning_rate": 6.843696027633852e-06,
3201
+ "loss": 1.2096,
3202
+ "step": 4560
3203
+ },
3204
+ {
3205
+ "epoch": 0.9373157287527241,
3206
+ "grad_norm": 0.5328559875488281,
3207
+ "learning_rate": 6.627806563039724e-06,
3208
+ "loss": 1.2072,
3209
+ "step": 4570
3210
+ },
3211
+ {
3212
+ "epoch": 0.9393667478528394,
3213
+ "grad_norm": 0.518925130367279,
3214
+ "learning_rate": 6.4119170984455965e-06,
3215
+ "loss": 1.2119,
3216
+ "step": 4580
3217
+ },
3218
+ {
3219
+ "epoch": 0.9414177669529548,
3220
+ "grad_norm": 0.5092957019805908,
3221
+ "learning_rate": 6.196027633851468e-06,
3222
+ "loss": 1.2137,
3223
+ "step": 4590
3224
+ },
3225
+ {
3226
+ "epoch": 0.9434687860530702,
3227
+ "grad_norm": 0.5156581401824951,
3228
+ "learning_rate": 5.980138169257341e-06,
3229
+ "loss": 1.2059,
3230
+ "step": 4600
3231
+ },
3232
+ {
3233
+ "epoch": 0.9455198051531855,
3234
+ "grad_norm": 0.5467930436134338,
3235
+ "learning_rate": 5.764248704663213e-06,
3236
+ "loss": 1.2111,
3237
+ "step": 4610
3238
+ },
3239
+ {
3240
+ "epoch": 0.9475708242533009,
3241
+ "grad_norm": 0.5478744506835938,
3242
+ "learning_rate": 5.548359240069085e-06,
3243
+ "loss": 1.2014,
3244
+ "step": 4620
3245
+ },
3246
+ {
3247
+ "epoch": 0.9496218433534163,
3248
+ "grad_norm": 0.5648489594459534,
3249
+ "learning_rate": 5.332469775474957e-06,
3250
+ "loss": 1.179,
3251
+ "step": 4630
3252
+ },
3253
+ {
3254
+ "epoch": 0.9516728624535316,
3255
+ "grad_norm": 0.5315075516700745,
3256
+ "learning_rate": 5.11658031088083e-06,
3257
+ "loss": 1.2005,
3258
+ "step": 4640
3259
+ },
3260
+ {
3261
+ "epoch": 0.953723881553647,
3262
+ "grad_norm": 0.5487618446350098,
3263
+ "learning_rate": 4.9006908462867016e-06,
3264
+ "loss": 1.2175,
3265
+ "step": 4650
3266
+ },
3267
+ {
3268
+ "epoch": 0.9557749006537624,
3269
+ "grad_norm": 0.5332956314086914,
3270
+ "learning_rate": 4.684801381692574e-06,
3271
+ "loss": 1.167,
3272
+ "step": 4660
3273
+ },
3274
+ {
3275
+ "epoch": 0.9578259197538777,
3276
+ "grad_norm": 0.5483719110488892,
3277
+ "learning_rate": 4.468911917098446e-06,
3278
+ "loss": 1.1857,
3279
+ "step": 4670
3280
+ },
3281
+ {
3282
+ "epoch": 0.9598769388539931,
3283
+ "grad_norm": 0.5331635475158691,
3284
+ "learning_rate": 4.253022452504319e-06,
3285
+ "loss": 1.209,
3286
+ "step": 4680
3287
+ },
3288
+ {
3289
+ "epoch": 0.9619279579541085,
3290
+ "grad_norm": 0.5277014970779419,
3291
+ "learning_rate": 4.0371329879101905e-06,
3292
+ "loss": 1.1785,
3293
+ "step": 4690
3294
+ },
3295
+ {
3296
+ "epoch": 0.9639789770542239,
3297
+ "grad_norm": 0.5312276482582092,
3298
+ "learning_rate": 3.821243523316062e-06,
3299
+ "loss": 1.1873,
3300
+ "step": 4700
3301
+ },
3302
+ {
3303
+ "epoch": 0.9660299961543392,
3304
+ "grad_norm": 0.532839834690094,
3305
+ "learning_rate": 3.6053540587219345e-06,
3306
+ "loss": 1.2065,
3307
+ "step": 4710
3308
+ },
3309
+ {
3310
+ "epoch": 0.9680810152544546,
3311
+ "grad_norm": 0.5413607954978943,
3312
+ "learning_rate": 3.3894645941278067e-06,
3313
+ "loss": 1.2088,
3314
+ "step": 4720
3315
+ },
3316
+ {
3317
+ "epoch": 0.97013203435457,
3318
+ "grad_norm": 0.5259295105934143,
3319
+ "learning_rate": 3.173575129533679e-06,
3320
+ "loss": 1.2008,
3321
+ "step": 4730
3322
+ },
3323
+ {
3324
+ "epoch": 0.9721830534546853,
3325
+ "grad_norm": 0.5716482996940613,
3326
+ "learning_rate": 2.957685664939551e-06,
3327
+ "loss": 1.199,
3328
+ "step": 4740
3329
+ },
3330
+ {
3331
+ "epoch": 0.9742340725548007,
3332
+ "grad_norm": 0.5410735607147217,
3333
+ "learning_rate": 2.7417962003454234e-06,
3334
+ "loss": 1.2012,
3335
+ "step": 4750
3336
+ },
3337
+ {
3338
+ "epoch": 0.9762850916549161,
3339
+ "grad_norm": 0.5225376486778259,
3340
+ "learning_rate": 2.5259067357512956e-06,
3341
+ "loss": 1.1964,
3342
+ "step": 4760
3343
+ },
3344
+ {
3345
+ "epoch": 0.9783361107550314,
3346
+ "grad_norm": 0.5672590732574463,
3347
+ "learning_rate": 2.310017271157168e-06,
3348
+ "loss": 1.1853,
3349
+ "step": 4770
3350
+ },
3351
+ {
3352
+ "epoch": 0.9803871298551468,
3353
+ "grad_norm": 0.5533677339553833,
3354
+ "learning_rate": 2.09412780656304e-06,
3355
+ "loss": 1.1863,
3356
+ "step": 4780
3357
+ },
3358
+ {
3359
+ "epoch": 0.9824381489552622,
3360
+ "grad_norm": 0.5407289862632751,
3361
+ "learning_rate": 1.878238341968912e-06,
3362
+ "loss": 1.2232,
3363
+ "step": 4790
3364
+ },
3365
+ {
3366
+ "epoch": 0.9844891680553776,
3367
+ "grad_norm": 0.558232843875885,
3368
+ "learning_rate": 1.6623488773747842e-06,
3369
+ "loss": 1.1719,
3370
+ "step": 4800
3371
+ }
3372
+ ],
3373
+ "logging_steps": 10,
3374
+ "max_steps": 4876,
3375
+ "num_input_tokens_seen": 0,
3376
+ "num_train_epochs": 1,
3377
+ "save_steps": 200,
3378
+ "stateful_callbacks": {
3379
+ "TrainerControl": {
3380
+ "args": {
3381
+ "should_epoch_stop": false,
3382
+ "should_evaluate": false,
3383
+ "should_log": false,
3384
+ "should_save": true,
3385
+ "should_training_stop": false
3386
+ },
3387
+ "attributes": {}
3388
+ }
3389
+ },
3390
+ "total_flos": 1.2249860917047091e+17,
3391
+ "train_batch_size": 4,
3392
+ "trial_name": null,
3393
+ "trial_params": null
3394
+ }
lora_checkpoints/checkpoint-4800/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b8f6520f47933838e96dca56ee883040325b73481aff07afcabf963674a84fe
3
+ size 5624
lora_checkpoints/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
lora_checkpoints/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
lora_checkpoints/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff