SpireLab committed on
Commit
137c748
·
verified ·
1 Parent(s): c6f7a63

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. muril_bh_domain/checkpoint-4000/README.md +202 -0
  2. muril_bh_domain/checkpoint-4000/adapter_config.json +32 -0
  3. muril_bh_domain/checkpoint-4000/adapter_model.safetensors +3 -0
  4. muril_bh_domain/checkpoint-4000/optimizer.pt +3 -0
  5. muril_bh_domain/checkpoint-4000/rng_state.pth +3 -0
  6. muril_bh_domain/checkpoint-4000/scheduler.pt +3 -0
  7. muril_bh_domain/checkpoint-4000/trainer_state.json +341 -0
  8. muril_bh_domain/checkpoint-4000/training_args.bin +3 -0
  9. muril_bh_domain/checkpoint-4740/README.md +202 -0
  10. muril_bh_domain/checkpoint-4740/adapter_config.json +32 -0
  11. muril_bh_domain/checkpoint-4740/adapter_model.safetensors +3 -0
  12. muril_bh_domain/checkpoint-4740/optimizer.pt +3 -0
  13. muril_bh_domain/checkpoint-4740/rng_state.pth +3 -0
  14. muril_bh_domain/checkpoint-4740/scheduler.pt +3 -0
  15. muril_bh_domain/checkpoint-4740/trainer_state.json +390 -0
  16. muril_bh_domain/checkpoint-4740/training_args.bin +3 -0
  17. muril_bh_domain/config.json +26 -0
  18. muril_bh_domain/generation_config.json +5 -0
  19. muril_bh_domain/model.safetensors +3 -0
  20. muril_bh_domain/special_tokens_map.json +7 -0
  21. muril_bh_domain/tokenizer.json +0 -0
  22. muril_bh_domain/tokenizer_config.json +58 -0
  23. muril_bh_domain/vocab.txt +0 -0
  24. muril_bn_domain/config.json +26 -0
  25. muril_bn_domain/generation_config.json +5 -0
  26. muril_bn_domain/model.safetensors +3 -0
  27. muril_bn_domain/special_tokens_map.json +7 -0
  28. muril_bn_domain/tokenizer.json +0 -0
  29. muril_bn_domain/tokenizer_config.json +58 -0
  30. muril_bn_domain/training_args.bin +3 -0
  31. muril_bn_domain/vocab.txt +0 -0
  32. muril_ch_domain/checkpoint-30500/README.md +202 -0
  33. muril_ch_domain/checkpoint-30500/adapter_config.json +34 -0
  34. muril_ch_domain/checkpoint-30500/adapter_model.safetensors +3 -0
  35. muril_ch_domain/checkpoint-30500/optimizer.pt +3 -0
  36. muril_ch_domain/checkpoint-30500/rng_state.pth +3 -0
  37. muril_ch_domain/checkpoint-30500/scheduler.pt +3 -0
  38. muril_ch_domain/checkpoint-30500/trainer_state.json +2595 -0
  39. muril_ch_domain/checkpoint-30500/training_args.bin +3 -0
  40. muril_ch_domain/checkpoint-30663/README.md +202 -0
  41. muril_ch_domain/checkpoint-30663/adapter_config.json +34 -0
  42. muril_ch_domain/checkpoint-30663/adapter_model.safetensors +3 -0
  43. muril_ch_domain/checkpoint-30663/optimizer.pt +3 -0
  44. muril_ch_domain/checkpoint-30663/rng_state.pth +3 -0
  45. muril_ch_domain/checkpoint-30663/scheduler.pt +3 -0
  46. muril_ch_domain/checkpoint-30663/trainer_state.json +2602 -0
  47. muril_ch_domain/checkpoint-30663/training_args.bin +3 -0
  48. muril_ch_domain/config.json +26 -0
  49. muril_ch_domain/generation_config.json +5 -0
  50. muril_ch_domain/model.safetensors +3 -0
muril_bh_domain/checkpoint-4000/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
muril_bh_domain/checkpoint-4000/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "query",
27
+ "value"
28
+ ],
29
+ "task_type": null,
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
muril_bh_domain/checkpoint-4000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:deb73549679fd91090a64a88d75c687dde7b28b08950efa95d45de74872e7203
3
+ size 2366064
muril_bh_domain/checkpoint-4000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1daa4eb30785be66449e5d62b8e08a5908afcf2aa7986702f353f32bc8b42d3
3
+ size 4759290
muril_bh_domain/checkpoint-4000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26e24e2231a33bdad5887754a5b6968219ef12961d04369739f1350efee44b72
3
+ size 14244
muril_bh_domain/checkpoint-4000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edbdfc0163382607dc91e6b9699f3f3a2c3a204482ef6360e91ff42dd0f5b83a
3
+ size 1064
muril_bh_domain/checkpoint-4000/trainer_state.json ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.530844669408415,
5
+ "eval_steps": 1000,
6
+ "global_step": 4000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06327111673521038,
13
+ "grad_norm": 4.5707688331604,
14
+ "learning_rate": 1.0548523206751056e-05,
15
+ "loss": 6.0536,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.12654223347042076,
20
+ "grad_norm": 7.803924083709717,
21
+ "learning_rate": 2.1097046413502112e-05,
22
+ "loss": 5.8177,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.18981335020563114,
27
+ "grad_norm": 8.050126075744629,
28
+ "learning_rate": 3.1645569620253167e-05,
29
+ "loss": 5.0311,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.2530844669408415,
34
+ "grad_norm": 8.803975105285645,
35
+ "learning_rate": 4.2194092827004224e-05,
36
+ "loss": 4.7351,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.3163555836760519,
41
+ "grad_norm": 12.846040725708008,
42
+ "learning_rate": 4.96952648851383e-05,
43
+ "loss": 4.5453,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.3796267004112623,
48
+ "grad_norm": 15.128095626831055,
49
+ "learning_rate": 4.852320675105486e-05,
50
+ "loss": 4.3979,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.44289781714647264,
55
+ "grad_norm": 10.33956527709961,
56
+ "learning_rate": 4.7351148616971405e-05,
57
+ "loss": 4.4056,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.506168933881683,
62
+ "grad_norm": 23.287179946899414,
63
+ "learning_rate": 4.617909048288795e-05,
64
+ "loss": 4.2787,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.5694400506168934,
69
+ "grad_norm": 12.955565452575684,
70
+ "learning_rate": 4.50070323488045e-05,
71
+ "loss": 4.3504,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.6327111673521038,
76
+ "grad_norm": 10.07479476928711,
77
+ "learning_rate": 4.3834974214721055e-05,
78
+ "loss": 4.2521,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.6327111673521038,
83
+ "eval_runtime": 33.0567,
84
+ "eval_samples_per_second": 95.593,
85
+ "eval_steps_per_second": 11.949,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.6959822840873141,
90
+ "grad_norm": 12.32780647277832,
91
+ "learning_rate": 4.26629160806376e-05,
92
+ "loss": 4.2314,
93
+ "step": 1100
94
+ },
95
+ {
96
+ "epoch": 0.7592534008225246,
97
+ "grad_norm": 15.515801429748535,
98
+ "learning_rate": 4.149085794655415e-05,
99
+ "loss": 4.3058,
100
+ "step": 1200
101
+ },
102
+ {
103
+ "epoch": 0.8225245175577349,
104
+ "grad_norm": 12.472834587097168,
105
+ "learning_rate": 4.03187998124707e-05,
106
+ "loss": 4.2158,
107
+ "step": 1300
108
+ },
109
+ {
110
+ "epoch": 0.8857956342929453,
111
+ "grad_norm": 16.903112411499023,
112
+ "learning_rate": 3.914674167838725e-05,
113
+ "loss": 4.142,
114
+ "step": 1400
115
+ },
116
+ {
117
+ "epoch": 0.9490667510281556,
118
+ "grad_norm": 13.487791061401367,
119
+ "learning_rate": 3.79746835443038e-05,
120
+ "loss": 4.0534,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 1.012337867763366,
125
+ "grad_norm": 14.721494674682617,
126
+ "learning_rate": 3.680262541022035e-05,
127
+ "loss": 4.1405,
128
+ "step": 1600
129
+ },
130
+ {
131
+ "epoch": 1.0756089844985763,
132
+ "grad_norm": 16.011690139770508,
133
+ "learning_rate": 3.56305672761369e-05,
134
+ "loss": 4.1192,
135
+ "step": 1700
136
+ },
137
+ {
138
+ "epoch": 1.1388801012337868,
139
+ "grad_norm": 15.692846298217773,
140
+ "learning_rate": 3.445850914205345e-05,
141
+ "loss": 4.1407,
142
+ "step": 1800
143
+ },
144
+ {
145
+ "epoch": 1.2021512179689973,
146
+ "grad_norm": 13.71811294555664,
147
+ "learning_rate": 3.328645100797e-05,
148
+ "loss": 4.1839,
149
+ "step": 1900
150
+ },
151
+ {
152
+ "epoch": 1.2654223347042075,
153
+ "grad_norm": 13.474448204040527,
154
+ "learning_rate": 3.2114392873886545e-05,
155
+ "loss": 4.0813,
156
+ "step": 2000
157
+ },
158
+ {
159
+ "epoch": 1.2654223347042075,
160
+ "eval_runtime": 33.0889,
161
+ "eval_samples_per_second": 95.5,
162
+ "eval_steps_per_second": 11.938,
163
+ "step": 2000
164
+ },
165
+ {
166
+ "epoch": 1.328693451439418,
167
+ "grad_norm": 15.276654243469238,
168
+ "learning_rate": 3.09423347398031e-05,
169
+ "loss": 3.9624,
170
+ "step": 2100
171
+ },
172
+ {
173
+ "epoch": 1.3919645681746282,
174
+ "grad_norm": 13.796238899230957,
175
+ "learning_rate": 2.9770276605719643e-05,
176
+ "loss": 4.0653,
177
+ "step": 2200
178
+ },
179
+ {
180
+ "epoch": 1.4552356849098387,
181
+ "grad_norm": 16.452594757080078,
182
+ "learning_rate": 2.8598218471636194e-05,
183
+ "loss": 4.0338,
184
+ "step": 2300
185
+ },
186
+ {
187
+ "epoch": 1.518506801645049,
188
+ "grad_norm": 20.27753257751465,
189
+ "learning_rate": 2.7426160337552742e-05,
190
+ "loss": 4.0962,
191
+ "step": 2400
192
+ },
193
+ {
194
+ "epoch": 1.5817779183802594,
195
+ "grad_norm": 15.492554664611816,
196
+ "learning_rate": 2.6254102203469293e-05,
197
+ "loss": 4.057,
198
+ "step": 2500
199
+ },
200
+ {
201
+ "epoch": 1.6450490351154698,
202
+ "grad_norm": 15.298516273498535,
203
+ "learning_rate": 2.508204406938584e-05,
204
+ "loss": 4.0301,
205
+ "step": 2600
206
+ },
207
+ {
208
+ "epoch": 1.70832015185068,
209
+ "grad_norm": 15.670785903930664,
210
+ "learning_rate": 2.3909985935302392e-05,
211
+ "loss": 4.0129,
212
+ "step": 2700
213
+ },
214
+ {
215
+ "epoch": 1.7715912685858906,
216
+ "grad_norm": 18.541555404663086,
217
+ "learning_rate": 2.2737927801218943e-05,
218
+ "loss": 3.9724,
219
+ "step": 2800
220
+ },
221
+ {
222
+ "epoch": 1.834862385321101,
223
+ "grad_norm": 19.13411521911621,
224
+ "learning_rate": 2.156586966713549e-05,
225
+ "loss": 4.0044,
226
+ "step": 2900
227
+ },
228
+ {
229
+ "epoch": 1.8981335020563113,
230
+ "grad_norm": 14.532624244689941,
231
+ "learning_rate": 2.039381153305204e-05,
232
+ "loss": 3.9882,
233
+ "step": 3000
234
+ },
235
+ {
236
+ "epoch": 1.8981335020563113,
237
+ "eval_runtime": 33.1181,
238
+ "eval_samples_per_second": 95.416,
239
+ "eval_steps_per_second": 11.927,
240
+ "step": 3000
241
+ },
242
+ {
243
+ "epoch": 1.9614046187915217,
244
+ "grad_norm": 15.767202377319336,
245
+ "learning_rate": 1.922175339896859e-05,
246
+ "loss": 3.9372,
247
+ "step": 3100
248
+ },
249
+ {
250
+ "epoch": 2.024675735526732,
251
+ "grad_norm": 17.210546493530273,
252
+ "learning_rate": 1.804969526488514e-05,
253
+ "loss": 3.9757,
254
+ "step": 3200
255
+ },
256
+ {
257
+ "epoch": 2.0879468522619424,
258
+ "grad_norm": 15.209254264831543,
259
+ "learning_rate": 1.6877637130801688e-05,
260
+ "loss": 3.9668,
261
+ "step": 3300
262
+ },
263
+ {
264
+ "epoch": 2.1512179689971527,
265
+ "grad_norm": 16.821176528930664,
266
+ "learning_rate": 1.570557899671824e-05,
267
+ "loss": 3.9732,
268
+ "step": 3400
269
+ },
270
+ {
271
+ "epoch": 2.2144890857323634,
272
+ "grad_norm": 15.914960861206055,
273
+ "learning_rate": 1.4533520862634786e-05,
274
+ "loss": 3.9375,
275
+ "step": 3500
276
+ },
277
+ {
278
+ "epoch": 2.2777602024675736,
279
+ "grad_norm": 16.489627838134766,
280
+ "learning_rate": 1.3361462728551336e-05,
281
+ "loss": 3.9993,
282
+ "step": 3600
283
+ },
284
+ {
285
+ "epoch": 2.341031319202784,
286
+ "grad_norm": 17.943021774291992,
287
+ "learning_rate": 1.2189404594467887e-05,
288
+ "loss": 3.9889,
289
+ "step": 3700
290
+ },
291
+ {
292
+ "epoch": 2.4043024359379945,
293
+ "grad_norm": 14.150239944458008,
294
+ "learning_rate": 1.1017346460384436e-05,
295
+ "loss": 4.0,
296
+ "step": 3800
297
+ },
298
+ {
299
+ "epoch": 2.4675735526732048,
300
+ "grad_norm": 15.843707084655762,
301
+ "learning_rate": 9.845288326300985e-06,
302
+ "loss": 3.9527,
303
+ "step": 3900
304
+ },
305
+ {
306
+ "epoch": 2.530844669408415,
307
+ "grad_norm": 16.922142028808594,
308
+ "learning_rate": 8.673230192217533e-06,
309
+ "loss": 3.8801,
310
+ "step": 4000
311
+ },
312
+ {
313
+ "epoch": 2.530844669408415,
314
+ "eval_runtime": 33.0326,
315
+ "eval_samples_per_second": 95.663,
316
+ "eval_steps_per_second": 11.958,
317
+ "step": 4000
318
+ }
319
+ ],
320
+ "logging_steps": 100,
321
+ "max_steps": 4740,
322
+ "num_input_tokens_seen": 0,
323
+ "num_train_epochs": 3,
324
+ "save_steps": 1000,
325
+ "stateful_callbacks": {
326
+ "TrainerControl": {
327
+ "args": {
328
+ "should_epoch_stop": false,
329
+ "should_evaluate": false,
330
+ "should_log": false,
331
+ "should_save": true,
332
+ "should_training_stop": false
333
+ },
334
+ "attributes": {}
335
+ }
336
+ },
337
+ "total_flos": 1700301256769430.0,
338
+ "train_batch_size": 8,
339
+ "trial_name": null,
340
+ "trial_params": null
341
+ }
muril_bh_domain/checkpoint-4000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a22c26ec129497c28db8c0928a06ac51978ce151cffa0ea84f44917313aa4fe
3
+ size 5304
muril_bh_domain/checkpoint-4740/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
muril_bh_domain/checkpoint-4740/adapter_config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 32,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 16,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "query",
27
+ "value"
28
+ ],
29
+ "task_type": null,
30
+ "use_dora": false,
31
+ "use_rslora": false
32
+ }
muril_bh_domain/checkpoint-4740/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc7f62c24737614448fd04808465491ac77f32241a0a2a9cab226b74deb95ad2
3
+ size 2366064
muril_bh_domain/checkpoint-4740/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e80f2c68afd84173016108d5470b712a09e93d00829013e7b7624072079df805
3
+ size 4759290
muril_bh_domain/checkpoint-4740/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c9afe1aba4f99f67ff47a476d35ef83b49ca75cc2da61962433d26d445986b7
3
+ size 14244
muril_bh_domain/checkpoint-4740/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45472c9e15a883df2692bf1340ef4c87defa24d90bfd8aefefaaecfb0aac3aa4
3
+ size 1064
muril_bh_domain/checkpoint-4740/trainer_state.json ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.999050933248972,
5
+ "eval_steps": 1000,
6
+ "global_step": 4740,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06327111673521038,
13
+ "grad_norm": 4.5707688331604,
14
+ "learning_rate": 1.0548523206751056e-05,
15
+ "loss": 6.0536,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.12654223347042076,
20
+ "grad_norm": 7.803924083709717,
21
+ "learning_rate": 2.1097046413502112e-05,
22
+ "loss": 5.8177,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.18981335020563114,
27
+ "grad_norm": 8.050126075744629,
28
+ "learning_rate": 3.1645569620253167e-05,
29
+ "loss": 5.0311,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.2530844669408415,
34
+ "grad_norm": 8.803975105285645,
35
+ "learning_rate": 4.2194092827004224e-05,
36
+ "loss": 4.7351,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.3163555836760519,
41
+ "grad_norm": 12.846040725708008,
42
+ "learning_rate": 4.96952648851383e-05,
43
+ "loss": 4.5453,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.3796267004112623,
48
+ "grad_norm": 15.128095626831055,
49
+ "learning_rate": 4.852320675105486e-05,
50
+ "loss": 4.3979,
51
+ "step": 600
52
+ },
53
+ {
54
+ "epoch": 0.44289781714647264,
55
+ "grad_norm": 10.33956527709961,
56
+ "learning_rate": 4.7351148616971405e-05,
57
+ "loss": 4.4056,
58
+ "step": 700
59
+ },
60
+ {
61
+ "epoch": 0.506168933881683,
62
+ "grad_norm": 23.287179946899414,
63
+ "learning_rate": 4.617909048288795e-05,
64
+ "loss": 4.2787,
65
+ "step": 800
66
+ },
67
+ {
68
+ "epoch": 0.5694400506168934,
69
+ "grad_norm": 12.955565452575684,
70
+ "learning_rate": 4.50070323488045e-05,
71
+ "loss": 4.3504,
72
+ "step": 900
73
+ },
74
+ {
75
+ "epoch": 0.6327111673521038,
76
+ "grad_norm": 10.07479476928711,
77
+ "learning_rate": 4.3834974214721055e-05,
78
+ "loss": 4.2521,
79
+ "step": 1000
80
+ },
81
+ {
82
+ "epoch": 0.6327111673521038,
83
+ "eval_runtime": 33.0567,
84
+ "eval_samples_per_second": 95.593,
85
+ "eval_steps_per_second": 11.949,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.6959822840873141,
90
+ "grad_norm": 12.32780647277832,
91
+ "learning_rate": 4.26629160806376e-05,
92
+ "loss": 4.2314,
93
+ "step": 1100
94
+ },
95
+ {
96
+ "epoch": 0.7592534008225246,
97
+ "grad_norm": 15.515801429748535,
98
+ "learning_rate": 4.149085794655415e-05,
99
+ "loss": 4.3058,
100
+ "step": 1200
101
+ },
102
+ {
103
+ "epoch": 0.8225245175577349,
104
+ "grad_norm": 12.472834587097168,
105
+ "learning_rate": 4.03187998124707e-05,
106
+ "loss": 4.2158,
107
+ "step": 1300
108
+ },
109
+ {
110
+ "epoch": 0.8857956342929453,
111
+ "grad_norm": 16.903112411499023,
112
+ "learning_rate": 3.914674167838725e-05,
113
+ "loss": 4.142,
114
+ "step": 1400
115
+ },
116
+ {
117
+ "epoch": 0.9490667510281556,
118
+ "grad_norm": 13.487791061401367,
119
+ "learning_rate": 3.79746835443038e-05,
120
+ "loss": 4.0534,
121
+ "step": 1500
122
+ },
123
+ {
124
+ "epoch": 1.012337867763366,
125
+ "grad_norm": 14.721494674682617,
126
+ "learning_rate": 3.680262541022035e-05,
127
+ "loss": 4.1405,
128
+ "step": 1600
129
+ },
130
+ {
131
+ "epoch": 1.0756089844985763,
132
+ "grad_norm": 16.011690139770508,
133
+ "learning_rate": 3.56305672761369e-05,
134
+ "loss": 4.1192,
135
+ "step": 1700
136
+ },
137
+ {
138
+ "epoch": 1.1388801012337868,
139
+ "grad_norm": 15.692846298217773,
140
+ "learning_rate": 3.445850914205345e-05,
141
+ "loss": 4.1407,
142
+ "step": 1800
143
+ },
144
+ {
145
+ "epoch": 1.2021512179689973,
146
+ "grad_norm": 13.71811294555664,
147
+ "learning_rate": 3.328645100797e-05,
148
+ "loss": 4.1839,
149
+ "step": 1900
150
+ },
151
+ {
152
+ "epoch": 1.2654223347042075,
153
+ "grad_norm": 13.474448204040527,
154
+ "learning_rate": 3.2114392873886545e-05,
155
+ "loss": 4.0813,
156
+ "step": 2000
157
+ },
158
+ {
159
+ "epoch": 1.2654223347042075,
160
+ "eval_runtime": 33.0889,
161
+ "eval_samples_per_second": 95.5,
162
+ "eval_steps_per_second": 11.938,
163
+ "step": 2000
164
+ },
165
+ {
166
+ "epoch": 1.328693451439418,
167
+ "grad_norm": 15.276654243469238,
168
+ "learning_rate": 3.09423347398031e-05,
169
+ "loss": 3.9624,
170
+ "step": 2100
171
+ },
172
+ {
173
+ "epoch": 1.3919645681746282,
174
+ "grad_norm": 13.796238899230957,
175
+ "learning_rate": 2.9770276605719643e-05,
176
+ "loss": 4.0653,
177
+ "step": 2200
178
+ },
179
+ {
180
+ "epoch": 1.4552356849098387,
181
+ "grad_norm": 16.452594757080078,
182
+ "learning_rate": 2.8598218471636194e-05,
183
+ "loss": 4.0338,
184
+ "step": 2300
185
+ },
186
+ {
187
+ "epoch": 1.518506801645049,
188
+ "grad_norm": 20.27753257751465,
189
+ "learning_rate": 2.7426160337552742e-05,
190
+ "loss": 4.0962,
191
+ "step": 2400
192
+ },
193
+ {
194
+ "epoch": 1.5817779183802594,
195
+ "grad_norm": 15.492554664611816,
196
+ "learning_rate": 2.6254102203469293e-05,
197
+ "loss": 4.057,
198
+ "step": 2500
199
+ },
200
+ {
201
+ "epoch": 1.6450490351154698,
202
+ "grad_norm": 15.298516273498535,
203
+ "learning_rate": 2.508204406938584e-05,
204
+ "loss": 4.0301,
205
+ "step": 2600
206
+ },
207
+ {
208
+ "epoch": 1.70832015185068,
209
+ "grad_norm": 15.670785903930664,
210
+ "learning_rate": 2.3909985935302392e-05,
211
+ "loss": 4.0129,
212
+ "step": 2700
213
+ },
214
+ {
215
+ "epoch": 1.7715912685858906,
216
+ "grad_norm": 18.541555404663086,
217
+ "learning_rate": 2.2737927801218943e-05,
218
+ "loss": 3.9724,
219
+ "step": 2800
220
+ },
221
+ {
222
+ "epoch": 1.834862385321101,
223
+ "grad_norm": 19.13411521911621,
224
+ "learning_rate": 2.156586966713549e-05,
225
+ "loss": 4.0044,
226
+ "step": 2900
227
+ },
228
+ {
229
+ "epoch": 1.8981335020563113,
230
+ "grad_norm": 14.532624244689941,
231
+ "learning_rate": 2.039381153305204e-05,
232
+ "loss": 3.9882,
233
+ "step": 3000
234
+ },
235
+ {
236
+ "epoch": 1.8981335020563113,
237
+ "eval_runtime": 33.1181,
238
+ "eval_samples_per_second": 95.416,
239
+ "eval_steps_per_second": 11.927,
240
+ "step": 3000
241
+ },
242
+ {
243
+ "epoch": 1.9614046187915217,
244
+ "grad_norm": 15.767202377319336,
245
+ "learning_rate": 1.922175339896859e-05,
246
+ "loss": 3.9372,
247
+ "step": 3100
248
+ },
249
+ {
250
+ "epoch": 2.024675735526732,
251
+ "grad_norm": 17.210546493530273,
252
+ "learning_rate": 1.804969526488514e-05,
253
+ "loss": 3.9757,
254
+ "step": 3200
255
+ },
256
+ {
257
+ "epoch": 2.0879468522619424,
258
+ "grad_norm": 15.209254264831543,
259
+ "learning_rate": 1.6877637130801688e-05,
260
+ "loss": 3.9668,
261
+ "step": 3300
262
+ },
263
+ {
264
+ "epoch": 2.1512179689971527,
265
+ "grad_norm": 16.821176528930664,
266
+ "learning_rate": 1.570557899671824e-05,
267
+ "loss": 3.9732,
268
+ "step": 3400
269
+ },
270
+ {
271
+ "epoch": 2.2144890857323634,
272
+ "grad_norm": 15.914960861206055,
273
+ "learning_rate": 1.4533520862634786e-05,
274
+ "loss": 3.9375,
275
+ "step": 3500
276
+ },
277
+ {
278
+ "epoch": 2.2777602024675736,
279
+ "grad_norm": 16.489627838134766,
280
+ "learning_rate": 1.3361462728551336e-05,
281
+ "loss": 3.9993,
282
+ "step": 3600
283
+ },
284
+ {
285
+ "epoch": 2.341031319202784,
286
+ "grad_norm": 17.943021774291992,
287
+ "learning_rate": 1.2189404594467887e-05,
288
+ "loss": 3.9889,
289
+ "step": 3700
290
+ },
291
+ {
292
+ "epoch": 2.4043024359379945,
293
+ "grad_norm": 14.150239944458008,
294
+ "learning_rate": 1.1017346460384436e-05,
295
+ "loss": 4.0,
296
+ "step": 3800
297
+ },
298
+ {
299
+ "epoch": 2.4675735526732048,
300
+ "grad_norm": 15.843707084655762,
301
+ "learning_rate": 9.845288326300985e-06,
302
+ "loss": 3.9527,
303
+ "step": 3900
304
+ },
305
+ {
306
+ "epoch": 2.530844669408415,
307
+ "grad_norm": 16.922142028808594,
308
+ "learning_rate": 8.673230192217533e-06,
309
+ "loss": 3.8801,
310
+ "step": 4000
311
+ },
312
+ {
313
+ "epoch": 2.530844669408415,
314
+ "eval_runtime": 33.0326,
315
+ "eval_samples_per_second": 95.663,
316
+ "eval_steps_per_second": 11.958,
317
+ "step": 4000
318
+ },
319
+ {
320
+ "epoch": 2.5941157861436253,
321
+ "grad_norm": 19.046831130981445,
322
+ "learning_rate": 7.501172058134085e-06,
323
+ "loss": 3.9705,
324
+ "step": 4100
325
+ },
326
+ {
327
+ "epoch": 2.657386902878836,
328
+ "grad_norm": 15.516547203063965,
329
+ "learning_rate": 6.329113924050633e-06,
330
+ "loss": 3.9296,
331
+ "step": 4200
332
+ },
333
+ {
334
+ "epoch": 2.720658019614046,
335
+ "grad_norm": 16.0513916015625,
336
+ "learning_rate": 5.157055789967183e-06,
337
+ "loss": 4.0594,
338
+ "step": 4300
339
+ },
340
+ {
341
+ "epoch": 2.7839291363492564,
342
+ "grad_norm": 13.820748329162598,
343
+ "learning_rate": 3.984997655883732e-06,
344
+ "loss": 3.9353,
345
+ "step": 4400
346
+ },
347
+ {
348
+ "epoch": 2.847200253084467,
349
+ "grad_norm": 14.607758522033691,
350
+ "learning_rate": 2.8129395218002813e-06,
351
+ "loss": 4.0605,
352
+ "step": 4500
353
+ },
354
+ {
355
+ "epoch": 2.9104713698196774,
356
+ "grad_norm": 18.928266525268555,
357
+ "learning_rate": 1.6408813877168308e-06,
358
+ "loss": 3.9663,
359
+ "step": 4600
360
+ },
361
+ {
362
+ "epoch": 2.9737424865548876,
363
+ "grad_norm": 17.247835159301758,
364
+ "learning_rate": 4.688232536333803e-07,
365
+ "loss": 3.9057,
366
+ "step": 4700
367
+ }
368
+ ],
369
+ "logging_steps": 100,
370
+ "max_steps": 4740,
371
+ "num_input_tokens_seen": 0,
372
+ "num_train_epochs": 3,
373
+ "save_steps": 1000,
374
+ "stateful_callbacks": {
375
+ "TrainerControl": {
376
+ "args": {
377
+ "should_epoch_stop": false,
378
+ "should_evaluate": false,
379
+ "should_log": false,
380
+ "should_save": true,
381
+ "should_training_stop": true
382
+ },
383
+ "attributes": {}
384
+ }
385
+ },
386
+ "total_flos": 2016724755342822.0,
387
+ "train_batch_size": 8,
388
+ "trial_name": null,
389
+ "trial_params": null
390
+ }
muril_bh_domain/checkpoint-4740/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a22c26ec129497c28db8c0928a06ac51978ce151cffa0ea84f44917313aa4fe
3
+ size 5304
muril_bh_domain/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/muril-base-cased",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.46.3",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 197285
26
+ }
muril_bh_domain/generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.46.3"
5
+ }
muril_bh_domain/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df8f3e88e0c7d9391d346c9abb7e6b7b0729bf480dc131d713e7822db8e5b2a1
3
+ size 951043900
muril_bh_domain/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
muril_bh_domain/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
muril_bh_domain/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "lowercase": false,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": false,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
muril_bh_domain/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
muril_bn_domain/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/muril-base-cased",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.46.3",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 197285
26
+ }
muril_bn_domain/generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.46.3"
5
+ }
muril_bn_domain/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec9f38111bd7b18e2536f6a68716b77d056876facaf70df2e15d69a6e775c40b
3
+ size 951043900
muril_bn_domain/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
muril_bn_domain/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
muril_bn_domain/tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "103": {
20
+ "content": "[MASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "104": {
28
+ "content": "[CLS]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "105": {
36
+ "content": "[SEP]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "lowercase": false,
49
+ "mask_token": "[MASK]",
50
+ "model_max_length": 512,
51
+ "never_split": null,
52
+ "pad_token": "[PAD]",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": false,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
muril_bn_domain/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7045f6bb9f738a9d8ede00a3453edc500b8eb579edea5aaeff971f5a4baffd29
3
+ size 5304
muril_bn_domain/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
muril_ch_domain/checkpoint-30500/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
muril_ch_domain/checkpoint-30500/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 128,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 64,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "key",
27
+ "query",
28
+ "value",
29
+ "dense"
30
+ ],
31
+ "task_type": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
muril_ch_domain/checkpoint-30500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a355714bc7655686649d935524ea8155c31088d897e38f8e91446e8d3699105
3
+ size 42881168
muril_ch_domain/checkpoint-30500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e53fc7022d7bc695ba1e3ff7fdd1b3368c75e2346cdc9c402d6558e85d67284b
3
+ size 85843898
muril_ch_domain/checkpoint-30500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dcba7c160327f0ad67913f451ea37b39145ec96b984b91f6a4a72b0a5056736
3
+ size 14244
muril_ch_domain/checkpoint-30500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e37c8f6a8cf0a15e09cca408bc24278664576b9301e4288b0b47f7768d94d82b
3
+ size 1064
muril_ch_domain/checkpoint-30500/trainer_state.json ADDED
@@ -0,0 +1,2595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9839064716528885,
5
+ "eval_steps": 500,
6
+ "global_step": 30500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009783299907058651,
13
+ "grad_norm": 33.780609130859375,
14
+ "learning_rate": 1.6302575806977503e-06,
15
+ "loss": 6.574,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.019566599814117302,
20
+ "grad_norm": 33.439022064208984,
21
+ "learning_rate": 3.2605151613955006e-06,
22
+ "loss": 6.1653,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02934989972117595,
27
+ "grad_norm": 27.25751304626465,
28
+ "learning_rate": 4.890772742093251e-06,
29
+ "loss": 5.5515,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.039133199628234604,
34
+ "grad_norm": 38.36979675292969,
35
+ "learning_rate": 6.521030322791001e-06,
36
+ "loss": 5.0531,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.048916499535293256,
41
+ "grad_norm": 29.350488662719727,
42
+ "learning_rate": 8.15128790348875e-06,
43
+ "loss": 4.9225,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.048916499535293256,
48
+ "eval_runtime": 181.5812,
49
+ "eval_samples_per_second": 112.578,
50
+ "eval_steps_per_second": 14.076,
51
+ "step": 500
52
+ },
53
+ {
54
+ "epoch": 0.0586997994423519,
55
+ "grad_norm": 33.02122497558594,
56
+ "learning_rate": 9.781545484186502e-06,
57
+ "loss": 4.8186,
58
+ "step": 600
59
+ },
60
+ {
61
+ "epoch": 0.06848309934941056,
62
+ "grad_norm": 42.41593933105469,
63
+ "learning_rate": 1.1411803064884251e-05,
64
+ "loss": 4.5769,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.07826639925646921,
69
+ "grad_norm": 40.29044723510742,
70
+ "learning_rate": 1.3042060645582003e-05,
71
+ "loss": 4.3963,
72
+ "step": 800
73
+ },
74
+ {
75
+ "epoch": 0.08804969916352785,
76
+ "grad_norm": 38.0811653137207,
77
+ "learning_rate": 1.4672318226279752e-05,
78
+ "loss": 4.3393,
79
+ "step": 900
80
+ },
81
+ {
82
+ "epoch": 0.09783299907058651,
83
+ "grad_norm": 36.08370590209961,
84
+ "learning_rate": 1.63025758069775e-05,
85
+ "loss": 4.2421,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.09783299907058651,
90
+ "eval_runtime": 181.886,
91
+ "eval_samples_per_second": 112.389,
92
+ "eval_steps_per_second": 14.053,
93
+ "step": 1000
94
+ },
95
+ {
96
+ "epoch": 0.10761629897764516,
97
+ "grad_norm": 37.253684997558594,
98
+ "learning_rate": 1.7932833387675256e-05,
99
+ "loss": 4.1156,
100
+ "step": 1100
101
+ },
102
+ {
103
+ "epoch": 0.1173995988847038,
104
+ "grad_norm": 33.003475189208984,
105
+ "learning_rate": 1.9563090968373004e-05,
106
+ "loss": 4.0112,
107
+ "step": 1200
108
+ },
109
+ {
110
+ "epoch": 0.12718289879176245,
111
+ "grad_norm": 30.727867126464844,
112
+ "learning_rate": 2.1193348549070755e-05,
113
+ "loss": 3.9969,
114
+ "step": 1300
115
+ },
116
+ {
117
+ "epoch": 0.13696619869882112,
118
+ "grad_norm": 37.471092224121094,
119
+ "learning_rate": 2.2823606129768503e-05,
120
+ "loss": 3.874,
121
+ "step": 1400
122
+ },
123
+ {
124
+ "epoch": 0.14674949860587977,
125
+ "grad_norm": 42.32167434692383,
126
+ "learning_rate": 2.4453863710466254e-05,
127
+ "loss": 3.8518,
128
+ "step": 1500
129
+ },
130
+ {
131
+ "epoch": 0.14674949860587977,
132
+ "eval_runtime": 181.9332,
133
+ "eval_samples_per_second": 112.36,
134
+ "eval_steps_per_second": 14.049,
135
+ "step": 1500
136
+ },
137
+ {
138
+ "epoch": 0.15653279851293841,
139
+ "grad_norm": 38.00124740600586,
140
+ "learning_rate": 2.6084121291164005e-05,
141
+ "loss": 3.918,
142
+ "step": 1600
143
+ },
144
+ {
145
+ "epoch": 0.16631609841999706,
146
+ "grad_norm": 44.637386322021484,
147
+ "learning_rate": 2.7714378871861756e-05,
148
+ "loss": 3.9134,
149
+ "step": 1700
150
+ },
151
+ {
152
+ "epoch": 0.1760993983270557,
153
+ "grad_norm": 49.578609466552734,
154
+ "learning_rate": 2.9344636452559504e-05,
155
+ "loss": 3.7507,
156
+ "step": 1800
157
+ },
158
+ {
159
+ "epoch": 0.18588269823411438,
160
+ "grad_norm": 36.65715789794922,
161
+ "learning_rate": 3.0974894033257255e-05,
162
+ "loss": 3.7551,
163
+ "step": 1900
164
+ },
165
+ {
166
+ "epoch": 0.19566599814117303,
167
+ "grad_norm": 36.873443603515625,
168
+ "learning_rate": 3.2605151613955e-05,
169
+ "loss": 3.6951,
170
+ "step": 2000
171
+ },
172
+ {
173
+ "epoch": 0.19566599814117303,
174
+ "eval_runtime": 181.8273,
175
+ "eval_samples_per_second": 112.425,
176
+ "eval_steps_per_second": 14.057,
177
+ "step": 2000
178
+ },
179
+ {
180
+ "epoch": 0.20544929804823167,
181
+ "grad_norm": 33.025413513183594,
182
+ "learning_rate": 3.423540919465276e-05,
183
+ "loss": 3.6603,
184
+ "step": 2100
185
+ },
186
+ {
187
+ "epoch": 0.21523259795529032,
188
+ "grad_norm": 30.105051040649414,
189
+ "learning_rate": 3.586566677535051e-05,
190
+ "loss": 3.525,
191
+ "step": 2200
192
+ },
193
+ {
194
+ "epoch": 0.22501589786234896,
195
+ "grad_norm": 34.5129280090332,
196
+ "learning_rate": 3.749592435604825e-05,
197
+ "loss": 3.6454,
198
+ "step": 2300
199
+ },
200
+ {
201
+ "epoch": 0.2347991977694076,
202
+ "grad_norm": 33.16934585571289,
203
+ "learning_rate": 3.912618193674601e-05,
204
+ "loss": 3.6356,
205
+ "step": 2400
206
+ },
207
+ {
208
+ "epoch": 0.24458249767646628,
209
+ "grad_norm": 33.5789794921875,
210
+ "learning_rate": 4.0756439517443756e-05,
211
+ "loss": 3.5605,
212
+ "step": 2500
213
+ },
214
+ {
215
+ "epoch": 0.24458249767646628,
216
+ "eval_runtime": 181.7254,
217
+ "eval_samples_per_second": 112.488,
218
+ "eval_steps_per_second": 14.065,
219
+ "step": 2500
220
+ },
221
+ {
222
+ "epoch": 0.2543657975835249,
223
+ "grad_norm": 34.30876159667969,
224
+ "learning_rate": 4.238669709814151e-05,
225
+ "loss": 3.5447,
226
+ "step": 2600
227
+ },
228
+ {
229
+ "epoch": 0.2641490974905836,
230
+ "grad_norm": 29.907989501953125,
231
+ "learning_rate": 4.401695467883926e-05,
232
+ "loss": 3.5116,
233
+ "step": 2700
234
+ },
235
+ {
236
+ "epoch": 0.27393239739764225,
237
+ "grad_norm": 34.08231735229492,
238
+ "learning_rate": 4.5647212259537006e-05,
239
+ "loss": 3.4941,
240
+ "step": 2800
241
+ },
242
+ {
243
+ "epoch": 0.28371569730470086,
244
+ "grad_norm": 25.034149169921875,
245
+ "learning_rate": 4.727746984023476e-05,
246
+ "loss": 3.4863,
247
+ "step": 2900
248
+ },
249
+ {
250
+ "epoch": 0.29349899721175954,
251
+ "grad_norm": 32.21685028076172,
252
+ "learning_rate": 4.890772742093251e-05,
253
+ "loss": 3.5096,
254
+ "step": 3000
255
+ },
256
+ {
257
+ "epoch": 0.29349899721175954,
258
+ "eval_runtime": 181.6612,
259
+ "eval_samples_per_second": 112.528,
260
+ "eval_steps_per_second": 14.07,
261
+ "step": 3000
262
+ },
263
+ {
264
+ "epoch": 0.30328229711881816,
265
+ "grad_norm": 24.290380477905273,
266
+ "learning_rate": 4.9940208725902305e-05,
267
+ "loss": 3.3867,
268
+ "step": 3100
269
+ },
270
+ {
271
+ "epoch": 0.31306559702587683,
272
+ "grad_norm": 22.924575805664062,
273
+ "learning_rate": 4.975902304681838e-05,
274
+ "loss": 3.398,
275
+ "step": 3200
276
+ },
277
+ {
278
+ "epoch": 0.3228488969329355,
279
+ "grad_norm": 19.540430068969727,
280
+ "learning_rate": 4.957783736773446e-05,
281
+ "loss": 3.3727,
282
+ "step": 3300
283
+ },
284
+ {
285
+ "epoch": 0.3326321968399941,
286
+ "grad_norm": 22.529376983642578,
287
+ "learning_rate": 4.939665168865053e-05,
288
+ "loss": 3.3364,
289
+ "step": 3400
290
+ },
291
+ {
292
+ "epoch": 0.3424154967470528,
293
+ "grad_norm": 20.821264266967773,
294
+ "learning_rate": 4.921546600956661e-05,
295
+ "loss": 3.3126,
296
+ "step": 3500
297
+ },
298
+ {
299
+ "epoch": 0.3424154967470528,
300
+ "eval_runtime": 181.7582,
301
+ "eval_samples_per_second": 112.468,
302
+ "eval_steps_per_second": 14.063,
303
+ "step": 3500
304
+ },
305
+ {
306
+ "epoch": 0.3521987966541114,
307
+ "grad_norm": 24.346153259277344,
308
+ "learning_rate": 4.903428033048268e-05,
309
+ "loss": 3.2678,
310
+ "step": 3600
311
+ },
312
+ {
313
+ "epoch": 0.3619820965611701,
314
+ "grad_norm": 19.89035415649414,
315
+ "learning_rate": 4.8853094651398754e-05,
316
+ "loss": 3.3233,
317
+ "step": 3700
318
+ },
319
+ {
320
+ "epoch": 0.37176539646822876,
321
+ "grad_norm": 17.938880920410156,
322
+ "learning_rate": 4.8671908972314825e-05,
323
+ "loss": 3.2822,
324
+ "step": 3800
325
+ },
326
+ {
327
+ "epoch": 0.3815486963752874,
328
+ "grad_norm": 16.92071533203125,
329
+ "learning_rate": 4.84907232932309e-05,
330
+ "loss": 3.2254,
331
+ "step": 3900
332
+ },
333
+ {
334
+ "epoch": 0.39133199628234605,
335
+ "grad_norm": 18.241249084472656,
336
+ "learning_rate": 4.830953761414698e-05,
337
+ "loss": 3.2116,
338
+ "step": 4000
339
+ },
340
+ {
341
+ "epoch": 0.39133199628234605,
342
+ "eval_runtime": 182.8906,
343
+ "eval_samples_per_second": 111.772,
344
+ "eval_steps_per_second": 13.976,
345
+ "step": 4000
346
+ },
347
+ {
348
+ "epoch": 0.40111529618940467,
349
+ "grad_norm": 17.56020736694336,
350
+ "learning_rate": 4.812835193506305e-05,
351
+ "loss": 3.2232,
352
+ "step": 4100
353
+ },
354
+ {
355
+ "epoch": 0.41089859609646334,
356
+ "grad_norm": 17.81117057800293,
357
+ "learning_rate": 4.794716625597913e-05,
358
+ "loss": 3.1936,
359
+ "step": 4200
360
+ },
361
+ {
362
+ "epoch": 0.420681896003522,
363
+ "grad_norm": 19.89581871032715,
364
+ "learning_rate": 4.77659805768952e-05,
365
+ "loss": 3.1443,
366
+ "step": 4300
367
+ },
368
+ {
369
+ "epoch": 0.43046519591058063,
370
+ "grad_norm": 22.968582153320312,
371
+ "learning_rate": 4.758479489781128e-05,
372
+ "loss": 3.2084,
373
+ "step": 4400
374
+ },
375
+ {
376
+ "epoch": 0.4402484958176393,
377
+ "grad_norm": 17.119598388671875,
378
+ "learning_rate": 4.740360921872735e-05,
379
+ "loss": 3.1263,
380
+ "step": 4500
381
+ },
382
+ {
383
+ "epoch": 0.4402484958176393,
384
+ "eval_runtime": 182.3246,
385
+ "eval_samples_per_second": 112.119,
386
+ "eval_steps_per_second": 14.019,
387
+ "step": 4500
388
+ },
389
+ {
390
+ "epoch": 0.4500317957246979,
391
+ "grad_norm": 19.294527053833008,
392
+ "learning_rate": 4.722242353964343e-05,
393
+ "loss": 3.1327,
394
+ "step": 4600
395
+ },
396
+ {
397
+ "epoch": 0.4598150956317566,
398
+ "grad_norm": 16.941057205200195,
399
+ "learning_rate": 4.704123786055951e-05,
400
+ "loss": 3.0944,
401
+ "step": 4700
402
+ },
403
+ {
404
+ "epoch": 0.4695983955388152,
405
+ "grad_norm": 22.43411636352539,
406
+ "learning_rate": 4.686005218147558e-05,
407
+ "loss": 3.1093,
408
+ "step": 4800
409
+ },
410
+ {
411
+ "epoch": 0.4793816954458739,
412
+ "grad_norm": 19.64097023010254,
413
+ "learning_rate": 4.667886650239166e-05,
414
+ "loss": 3.0597,
415
+ "step": 4900
416
+ },
417
+ {
418
+ "epoch": 0.48916499535293256,
419
+ "grad_norm": 19.343788146972656,
420
+ "learning_rate": 4.649768082330773e-05,
421
+ "loss": 3.1659,
422
+ "step": 5000
423
+ },
424
+ {
425
+ "epoch": 0.48916499535293256,
426
+ "eval_runtime": 181.8771,
427
+ "eval_samples_per_second": 112.395,
428
+ "eval_steps_per_second": 14.053,
429
+ "step": 5000
430
+ },
431
+ {
432
+ "epoch": 0.4989482952599912,
433
+ "grad_norm": 19.657760620117188,
434
+ "learning_rate": 4.63164951442238e-05,
435
+ "loss": 3.0506,
436
+ "step": 5100
437
+ },
438
+ {
439
+ "epoch": 0.5087315951670498,
440
+ "grad_norm": 16.2425537109375,
441
+ "learning_rate": 4.613530946513987e-05,
442
+ "loss": 3.0524,
443
+ "step": 5200
444
+ },
445
+ {
446
+ "epoch": 0.5185148950741085,
447
+ "grad_norm": 19.64779281616211,
448
+ "learning_rate": 4.595412378605595e-05,
449
+ "loss": 2.9995,
450
+ "step": 5300
451
+ },
452
+ {
453
+ "epoch": 0.5282981949811671,
454
+ "grad_norm": 17.29520606994629,
455
+ "learning_rate": 4.577293810697203e-05,
456
+ "loss": 3.0932,
457
+ "step": 5400
458
+ },
459
+ {
460
+ "epoch": 0.5380814948882258,
461
+ "grad_norm": 17.694602966308594,
462
+ "learning_rate": 4.55917524278881e-05,
463
+ "loss": 3.0309,
464
+ "step": 5500
465
+ },
466
+ {
467
+ "epoch": 0.5380814948882258,
468
+ "eval_runtime": 181.7231,
469
+ "eval_samples_per_second": 112.49,
470
+ "eval_steps_per_second": 14.065,
471
+ "step": 5500
472
+ },
473
+ {
474
+ "epoch": 0.5478647947952845,
475
+ "grad_norm": 21.030174255371094,
476
+ "learning_rate": 4.541056674880418e-05,
477
+ "loss": 3.0313,
478
+ "step": 5600
479
+ },
480
+ {
481
+ "epoch": 0.5576480947023431,
482
+ "grad_norm": 12.339129447937012,
483
+ "learning_rate": 4.522938106972025e-05,
484
+ "loss": 3.047,
485
+ "step": 5700
486
+ },
487
+ {
488
+ "epoch": 0.5674313946094017,
489
+ "grad_norm": 16.496389389038086,
490
+ "learning_rate": 4.504819539063633e-05,
491
+ "loss": 2.9961,
492
+ "step": 5800
493
+ },
494
+ {
495
+ "epoch": 0.5772146945164603,
496
+ "grad_norm": 15.456297874450684,
497
+ "learning_rate": 4.48670097115524e-05,
498
+ "loss": 2.9821,
499
+ "step": 5900
500
+ },
501
+ {
502
+ "epoch": 0.5869979944235191,
503
+ "grad_norm": 17.8603572845459,
504
+ "learning_rate": 4.468582403246848e-05,
505
+ "loss": 2.9294,
506
+ "step": 6000
507
+ },
508
+ {
509
+ "epoch": 0.5869979944235191,
510
+ "eval_runtime": 181.8258,
511
+ "eval_samples_per_second": 112.426,
512
+ "eval_steps_per_second": 14.057,
513
+ "step": 6000
514
+ },
515
+ {
516
+ "epoch": 0.5967812943305777,
517
+ "grad_norm": 18.85349464416504,
518
+ "learning_rate": 4.450463835338455e-05,
519
+ "loss": 2.9929,
520
+ "step": 6100
521
+ },
522
+ {
523
+ "epoch": 0.6065645942376363,
524
+ "grad_norm": 22.971813201904297,
525
+ "learning_rate": 4.432345267430063e-05,
526
+ "loss": 2.9684,
527
+ "step": 6200
528
+ },
529
+ {
530
+ "epoch": 0.616347894144695,
531
+ "grad_norm": 15.877230644226074,
532
+ "learning_rate": 4.4142266995216706e-05,
533
+ "loss": 2.9399,
534
+ "step": 6300
535
+ },
536
+ {
537
+ "epoch": 0.6261311940517537,
538
+ "grad_norm": 19.847482681274414,
539
+ "learning_rate": 4.396108131613278e-05,
540
+ "loss": 2.88,
541
+ "step": 6400
542
+ },
543
+ {
544
+ "epoch": 0.6359144939588123,
545
+ "grad_norm": 15.004170417785645,
546
+ "learning_rate": 4.377989563704885e-05,
547
+ "loss": 2.9719,
548
+ "step": 6500
549
+ },
550
+ {
551
+ "epoch": 0.6359144939588123,
552
+ "eval_runtime": 182.6045,
553
+ "eval_samples_per_second": 111.947,
554
+ "eval_steps_per_second": 13.997,
555
+ "step": 6500
556
+ },
557
+ {
558
+ "epoch": 0.645697793865871,
559
+ "grad_norm": 19.473665237426758,
560
+ "learning_rate": 4.359870995796492e-05,
561
+ "loss": 2.9246,
562
+ "step": 6600
563
+ },
564
+ {
565
+ "epoch": 0.6554810937729296,
566
+ "grad_norm": 18.071683883666992,
567
+ "learning_rate": 4.3417524278881e-05,
568
+ "loss": 2.9031,
569
+ "step": 6700
570
+ },
571
+ {
572
+ "epoch": 0.6652643936799882,
573
+ "grad_norm": 17.544504165649414,
574
+ "learning_rate": 4.323633859979707e-05,
575
+ "loss": 2.8313,
576
+ "step": 6800
577
+ },
578
+ {
579
+ "epoch": 0.6750476935870469,
580
+ "grad_norm": 18.936140060424805,
581
+ "learning_rate": 4.305515292071315e-05,
582
+ "loss": 2.8536,
583
+ "step": 6900
584
+ },
585
+ {
586
+ "epoch": 0.6848309934941056,
587
+ "grad_norm": 14.77696418762207,
588
+ "learning_rate": 4.2873967241629226e-05,
589
+ "loss": 2.9104,
590
+ "step": 7000
591
+ },
592
+ {
593
+ "epoch": 0.6848309934941056,
594
+ "eval_runtime": 181.938,
595
+ "eval_samples_per_second": 112.357,
596
+ "eval_steps_per_second": 14.049,
597
+ "step": 7000
598
+ },
599
+ {
600
+ "epoch": 0.6946142934011642,
601
+ "grad_norm": 14.303226470947266,
602
+ "learning_rate": 4.26927815625453e-05,
603
+ "loss": 2.8386,
604
+ "step": 7100
605
+ },
606
+ {
607
+ "epoch": 0.7043975933082228,
608
+ "grad_norm": 17.11782455444336,
609
+ "learning_rate": 4.2511595883461376e-05,
610
+ "loss": 2.9013,
611
+ "step": 7200
612
+ },
613
+ {
614
+ "epoch": 0.7141808932152816,
615
+ "grad_norm": 18.661100387573242,
616
+ "learning_rate": 4.233041020437745e-05,
617
+ "loss": 2.9428,
618
+ "step": 7300
619
+ },
620
+ {
621
+ "epoch": 0.7239641931223402,
622
+ "grad_norm": 15.535719871520996,
623
+ "learning_rate": 4.2149224525293525e-05,
624
+ "loss": 2.8582,
625
+ "step": 7400
626
+ },
627
+ {
628
+ "epoch": 0.7337474930293988,
629
+ "grad_norm": 15.3306303024292,
630
+ "learning_rate": 4.19680388462096e-05,
631
+ "loss": 2.8896,
632
+ "step": 7500
633
+ },
634
+ {
635
+ "epoch": 0.7337474930293988,
636
+ "eval_runtime": 181.8938,
637
+ "eval_samples_per_second": 112.384,
638
+ "eval_steps_per_second": 14.052,
639
+ "step": 7500
640
+ },
641
+ {
642
+ "epoch": 0.7435307929364575,
643
+ "grad_norm": 16.730344772338867,
644
+ "learning_rate": 4.1786853167125675e-05,
645
+ "loss": 2.9097,
646
+ "step": 7600
647
+ },
648
+ {
649
+ "epoch": 0.7533140928435161,
650
+ "grad_norm": 18.755483627319336,
651
+ "learning_rate": 4.1605667488041746e-05,
652
+ "loss": 2.8815,
653
+ "step": 7700
654
+ },
655
+ {
656
+ "epoch": 0.7630973927505748,
657
+ "grad_norm": 18.737581253051758,
658
+ "learning_rate": 4.1424481808957824e-05,
659
+ "loss": 2.9202,
660
+ "step": 7800
661
+ },
662
+ {
663
+ "epoch": 0.7728806926576334,
664
+ "grad_norm": 14.711681365966797,
665
+ "learning_rate": 4.1243296129873896e-05,
666
+ "loss": 2.806,
667
+ "step": 7900
668
+ },
669
+ {
670
+ "epoch": 0.7826639925646921,
671
+ "grad_norm": 17.5069580078125,
672
+ "learning_rate": 4.106211045078997e-05,
673
+ "loss": 2.8576,
674
+ "step": 8000
675
+ },
676
+ {
677
+ "epoch": 0.7826639925646921,
678
+ "eval_runtime": 181.9442,
679
+ "eval_samples_per_second": 112.353,
680
+ "eval_steps_per_second": 14.048,
681
+ "step": 8000
682
+ },
683
+ {
684
+ "epoch": 0.7924472924717507,
685
+ "grad_norm": 17.678852081298828,
686
+ "learning_rate": 4.0880924771706046e-05,
687
+ "loss": 2.8035,
688
+ "step": 8100
689
+ },
690
+ {
691
+ "epoch": 0.8022305923788093,
692
+ "grad_norm": 17.644638061523438,
693
+ "learning_rate": 4.069973909262212e-05,
694
+ "loss": 2.7958,
695
+ "step": 8200
696
+ },
697
+ {
698
+ "epoch": 0.8120138922858681,
699
+ "grad_norm": 18.377134323120117,
700
+ "learning_rate": 4.0518553413538195e-05,
701
+ "loss": 2.8055,
702
+ "step": 8300
703
+ },
704
+ {
705
+ "epoch": 0.8217971921929267,
706
+ "grad_norm": 18.026033401489258,
707
+ "learning_rate": 4.0337367734454273e-05,
708
+ "loss": 2.7334,
709
+ "step": 8400
710
+ },
711
+ {
712
+ "epoch": 0.8315804920999853,
713
+ "grad_norm": 14.77315616607666,
714
+ "learning_rate": 4.0156182055370345e-05,
715
+ "loss": 2.8082,
716
+ "step": 8500
717
+ },
718
+ {
719
+ "epoch": 0.8315804920999853,
720
+ "eval_runtime": 182.4176,
721
+ "eval_samples_per_second": 112.062,
722
+ "eval_steps_per_second": 14.012,
723
+ "step": 8500
724
+ },
725
+ {
726
+ "epoch": 0.841363792007044,
727
+ "grad_norm": 13.729479789733887,
728
+ "learning_rate": 3.997499637628642e-05,
729
+ "loss": 2.7939,
730
+ "step": 8600
731
+ },
732
+ {
733
+ "epoch": 0.8511470919141026,
734
+ "grad_norm": 16.34333610534668,
735
+ "learning_rate": 3.9793810697202494e-05,
736
+ "loss": 2.8517,
737
+ "step": 8700
738
+ },
739
+ {
740
+ "epoch": 0.8609303918211613,
741
+ "grad_norm": 22.484411239624023,
742
+ "learning_rate": 3.961262501811857e-05,
743
+ "loss": 2.776,
744
+ "step": 8800
745
+ },
746
+ {
747
+ "epoch": 0.8707136917282199,
748
+ "grad_norm": 15.922870635986328,
749
+ "learning_rate": 3.9431439339034644e-05,
750
+ "loss": 2.7909,
751
+ "step": 8900
752
+ },
753
+ {
754
+ "epoch": 0.8804969916352786,
755
+ "grad_norm": 15.06955623626709,
756
+ "learning_rate": 3.925025365995072e-05,
757
+ "loss": 2.8416,
758
+ "step": 9000
759
+ },
760
+ {
761
+ "epoch": 0.8804969916352786,
762
+ "eval_runtime": 181.9314,
763
+ "eval_samples_per_second": 112.361,
764
+ "eval_steps_per_second": 14.049,
765
+ "step": 9000
766
+ },
767
+ {
768
+ "epoch": 0.8902802915423372,
769
+ "grad_norm": 16.060428619384766,
770
+ "learning_rate": 3.9069067980866794e-05,
771
+ "loss": 2.7803,
772
+ "step": 9100
773
+ },
774
+ {
775
+ "epoch": 0.9000635914493959,
776
+ "grad_norm": 16.80124855041504,
777
+ "learning_rate": 3.888788230178287e-05,
778
+ "loss": 2.7548,
779
+ "step": 9200
780
+ },
781
+ {
782
+ "epoch": 0.9098468913564546,
783
+ "grad_norm": 16.608434677124023,
784
+ "learning_rate": 3.870669662269894e-05,
785
+ "loss": 2.8606,
786
+ "step": 9300
787
+ },
788
+ {
789
+ "epoch": 0.9196301912635132,
790
+ "grad_norm": 14.83870792388916,
791
+ "learning_rate": 3.8525510943615015e-05,
792
+ "loss": 2.7833,
793
+ "step": 9400
794
+ },
795
+ {
796
+ "epoch": 0.9294134911705718,
797
+ "grad_norm": 25.778181076049805,
798
+ "learning_rate": 3.834432526453109e-05,
799
+ "loss": 2.7434,
800
+ "step": 9500
801
+ },
802
+ {
803
+ "epoch": 0.9294134911705718,
804
+ "eval_runtime": 181.99,
805
+ "eval_samples_per_second": 112.325,
806
+ "eval_steps_per_second": 14.045,
807
+ "step": 9500
808
+ },
809
+ {
810
+ "epoch": 0.9391967910776304,
811
+ "grad_norm": 17.374011993408203,
812
+ "learning_rate": 3.8163139585447164e-05,
813
+ "loss": 2.7258,
814
+ "step": 9600
815
+ },
816
+ {
817
+ "epoch": 0.9489800909846892,
818
+ "grad_norm": 17.551128387451172,
819
+ "learning_rate": 3.798195390636324e-05,
820
+ "loss": 2.824,
821
+ "step": 9700
822
+ },
823
+ {
824
+ "epoch": 0.9587633908917478,
825
+ "grad_norm": 14.35797119140625,
826
+ "learning_rate": 3.7800768227279314e-05,
827
+ "loss": 2.745,
828
+ "step": 9800
829
+ },
830
+ {
831
+ "epoch": 0.9685466907988064,
832
+ "grad_norm": 20.098552703857422,
833
+ "learning_rate": 3.761958254819539e-05,
834
+ "loss": 2.7025,
835
+ "step": 9900
836
+ },
837
+ {
838
+ "epoch": 0.9783299907058651,
839
+ "grad_norm": 16.218109130859375,
840
+ "learning_rate": 3.743839686911147e-05,
841
+ "loss": 2.8093,
842
+ "step": 10000
843
+ },
844
+ {
845
+ "epoch": 0.9783299907058651,
846
+ "eval_runtime": 181.8987,
847
+ "eval_samples_per_second": 112.381,
848
+ "eval_steps_per_second": 14.052,
849
+ "step": 10000
850
+ },
851
+ {
852
+ "epoch": 0.9881132906129237,
853
+ "grad_norm": 17.198423385620117,
854
+ "learning_rate": 3.725721119002754e-05,
855
+ "loss": 2.7124,
856
+ "step": 10100
857
+ },
858
+ {
859
+ "epoch": 0.9978965905199824,
860
+ "grad_norm": 18.021198272705078,
861
+ "learning_rate": 3.707602551094362e-05,
862
+ "loss": 2.6922,
863
+ "step": 10200
864
+ },
865
+ {
866
+ "epoch": 1.007679890427041,
867
+ "grad_norm": 15.27678108215332,
868
+ "learning_rate": 3.689483983185969e-05,
869
+ "loss": 2.6743,
870
+ "step": 10300
871
+ },
872
+ {
873
+ "epoch": 1.0174631903340996,
874
+ "grad_norm": 16.770511627197266,
875
+ "learning_rate": 3.671365415277577e-05,
876
+ "loss": 2.857,
877
+ "step": 10400
878
+ },
879
+ {
880
+ "epoch": 1.0272464902411584,
881
+ "grad_norm": 18.810932159423828,
882
+ "learning_rate": 3.653246847369184e-05,
883
+ "loss": 2.7269,
884
+ "step": 10500
885
+ },
886
+ {
887
+ "epoch": 1.0272464902411584,
888
+ "eval_runtime": 181.8537,
889
+ "eval_samples_per_second": 112.409,
890
+ "eval_steps_per_second": 14.055,
891
+ "step": 10500
892
+ },
893
+ {
894
+ "epoch": 1.037029790148217,
895
+ "grad_norm": 18.56201171875,
896
+ "learning_rate": 3.635128279460791e-05,
897
+ "loss": 2.7325,
898
+ "step": 10600
899
+ },
900
+ {
901
+ "epoch": 1.0468130900552757,
902
+ "grad_norm": 15.063011169433594,
903
+ "learning_rate": 3.617009711552399e-05,
904
+ "loss": 2.7827,
905
+ "step": 10700
906
+ },
907
+ {
908
+ "epoch": 1.0565963899623343,
909
+ "grad_norm": 15.339439392089844,
910
+ "learning_rate": 3.598891143644006e-05,
911
+ "loss": 2.7472,
912
+ "step": 10800
913
+ },
914
+ {
915
+ "epoch": 1.066379689869393,
916
+ "grad_norm": 17.466033935546875,
917
+ "learning_rate": 3.580772575735614e-05,
918
+ "loss": 2.7859,
919
+ "step": 10900
920
+ },
921
+ {
922
+ "epoch": 1.0761629897764515,
923
+ "grad_norm": 20.727872848510742,
924
+ "learning_rate": 3.562654007827221e-05,
925
+ "loss": 2.7278,
926
+ "step": 11000
927
+ },
928
+ {
929
+ "epoch": 1.0761629897764515,
930
+ "eval_runtime": 181.8566,
931
+ "eval_samples_per_second": 112.407,
932
+ "eval_steps_per_second": 14.055,
933
+ "step": 11000
934
+ },
935
+ {
936
+ "epoch": 1.0859462896835101,
937
+ "grad_norm": 16.02055549621582,
938
+ "learning_rate": 3.544535439918829e-05,
939
+ "loss": 2.6307,
940
+ "step": 11100
941
+ },
942
+ {
943
+ "epoch": 1.095729589590569,
944
+ "grad_norm": 20.069686889648438,
945
+ "learning_rate": 3.526416872010436e-05,
946
+ "loss": 2.711,
947
+ "step": 11200
948
+ },
949
+ {
950
+ "epoch": 1.1055128894976276,
951
+ "grad_norm": 14.833261489868164,
952
+ "learning_rate": 3.508298304102044e-05,
953
+ "loss": 2.6141,
954
+ "step": 11300
955
+ },
956
+ {
957
+ "epoch": 1.1152961894046862,
958
+ "grad_norm": 14.86436653137207,
959
+ "learning_rate": 3.490179736193652e-05,
960
+ "loss": 2.6816,
961
+ "step": 11400
962
+ },
963
+ {
964
+ "epoch": 1.1250794893117448,
965
+ "grad_norm": 17.955862045288086,
966
+ "learning_rate": 3.472061168285259e-05,
967
+ "loss": 2.6924,
968
+ "step": 11500
969
+ },
970
+ {
971
+ "epoch": 1.1250794893117448,
972
+ "eval_runtime": 181.8085,
973
+ "eval_samples_per_second": 112.437,
974
+ "eval_steps_per_second": 14.059,
975
+ "step": 11500
976
+ },
977
+ {
978
+ "epoch": 1.1348627892188035,
979
+ "grad_norm": 18.360109329223633,
980
+ "learning_rate": 3.453942600376867e-05,
981
+ "loss": 2.6181,
982
+ "step": 11600
983
+ },
984
+ {
985
+ "epoch": 1.144646089125862,
986
+ "grad_norm": 17.547542572021484,
987
+ "learning_rate": 3.435824032468474e-05,
988
+ "loss": 2.6394,
989
+ "step": 11700
990
+ },
991
+ {
992
+ "epoch": 1.154429389032921,
993
+ "grad_norm": 12.194833755493164,
994
+ "learning_rate": 3.417705464560082e-05,
995
+ "loss": 2.6684,
996
+ "step": 11800
997
+ },
998
+ {
999
+ "epoch": 1.1642126889399795,
1000
+ "grad_norm": 17.095104217529297,
1001
+ "learning_rate": 3.399586896651689e-05,
1002
+ "loss": 2.6129,
1003
+ "step": 11900
1004
+ },
1005
+ {
1006
+ "epoch": 1.1739959888470382,
1007
+ "grad_norm": 20.788406372070312,
1008
+ "learning_rate": 3.381468328743296e-05,
1009
+ "loss": 2.5663,
1010
+ "step": 12000
1011
+ },
1012
+ {
1013
+ "epoch": 1.1739959888470382,
1014
+ "eval_runtime": 181.8035,
1015
+ "eval_samples_per_second": 112.44,
1016
+ "eval_steps_per_second": 14.059,
1017
+ "step": 12000
1018
+ },
1019
+ {
1020
+ "epoch": 1.1837792887540968,
1021
+ "grad_norm": 14.261167526245117,
1022
+ "learning_rate": 3.363349760834904e-05,
1023
+ "loss": 2.6544,
1024
+ "step": 12100
1025
+ },
1026
+ {
1027
+ "epoch": 1.1935625886611554,
1028
+ "grad_norm": 24.68012046813965,
1029
+ "learning_rate": 3.345231192926511e-05,
1030
+ "loss": 2.6632,
1031
+ "step": 12200
1032
+ },
1033
+ {
1034
+ "epoch": 1.203345888568214,
1035
+ "grad_norm": 16.10886573791504,
1036
+ "learning_rate": 3.327112625018119e-05,
1037
+ "loss": 2.6366,
1038
+ "step": 12300
1039
+ },
1040
+ {
1041
+ "epoch": 1.2131291884752726,
1042
+ "grad_norm": 18.038848876953125,
1043
+ "learning_rate": 3.308994057109726e-05,
1044
+ "loss": 2.6563,
1045
+ "step": 12400
1046
+ },
1047
+ {
1048
+ "epoch": 1.2229124883823315,
1049
+ "grad_norm": 17.40920639038086,
1050
+ "learning_rate": 3.290875489201334e-05,
1051
+ "loss": 2.718,
1052
+ "step": 12500
1053
+ },
1054
+ {
1055
+ "epoch": 1.2229124883823315,
1056
+ "eval_runtime": 181.9491,
1057
+ "eval_samples_per_second": 112.35,
1058
+ "eval_steps_per_second": 14.048,
1059
+ "step": 12500
1060
+ },
1061
+ {
1062
+ "epoch": 1.23269578828939,
1063
+ "grad_norm": 15.097307205200195,
1064
+ "learning_rate": 3.272756921292941e-05,
1065
+ "loss": 2.7282,
1066
+ "step": 12600
1067
+ },
1068
+ {
1069
+ "epoch": 1.2424790881964487,
1070
+ "grad_norm": 17.63008689880371,
1071
+ "learning_rate": 3.254638353384549e-05,
1072
+ "loss": 2.7104,
1073
+ "step": 12700
1074
+ },
1075
+ {
1076
+ "epoch": 1.2522623881035073,
1077
+ "grad_norm": 16.161130905151367,
1078
+ "learning_rate": 3.236519785476156e-05,
1079
+ "loss": 2.6427,
1080
+ "step": 12800
1081
+ },
1082
+ {
1083
+ "epoch": 1.262045688010566,
1084
+ "grad_norm": 18.786882400512695,
1085
+ "learning_rate": 3.218401217567764e-05,
1086
+ "loss": 2.6105,
1087
+ "step": 12900
1088
+ },
1089
+ {
1090
+ "epoch": 1.2718289879176246,
1091
+ "grad_norm": 24.145421981811523,
1092
+ "learning_rate": 3.2002826496593715e-05,
1093
+ "loss": 2.6322,
1094
+ "step": 13000
1095
+ },
1096
+ {
1097
+ "epoch": 1.2718289879176246,
1098
+ "eval_runtime": 182.5613,
1099
+ "eval_samples_per_second": 111.973,
1100
+ "eval_steps_per_second": 14.001,
1101
+ "step": 13000
1102
+ },
1103
+ {
1104
+ "epoch": 1.2816122878246832,
1105
+ "grad_norm": 15.286133766174316,
1106
+ "learning_rate": 3.1821640817509786e-05,
1107
+ "loss": 2.6465,
1108
+ "step": 13100
1109
+ },
1110
+ {
1111
+ "epoch": 1.291395587731742,
1112
+ "grad_norm": 21.22935676574707,
1113
+ "learning_rate": 3.1640455138425865e-05,
1114
+ "loss": 2.6691,
1115
+ "step": 13200
1116
+ },
1117
+ {
1118
+ "epoch": 1.3011788876388006,
1119
+ "grad_norm": 18.064428329467773,
1120
+ "learning_rate": 3.1459269459341936e-05,
1121
+ "loss": 2.5904,
1122
+ "step": 13300
1123
+ },
1124
+ {
1125
+ "epoch": 1.3109621875458592,
1126
+ "grad_norm": 14.45976448059082,
1127
+ "learning_rate": 3.127808378025801e-05,
1128
+ "loss": 2.6602,
1129
+ "step": 13400
1130
+ },
1131
+ {
1132
+ "epoch": 1.3207454874529179,
1133
+ "grad_norm": 19.72386360168457,
1134
+ "learning_rate": 3.109689810117408e-05,
1135
+ "loss": 2.6337,
1136
+ "step": 13500
1137
+ },
1138
+ {
1139
+ "epoch": 1.3207454874529179,
1140
+ "eval_runtime": 182.4053,
1141
+ "eval_samples_per_second": 112.069,
1142
+ "eval_steps_per_second": 14.013,
1143
+ "step": 13500
1144
+ },
1145
+ {
1146
+ "epoch": 1.3305287873599765,
1147
+ "grad_norm": 17.639583587646484,
1148
+ "learning_rate": 3.091571242209016e-05,
1149
+ "loss": 2.6135,
1150
+ "step": 13600
1151
+ },
1152
+ {
1153
+ "epoch": 1.340312087267035,
1154
+ "grad_norm": 19.71700096130371,
1155
+ "learning_rate": 3.0734526743006235e-05,
1156
+ "loss": 2.6252,
1157
+ "step": 13700
1158
+ },
1159
+ {
1160
+ "epoch": 1.3500953871740937,
1161
+ "grad_norm": 16.715856552124023,
1162
+ "learning_rate": 3.055334106392231e-05,
1163
+ "loss": 2.6475,
1164
+ "step": 13800
1165
+ },
1166
+ {
1167
+ "epoch": 1.3598786870811526,
1168
+ "grad_norm": 12.645075798034668,
1169
+ "learning_rate": 3.0372155384838385e-05,
1170
+ "loss": 2.6199,
1171
+ "step": 13900
1172
+ },
1173
+ {
1174
+ "epoch": 1.3696619869882112,
1175
+ "grad_norm": 20.150625228881836,
1176
+ "learning_rate": 3.0190969705754456e-05,
1177
+ "loss": 2.5567,
1178
+ "step": 14000
1179
+ },
1180
+ {
1181
+ "epoch": 1.3696619869882112,
1182
+ "eval_runtime": 181.9086,
1183
+ "eval_samples_per_second": 112.375,
1184
+ "eval_steps_per_second": 14.051,
1185
+ "step": 14000
1186
+ },
1187
+ {
1188
+ "epoch": 1.3794452868952698,
1189
+ "grad_norm": 19.111286163330078,
1190
+ "learning_rate": 3.0009784026670535e-05,
1191
+ "loss": 2.59,
1192
+ "step": 14100
1193
+ },
1194
+ {
1195
+ "epoch": 1.3892285868023284,
1196
+ "grad_norm": 17.12226104736328,
1197
+ "learning_rate": 2.9828598347586606e-05,
1198
+ "loss": 2.5913,
1199
+ "step": 14200
1200
+ },
1201
+ {
1202
+ "epoch": 1.399011886709387,
1203
+ "grad_norm": 19.741445541381836,
1204
+ "learning_rate": 2.9647412668502684e-05,
1205
+ "loss": 2.5617,
1206
+ "step": 14300
1207
+ },
1208
+ {
1209
+ "epoch": 1.4087951866164456,
1210
+ "grad_norm": 17.605525970458984,
1211
+ "learning_rate": 2.946622698941876e-05,
1212
+ "loss": 2.6077,
1213
+ "step": 14400
1214
+ },
1215
+ {
1216
+ "epoch": 1.4185784865235043,
1217
+ "grad_norm": 17.433218002319336,
1218
+ "learning_rate": 2.928504131033483e-05,
1219
+ "loss": 2.5713,
1220
+ "step": 14500
1221
+ },
1222
+ {
1223
+ "epoch": 1.4185784865235043,
1224
+ "eval_runtime": 181.9305,
1225
+ "eval_samples_per_second": 112.362,
1226
+ "eval_steps_per_second": 14.049,
1227
+ "step": 14500
1228
+ },
1229
+ {
1230
+ "epoch": 1.428361786430563,
1231
+ "grad_norm": 15.442538261413574,
1232
+ "learning_rate": 2.910385563125091e-05,
1233
+ "loss": 2.6499,
1234
+ "step": 14600
1235
+ },
1236
+ {
1237
+ "epoch": 1.4381450863376217,
1238
+ "grad_norm": 15.078730583190918,
1239
+ "learning_rate": 2.892266995216698e-05,
1240
+ "loss": 2.6517,
1241
+ "step": 14700
1242
+ },
1243
+ {
1244
+ "epoch": 1.4479283862446803,
1245
+ "grad_norm": 23.07891273498535,
1246
+ "learning_rate": 2.874148427308306e-05,
1247
+ "loss": 2.594,
1248
+ "step": 14800
1249
+ },
1250
+ {
1251
+ "epoch": 1.457711686151739,
1252
+ "grad_norm": 16.707923889160156,
1253
+ "learning_rate": 2.856029859399913e-05,
1254
+ "loss": 2.6613,
1255
+ "step": 14900
1256
+ },
1257
+ {
1258
+ "epoch": 1.4674949860587976,
1259
+ "grad_norm": 16.731164932250977,
1260
+ "learning_rate": 2.8379112914915208e-05,
1261
+ "loss": 2.5927,
1262
+ "step": 15000
1263
+ },
1264
+ {
1265
+ "epoch": 1.4674949860587976,
1266
+ "eval_runtime": 181.9649,
1267
+ "eval_samples_per_second": 112.34,
1268
+ "eval_steps_per_second": 14.047,
1269
+ "step": 15000
1270
+ },
1271
+ {
1272
+ "epoch": 1.4772782859658564,
1273
+ "grad_norm": 16.020864486694336,
1274
+ "learning_rate": 2.819792723583128e-05,
1275
+ "loss": 2.6464,
1276
+ "step": 15100
1277
+ },
1278
+ {
1279
+ "epoch": 1.4870615858729148,
1280
+ "grad_norm": 16.674760818481445,
1281
+ "learning_rate": 2.8016741556747354e-05,
1282
+ "loss": 2.5853,
1283
+ "step": 15200
1284
+ },
1285
+ {
1286
+ "epoch": 1.4968448857799737,
1287
+ "grad_norm": 16.890748977661133,
1288
+ "learning_rate": 2.7835555877663432e-05,
1289
+ "loss": 2.5748,
1290
+ "step": 15300
1291
+ },
1292
+ {
1293
+ "epoch": 1.5066281856870323,
1294
+ "grad_norm": 20.217845916748047,
1295
+ "learning_rate": 2.7654370198579504e-05,
1296
+ "loss": 2.6204,
1297
+ "step": 15400
1298
+ },
1299
+ {
1300
+ "epoch": 1.516411485594091,
1301
+ "grad_norm": 20.459087371826172,
1302
+ "learning_rate": 2.7473184519495582e-05,
1303
+ "loss": 2.6103,
1304
+ "step": 15500
1305
+ },
1306
+ {
1307
+ "epoch": 1.516411485594091,
1308
+ "eval_runtime": 181.9454,
1309
+ "eval_samples_per_second": 112.352,
1310
+ "eval_steps_per_second": 14.048,
1311
+ "step": 15500
1312
+ },
1313
+ {
1314
+ "epoch": 1.5261947855011495,
1315
+ "grad_norm": 18.207612991333008,
1316
+ "learning_rate": 2.7291998840411654e-05,
1317
+ "loss": 2.5786,
1318
+ "step": 15600
1319
+ },
1320
+ {
1321
+ "epoch": 1.5359780854082081,
1322
+ "grad_norm": 18.084758758544922,
1323
+ "learning_rate": 2.7110813161327732e-05,
1324
+ "loss": 2.6535,
1325
+ "step": 15700
1326
+ },
1327
+ {
1328
+ "epoch": 1.545761385315267,
1329
+ "grad_norm": 15.03881549835205,
1330
+ "learning_rate": 2.6929627482243803e-05,
1331
+ "loss": 2.6061,
1332
+ "step": 15800
1333
+ },
1334
+ {
1335
+ "epoch": 1.5555446852223254,
1336
+ "grad_norm": 16.99995231628418,
1337
+ "learning_rate": 2.6748441803159878e-05,
1338
+ "loss": 2.6151,
1339
+ "step": 15900
1340
+ },
1341
+ {
1342
+ "epoch": 1.5653279851293842,
1343
+ "grad_norm": 15.581089973449707,
1344
+ "learning_rate": 2.6567256124075956e-05,
1345
+ "loss": 2.6163,
1346
+ "step": 16000
1347
+ },
1348
+ {
1349
+ "epoch": 1.5653279851293842,
1350
+ "eval_runtime": 181.8152,
1351
+ "eval_samples_per_second": 112.433,
1352
+ "eval_steps_per_second": 14.058,
1353
+ "step": 16000
1354
+ },
1355
+ {
1356
+ "epoch": 1.5751112850364428,
1357
+ "grad_norm": 21.4382266998291,
1358
+ "learning_rate": 2.6386070444992028e-05,
1359
+ "loss": 2.5975,
1360
+ "step": 16100
1361
+ },
1362
+ {
1363
+ "epoch": 1.5848945849435014,
1364
+ "grad_norm": 15.874536514282227,
1365
+ "learning_rate": 2.6204884765908106e-05,
1366
+ "loss": 2.5851,
1367
+ "step": 16200
1368
+ },
1369
+ {
1370
+ "epoch": 1.59467788485056,
1371
+ "grad_norm": 17.902137756347656,
1372
+ "learning_rate": 2.6023699086824177e-05,
1373
+ "loss": 2.6027,
1374
+ "step": 16300
1375
+ },
1376
+ {
1377
+ "epoch": 1.6044611847576187,
1378
+ "grad_norm": 17.04872703552246,
1379
+ "learning_rate": 2.5842513407740255e-05,
1380
+ "loss": 2.5854,
1381
+ "step": 16400
1382
+ },
1383
+ {
1384
+ "epoch": 1.6142444846646775,
1385
+ "grad_norm": 15.406013488769531,
1386
+ "learning_rate": 2.5661327728656327e-05,
1387
+ "loss": 2.5158,
1388
+ "step": 16500
1389
+ },
1390
+ {
1391
+ "epoch": 1.6142444846646775,
1392
+ "eval_runtime": 181.8647,
1393
+ "eval_samples_per_second": 112.402,
1394
+ "eval_steps_per_second": 14.054,
1395
+ "step": 16500
1396
+ },
1397
+ {
1398
+ "epoch": 1.624027784571736,
1399
+ "grad_norm": 19.62627601623535,
1400
+ "learning_rate": 2.5480142049572402e-05,
1401
+ "loss": 2.5378,
1402
+ "step": 16600
1403
+ },
1404
+ {
1405
+ "epoch": 1.6338110844787948,
1406
+ "grad_norm": 17.825178146362305,
1407
+ "learning_rate": 2.529895637048848e-05,
1408
+ "loss": 2.6162,
1409
+ "step": 16700
1410
+ },
1411
+ {
1412
+ "epoch": 1.6435943843858534,
1413
+ "grad_norm": 15.442023277282715,
1414
+ "learning_rate": 2.511777069140455e-05,
1415
+ "loss": 2.5802,
1416
+ "step": 16800
1417
+ },
1418
+ {
1419
+ "epoch": 1.653377684292912,
1420
+ "grad_norm": 18.695241928100586,
1421
+ "learning_rate": 2.4936585012320626e-05,
1422
+ "loss": 2.585,
1423
+ "step": 16900
1424
+ },
1425
+ {
1426
+ "epoch": 1.6631609841999706,
1427
+ "grad_norm": 18.992969512939453,
1428
+ "learning_rate": 2.4755399333236704e-05,
1429
+ "loss": 2.5448,
1430
+ "step": 17000
1431
+ },
1432
+ {
1433
+ "epoch": 1.6631609841999706,
1434
+ "eval_runtime": 181.91,
1435
+ "eval_samples_per_second": 112.374,
1436
+ "eval_steps_per_second": 14.051,
1437
+ "step": 17000
1438
+ },
1439
+ {
1440
+ "epoch": 1.6729442841070292,
1441
+ "grad_norm": 19.065349578857422,
1442
+ "learning_rate": 2.457421365415278e-05,
1443
+ "loss": 2.6565,
1444
+ "step": 17100
1445
+ },
1446
+ {
1447
+ "epoch": 1.682727584014088,
1448
+ "grad_norm": 20.110734939575195,
1449
+ "learning_rate": 2.439302797506885e-05,
1450
+ "loss": 2.5519,
1451
+ "step": 17200
1452
+ },
1453
+ {
1454
+ "epoch": 1.6925108839211465,
1455
+ "grad_norm": 15.886931419372559,
1456
+ "learning_rate": 2.4211842295984925e-05,
1457
+ "loss": 2.5589,
1458
+ "step": 17300
1459
+ },
1460
+ {
1461
+ "epoch": 1.7022941838282053,
1462
+ "grad_norm": 19.213207244873047,
1463
+ "learning_rate": 2.4030656616901e-05,
1464
+ "loss": 2.5714,
1465
+ "step": 17400
1466
+ },
1467
+ {
1468
+ "epoch": 1.712077483735264,
1469
+ "grad_norm": 17.117481231689453,
1470
+ "learning_rate": 2.3849470937817075e-05,
1471
+ "loss": 2.6682,
1472
+ "step": 17500
1473
+ },
1474
+ {
1475
+ "epoch": 1.712077483735264,
1476
+ "eval_runtime": 181.766,
1477
+ "eval_samples_per_second": 112.463,
1478
+ "eval_steps_per_second": 14.062,
1479
+ "step": 17500
1480
+ },
1481
+ {
1482
+ "epoch": 1.7218607836423225,
1483
+ "grad_norm": 17.19162940979004,
1484
+ "learning_rate": 2.366828525873315e-05,
1485
+ "loss": 2.5591,
1486
+ "step": 17600
1487
+ },
1488
+ {
1489
+ "epoch": 1.7316440835493812,
1490
+ "grad_norm": 15.454411506652832,
1491
+ "learning_rate": 2.3487099579649225e-05,
1492
+ "loss": 2.469,
1493
+ "step": 17700
1494
+ },
1495
+ {
1496
+ "epoch": 1.7414273834564398,
1497
+ "grad_norm": 15.227791786193848,
1498
+ "learning_rate": 2.3305913900565303e-05,
1499
+ "loss": 2.664,
1500
+ "step": 17800
1501
+ },
1502
+ {
1503
+ "epoch": 1.7512106833634986,
1504
+ "grad_norm": 18.5739688873291,
1505
+ "learning_rate": 2.3124728221481374e-05,
1506
+ "loss": 2.5991,
1507
+ "step": 17900
1508
+ },
1509
+ {
1510
+ "epoch": 1.760993983270557,
1511
+ "grad_norm": 12.589066505432129,
1512
+ "learning_rate": 2.294354254239745e-05,
1513
+ "loss": 2.6593,
1514
+ "step": 18000
1515
+ },
1516
+ {
1517
+ "epoch": 1.760993983270557,
1518
+ "eval_runtime": 181.9699,
1519
+ "eval_samples_per_second": 112.337,
1520
+ "eval_steps_per_second": 14.046,
1521
+ "step": 18000
1522
+ },
1523
+ {
1524
+ "epoch": 1.7707772831776158,
1525
+ "grad_norm": 20.695772171020508,
1526
+ "learning_rate": 2.2762356863313524e-05,
1527
+ "loss": 2.5555,
1528
+ "step": 18100
1529
+ },
1530
+ {
1531
+ "epoch": 1.7805605830846745,
1532
+ "grad_norm": 12.731703758239746,
1533
+ "learning_rate": 2.25811711842296e-05,
1534
+ "loss": 2.4617,
1535
+ "step": 18200
1536
+ },
1537
+ {
1538
+ "epoch": 1.790343882991733,
1539
+ "grad_norm": 18.506074905395508,
1540
+ "learning_rate": 2.2399985505145674e-05,
1541
+ "loss": 2.6061,
1542
+ "step": 18300
1543
+ },
1544
+ {
1545
+ "epoch": 1.800127182898792,
1546
+ "grad_norm": 14.8694486618042,
1547
+ "learning_rate": 2.221879982606175e-05,
1548
+ "loss": 2.5779,
1549
+ "step": 18400
1550
+ },
1551
+ {
1552
+ "epoch": 1.8099104828058503,
1553
+ "grad_norm": 22.47985076904297,
1554
+ "learning_rate": 2.2037614146977827e-05,
1555
+ "loss": 2.5012,
1556
+ "step": 18500
1557
+ },
1558
+ {
1559
+ "epoch": 1.8099104828058503,
1560
+ "eval_runtime": 182.3919,
1561
+ "eval_samples_per_second": 112.077,
1562
+ "eval_steps_per_second": 14.014,
1563
+ "step": 18500
1564
+ },
1565
+ {
1566
+ "epoch": 1.8196937827129092,
1567
+ "grad_norm": 25.74334144592285,
1568
+ "learning_rate": 2.1856428467893898e-05,
1569
+ "loss": 2.5265,
1570
+ "step": 18600
1571
+ },
1572
+ {
1573
+ "epoch": 1.8294770826199676,
1574
+ "grad_norm": 18.477630615234375,
1575
+ "learning_rate": 2.1675242788809973e-05,
1576
+ "loss": 2.5555,
1577
+ "step": 18700
1578
+ },
1579
+ {
1580
+ "epoch": 1.8392603825270264,
1581
+ "grad_norm": 14.832316398620605,
1582
+ "learning_rate": 2.1494057109726048e-05,
1583
+ "loss": 2.4609,
1584
+ "step": 18800
1585
+ },
1586
+ {
1587
+ "epoch": 1.849043682434085,
1588
+ "grad_norm": 17.025096893310547,
1589
+ "learning_rate": 2.1312871430642123e-05,
1590
+ "loss": 2.5119,
1591
+ "step": 18900
1592
+ },
1593
+ {
1594
+ "epoch": 1.8588269823411436,
1595
+ "grad_norm": 16.852436065673828,
1596
+ "learning_rate": 2.1131685751558197e-05,
1597
+ "loss": 2.5369,
1598
+ "step": 19000
1599
+ },
1600
+ {
1601
+ "epoch": 1.8588269823411436,
1602
+ "eval_runtime": 181.7443,
1603
+ "eval_samples_per_second": 112.477,
1604
+ "eval_steps_per_second": 14.064,
1605
+ "step": 19000
1606
+ },
1607
+ {
1608
+ "epoch": 1.8686102822482025,
1609
+ "grad_norm": 15.160259246826172,
1610
+ "learning_rate": 2.0950500072474272e-05,
1611
+ "loss": 2.6297,
1612
+ "step": 19100
1613
+ },
1614
+ {
1615
+ "epoch": 1.8783935821552609,
1616
+ "grad_norm": 15.909671783447266,
1617
+ "learning_rate": 2.0769314393390347e-05,
1618
+ "loss": 2.4696,
1619
+ "step": 19200
1620
+ },
1621
+ {
1622
+ "epoch": 1.8881768820623197,
1623
+ "grad_norm": 14.201844215393066,
1624
+ "learning_rate": 2.0588128714306422e-05,
1625
+ "loss": 2.5653,
1626
+ "step": 19300
1627
+ },
1628
+ {
1629
+ "epoch": 1.8979601819693783,
1630
+ "grad_norm": 16.351415634155273,
1631
+ "learning_rate": 2.0406943035222497e-05,
1632
+ "loss": 2.4962,
1633
+ "step": 19400
1634
+ },
1635
+ {
1636
+ "epoch": 1.907743481876437,
1637
+ "grad_norm": 16.943771362304688,
1638
+ "learning_rate": 2.022575735613857e-05,
1639
+ "loss": 2.5091,
1640
+ "step": 19500
1641
+ },
1642
+ {
1643
+ "epoch": 1.907743481876437,
1644
+ "eval_runtime": 181.6486,
1645
+ "eval_samples_per_second": 112.536,
1646
+ "eval_steps_per_second": 14.071,
1647
+ "step": 19500
1648
+ },
1649
+ {
1650
+ "epoch": 1.9175267817834956,
1651
+ "grad_norm": 15.006349563598633,
1652
+ "learning_rate": 2.0044571677054646e-05,
1653
+ "loss": 2.5214,
1654
+ "step": 19600
1655
+ },
1656
+ {
1657
+ "epoch": 1.9273100816905542,
1658
+ "grad_norm": 17.305580139160156,
1659
+ "learning_rate": 1.986338599797072e-05,
1660
+ "loss": 2.4989,
1661
+ "step": 19700
1662
+ },
1663
+ {
1664
+ "epoch": 1.937093381597613,
1665
+ "grad_norm": 17.28044891357422,
1666
+ "learning_rate": 1.9682200318886796e-05,
1667
+ "loss": 2.4008,
1668
+ "step": 19800
1669
+ },
1670
+ {
1671
+ "epoch": 1.9468766815046714,
1672
+ "grad_norm": 18.25079917907715,
1673
+ "learning_rate": 1.950101463980287e-05,
1674
+ "loss": 2.6015,
1675
+ "step": 19900
1676
+ },
1677
+ {
1678
+ "epoch": 1.9566599814117303,
1679
+ "grad_norm": 20.741668701171875,
1680
+ "learning_rate": 1.9319828960718946e-05,
1681
+ "loss": 2.4081,
1682
+ "step": 20000
1683
+ },
1684
+ {
1685
+ "epoch": 1.9566599814117303,
1686
+ "eval_runtime": 181.7745,
1687
+ "eval_samples_per_second": 112.458,
1688
+ "eval_steps_per_second": 14.061,
1689
+ "step": 20000
1690
+ },
1691
+ {
1692
+ "epoch": 1.9664432813187889,
1693
+ "grad_norm": 16.1226863861084,
1694
+ "learning_rate": 1.913864328163502e-05,
1695
+ "loss": 2.5418,
1696
+ "step": 20100
1697
+ },
1698
+ {
1699
+ "epoch": 1.9762265812258475,
1700
+ "grad_norm": 13.914982795715332,
1701
+ "learning_rate": 1.8957457602551095e-05,
1702
+ "loss": 2.5248,
1703
+ "step": 20200
1704
+ },
1705
+ {
1706
+ "epoch": 1.986009881132906,
1707
+ "grad_norm": 15.072690963745117,
1708
+ "learning_rate": 1.877627192346717e-05,
1709
+ "loss": 2.5488,
1710
+ "step": 20300
1711
+ },
1712
+ {
1713
+ "epoch": 1.9957931810399647,
1714
+ "grad_norm": 15.510763168334961,
1715
+ "learning_rate": 1.8595086244383245e-05,
1716
+ "loss": 2.4605,
1717
+ "step": 20400
1718
+ },
1719
+ {
1720
+ "epoch": 2.0055764809470236,
1721
+ "grad_norm": 18.463842391967773,
1722
+ "learning_rate": 1.841390056529932e-05,
1723
+ "loss": 2.522,
1724
+ "step": 20500
1725
+ },
1726
+ {
1727
+ "epoch": 2.0055764809470236,
1728
+ "eval_runtime": 182.07,
1729
+ "eval_samples_per_second": 112.276,
1730
+ "eval_steps_per_second": 14.039,
1731
+ "step": 20500
1732
+ },
1733
+ {
1734
+ "epoch": 2.015359780854082,
1735
+ "grad_norm": 16.670269012451172,
1736
+ "learning_rate": 1.8232714886215394e-05,
1737
+ "loss": 2.5585,
1738
+ "step": 20600
1739
+ },
1740
+ {
1741
+ "epoch": 2.025143080761141,
1742
+ "grad_norm": 20.60368537902832,
1743
+ "learning_rate": 1.805152920713147e-05,
1744
+ "loss": 2.5381,
1745
+ "step": 20700
1746
+ },
1747
+ {
1748
+ "epoch": 2.034926380668199,
1749
+ "grad_norm": 15.686981201171875,
1750
+ "learning_rate": 1.7870343528047544e-05,
1751
+ "loss": 2.5721,
1752
+ "step": 20800
1753
+ },
1754
+ {
1755
+ "epoch": 2.044709680575258,
1756
+ "grad_norm": 14.691718101501465,
1757
+ "learning_rate": 1.768915784896362e-05,
1758
+ "loss": 2.5187,
1759
+ "step": 20900
1760
+ },
1761
+ {
1762
+ "epoch": 2.054492980482317,
1763
+ "grad_norm": 16.31734848022461,
1764
+ "learning_rate": 1.7507972169879694e-05,
1765
+ "loss": 2.5202,
1766
+ "step": 21000
1767
+ },
1768
+ {
1769
+ "epoch": 2.054492980482317,
1770
+ "eval_runtime": 181.9896,
1771
+ "eval_samples_per_second": 112.325,
1772
+ "eval_steps_per_second": 14.045,
1773
+ "step": 21000
1774
+ },
1775
+ {
1776
+ "epoch": 2.0642762803893753,
1777
+ "grad_norm": 12.698554992675781,
1778
+ "learning_rate": 1.732678649079577e-05,
1779
+ "loss": 2.4228,
1780
+ "step": 21100
1781
+ },
1782
+ {
1783
+ "epoch": 2.074059580296434,
1784
+ "grad_norm": 16.34201431274414,
1785
+ "learning_rate": 1.7145600811711843e-05,
1786
+ "loss": 2.3963,
1787
+ "step": 21200
1788
+ },
1789
+ {
1790
+ "epoch": 2.0838428802034925,
1791
+ "grad_norm": 16.52840232849121,
1792
+ "learning_rate": 1.6964415132627918e-05,
1793
+ "loss": 2.4759,
1794
+ "step": 21300
1795
+ },
1796
+ {
1797
+ "epoch": 2.0936261801105513,
1798
+ "grad_norm": 14.856452941894531,
1799
+ "learning_rate": 1.6783229453543993e-05,
1800
+ "loss": 2.4675,
1801
+ "step": 21400
1802
+ },
1803
+ {
1804
+ "epoch": 2.1034094800176097,
1805
+ "grad_norm": 19.68895721435547,
1806
+ "learning_rate": 1.6602043774460068e-05,
1807
+ "loss": 2.5324,
1808
+ "step": 21500
1809
+ },
1810
+ {
1811
+ "epoch": 2.1034094800176097,
1812
+ "eval_runtime": 182.1877,
1813
+ "eval_samples_per_second": 112.203,
1814
+ "eval_steps_per_second": 14.029,
1815
+ "step": 21500
1816
+ },
1817
+ {
1818
+ "epoch": 2.1131927799246686,
1819
+ "grad_norm": 23.248056411743164,
1820
+ "learning_rate": 1.6420858095376143e-05,
1821
+ "loss": 2.5231,
1822
+ "step": 21600
1823
+ },
1824
+ {
1825
+ "epoch": 2.1229760798317274,
1826
+ "grad_norm": 25.471004486083984,
1827
+ "learning_rate": 1.6239672416292217e-05,
1828
+ "loss": 2.5871,
1829
+ "step": 21700
1830
+ },
1831
+ {
1832
+ "epoch": 2.132759379738786,
1833
+ "grad_norm": 17.794851303100586,
1834
+ "learning_rate": 1.6058486737208292e-05,
1835
+ "loss": 2.5008,
1836
+ "step": 21800
1837
+ },
1838
+ {
1839
+ "epoch": 2.1425426796458447,
1840
+ "grad_norm": 15.450346946716309,
1841
+ "learning_rate": 1.5877301058124367e-05,
1842
+ "loss": 2.4194,
1843
+ "step": 21900
1844
+ },
1845
+ {
1846
+ "epoch": 2.152325979552903,
1847
+ "grad_norm": 13.243645668029785,
1848
+ "learning_rate": 1.5696115379040442e-05,
1849
+ "loss": 2.5018,
1850
+ "step": 22000
1851
+ },
1852
+ {
1853
+ "epoch": 2.152325979552903,
1854
+ "eval_runtime": 181.9841,
1855
+ "eval_samples_per_second": 112.328,
1856
+ "eval_steps_per_second": 14.045,
1857
+ "step": 22000
1858
+ },
1859
+ {
1860
+ "epoch": 2.162109279459962,
1861
+ "grad_norm": 16.996198654174805,
1862
+ "learning_rate": 1.5514929699956517e-05,
1863
+ "loss": 2.4492,
1864
+ "step": 22100
1865
+ },
1866
+ {
1867
+ "epoch": 2.1718925793670203,
1868
+ "grad_norm": 20.05558967590332,
1869
+ "learning_rate": 1.5333744020872588e-05,
1870
+ "loss": 2.489,
1871
+ "step": 22200
1872
+ },
1873
+ {
1874
+ "epoch": 2.181675879274079,
1875
+ "grad_norm": 15.66326904296875,
1876
+ "learning_rate": 1.5152558341788666e-05,
1877
+ "loss": 2.5089,
1878
+ "step": 22300
1879
+ },
1880
+ {
1881
+ "epoch": 2.191459179181138,
1882
+ "grad_norm": 17.83564567565918,
1883
+ "learning_rate": 1.4971372662704741e-05,
1884
+ "loss": 2.4945,
1885
+ "step": 22400
1886
+ },
1887
+ {
1888
+ "epoch": 2.2012424790881964,
1889
+ "grad_norm": 21.466899871826172,
1890
+ "learning_rate": 1.4790186983620816e-05,
1891
+ "loss": 2.5467,
1892
+ "step": 22500
1893
+ },
1894
+ {
1895
+ "epoch": 2.2012424790881964,
1896
+ "eval_runtime": 182.8328,
1897
+ "eval_samples_per_second": 111.807,
1898
+ "eval_steps_per_second": 13.98,
1899
+ "step": 22500
1900
+ },
1901
+ {
1902
+ "epoch": 2.211025778995255,
1903
+ "grad_norm": 17.91064453125,
1904
+ "learning_rate": 1.4609001304536891e-05,
1905
+ "loss": 2.5144,
1906
+ "step": 22600
1907
+ },
1908
+ {
1909
+ "epoch": 2.2208090789023136,
1910
+ "grad_norm": 17.678396224975586,
1911
+ "learning_rate": 1.4427815625452964e-05,
1912
+ "loss": 2.5018,
1913
+ "step": 22700
1914
+ },
1915
+ {
1916
+ "epoch": 2.2305923788093724,
1917
+ "grad_norm": 17.510461807250977,
1918
+ "learning_rate": 1.4246629946369039e-05,
1919
+ "loss": 2.4228,
1920
+ "step": 22800
1921
+ },
1922
+ {
1923
+ "epoch": 2.240375678716431,
1924
+ "grad_norm": 24.923967361450195,
1925
+ "learning_rate": 1.4065444267285114e-05,
1926
+ "loss": 2.5249,
1927
+ "step": 22900
1928
+ },
1929
+ {
1930
+ "epoch": 2.2501589786234897,
1931
+ "grad_norm": 17.82384490966797,
1932
+ "learning_rate": 1.388425858820119e-05,
1933
+ "loss": 2.4282,
1934
+ "step": 23000
1935
+ },
1936
+ {
1937
+ "epoch": 2.2501589786234897,
1938
+ "eval_runtime": 182.0459,
1939
+ "eval_samples_per_second": 112.29,
1940
+ "eval_steps_per_second": 14.04,
1941
+ "step": 23000
1942
+ },
1943
+ {
1944
+ "epoch": 2.2599422785305485,
1945
+ "grad_norm": 16.13028335571289,
1946
+ "learning_rate": 1.3703072909117265e-05,
1947
+ "loss": 2.4472,
1948
+ "step": 23100
1949
+ },
1950
+ {
1951
+ "epoch": 2.269725578437607,
1952
+ "grad_norm": 15.137242317199707,
1953
+ "learning_rate": 1.352188723003334e-05,
1954
+ "loss": 2.5985,
1955
+ "step": 23200
1956
+ },
1957
+ {
1958
+ "epoch": 2.2795088783446658,
1959
+ "grad_norm": 16.187530517578125,
1960
+ "learning_rate": 1.3340701550949415e-05,
1961
+ "loss": 2.4862,
1962
+ "step": 23300
1963
+ },
1964
+ {
1965
+ "epoch": 2.289292178251724,
1966
+ "grad_norm": 18.84433937072754,
1967
+ "learning_rate": 1.3159515871865488e-05,
1968
+ "loss": 2.516,
1969
+ "step": 23400
1970
+ },
1971
+ {
1972
+ "epoch": 2.299075478158783,
1973
+ "grad_norm": 20.209121704101562,
1974
+ "learning_rate": 1.2978330192781563e-05,
1975
+ "loss": 2.5031,
1976
+ "step": 23500
1977
+ },
1978
+ {
1979
+ "epoch": 2.299075478158783,
1980
+ "eval_runtime": 181.9806,
1981
+ "eval_samples_per_second": 112.331,
1982
+ "eval_steps_per_second": 14.045,
1983
+ "step": 23500
1984
+ },
1985
+ {
1986
+ "epoch": 2.308858778065842,
1987
+ "grad_norm": 67.4502182006836,
1988
+ "learning_rate": 1.2797144513697637e-05,
1989
+ "loss": 2.4491,
1990
+ "step": 23600
1991
+ },
1992
+ {
1993
+ "epoch": 2.3186420779729002,
1994
+ "grad_norm": 14.940401077270508,
1995
+ "learning_rate": 1.2615958834613712e-05,
1996
+ "loss": 2.5669,
1997
+ "step": 23700
1998
+ },
1999
+ {
2000
+ "epoch": 2.328425377879959,
2001
+ "grad_norm": 16.591793060302734,
2002
+ "learning_rate": 1.2434773155529787e-05,
2003
+ "loss": 2.4565,
2004
+ "step": 23800
2005
+ },
2006
+ {
2007
+ "epoch": 2.3382086777870175,
2008
+ "grad_norm": 16.798791885375977,
2009
+ "learning_rate": 1.2253587476445862e-05,
2010
+ "loss": 2.4046,
2011
+ "step": 23900
2012
+ },
2013
+ {
2014
+ "epoch": 2.3479919776940763,
2015
+ "grad_norm": 17.712255477905273,
2016
+ "learning_rate": 1.2072401797361937e-05,
2017
+ "loss": 2.4453,
2018
+ "step": 24000
2019
+ },
2020
+ {
2021
+ "epoch": 2.3479919776940763,
2022
+ "eval_runtime": 182.0401,
2023
+ "eval_samples_per_second": 112.294,
2024
+ "eval_steps_per_second": 14.041,
2025
+ "step": 24000
2026
+ },
2027
+ {
2028
+ "epoch": 2.3577752776011347,
2029
+ "grad_norm": 18.64284324645996,
2030
+ "learning_rate": 1.1891216118278011e-05,
2031
+ "loss": 2.3973,
2032
+ "step": 24100
2033
+ },
2034
+ {
2035
+ "epoch": 2.3675585775081935,
2036
+ "grad_norm": 18.185895919799805,
2037
+ "learning_rate": 1.1710030439194086e-05,
2038
+ "loss": 2.5045,
2039
+ "step": 24200
2040
+ },
2041
+ {
2042
+ "epoch": 2.377341877415252,
2043
+ "grad_norm": 23.201522827148438,
2044
+ "learning_rate": 1.1528844760110163e-05,
2045
+ "loss": 2.5402,
2046
+ "step": 24300
2047
+ },
2048
+ {
2049
+ "epoch": 2.3871251773223108,
2050
+ "grad_norm": 21.606412887573242,
2051
+ "learning_rate": 1.1347659081026236e-05,
2052
+ "loss": 2.4285,
2053
+ "step": 24400
2054
+ },
2055
+ {
2056
+ "epoch": 2.3969084772293696,
2057
+ "grad_norm": 16.318761825561523,
2058
+ "learning_rate": 1.116647340194231e-05,
2059
+ "loss": 2.5509,
2060
+ "step": 24500
2061
+ },
2062
+ {
2063
+ "epoch": 2.3969084772293696,
2064
+ "eval_runtime": 182.0431,
2065
+ "eval_samples_per_second": 112.292,
2066
+ "eval_steps_per_second": 14.041,
2067
+ "step": 24500
2068
+ },
2069
+ {
2070
+ "epoch": 2.406691777136428,
2071
+ "grad_norm": 17.779014587402344,
2072
+ "learning_rate": 1.0985287722858386e-05,
2073
+ "loss": 2.4245,
2074
+ "step": 24600
2075
+ },
2076
+ {
2077
+ "epoch": 2.416475077043487,
2078
+ "grad_norm": 18.44321060180664,
2079
+ "learning_rate": 1.080410204377446e-05,
2080
+ "loss": 2.5223,
2081
+ "step": 24700
2082
+ },
2083
+ {
2084
+ "epoch": 2.4262583769505452,
2085
+ "grad_norm": 24.017047882080078,
2086
+ "learning_rate": 1.0622916364690535e-05,
2087
+ "loss": 2.4846,
2088
+ "step": 24800
2089
+ },
2090
+ {
2091
+ "epoch": 2.436041676857604,
2092
+ "grad_norm": 14.89560604095459,
2093
+ "learning_rate": 1.044173068560661e-05,
2094
+ "loss": 2.5922,
2095
+ "step": 24900
2096
+ },
2097
+ {
2098
+ "epoch": 2.445824976764663,
2099
+ "grad_norm": 15.532561302185059,
2100
+ "learning_rate": 1.0260545006522685e-05,
2101
+ "loss": 2.3976,
2102
+ "step": 25000
2103
+ },
2104
+ {
2105
+ "epoch": 2.445824976764663,
2106
+ "eval_runtime": 182.1033,
2107
+ "eval_samples_per_second": 112.255,
2108
+ "eval_steps_per_second": 14.036,
2109
+ "step": 25000
2110
+ },
2111
+ {
2112
+ "epoch": 2.4556082766717213,
2113
+ "grad_norm": 18.041282653808594,
2114
+ "learning_rate": 1.007935932743876e-05,
2115
+ "loss": 2.4731,
2116
+ "step": 25100
2117
+ },
2118
+ {
2119
+ "epoch": 2.46539157657878,
2120
+ "grad_norm": 13.40858268737793,
2121
+ "learning_rate": 9.898173648354834e-06,
2122
+ "loss": 2.4838,
2123
+ "step": 25200
2124
+ },
2125
+ {
2126
+ "epoch": 2.4751748764858386,
2127
+ "grad_norm": 17.450841903686523,
2128
+ "learning_rate": 9.71698796927091e-06,
2129
+ "loss": 2.3999,
2130
+ "step": 25300
2131
+ },
2132
+ {
2133
+ "epoch": 2.4849581763928974,
2134
+ "grad_norm": 17.556467056274414,
2135
+ "learning_rate": 9.535802290186984e-06,
2136
+ "loss": 2.3867,
2137
+ "step": 25400
2138
+ },
2139
+ {
2140
+ "epoch": 2.494741476299956,
2141
+ "grad_norm": 18.578310012817383,
2142
+ "learning_rate": 9.354616611103059e-06,
2143
+ "loss": 2.4546,
2144
+ "step": 25500
2145
+ },
2146
+ {
2147
+ "epoch": 2.494741476299956,
2148
+ "eval_runtime": 182.0338,
2149
+ "eval_samples_per_second": 112.298,
2150
+ "eval_steps_per_second": 14.041,
2151
+ "step": 25500
2152
+ },
2153
+ {
2154
+ "epoch": 2.5045247762070146,
2155
+ "grad_norm": 14.936469078063965,
2156
+ "learning_rate": 9.173430932019134e-06,
2157
+ "loss": 2.5562,
2158
+ "step": 25600
2159
+ },
2160
+ {
2161
+ "epoch": 2.514308076114073,
2162
+ "grad_norm": 17.527040481567383,
2163
+ "learning_rate": 8.992245252935209e-06,
2164
+ "loss": 2.4008,
2165
+ "step": 25700
2166
+ },
2167
+ {
2168
+ "epoch": 2.524091376021132,
2169
+ "grad_norm": 12.91336727142334,
2170
+ "learning_rate": 8.811059573851283e-06,
2171
+ "loss": 2.4655,
2172
+ "step": 25800
2173
+ },
2174
+ {
2175
+ "epoch": 2.5338746759281907,
2176
+ "grad_norm": 15.168461799621582,
2177
+ "learning_rate": 8.629873894767358e-06,
2178
+ "loss": 2.4468,
2179
+ "step": 25900
2180
+ },
2181
+ {
2182
+ "epoch": 2.543657975835249,
2183
+ "grad_norm": 17.5390682220459,
2184
+ "learning_rate": 8.448688215683433e-06,
2185
+ "loss": 2.4836,
2186
+ "step": 26000
2187
+ },
2188
+ {
2189
+ "epoch": 2.543657975835249,
2190
+ "eval_runtime": 182.1148,
2191
+ "eval_samples_per_second": 112.248,
2192
+ "eval_steps_per_second": 14.035,
2193
+ "step": 26000
2194
+ },
2195
+ {
2196
+ "epoch": 2.553441275742308,
2197
+ "grad_norm": 15.126510620117188,
2198
+ "learning_rate": 8.267502536599508e-06,
2199
+ "loss": 2.387,
2200
+ "step": 26100
2201
+ },
2202
+ {
2203
+ "epoch": 2.5632245756493663,
2204
+ "grad_norm": 15.374293327331543,
2205
+ "learning_rate": 8.086316857515583e-06,
2206
+ "loss": 2.3652,
2207
+ "step": 26200
2208
+ },
2209
+ {
2210
+ "epoch": 2.573007875556425,
2211
+ "grad_norm": 15.498108863830566,
2212
+ "learning_rate": 7.905131178431657e-06,
2213
+ "loss": 2.4749,
2214
+ "step": 26300
2215
+ },
2216
+ {
2217
+ "epoch": 2.582791175463484,
2218
+ "grad_norm": 16.221315383911133,
2219
+ "learning_rate": 7.723945499347732e-06,
2220
+ "loss": 2.4567,
2221
+ "step": 26400
2222
+ },
2223
+ {
2224
+ "epoch": 2.5925744753705424,
2225
+ "grad_norm": 18.839122772216797,
2226
+ "learning_rate": 7.542759820263806e-06,
2227
+ "loss": 2.3554,
2228
+ "step": 26500
2229
+ },
2230
+ {
2231
+ "epoch": 2.5925744753705424,
2232
+ "eval_runtime": 181.9597,
2233
+ "eval_samples_per_second": 112.344,
2234
+ "eval_steps_per_second": 14.047,
2235
+ "step": 26500
2236
+ },
2237
+ {
2238
+ "epoch": 2.6023577752776013,
2239
+ "grad_norm": 22.626708984375,
2240
+ "learning_rate": 7.361574141179882e-06,
2241
+ "loss": 2.502,
2242
+ "step": 26600
2243
+ },
2244
+ {
2245
+ "epoch": 2.6121410751846597,
2246
+ "grad_norm": 16.519880294799805,
2247
+ "learning_rate": 7.180388462095957e-06,
2248
+ "loss": 2.5034,
2249
+ "step": 26700
2250
+ },
2251
+ {
2252
+ "epoch": 2.6219243750917185,
2253
+ "grad_norm": 27.421489715576172,
2254
+ "learning_rate": 6.999202783012031e-06,
2255
+ "loss": 2.5276,
2256
+ "step": 26800
2257
+ },
2258
+ {
2259
+ "epoch": 2.6317076749987773,
2260
+ "grad_norm": 15.274630546569824,
2261
+ "learning_rate": 6.8180171039281055e-06,
2262
+ "loss": 2.4121,
2263
+ "step": 26900
2264
+ },
2265
+ {
2266
+ "epoch": 2.6414909749058357,
2267
+ "grad_norm": 15.751582145690918,
2268
+ "learning_rate": 6.636831424844181e-06,
2269
+ "loss": 2.5799,
2270
+ "step": 27000
2271
+ },
2272
+ {
2273
+ "epoch": 2.6414909749058357,
2274
+ "eval_runtime": 182.0873,
2275
+ "eval_samples_per_second": 112.265,
2276
+ "eval_steps_per_second": 14.037,
2277
+ "step": 27000
2278
+ },
2279
+ {
2280
+ "epoch": 2.651274274812894,
2281
+ "grad_norm": 16.674850463867188,
2282
+ "learning_rate": 6.455645745760255e-06,
2283
+ "loss": 2.3872,
2284
+ "step": 27100
2285
+ },
2286
+ {
2287
+ "epoch": 2.661057574719953,
2288
+ "grad_norm": 12.62803840637207,
2289
+ "learning_rate": 6.27446006667633e-06,
2290
+ "loss": 2.4,
2291
+ "step": 27200
2292
+ },
2293
+ {
2294
+ "epoch": 2.670840874627012,
2295
+ "grad_norm": 18.055158615112305,
2296
+ "learning_rate": 6.093274387592405e-06,
2297
+ "loss": 2.4681,
2298
+ "step": 27300
2299
+ },
2300
+ {
2301
+ "epoch": 2.68062417453407,
2302
+ "grad_norm": 17.21278190612793,
2303
+ "learning_rate": 5.91208870850848e-06,
2304
+ "loss": 2.5441,
2305
+ "step": 27400
2306
+ },
2307
+ {
2308
+ "epoch": 2.690407474441129,
2309
+ "grad_norm": 20.945236206054688,
2310
+ "learning_rate": 5.7309030294245544e-06,
2311
+ "loss": 2.4388,
2312
+ "step": 27500
2313
+ },
2314
+ {
2315
+ "epoch": 2.690407474441129,
2316
+ "eval_runtime": 182.1279,
2317
+ "eval_samples_per_second": 112.24,
2318
+ "eval_steps_per_second": 14.034,
2319
+ "step": 27500
2320
+ },
2321
+ {
2322
+ "epoch": 2.7001907743481874,
2323
+ "grad_norm": 23.483661651611328,
2324
+ "learning_rate": 5.549717350340629e-06,
2325
+ "loss": 2.4589,
2326
+ "step": 27600
2327
+ },
2328
+ {
2329
+ "epoch": 2.7099740742552463,
2330
+ "grad_norm": 17.954036712646484,
2331
+ "learning_rate": 5.368531671256704e-06,
2332
+ "loss": 2.4477,
2333
+ "step": 27700
2334
+ },
2335
+ {
2336
+ "epoch": 2.719757374162305,
2337
+ "grad_norm": 16.187314987182617,
2338
+ "learning_rate": 5.187345992172779e-06,
2339
+ "loss": 2.4967,
2340
+ "step": 27800
2341
+ },
2342
+ {
2343
+ "epoch": 2.7295406740693635,
2344
+ "grad_norm": 14.324910163879395,
2345
+ "learning_rate": 5.006160313088854e-06,
2346
+ "loss": 2.3921,
2347
+ "step": 27900
2348
+ },
2349
+ {
2350
+ "epoch": 2.7393239739764224,
2351
+ "grad_norm": 20.81557846069336,
2352
+ "learning_rate": 4.8249746340049285e-06,
2353
+ "loss": 2.5201,
2354
+ "step": 28000
2355
+ },
2356
+ {
2357
+ "epoch": 2.7393239739764224,
2358
+ "eval_runtime": 182.146,
2359
+ "eval_samples_per_second": 112.229,
2360
+ "eval_steps_per_second": 14.033,
2361
+ "step": 28000
2362
+ },
2363
+ {
2364
+ "epoch": 2.7491072738834808,
2365
+ "grad_norm": 18.682844161987305,
2366
+ "learning_rate": 4.643788954921003e-06,
2367
+ "loss": 2.4325,
2368
+ "step": 28100
2369
+ },
2370
+ {
2371
+ "epoch": 2.7588905737905396,
2372
+ "grad_norm": 16.227272033691406,
2373
+ "learning_rate": 4.462603275837078e-06,
2374
+ "loss": 2.3864,
2375
+ "step": 28200
2376
+ },
2377
+ {
2378
+ "epoch": 2.7686738736975984,
2379
+ "grad_norm": 16.20302963256836,
2380
+ "learning_rate": 4.281417596753152e-06,
2381
+ "loss": 2.5296,
2382
+ "step": 28300
2383
+ },
2384
+ {
2385
+ "epoch": 2.778457173604657,
2386
+ "grad_norm": 18.634096145629883,
2387
+ "learning_rate": 4.100231917669228e-06,
2388
+ "loss": 2.4514,
2389
+ "step": 28400
2390
+ },
2391
+ {
2392
+ "epoch": 2.7882404735117157,
2393
+ "grad_norm": 13.040008544921875,
2394
+ "learning_rate": 3.919046238585303e-06,
2395
+ "loss": 2.3661,
2396
+ "step": 28500
2397
+ },
2398
+ {
2399
+ "epoch": 2.7882404735117157,
2400
+ "eval_runtime": 181.9164,
2401
+ "eval_samples_per_second": 112.37,
2402
+ "eval_steps_per_second": 14.05,
2403
+ "step": 28500
2404
+ },
2405
+ {
2406
+ "epoch": 2.798023773418774,
2407
+ "grad_norm": 14.142943382263184,
2408
+ "learning_rate": 3.737860559501377e-06,
2409
+ "loss": 2.5074,
2410
+ "step": 28600
2411
+ },
2412
+ {
2413
+ "epoch": 2.807807073325833,
2414
+ "grad_norm": 17.934324264526367,
2415
+ "learning_rate": 3.5566748804174523e-06,
2416
+ "loss": 2.4224,
2417
+ "step": 28700
2418
+ },
2419
+ {
2420
+ "epoch": 2.8175903732328913,
2421
+ "grad_norm": 14.450194358825684,
2422
+ "learning_rate": 3.3754892013335267e-06,
2423
+ "loss": 2.4949,
2424
+ "step": 28800
2425
+ },
2426
+ {
2427
+ "epoch": 2.82737367313995,
2428
+ "grad_norm": 17.746837615966797,
2429
+ "learning_rate": 3.194303522249602e-06,
2430
+ "loss": 2.4153,
2431
+ "step": 28900
2432
+ },
2433
+ {
2434
+ "epoch": 2.8371569730470085,
2435
+ "grad_norm": 13.962541580200195,
2436
+ "learning_rate": 3.0131178431656763e-06,
2437
+ "loss": 2.4804,
2438
+ "step": 29000
2439
+ },
2440
+ {
2441
+ "epoch": 2.8371569730470085,
2442
+ "eval_runtime": 182.0262,
2443
+ "eval_samples_per_second": 112.303,
2444
+ "eval_steps_per_second": 14.042,
2445
+ "step": 29000
2446
+ },
2447
+ {
2448
+ "epoch": 2.8469402729540674,
2449
+ "grad_norm": 16.669286727905273,
2450
+ "learning_rate": 2.831932164081751e-06,
2451
+ "loss": 2.5397,
2452
+ "step": 29100
2453
+ },
2454
+ {
2455
+ "epoch": 2.856723572861126,
2456
+ "grad_norm": 15.421733856201172,
2457
+ "learning_rate": 2.650746484997826e-06,
2458
+ "loss": 2.4175,
2459
+ "step": 29200
2460
+ },
2461
+ {
2462
+ "epoch": 2.8665068727681846,
2463
+ "grad_norm": 14.135702133178711,
2464
+ "learning_rate": 2.4695608059139007e-06,
2465
+ "loss": 2.5069,
2466
+ "step": 29300
2467
+ },
2468
+ {
2469
+ "epoch": 2.8762901726752435,
2470
+ "grad_norm": 17.41412925720215,
2471
+ "learning_rate": 2.2883751268299756e-06,
2472
+ "loss": 2.3997,
2473
+ "step": 29400
2474
+ },
2475
+ {
2476
+ "epoch": 2.886073472582302,
2477
+ "grad_norm": 14.824533462524414,
2478
+ "learning_rate": 2.1071894477460504e-06,
2479
+ "loss": 2.3945,
2480
+ "step": 29500
2481
+ },
2482
+ {
2483
+ "epoch": 2.886073472582302,
2484
+ "eval_runtime": 181.9299,
2485
+ "eval_samples_per_second": 112.362,
2486
+ "eval_steps_per_second": 14.049,
2487
+ "step": 29500
2488
+ },
2489
+ {
2490
+ "epoch": 2.8958567724893607,
2491
+ "grad_norm": 27.31865119934082,
2492
+ "learning_rate": 1.926003768662125e-06,
2493
+ "loss": 2.45,
2494
+ "step": 29600
2495
+ },
2496
+ {
2497
+ "epoch": 2.9056400723964195,
2498
+ "grad_norm": 18.966655731201172,
2499
+ "learning_rate": 1.7448180895781998e-06,
2500
+ "loss": 2.3916,
2501
+ "step": 29700
2502
+ },
2503
+ {
2504
+ "epoch": 2.915423372303478,
2505
+ "grad_norm": 18.538440704345703,
2506
+ "learning_rate": 1.5636324104942746e-06,
2507
+ "loss": 2.4625,
2508
+ "step": 29800
2509
+ },
2510
+ {
2511
+ "epoch": 2.9252066722105368,
2512
+ "grad_norm": 21.757272720336914,
2513
+ "learning_rate": 1.3824467314103494e-06,
2514
+ "loss": 2.3722,
2515
+ "step": 29900
2516
+ },
2517
+ {
2518
+ "epoch": 2.934989972117595,
2519
+ "grad_norm": 16.907358169555664,
2520
+ "learning_rate": 1.201261052326424e-06,
2521
+ "loss": 2.464,
2522
+ "step": 30000
2523
+ },
2524
+ {
2525
+ "epoch": 2.934989972117595,
2526
+ "eval_runtime": 181.9148,
2527
+ "eval_samples_per_second": 112.371,
2528
+ "eval_steps_per_second": 14.051,
2529
+ "step": 30000
2530
+ },
2531
+ {
2532
+ "epoch": 2.944773272024654,
2533
+ "grad_norm": 13.88399600982666,
2534
+ "learning_rate": 1.0200753732424989e-06,
2535
+ "loss": 2.5005,
2536
+ "step": 30100
2537
+ },
2538
+ {
2539
+ "epoch": 2.954556571931713,
2540
+ "grad_norm": 19.77507781982422,
2541
+ "learning_rate": 8.388896941585737e-07,
2542
+ "loss": 2.3829,
2543
+ "step": 30200
2544
+ },
2545
+ {
2546
+ "epoch": 2.9643398718387712,
2547
+ "grad_norm": 16.535932540893555,
2548
+ "learning_rate": 6.577040150746485e-07,
2549
+ "loss": 2.4788,
2550
+ "step": 30300
2551
+ },
2552
+ {
2553
+ "epoch": 2.9741231717458296,
2554
+ "grad_norm": 15.027000427246094,
2555
+ "learning_rate": 4.765183359907233e-07,
2556
+ "loss": 2.5007,
2557
+ "step": 30400
2558
+ },
2559
+ {
2560
+ "epoch": 2.9839064716528885,
2561
+ "grad_norm": 14.9392671585083,
2562
+ "learning_rate": 2.953326569067981e-07,
2563
+ "loss": 2.4847,
2564
+ "step": 30500
2565
+ },
2566
+ {
2567
+ "epoch": 2.9839064716528885,
2568
+ "eval_runtime": 181.9853,
2569
+ "eval_samples_per_second": 112.328,
2570
+ "eval_steps_per_second": 14.045,
2571
+ "step": 30500
2572
+ }
2573
+ ],
2574
+ "logging_steps": 100,
2575
+ "max_steps": 30663,
2576
+ "num_input_tokens_seen": 0,
2577
+ "num_train_epochs": 3,
2578
+ "save_steps": 500,
2579
+ "stateful_callbacks": {
2580
+ "TrainerControl": {
2581
+ "args": {
2582
+ "should_epoch_stop": false,
2583
+ "should_evaluate": false,
2584
+ "should_log": false,
2585
+ "should_save": true,
2586
+ "should_training_stop": false
2587
+ },
2588
+ "attributes": {}
2589
+ }
2590
+ },
2591
+ "total_flos": 1.0644527086729788e+16,
2592
+ "train_batch_size": 8,
2593
+ "trial_name": null,
2594
+ "trial_params": null
2595
+ }
muril_ch_domain/checkpoint-30500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c39b3134e7e5628432a425a5873f74f59d631bf591a12adab52f3ba906ae6906
3
+ size 5304
muril_ch_domain/checkpoint-30663/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/muril-base-cased
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.13.2
muril_ch_domain/checkpoint-30663/adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": {
4
+ "base_model_class": "BertForMaskedLM",
5
+ "parent_library": "transformers.models.bert.modeling_bert"
6
+ },
7
+ "base_model_name_or_path": "google/muril-base-cased",
8
+ "bias": "none",
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 128,
17
+ "lora_dropout": 0.1,
18
+ "megatron_config": null,
19
+ "megatron_core": "megatron.core",
20
+ "modules_to_save": null,
21
+ "peft_type": "LORA",
22
+ "r": 64,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "key",
27
+ "query",
28
+ "value",
29
+ "dense"
30
+ ],
31
+ "task_type": null,
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
muril_ch_domain/checkpoint-30663/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d84ccb91fb20f58aa9778952c060eea2273c69c0e40d164972562ff8c0ead9d
3
+ size 42881168
muril_ch_domain/checkpoint-30663/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ed5ec095ef64893bcddcfeca72b34c0cfd61506702bf2fcf85933f888ec0e1d
3
+ size 85843898
muril_ch_domain/checkpoint-30663/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c845563310a221ad0236804a9577e46a5928023e42425fd9e7226668ec75f0f
3
+ size 14244
muril_ch_domain/checkpoint-30663/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cbeb707e99493c97d2cffe0b096a573843cb8f847ed6fbc608d5da54abe9076
3
+ size 1064
muril_ch_domain/checkpoint-30663/trainer_state.json ADDED
@@ -0,0 +1,2602 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.999853250501394,
5
+ "eval_steps": 500,
6
+ "global_step": 30663,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.009783299907058651,
13
+ "grad_norm": 33.780609130859375,
14
+ "learning_rate": 1.6302575806977503e-06,
15
+ "loss": 6.574,
16
+ "step": 100
17
+ },
18
+ {
19
+ "epoch": 0.019566599814117302,
20
+ "grad_norm": 33.439022064208984,
21
+ "learning_rate": 3.2605151613955006e-06,
22
+ "loss": 6.1653,
23
+ "step": 200
24
+ },
25
+ {
26
+ "epoch": 0.02934989972117595,
27
+ "grad_norm": 27.25751304626465,
28
+ "learning_rate": 4.890772742093251e-06,
29
+ "loss": 5.5515,
30
+ "step": 300
31
+ },
32
+ {
33
+ "epoch": 0.039133199628234604,
34
+ "grad_norm": 38.36979675292969,
35
+ "learning_rate": 6.521030322791001e-06,
36
+ "loss": 5.0531,
37
+ "step": 400
38
+ },
39
+ {
40
+ "epoch": 0.048916499535293256,
41
+ "grad_norm": 29.350488662719727,
42
+ "learning_rate": 8.15128790348875e-06,
43
+ "loss": 4.9225,
44
+ "step": 500
45
+ },
46
+ {
47
+ "epoch": 0.048916499535293256,
48
+ "eval_runtime": 181.5812,
49
+ "eval_samples_per_second": 112.578,
50
+ "eval_steps_per_second": 14.076,
51
+ "step": 500
52
+ },
53
+ {
54
+ "epoch": 0.0586997994423519,
55
+ "grad_norm": 33.02122497558594,
56
+ "learning_rate": 9.781545484186502e-06,
57
+ "loss": 4.8186,
58
+ "step": 600
59
+ },
60
+ {
61
+ "epoch": 0.06848309934941056,
62
+ "grad_norm": 42.41593933105469,
63
+ "learning_rate": 1.1411803064884251e-05,
64
+ "loss": 4.5769,
65
+ "step": 700
66
+ },
67
+ {
68
+ "epoch": 0.07826639925646921,
69
+ "grad_norm": 40.29044723510742,
70
+ "learning_rate": 1.3042060645582003e-05,
71
+ "loss": 4.3963,
72
+ "step": 800
73
+ },
74
+ {
75
+ "epoch": 0.08804969916352785,
76
+ "grad_norm": 38.0811653137207,
77
+ "learning_rate": 1.4672318226279752e-05,
78
+ "loss": 4.3393,
79
+ "step": 900
80
+ },
81
+ {
82
+ "epoch": 0.09783299907058651,
83
+ "grad_norm": 36.08370590209961,
84
+ "learning_rate": 1.63025758069775e-05,
85
+ "loss": 4.2421,
86
+ "step": 1000
87
+ },
88
+ {
89
+ "epoch": 0.09783299907058651,
90
+ "eval_runtime": 181.886,
91
+ "eval_samples_per_second": 112.389,
92
+ "eval_steps_per_second": 14.053,
93
+ "step": 1000
94
+ },
95
+ {
96
+ "epoch": 0.10761629897764516,
97
+ "grad_norm": 37.253684997558594,
98
+ "learning_rate": 1.7932833387675256e-05,
99
+ "loss": 4.1156,
100
+ "step": 1100
101
+ },
102
+ {
103
+ "epoch": 0.1173995988847038,
104
+ "grad_norm": 33.003475189208984,
105
+ "learning_rate": 1.9563090968373004e-05,
106
+ "loss": 4.0112,
107
+ "step": 1200
108
+ },
109
+ {
110
+ "epoch": 0.12718289879176245,
111
+ "grad_norm": 30.727867126464844,
112
+ "learning_rate": 2.1193348549070755e-05,
113
+ "loss": 3.9969,
114
+ "step": 1300
115
+ },
116
+ {
117
+ "epoch": 0.13696619869882112,
118
+ "grad_norm": 37.471092224121094,
119
+ "learning_rate": 2.2823606129768503e-05,
120
+ "loss": 3.874,
121
+ "step": 1400
122
+ },
123
+ {
124
+ "epoch": 0.14674949860587977,
125
+ "grad_norm": 42.32167434692383,
126
+ "learning_rate": 2.4453863710466254e-05,
127
+ "loss": 3.8518,
128
+ "step": 1500
129
+ },
130
+ {
131
+ "epoch": 0.14674949860587977,
132
+ "eval_runtime": 181.9332,
133
+ "eval_samples_per_second": 112.36,
134
+ "eval_steps_per_second": 14.049,
135
+ "step": 1500
136
+ },
137
+ {
138
+ "epoch": 0.15653279851293841,
139
+ "grad_norm": 38.00124740600586,
140
+ "learning_rate": 2.6084121291164005e-05,
141
+ "loss": 3.918,
142
+ "step": 1600
143
+ },
144
+ {
145
+ "epoch": 0.16631609841999706,
146
+ "grad_norm": 44.637386322021484,
147
+ "learning_rate": 2.7714378871861756e-05,
148
+ "loss": 3.9134,
149
+ "step": 1700
150
+ },
151
+ {
152
+ "epoch": 0.1760993983270557,
153
+ "grad_norm": 49.578609466552734,
154
+ "learning_rate": 2.9344636452559504e-05,
155
+ "loss": 3.7507,
156
+ "step": 1800
157
+ },
158
+ {
159
+ "epoch": 0.18588269823411438,
160
+ "grad_norm": 36.65715789794922,
161
+ "learning_rate": 3.0974894033257255e-05,
162
+ "loss": 3.7551,
163
+ "step": 1900
164
+ },
165
+ {
166
+ "epoch": 0.19566599814117303,
167
+ "grad_norm": 36.873443603515625,
168
+ "learning_rate": 3.2605151613955e-05,
169
+ "loss": 3.6951,
170
+ "step": 2000
171
+ },
172
+ {
173
+ "epoch": 0.19566599814117303,
174
+ "eval_runtime": 181.8273,
175
+ "eval_samples_per_second": 112.425,
176
+ "eval_steps_per_second": 14.057,
177
+ "step": 2000
178
+ },
179
+ {
180
+ "epoch": 0.20544929804823167,
181
+ "grad_norm": 33.025413513183594,
182
+ "learning_rate": 3.423540919465276e-05,
183
+ "loss": 3.6603,
184
+ "step": 2100
185
+ },
186
+ {
187
+ "epoch": 0.21523259795529032,
188
+ "grad_norm": 30.105051040649414,
189
+ "learning_rate": 3.586566677535051e-05,
190
+ "loss": 3.525,
191
+ "step": 2200
192
+ },
193
+ {
194
+ "epoch": 0.22501589786234896,
195
+ "grad_norm": 34.5129280090332,
196
+ "learning_rate": 3.749592435604825e-05,
197
+ "loss": 3.6454,
198
+ "step": 2300
199
+ },
200
+ {
201
+ "epoch": 0.2347991977694076,
202
+ "grad_norm": 33.16934585571289,
203
+ "learning_rate": 3.912618193674601e-05,
204
+ "loss": 3.6356,
205
+ "step": 2400
206
+ },
207
+ {
208
+ "epoch": 0.24458249767646628,
209
+ "grad_norm": 33.5789794921875,
210
+ "learning_rate": 4.0756439517443756e-05,
211
+ "loss": 3.5605,
212
+ "step": 2500
213
+ },
214
+ {
215
+ "epoch": 0.24458249767646628,
216
+ "eval_runtime": 181.7254,
217
+ "eval_samples_per_second": 112.488,
218
+ "eval_steps_per_second": 14.065,
219
+ "step": 2500
220
+ },
221
+ {
222
+ "epoch": 0.2543657975835249,
223
+ "grad_norm": 34.30876159667969,
224
+ "learning_rate": 4.238669709814151e-05,
225
+ "loss": 3.5447,
226
+ "step": 2600
227
+ },
228
+ {
229
+ "epoch": 0.2641490974905836,
230
+ "grad_norm": 29.907989501953125,
231
+ "learning_rate": 4.401695467883926e-05,
232
+ "loss": 3.5116,
233
+ "step": 2700
234
+ },
235
+ {
236
+ "epoch": 0.27393239739764225,
237
+ "grad_norm": 34.08231735229492,
238
+ "learning_rate": 4.5647212259537006e-05,
239
+ "loss": 3.4941,
240
+ "step": 2800
241
+ },
242
+ {
243
+ "epoch": 0.28371569730470086,
244
+ "grad_norm": 25.034149169921875,
245
+ "learning_rate": 4.727746984023476e-05,
246
+ "loss": 3.4863,
247
+ "step": 2900
248
+ },
249
+ {
250
+ "epoch": 0.29349899721175954,
251
+ "grad_norm": 32.21685028076172,
252
+ "learning_rate": 4.890772742093251e-05,
253
+ "loss": 3.5096,
254
+ "step": 3000
255
+ },
256
+ {
257
+ "epoch": 0.29349899721175954,
258
+ "eval_runtime": 181.6612,
259
+ "eval_samples_per_second": 112.528,
260
+ "eval_steps_per_second": 14.07,
261
+ "step": 3000
262
+ },
263
+ {
264
+ "epoch": 0.30328229711881816,
265
+ "grad_norm": 24.290380477905273,
266
+ "learning_rate": 4.9940208725902305e-05,
267
+ "loss": 3.3867,
268
+ "step": 3100
269
+ },
270
+ {
271
+ "epoch": 0.31306559702587683,
272
+ "grad_norm": 22.924575805664062,
273
+ "learning_rate": 4.975902304681838e-05,
274
+ "loss": 3.398,
275
+ "step": 3200
276
+ },
277
+ {
278
+ "epoch": 0.3228488969329355,
279
+ "grad_norm": 19.540430068969727,
280
+ "learning_rate": 4.957783736773446e-05,
281
+ "loss": 3.3727,
282
+ "step": 3300
283
+ },
284
+ {
285
+ "epoch": 0.3326321968399941,
286
+ "grad_norm": 22.529376983642578,
287
+ "learning_rate": 4.939665168865053e-05,
288
+ "loss": 3.3364,
289
+ "step": 3400
290
+ },
291
+ {
292
+ "epoch": 0.3424154967470528,
293
+ "grad_norm": 20.821264266967773,
294
+ "learning_rate": 4.921546600956661e-05,
295
+ "loss": 3.3126,
296
+ "step": 3500
297
+ },
298
+ {
299
+ "epoch": 0.3424154967470528,
300
+ "eval_runtime": 181.7582,
301
+ "eval_samples_per_second": 112.468,
302
+ "eval_steps_per_second": 14.063,
303
+ "step": 3500
304
+ },
305
+ {
306
+ "epoch": 0.3521987966541114,
307
+ "grad_norm": 24.346153259277344,
308
+ "learning_rate": 4.903428033048268e-05,
309
+ "loss": 3.2678,
310
+ "step": 3600
311
+ },
312
+ {
313
+ "epoch": 0.3619820965611701,
314
+ "grad_norm": 19.89035415649414,
315
+ "learning_rate": 4.8853094651398754e-05,
316
+ "loss": 3.3233,
317
+ "step": 3700
318
+ },
319
+ {
320
+ "epoch": 0.37176539646822876,
321
+ "grad_norm": 17.938880920410156,
322
+ "learning_rate": 4.8671908972314825e-05,
323
+ "loss": 3.2822,
324
+ "step": 3800
325
+ },
326
+ {
327
+ "epoch": 0.3815486963752874,
328
+ "grad_norm": 16.92071533203125,
329
+ "learning_rate": 4.84907232932309e-05,
330
+ "loss": 3.2254,
331
+ "step": 3900
332
+ },
333
+ {
334
+ "epoch": 0.39133199628234605,
335
+ "grad_norm": 18.241249084472656,
336
+ "learning_rate": 4.830953761414698e-05,
337
+ "loss": 3.2116,
338
+ "step": 4000
339
+ },
340
+ {
341
+ "epoch": 0.39133199628234605,
342
+ "eval_runtime": 182.8906,
343
+ "eval_samples_per_second": 111.772,
344
+ "eval_steps_per_second": 13.976,
345
+ "step": 4000
346
+ },
347
+ {
348
+ "epoch": 0.40111529618940467,
349
+ "grad_norm": 17.56020736694336,
350
+ "learning_rate": 4.812835193506305e-05,
351
+ "loss": 3.2232,
352
+ "step": 4100
353
+ },
354
+ {
355
+ "epoch": 0.41089859609646334,
356
+ "grad_norm": 17.81117057800293,
357
+ "learning_rate": 4.794716625597913e-05,
358
+ "loss": 3.1936,
359
+ "step": 4200
360
+ },
361
+ {
362
+ "epoch": 0.420681896003522,
363
+ "grad_norm": 19.89581871032715,
364
+ "learning_rate": 4.77659805768952e-05,
365
+ "loss": 3.1443,
366
+ "step": 4300
367
+ },
368
+ {
369
+ "epoch": 0.43046519591058063,
370
+ "grad_norm": 22.968582153320312,
371
+ "learning_rate": 4.758479489781128e-05,
372
+ "loss": 3.2084,
373
+ "step": 4400
374
+ },
375
+ {
376
+ "epoch": 0.4402484958176393,
377
+ "grad_norm": 17.119598388671875,
378
+ "learning_rate": 4.740360921872735e-05,
379
+ "loss": 3.1263,
380
+ "step": 4500
381
+ },
382
+ {
383
+ "epoch": 0.4402484958176393,
384
+ "eval_runtime": 182.3246,
385
+ "eval_samples_per_second": 112.119,
386
+ "eval_steps_per_second": 14.019,
387
+ "step": 4500
388
+ },
389
+ {
390
+ "epoch": 0.4500317957246979,
391
+ "grad_norm": 19.294527053833008,
392
+ "learning_rate": 4.722242353964343e-05,
393
+ "loss": 3.1327,
394
+ "step": 4600
395
+ },
396
+ {
397
+ "epoch": 0.4598150956317566,
398
+ "grad_norm": 16.941057205200195,
399
+ "learning_rate": 4.704123786055951e-05,
400
+ "loss": 3.0944,
401
+ "step": 4700
402
+ },
403
+ {
404
+ "epoch": 0.4695983955388152,
405
+ "grad_norm": 22.43411636352539,
406
+ "learning_rate": 4.686005218147558e-05,
407
+ "loss": 3.1093,
408
+ "step": 4800
409
+ },
410
+ {
411
+ "epoch": 0.4793816954458739,
412
+ "grad_norm": 19.64097023010254,
413
+ "learning_rate": 4.667886650239166e-05,
414
+ "loss": 3.0597,
415
+ "step": 4900
416
+ },
417
+ {
418
+ "epoch": 0.48916499535293256,
419
+ "grad_norm": 19.343788146972656,
420
+ "learning_rate": 4.649768082330773e-05,
421
+ "loss": 3.1659,
422
+ "step": 5000
423
+ },
424
+ {
425
+ "epoch": 0.48916499535293256,
426
+ "eval_runtime": 181.8771,
427
+ "eval_samples_per_second": 112.395,
428
+ "eval_steps_per_second": 14.053,
429
+ "step": 5000
430
+ },
431
+ {
432
+ "epoch": 0.4989482952599912,
433
+ "grad_norm": 19.657760620117188,
434
+ "learning_rate": 4.63164951442238e-05,
435
+ "loss": 3.0506,
436
+ "step": 5100
437
+ },
438
+ {
439
+ "epoch": 0.5087315951670498,
440
+ "grad_norm": 16.2425537109375,
441
+ "learning_rate": 4.613530946513987e-05,
442
+ "loss": 3.0524,
443
+ "step": 5200
444
+ },
445
+ {
446
+ "epoch": 0.5185148950741085,
447
+ "grad_norm": 19.64779281616211,
448
+ "learning_rate": 4.595412378605595e-05,
449
+ "loss": 2.9995,
450
+ "step": 5300
451
+ },
452
+ {
453
+ "epoch": 0.5282981949811671,
454
+ "grad_norm": 17.29520606994629,
455
+ "learning_rate": 4.577293810697203e-05,
456
+ "loss": 3.0932,
457
+ "step": 5400
458
+ },
459
+ {
460
+ "epoch": 0.5380814948882258,
461
+ "grad_norm": 17.694602966308594,
462
+ "learning_rate": 4.55917524278881e-05,
463
+ "loss": 3.0309,
464
+ "step": 5500
465
+ },
466
+ {
467
+ "epoch": 0.5380814948882258,
468
+ "eval_runtime": 181.7231,
469
+ "eval_samples_per_second": 112.49,
470
+ "eval_steps_per_second": 14.065,
471
+ "step": 5500
472
+ },
473
+ {
474
+ "epoch": 0.5478647947952845,
475
+ "grad_norm": 21.030174255371094,
476
+ "learning_rate": 4.541056674880418e-05,
477
+ "loss": 3.0313,
478
+ "step": 5600
479
+ },
480
+ {
481
+ "epoch": 0.5576480947023431,
482
+ "grad_norm": 12.339129447937012,
483
+ "learning_rate": 4.522938106972025e-05,
484
+ "loss": 3.047,
485
+ "step": 5700
486
+ },
487
+ {
488
+ "epoch": 0.5674313946094017,
489
+ "grad_norm": 16.496389389038086,
490
+ "learning_rate": 4.504819539063633e-05,
491
+ "loss": 2.9961,
492
+ "step": 5800
493
+ },
494
+ {
495
+ "epoch": 0.5772146945164603,
496
+ "grad_norm": 15.456297874450684,
497
+ "learning_rate": 4.48670097115524e-05,
498
+ "loss": 2.9821,
499
+ "step": 5900
500
+ },
501
+ {
502
+ "epoch": 0.5869979944235191,
503
+ "grad_norm": 17.8603572845459,
504
+ "learning_rate": 4.468582403246848e-05,
505
+ "loss": 2.9294,
506
+ "step": 6000
507
+ },
508
+ {
509
+ "epoch": 0.5869979944235191,
510
+ "eval_runtime": 181.8258,
511
+ "eval_samples_per_second": 112.426,
512
+ "eval_steps_per_second": 14.057,
513
+ "step": 6000
514
+ },
515
+ {
516
+ "epoch": 0.5967812943305777,
517
+ "grad_norm": 18.85349464416504,
518
+ "learning_rate": 4.450463835338455e-05,
519
+ "loss": 2.9929,
520
+ "step": 6100
521
+ },
522
+ {
523
+ "epoch": 0.6065645942376363,
524
+ "grad_norm": 22.971813201904297,
525
+ "learning_rate": 4.432345267430063e-05,
526
+ "loss": 2.9684,
527
+ "step": 6200
528
+ },
529
+ {
530
+ "epoch": 0.616347894144695,
531
+ "grad_norm": 15.877230644226074,
532
+ "learning_rate": 4.4142266995216706e-05,
533
+ "loss": 2.9399,
534
+ "step": 6300
535
+ },
536
+ {
537
+ "epoch": 0.6261311940517537,
538
+ "grad_norm": 19.847482681274414,
539
+ "learning_rate": 4.396108131613278e-05,
540
+ "loss": 2.88,
541
+ "step": 6400
542
+ },
543
+ {
544
+ "epoch": 0.6359144939588123,
545
+ "grad_norm": 15.004170417785645,
546
+ "learning_rate": 4.377989563704885e-05,
547
+ "loss": 2.9719,
548
+ "step": 6500
549
+ },
550
+ {
551
+ "epoch": 0.6359144939588123,
552
+ "eval_runtime": 182.6045,
553
+ "eval_samples_per_second": 111.947,
554
+ "eval_steps_per_second": 13.997,
555
+ "step": 6500
556
+ },
557
+ {
558
+ "epoch": 0.645697793865871,
559
+ "grad_norm": 19.473665237426758,
560
+ "learning_rate": 4.359870995796492e-05,
561
+ "loss": 2.9246,
562
+ "step": 6600
563
+ },
564
+ {
565
+ "epoch": 0.6554810937729296,
566
+ "grad_norm": 18.071683883666992,
567
+ "learning_rate": 4.3417524278881e-05,
568
+ "loss": 2.9031,
569
+ "step": 6700
570
+ },
571
+ {
572
+ "epoch": 0.6652643936799882,
573
+ "grad_norm": 17.544504165649414,
574
+ "learning_rate": 4.323633859979707e-05,
575
+ "loss": 2.8313,
576
+ "step": 6800
577
+ },
578
+ {
579
+ "epoch": 0.6750476935870469,
580
+ "grad_norm": 18.936140060424805,
581
+ "learning_rate": 4.305515292071315e-05,
582
+ "loss": 2.8536,
583
+ "step": 6900
584
+ },
585
+ {
586
+ "epoch": 0.6848309934941056,
587
+ "grad_norm": 14.77696418762207,
588
+ "learning_rate": 4.2873967241629226e-05,
589
+ "loss": 2.9104,
590
+ "step": 7000
591
+ },
592
+ {
593
+ "epoch": 0.6848309934941056,
594
+ "eval_runtime": 181.938,
595
+ "eval_samples_per_second": 112.357,
596
+ "eval_steps_per_second": 14.049,
597
+ "step": 7000
598
+ },
599
+ {
600
+ "epoch": 0.6946142934011642,
601
+ "grad_norm": 14.303226470947266,
602
+ "learning_rate": 4.26927815625453e-05,
603
+ "loss": 2.8386,
604
+ "step": 7100
605
+ },
606
+ {
607
+ "epoch": 0.7043975933082228,
608
+ "grad_norm": 17.11782455444336,
609
+ "learning_rate": 4.2511595883461376e-05,
610
+ "loss": 2.9013,
611
+ "step": 7200
612
+ },
613
+ {
614
+ "epoch": 0.7141808932152816,
615
+ "grad_norm": 18.661100387573242,
616
+ "learning_rate": 4.233041020437745e-05,
617
+ "loss": 2.9428,
618
+ "step": 7300
619
+ },
620
+ {
621
+ "epoch": 0.7239641931223402,
622
+ "grad_norm": 15.535719871520996,
623
+ "learning_rate": 4.2149224525293525e-05,
624
+ "loss": 2.8582,
625
+ "step": 7400
626
+ },
627
+ {
628
+ "epoch": 0.7337474930293988,
629
+ "grad_norm": 15.3306303024292,
630
+ "learning_rate": 4.19680388462096e-05,
631
+ "loss": 2.8896,
632
+ "step": 7500
633
+ },
634
+ {
635
+ "epoch": 0.7337474930293988,
636
+ "eval_runtime": 181.8938,
637
+ "eval_samples_per_second": 112.384,
638
+ "eval_steps_per_second": 14.052,
639
+ "step": 7500
640
+ },
641
+ {
642
+ "epoch": 0.7435307929364575,
643
+ "grad_norm": 16.730344772338867,
644
+ "learning_rate": 4.1786853167125675e-05,
645
+ "loss": 2.9097,
646
+ "step": 7600
647
+ },
648
+ {
649
+ "epoch": 0.7533140928435161,
650
+ "grad_norm": 18.755483627319336,
651
+ "learning_rate": 4.1605667488041746e-05,
652
+ "loss": 2.8815,
653
+ "step": 7700
654
+ },
655
+ {
656
+ "epoch": 0.7630973927505748,
657
+ "grad_norm": 18.737581253051758,
658
+ "learning_rate": 4.1424481808957824e-05,
659
+ "loss": 2.9202,
660
+ "step": 7800
661
+ },
662
+ {
663
+ "epoch": 0.7728806926576334,
664
+ "grad_norm": 14.711681365966797,
665
+ "learning_rate": 4.1243296129873896e-05,
666
+ "loss": 2.806,
667
+ "step": 7900
668
+ },
669
+ {
670
+ "epoch": 0.7826639925646921,
671
+ "grad_norm": 17.5069580078125,
672
+ "learning_rate": 4.106211045078997e-05,
673
+ "loss": 2.8576,
674
+ "step": 8000
675
+ },
676
+ {
677
+ "epoch": 0.7826639925646921,
678
+ "eval_runtime": 181.9442,
679
+ "eval_samples_per_second": 112.353,
680
+ "eval_steps_per_second": 14.048,
681
+ "step": 8000
682
+ },
683
+ {
684
+ "epoch": 0.7924472924717507,
685
+ "grad_norm": 17.678852081298828,
686
+ "learning_rate": 4.0880924771706046e-05,
687
+ "loss": 2.8035,
688
+ "step": 8100
689
+ },
690
+ {
691
+ "epoch": 0.8022305923788093,
692
+ "grad_norm": 17.644638061523438,
693
+ "learning_rate": 4.069973909262212e-05,
694
+ "loss": 2.7958,
695
+ "step": 8200
696
+ },
697
+ {
698
+ "epoch": 0.8120138922858681,
699
+ "grad_norm": 18.377134323120117,
700
+ "learning_rate": 4.0518553413538195e-05,
701
+ "loss": 2.8055,
702
+ "step": 8300
703
+ },
704
+ {
705
+ "epoch": 0.8217971921929267,
706
+ "grad_norm": 18.026033401489258,
707
+ "learning_rate": 4.0337367734454273e-05,
708
+ "loss": 2.7334,
709
+ "step": 8400
710
+ },
711
+ {
712
+ "epoch": 0.8315804920999853,
713
+ "grad_norm": 14.77315616607666,
714
+ "learning_rate": 4.0156182055370345e-05,
715
+ "loss": 2.8082,
716
+ "step": 8500
717
+ },
718
+ {
719
+ "epoch": 0.8315804920999853,
720
+ "eval_runtime": 182.4176,
721
+ "eval_samples_per_second": 112.062,
722
+ "eval_steps_per_second": 14.012,
723
+ "step": 8500
724
+ },
725
+ {
726
+ "epoch": 0.841363792007044,
727
+ "grad_norm": 13.729479789733887,
728
+ "learning_rate": 3.997499637628642e-05,
729
+ "loss": 2.7939,
730
+ "step": 8600
731
+ },
732
+ {
733
+ "epoch": 0.8511470919141026,
734
+ "grad_norm": 16.34333610534668,
735
+ "learning_rate": 3.9793810697202494e-05,
736
+ "loss": 2.8517,
737
+ "step": 8700
738
+ },
739
+ {
740
+ "epoch": 0.8609303918211613,
741
+ "grad_norm": 22.484411239624023,
742
+ "learning_rate": 3.961262501811857e-05,
743
+ "loss": 2.776,
744
+ "step": 8800
745
+ },
746
+ {
747
+ "epoch": 0.8707136917282199,
748
+ "grad_norm": 15.922870635986328,
749
+ "learning_rate": 3.9431439339034644e-05,
750
+ "loss": 2.7909,
751
+ "step": 8900
752
+ },
753
+ {
754
+ "epoch": 0.8804969916352786,
755
+ "grad_norm": 15.06955623626709,
756
+ "learning_rate": 3.925025365995072e-05,
757
+ "loss": 2.8416,
758
+ "step": 9000
759
+ },
760
+ {
761
+ "epoch": 0.8804969916352786,
762
+ "eval_runtime": 181.9314,
763
+ "eval_samples_per_second": 112.361,
764
+ "eval_steps_per_second": 14.049,
765
+ "step": 9000
766
+ },
767
+ {
768
+ "epoch": 0.8902802915423372,
769
+ "grad_norm": 16.060428619384766,
770
+ "learning_rate": 3.9069067980866794e-05,
771
+ "loss": 2.7803,
772
+ "step": 9100
773
+ },
774
+ {
775
+ "epoch": 0.9000635914493959,
776
+ "grad_norm": 16.80124855041504,
777
+ "learning_rate": 3.888788230178287e-05,
778
+ "loss": 2.7548,
779
+ "step": 9200
780
+ },
781
+ {
782
+ "epoch": 0.9098468913564546,
783
+ "grad_norm": 16.608434677124023,
784
+ "learning_rate": 3.870669662269894e-05,
785
+ "loss": 2.8606,
786
+ "step": 9300
787
+ },
788
+ {
789
+ "epoch": 0.9196301912635132,
790
+ "grad_norm": 14.83870792388916,
791
+ "learning_rate": 3.8525510943615015e-05,
792
+ "loss": 2.7833,
793
+ "step": 9400
794
+ },
795
+ {
796
+ "epoch": 0.9294134911705718,
797
+ "grad_norm": 25.778181076049805,
798
+ "learning_rate": 3.834432526453109e-05,
799
+ "loss": 2.7434,
800
+ "step": 9500
801
+ },
802
+ {
803
+ "epoch": 0.9294134911705718,
804
+ "eval_runtime": 181.99,
805
+ "eval_samples_per_second": 112.325,
806
+ "eval_steps_per_second": 14.045,
807
+ "step": 9500
808
+ },
809
+ {
810
+ "epoch": 0.9391967910776304,
811
+ "grad_norm": 17.374011993408203,
812
+ "learning_rate": 3.8163139585447164e-05,
813
+ "loss": 2.7258,
814
+ "step": 9600
815
+ },
816
+ {
817
+ "epoch": 0.9489800909846892,
818
+ "grad_norm": 17.551128387451172,
819
+ "learning_rate": 3.798195390636324e-05,
820
+ "loss": 2.824,
821
+ "step": 9700
822
+ },
823
+ {
824
+ "epoch": 0.9587633908917478,
825
+ "grad_norm": 14.35797119140625,
826
+ "learning_rate": 3.7800768227279314e-05,
827
+ "loss": 2.745,
828
+ "step": 9800
829
+ },
830
+ {
831
+ "epoch": 0.9685466907988064,
832
+ "grad_norm": 20.098552703857422,
833
+ "learning_rate": 3.761958254819539e-05,
834
+ "loss": 2.7025,
835
+ "step": 9900
836
+ },
837
+ {
838
+ "epoch": 0.9783299907058651,
839
+ "grad_norm": 16.218109130859375,
840
+ "learning_rate": 3.743839686911147e-05,
841
+ "loss": 2.8093,
842
+ "step": 10000
843
+ },
844
+ {
845
+ "epoch": 0.9783299907058651,
846
+ "eval_runtime": 181.8987,
847
+ "eval_samples_per_second": 112.381,
848
+ "eval_steps_per_second": 14.052,
849
+ "step": 10000
850
+ },
851
+ {
852
+ "epoch": 0.9881132906129237,
853
+ "grad_norm": 17.198423385620117,
854
+ "learning_rate": 3.725721119002754e-05,
855
+ "loss": 2.7124,
856
+ "step": 10100
857
+ },
858
+ {
859
+ "epoch": 0.9978965905199824,
860
+ "grad_norm": 18.021198272705078,
861
+ "learning_rate": 3.707602551094362e-05,
862
+ "loss": 2.6922,
863
+ "step": 10200
864
+ },
865
+ {
866
+ "epoch": 1.007679890427041,
867
+ "grad_norm": 15.27678108215332,
868
+ "learning_rate": 3.689483983185969e-05,
869
+ "loss": 2.6743,
870
+ "step": 10300
871
+ },
872
+ {
873
+ "epoch": 1.0174631903340996,
874
+ "grad_norm": 16.770511627197266,
875
+ "learning_rate": 3.671365415277577e-05,
876
+ "loss": 2.857,
877
+ "step": 10400
878
+ },
879
+ {
880
+ "epoch": 1.0272464902411584,
881
+ "grad_norm": 18.810932159423828,
882
+ "learning_rate": 3.653246847369184e-05,
883
+ "loss": 2.7269,
884
+ "step": 10500
885
+ },
886
+ {
887
+ "epoch": 1.0272464902411584,
888
+ "eval_runtime": 181.8537,
889
+ "eval_samples_per_second": 112.409,
890
+ "eval_steps_per_second": 14.055,
891
+ "step": 10500
892
+ },
893
+ {
894
+ "epoch": 1.037029790148217,
895
+ "grad_norm": 18.56201171875,
896
+ "learning_rate": 3.635128279460791e-05,
897
+ "loss": 2.7325,
898
+ "step": 10600
899
+ },
900
+ {
901
+ "epoch": 1.0468130900552757,
902
+ "grad_norm": 15.063011169433594,
903
+ "learning_rate": 3.617009711552399e-05,
904
+ "loss": 2.7827,
905
+ "step": 10700
906
+ },
907
+ {
908
+ "epoch": 1.0565963899623343,
909
+ "grad_norm": 15.339439392089844,
910
+ "learning_rate": 3.598891143644006e-05,
911
+ "loss": 2.7472,
912
+ "step": 10800
913
+ },
914
+ {
915
+ "epoch": 1.066379689869393,
916
+ "grad_norm": 17.466033935546875,
917
+ "learning_rate": 3.580772575735614e-05,
918
+ "loss": 2.7859,
919
+ "step": 10900
920
+ },
921
+ {
922
+ "epoch": 1.0761629897764515,
923
+ "grad_norm": 20.727872848510742,
924
+ "learning_rate": 3.562654007827221e-05,
925
+ "loss": 2.7278,
926
+ "step": 11000
927
+ },
928
+ {
929
+ "epoch": 1.0761629897764515,
930
+ "eval_runtime": 181.8566,
931
+ "eval_samples_per_second": 112.407,
932
+ "eval_steps_per_second": 14.055,
933
+ "step": 11000
934
+ },
935
+ {
936
+ "epoch": 1.0859462896835101,
937
+ "grad_norm": 16.02055549621582,
938
+ "learning_rate": 3.544535439918829e-05,
939
+ "loss": 2.6307,
940
+ "step": 11100
941
+ },
942
+ {
943
+ "epoch": 1.095729589590569,
944
+ "grad_norm": 20.069686889648438,
945
+ "learning_rate": 3.526416872010436e-05,
946
+ "loss": 2.711,
947
+ "step": 11200
948
+ },
949
+ {
950
+ "epoch": 1.1055128894976276,
951
+ "grad_norm": 14.833261489868164,
952
+ "learning_rate": 3.508298304102044e-05,
953
+ "loss": 2.6141,
954
+ "step": 11300
955
+ },
956
+ {
957
+ "epoch": 1.1152961894046862,
958
+ "grad_norm": 14.86436653137207,
959
+ "learning_rate": 3.490179736193652e-05,
960
+ "loss": 2.6816,
961
+ "step": 11400
962
+ },
963
+ {
964
+ "epoch": 1.1250794893117448,
965
+ "grad_norm": 17.955862045288086,
966
+ "learning_rate": 3.472061168285259e-05,
967
+ "loss": 2.6924,
968
+ "step": 11500
969
+ },
970
+ {
971
+ "epoch": 1.1250794893117448,
972
+ "eval_runtime": 181.8085,
973
+ "eval_samples_per_second": 112.437,
974
+ "eval_steps_per_second": 14.059,
975
+ "step": 11500
976
+ },
977
+ {
978
+ "epoch": 1.1348627892188035,
979
+ "grad_norm": 18.360109329223633,
980
+ "learning_rate": 3.453942600376867e-05,
981
+ "loss": 2.6181,
982
+ "step": 11600
983
+ },
984
+ {
985
+ "epoch": 1.144646089125862,
986
+ "grad_norm": 17.547542572021484,
987
+ "learning_rate": 3.435824032468474e-05,
988
+ "loss": 2.6394,
989
+ "step": 11700
990
+ },
991
+ {
992
+ "epoch": 1.154429389032921,
993
+ "grad_norm": 12.194833755493164,
994
+ "learning_rate": 3.417705464560082e-05,
995
+ "loss": 2.6684,
996
+ "step": 11800
997
+ },
998
+ {
999
+ "epoch": 1.1642126889399795,
1000
+ "grad_norm": 17.095104217529297,
1001
+ "learning_rate": 3.399586896651689e-05,
1002
+ "loss": 2.6129,
1003
+ "step": 11900
1004
+ },
1005
+ {
1006
+ "epoch": 1.1739959888470382,
1007
+ "grad_norm": 20.788406372070312,
1008
+ "learning_rate": 3.381468328743296e-05,
1009
+ "loss": 2.5663,
1010
+ "step": 12000
1011
+ },
1012
+ {
1013
+ "epoch": 1.1739959888470382,
1014
+ "eval_runtime": 181.8035,
1015
+ "eval_samples_per_second": 112.44,
1016
+ "eval_steps_per_second": 14.059,
1017
+ "step": 12000
1018
+ },
1019
+ {
1020
+ "epoch": 1.1837792887540968,
1021
+ "grad_norm": 14.261167526245117,
1022
+ "learning_rate": 3.363349760834904e-05,
1023
+ "loss": 2.6544,
1024
+ "step": 12100
1025
+ },
1026
+ {
1027
+ "epoch": 1.1935625886611554,
1028
+ "grad_norm": 24.68012046813965,
1029
+ "learning_rate": 3.345231192926511e-05,
1030
+ "loss": 2.6632,
1031
+ "step": 12200
1032
+ },
1033
+ {
1034
+ "epoch": 1.203345888568214,
1035
+ "grad_norm": 16.10886573791504,
1036
+ "learning_rate": 3.327112625018119e-05,
1037
+ "loss": 2.6366,
1038
+ "step": 12300
1039
+ },
1040
+ {
1041
+ "epoch": 1.2131291884752726,
1042
+ "grad_norm": 18.038848876953125,
1043
+ "learning_rate": 3.308994057109726e-05,
1044
+ "loss": 2.6563,
1045
+ "step": 12400
1046
+ },
1047
+ {
1048
+ "epoch": 1.2229124883823315,
1049
+ "grad_norm": 17.40920639038086,
1050
+ "learning_rate": 3.290875489201334e-05,
1051
+ "loss": 2.718,
1052
+ "step": 12500
1053
+ },
1054
+ {
1055
+ "epoch": 1.2229124883823315,
1056
+ "eval_runtime": 181.9491,
1057
+ "eval_samples_per_second": 112.35,
1058
+ "eval_steps_per_second": 14.048,
1059
+ "step": 12500
1060
+ },
1061
+ {
1062
+ "epoch": 1.23269578828939,
1063
+ "grad_norm": 15.097307205200195,
1064
+ "learning_rate": 3.272756921292941e-05,
1065
+ "loss": 2.7282,
1066
+ "step": 12600
1067
+ },
1068
+ {
1069
+ "epoch": 1.2424790881964487,
1070
+ "grad_norm": 17.63008689880371,
1071
+ "learning_rate": 3.254638353384549e-05,
1072
+ "loss": 2.7104,
1073
+ "step": 12700
1074
+ },
1075
+ {
1076
+ "epoch": 1.2522623881035073,
1077
+ "grad_norm": 16.161130905151367,
1078
+ "learning_rate": 3.236519785476156e-05,
1079
+ "loss": 2.6427,
1080
+ "step": 12800
1081
+ },
1082
+ {
1083
+ "epoch": 1.262045688010566,
1084
+ "grad_norm": 18.786882400512695,
1085
+ "learning_rate": 3.218401217567764e-05,
1086
+ "loss": 2.6105,
1087
+ "step": 12900
1088
+ },
1089
+ {
1090
+ "epoch": 1.2718289879176246,
1091
+ "grad_norm": 24.145421981811523,
1092
+ "learning_rate": 3.2002826496593715e-05,
1093
+ "loss": 2.6322,
1094
+ "step": 13000
1095
+ },
1096
+ {
1097
+ "epoch": 1.2718289879176246,
1098
+ "eval_runtime": 182.5613,
1099
+ "eval_samples_per_second": 111.973,
1100
+ "eval_steps_per_second": 14.001,
1101
+ "step": 13000
1102
+ },
1103
+ {
1104
+ "epoch": 1.2816122878246832,
1105
+ "grad_norm": 15.286133766174316,
1106
+ "learning_rate": 3.1821640817509786e-05,
1107
+ "loss": 2.6465,
1108
+ "step": 13100
1109
+ },
1110
+ {
1111
+ "epoch": 1.291395587731742,
1112
+ "grad_norm": 21.22935676574707,
1113
+ "learning_rate": 3.1640455138425865e-05,
1114
+ "loss": 2.6691,
1115
+ "step": 13200
1116
+ },
1117
+ {
1118
+ "epoch": 1.3011788876388006,
1119
+ "grad_norm": 18.064428329467773,
1120
+ "learning_rate": 3.1459269459341936e-05,
1121
+ "loss": 2.5904,
1122
+ "step": 13300
1123
+ },
1124
+ {
1125
+ "epoch": 1.3109621875458592,
1126
+ "grad_norm": 14.45976448059082,
1127
+ "learning_rate": 3.127808378025801e-05,
1128
+ "loss": 2.6602,
1129
+ "step": 13400
1130
+ },
1131
+ {
1132
+ "epoch": 1.3207454874529179,
1133
+ "grad_norm": 19.72386360168457,
1134
+ "learning_rate": 3.109689810117408e-05,
1135
+ "loss": 2.6337,
1136
+ "step": 13500
1137
+ },
1138
+ {
1139
+ "epoch": 1.3207454874529179,
1140
+ "eval_runtime": 182.4053,
1141
+ "eval_samples_per_second": 112.069,
1142
+ "eval_steps_per_second": 14.013,
1143
+ "step": 13500
1144
+ },
1145
+ {
1146
+ "epoch": 1.3305287873599765,
1147
+ "grad_norm": 17.639583587646484,
1148
+ "learning_rate": 3.091571242209016e-05,
1149
+ "loss": 2.6135,
1150
+ "step": 13600
1151
+ },
1152
+ {
1153
+ "epoch": 1.340312087267035,
1154
+ "grad_norm": 19.71700096130371,
1155
+ "learning_rate": 3.0734526743006235e-05,
1156
+ "loss": 2.6252,
1157
+ "step": 13700
1158
+ },
1159
+ {
1160
+ "epoch": 1.3500953871740937,
1161
+ "grad_norm": 16.715856552124023,
1162
+ "learning_rate": 3.055334106392231e-05,
1163
+ "loss": 2.6475,
1164
+ "step": 13800
1165
+ },
1166
+ {
1167
+ "epoch": 1.3598786870811526,
1168
+ "grad_norm": 12.645075798034668,
1169
+ "learning_rate": 3.0372155384838385e-05,
1170
+ "loss": 2.6199,
1171
+ "step": 13900
1172
+ },
1173
+ {
1174
+ "epoch": 1.3696619869882112,
1175
+ "grad_norm": 20.150625228881836,
1176
+ "learning_rate": 3.0190969705754456e-05,
1177
+ "loss": 2.5567,
1178
+ "step": 14000
1179
+ },
1180
+ {
1181
+ "epoch": 1.3696619869882112,
1182
+ "eval_runtime": 181.9086,
1183
+ "eval_samples_per_second": 112.375,
1184
+ "eval_steps_per_second": 14.051,
1185
+ "step": 14000
1186
+ },
1187
+ {
1188
+ "epoch": 1.3794452868952698,
1189
+ "grad_norm": 19.111286163330078,
1190
+ "learning_rate": 3.0009784026670535e-05,
1191
+ "loss": 2.59,
1192
+ "step": 14100
1193
+ },
1194
+ {
1195
+ "epoch": 1.3892285868023284,
1196
+ "grad_norm": 17.12226104736328,
1197
+ "learning_rate": 2.9828598347586606e-05,
1198
+ "loss": 2.5913,
1199
+ "step": 14200
1200
+ },
1201
+ {
1202
+ "epoch": 1.399011886709387,
1203
+ "grad_norm": 19.741445541381836,
1204
+ "learning_rate": 2.9647412668502684e-05,
1205
+ "loss": 2.5617,
1206
+ "step": 14300
1207
+ },
1208
+ {
1209
+ "epoch": 1.4087951866164456,
1210
+ "grad_norm": 17.605525970458984,
1211
+ "learning_rate": 2.946622698941876e-05,
1212
+ "loss": 2.6077,
1213
+ "step": 14400
1214
+ },
1215
+ {
1216
+ "epoch": 1.4185784865235043,
1217
+ "grad_norm": 17.433218002319336,
1218
+ "learning_rate": 2.928504131033483e-05,
1219
+ "loss": 2.5713,
1220
+ "step": 14500
1221
+ },
1222
+ {
1223
+ "epoch": 1.4185784865235043,
1224
+ "eval_runtime": 181.9305,
1225
+ "eval_samples_per_second": 112.362,
1226
+ "eval_steps_per_second": 14.049,
1227
+ "step": 14500
1228
+ },
1229
+ {
1230
+ "epoch": 1.428361786430563,
1231
+ "grad_norm": 15.442538261413574,
1232
+ "learning_rate": 2.910385563125091e-05,
1233
+ "loss": 2.6499,
1234
+ "step": 14600
1235
+ },
1236
+ {
1237
+ "epoch": 1.4381450863376217,
1238
+ "grad_norm": 15.078730583190918,
1239
+ "learning_rate": 2.892266995216698e-05,
1240
+ "loss": 2.6517,
1241
+ "step": 14700
1242
+ },
1243
+ {
1244
+ "epoch": 1.4479283862446803,
1245
+ "grad_norm": 23.07891273498535,
1246
+ "learning_rate": 2.874148427308306e-05,
1247
+ "loss": 2.594,
1248
+ "step": 14800
1249
+ },
1250
+ {
1251
+ "epoch": 1.457711686151739,
1252
+ "grad_norm": 16.707923889160156,
1253
+ "learning_rate": 2.856029859399913e-05,
1254
+ "loss": 2.6613,
1255
+ "step": 14900
1256
+ },
1257
+ {
1258
+ "epoch": 1.4674949860587976,
1259
+ "grad_norm": 16.731164932250977,
1260
+ "learning_rate": 2.8379112914915208e-05,
1261
+ "loss": 2.5927,
1262
+ "step": 15000
1263
+ },
1264
+ {
1265
+ "epoch": 1.4674949860587976,
1266
+ "eval_runtime": 181.9649,
1267
+ "eval_samples_per_second": 112.34,
1268
+ "eval_steps_per_second": 14.047,
1269
+ "step": 15000
1270
+ },
1271
+ {
1272
+ "epoch": 1.4772782859658564,
1273
+ "grad_norm": 16.020864486694336,
1274
+ "learning_rate": 2.819792723583128e-05,
1275
+ "loss": 2.6464,
1276
+ "step": 15100
1277
+ },
1278
+ {
1279
+ "epoch": 1.4870615858729148,
1280
+ "grad_norm": 16.674760818481445,
1281
+ "learning_rate": 2.8016741556747354e-05,
1282
+ "loss": 2.5853,
1283
+ "step": 15200
1284
+ },
1285
+ {
1286
+ "epoch": 1.4968448857799737,
1287
+ "grad_norm": 16.890748977661133,
1288
+ "learning_rate": 2.7835555877663432e-05,
1289
+ "loss": 2.5748,
1290
+ "step": 15300
1291
+ },
1292
+ {
1293
+ "epoch": 1.5066281856870323,
1294
+ "grad_norm": 20.217845916748047,
1295
+ "learning_rate": 2.7654370198579504e-05,
1296
+ "loss": 2.6204,
1297
+ "step": 15400
1298
+ },
1299
+ {
1300
+ "epoch": 1.516411485594091,
1301
+ "grad_norm": 20.459087371826172,
1302
+ "learning_rate": 2.7473184519495582e-05,
1303
+ "loss": 2.6103,
1304
+ "step": 15500
1305
+ },
1306
+ {
1307
+ "epoch": 1.516411485594091,
1308
+ "eval_runtime": 181.9454,
1309
+ "eval_samples_per_second": 112.352,
1310
+ "eval_steps_per_second": 14.048,
1311
+ "step": 15500
1312
+ },
1313
+ {
1314
+ "epoch": 1.5261947855011495,
1315
+ "grad_norm": 18.207612991333008,
1316
+ "learning_rate": 2.7291998840411654e-05,
1317
+ "loss": 2.5786,
1318
+ "step": 15600
1319
+ },
1320
+ {
1321
+ "epoch": 1.5359780854082081,
1322
+ "grad_norm": 18.084758758544922,
1323
+ "learning_rate": 2.7110813161327732e-05,
1324
+ "loss": 2.6535,
1325
+ "step": 15700
1326
+ },
1327
+ {
1328
+ "epoch": 1.545761385315267,
1329
+ "grad_norm": 15.03881549835205,
1330
+ "learning_rate": 2.6929627482243803e-05,
1331
+ "loss": 2.6061,
1332
+ "step": 15800
1333
+ },
1334
+ {
1335
+ "epoch": 1.5555446852223254,
1336
+ "grad_norm": 16.99995231628418,
1337
+ "learning_rate": 2.6748441803159878e-05,
1338
+ "loss": 2.6151,
1339
+ "step": 15900
1340
+ },
1341
+ {
1342
+ "epoch": 1.5653279851293842,
1343
+ "grad_norm": 15.581089973449707,
1344
+ "learning_rate": 2.6567256124075956e-05,
1345
+ "loss": 2.6163,
1346
+ "step": 16000
1347
+ },
1348
+ {
1349
+ "epoch": 1.5653279851293842,
1350
+ "eval_runtime": 181.8152,
1351
+ "eval_samples_per_second": 112.433,
1352
+ "eval_steps_per_second": 14.058,
1353
+ "step": 16000
1354
+ },
1355
+ {
1356
+ "epoch": 1.5751112850364428,
1357
+ "grad_norm": 21.4382266998291,
1358
+ "learning_rate": 2.6386070444992028e-05,
1359
+ "loss": 2.5975,
1360
+ "step": 16100
1361
+ },
1362
+ {
1363
+ "epoch": 1.5848945849435014,
1364
+ "grad_norm": 15.874536514282227,
1365
+ "learning_rate": 2.6204884765908106e-05,
1366
+ "loss": 2.5851,
1367
+ "step": 16200
1368
+ },
1369
+ {
1370
+ "epoch": 1.59467788485056,
1371
+ "grad_norm": 17.902137756347656,
1372
+ "learning_rate": 2.6023699086824177e-05,
1373
+ "loss": 2.6027,
1374
+ "step": 16300
1375
+ },
1376
+ {
1377
+ "epoch": 1.6044611847576187,
1378
+ "grad_norm": 17.04872703552246,
1379
+ "learning_rate": 2.5842513407740255e-05,
1380
+ "loss": 2.5854,
1381
+ "step": 16400
1382
+ },
1383
+ {
1384
+ "epoch": 1.6142444846646775,
1385
+ "grad_norm": 15.406013488769531,
1386
+ "learning_rate": 2.5661327728656327e-05,
1387
+ "loss": 2.5158,
1388
+ "step": 16500
1389
+ },
1390
+ {
1391
+ "epoch": 1.6142444846646775,
1392
+ "eval_runtime": 181.8647,
1393
+ "eval_samples_per_second": 112.402,
1394
+ "eval_steps_per_second": 14.054,
1395
+ "step": 16500
1396
+ },
1397
+ {
1398
+ "epoch": 1.624027784571736,
1399
+ "grad_norm": 19.62627601623535,
1400
+ "learning_rate": 2.5480142049572402e-05,
1401
+ "loss": 2.5378,
1402
+ "step": 16600
1403
+ },
1404
+ {
1405
+ "epoch": 1.6338110844787948,
1406
+ "grad_norm": 17.825178146362305,
1407
+ "learning_rate": 2.529895637048848e-05,
1408
+ "loss": 2.6162,
1409
+ "step": 16700
1410
+ },
1411
+ {
1412
+ "epoch": 1.6435943843858534,
1413
+ "grad_norm": 15.442023277282715,
1414
+ "learning_rate": 2.511777069140455e-05,
1415
+ "loss": 2.5802,
1416
+ "step": 16800
1417
+ },
1418
+ {
1419
+ "epoch": 1.653377684292912,
1420
+ "grad_norm": 18.695241928100586,
1421
+ "learning_rate": 2.4936585012320626e-05,
1422
+ "loss": 2.585,
1423
+ "step": 16900
1424
+ },
1425
+ {
1426
+ "epoch": 1.6631609841999706,
1427
+ "grad_norm": 18.992969512939453,
1428
+ "learning_rate": 2.4755399333236704e-05,
1429
+ "loss": 2.5448,
1430
+ "step": 17000
1431
+ },
1432
+ {
1433
+ "epoch": 1.6631609841999706,
1434
+ "eval_runtime": 181.91,
1435
+ "eval_samples_per_second": 112.374,
1436
+ "eval_steps_per_second": 14.051,
1437
+ "step": 17000
1438
+ },
1439
+ {
1440
+ "epoch": 1.6729442841070292,
1441
+ "grad_norm": 19.065349578857422,
1442
+ "learning_rate": 2.457421365415278e-05,
1443
+ "loss": 2.6565,
1444
+ "step": 17100
1445
+ },
1446
+ {
1447
+ "epoch": 1.682727584014088,
1448
+ "grad_norm": 20.110734939575195,
1449
+ "learning_rate": 2.439302797506885e-05,
1450
+ "loss": 2.5519,
1451
+ "step": 17200
1452
+ },
1453
+ {
1454
+ "epoch": 1.6925108839211465,
1455
+ "grad_norm": 15.886931419372559,
1456
+ "learning_rate": 2.4211842295984925e-05,
1457
+ "loss": 2.5589,
1458
+ "step": 17300
1459
+ },
1460
+ {
1461
+ "epoch": 1.7022941838282053,
1462
+ "grad_norm": 19.213207244873047,
1463
+ "learning_rate": 2.4030656616901e-05,
1464
+ "loss": 2.5714,
1465
+ "step": 17400
1466
+ },
1467
+ {
1468
+ "epoch": 1.712077483735264,
1469
+ "grad_norm": 17.117481231689453,
1470
+ "learning_rate": 2.3849470937817075e-05,
1471
+ "loss": 2.6682,
1472
+ "step": 17500
1473
+ },
1474
+ {
1475
+ "epoch": 1.712077483735264,
1476
+ "eval_runtime": 181.766,
1477
+ "eval_samples_per_second": 112.463,
1478
+ "eval_steps_per_second": 14.062,
1479
+ "step": 17500
1480
+ },
1481
+ {
1482
+ "epoch": 1.7218607836423225,
1483
+ "grad_norm": 17.19162940979004,
1484
+ "learning_rate": 2.366828525873315e-05,
1485
+ "loss": 2.5591,
1486
+ "step": 17600
1487
+ },
1488
+ {
1489
+ "epoch": 1.7316440835493812,
1490
+ "grad_norm": 15.454411506652832,
1491
+ "learning_rate": 2.3487099579649225e-05,
1492
+ "loss": 2.469,
1493
+ "step": 17700
1494
+ },
1495
+ {
1496
+ "epoch": 1.7414273834564398,
1497
+ "grad_norm": 15.227791786193848,
1498
+ "learning_rate": 2.3305913900565303e-05,
1499
+ "loss": 2.664,
1500
+ "step": 17800
1501
+ },
1502
+ {
1503
+ "epoch": 1.7512106833634986,
1504
+ "grad_norm": 18.5739688873291,
1505
+ "learning_rate": 2.3124728221481374e-05,
1506
+ "loss": 2.5991,
1507
+ "step": 17900
1508
+ },
1509
+ {
1510
+ "epoch": 1.760993983270557,
1511
+ "grad_norm": 12.589066505432129,
1512
+ "learning_rate": 2.294354254239745e-05,
1513
+ "loss": 2.6593,
1514
+ "step": 18000
1515
+ },
1516
+ {
1517
+ "epoch": 1.760993983270557,
1518
+ "eval_runtime": 181.9699,
1519
+ "eval_samples_per_second": 112.337,
1520
+ "eval_steps_per_second": 14.046,
1521
+ "step": 18000
1522
+ },
1523
+ {
1524
+ "epoch": 1.7707772831776158,
1525
+ "grad_norm": 20.695772171020508,
1526
+ "learning_rate": 2.2762356863313524e-05,
1527
+ "loss": 2.5555,
1528
+ "step": 18100
1529
+ },
1530
+ {
1531
+ "epoch": 1.7805605830846745,
1532
+ "grad_norm": 12.731703758239746,
1533
+ "learning_rate": 2.25811711842296e-05,
1534
+ "loss": 2.4617,
1535
+ "step": 18200
1536
+ },
1537
+ {
1538
+ "epoch": 1.790343882991733,
1539
+ "grad_norm": 18.506074905395508,
1540
+ "learning_rate": 2.2399985505145674e-05,
1541
+ "loss": 2.6061,
1542
+ "step": 18300
1543
+ },
1544
+ {
1545
+ "epoch": 1.800127182898792,
1546
+ "grad_norm": 14.8694486618042,
1547
+ "learning_rate": 2.221879982606175e-05,
1548
+ "loss": 2.5779,
1549
+ "step": 18400
1550
+ },
1551
+ {
1552
+ "epoch": 1.8099104828058503,
1553
+ "grad_norm": 22.47985076904297,
1554
+ "learning_rate": 2.2037614146977827e-05,
1555
+ "loss": 2.5012,
1556
+ "step": 18500
1557
+ },
1558
+ {
1559
+ "epoch": 1.8099104828058503,
1560
+ "eval_runtime": 182.3919,
1561
+ "eval_samples_per_second": 112.077,
1562
+ "eval_steps_per_second": 14.014,
1563
+ "step": 18500
1564
+ },
1565
+ {
1566
+ "epoch": 1.8196937827129092,
1567
+ "grad_norm": 25.74334144592285,
1568
+ "learning_rate": 2.1856428467893898e-05,
1569
+ "loss": 2.5265,
1570
+ "step": 18600
1571
+ },
1572
+ {
1573
+ "epoch": 1.8294770826199676,
1574
+ "grad_norm": 18.477630615234375,
1575
+ "learning_rate": 2.1675242788809973e-05,
1576
+ "loss": 2.5555,
1577
+ "step": 18700
1578
+ },
1579
+ {
1580
+ "epoch": 1.8392603825270264,
1581
+ "grad_norm": 14.832316398620605,
1582
+ "learning_rate": 2.1494057109726048e-05,
1583
+ "loss": 2.4609,
1584
+ "step": 18800
1585
+ },
1586
+ {
1587
+ "epoch": 1.849043682434085,
1588
+ "grad_norm": 17.025096893310547,
1589
+ "learning_rate": 2.1312871430642123e-05,
1590
+ "loss": 2.5119,
1591
+ "step": 18900
1592
+ },
1593
+ {
1594
+ "epoch": 1.8588269823411436,
1595
+ "grad_norm": 16.852436065673828,
1596
+ "learning_rate": 2.1131685751558197e-05,
1597
+ "loss": 2.5369,
1598
+ "step": 19000
1599
+ },
1600
+ {
1601
+ "epoch": 1.8588269823411436,
1602
+ "eval_runtime": 181.7443,
1603
+ "eval_samples_per_second": 112.477,
1604
+ "eval_steps_per_second": 14.064,
1605
+ "step": 19000
1606
+ },
1607
+ {
1608
+ "epoch": 1.8686102822482025,
1609
+ "grad_norm": 15.160259246826172,
1610
+ "learning_rate": 2.0950500072474272e-05,
1611
+ "loss": 2.6297,
1612
+ "step": 19100
1613
+ },
1614
+ {
1615
+ "epoch": 1.8783935821552609,
1616
+ "grad_norm": 15.909671783447266,
1617
+ "learning_rate": 2.0769314393390347e-05,
1618
+ "loss": 2.4696,
1619
+ "step": 19200
1620
+ },
1621
+ {
1622
+ "epoch": 1.8881768820623197,
1623
+ "grad_norm": 14.201844215393066,
1624
+ "learning_rate": 2.0588128714306422e-05,
1625
+ "loss": 2.5653,
1626
+ "step": 19300
1627
+ },
1628
+ {
1629
+ "epoch": 1.8979601819693783,
1630
+ "grad_norm": 16.351415634155273,
1631
+ "learning_rate": 2.0406943035222497e-05,
1632
+ "loss": 2.4962,
1633
+ "step": 19400
1634
+ },
1635
+ {
1636
+ "epoch": 1.907743481876437,
1637
+ "grad_norm": 16.943771362304688,
1638
+ "learning_rate": 2.022575735613857e-05,
1639
+ "loss": 2.5091,
1640
+ "step": 19500
1641
+ },
1642
+ {
1643
+ "epoch": 1.907743481876437,
1644
+ "eval_runtime": 181.6486,
1645
+ "eval_samples_per_second": 112.536,
1646
+ "eval_steps_per_second": 14.071,
1647
+ "step": 19500
1648
+ },
1649
+ {
1650
+ "epoch": 1.9175267817834956,
1651
+ "grad_norm": 15.006349563598633,
1652
+ "learning_rate": 2.0044571677054646e-05,
1653
+ "loss": 2.5214,
1654
+ "step": 19600
1655
+ },
1656
+ {
1657
+ "epoch": 1.9273100816905542,
1658
+ "grad_norm": 17.305580139160156,
1659
+ "learning_rate": 1.986338599797072e-05,
1660
+ "loss": 2.4989,
1661
+ "step": 19700
1662
+ },
1663
+ {
1664
+ "epoch": 1.937093381597613,
1665
+ "grad_norm": 17.28044891357422,
1666
+ "learning_rate": 1.9682200318886796e-05,
1667
+ "loss": 2.4008,
1668
+ "step": 19800
1669
+ },
1670
+ {
1671
+ "epoch": 1.9468766815046714,
1672
+ "grad_norm": 18.25079917907715,
1673
+ "learning_rate": 1.950101463980287e-05,
1674
+ "loss": 2.6015,
1675
+ "step": 19900
1676
+ },
1677
+ {
1678
+ "epoch": 1.9566599814117303,
1679
+ "grad_norm": 20.741668701171875,
1680
+ "learning_rate": 1.9319828960718946e-05,
1681
+ "loss": 2.4081,
1682
+ "step": 20000
1683
+ },
1684
+ {
1685
+ "epoch": 1.9566599814117303,
1686
+ "eval_runtime": 181.7745,
1687
+ "eval_samples_per_second": 112.458,
1688
+ "eval_steps_per_second": 14.061,
1689
+ "step": 20000
1690
+ },
1691
+ {
1692
+ "epoch": 1.9664432813187889,
1693
+ "grad_norm": 16.1226863861084,
1694
+ "learning_rate": 1.913864328163502e-05,
1695
+ "loss": 2.5418,
1696
+ "step": 20100
1697
+ },
1698
+ {
1699
+ "epoch": 1.9762265812258475,
1700
+ "grad_norm": 13.914982795715332,
1701
+ "learning_rate": 1.8957457602551095e-05,
1702
+ "loss": 2.5248,
1703
+ "step": 20200
1704
+ },
1705
+ {
1706
+ "epoch": 1.986009881132906,
1707
+ "grad_norm": 15.072690963745117,
1708
+ "learning_rate": 1.877627192346717e-05,
1709
+ "loss": 2.5488,
1710
+ "step": 20300
1711
+ },
1712
+ {
1713
+ "epoch": 1.9957931810399647,
1714
+ "grad_norm": 15.510763168334961,
1715
+ "learning_rate": 1.8595086244383245e-05,
1716
+ "loss": 2.4605,
1717
+ "step": 20400
1718
+ },
1719
+ {
1720
+ "epoch": 2.0055764809470236,
1721
+ "grad_norm": 18.463842391967773,
1722
+ "learning_rate": 1.841390056529932e-05,
1723
+ "loss": 2.522,
1724
+ "step": 20500
1725
+ },
1726
+ {
1727
+ "epoch": 2.0055764809470236,
1728
+ "eval_runtime": 182.07,
1729
+ "eval_samples_per_second": 112.276,
1730
+ "eval_steps_per_second": 14.039,
1731
+ "step": 20500
1732
+ },
1733
+ {
1734
+ "epoch": 2.015359780854082,
1735
+ "grad_norm": 16.670269012451172,
1736
+ "learning_rate": 1.8232714886215394e-05,
1737
+ "loss": 2.5585,
1738
+ "step": 20600
1739
+ },
1740
+ {
1741
+ "epoch": 2.025143080761141,
1742
+ "grad_norm": 20.60368537902832,
1743
+ "learning_rate": 1.805152920713147e-05,
1744
+ "loss": 2.5381,
1745
+ "step": 20700
1746
+ },
1747
+ {
1748
+ "epoch": 2.034926380668199,
1749
+ "grad_norm": 15.686981201171875,
1750
+ "learning_rate": 1.7870343528047544e-05,
1751
+ "loss": 2.5721,
1752
+ "step": 20800
1753
+ },
1754
+ {
1755
+ "epoch": 2.044709680575258,
1756
+ "grad_norm": 14.691718101501465,
1757
+ "learning_rate": 1.768915784896362e-05,
1758
+ "loss": 2.5187,
1759
+ "step": 20900
1760
+ },
1761
+ {
1762
+ "epoch": 2.054492980482317,
1763
+ "grad_norm": 16.31734848022461,
1764
+ "learning_rate": 1.7507972169879694e-05,
1765
+ "loss": 2.5202,
1766
+ "step": 21000
1767
+ },
1768
+ {
1769
+ "epoch": 2.054492980482317,
1770
+ "eval_runtime": 181.9896,
1771
+ "eval_samples_per_second": 112.325,
1772
+ "eval_steps_per_second": 14.045,
1773
+ "step": 21000
1774
+ },
1775
+ {
1776
+ "epoch": 2.0642762803893753,
1777
+ "grad_norm": 12.698554992675781,
1778
+ "learning_rate": 1.732678649079577e-05,
1779
+ "loss": 2.4228,
1780
+ "step": 21100
1781
+ },
1782
+ {
1783
+ "epoch": 2.074059580296434,
1784
+ "grad_norm": 16.34201431274414,
1785
+ "learning_rate": 1.7145600811711843e-05,
1786
+ "loss": 2.3963,
1787
+ "step": 21200
1788
+ },
1789
+ {
1790
+ "epoch": 2.0838428802034925,
1791
+ "grad_norm": 16.52840232849121,
1792
+ "learning_rate": 1.6964415132627918e-05,
1793
+ "loss": 2.4759,
1794
+ "step": 21300
1795
+ },
1796
+ {
1797
+ "epoch": 2.0936261801105513,
1798
+ "grad_norm": 14.856452941894531,
1799
+ "learning_rate": 1.6783229453543993e-05,
1800
+ "loss": 2.4675,
1801
+ "step": 21400
1802
+ },
1803
+ {
1804
+ "epoch": 2.1034094800176097,
1805
+ "grad_norm": 19.68895721435547,
1806
+ "learning_rate": 1.6602043774460068e-05,
1807
+ "loss": 2.5324,
1808
+ "step": 21500
1809
+ },
1810
+ {
1811
+ "epoch": 2.1034094800176097,
1812
+ "eval_runtime": 182.1877,
1813
+ "eval_samples_per_second": 112.203,
1814
+ "eval_steps_per_second": 14.029,
1815
+ "step": 21500
1816
+ },
1817
+ {
1818
+ "epoch": 2.1131927799246686,
1819
+ "grad_norm": 23.248056411743164,
1820
+ "learning_rate": 1.6420858095376143e-05,
1821
+ "loss": 2.5231,
1822
+ "step": 21600
1823
+ },
1824
+ {
1825
+ "epoch": 2.1229760798317274,
1826
+ "grad_norm": 25.471004486083984,
1827
+ "learning_rate": 1.6239672416292217e-05,
1828
+ "loss": 2.5871,
1829
+ "step": 21700
1830
+ },
1831
+ {
1832
+ "epoch": 2.132759379738786,
1833
+ "grad_norm": 17.794851303100586,
1834
+ "learning_rate": 1.6058486737208292e-05,
1835
+ "loss": 2.5008,
1836
+ "step": 21800
1837
+ },
1838
+ {
1839
+ "epoch": 2.1425426796458447,
1840
+ "grad_norm": 15.450346946716309,
1841
+ "learning_rate": 1.5877301058124367e-05,
1842
+ "loss": 2.4194,
1843
+ "step": 21900
1844
+ },
1845
+ {
1846
+ "epoch": 2.152325979552903,
1847
+ "grad_norm": 13.243645668029785,
1848
+ "learning_rate": 1.5696115379040442e-05,
1849
+ "loss": 2.5018,
1850
+ "step": 22000
1851
+ },
1852
+ {
1853
+ "epoch": 2.152325979552903,
1854
+ "eval_runtime": 181.9841,
1855
+ "eval_samples_per_second": 112.328,
1856
+ "eval_steps_per_second": 14.045,
1857
+ "step": 22000
1858
+ },
1859
+ {
1860
+ "epoch": 2.162109279459962,
1861
+ "grad_norm": 16.996198654174805,
1862
+ "learning_rate": 1.5514929699956517e-05,
1863
+ "loss": 2.4492,
1864
+ "step": 22100
1865
+ },
1866
+ {
1867
+ "epoch": 2.1718925793670203,
1868
+ "grad_norm": 20.05558967590332,
1869
+ "learning_rate": 1.5333744020872588e-05,
1870
+ "loss": 2.489,
1871
+ "step": 22200
1872
+ },
1873
+ {
1874
+ "epoch": 2.181675879274079,
1875
+ "grad_norm": 15.66326904296875,
1876
+ "learning_rate": 1.5152558341788666e-05,
1877
+ "loss": 2.5089,
1878
+ "step": 22300
1879
+ },
1880
+ {
1881
+ "epoch": 2.191459179181138,
1882
+ "grad_norm": 17.83564567565918,
1883
+ "learning_rate": 1.4971372662704741e-05,
1884
+ "loss": 2.4945,
1885
+ "step": 22400
1886
+ },
1887
+ {
1888
+ "epoch": 2.2012424790881964,
1889
+ "grad_norm": 21.466899871826172,
1890
+ "learning_rate": 1.4790186983620816e-05,
1891
+ "loss": 2.5467,
1892
+ "step": 22500
1893
+ },
1894
+ {
1895
+ "epoch": 2.2012424790881964,
1896
+ "eval_runtime": 182.8328,
1897
+ "eval_samples_per_second": 111.807,
1898
+ "eval_steps_per_second": 13.98,
1899
+ "step": 22500
1900
+ },
1901
+ {
1902
+ "epoch": 2.211025778995255,
1903
+ "grad_norm": 17.91064453125,
1904
+ "learning_rate": 1.4609001304536891e-05,
1905
+ "loss": 2.5144,
1906
+ "step": 22600
1907
+ },
1908
+ {
1909
+ "epoch": 2.2208090789023136,
1910
+ "grad_norm": 17.678396224975586,
1911
+ "learning_rate": 1.4427815625452964e-05,
1912
+ "loss": 2.5018,
1913
+ "step": 22700
1914
+ },
1915
+ {
1916
+ "epoch": 2.2305923788093724,
1917
+ "grad_norm": 17.510461807250977,
1918
+ "learning_rate": 1.4246629946369039e-05,
1919
+ "loss": 2.4228,
1920
+ "step": 22800
1921
+ },
1922
+ {
1923
+ "epoch": 2.240375678716431,
1924
+ "grad_norm": 24.923967361450195,
1925
+ "learning_rate": 1.4065444267285114e-05,
1926
+ "loss": 2.5249,
1927
+ "step": 22900
1928
+ },
1929
+ {
1930
+ "epoch": 2.2501589786234897,
1931
+ "grad_norm": 17.82384490966797,
1932
+ "learning_rate": 1.388425858820119e-05,
1933
+ "loss": 2.4282,
1934
+ "step": 23000
1935
+ },
1936
+ {
1937
+ "epoch": 2.2501589786234897,
1938
+ "eval_runtime": 182.0459,
1939
+ "eval_samples_per_second": 112.29,
1940
+ "eval_steps_per_second": 14.04,
1941
+ "step": 23000
1942
+ },
1943
+ {
1944
+ "epoch": 2.2599422785305485,
1945
+ "grad_norm": 16.13028335571289,
1946
+ "learning_rate": 1.3703072909117265e-05,
1947
+ "loss": 2.4472,
1948
+ "step": 23100
1949
+ },
1950
+ {
1951
+ "epoch": 2.269725578437607,
1952
+ "grad_norm": 15.137242317199707,
1953
+ "learning_rate": 1.352188723003334e-05,
1954
+ "loss": 2.5985,
1955
+ "step": 23200
1956
+ },
1957
+ {
1958
+ "epoch": 2.2795088783446658,
1959
+ "grad_norm": 16.187530517578125,
1960
+ "learning_rate": 1.3340701550949415e-05,
1961
+ "loss": 2.4862,
1962
+ "step": 23300
1963
+ },
1964
+ {
1965
+ "epoch": 2.289292178251724,
1966
+ "grad_norm": 18.84433937072754,
1967
+ "learning_rate": 1.3159515871865488e-05,
1968
+ "loss": 2.516,
1969
+ "step": 23400
1970
+ },
1971
+ {
1972
+ "epoch": 2.299075478158783,
1973
+ "grad_norm": 20.209121704101562,
1974
+ "learning_rate": 1.2978330192781563e-05,
1975
+ "loss": 2.5031,
1976
+ "step": 23500
1977
+ },
1978
+ {
1979
+ "epoch": 2.299075478158783,
1980
+ "eval_runtime": 181.9806,
1981
+ "eval_samples_per_second": 112.331,
1982
+ "eval_steps_per_second": 14.045,
1983
+ "step": 23500
1984
+ },
1985
+ {
1986
+ "epoch": 2.308858778065842,
1987
+ "grad_norm": 67.4502182006836,
1988
+ "learning_rate": 1.2797144513697637e-05,
1989
+ "loss": 2.4491,
1990
+ "step": 23600
1991
+ },
1992
+ {
1993
+ "epoch": 2.3186420779729002,
1994
+ "grad_norm": 14.940401077270508,
1995
+ "learning_rate": 1.2615958834613712e-05,
1996
+ "loss": 2.5669,
1997
+ "step": 23700
1998
+ },
1999
+ {
2000
+ "epoch": 2.328425377879959,
2001
+ "grad_norm": 16.591793060302734,
2002
+ "learning_rate": 1.2434773155529787e-05,
2003
+ "loss": 2.4565,
2004
+ "step": 23800
2005
+ },
2006
+ {
2007
+ "epoch": 2.3382086777870175,
2008
+ "grad_norm": 16.798791885375977,
2009
+ "learning_rate": 1.2253587476445862e-05,
2010
+ "loss": 2.4046,
2011
+ "step": 23900
2012
+ },
2013
+ {
2014
+ "epoch": 2.3479919776940763,
2015
+ "grad_norm": 17.712255477905273,
2016
+ "learning_rate": 1.2072401797361937e-05,
2017
+ "loss": 2.4453,
2018
+ "step": 24000
2019
+ },
2020
+ {
2021
+ "epoch": 2.3479919776940763,
2022
+ "eval_runtime": 182.0401,
2023
+ "eval_samples_per_second": 112.294,
2024
+ "eval_steps_per_second": 14.041,
2025
+ "step": 24000
2026
+ },
2027
+ {
2028
+ "epoch": 2.3577752776011347,
2029
+ "grad_norm": 18.64284324645996,
2030
+ "learning_rate": 1.1891216118278011e-05,
2031
+ "loss": 2.3973,
2032
+ "step": 24100
2033
+ },
2034
+ {
2035
+ "epoch": 2.3675585775081935,
2036
+ "grad_norm": 18.185895919799805,
2037
+ "learning_rate": 1.1710030439194086e-05,
2038
+ "loss": 2.5045,
2039
+ "step": 24200
2040
+ },
2041
+ {
2042
+ "epoch": 2.377341877415252,
2043
+ "grad_norm": 23.201522827148438,
2044
+ "learning_rate": 1.1528844760110163e-05,
2045
+ "loss": 2.5402,
2046
+ "step": 24300
2047
+ },
2048
+ {
2049
+ "epoch": 2.3871251773223108,
2050
+ "grad_norm": 21.606412887573242,
2051
+ "learning_rate": 1.1347659081026236e-05,
2052
+ "loss": 2.4285,
2053
+ "step": 24400
2054
+ },
2055
+ {
2056
+ "epoch": 2.3969084772293696,
2057
+ "grad_norm": 16.318761825561523,
2058
+ "learning_rate": 1.116647340194231e-05,
2059
+ "loss": 2.5509,
2060
+ "step": 24500
2061
+ },
2062
+ {
2063
+ "epoch": 2.3969084772293696,
2064
+ "eval_runtime": 182.0431,
2065
+ "eval_samples_per_second": 112.292,
2066
+ "eval_steps_per_second": 14.041,
2067
+ "step": 24500
2068
+ },
2069
+ {
2070
+ "epoch": 2.406691777136428,
2071
+ "grad_norm": 17.779014587402344,
2072
+ "learning_rate": 1.0985287722858386e-05,
2073
+ "loss": 2.4245,
2074
+ "step": 24600
2075
+ },
2076
+ {
2077
+ "epoch": 2.416475077043487,
2078
+ "grad_norm": 18.44321060180664,
2079
+ "learning_rate": 1.080410204377446e-05,
2080
+ "loss": 2.5223,
2081
+ "step": 24700
2082
+ },
2083
+ {
2084
+ "epoch": 2.4262583769505452,
2085
+ "grad_norm": 24.017047882080078,
2086
+ "learning_rate": 1.0622916364690535e-05,
2087
+ "loss": 2.4846,
2088
+ "step": 24800
2089
+ },
2090
+ {
2091
+ "epoch": 2.436041676857604,
2092
+ "grad_norm": 14.89560604095459,
2093
+ "learning_rate": 1.044173068560661e-05,
2094
+ "loss": 2.5922,
2095
+ "step": 24900
2096
+ },
2097
+ {
2098
+ "epoch": 2.445824976764663,
2099
+ "grad_norm": 15.532561302185059,
2100
+ "learning_rate": 1.0260545006522685e-05,
2101
+ "loss": 2.3976,
2102
+ "step": 25000
2103
+ },
2104
+ {
2105
+ "epoch": 2.445824976764663,
2106
+ "eval_runtime": 182.1033,
2107
+ "eval_samples_per_second": 112.255,
2108
+ "eval_steps_per_second": 14.036,
2109
+ "step": 25000
2110
+ },
2111
+ {
2112
+ "epoch": 2.4556082766717213,
2113
+ "grad_norm": 18.041282653808594,
2114
+ "learning_rate": 1.007935932743876e-05,
2115
+ "loss": 2.4731,
2116
+ "step": 25100
2117
+ },
2118
+ {
2119
+ "epoch": 2.46539157657878,
2120
+ "grad_norm": 13.40858268737793,
2121
+ "learning_rate": 9.898173648354834e-06,
2122
+ "loss": 2.4838,
2123
+ "step": 25200
2124
+ },
2125
+ {
2126
+ "epoch": 2.4751748764858386,
2127
+ "grad_norm": 17.450841903686523,
2128
+ "learning_rate": 9.71698796927091e-06,
2129
+ "loss": 2.3999,
2130
+ "step": 25300
2131
+ },
2132
+ {
2133
+ "epoch": 2.4849581763928974,
2134
+ "grad_norm": 17.556467056274414,
2135
+ "learning_rate": 9.535802290186984e-06,
2136
+ "loss": 2.3867,
2137
+ "step": 25400
2138
+ },
2139
+ {
2140
+ "epoch": 2.494741476299956,
2141
+ "grad_norm": 18.578310012817383,
2142
+ "learning_rate": 9.354616611103059e-06,
2143
+ "loss": 2.4546,
2144
+ "step": 25500
2145
+ },
2146
+ {
2147
+ "epoch": 2.494741476299956,
2148
+ "eval_runtime": 182.0338,
2149
+ "eval_samples_per_second": 112.298,
2150
+ "eval_steps_per_second": 14.041,
2151
+ "step": 25500
2152
+ },
2153
+ {
2154
+ "epoch": 2.5045247762070146,
2155
+ "grad_norm": 14.936469078063965,
2156
+ "learning_rate": 9.173430932019134e-06,
2157
+ "loss": 2.5562,
2158
+ "step": 25600
2159
+ },
2160
+ {
2161
+ "epoch": 2.514308076114073,
2162
+ "grad_norm": 17.527040481567383,
2163
+ "learning_rate": 8.992245252935209e-06,
2164
+ "loss": 2.4008,
2165
+ "step": 25700
2166
+ },
2167
+ {
2168
+ "epoch": 2.524091376021132,
2169
+ "grad_norm": 12.91336727142334,
2170
+ "learning_rate": 8.811059573851283e-06,
2171
+ "loss": 2.4655,
2172
+ "step": 25800
2173
+ },
2174
+ {
2175
+ "epoch": 2.5338746759281907,
2176
+ "grad_norm": 15.168461799621582,
2177
+ "learning_rate": 8.629873894767358e-06,
2178
+ "loss": 2.4468,
2179
+ "step": 25900
2180
+ },
2181
+ {
2182
+ "epoch": 2.543657975835249,
2183
+ "grad_norm": 17.5390682220459,
2184
+ "learning_rate": 8.448688215683433e-06,
2185
+ "loss": 2.4836,
2186
+ "step": 26000
2187
+ },
2188
+ {
2189
+ "epoch": 2.543657975835249,
2190
+ "eval_runtime": 182.1148,
2191
+ "eval_samples_per_second": 112.248,
2192
+ "eval_steps_per_second": 14.035,
2193
+ "step": 26000
2194
+ },
2195
+ {
2196
+ "epoch": 2.553441275742308,
2197
+ "grad_norm": 15.126510620117188,
2198
+ "learning_rate": 8.267502536599508e-06,
2199
+ "loss": 2.387,
2200
+ "step": 26100
2201
+ },
2202
+ {
2203
+ "epoch": 2.5632245756493663,
2204
+ "grad_norm": 15.374293327331543,
2205
+ "learning_rate": 8.086316857515583e-06,
2206
+ "loss": 2.3652,
2207
+ "step": 26200
2208
+ },
2209
+ {
2210
+ "epoch": 2.573007875556425,
2211
+ "grad_norm": 15.498108863830566,
2212
+ "learning_rate": 7.905131178431657e-06,
2213
+ "loss": 2.4749,
2214
+ "step": 26300
2215
+ },
2216
+ {
2217
+ "epoch": 2.582791175463484,
2218
+ "grad_norm": 16.221315383911133,
2219
+ "learning_rate": 7.723945499347732e-06,
2220
+ "loss": 2.4567,
2221
+ "step": 26400
2222
+ },
2223
+ {
2224
+ "epoch": 2.5925744753705424,
2225
+ "grad_norm": 18.839122772216797,
2226
+ "learning_rate": 7.542759820263806e-06,
2227
+ "loss": 2.3554,
2228
+ "step": 26500
2229
+ },
2230
+ {
2231
+ "epoch": 2.5925744753705424,
2232
+ "eval_runtime": 181.9597,
2233
+ "eval_samples_per_second": 112.344,
2234
+ "eval_steps_per_second": 14.047,
2235
+ "step": 26500
2236
+ },
2237
+ {
2238
+ "epoch": 2.6023577752776013,
2239
+ "grad_norm": 22.626708984375,
2240
+ "learning_rate": 7.361574141179882e-06,
2241
+ "loss": 2.502,
2242
+ "step": 26600
2243
+ },
2244
+ {
2245
+ "epoch": 2.6121410751846597,
2246
+ "grad_norm": 16.519880294799805,
2247
+ "learning_rate": 7.180388462095957e-06,
2248
+ "loss": 2.5034,
2249
+ "step": 26700
2250
+ },
2251
+ {
2252
+ "epoch": 2.6219243750917185,
2253
+ "grad_norm": 27.421489715576172,
2254
+ "learning_rate": 6.999202783012031e-06,
2255
+ "loss": 2.5276,
2256
+ "step": 26800
2257
+ },
2258
+ {
2259
+ "epoch": 2.6317076749987773,
2260
+ "grad_norm": 15.274630546569824,
2261
+ "learning_rate": 6.8180171039281055e-06,
2262
+ "loss": 2.4121,
2263
+ "step": 26900
2264
+ },
2265
+ {
2266
+ "epoch": 2.6414909749058357,
2267
+ "grad_norm": 15.751582145690918,
2268
+ "learning_rate": 6.636831424844181e-06,
2269
+ "loss": 2.5799,
2270
+ "step": 27000
2271
+ },
2272
+ {
2273
+ "epoch": 2.6414909749058357,
2274
+ "eval_runtime": 182.0873,
2275
+ "eval_samples_per_second": 112.265,
2276
+ "eval_steps_per_second": 14.037,
2277
+ "step": 27000
2278
+ },
2279
+ {
2280
+ "epoch": 2.651274274812894,
2281
+ "grad_norm": 16.674850463867188,
2282
+ "learning_rate": 6.455645745760255e-06,
2283
+ "loss": 2.3872,
2284
+ "step": 27100
2285
+ },
2286
+ {
2287
+ "epoch": 2.661057574719953,
2288
+ "grad_norm": 12.62803840637207,
2289
+ "learning_rate": 6.27446006667633e-06,
2290
+ "loss": 2.4,
2291
+ "step": 27200
2292
+ },
2293
+ {
2294
+ "epoch": 2.670840874627012,
2295
+ "grad_norm": 18.055158615112305,
2296
+ "learning_rate": 6.093274387592405e-06,
2297
+ "loss": 2.4681,
2298
+ "step": 27300
2299
+ },
2300
+ {
2301
+ "epoch": 2.68062417453407,
2302
+ "grad_norm": 17.21278190612793,
2303
+ "learning_rate": 5.91208870850848e-06,
2304
+ "loss": 2.5441,
2305
+ "step": 27400
2306
+ },
2307
+ {
2308
+ "epoch": 2.690407474441129,
2309
+ "grad_norm": 20.945236206054688,
2310
+ "learning_rate": 5.7309030294245544e-06,
2311
+ "loss": 2.4388,
2312
+ "step": 27500
2313
+ },
2314
+ {
2315
+ "epoch": 2.690407474441129,
2316
+ "eval_runtime": 182.1279,
2317
+ "eval_samples_per_second": 112.24,
2318
+ "eval_steps_per_second": 14.034,
2319
+ "step": 27500
2320
+ },
2321
+ {
2322
+ "epoch": 2.7001907743481874,
2323
+ "grad_norm": 23.483661651611328,
2324
+ "learning_rate": 5.549717350340629e-06,
2325
+ "loss": 2.4589,
2326
+ "step": 27600
2327
+ },
2328
+ {
2329
+ "epoch": 2.7099740742552463,
2330
+ "grad_norm": 17.954036712646484,
2331
+ "learning_rate": 5.368531671256704e-06,
2332
+ "loss": 2.4477,
2333
+ "step": 27700
2334
+ },
2335
+ {
2336
+ "epoch": 2.719757374162305,
2337
+ "grad_norm": 16.187314987182617,
2338
+ "learning_rate": 5.187345992172779e-06,
2339
+ "loss": 2.4967,
2340
+ "step": 27800
2341
+ },
2342
+ {
2343
+ "epoch": 2.7295406740693635,
2344
+ "grad_norm": 14.324910163879395,
2345
+ "learning_rate": 5.006160313088854e-06,
2346
+ "loss": 2.3921,
2347
+ "step": 27900
2348
+ },
2349
+ {
2350
+ "epoch": 2.7393239739764224,
2351
+ "grad_norm": 20.81557846069336,
2352
+ "learning_rate": 4.8249746340049285e-06,
2353
+ "loss": 2.5201,
2354
+ "step": 28000
2355
+ },
2356
+ {
2357
+ "epoch": 2.7393239739764224,
2358
+ "eval_runtime": 182.146,
2359
+ "eval_samples_per_second": 112.229,
2360
+ "eval_steps_per_second": 14.033,
2361
+ "step": 28000
2362
+ },
2363
+ {
2364
+ "epoch": 2.7491072738834808,
2365
+ "grad_norm": 18.682844161987305,
2366
+ "learning_rate": 4.643788954921003e-06,
2367
+ "loss": 2.4325,
2368
+ "step": 28100
2369
+ },
2370
+ {
2371
+ "epoch": 2.7588905737905396,
2372
+ "grad_norm": 16.227272033691406,
2373
+ "learning_rate": 4.462603275837078e-06,
2374
+ "loss": 2.3864,
2375
+ "step": 28200
2376
+ },
2377
+ {
2378
+ "epoch": 2.7686738736975984,
2379
+ "grad_norm": 16.20302963256836,
2380
+ "learning_rate": 4.281417596753152e-06,
2381
+ "loss": 2.5296,
2382
+ "step": 28300
2383
+ },
2384
+ {
2385
+ "epoch": 2.778457173604657,
2386
+ "grad_norm": 18.634096145629883,
2387
+ "learning_rate": 4.100231917669228e-06,
2388
+ "loss": 2.4514,
2389
+ "step": 28400
2390
+ },
2391
+ {
2392
+ "epoch": 2.7882404735117157,
2393
+ "grad_norm": 13.040008544921875,
2394
+ "learning_rate": 3.919046238585303e-06,
2395
+ "loss": 2.3661,
2396
+ "step": 28500
2397
+ },
2398
+ {
2399
+ "epoch": 2.7882404735117157,
2400
+ "eval_runtime": 181.9164,
2401
+ "eval_samples_per_second": 112.37,
2402
+ "eval_steps_per_second": 14.05,
2403
+ "step": 28500
2404
+ },
2405
+ {
2406
+ "epoch": 2.798023773418774,
2407
+ "grad_norm": 14.142943382263184,
2408
+ "learning_rate": 3.737860559501377e-06,
2409
+ "loss": 2.5074,
2410
+ "step": 28600
2411
+ },
2412
+ {
2413
+ "epoch": 2.807807073325833,
2414
+ "grad_norm": 17.934324264526367,
2415
+ "learning_rate": 3.5566748804174523e-06,
2416
+ "loss": 2.4224,
2417
+ "step": 28700
2418
+ },
2419
+ {
2420
+ "epoch": 2.8175903732328913,
2421
+ "grad_norm": 14.450194358825684,
2422
+ "learning_rate": 3.3754892013335267e-06,
2423
+ "loss": 2.4949,
2424
+ "step": 28800
2425
+ },
2426
+ {
2427
+ "epoch": 2.82737367313995,
2428
+ "grad_norm": 17.746837615966797,
2429
+ "learning_rate": 3.194303522249602e-06,
2430
+ "loss": 2.4153,
2431
+ "step": 28900
2432
+ },
2433
+ {
2434
+ "epoch": 2.8371569730470085,
2435
+ "grad_norm": 13.962541580200195,
2436
+ "learning_rate": 3.0131178431656763e-06,
2437
+ "loss": 2.4804,
2438
+ "step": 29000
2439
+ },
2440
+ {
2441
+ "epoch": 2.8371569730470085,
2442
+ "eval_runtime": 182.0262,
2443
+ "eval_samples_per_second": 112.303,
2444
+ "eval_steps_per_second": 14.042,
2445
+ "step": 29000
2446
+ },
2447
+ {
2448
+ "epoch": 2.8469402729540674,
2449
+ "grad_norm": 16.669286727905273,
2450
+ "learning_rate": 2.831932164081751e-06,
2451
+ "loss": 2.5397,
2452
+ "step": 29100
2453
+ },
2454
+ {
2455
+ "epoch": 2.856723572861126,
2456
+ "grad_norm": 15.421733856201172,
2457
+ "learning_rate": 2.650746484997826e-06,
2458
+ "loss": 2.4175,
2459
+ "step": 29200
2460
+ },
2461
+ {
2462
+ "epoch": 2.8665068727681846,
2463
+ "grad_norm": 14.135702133178711,
2464
+ "learning_rate": 2.4695608059139007e-06,
2465
+ "loss": 2.5069,
2466
+ "step": 29300
2467
+ },
2468
+ {
2469
+ "epoch": 2.8762901726752435,
2470
+ "grad_norm": 17.41412925720215,
2471
+ "learning_rate": 2.2883751268299756e-06,
2472
+ "loss": 2.3997,
2473
+ "step": 29400
2474
+ },
2475
+ {
2476
+ "epoch": 2.886073472582302,
2477
+ "grad_norm": 14.824533462524414,
2478
+ "learning_rate": 2.1071894477460504e-06,
2479
+ "loss": 2.3945,
2480
+ "step": 29500
2481
+ },
2482
+ {
2483
+ "epoch": 2.886073472582302,
2484
+ "eval_runtime": 181.9299,
2485
+ "eval_samples_per_second": 112.362,
2486
+ "eval_steps_per_second": 14.049,
2487
+ "step": 29500
2488
+ },
2489
+ {
2490
+ "epoch": 2.8958567724893607,
2491
+ "grad_norm": 27.31865119934082,
2492
+ "learning_rate": 1.926003768662125e-06,
2493
+ "loss": 2.45,
2494
+ "step": 29600
2495
+ },
2496
+ {
2497
+ "epoch": 2.9056400723964195,
2498
+ "grad_norm": 18.966655731201172,
2499
+ "learning_rate": 1.7448180895781998e-06,
2500
+ "loss": 2.3916,
2501
+ "step": 29700
2502
+ },
2503
+ {
2504
+ "epoch": 2.915423372303478,
2505
+ "grad_norm": 18.538440704345703,
2506
+ "learning_rate": 1.5636324104942746e-06,
2507
+ "loss": 2.4625,
2508
+ "step": 29800
2509
+ },
2510
+ {
2511
+ "epoch": 2.9252066722105368,
2512
+ "grad_norm": 21.757272720336914,
2513
+ "learning_rate": 1.3824467314103494e-06,
2514
+ "loss": 2.3722,
2515
+ "step": 29900
2516
+ },
2517
+ {
2518
+ "epoch": 2.934989972117595,
2519
+ "grad_norm": 16.907358169555664,
2520
+ "learning_rate": 1.201261052326424e-06,
2521
+ "loss": 2.464,
2522
+ "step": 30000
2523
+ },
2524
+ {
2525
+ "epoch": 2.934989972117595,
2526
+ "eval_runtime": 181.9148,
2527
+ "eval_samples_per_second": 112.371,
2528
+ "eval_steps_per_second": 14.051,
2529
+ "step": 30000
2530
+ },
2531
+ {
2532
+ "epoch": 2.944773272024654,
2533
+ "grad_norm": 13.88399600982666,
2534
+ "learning_rate": 1.0200753732424989e-06,
2535
+ "loss": 2.5005,
2536
+ "step": 30100
2537
+ },
2538
+ {
2539
+ "epoch": 2.954556571931713,
2540
+ "grad_norm": 19.77507781982422,
2541
+ "learning_rate": 8.388896941585737e-07,
2542
+ "loss": 2.3829,
2543
+ "step": 30200
2544
+ },
2545
+ {
2546
+ "epoch": 2.9643398718387712,
2547
+ "grad_norm": 16.535932540893555,
2548
+ "learning_rate": 6.577040150746485e-07,
2549
+ "loss": 2.4788,
2550
+ "step": 30300
2551
+ },
2552
+ {
2553
+ "epoch": 2.9741231717458296,
2554
+ "grad_norm": 15.027000427246094,
2555
+ "learning_rate": 4.765183359907233e-07,
2556
+ "loss": 2.5007,
2557
+ "step": 30400
2558
+ },
2559
+ {
2560
+ "epoch": 2.9839064716528885,
2561
+ "grad_norm": 14.9392671585083,
2562
+ "learning_rate": 2.953326569067981e-07,
2563
+ "loss": 2.4847,
2564
+ "step": 30500
2565
+ },
2566
+ {
2567
+ "epoch": 2.9839064716528885,
2568
+ "eval_runtime": 181.9853,
2569
+ "eval_samples_per_second": 112.328,
2570
+ "eval_steps_per_second": 14.045,
2571
+ "step": 30500
2572
+ },
2573
+ {
2574
+ "epoch": 2.9936897715599473,
2575
+ "grad_norm": 15.128337860107422,
2576
+ "learning_rate": 1.1414697782287289e-07,
2577
+ "loss": 2.4209,
2578
+ "step": 30600
2579
+ }
2580
+ ],
2581
+ "logging_steps": 100,
2582
+ "max_steps": 30663,
2583
+ "num_input_tokens_seen": 0,
2584
+ "num_train_epochs": 3,
2585
+ "save_steps": 500,
2586
+ "stateful_callbacks": {
2587
+ "TrainerControl": {
2588
+ "args": {
2589
+ "should_epoch_stop": false,
2590
+ "should_evaluate": false,
2591
+ "should_log": false,
2592
+ "should_save": true,
2593
+ "should_training_stop": true
2594
+ },
2595
+ "attributes": {}
2596
+ }
2597
+ },
2598
+ "total_flos": 1.0701267610290972e+16,
2599
+ "train_batch_size": 8,
2600
+ "trial_name": null,
2601
+ "trial_params": null
2602
+ }
muril_ch_domain/checkpoint-30663/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c39b3134e7e5628432a425a5873f74f59d631bf591a12adab52f3ba906ae6906
3
+ size 5304
muril_ch_domain/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/muril-base-cased",
3
+ "architectures": [
4
+ "BertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "embedding_size": 768,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.46.3",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 197285
26
+ }
muril_ch_domain/generation_config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "pad_token_id": 0,
4
+ "transformers_version": "4.46.3"
5
+ }
muril_ch_domain/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6543e1c5efec6b0dd698bbec06d48814f1a047b418ecec2ba87c217eb759385
3
+ size 951043900