innoku committed on
Commit
47d11bf
·
verified ·
1 Parent(s): e9b29da

Upload folder using huggingface_hub

Browse files
checkpoint-200/adapter_config.json CHANGED
@@ -33,13 +33,13 @@
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "v_proj",
37
- "down_proj",
38
  "gate_proj",
39
- "k_proj",
40
- "o_proj",
41
  "up_proj",
42
- "q_proj"
 
 
 
 
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
 
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
 
36
  "gate_proj",
 
 
37
  "up_proj",
38
+ "q_proj",
39
+ "v_proj",
40
+ "o_proj",
41
+ "down_proj",
42
+ "k_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
checkpoint-200/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0282772951453353736e0967b4ea89d7b47e2ecf8554536515db6c05fff1217
3
  size 528550256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:690bee4aed13f9e6515c2ea4fe6300ecc7f09058ad387f2333222aa4f27413a3
3
  size 528550256
checkpoint-200/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc39461c6f26c6d4ed2eaaa4d956f7e72506a2d24323a9aeb533f6acb3ec15f4
3
  size 1057397963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b382ba39fd819716ef66a215bd75da5b3e16911fd10cca93e7cfd5783eadbd8
3
  size 1057397963
checkpoint-200/trainer_state.json CHANGED
@@ -11,174 +11,174 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.04509582863585118,
14
- "grad_norm": 6.922087050043046e-05,
15
  "learning_rate": 3.9130434782608694e-07,
16
- "loss": 1.6552,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.09019165727170236,
21
- "grad_norm": 5.8874407841358334e-05,
22
  "learning_rate": 8.260869565217391e-07,
23
- "loss": 1.7092,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.13528748590755355,
28
- "grad_norm": 6.833599763922393e-05,
29
  "learning_rate": 9.97758641300553e-07,
30
- "loss": 1.6897,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.18038331454340473,
35
- "grad_norm": 5.440233871922828e-05,
36
  "learning_rate": 9.841341526992535e-07,
37
- "loss": 1.9093,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2254791431792559,
42
- "grad_norm": 4.8365614929934964e-05,
43
  "learning_rate": 9.584688140963944e-07,
44
- "loss": 1.5903,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.2254791431792559,
49
- "eval_loss": 1.7812488079071045,
50
- "eval_runtime": 44.931,
51
- "eval_samples_per_second": 4.162,
52
- "eval_steps_per_second": 2.092,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.2705749718151071,
57
- "grad_norm": 4.5753618906019256e-05,
58
  "learning_rate": 9.214009454506752e-07,
59
- "loss": 1.5573,
60
  "step": 60
61
  },
62
  {
63
  "epoch": 0.3156708004509583,
64
- "grad_norm": 5.378713467507623e-05,
65
  "learning_rate": 8.738524578558546e-07,
66
- "loss": 1.6925,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 0.36076662908680945,
71
- "grad_norm": 4.737311974167824e-05,
72
  "learning_rate": 8.170059247861193e-07,
73
- "loss": 1.6041,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 0.40586245772266066,
78
- "grad_norm": 3.957717854063958e-05,
79
  "learning_rate": 7.522751704345887e-07,
80
- "loss": 1.4225,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 0.4509582863585118,
85
- "grad_norm": 4.50208863185253e-05,
86
  "learning_rate": 6.812701066393123e-07,
87
- "loss": 1.5622,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.4509582863585118,
92
- "eval_loss": 1.6484966278076172,
93
- "eval_runtime": 44.8144,
94
- "eval_samples_per_second": 4.173,
95
- "eval_steps_per_second": 2.098,
96
  "step": 100
97
  },
98
  {
99
  "epoch": 0.496054114994363,
100
- "grad_norm": 3.848442429443821e-05,
101
  "learning_rate": 6.057566929339095e-07,
102
- "loss": 1.4544,
103
  "step": 110
104
  },
105
  {
106
  "epoch": 0.5411499436302142,
107
- "grad_norm": 4.51727319159545e-05,
108
  "learning_rate": 5.27613015552254e-07,
109
- "loss": 1.5298,
110
  "step": 120
111
  },
112
  {
113
  "epoch": 0.5862457722660653,
114
- "grad_norm": 1.8444205124978907e-05,
115
  "learning_rate": 4.4878257774169345e-07,
116
- "loss": 1.4496,
117
  "step": 130
118
  },
119
  {
120
  "epoch": 0.6313416009019166,
121
- "grad_norm": 3.5619468690129e-05,
122
  "learning_rate": 3.7122596309655174e-07,
123
- "loss": 1.4476,
124
  "step": 140
125
  },
126
  {
127
  "epoch": 0.6764374295377678,
128
- "grad_norm": 2.6366453312220983e-05,
129
  "learning_rate": 2.9687207408810555e-07,
130
- "loss": 1.4962,
131
  "step": 150
132
  },
133
  {
134
  "epoch": 0.6764374295377678,
135
- "eval_loss": 1.5979645252227783,
136
- "eval_runtime": 45.0918,
137
- "eval_samples_per_second": 4.147,
138
- "eval_steps_per_second": 2.085,
139
  "step": 150
140
  },
141
  {
142
  "epoch": 0.7215332581736189,
143
- "grad_norm": 3.0969211366027594e-05,
144
  "learning_rate": 2.275701585324649e-07,
145
- "loss": 1.374,
146
  "step": 160
147
  },
148
  {
149
  "epoch": 0.7666290868094702,
150
- "grad_norm": 3.294655471108854e-05,
151
  "learning_rate": 1.6504381714107252e-07,
152
- "loss": 1.4554,
153
  "step": 170
154
  },
155
  {
156
  "epoch": 0.8117249154453213,
157
- "grad_norm": 2.8370055588311516e-05,
158
  "learning_rate": 1.1084813602723514e-07,
159
- "loss": 1.4978,
160
  "step": 180
161
  },
162
  {
163
  "epoch": 0.8568207440811725,
164
- "grad_norm": 3.117798769380897e-05,
165
  "learning_rate": 6.633101032164273e-08,
166
- "loss": 1.5816,
167
  "step": 190
168
  },
169
  {
170
  "epoch": 0.9019165727170236,
171
- "grad_norm": 2.8335167371551506e-05,
172
  "learning_rate": 3.2599620813200835e-08,
173
- "loss": 1.484,
174
  "step": 200
175
  },
176
  {
177
  "epoch": 0.9019165727170236,
178
- "eval_loss": 1.5858721733093262,
179
- "eval_runtime": 45.0617,
180
- "eval_samples_per_second": 4.15,
181
- "eval_steps_per_second": 2.086,
182
  "step": 200
183
  }
184
  ],
@@ -199,7 +199,7 @@
199
  "attributes": {}
200
  }
201
  },
202
- "total_flos": 3.1184814522587136e+16,
203
  "train_batch_size": 2,
204
  "trial_name": null,
205
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.04509582863585118,
14
+ "grad_norm": 7.656381058041006e-05,
15
  "learning_rate": 3.9130434782608694e-07,
16
+ "loss": 1.6138,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.09019165727170236,
21
+ "grad_norm": 6.489222141681239e-05,
22
  "learning_rate": 8.260869565217391e-07,
23
+ "loss": 1.6747,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.13528748590755355,
28
+ "grad_norm": 6.760261749150231e-05,
29
  "learning_rate": 9.97758641300553e-07,
30
+ "loss": 1.6401,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.18038331454340473,
35
+ "grad_norm": 5.8056666603079066e-05,
36
  "learning_rate": 9.841341526992535e-07,
37
+ "loss": 1.8594,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2254791431792559,
42
+ "grad_norm": 4.639743929146789e-05,
43
  "learning_rate": 9.584688140963944e-07,
44
+ "loss": 1.5358,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.2254791431792559,
49
+ "eval_loss": 1.7225253582000732,
50
+ "eval_runtime": 54.904,
51
+ "eval_samples_per_second": 3.406,
52
+ "eval_steps_per_second": 1.712,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.2705749718151071,
57
+ "grad_norm": 4.406080552143976e-05,
58
  "learning_rate": 9.214009454506752e-07,
59
+ "loss": 1.5081,
60
  "step": 60
61
  },
62
  {
63
  "epoch": 0.3156708004509583,
64
+ "grad_norm": 4.9922884500119835e-05,
65
  "learning_rate": 8.738524578558546e-07,
66
+ "loss": 1.6377,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 0.36076662908680945,
71
+ "grad_norm": 4.485138924792409e-05,
72
  "learning_rate": 8.170059247861193e-07,
73
+ "loss": 1.5535,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 0.40586245772266066,
78
+ "grad_norm": 3.757755985134281e-05,
79
  "learning_rate": 7.522751704345887e-07,
80
+ "loss": 1.3739,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 0.4509582863585118,
85
+ "grad_norm": 4.152490146225318e-05,
86
  "learning_rate": 6.812701066393123e-07,
87
+ "loss": 1.5175,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.4509582863585118,
92
+ "eval_loss": 1.597386360168457,
93
+ "eval_runtime": 54.3789,
94
+ "eval_samples_per_second": 3.439,
95
+ "eval_steps_per_second": 1.729,
96
  "step": 100
97
  },
98
  {
99
  "epoch": 0.496054114994363,
100
+ "grad_norm": 3.326448131701909e-05,
101
  "learning_rate": 6.057566929339095e-07,
102
+ "loss": 1.414,
103
  "step": 110
104
  },
105
  {
106
  "epoch": 0.5411499436302142,
107
+ "grad_norm": 3.826636020676233e-05,
108
  "learning_rate": 5.27613015552254e-07,
109
+ "loss": 1.4865,
110
  "step": 120
111
  },
112
  {
113
  "epoch": 0.5862457722660653,
114
+ "grad_norm": 3.5070326703134924e-05,
115
  "learning_rate": 4.4878257774169345e-07,
116
+ "loss": 1.4087,
117
  "step": 130
118
  },
119
  {
120
  "epoch": 0.6313416009019166,
121
+ "grad_norm": 3.05857029161416e-05,
122
  "learning_rate": 3.7122596309655174e-07,
123
+ "loss": 1.4122,
124
  "step": 140
125
  },
126
  {
127
  "epoch": 0.6764374295377678,
128
+ "grad_norm": 2.6550629627308808e-05,
129
  "learning_rate": 2.9687207408810555e-07,
130
+ "loss": 1.4691,
131
  "step": 150
132
  },
133
  {
134
  "epoch": 0.6764374295377678,
135
+ "eval_loss": 1.5566362142562866,
136
+ "eval_runtime": 54.2124,
137
+ "eval_samples_per_second": 3.449,
138
+ "eval_steps_per_second": 1.734,
139
  "step": 150
140
  },
141
  {
142
  "epoch": 0.7215332581736189,
143
+ "grad_norm": 2.8728065444738604e-05,
144
  "learning_rate": 2.275701585324649e-07,
145
+ "loss": 1.3447,
146
  "step": 160
147
  },
148
  {
149
  "epoch": 0.7666290868094702,
150
+ "grad_norm": 2.87340644717915e-05,
151
  "learning_rate": 1.6504381714107252e-07,
152
+ "loss": 1.4244,
153
  "step": 170
154
  },
155
  {
156
  "epoch": 0.8117249154453213,
157
+ "grad_norm": 2.3698501536273398e-05,
158
  "learning_rate": 1.1084813602723514e-07,
159
+ "loss": 1.4641,
160
  "step": 180
161
  },
162
  {
163
  "epoch": 0.8568207440811725,
164
+ "grad_norm": 2.6557932869764045e-05,
165
  "learning_rate": 6.633101032164273e-08,
166
+ "loss": 1.5457,
167
  "step": 190
168
  },
169
  {
170
  "epoch": 0.9019165727170236,
171
+ "grad_norm": 2.557658990554046e-05,
172
  "learning_rate": 3.2599620813200835e-08,
173
+ "loss": 1.4515,
174
  "step": 200
175
  },
176
  {
177
  "epoch": 0.9019165727170236,
178
+ "eval_loss": 1.547265648841858,
179
+ "eval_runtime": 54.2134,
180
+ "eval_samples_per_second": 3.449,
181
+ "eval_steps_per_second": 1.734,
182
  "step": 200
183
  }
184
  ],
 
199
  "attributes": {}
200
  }
201
  },
202
+ "total_flos": 3.249236263099392e+16,
203
  "train_batch_size": 2,
204
  "trial_name": null,
205
  "trial_params": null
checkpoint-200/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a42faaba2ea3bd09475b07d4ab88bb954ecf8f4575151097cfb39ccca6d6b99e
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1577259edf2dee39c9253dceded37e554842038d7b13fec3632015f49fce31ff
3
  size 5841
checkpoint-222/adapter_config.json CHANGED
@@ -33,13 +33,13 @@
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
36
- "v_proj",
37
- "down_proj",
38
  "gate_proj",
39
- "k_proj",
40
- "o_proj",
41
  "up_proj",
42
- "q_proj"
 
 
 
 
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
 
33
  "rank_pattern": {},
34
  "revision": null,
35
  "target_modules": [
 
 
36
  "gate_proj",
 
 
37
  "up_proj",
38
+ "q_proj",
39
+ "v_proj",
40
+ "o_proj",
41
+ "down_proj",
42
+ "k_proj"
43
  ],
44
  "target_parameters": null,
45
  "task_type": "CAUSAL_LM",
checkpoint-222/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:572cffd8bf978a8d938709d8aba0f8a4575a2fc54e01ac992d85aa1d9f07ebb6
3
  size 528550256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54713618def6179ffa198ff7434de6345a660583c2f65327261f7ab176e0b09d
3
  size 528550256
checkpoint-222/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa032174334a9133b4c1841cd532fbf420567e2af5c02694172ea7c4b2c831af
3
  size 1057397963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14b148d42d2443efc60568edc8f2969ee97dd7fd938c699d3e6700a7f798b979
3
  size 1057397963
checkpoint-222/trainer_state.json CHANGED
@@ -11,188 +11,188 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.04509582863585118,
14
- "grad_norm": 6.922087050043046e-05,
15
  "learning_rate": 3.9130434782608694e-07,
16
- "loss": 1.6552,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.09019165727170236,
21
- "grad_norm": 5.8874407841358334e-05,
22
  "learning_rate": 8.260869565217391e-07,
23
- "loss": 1.7092,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.13528748590755355,
28
- "grad_norm": 6.833599763922393e-05,
29
  "learning_rate": 9.97758641300553e-07,
30
- "loss": 1.6897,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.18038331454340473,
35
- "grad_norm": 5.440233871922828e-05,
36
  "learning_rate": 9.841341526992535e-07,
37
- "loss": 1.9093,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2254791431792559,
42
- "grad_norm": 4.8365614929934964e-05,
43
  "learning_rate": 9.584688140963944e-07,
44
- "loss": 1.5903,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.2254791431792559,
49
- "eval_loss": 1.7812488079071045,
50
- "eval_runtime": 44.931,
51
- "eval_samples_per_second": 4.162,
52
- "eval_steps_per_second": 2.092,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.2705749718151071,
57
- "grad_norm": 4.5753618906019256e-05,
58
  "learning_rate": 9.214009454506752e-07,
59
- "loss": 1.5573,
60
  "step": 60
61
  },
62
  {
63
  "epoch": 0.3156708004509583,
64
- "grad_norm": 5.378713467507623e-05,
65
  "learning_rate": 8.738524578558546e-07,
66
- "loss": 1.6925,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 0.36076662908680945,
71
- "grad_norm": 4.737311974167824e-05,
72
  "learning_rate": 8.170059247861193e-07,
73
- "loss": 1.6041,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 0.40586245772266066,
78
- "grad_norm": 3.957717854063958e-05,
79
  "learning_rate": 7.522751704345887e-07,
80
- "loss": 1.4225,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 0.4509582863585118,
85
- "grad_norm": 4.50208863185253e-05,
86
  "learning_rate": 6.812701066393123e-07,
87
- "loss": 1.5622,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.4509582863585118,
92
- "eval_loss": 1.6484966278076172,
93
- "eval_runtime": 44.8144,
94
- "eval_samples_per_second": 4.173,
95
- "eval_steps_per_second": 2.098,
96
  "step": 100
97
  },
98
  {
99
  "epoch": 0.496054114994363,
100
- "grad_norm": 3.848442429443821e-05,
101
  "learning_rate": 6.057566929339095e-07,
102
- "loss": 1.4544,
103
  "step": 110
104
  },
105
  {
106
  "epoch": 0.5411499436302142,
107
- "grad_norm": 4.51727319159545e-05,
108
  "learning_rate": 5.27613015552254e-07,
109
- "loss": 1.5298,
110
  "step": 120
111
  },
112
  {
113
  "epoch": 0.5862457722660653,
114
- "grad_norm": 1.8444205124978907e-05,
115
  "learning_rate": 4.4878257774169345e-07,
116
- "loss": 1.4496,
117
  "step": 130
118
  },
119
  {
120
  "epoch": 0.6313416009019166,
121
- "grad_norm": 3.5619468690129e-05,
122
  "learning_rate": 3.7122596309655174e-07,
123
- "loss": 1.4476,
124
  "step": 140
125
  },
126
  {
127
  "epoch": 0.6764374295377678,
128
- "grad_norm": 2.6366453312220983e-05,
129
  "learning_rate": 2.9687207408810555e-07,
130
- "loss": 1.4962,
131
  "step": 150
132
  },
133
  {
134
  "epoch": 0.6764374295377678,
135
- "eval_loss": 1.5979645252227783,
136
- "eval_runtime": 45.0918,
137
- "eval_samples_per_second": 4.147,
138
- "eval_steps_per_second": 2.085,
139
  "step": 150
140
  },
141
  {
142
  "epoch": 0.7215332581736189,
143
- "grad_norm": 3.0969211366027594e-05,
144
  "learning_rate": 2.275701585324649e-07,
145
- "loss": 1.374,
146
  "step": 160
147
  },
148
  {
149
  "epoch": 0.7666290868094702,
150
- "grad_norm": 3.294655471108854e-05,
151
  "learning_rate": 1.6504381714107252e-07,
152
- "loss": 1.4554,
153
  "step": 170
154
  },
155
  {
156
  "epoch": 0.8117249154453213,
157
- "grad_norm": 2.8370055588311516e-05,
158
  "learning_rate": 1.1084813602723514e-07,
159
- "loss": 1.4978,
160
  "step": 180
161
  },
162
  {
163
  "epoch": 0.8568207440811725,
164
- "grad_norm": 3.117798769380897e-05,
165
  "learning_rate": 6.633101032164273e-08,
166
- "loss": 1.5816,
167
  "step": 190
168
  },
169
  {
170
  "epoch": 0.9019165727170236,
171
- "grad_norm": 2.8335167371551506e-05,
172
  "learning_rate": 3.2599620813200835e-08,
173
- "loss": 1.484,
174
  "step": 200
175
  },
176
  {
177
  "epoch": 0.9019165727170236,
178
- "eval_loss": 1.5858721733093262,
179
- "eval_runtime": 45.0617,
180
- "eval_samples_per_second": 4.15,
181
- "eval_steps_per_second": 2.086,
182
  "step": 200
183
  },
184
  {
185
  "epoch": 0.9470124013528749,
186
- "grad_norm": 3.576183371478692e-05,
187
  "learning_rate": 1.0492897371142728e-08,
188
- "loss": 1.3657,
189
  "step": 210
190
  },
191
  {
192
  "epoch": 0.992108229988726,
193
- "grad_norm": 3.1967378163244575e-05,
194
  "learning_rate": 5.606540077782162e-10,
195
- "loss": 1.5348,
196
  "step": 220
197
  }
198
  ],
@@ -213,7 +213,7 @@
213
  "attributes": {}
214
  }
215
  },
216
- "total_flos": 3.4466834382336e+16,
217
  "train_batch_size": 2,
218
  "trial_name": null,
219
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.04509582863585118,
14
+ "grad_norm": 7.656381058041006e-05,
15
  "learning_rate": 3.9130434782608694e-07,
16
+ "loss": 1.6138,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.09019165727170236,
21
+ "grad_norm": 6.489222141681239e-05,
22
  "learning_rate": 8.260869565217391e-07,
23
+ "loss": 1.6747,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.13528748590755355,
28
+ "grad_norm": 6.760261749150231e-05,
29
  "learning_rate": 9.97758641300553e-07,
30
+ "loss": 1.6401,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.18038331454340473,
35
+ "grad_norm": 5.8056666603079066e-05,
36
  "learning_rate": 9.841341526992535e-07,
37
+ "loss": 1.8594,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.2254791431792559,
42
+ "grad_norm": 4.639743929146789e-05,
43
  "learning_rate": 9.584688140963944e-07,
44
+ "loss": 1.5358,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.2254791431792559,
49
+ "eval_loss": 1.7225253582000732,
50
+ "eval_runtime": 54.904,
51
+ "eval_samples_per_second": 3.406,
52
+ "eval_steps_per_second": 1.712,
53
  "step": 50
54
  },
55
  {
56
  "epoch": 0.2705749718151071,
57
+ "grad_norm": 4.406080552143976e-05,
58
  "learning_rate": 9.214009454506752e-07,
59
+ "loss": 1.5081,
60
  "step": 60
61
  },
62
  {
63
  "epoch": 0.3156708004509583,
64
+ "grad_norm": 4.9922884500119835e-05,
65
  "learning_rate": 8.738524578558546e-07,
66
+ "loss": 1.6377,
67
  "step": 70
68
  },
69
  {
70
  "epoch": 0.36076662908680945,
71
+ "grad_norm": 4.485138924792409e-05,
72
  "learning_rate": 8.170059247861193e-07,
73
+ "loss": 1.5535,
74
  "step": 80
75
  },
76
  {
77
  "epoch": 0.40586245772266066,
78
+ "grad_norm": 3.757755985134281e-05,
79
  "learning_rate": 7.522751704345887e-07,
80
+ "loss": 1.3739,
81
  "step": 90
82
  },
83
  {
84
  "epoch": 0.4509582863585118,
85
+ "grad_norm": 4.152490146225318e-05,
86
  "learning_rate": 6.812701066393123e-07,
87
+ "loss": 1.5175,
88
  "step": 100
89
  },
90
  {
91
  "epoch": 0.4509582863585118,
92
+ "eval_loss": 1.597386360168457,
93
+ "eval_runtime": 54.3789,
94
+ "eval_samples_per_second": 3.439,
95
+ "eval_steps_per_second": 1.729,
96
  "step": 100
97
  },
98
  {
99
  "epoch": 0.496054114994363,
100
+ "grad_norm": 3.326448131701909e-05,
101
  "learning_rate": 6.057566929339095e-07,
102
+ "loss": 1.414,
103
  "step": 110
104
  },
105
  {
106
  "epoch": 0.5411499436302142,
107
+ "grad_norm": 3.826636020676233e-05,
108
  "learning_rate": 5.27613015552254e-07,
109
+ "loss": 1.4865,
110
  "step": 120
111
  },
112
  {
113
  "epoch": 0.5862457722660653,
114
+ "grad_norm": 3.5070326703134924e-05,
115
  "learning_rate": 4.4878257774169345e-07,
116
+ "loss": 1.4087,
117
  "step": 130
118
  },
119
  {
120
  "epoch": 0.6313416009019166,
121
+ "grad_norm": 3.05857029161416e-05,
122
  "learning_rate": 3.7122596309655174e-07,
123
+ "loss": 1.4122,
124
  "step": 140
125
  },
126
  {
127
  "epoch": 0.6764374295377678,
128
+ "grad_norm": 2.6550629627308808e-05,
129
  "learning_rate": 2.9687207408810555e-07,
130
+ "loss": 1.4691,
131
  "step": 150
132
  },
133
  {
134
  "epoch": 0.6764374295377678,
135
+ "eval_loss": 1.5566362142562866,
136
+ "eval_runtime": 54.2124,
137
+ "eval_samples_per_second": 3.449,
138
+ "eval_steps_per_second": 1.734,
139
  "step": 150
140
  },
141
  {
142
  "epoch": 0.7215332581736189,
143
+ "grad_norm": 2.8728065444738604e-05,
144
  "learning_rate": 2.275701585324649e-07,
145
+ "loss": 1.3447,
146
  "step": 160
147
  },
148
  {
149
  "epoch": 0.7666290868094702,
150
+ "grad_norm": 2.87340644717915e-05,
151
  "learning_rate": 1.6504381714107252e-07,
152
+ "loss": 1.4244,
153
  "step": 170
154
  },
155
  {
156
  "epoch": 0.8117249154453213,
157
+ "grad_norm": 2.3698501536273398e-05,
158
  "learning_rate": 1.1084813602723514e-07,
159
+ "loss": 1.4641,
160
  "step": 180
161
  },
162
  {
163
  "epoch": 0.8568207440811725,
164
+ "grad_norm": 2.6557932869764045e-05,
165
  "learning_rate": 6.633101032164273e-08,
166
+ "loss": 1.5457,
167
  "step": 190
168
  },
169
  {
170
  "epoch": 0.9019165727170236,
171
+ "grad_norm": 2.557658990554046e-05,
172
  "learning_rate": 3.2599620813200835e-08,
173
+ "loss": 1.4515,
174
  "step": 200
175
  },
176
  {
177
  "epoch": 0.9019165727170236,
178
+ "eval_loss": 1.547265648841858,
179
+ "eval_runtime": 54.2134,
180
+ "eval_samples_per_second": 3.449,
181
+ "eval_steps_per_second": 1.734,
182
  "step": 200
183
  },
184
  {
185
  "epoch": 0.9470124013528749,
186
+ "grad_norm": 2.975752249767538e-05,
187
  "learning_rate": 1.0492897371142728e-08,
188
+ "loss": 1.3338,
189
  "step": 210
190
  },
191
  {
192
  "epoch": 0.992108229988726,
193
+ "grad_norm": 2.890920586651191e-05,
194
  "learning_rate": 5.606540077782162e-10,
195
+ "loss": 1.5025,
196
  "step": 220
197
  }
198
  ],
 
213
  "attributes": {}
214
  }
215
  },
216
+ "total_flos": 3.5912091671706624e+16,
217
  "train_batch_size": 2,
218
  "trial_name": null,
219
  "trial_params": null
checkpoint-222/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a42faaba2ea3bd09475b07d4ab88bb954ecf8f4575151097cfb39ccca6d6b99e
3
  size 5841
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1577259edf2dee39c9253dceded37e554842038d7b13fec3632015f49fce31ff
3
  size 5841