EN3S commited on
Commit
3c00ead
·
verified ·
1 Parent(s): e497cdd

Training in progress, epoch 4

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86da292e44eb636b62e95367ef4936d89b7b6baee6fe9119a8b62c9e977f0c9d
3
  size 437958648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1543d11d39ae4983ad4b56210a391fd44e53d905d92d2536f8ce4494e3795db5
3
  size 437958648
run-0/checkpoint-117/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17e0c9d1a3cf96b438c626370ec0758ae280b04631d45470154b5eca1293573f
3
  size 437958648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8672e581da9d8df927955f18ee97b1fb7bc4b72d39bba7c191508198135749ab
3
  size 437958648
run-0/checkpoint-117/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02f3dffe3a080aec80d4aa45517d6cb1c8020dc49d3393ae96f05506fb56d8d1
3
  size 876038394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7c35b295bba54ba1b241603995ce1811c2382076f99713945e46fceacbb9937
3
  size 876038394
run-0/checkpoint-117/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2aa20791cd3401b748110a053f719d6902e4d9ccc845f2f5d2ff250a3d27441
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9df7fcee919248151322838e3ee2bda70121eb4c651bc0be349c4f5f62deeaf
3
  size 5432
run-0/checkpoint-156/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:45363ce679a8dfd6a6ce8f3513e67b5693b6d30b7c4329ec9c084a47504e9ba8
3
  size 437958648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1543d11d39ae4983ad4b56210a391fd44e53d905d92d2536f8ce4494e3795db5
3
  size 437958648
run-0/checkpoint-156/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ebd079703cd72b12a422caae45df454bcdc3dda626ba153bd836afb84b1093d
3
  size 876038394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:889253018f7f4ef7fe5b3708a9b99fd213e993b42054f01aab66786ee4ea3395
3
  size 876038394
run-0/checkpoint-156/trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "best_global_step": 156,
3
- "best_metric": 0.7003610108303249,
4
  "best_model_checkpoint": "bert-base-uncased-finetuned-rte-run_14/run-0/checkpoint-156",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
@@ -11,21 +11,21 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.2564102564102564,
14
- "grad_norm": 1.662625789642334,
15
  "learning_rate": 9.487179487179487e-05,
16
  "loss": 0.696,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.5128205128205128,
21
- "grad_norm": 2.0300352573394775,
22
  "learning_rate": 8.974358974358975e-05,
23
  "loss": 0.6793,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.7692307692307693,
28
- "grad_norm": 4.492157936096191,
29
  "learning_rate": 8.461538461538461e-05,
30
  "loss": 0.6499,
31
  "step": 30
@@ -33,121 +33,121 @@
33
  {
34
  "epoch": 1.0,
35
  "eval_accuracy": 0.631768953068592,
36
- "eval_loss": 0.6312768459320068,
37
- "eval_runtime": 0.6623,
38
- "eval_samples_per_second": 418.266,
39
- "eval_steps_per_second": 7.55,
40
  "step": 39
41
  },
42
  {
43
  "epoch": 1.0256410256410255,
44
- "grad_norm": 3.4885644912719727,
45
  "learning_rate": 7.948717948717948e-05,
46
  "loss": 0.6793,
47
  "step": 40
48
  },
49
  {
50
  "epoch": 1.282051282051282,
51
- "grad_norm": 5.2225494384765625,
52
  "learning_rate": 7.435897435897436e-05,
53
- "loss": 0.5596,
54
  "step": 50
55
  },
56
  {
57
  "epoch": 1.5384615384615383,
58
- "grad_norm": 6.484560489654541,
59
  "learning_rate": 6.923076923076924e-05,
60
- "loss": 0.5713,
61
  "step": 60
62
  },
63
  {
64
  "epoch": 1.7948717948717947,
65
- "grad_norm": 4.836739540100098,
66
  "learning_rate": 6.410256410256412e-05,
67
- "loss": 0.545,
68
  "step": 70
69
  },
70
  {
71
  "epoch": 2.0,
72
- "eval_accuracy": 0.6714801444043321,
73
- "eval_loss": 0.658456563949585,
74
- "eval_runtime": 0.6622,
75
- "eval_samples_per_second": 418.306,
76
- "eval_steps_per_second": 7.551,
77
  "step": 78
78
  },
79
  {
80
  "epoch": 2.051282051282051,
81
- "grad_norm": 6.515610218048096,
82
  "learning_rate": 5.897435897435898e-05,
83
- "loss": 0.4786,
84
  "step": 80
85
  },
86
  {
87
  "epoch": 2.3076923076923075,
88
- "grad_norm": 5.974998950958252,
89
  "learning_rate": 5.384615384615385e-05,
90
- "loss": 0.3373,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 2.564102564102564,
95
- "grad_norm": 2.976608991622925,
96
  "learning_rate": 4.871794871794872e-05,
97
- "loss": 0.3314,
98
  "step": 100
99
  },
100
  {
101
  "epoch": 2.8205128205128203,
102
- "grad_norm": 3.50764799118042,
103
  "learning_rate": 4.358974358974359e-05,
104
- "loss": 0.3235,
105
  "step": 110
106
  },
107
  {
108
  "epoch": 3.0,
109
- "eval_accuracy": 0.6714801444043321,
110
- "eval_loss": 0.7251453399658203,
111
- "eval_runtime": 0.6621,
112
- "eval_samples_per_second": 418.365,
113
- "eval_steps_per_second": 7.552,
114
  "step": 117
115
  },
116
  {
117
  "epoch": 3.076923076923077,
118
- "grad_norm": 3.907212495803833,
119
  "learning_rate": 3.846153846153846e-05,
120
- "loss": 0.2728,
121
  "step": 120
122
  },
123
  {
124
  "epoch": 3.3333333333333335,
125
- "grad_norm": 7.000370979309082,
126
  "learning_rate": 3.3333333333333335e-05,
127
- "loss": 0.1829,
128
  "step": 130
129
  },
130
  {
131
  "epoch": 3.58974358974359,
132
- "grad_norm": 7.436763763427734,
133
  "learning_rate": 2.8205128205128207e-05,
134
- "loss": 0.1877,
135
  "step": 140
136
  },
137
  {
138
  "epoch": 3.8461538461538463,
139
- "grad_norm": 7.767152786254883,
140
  "learning_rate": 2.307692307692308e-05,
141
- "loss": 0.1335,
142
  "step": 150
143
  },
144
  {
145
  "epoch": 4.0,
146
- "eval_accuracy": 0.7003610108303249,
147
- "eval_loss": 0.9089646935462952,
148
- "eval_runtime": 0.6606,
149
- "eval_samples_per_second": 419.294,
150
- "eval_steps_per_second": 7.568,
151
  "step": 156
152
  }
153
  ],
 
1
  {
2
  "best_global_step": 156,
3
+ "best_metric": 0.6967509025270758,
4
  "best_model_checkpoint": "bert-base-uncased-finetuned-rte-run_14/run-0/checkpoint-156",
5
  "epoch": 4.0,
6
  "eval_steps": 500,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.2564102564102564,
14
+ "grad_norm": 1.662626028060913,
15
  "learning_rate": 9.487179487179487e-05,
16
  "loss": 0.696,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.5128205128205128,
21
+ "grad_norm": 2.0300467014312744,
22
  "learning_rate": 8.974358974358975e-05,
23
  "loss": 0.6793,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.7692307692307693,
28
+ "grad_norm": 4.490738868713379,
29
  "learning_rate": 8.461538461538461e-05,
30
  "loss": 0.6499,
31
  "step": 30
 
33
  {
34
  "epoch": 1.0,
35
  "eval_accuracy": 0.631768953068592,
36
+ "eval_loss": 0.6309793591499329,
37
+ "eval_runtime": 0.6606,
38
+ "eval_samples_per_second": 419.298,
39
+ "eval_steps_per_second": 7.569,
40
  "step": 39
41
  },
42
  {
43
  "epoch": 1.0256410256410255,
44
+ "grad_norm": 3.481339454650879,
45
  "learning_rate": 7.948717948717948e-05,
46
  "loss": 0.6793,
47
  "step": 40
48
  },
49
  {
50
  "epoch": 1.282051282051282,
51
+ "grad_norm": 4.932971477508545,
52
  "learning_rate": 7.435897435897436e-05,
53
+ "loss": 0.5602,
54
  "step": 50
55
  },
56
  {
57
  "epoch": 1.5384615384615383,
58
+ "grad_norm": 7.099682807922363,
59
  "learning_rate": 6.923076923076924e-05,
60
+ "loss": 0.5998,
61
  "step": 60
62
  },
63
  {
64
  "epoch": 1.7948717948717947,
65
+ "grad_norm": 3.696152925491333,
66
  "learning_rate": 6.410256410256412e-05,
67
+ "loss": 0.5403,
68
  "step": 70
69
  },
70
  {
71
  "epoch": 2.0,
72
+ "eval_accuracy": 0.6931407942238267,
73
+ "eval_loss": 0.6287456154823303,
74
+ "eval_runtime": 0.6619,
75
+ "eval_samples_per_second": 418.511,
76
+ "eval_steps_per_second": 7.554,
77
  "step": 78
78
  },
79
  {
80
  "epoch": 2.051282051282051,
81
+ "grad_norm": 3.8591785430908203,
82
  "learning_rate": 5.897435897435898e-05,
83
+ "loss": 0.4872,
84
  "step": 80
85
  },
86
  {
87
  "epoch": 2.3076923076923075,
88
+ "grad_norm": 4.27885627746582,
89
  "learning_rate": 5.384615384615385e-05,
90
+ "loss": 0.3396,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 2.564102564102564,
95
+ "grad_norm": 4.205892562866211,
96
  "learning_rate": 4.871794871794872e-05,
97
+ "loss": 0.3372,
98
  "step": 100
99
  },
100
  {
101
  "epoch": 2.8205128205128203,
102
+ "grad_norm": 5.800762176513672,
103
  "learning_rate": 4.358974358974359e-05,
104
+ "loss": 0.3422,
105
  "step": 110
106
  },
107
  {
108
  "epoch": 3.0,
109
+ "eval_accuracy": 0.6787003610108303,
110
+ "eval_loss": 0.7227063179016113,
111
+ "eval_runtime": 0.6696,
112
+ "eval_samples_per_second": 413.706,
113
+ "eval_steps_per_second": 7.468,
114
  "step": 117
115
  },
116
  {
117
  "epoch": 3.076923076923077,
118
+ "grad_norm": 3.5253195762634277,
119
  "learning_rate": 3.846153846153846e-05,
120
+ "loss": 0.2668,
121
  "step": 120
122
  },
123
  {
124
  "epoch": 3.3333333333333335,
125
+ "grad_norm": 11.2240629196167,
126
  "learning_rate": 3.3333333333333335e-05,
127
+ "loss": 0.1835,
128
  "step": 130
129
  },
130
  {
131
  "epoch": 3.58974358974359,
132
+ "grad_norm": 5.6398420333862305,
133
  "learning_rate": 2.8205128205128207e-05,
134
+ "loss": 0.1725,
135
  "step": 140
136
  },
137
  {
138
  "epoch": 3.8461538461538463,
139
+ "grad_norm": 6.835488319396973,
140
  "learning_rate": 2.307692307692308e-05,
141
+ "loss": 0.1297,
142
  "step": 150
143
  },
144
  {
145
  "epoch": 4.0,
146
+ "eval_accuracy": 0.6967509025270758,
147
+ "eval_loss": 0.9852063655853271,
148
+ "eval_runtime": 0.6608,
149
+ "eval_samples_per_second": 419.212,
150
+ "eval_steps_per_second": 7.567,
151
  "step": 156
152
  }
153
  ],
run-0/checkpoint-156/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2aa20791cd3401b748110a053f719d6902e4d9ccc845f2f5d2ff250a3d27441
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9df7fcee919248151322838e3ee2bda70121eb4c651bc0be349c4f5f62deeaf
3
  size 5432
run-0/checkpoint-195/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_global_step": 195,
3
- "best_metric": 0.7111913357400722,
4
- "best_model_checkpoint": "bert-base-uncased-finetuned-rte-run_14/run-0/checkpoint-195",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 195,
@@ -11,21 +11,21 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.2564102564102564,
14
- "grad_norm": 1.662625789642334,
15
  "learning_rate": 9.487179487179487e-05,
16
  "loss": 0.696,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.5128205128205128,
21
- "grad_norm": 2.0300352573394775,
22
  "learning_rate": 8.974358974358975e-05,
23
  "loss": 0.6793,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.7692307692307693,
28
- "grad_norm": 4.492157936096191,
29
  "learning_rate": 8.461538461538461e-05,
30
  "loss": 0.6499,
31
  "step": 30
@@ -33,158 +33,158 @@
33
  {
34
  "epoch": 1.0,
35
  "eval_accuracy": 0.631768953068592,
36
- "eval_loss": 0.6312768459320068,
37
- "eval_runtime": 0.6623,
38
- "eval_samples_per_second": 418.266,
39
- "eval_steps_per_second": 7.55,
40
  "step": 39
41
  },
42
  {
43
  "epoch": 1.0256410256410255,
44
- "grad_norm": 3.4885644912719727,
45
  "learning_rate": 7.948717948717948e-05,
46
  "loss": 0.6793,
47
  "step": 40
48
  },
49
  {
50
  "epoch": 1.282051282051282,
51
- "grad_norm": 5.2225494384765625,
52
  "learning_rate": 7.435897435897436e-05,
53
- "loss": 0.5596,
54
  "step": 50
55
  },
56
  {
57
  "epoch": 1.5384615384615383,
58
- "grad_norm": 6.484560489654541,
59
  "learning_rate": 6.923076923076924e-05,
60
- "loss": 0.5713,
61
  "step": 60
62
  },
63
  {
64
  "epoch": 1.7948717948717947,
65
- "grad_norm": 4.836739540100098,
66
  "learning_rate": 6.410256410256412e-05,
67
- "loss": 0.545,
68
  "step": 70
69
  },
70
  {
71
  "epoch": 2.0,
72
- "eval_accuracy": 0.6714801444043321,
73
- "eval_loss": 0.658456563949585,
74
- "eval_runtime": 0.6622,
75
- "eval_samples_per_second": 418.306,
76
- "eval_steps_per_second": 7.551,
77
  "step": 78
78
  },
79
  {
80
  "epoch": 2.051282051282051,
81
- "grad_norm": 6.515610218048096,
82
  "learning_rate": 5.897435897435898e-05,
83
- "loss": 0.4786,
84
  "step": 80
85
  },
86
  {
87
  "epoch": 2.3076923076923075,
88
- "grad_norm": 5.974998950958252,
89
  "learning_rate": 5.384615384615385e-05,
90
- "loss": 0.3373,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 2.564102564102564,
95
- "grad_norm": 2.976608991622925,
96
  "learning_rate": 4.871794871794872e-05,
97
- "loss": 0.3314,
98
  "step": 100
99
  },
100
  {
101
  "epoch": 2.8205128205128203,
102
- "grad_norm": 3.50764799118042,
103
  "learning_rate": 4.358974358974359e-05,
104
- "loss": 0.3235,
105
  "step": 110
106
  },
107
  {
108
  "epoch": 3.0,
109
- "eval_accuracy": 0.6714801444043321,
110
- "eval_loss": 0.7251453399658203,
111
- "eval_runtime": 0.6621,
112
- "eval_samples_per_second": 418.365,
113
- "eval_steps_per_second": 7.552,
114
  "step": 117
115
  },
116
  {
117
  "epoch": 3.076923076923077,
118
- "grad_norm": 3.907212495803833,
119
  "learning_rate": 3.846153846153846e-05,
120
- "loss": 0.2728,
121
  "step": 120
122
  },
123
  {
124
  "epoch": 3.3333333333333335,
125
- "grad_norm": 7.000370979309082,
126
  "learning_rate": 3.3333333333333335e-05,
127
- "loss": 0.1829,
128
  "step": 130
129
  },
130
  {
131
  "epoch": 3.58974358974359,
132
- "grad_norm": 7.436763763427734,
133
  "learning_rate": 2.8205128205128207e-05,
134
- "loss": 0.1877,
135
  "step": 140
136
  },
137
  {
138
  "epoch": 3.8461538461538463,
139
- "grad_norm": 7.767152786254883,
140
  "learning_rate": 2.307692307692308e-05,
141
- "loss": 0.1335,
142
  "step": 150
143
  },
144
  {
145
  "epoch": 4.0,
146
- "eval_accuracy": 0.7003610108303249,
147
- "eval_loss": 0.9089646935462952,
148
- "eval_runtime": 0.6606,
149
- "eval_samples_per_second": 419.294,
150
- "eval_steps_per_second": 7.568,
151
  "step": 156
152
  },
153
  {
154
  "epoch": 4.102564102564102,
155
- "grad_norm": 2.6948187351226807,
156
  "learning_rate": 1.794871794871795e-05,
157
- "loss": 0.1229,
158
  "step": 160
159
  },
160
  {
161
  "epoch": 4.358974358974359,
162
- "grad_norm": 3.5418930053710938,
163
  "learning_rate": 1.282051282051282e-05,
164
- "loss": 0.0868,
165
  "step": 170
166
  },
167
  {
168
  "epoch": 4.615384615384615,
169
- "grad_norm": 6.394577980041504,
170
  "learning_rate": 7.692307692307694e-06,
171
- "loss": 0.0624,
172
  "step": 180
173
  },
174
  {
175
  "epoch": 4.871794871794872,
176
- "grad_norm": 7.906170845031738,
177
  "learning_rate": 2.564102564102564e-06,
178
- "loss": 0.0608,
179
  "step": 190
180
  },
181
  {
182
  "epoch": 5.0,
183
- "eval_accuracy": 0.7111913357400722,
184
- "eval_loss": 1.0780714750289917,
185
- "eval_runtime": 0.6628,
186
- "eval_samples_per_second": 417.893,
187
- "eval_steps_per_second": 7.543,
188
  "step": 195
189
  }
190
  ],
 
1
  {
2
+ "best_global_step": 156,
3
+ "best_metric": 0.6967509025270758,
4
+ "best_model_checkpoint": "bert-base-uncased-finetuned-rte-run_14/run-0/checkpoint-156",
5
  "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 195,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.2564102564102564,
14
+ "grad_norm": 1.662626028060913,
15
  "learning_rate": 9.487179487179487e-05,
16
  "loss": 0.696,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.5128205128205128,
21
+ "grad_norm": 2.0300467014312744,
22
  "learning_rate": 8.974358974358975e-05,
23
  "loss": 0.6793,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.7692307692307693,
28
+ "grad_norm": 4.490738868713379,
29
  "learning_rate": 8.461538461538461e-05,
30
  "loss": 0.6499,
31
  "step": 30
 
33
  {
34
  "epoch": 1.0,
35
  "eval_accuracy": 0.631768953068592,
36
+ "eval_loss": 0.6309793591499329,
37
+ "eval_runtime": 0.6606,
38
+ "eval_samples_per_second": 419.298,
39
+ "eval_steps_per_second": 7.569,
40
  "step": 39
41
  },
42
  {
43
  "epoch": 1.0256410256410255,
44
+ "grad_norm": 3.481339454650879,
45
  "learning_rate": 7.948717948717948e-05,
46
  "loss": 0.6793,
47
  "step": 40
48
  },
49
  {
50
  "epoch": 1.282051282051282,
51
+ "grad_norm": 4.932971477508545,
52
  "learning_rate": 7.435897435897436e-05,
53
+ "loss": 0.5602,
54
  "step": 50
55
  },
56
  {
57
  "epoch": 1.5384615384615383,
58
+ "grad_norm": 7.099682807922363,
59
  "learning_rate": 6.923076923076924e-05,
60
+ "loss": 0.5998,
61
  "step": 60
62
  },
63
  {
64
  "epoch": 1.7948717948717947,
65
+ "grad_norm": 3.696152925491333,
66
  "learning_rate": 6.410256410256412e-05,
67
+ "loss": 0.5403,
68
  "step": 70
69
  },
70
  {
71
  "epoch": 2.0,
72
+ "eval_accuracy": 0.6931407942238267,
73
+ "eval_loss": 0.6287456154823303,
74
+ "eval_runtime": 0.6619,
75
+ "eval_samples_per_second": 418.511,
76
+ "eval_steps_per_second": 7.554,
77
  "step": 78
78
  },
79
  {
80
  "epoch": 2.051282051282051,
81
+ "grad_norm": 3.8591785430908203,
82
  "learning_rate": 5.897435897435898e-05,
83
+ "loss": 0.4872,
84
  "step": 80
85
  },
86
  {
87
  "epoch": 2.3076923076923075,
88
+ "grad_norm": 4.27885627746582,
89
  "learning_rate": 5.384615384615385e-05,
90
+ "loss": 0.3396,
91
  "step": 90
92
  },
93
  {
94
  "epoch": 2.564102564102564,
95
+ "grad_norm": 4.205892562866211,
96
  "learning_rate": 4.871794871794872e-05,
97
+ "loss": 0.3372,
98
  "step": 100
99
  },
100
  {
101
  "epoch": 2.8205128205128203,
102
+ "grad_norm": 5.800762176513672,
103
  "learning_rate": 4.358974358974359e-05,
104
+ "loss": 0.3422,
105
  "step": 110
106
  },
107
  {
108
  "epoch": 3.0,
109
+ "eval_accuracy": 0.6787003610108303,
110
+ "eval_loss": 0.7227063179016113,
111
+ "eval_runtime": 0.6696,
112
+ "eval_samples_per_second": 413.706,
113
+ "eval_steps_per_second": 7.468,
114
  "step": 117
115
  },
116
  {
117
  "epoch": 3.076923076923077,
118
+ "grad_norm": 3.5253195762634277,
119
  "learning_rate": 3.846153846153846e-05,
120
+ "loss": 0.2668,
121
  "step": 120
122
  },
123
  {
124
  "epoch": 3.3333333333333335,
125
+ "grad_norm": 11.2240629196167,
126
  "learning_rate": 3.3333333333333335e-05,
127
+ "loss": 0.1835,
128
  "step": 130
129
  },
130
  {
131
  "epoch": 3.58974358974359,
132
+ "grad_norm": 5.6398420333862305,
133
  "learning_rate": 2.8205128205128207e-05,
134
+ "loss": 0.1725,
135
  "step": 140
136
  },
137
  {
138
  "epoch": 3.8461538461538463,
139
+ "grad_norm": 6.835488319396973,
140
  "learning_rate": 2.307692307692308e-05,
141
+ "loss": 0.1297,
142
  "step": 150
143
  },
144
  {
145
  "epoch": 4.0,
146
+ "eval_accuracy": 0.6967509025270758,
147
+ "eval_loss": 0.9852063655853271,
148
+ "eval_runtime": 0.6608,
149
+ "eval_samples_per_second": 419.212,
150
+ "eval_steps_per_second": 7.567,
151
  "step": 156
152
  },
153
  {
154
  "epoch": 4.102564102564102,
155
+ "grad_norm": 1.7055132389068604,
156
  "learning_rate": 1.794871794871795e-05,
157
+ "loss": 0.1088,
158
  "step": 160
159
  },
160
  {
161
  "epoch": 4.358974358974359,
162
+ "grad_norm": 4.614296913146973,
163
  "learning_rate": 1.282051282051282e-05,
164
+ "loss": 0.0803,
165
  "step": 170
166
  },
167
  {
168
  "epoch": 4.615384615384615,
169
+ "grad_norm": 4.053183555603027,
170
  "learning_rate": 7.692307692307694e-06,
171
+ "loss": 0.0625,
172
  "step": 180
173
  },
174
  {
175
  "epoch": 4.871794871794872,
176
+ "grad_norm": 7.156663417816162,
177
  "learning_rate": 2.564102564102564e-06,
178
+ "loss": 0.0686,
179
  "step": 190
180
  },
181
  {
182
  "epoch": 5.0,
183
+ "eval_accuracy": 0.6859205776173285,
184
+ "eval_loss": 1.0840200185775757,
185
+ "eval_runtime": 0.6626,
186
+ "eval_samples_per_second": 418.066,
187
+ "eval_steps_per_second": 7.546,
188
  "step": 195
189
  }
190
  ],
run-0/checkpoint-78/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87efa456251f68adc0b2b9363c9086483d78108b5a3a35553d7869669813f8d9
3
  size 437958648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b4648ded986884eb769e5736bda3f39ff6235308b7ff65c5d9cae456d76b11
3
  size 437958648
run-0/checkpoint-78/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca0f0afc8ff9b1bbf70aee7c27e39d28e11475bb7312389acd7e3b7b91c16532
3
  size 876038394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee1e01f564ab20f9119e0efa77357cde7af5a4cb8d40e9d5be30ce84b313f8cb
3
  size 876038394