Rubywong123 commited on
Commit
e715d4c
·
verified ·
1 Parent(s): 5b0ee6d

Upload folder using huggingface_hub

Browse files
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.97165991902834,
3
- "total_flos": 2.060122896507863e+17,
4
- "train_loss": 0.3745692343246646,
5
- "train_runtime": 1139.0785,
6
- "train_samples": 1973,
7
- "train_samples_per_second": 3.464,
8
- "train_steps_per_second": 0.072
9
  }
 
1
  {
2
  "epoch": 1.97165991902834,
3
+ "total_flos": 2.0517622997476966e+17,
4
+ "train_loss": 0.37475141192354805,
5
+ "train_runtime": 2614.8185,
6
+ "train_samples": 1975,
7
+ "train_samples_per_second": 1.511,
8
+ "train_steps_per_second": 0.031
9
  }
checkpoint-82/global_step81/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:143394608decbe9339288031e9faceb467aa98b43f353d87525b1786d5e09332
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:772be605f016e386297e126ecd911a98b240109d72e4a8b97919ffb8eedfdd2d
3
  size 24090788620
checkpoint-82/global_step81/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:335146e66f1707b9b859c602dfd8e0c543c204505547b7a8b601e67d12d2f0e6
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3cb9e233ac73dbe5a67f97dec86d2c76ed1acbd4ca8ce3392a7e31772361dab
3
  size 24090788620
checkpoint-82/global_step81/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d3c23436c5666be9bd4adbf7c2763c464636b62b2118db073fd8f382465045e
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02736f4157c8085c75b68cb549638ac5d39448b25ef04ea0377a4f804c8cff73
3
  size 24090788620
checkpoint-82/global_step81/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:661a4c40af871e41613b232059f1ab1ee64fd650f5f24c07f256fc7b5e0c69ba
3
  size 24090788620
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:460c7a5d71d407d520aaefa94da8f760d17187f357666759932c09348b5bc2dc
3
  size 24090788620
checkpoint-82/model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8da71067fd24903a5f239e83499aa649fb0b58e78a0a859b57168ff929a8c79
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a58e68ae154c51fb0115916614d01959b36487afc2856929e63494251406ba18
3
  size 4976698672
checkpoint-82/model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b26df981b0a4e517baf613afd3a2ab28e48bdc9161575bac6b6c62e992af4b5c
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:429d1ab50399348d9ac22648c8f785528d4e4c84e24cd5b295fbdd0656119bce
3
  size 4999802720
checkpoint-82/model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8208a999548f9326ceea1d7e4c955f7e1f4213cc1dbf5464bbe280afa81fdc2a
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aa816c1409c3ea4825672525d1f7e14467e63e6bc3970d9a330c6af794dbbd1
3
  size 4915916176
checkpoint-82/model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06694afd06a3dee719db9b30789880e2db55c0c76e35d1fd6bb5f9f00f4d259e
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91585421d6f1c79a45c2b306b2310915478a949300d121910cb8f69856b08dee
3
  size 1168138808
checkpoint-82/trainer_state.json CHANGED
@@ -10,137 +10,137 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.024291497975708502,
13
- "grad_norm": 0.11209844991800896,
14
  "learning_rate": 1.111111111111111e-06,
15
- "loss": 0.4775,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1214574898785425,
20
- "grad_norm": 0.08382199120821986,
21
  "learning_rate": 5.555555555555557e-06,
22
- "loss": 0.4599,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.242914979757085,
27
- "grad_norm": 0.07640751074808542,
28
  "learning_rate": 9.995370575511151e-06,
29
- "loss": 0.3905,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3643724696356275,
34
- "grad_norm": 0.07013064191404028,
35
  "learning_rate": 9.834239068026388e-06,
36
- "loss": 0.4121,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.48582995951417,
41
- "grad_norm": 0.057718125703579023,
42
  "learning_rate": 9.450137882173385e-06,
43
- "loss": 0.3976,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6072874493927125,
48
- "grad_norm": 0.054712718743830574,
49
  "learning_rate": 8.860782922495821e-06,
50
- "loss": 0.4166,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.728744939271255,
55
- "grad_norm": 0.05369672316503755,
56
  "learning_rate": 8.093357016312518e-06,
57
- "loss": 0.4158,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8502024291497976,
62
- "grad_norm": 0.059354108055340686,
63
  "learning_rate": 7.183256159780321e-06,
64
- "loss": 0.3999,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.97165991902834,
69
- "grad_norm": 0.048784304812542124,
70
  "learning_rate": 6.1724569478520495e-06,
71
- "loss": 0.3717,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
- "eval_loss": 0.4079369306564331,
77
- "eval_runtime": 28.3728,
78
- "eval_samples_per_second": 19.35,
79
- "eval_steps_per_second": 4.864,
80
  "step": 42
81
  },
82
  {
83
  "epoch": 1.0728744939271255,
84
- "grad_norm": 0.04604995737556533,
85
  "learning_rate": 5.107580487181112e-06,
86
- "loss": 0.3224,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.194331983805668,
91
- "grad_norm": 0.04192353362499278,
92
  "learning_rate": 4.037742090145851e-06,
93
- "loss": 0.3723,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3157894736842106,
98
- "grad_norm": 0.04224220843535167,
99
  "learning_rate": 3.0122859285872214e-06,
100
- "loss": 0.3359,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4372469635627532,
105
- "grad_norm": 0.048275248650347574,
106
  "learning_rate": 2.0785091318581577e-06,
107
- "loss": 0.3142,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5587044534412957,
112
- "grad_norm": 0.04406728850959383,
113
  "learning_rate": 1.2794803006431984e-06,
114
- "loss": 0.3507,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.680161943319838,
119
- "grad_norm": 0.04494931344151913,
120
  "learning_rate": 6.52053053266945e-07,
121
- "loss": 0.3408,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8016194331983806,
126
- "grad_norm": 0.053669633291877746,
127
  "learning_rate": 2.2516622572372416e-07,
128
- "loss": 0.3379,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9230769230769231,
133
- "grad_norm": 0.0424593640450628,
134
  "learning_rate": 1.850912532696092e-08,
135
- "loss": 0.3454,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.97165991902834,
140
- "eval_loss": 0.412166029214859,
141
- "eval_runtime": 28.3377,
142
- "eval_samples_per_second": 19.373,
143
- "eval_steps_per_second": 4.87,
144
  "step": 82
145
  }
146
  ],
@@ -161,7 +161,7 @@
161
  "attributes": {}
162
  }
163
  },
164
- "total_flos": 2.060122896507863e+17,
165
  "train_batch_size": 1,
166
  "trial_name": null,
167
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.024291497975708502,
13
+ "grad_norm": 0.09191887920247531,
14
  "learning_rate": 1.111111111111111e-06,
15
+ "loss": 0.4142,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1214574898785425,
20
+ "grad_norm": 0.09020457134242624,
21
  "learning_rate": 5.555555555555557e-06,
22
+ "loss": 0.4433,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.242914979757085,
27
+ "grad_norm": 0.0766066808817807,
28
  "learning_rate": 9.995370575511151e-06,
29
+ "loss": 0.4128,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3643724696356275,
34
+ "grad_norm": 0.06441683314911666,
35
  "learning_rate": 9.834239068026388e-06,
36
+ "loss": 0.4322,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.48582995951417,
41
+ "grad_norm": 0.055907229466585676,
42
  "learning_rate": 9.450137882173385e-06,
43
+ "loss": 0.4369,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6072874493927125,
48
+ "grad_norm": 0.04914388925866741,
49
  "learning_rate": 8.860782922495821e-06,
50
+ "loss": 0.3921,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.728744939271255,
55
+ "grad_norm": 0.0453987667724486,
56
  "learning_rate": 8.093357016312518e-06,
57
+ "loss": 0.3694,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8502024291497976,
62
+ "grad_norm": 0.053149652003049455,
63
  "learning_rate": 7.183256159780321e-06,
64
+ "loss": 0.3811,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.97165991902834,
69
+ "grad_norm": 0.04557625788490129,
70
  "learning_rate": 6.1724569478520495e-06,
71
+ "loss": 0.391,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
+ "eval_loss": 0.40577438473701477,
77
+ "eval_runtime": 89.9891,
78
+ "eval_samples_per_second": 6.101,
79
+ "eval_steps_per_second": 1.534,
80
  "step": 42
81
  },
82
  {
83
  "epoch": 1.0728744939271255,
84
+ "grad_norm": 0.04223782838268721,
85
  "learning_rate": 5.107580487181112e-06,
86
+ "loss": 0.3726,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.194331983805668,
91
+ "grad_norm": 0.04745008341707443,
92
  "learning_rate": 4.037742090145851e-06,
93
+ "loss": 0.3255,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3157894736842106,
98
+ "grad_norm": 0.044998456767885005,
99
  "learning_rate": 3.0122859285872214e-06,
100
+ "loss": 0.2935,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4372469635627532,
105
+ "grad_norm": 0.047569447760125,
106
  "learning_rate": 2.0785091318581577e-06,
107
+ "loss": 0.3633,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5587044534412957,
112
+ "grad_norm": 0.042433025378520894,
113
  "learning_rate": 1.2794803006431984e-06,
114
+ "loss": 0.3413,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.680161943319838,
119
+ "grad_norm": 0.04190184177466641,
120
  "learning_rate": 6.52053053266945e-07,
121
+ "loss": 0.3395,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8016194331983806,
126
+ "grad_norm": 0.043317378643889536,
127
  "learning_rate": 2.2516622572372416e-07,
128
+ "loss": 0.3358,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9230769230769231,
133
+ "grad_norm": 0.04222433691390119,
134
  "learning_rate": 1.850912532696092e-08,
135
+ "loss": 0.3539,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.97165991902834,
140
+ "eval_loss": 0.41050779819488525,
141
+ "eval_runtime": 89.729,
142
+ "eval_samples_per_second": 6.118,
143
+ "eval_steps_per_second": 1.538,
144
  "step": 82
145
  }
146
  ],
 
161
  "attributes": {}
162
  }
163
  },
164
+ "total_flos": 2.0517622997476966e+17,
165
  "train_batch_size": 1,
166
  "trial_name": null,
167
  "trial_params": null
checkpoint-82/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e4bc04a3bba6a6d9b09b78696c097c266ac9301b7239ff4c31b2fd311ec38d
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b0a5ee5ef482c791ec4ed3e641536e91563b5072f53f5b35f73788e3114fef8
3
  size 7352
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8da71067fd24903a5f239e83499aa649fb0b58e78a0a859b57168ff929a8c79
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a58e68ae154c51fb0115916614d01959b36487afc2856929e63494251406ba18
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b26df981b0a4e517baf613afd3a2ab28e48bdc9161575bac6b6c62e992af4b5c
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:429d1ab50399348d9ac22648c8f785528d4e4c84e24cd5b295fbdd0656119bce
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8208a999548f9326ceea1d7e4c955f7e1f4213cc1dbf5464bbe280afa81fdc2a
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aa816c1409c3ea4825672525d1f7e14467e63e6bc3970d9a330c6af794dbbd1
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06694afd06a3dee719db9b30789880e2db55c0c76e35d1fd6bb5f9f00f4d259e
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91585421d6f1c79a45c2b306b2310915478a949300d121910cb8f69856b08dee
3
  size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 1.97165991902834,
3
- "total_flos": 2.060122896507863e+17,
4
- "train_loss": 0.3745692343246646,
5
- "train_runtime": 1139.0785,
6
- "train_samples": 1973,
7
- "train_samples_per_second": 3.464,
8
- "train_steps_per_second": 0.072
9
  }
 
1
  {
2
  "epoch": 1.97165991902834,
3
+ "total_flos": 2.0517622997476966e+17,
4
+ "train_loss": 0.37475141192354805,
5
+ "train_runtime": 2614.8185,
6
+ "train_samples": 1975,
7
+ "train_samples_per_second": 1.511,
8
+ "train_steps_per_second": 0.031
9
  }
trainer_state.json CHANGED
@@ -10,147 +10,147 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.024291497975708502,
13
- "grad_norm": 0.11209844991800896,
14
  "learning_rate": 1.111111111111111e-06,
15
- "loss": 0.4775,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1214574898785425,
20
- "grad_norm": 0.08382199120821986,
21
  "learning_rate": 5.555555555555557e-06,
22
- "loss": 0.4599,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.242914979757085,
27
- "grad_norm": 0.07640751074808542,
28
  "learning_rate": 9.995370575511151e-06,
29
- "loss": 0.3905,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3643724696356275,
34
- "grad_norm": 0.07013064191404028,
35
  "learning_rate": 9.834239068026388e-06,
36
- "loss": 0.4121,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.48582995951417,
41
- "grad_norm": 0.057718125703579023,
42
  "learning_rate": 9.450137882173385e-06,
43
- "loss": 0.3976,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6072874493927125,
48
- "grad_norm": 0.054712718743830574,
49
  "learning_rate": 8.860782922495821e-06,
50
- "loss": 0.4166,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.728744939271255,
55
- "grad_norm": 0.05369672316503755,
56
  "learning_rate": 8.093357016312518e-06,
57
- "loss": 0.4158,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8502024291497976,
62
- "grad_norm": 0.059354108055340686,
63
  "learning_rate": 7.183256159780321e-06,
64
- "loss": 0.3999,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.97165991902834,
69
- "grad_norm": 0.048784304812542124,
70
  "learning_rate": 6.1724569478520495e-06,
71
- "loss": 0.3717,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
- "eval_loss": 0.4079369306564331,
77
- "eval_runtime": 28.3728,
78
- "eval_samples_per_second": 19.35,
79
- "eval_steps_per_second": 4.864,
80
  "step": 42
81
  },
82
  {
83
  "epoch": 1.0728744939271255,
84
- "grad_norm": 0.04604995737556533,
85
  "learning_rate": 5.107580487181112e-06,
86
- "loss": 0.3224,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.194331983805668,
91
- "grad_norm": 0.04192353362499278,
92
  "learning_rate": 4.037742090145851e-06,
93
- "loss": 0.3723,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3157894736842106,
98
- "grad_norm": 0.04224220843535167,
99
  "learning_rate": 3.0122859285872214e-06,
100
- "loss": 0.3359,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4372469635627532,
105
- "grad_norm": 0.048275248650347574,
106
  "learning_rate": 2.0785091318581577e-06,
107
- "loss": 0.3142,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5587044534412957,
112
- "grad_norm": 0.04406728850959383,
113
  "learning_rate": 1.2794803006431984e-06,
114
- "loss": 0.3507,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.680161943319838,
119
- "grad_norm": 0.04494931344151913,
120
  "learning_rate": 6.52053053266945e-07,
121
- "loss": 0.3408,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8016194331983806,
126
- "grad_norm": 0.053669633291877746,
127
  "learning_rate": 2.2516622572372416e-07,
128
- "loss": 0.3379,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9230769230769231,
133
- "grad_norm": 0.0424593640450628,
134
  "learning_rate": 1.850912532696092e-08,
135
- "loss": 0.3454,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.97165991902834,
140
- "eval_loss": 0.412166029214859,
141
- "eval_runtime": 28.3377,
142
- "eval_samples_per_second": 19.373,
143
- "eval_steps_per_second": 4.87,
144
  "step": 82
145
  },
146
  {
147
  "epoch": 1.97165991902834,
148
  "step": 82,
149
- "total_flos": 2.060122896507863e+17,
150
- "train_loss": 0.3745692343246646,
151
- "train_runtime": 1139.0785,
152
- "train_samples_per_second": 3.464,
153
- "train_steps_per_second": 0.072
154
  }
155
  ],
156
  "logging_steps": 5,
@@ -170,7 +170,7 @@
170
  "attributes": {}
171
  }
172
  },
173
- "total_flos": 2.060122896507863e+17,
174
  "train_batch_size": 1,
175
  "trial_name": null,
176
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.024291497975708502,
13
+ "grad_norm": 0.09191887920247531,
14
  "learning_rate": 1.111111111111111e-06,
15
+ "loss": 0.4142,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.1214574898785425,
20
+ "grad_norm": 0.09020457134242624,
21
  "learning_rate": 5.555555555555557e-06,
22
+ "loss": 0.4433,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.242914979757085,
27
+ "grad_norm": 0.0766066808817807,
28
  "learning_rate": 9.995370575511151e-06,
29
+ "loss": 0.4128,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.3643724696356275,
34
+ "grad_norm": 0.06441683314911666,
35
  "learning_rate": 9.834239068026388e-06,
36
+ "loss": 0.4322,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.48582995951417,
41
+ "grad_norm": 0.055907229466585676,
42
  "learning_rate": 9.450137882173385e-06,
43
+ "loss": 0.4369,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.6072874493927125,
48
+ "grad_norm": 0.04914388925866741,
49
  "learning_rate": 8.860782922495821e-06,
50
+ "loss": 0.3921,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.728744939271255,
55
+ "grad_norm": 0.0453987667724486,
56
  "learning_rate": 8.093357016312518e-06,
57
+ "loss": 0.3694,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.8502024291497976,
62
+ "grad_norm": 0.053149652003049455,
63
  "learning_rate": 7.183256159780321e-06,
64
+ "loss": 0.3811,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.97165991902834,
69
+ "grad_norm": 0.04557625788490129,
70
  "learning_rate": 6.1724569478520495e-06,
71
+ "loss": 0.391,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 1.0,
76
+ "eval_loss": 0.40577438473701477,
77
+ "eval_runtime": 89.9891,
78
+ "eval_samples_per_second": 6.101,
79
+ "eval_steps_per_second": 1.534,
80
  "step": 42
81
  },
82
  {
83
  "epoch": 1.0728744939271255,
84
+ "grad_norm": 0.04223782838268721,
85
  "learning_rate": 5.107580487181112e-06,
86
+ "loss": 0.3726,
87
  "step": 45
88
  },
89
  {
90
  "epoch": 1.194331983805668,
91
+ "grad_norm": 0.04745008341707443,
92
  "learning_rate": 4.037742090145851e-06,
93
+ "loss": 0.3255,
94
  "step": 50
95
  },
96
  {
97
  "epoch": 1.3157894736842106,
98
+ "grad_norm": 0.044998456767885005,
99
  "learning_rate": 3.0122859285872214e-06,
100
+ "loss": 0.2935,
101
  "step": 55
102
  },
103
  {
104
  "epoch": 1.4372469635627532,
105
+ "grad_norm": 0.047569447760125,
106
  "learning_rate": 2.0785091318581577e-06,
107
+ "loss": 0.3633,
108
  "step": 60
109
  },
110
  {
111
  "epoch": 1.5587044534412957,
112
+ "grad_norm": 0.042433025378520894,
113
  "learning_rate": 1.2794803006431984e-06,
114
+ "loss": 0.3413,
115
  "step": 65
116
  },
117
  {
118
  "epoch": 1.680161943319838,
119
+ "grad_norm": 0.04190184177466641,
120
  "learning_rate": 6.52053053266945e-07,
121
+ "loss": 0.3395,
122
  "step": 70
123
  },
124
  {
125
  "epoch": 1.8016194331983806,
126
+ "grad_norm": 0.043317378643889536,
127
  "learning_rate": 2.2516622572372416e-07,
128
+ "loss": 0.3358,
129
  "step": 75
130
  },
131
  {
132
  "epoch": 1.9230769230769231,
133
+ "grad_norm": 0.04222433691390119,
134
  "learning_rate": 1.850912532696092e-08,
135
+ "loss": 0.3539,
136
  "step": 80
137
  },
138
  {
139
  "epoch": 1.97165991902834,
140
+ "eval_loss": 0.41050779819488525,
141
+ "eval_runtime": 89.729,
142
+ "eval_samples_per_second": 6.118,
143
+ "eval_steps_per_second": 1.538,
144
  "step": 82
145
  },
146
  {
147
  "epoch": 1.97165991902834,
148
  "step": 82,
149
+ "total_flos": 2.0517622997476966e+17,
150
+ "train_loss": 0.37475141192354805,
151
+ "train_runtime": 2614.8185,
152
+ "train_samples_per_second": 1.511,
153
+ "train_steps_per_second": 0.031
154
  }
155
  ],
156
  "logging_steps": 5,
 
170
  "attributes": {}
171
  }
172
  },
173
+ "total_flos": 2.0517622997476966e+17,
174
  "train_batch_size": 1,
175
  "trial_name": null,
176
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03e4bc04a3bba6a6d9b09b78696c097c266ac9301b7239ff4c31b2fd311ec38d
3
  size 7352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b0a5ee5ef482c791ec4ed3e641536e91563b5072f53f5b35f73788e3114fef8
3
  size 7352