Ibisbill commited on
Commit
0ae9922
·
verified ·
1 Parent(s): 07d4615

Upload checkpoint-30

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e62180cb948d2fc691d3e6bdc6e6cec102b87801a5cf8ec6f9106fd0f93b31f
3
  size 4967215360
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd2cdece46b452ec010d55b01a2b724d773ed26dbe3890a2db995aa8af278f15
3
  size 4967215360
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b479e35fca65e6130d442f9b05c99cff1fb1b1f526bbc383810e2ba6c4a8933c
3
  size 3077766632
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70df4cbf577b2f3bb995d52059564bda214f5c93f5709b885a6135f6035a38f7
3
  size 3077766632
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.7007299270072993,
6
  "eval_steps": 1000,
7
  "global_step": 30,
8
  "is_hyper_param_search": false,
@@ -10,220 +10,220 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.058394160583941604,
14
- "grad_norm": 4.656690719780717,
15
  "learning_rate": 0.0,
16
- "loss": 0.4427,
17
  "step": 1
18
  },
19
  {
20
- "epoch": 0.11678832116788321,
21
- "grad_norm": 4.86023891396574,
22
- "learning_rate": 1.6666666666666667e-06,
23
- "loss": 0.4479,
24
  "step": 2
25
  },
26
  {
27
- "epoch": 0.17518248175182483,
28
- "grad_norm": 4.720432036194861,
29
- "learning_rate": 3.3333333333333333e-06,
30
- "loss": 0.438,
31
  "step": 3
32
  },
33
  {
34
- "epoch": 0.23357664233576642,
35
- "grad_norm": 2.7065388639758448,
36
- "learning_rate": 5e-06,
37
- "loss": 0.4169,
38
  "step": 4
39
  },
40
  {
41
- "epoch": 0.291970802919708,
42
- "grad_norm": 2.1869690387653904,
43
- "learning_rate": 6.666666666666667e-06,
44
- "loss": 0.4031,
45
  "step": 5
46
  },
47
  {
48
- "epoch": 0.35036496350364965,
49
- "grad_norm": 1.6374910771466962,
50
- "learning_rate": 8.333333333333334e-06,
51
- "loss": 0.3685,
52
  "step": 6
53
  },
54
  {
55
- "epoch": 0.40875912408759124,
56
- "grad_norm": 1.5026227694746652,
57
- "learning_rate": 1e-05,
58
- "loss": 0.3638,
59
  "step": 7
60
  },
61
  {
62
- "epoch": 0.46715328467153283,
63
- "grad_norm": 1.7874379044083826,
64
- "learning_rate": 9.987820251299121e-06,
65
- "loss": 0.3384,
66
  "step": 8
67
  },
68
  {
69
- "epoch": 0.5255474452554745,
70
- "grad_norm": 1.399545646496995,
71
- "learning_rate": 9.951340343707852e-06,
72
- "loss": 0.3199,
73
  "step": 9
74
  },
75
  {
76
- "epoch": 0.583941605839416,
77
- "grad_norm": 0.9591867357924391,
78
- "learning_rate": 9.890738003669029e-06,
79
- "loss": 0.3052,
80
  "step": 10
81
  },
82
  {
83
- "epoch": 0.6423357664233577,
84
- "grad_norm": 1.1005637398680819,
85
- "learning_rate": 9.806308479691595e-06,
86
- "loss": 0.3048,
87
  "step": 11
88
  },
89
  {
90
- "epoch": 0.7007299270072993,
91
- "grad_norm": 1.0830814869796386,
92
- "learning_rate": 9.698463103929542e-06,
93
- "loss": 0.3054,
94
  "step": 12
95
  },
96
  {
97
- "epoch": 0.7591240875912408,
98
- "grad_norm": 0.6865164477755549,
99
- "learning_rate": 9.567727288213005e-06,
100
- "loss": 0.2981,
101
  "step": 13
102
  },
103
  {
104
- "epoch": 0.8175182481751825,
105
- "grad_norm": 0.5516228834269103,
106
- "learning_rate": 9.414737964294636e-06,
107
- "loss": 0.2911,
108
  "step": 14
109
  },
110
  {
111
- "epoch": 0.8759124087591241,
112
- "grad_norm": 0.5820793130011475,
113
- "learning_rate": 9.24024048078213e-06,
114
- "loss": 0.2853,
115
  "step": 15
116
  },
117
  {
118
- "epoch": 0.9343065693430657,
119
- "grad_norm": 0.4932271006804814,
120
- "learning_rate": 9.045084971874738e-06,
121
- "loss": 0.2869,
122
  "step": 16
123
  },
124
  {
125
- "epoch": 0.9927007299270073,
126
- "grad_norm": 0.4055326985718036,
127
- "learning_rate": 8.83022221559489e-06,
128
- "loss": 0.2839,
129
  "step": 17
130
  },
131
  {
132
- "epoch": 1.0,
133
- "grad_norm": 0.4055326985718036,
134
- "learning_rate": 8.596699001693257e-06,
135
- "loss": 0.2804,
136
  "step": 18
137
  },
138
  {
139
- "epoch": 1.0583941605839415,
140
- "grad_norm": 0.7337766666428717,
141
- "learning_rate": 8.345653031794292e-06,
142
- "loss": 0.2726,
143
  "step": 19
144
  },
145
  {
146
- "epoch": 1.1167883211678833,
147
- "grad_norm": 0.3691176180981217,
148
- "learning_rate": 8.078307376628292e-06,
149
- "loss": 0.276,
150
  "step": 20
151
  },
152
  {
153
- "epoch": 1.1751824817518248,
154
- "grad_norm": 0.344408713464,
155
- "learning_rate": 7.795964517353734e-06,
156
- "loss": 0.2691,
157
  "step": 21
158
  },
159
  {
160
- "epoch": 1.2335766423357664,
161
- "grad_norm": 0.33587123887427156,
162
- "learning_rate": 7.500000000000001e-06,
163
- "loss": 0.2708,
164
  "step": 22
165
  },
166
  {
167
- "epoch": 1.2919708029197081,
168
- "grad_norm": 0.36312692209169634,
169
- "learning_rate": 7.191855733945388e-06,
170
- "loss": 0.2722,
171
  "step": 23
172
  },
173
  {
174
- "epoch": 1.3503649635036497,
175
- "grad_norm": 0.3467493765316541,
176
- "learning_rate": 6.873032967079562e-06,
177
- "loss": 0.2684,
178
  "step": 24
179
  },
180
  {
181
- "epoch": 1.4087591240875912,
182
- "grad_norm": 0.29985014243285585,
183
- "learning_rate": 6.545084971874738e-06,
184
- "loss": 0.2644,
185
  "step": 25
186
  },
187
  {
188
- "epoch": 1.4671532846715327,
189
- "grad_norm": 0.3009900653944404,
190
- "learning_rate": 6.209609477998339e-06,
191
- "loss": 0.2688,
192
  "step": 26
193
  },
194
  {
195
- "epoch": 1.5255474452554745,
196
- "grad_norm": 0.3055997904476839,
197
- "learning_rate": 5.8682408883346535e-06,
198
- "loss": 0.269,
199
  "step": 27
200
  },
201
  {
202
- "epoch": 1.583941605839416,
203
- "grad_norm": 0.30028028239827603,
204
- "learning_rate": 5.522642316338268e-06,
205
- "loss": 0.2656,
206
  "step": 28
207
  },
208
  {
209
- "epoch": 1.6423357664233578,
210
- "grad_norm": 0.2763150426486829,
211
- "learning_rate": 5.174497483512506e-06,
212
- "loss": 0.2603,
213
  "step": 29
214
  },
215
  {
216
- "epoch": 1.7007299270072993,
217
- "grad_norm": 0.2701531414864163,
218
- "learning_rate": 4.825502516487497e-06,
219
- "loss": 0.2624,
220
  "step": 30
221
  }
222
  ],
223
  "logging_steps": 1,
224
- "max_steps": 51,
225
  "num_input_tokens_seen": 0,
226
- "num_train_epochs": 3,
227
  "save_steps": 10,
228
  "stateful_callbacks": {
229
  "TrainerControl": {
@@ -237,8 +237,8 @@
237
  "attributes": {}
238
  }
239
  },
240
- "total_flos": 71710518804480.0,
241
- "train_batch_size": 1,
242
  "trial_name": null,
243
  "trial_params": null
244
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.11121408711770157,
6
  "eval_steps": 1000,
7
  "global_step": 30,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.0037071362372567192,
14
+ "grad_norm": 4.805793835959296,
15
  "learning_rate": 0.0,
16
+ "loss": 0.4732,
17
  "step": 1
18
  },
19
  {
20
+ "epoch": 0.0074142724745134385,
21
+ "grad_norm": 4.999676761082255,
22
+ "learning_rate": 1.8518518518518518e-07,
23
+ "loss": 0.5107,
24
  "step": 2
25
  },
26
  {
27
+ "epoch": 0.011121408711770158,
28
+ "grad_norm": 4.864588159866656,
29
+ "learning_rate": 3.7037037037037036e-07,
30
+ "loss": 0.4708,
31
  "step": 3
32
  },
33
  {
34
+ "epoch": 0.014828544949026877,
35
+ "grad_norm": 5.136331269447859,
36
+ "learning_rate": 5.555555555555555e-07,
37
+ "loss": 0.5024,
38
  "step": 4
39
  },
40
  {
41
+ "epoch": 0.018535681186283594,
42
+ "grad_norm": 4.731519955515801,
43
+ "learning_rate": 7.407407407407407e-07,
44
+ "loss": 0.4598,
45
  "step": 5
46
  },
47
  {
48
+ "epoch": 0.022242817423540315,
49
+ "grad_norm": 4.567784280784228,
50
+ "learning_rate": 9.259259259259259e-07,
51
+ "loss": 0.4717,
52
  "step": 6
53
  },
54
  {
55
+ "epoch": 0.025949953660797033,
56
+ "grad_norm": 4.522993398842187,
57
+ "learning_rate": 1.111111111111111e-06,
58
+ "loss": 0.4649,
59
  "step": 7
60
  },
61
  {
62
+ "epoch": 0.029657089898053754,
63
+ "grad_norm": 2.864282407749261,
64
+ "learning_rate": 1.2962962962962962e-06,
65
+ "loss": 0.4499,
66
  "step": 8
67
  },
68
  {
69
+ "epoch": 0.033364226135310475,
70
+ "grad_norm": 2.781272716749165,
71
+ "learning_rate": 1.4814814814814815e-06,
72
+ "loss": 0.4952,
73
  "step": 9
74
  },
75
  {
76
+ "epoch": 0.03707136237256719,
77
+ "grad_norm": 2.546317122615437,
78
+ "learning_rate": 1.6666666666666667e-06,
79
+ "loss": 0.4487,
80
  "step": 10
81
  },
82
  {
83
+ "epoch": 0.04077849860982391,
84
+ "grad_norm": 2.700137363750789,
85
+ "learning_rate": 1.8518518518518519e-06,
86
+ "loss": 0.4335,
87
  "step": 11
88
  },
89
  {
90
+ "epoch": 0.04448563484708063,
91
+ "grad_norm": 2.7541576896447233,
92
+ "learning_rate": 2.037037037037037e-06,
93
+ "loss": 0.4371,
94
  "step": 12
95
  },
96
  {
97
+ "epoch": 0.04819277108433735,
98
+ "grad_norm": 2.8211373607312447,
99
+ "learning_rate": 2.222222222222222e-06,
100
+ "loss": 0.4374,
101
  "step": 13
102
  },
103
  {
104
+ "epoch": 0.051899907321594066,
105
+ "grad_norm": 2.762988661288179,
106
+ "learning_rate": 2.4074074074074075e-06,
107
+ "loss": 0.4609,
108
  "step": 14
109
  },
110
  {
111
+ "epoch": 0.05560704355885079,
112
+ "grad_norm": 2.0744764582695923,
113
+ "learning_rate": 2.5925925925925925e-06,
114
+ "loss": 0.3954,
115
  "step": 15
116
  },
117
  {
118
+ "epoch": 0.05931417979610751,
119
+ "grad_norm": 2.0106943337826277,
120
+ "learning_rate": 2.7777777777777783e-06,
121
+ "loss": 0.3589,
122
  "step": 16
123
  },
124
  {
125
+ "epoch": 0.06302131603336422,
126
+ "grad_norm": 1.9151145546731518,
127
+ "learning_rate": 2.962962962962963e-06,
128
+ "loss": 0.374,
129
  "step": 17
130
  },
131
  {
132
+ "epoch": 0.06672845227062095,
133
+ "grad_norm": 1.7742581344696668,
134
+ "learning_rate": 3.1481481481481483e-06,
135
+ "loss": 0.4013,
136
  "step": 18
137
  },
138
  {
139
+ "epoch": 0.07043558850787766,
140
+ "grad_norm": 1.5873818678054419,
141
+ "learning_rate": 3.3333333333333333e-06,
142
+ "loss": 0.3847,
143
  "step": 19
144
  },
145
  {
146
+ "epoch": 0.07414272474513438,
147
+ "grad_norm": 1.1782654410938447,
148
+ "learning_rate": 3.5185185185185187e-06,
149
+ "loss": 0.3392,
150
  "step": 20
151
  },
152
  {
153
+ "epoch": 0.0778498609823911,
154
+ "grad_norm": 1.2100197827995287,
155
+ "learning_rate": 3.7037037037037037e-06,
156
+ "loss": 0.3174,
157
  "step": 21
158
  },
159
  {
160
+ "epoch": 0.08155699721964782,
161
+ "grad_norm": 1.3105688995199969,
162
+ "learning_rate": 3.88888888888889e-06,
163
+ "loss": 0.3774,
164
  "step": 22
165
  },
166
  {
167
+ "epoch": 0.08526413345690455,
168
+ "grad_norm": 1.0431241378849054,
169
+ "learning_rate": 4.074074074074074e-06,
170
+ "loss": 0.3329,
171
  "step": 23
172
  },
173
  {
174
+ "epoch": 0.08897126969416126,
175
+ "grad_norm": 0.8631707844136063,
176
+ "learning_rate": 4.2592592592592596e-06,
177
+ "loss": 0.2954,
178
  "step": 24
179
  },
180
  {
181
+ "epoch": 0.09267840593141798,
182
+ "grad_norm": 0.890999961539687,
183
+ "learning_rate": 4.444444444444444e-06,
184
+ "loss": 0.3143,
185
  "step": 25
186
  },
187
  {
188
+ "epoch": 0.0963855421686747,
189
+ "grad_norm": 0.8696087553408013,
190
+ "learning_rate": 4.62962962962963e-06,
191
+ "loss": 0.3012,
192
  "step": 26
193
  },
194
  {
195
+ "epoch": 0.10009267840593142,
196
+ "grad_norm": 0.9182891405006179,
197
+ "learning_rate": 4.814814814814815e-06,
198
+ "loss": 0.2918,
199
  "step": 27
200
  },
201
  {
202
+ "epoch": 0.10379981464318813,
203
+ "grad_norm": 0.9323649589307044,
204
+ "learning_rate": 5e-06,
205
+ "loss": 0.2953,
206
  "step": 28
207
  },
208
  {
209
+ "epoch": 0.10750695088044486,
210
+ "grad_norm": 0.812755436108598,
211
+ "learning_rate": 5.185185185185185e-06,
212
+ "loss": 0.3003,
213
  "step": 29
214
  },
215
  {
216
+ "epoch": 0.11121408711770157,
217
+ "grad_norm": 0.7059616195446038,
218
+ "learning_rate": 5.370370370370371e-06,
219
+ "loss": 0.2843,
220
  "step": 30
221
  }
222
  ],
223
  "logging_steps": 1,
224
+ "max_steps": 538,
225
  "num_input_tokens_seen": 0,
226
+ "num_train_epochs": 2,
227
  "save_steps": 10,
228
  "stateful_callbacks": {
229
  "TrainerControl": {
 
237
  "attributes": {}
238
  }
239
  },
240
+ "total_flos": 8820667908096.0,
241
+ "train_batch_size": 4,
242
  "trial_name": null,
243
  "trial_params": null
244
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53c6f4efddf302c1bffd66ab18844ca3866955a21ba82011ba2b03f6c0e75c9d
3
  size 7672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a64ffa12d6705d8296a8b6f2566b231894f0a3dd59a90b16aa729e1533fd4fa8
3
  size 7672