mohammadmahdinouri commited on
Commit
0ee27b7
·
verified ·
1 Parent(s): 57d8fb2

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b93ceca8e88ff460f8ccb50f4380d6798124eb004993f41675d14510f8c47b7
3
  size 448472762
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24d910ed994581c6f8ea6c3e52c2bd01eea52f2eabde8dd6e0704d202c1d60ef
3
  size 448472762
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5791df358e1c8a02bbb41e3d1e52d823a2a78d0ff48fd6f7de4f19e14e0bb520
3
  size 151589028
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37e29a2ee04e66d7aea70e004882b65125d5ba077c73c4403518a2a60505c8d5
3
  size 151589028
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cccb7abf0f8614f3fc64c31710fad6c824dca1edbb4986a5b9fb1ad1d2d802cb
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40599dcf66d195bd468cd608918590eb1aebaacc685d576254e5a8407b72bcaf
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:655ec14a75109d5e8c18da96c3a0f554fd551816773411140b362973eb5b2691
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f37c31af6d04569a7beeed96f0a16a7f301f7bd85bb5ae4d2109c3c0aceb44f2
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:697c100484a8888e919d71fa6c0aefff1702c654a32d364f7623997e3c0d9e2d
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:482b2a39fd2aedd38d5ea68619507991e6d883e66aa3db23423855a04855b803
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ffb6ed56578248732f2cb9d5be51bee1d41b9fd8c2fcf9ccf47064ba796dd60
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0c2cea090d6d49edddc7a1414205b00e8be5cf0638feefa5bd893ffe74fda01
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8dea3a8122c383e315053a97f608c6689c05237886892101fcacb12765eef233
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e137401e542831b1939ede4bb41d0e73ea53be0648cbbc1ab5857d463b977f62
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.006325077350425098,
6
  "eval_steps": 500,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2108,6 +2108,356 @@
2108
  "learning_rate": 0.0004991215588235708,
2109
  "loss": 2.716,
2110
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2111
  }
2112
  ],
2113
  "logging_steps": 10,
@@ -2127,7 +2477,7 @@
2127
  "attributes": {}
2128
  }
2129
  },
2130
- "total_flos": 9.80967272673706e+17,
2131
  "train_batch_size": 48,
2132
  "trial_name": null,
2133
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.007379256908829281,
6
  "eval_steps": 500,
7
+ "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2108
  "learning_rate": 0.0004991215588235708,
2109
  "loss": 2.716,
2110
  "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.006346160941593182,
2114
+ "grad_norm": 0.765625,
2115
+ "learning_rate": 0.0004991180436527968,
2116
+ "loss": 2.7439,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.006367244532761265,
2121
+ "grad_norm": 0.703125,
2122
+ "learning_rate": 0.0004991145284820227,
2123
+ "loss": 2.7267,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.006388328123929349,
2128
+ "grad_norm": 0.69921875,
2129
+ "learning_rate": 0.0004991110133112487,
2130
+ "loss": 2.7129,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.006409411715097433,
2135
+ "grad_norm": 0.7578125,
2136
+ "learning_rate": 0.0004991074981404747,
2137
+ "loss": 2.7252,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.006430495306265516,
2142
+ "grad_norm": 0.75390625,
2143
+ "learning_rate": 0.0004991039829697006,
2144
+ "loss": 2.7236,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.0064515788974336,
2149
+ "grad_norm": 0.73828125,
2150
+ "learning_rate": 0.0004991004677989266,
2151
+ "loss": 2.7489,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.006472662488601683,
2156
+ "grad_norm": 0.81640625,
2157
+ "learning_rate": 0.0004990969526281526,
2158
+ "loss": 2.7133,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.006493746079769767,
2163
+ "grad_norm": 1.0390625,
2164
+ "learning_rate": 0.0004990934374573785,
2165
+ "loss": 2.729,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.006514829670937851,
2170
+ "grad_norm": 0.69921875,
2171
+ "learning_rate": 0.0004990899222866046,
2172
+ "loss": 2.7134,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.006535913262105934,
2177
+ "grad_norm": 0.72265625,
2178
+ "learning_rate": 0.0004990864071158306,
2179
+ "loss": 2.7033,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.006556996853274018,
2184
+ "grad_norm": 0.71484375,
2185
+ "learning_rate": 0.0004990828919450564,
2186
+ "loss": 2.7242,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.006578080444442102,
2191
+ "grad_norm": 0.73828125,
2192
+ "learning_rate": 0.0004990793767742825,
2193
+ "loss": 2.7018,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 0.006599164035610185,
2198
+ "grad_norm": 0.79296875,
2199
+ "learning_rate": 0.0004990758616035085,
2200
+ "loss": 2.7018,
2201
+ "step": 3130
2202
+ },
2203
+ {
2204
+ "epoch": 0.006620247626778269,
2205
+ "grad_norm": 0.7421875,
2206
+ "learning_rate": 0.0004990723464327344,
2207
+ "loss": 2.7101,
2208
+ "step": 3140
2209
+ },
2210
+ {
2211
+ "epoch": 0.006641331217946353,
2212
+ "grad_norm": 0.671875,
2213
+ "learning_rate": 0.0004990688312619604,
2214
+ "loss": 2.7059,
2215
+ "step": 3150
2216
+ },
2217
+ {
2218
+ "epoch": 0.006662414809114436,
2219
+ "grad_norm": 0.76953125,
2220
+ "learning_rate": 0.0004990653160911864,
2221
+ "loss": 2.7004,
2222
+ "step": 3160
2223
+ },
2224
+ {
2225
+ "epoch": 0.00668349840028252,
2226
+ "grad_norm": 0.8125,
2227
+ "learning_rate": 0.0004990618009204123,
2228
+ "loss": 2.6897,
2229
+ "step": 3170
2230
+ },
2231
+ {
2232
+ "epoch": 0.006704581991450604,
2233
+ "grad_norm": 0.8828125,
2234
+ "learning_rate": 0.0004990582857496383,
2235
+ "loss": 2.6965,
2236
+ "step": 3180
2237
+ },
2238
+ {
2239
+ "epoch": 0.006725665582618687,
2240
+ "grad_norm": 0.69140625,
2241
+ "learning_rate": 0.0004990547705788642,
2242
+ "loss": 2.699,
2243
+ "step": 3190
2244
+ },
2245
+ {
2246
+ "epoch": 0.006746749173786771,
2247
+ "grad_norm": 0.73828125,
2248
+ "learning_rate": 0.0004990512554080902,
2249
+ "loss": 2.6956,
2250
+ "step": 3200
2251
+ },
2252
+ {
2253
+ "epoch": 0.006767832764954854,
2254
+ "grad_norm": 0.7578125,
2255
+ "learning_rate": 0.0004990477402373162,
2256
+ "loss": 2.693,
2257
+ "step": 3210
2258
+ },
2259
+ {
2260
+ "epoch": 0.006788916356122938,
2261
+ "grad_norm": 0.69921875,
2262
+ "learning_rate": 0.0004990442250665421,
2263
+ "loss": 2.6899,
2264
+ "step": 3220
2265
+ },
2266
+ {
2267
+ "epoch": 0.006809999947291022,
2268
+ "grad_norm": 0.8046875,
2269
+ "learning_rate": 0.0004990407098957681,
2270
+ "loss": 2.6946,
2271
+ "step": 3230
2272
+ },
2273
+ {
2274
+ "epoch": 0.006831083538459105,
2275
+ "grad_norm": 0.828125,
2276
+ "learning_rate": 0.0004990371947249942,
2277
+ "loss": 2.6904,
2278
+ "step": 3240
2279
+ },
2280
+ {
2281
+ "epoch": 0.006852167129627189,
2282
+ "grad_norm": 0.72265625,
2283
+ "learning_rate": 0.0004990336795542202,
2284
+ "loss": 2.6968,
2285
+ "step": 3250
2286
+ },
2287
+ {
2288
+ "epoch": 0.006873250720795273,
2289
+ "grad_norm": 0.72265625,
2290
+ "learning_rate": 0.000499030164383446,
2291
+ "loss": 2.6748,
2292
+ "step": 3260
2293
+ },
2294
+ {
2295
+ "epoch": 0.006894334311963356,
2296
+ "grad_norm": 1.0078125,
2297
+ "learning_rate": 0.0004990266492126721,
2298
+ "loss": 2.6752,
2299
+ "step": 3270
2300
+ },
2301
+ {
2302
+ "epoch": 0.00691541790313144,
2303
+ "grad_norm": 0.79296875,
2304
+ "learning_rate": 0.000499023134041898,
2305
+ "loss": 2.6882,
2306
+ "step": 3280
2307
+ },
2308
+ {
2309
+ "epoch": 0.006936501494299524,
2310
+ "grad_norm": 0.7421875,
2311
+ "learning_rate": 0.000499019618871124,
2312
+ "loss": 2.6847,
2313
+ "step": 3290
2314
+ },
2315
+ {
2316
+ "epoch": 0.0069575850854676075,
2317
+ "grad_norm": 0.66015625,
2318
+ "learning_rate": 0.00049901610370035,
2319
+ "loss": 2.6776,
2320
+ "step": 3300
2321
+ },
2322
+ {
2323
+ "epoch": 0.006978668676635691,
2324
+ "grad_norm": 0.703125,
2325
+ "learning_rate": 0.0004990125885295759,
2326
+ "loss": 2.6876,
2327
+ "step": 3310
2328
+ },
2329
+ {
2330
+ "epoch": 0.006999752267803775,
2331
+ "grad_norm": 0.7578125,
2332
+ "learning_rate": 0.0004990090733588019,
2333
+ "loss": 2.6798,
2334
+ "step": 3320
2335
+ },
2336
+ {
2337
+ "epoch": 0.0070208358589718585,
2338
+ "grad_norm": 0.71875,
2339
+ "learning_rate": 0.0004990055581880279,
2340
+ "loss": 2.6768,
2341
+ "step": 3330
2342
+ },
2343
+ {
2344
+ "epoch": 0.0070419194501399425,
2345
+ "grad_norm": 0.7109375,
2346
+ "learning_rate": 0.0004990020430172538,
2347
+ "loss": 2.6804,
2348
+ "step": 3340
2349
+ },
2350
+ {
2351
+ "epoch": 0.0070630030413080256,
2352
+ "grad_norm": 0.73828125,
2353
+ "learning_rate": 0.0004989985278464799,
2354
+ "loss": 2.6843,
2355
+ "step": 3350
2356
+ },
2357
+ {
2358
+ "epoch": 0.0070840866324761095,
2359
+ "grad_norm": 0.72265625,
2360
+ "learning_rate": 0.0004989950126757058,
2361
+ "loss": 2.6767,
2362
+ "step": 3360
2363
+ },
2364
+ {
2365
+ "epoch": 0.0071051702236441935,
2366
+ "grad_norm": 0.63671875,
2367
+ "learning_rate": 0.0004989914975049317,
2368
+ "loss": 2.6691,
2369
+ "step": 3370
2370
+ },
2371
+ {
2372
+ "epoch": 0.007126253814812277,
2373
+ "grad_norm": 0.7109375,
2374
+ "learning_rate": 0.0004989879823341578,
2375
+ "loss": 2.6738,
2376
+ "step": 3380
2377
+ },
2378
+ {
2379
+ "epoch": 0.007147337405980361,
2380
+ "grad_norm": 0.68359375,
2381
+ "learning_rate": 0.0004989844671633838,
2382
+ "loss": 2.6834,
2383
+ "step": 3390
2384
+ },
2385
+ {
2386
+ "epoch": 0.0071684209971484445,
2387
+ "grad_norm": 0.76171875,
2388
+ "learning_rate": 0.0004989809519926097,
2389
+ "loss": 2.6865,
2390
+ "step": 3400
2391
+ },
2392
+ {
2393
+ "epoch": 0.007189504588316528,
2394
+ "grad_norm": 0.734375,
2395
+ "learning_rate": 0.0004989774368218357,
2396
+ "loss": 2.6792,
2397
+ "step": 3410
2398
+ },
2399
+ {
2400
+ "epoch": 0.007210588179484612,
2401
+ "grad_norm": 0.8046875,
2402
+ "learning_rate": 0.0004989739216510617,
2403
+ "loss": 2.6694,
2404
+ "step": 3420
2405
+ },
2406
+ {
2407
+ "epoch": 0.007231671770652696,
2408
+ "grad_norm": 0.671875,
2409
+ "learning_rate": 0.0004989704064802876,
2410
+ "loss": 2.675,
2411
+ "step": 3430
2412
+ },
2413
+ {
2414
+ "epoch": 0.007252755361820779,
2415
+ "grad_norm": 0.69921875,
2416
+ "learning_rate": 0.0004989668913095136,
2417
+ "loss": 2.6746,
2418
+ "step": 3440
2419
+ },
2420
+ {
2421
+ "epoch": 0.007273838952988863,
2422
+ "grad_norm": 0.7578125,
2423
+ "learning_rate": 0.0004989633761387396,
2424
+ "loss": 2.6632,
2425
+ "step": 3450
2426
+ },
2427
+ {
2428
+ "epoch": 0.007294922544156947,
2429
+ "grad_norm": 0.6875,
2430
+ "learning_rate": 0.0004989598609679655,
2431
+ "loss": 2.664,
2432
+ "step": 3460
2433
+ },
2434
+ {
2435
+ "epoch": 0.00731600613532503,
2436
+ "grad_norm": 0.73828125,
2437
+ "learning_rate": 0.0004989563457971915,
2438
+ "loss": 2.6797,
2439
+ "step": 3470
2440
+ },
2441
+ {
2442
+ "epoch": 0.007337089726493114,
2443
+ "grad_norm": 0.70703125,
2444
+ "learning_rate": 0.0004989528306264175,
2445
+ "loss": 2.6545,
2446
+ "step": 3480
2447
+ },
2448
+ {
2449
+ "epoch": 0.007358173317661198,
2450
+ "grad_norm": 0.80859375,
2451
+ "learning_rate": 0.0004989493154556434,
2452
+ "loss": 2.6551,
2453
+ "step": 3490
2454
+ },
2455
+ {
2456
+ "epoch": 0.007379256908829281,
2457
+ "grad_norm": 0.6953125,
2458
+ "learning_rate": 0.0004989458002848695,
2459
+ "loss": 2.6639,
2460
+ "step": 3500
2461
  }
2462
  ],
2463
  "logging_steps": 10,
 
2477
  "attributes": {}
2478
  }
2479
  },
2480
+ "total_flos": 1.1444837229894042e+18,
2481
  "train_batch_size": 48,
2482
  "trial_name": null,
2483
  "trial_params": null