NairaRahim commited on
Commit
dec7b6e
·
verified ·
1 Parent(s): 6789286

Training in progress, epoch 25, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:042139fdd65771f5b0b7308a2f417f55545abb2e4526ac63abae75562bc1ba38
3
  size 1227009528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ebf6b113805e8d5c18f20cc3a7f743cea1ac029ed8f4448a7b46de82a6c516e9
3
  size 1227009528
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfff17389b523f38117b38d88e821ea6445c02dd357105f6f1ae677afbba8082
3
  size 2454133690
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7534969a274bb3fa8023a906c28ee9fb96fa28e85e22f56fbd7e7b549d41dd80
3
  size 2454133690
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:757d904eb6af9ff171fa4920de86211e4579188ed035d00ce1124e203a605855
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f7a8d60a2f79f9fed2cea73d23dd3dfda5f5e479acfb4213a6f2e863cb76904
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8963a103756502091940dcb6256fb47e583c349918f9cb8e5f151486ec5304af
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cbd4af8c4ab3cb75893cf7c4c12466d6c795077167416da697449ce4a12b474
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 34.53865432739258,
3
  "best_model_checkpoint": "/kaggle/working/output/checkpoint-28710",
4
- "epoch": 23.0,
5
  "eval_steps": 500,
6
- "global_step": 30015,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2291,6 +2291,204 @@
2291
  "eval_samples_per_second": 26.45,
2292
  "eval_steps_per_second": 3.324,
2293
  "step": 30015
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2294
  }
2295
  ],
2296
  "logging_steps": 100,
@@ -2305,7 +2503,7 @@
2305
  "early_stopping_threshold": 0.0
2306
  },
2307
  "attributes": {
2308
- "early_stopping_patience_counter": 1
2309
  }
2310
  },
2311
  "TrainerControl": {
@@ -2319,7 +2517,7 @@
2319
  "attributes": {}
2320
  }
2321
  },
2322
- "total_flos": 3.2368150770084864e+16,
2323
  "train_batch_size": 8,
2324
  "trial_name": null,
2325
  "trial_params": null
 
1
  {
2
  "best_metric": 34.53865432739258,
3
  "best_model_checkpoint": "/kaggle/working/output/checkpoint-28710",
4
+ "epoch": 25.0,
5
  "eval_steps": 500,
6
+ "global_step": 32625,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2291
  "eval_samples_per_second": 26.45,
2292
  "eval_steps_per_second": 3.324,
2293
  "step": 30015
2294
+ },
2295
+ {
2296
+ "epoch": 23.06513409961686,
2297
+ "grad_norm": 1.9898459911346436,
2298
+ "learning_rate": 3.559099616858237e-05,
2299
+ "loss": 33.2659,
2300
+ "step": 30100
2301
+ },
2302
+ {
2303
+ "epoch": 23.14176245210728,
2304
+ "grad_norm": 3.3541698455810547,
2305
+ "learning_rate": 3.554310344827587e-05,
2306
+ "loss": 33.4747,
2307
+ "step": 30200
2308
+ },
2309
+ {
2310
+ "epoch": 23.2183908045977,
2311
+ "grad_norm": 2.298229694366455,
2312
+ "learning_rate": 3.5495210727969354e-05,
2313
+ "loss": 33.8791,
2314
+ "step": 30300
2315
+ },
2316
+ {
2317
+ "epoch": 23.295019157088124,
2318
+ "grad_norm": 3.9336183071136475,
2319
+ "learning_rate": 3.544731800766284e-05,
2320
+ "loss": 33.8427,
2321
+ "step": 30400
2322
+ },
2323
+ {
2324
+ "epoch": 23.371647509578544,
2325
+ "grad_norm": 2.9286720752716064,
2326
+ "learning_rate": 3.539942528735633e-05,
2327
+ "loss": 33.9572,
2328
+ "step": 30500
2329
+ },
2330
+ {
2331
+ "epoch": 23.448275862068964,
2332
+ "grad_norm": 2.9716665744781494,
2333
+ "learning_rate": 3.5351532567049814e-05,
2334
+ "loss": 32.5295,
2335
+ "step": 30600
2336
+ },
2337
+ {
2338
+ "epoch": 23.52490421455939,
2339
+ "grad_norm": 3.5073654651641846,
2340
+ "learning_rate": 3.5303639846743294e-05,
2341
+ "loss": 33.3511,
2342
+ "step": 30700
2343
+ },
2344
+ {
2345
+ "epoch": 23.60153256704981,
2346
+ "grad_norm": 4.5670084953308105,
2347
+ "learning_rate": 3.525574712643678e-05,
2348
+ "loss": 33.4249,
2349
+ "step": 30800
2350
+ },
2351
+ {
2352
+ "epoch": 23.67816091954023,
2353
+ "grad_norm": 2.563405990600586,
2354
+ "learning_rate": 3.520785440613027e-05,
2355
+ "loss": 33.821,
2356
+ "step": 30900
2357
+ },
2358
+ {
2359
+ "epoch": 23.754789272030653,
2360
+ "grad_norm": 3.5928332805633545,
2361
+ "learning_rate": 3.5159961685823755e-05,
2362
+ "loss": 32.9252,
2363
+ "step": 31000
2364
+ },
2365
+ {
2366
+ "epoch": 23.831417624521073,
2367
+ "grad_norm": 3.2677550315856934,
2368
+ "learning_rate": 3.511206896551724e-05,
2369
+ "loss": 33.4694,
2370
+ "step": 31100
2371
+ },
2372
+ {
2373
+ "epoch": 23.908045977011493,
2374
+ "grad_norm": 3.8751015663146973,
2375
+ "learning_rate": 3.506417624521073e-05,
2376
+ "loss": 32.7835,
2377
+ "step": 31200
2378
+ },
2379
+ {
2380
+ "epoch": 23.984674329501917,
2381
+ "grad_norm": 3.955101490020752,
2382
+ "learning_rate": 3.5016283524904216e-05,
2383
+ "loss": 32.6658,
2384
+ "step": 31300
2385
+ },
2386
+ {
2387
+ "epoch": 24.0,
2388
+ "eval_loss": 34.550262451171875,
2389
+ "eval_runtime": 49.3313,
2390
+ "eval_samples_per_second": 26.454,
2391
+ "eval_steps_per_second": 3.324,
2392
+ "step": 31320
2393
+ },
2394
+ {
2395
+ "epoch": 24.061302681992338,
2396
+ "grad_norm": 3.885087013244629,
2397
+ "learning_rate": 3.49683908045977e-05,
2398
+ "loss": 33.5285,
2399
+ "step": 31400
2400
+ },
2401
+ {
2402
+ "epoch": 24.137931034482758,
2403
+ "grad_norm": 8.908398628234863,
2404
+ "learning_rate": 3.4920977011494254e-05,
2405
+ "loss": 33.1673,
2406
+ "step": 31500
2407
+ },
2408
+ {
2409
+ "epoch": 24.21455938697318,
2410
+ "grad_norm": 4.042150974273682,
2411
+ "learning_rate": 3.487308429118774e-05,
2412
+ "loss": 33.0384,
2413
+ "step": 31600
2414
+ },
2415
+ {
2416
+ "epoch": 24.291187739463602,
2417
+ "grad_norm": 4.992551803588867,
2418
+ "learning_rate": 3.482519157088123e-05,
2419
+ "loss": 33.7439,
2420
+ "step": 31700
2421
+ },
2422
+ {
2423
+ "epoch": 24.367816091954023,
2424
+ "grad_norm": 5.118918418884277,
2425
+ "learning_rate": 3.4777298850574715e-05,
2426
+ "loss": 33.5604,
2427
+ "step": 31800
2428
+ },
2429
+ {
2430
+ "epoch": 24.444444444444443,
2431
+ "grad_norm": 3.2756083011627197,
2432
+ "learning_rate": 3.47294061302682e-05,
2433
+ "loss": 33.6225,
2434
+ "step": 31900
2435
+ },
2436
+ {
2437
+ "epoch": 24.521072796934867,
2438
+ "grad_norm": 2.9864351749420166,
2439
+ "learning_rate": 3.468151340996169e-05,
2440
+ "loss": 34.0539,
2441
+ "step": 32000
2442
+ },
2443
+ {
2444
+ "epoch": 24.597701149425287,
2445
+ "grad_norm": 2.945171356201172,
2446
+ "learning_rate": 3.463362068965517e-05,
2447
+ "loss": 33.2655,
2448
+ "step": 32100
2449
+ },
2450
+ {
2451
+ "epoch": 24.674329501915707,
2452
+ "grad_norm": 4.09877347946167,
2453
+ "learning_rate": 3.4585727969348656e-05,
2454
+ "loss": 33.239,
2455
+ "step": 32200
2456
+ },
2457
+ {
2458
+ "epoch": 24.75095785440613,
2459
+ "grad_norm": 3.7949306964874268,
2460
+ "learning_rate": 3.453783524904215e-05,
2461
+ "loss": 32.7246,
2462
+ "step": 32300
2463
+ },
2464
+ {
2465
+ "epoch": 24.82758620689655,
2466
+ "grad_norm": 3.8750340938568115,
2467
+ "learning_rate": 3.4489942528735636e-05,
2468
+ "loss": 32.5477,
2469
+ "step": 32400
2470
+ },
2471
+ {
2472
+ "epoch": 24.904214559386972,
2473
+ "grad_norm": 3.84676456451416,
2474
+ "learning_rate": 3.444204980842912e-05,
2475
+ "loss": 33.5781,
2476
+ "step": 32500
2477
+ },
2478
+ {
2479
+ "epoch": 24.980842911877396,
2480
+ "grad_norm": 2.3316519260406494,
2481
+ "learning_rate": 3.439415708812261e-05,
2482
+ "loss": 33.0241,
2483
+ "step": 32600
2484
+ },
2485
+ {
2486
+ "epoch": 25.0,
2487
+ "eval_loss": 34.565101623535156,
2488
+ "eval_runtime": 49.343,
2489
+ "eval_samples_per_second": 26.448,
2490
+ "eval_steps_per_second": 3.324,
2491
+ "step": 32625
2492
  }
2493
  ],
2494
  "logging_steps": 100,
 
2503
  "early_stopping_threshold": 0.0
2504
  },
2505
  "attributes": {
2506
+ "early_stopping_patience_counter": 3
2507
  }
2508
  },
2509
  "TrainerControl": {
 
2517
  "attributes": {}
2518
  }
2519
  },
2520
+ "total_flos": 3.51827725761792e+16,
2521
  "train_batch_size": 8,
2522
  "trial_name": null,
2523
  "trial_params": null