kiritan committed on
Commit
313fa85
·
verified ·
1 Parent(s): a817b4c

Training in progress, step 9000, checkpoint

Browse files
last-checkpoint/global_step9000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b2e4b5173ecac6a9f3534c1b14e6d36ec29577616c64122a8c1e0f65db43555
3
+ size 5117197489
last-checkpoint/global_step9000/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dc66a01d1ac7b2fd44ab1e6565c0a596ae931e03269eec0d60d8dbe27476c99
3
+ size 859127933
last-checkpoint/latest CHANGED
@@ -1 +1 @@
1
- global_step8000
 
1
+ global_step9000
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0823a45cd5f5f262d4113d9c6af3e480a93b1328d895090e11d3841575e98029
3
  size 962205216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aef828c688fc4b40c4f970b4f1621324009e1b6de86d3a3ed65007b337b7f7e7
3
  size 962205216
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f2132517ec1780cf0e43d2d85e0457c9953dabc448540c499dce25e57e2b052
3
  size 14709
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64e572f1314b9da8f922a0fbf0c91986e4b7b809f9a1dbb178f491f4b7541f4c
3
  size 14709
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cba89e97c806c2994342d3ee7fc823d23ef358301180bf2dcf6ac57f1ab3869c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec7cb829bad4c5e40215f974eb8875988bba1a68c4193a01021b2b11b0d8359f
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 84.13012729844414,
3
  "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-7000",
4
- "epoch": 8.810572687224669,
5
  "eval_steps": 1000,
6
- "global_step": 8000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2327,6 +2327,296 @@
2327
  "eval_steps_per_second": 2.028,
2328
  "eval_wer": 85.53512494106553,
2329
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2330
  }
2331
  ],
2332
  "logging_steps": 25,
@@ -2346,7 +2636,7 @@
2346
  "attributes": {}
2347
  }
2348
  },
2349
- "total_flos": 1.3745145491920781e+20,
2350
  "train_batch_size": 4,
2351
  "trial_name": null,
2352
  "trial_params": null
 
1
  {
2
  "best_metric": 84.13012729844414,
3
  "best_model_checkpoint": "./iteboshi_student_model_temp/checkpoint-7000",
4
+ "epoch": 9.911894273127754,
5
  "eval_steps": 1000,
6
+ "global_step": 9000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2327
  "eval_steps_per_second": 2.028,
2328
  "eval_wer": 85.53512494106553,
2329
  "step": 8000
2330
+ },
2331
+ {
2332
+ "epoch": 8.838105726872246,
2333
+ "grad_norm": 0.33088424801826477,
2334
+ "learning_rate": 1.2282051282051283e-05,
2335
+ "loss": 0.049,
2336
+ "step": 8025
2337
+ },
2338
+ {
2339
+ "epoch": 8.865638766519824,
2340
+ "grad_norm": 0.3602592945098877,
2341
+ "learning_rate": 1.2256410256410259e-05,
2342
+ "loss": 0.0572,
2343
+ "step": 8050
2344
+ },
2345
+ {
2346
+ "epoch": 8.8931718061674,
2347
+ "grad_norm": 0.35909441113471985,
2348
+ "learning_rate": 1.2230769230769232e-05,
2349
+ "loss": 0.051,
2350
+ "step": 8075
2351
+ },
2352
+ {
2353
+ "epoch": 8.920704845814978,
2354
+ "grad_norm": 0.45318055152893066,
2355
+ "learning_rate": 1.2205128205128208e-05,
2356
+ "loss": 0.0642,
2357
+ "step": 8100
2358
+ },
2359
+ {
2360
+ "epoch": 8.948237885462555,
2361
+ "grad_norm": 1.001381754875183,
2362
+ "learning_rate": 1.217948717948718e-05,
2363
+ "loss": 0.0522,
2364
+ "step": 8125
2365
+ },
2366
+ {
2367
+ "epoch": 8.975770925110131,
2368
+ "grad_norm": 0.5000578761100769,
2369
+ "learning_rate": 1.2153846153846153e-05,
2370
+ "loss": 0.0423,
2371
+ "step": 8150
2372
+ },
2373
+ {
2374
+ "epoch": 9.003303964757709,
2375
+ "grad_norm": 0.29771438241004944,
2376
+ "learning_rate": 1.2128205128205129e-05,
2377
+ "loss": 0.063,
2378
+ "step": 8175
2379
+ },
2380
+ {
2381
+ "epoch": 9.030837004405287,
2382
+ "grad_norm": 0.43198081851005554,
2383
+ "learning_rate": 1.2102564102564102e-05,
2384
+ "loss": 0.0416,
2385
+ "step": 8200
2386
+ },
2387
+ {
2388
+ "epoch": 9.058370044052863,
2389
+ "grad_norm": 0.31377923488616943,
2390
+ "learning_rate": 1.2076923076923078e-05,
2391
+ "loss": 0.0358,
2392
+ "step": 8225
2393
+ },
2394
+ {
2395
+ "epoch": 9.08590308370044,
2396
+ "grad_norm": 1.0352481603622437,
2397
+ "learning_rate": 1.2051282051282051e-05,
2398
+ "loss": 0.0348,
2399
+ "step": 8250
2400
+ },
2401
+ {
2402
+ "epoch": 9.113436123348018,
2403
+ "grad_norm": 0.421749472618103,
2404
+ "learning_rate": 1.2025641025641027e-05,
2405
+ "loss": 0.0403,
2406
+ "step": 8275
2407
+ },
2408
+ {
2409
+ "epoch": 9.140969162995594,
2410
+ "grad_norm": 0.39680853486061096,
2411
+ "learning_rate": 1.2e-05,
2412
+ "loss": 0.0348,
2413
+ "step": 8300
2414
+ },
2415
+ {
2416
+ "epoch": 9.168502202643172,
2417
+ "grad_norm": 0.25886261463165283,
2418
+ "learning_rate": 1.1974358974358976e-05,
2419
+ "loss": 0.0255,
2420
+ "step": 8325
2421
+ },
2422
+ {
2423
+ "epoch": 9.19603524229075,
2424
+ "grad_norm": 0.29378727078437805,
2425
+ "learning_rate": 1.194871794871795e-05,
2426
+ "loss": 0.0421,
2427
+ "step": 8350
2428
+ },
2429
+ {
2430
+ "epoch": 9.223568281938325,
2431
+ "grad_norm": 0.14189021289348602,
2432
+ "learning_rate": 1.1923076923076925e-05,
2433
+ "loss": 0.0346,
2434
+ "step": 8375
2435
+ },
2436
+ {
2437
+ "epoch": 9.251101321585903,
2438
+ "grad_norm": 0.3648456335067749,
2439
+ "learning_rate": 1.1897435897435898e-05,
2440
+ "loss": 0.0358,
2441
+ "step": 8400
2442
+ },
2443
+ {
2444
+ "epoch": 9.27863436123348,
2445
+ "grad_norm": 0.22953101992607117,
2446
+ "learning_rate": 1.1871794871794872e-05,
2447
+ "loss": 0.0377,
2448
+ "step": 8425
2449
+ },
2450
+ {
2451
+ "epoch": 9.306167400881057,
2452
+ "grad_norm": 0.13100098073482513,
2453
+ "learning_rate": 1.1846153846153847e-05,
2454
+ "loss": 0.0345,
2455
+ "step": 8450
2456
+ },
2457
+ {
2458
+ "epoch": 9.333700440528634,
2459
+ "grad_norm": 0.41983163356781006,
2460
+ "learning_rate": 1.1820512820512821e-05,
2461
+ "loss": 0.0358,
2462
+ "step": 8475
2463
+ },
2464
+ {
2465
+ "epoch": 9.361233480176212,
2466
+ "grad_norm": 0.28245824575424194,
2467
+ "learning_rate": 1.1794871794871796e-05,
2468
+ "loss": 0.0467,
2469
+ "step": 8500
2470
+ },
2471
+ {
2472
+ "epoch": 9.388766519823788,
2473
+ "grad_norm": 0.46235162019729614,
2474
+ "learning_rate": 1.176923076923077e-05,
2475
+ "loss": 0.0407,
2476
+ "step": 8525
2477
+ },
2478
+ {
2479
+ "epoch": 9.416299559471366,
2480
+ "grad_norm": 0.4608246684074402,
2481
+ "learning_rate": 1.1743589743589745e-05,
2482
+ "loss": 0.0407,
2483
+ "step": 8550
2484
+ },
2485
+ {
2486
+ "epoch": 9.443832599118943,
2487
+ "grad_norm": 0.32346612215042114,
2488
+ "learning_rate": 1.1717948717948719e-05,
2489
+ "loss": 0.0347,
2490
+ "step": 8575
2491
+ },
2492
+ {
2493
+ "epoch": 9.47136563876652,
2494
+ "grad_norm": 0.3498935401439667,
2495
+ "learning_rate": 1.1692307692307694e-05,
2496
+ "loss": 0.0413,
2497
+ "step": 8600
2498
+ },
2499
+ {
2500
+ "epoch": 9.498898678414097,
2501
+ "grad_norm": 0.48518890142440796,
2502
+ "learning_rate": 1.1666666666666668e-05,
2503
+ "loss": 0.0503,
2504
+ "step": 8625
2505
+ },
2506
+ {
2507
+ "epoch": 9.526431718061675,
2508
+ "grad_norm": 0.20332852005958557,
2509
+ "learning_rate": 1.1641025641025643e-05,
2510
+ "loss": 0.0472,
2511
+ "step": 8650
2512
+ },
2513
+ {
2514
+ "epoch": 9.55396475770925,
2515
+ "grad_norm": 0.3680901825428009,
2516
+ "learning_rate": 1.1615384615384617e-05,
2517
+ "loss": 0.039,
2518
+ "step": 8675
2519
+ },
2520
+ {
2521
+ "epoch": 9.581497797356828,
2522
+ "grad_norm": 0.4770890176296234,
2523
+ "learning_rate": 1.1589743589743592e-05,
2524
+ "loss": 0.0325,
2525
+ "step": 8700
2526
+ },
2527
+ {
2528
+ "epoch": 9.609030837004406,
2529
+ "grad_norm": 0.3051774501800537,
2530
+ "learning_rate": 1.1564102564102566e-05,
2531
+ "loss": 0.0406,
2532
+ "step": 8725
2533
+ },
2534
+ {
2535
+ "epoch": 9.636563876651982,
2536
+ "grad_norm": 0.4181124269962311,
2537
+ "learning_rate": 1.1538461538461538e-05,
2538
+ "loss": 0.0425,
2539
+ "step": 8750
2540
+ },
2541
+ {
2542
+ "epoch": 9.66409691629956,
2543
+ "grad_norm": 0.3570977449417114,
2544
+ "learning_rate": 1.1512820512820513e-05,
2545
+ "loss": 0.0349,
2546
+ "step": 8775
2547
+ },
2548
+ {
2549
+ "epoch": 9.691629955947137,
2550
+ "grad_norm": 0.40113458037376404,
2551
+ "learning_rate": 1.1487179487179487e-05,
2552
+ "loss": 0.0332,
2553
+ "step": 8800
2554
+ },
2555
+ {
2556
+ "epoch": 9.719162995594713,
2557
+ "grad_norm": 0.5174753665924072,
2558
+ "learning_rate": 1.1461538461538462e-05,
2559
+ "loss": 0.0365,
2560
+ "step": 8825
2561
+ },
2562
+ {
2563
+ "epoch": 9.746696035242291,
2564
+ "grad_norm": 0.6039919853210449,
2565
+ "learning_rate": 1.1435897435897436e-05,
2566
+ "loss": 0.0423,
2567
+ "step": 8850
2568
+ },
2569
+ {
2570
+ "epoch": 9.774229074889869,
2571
+ "grad_norm": 0.33104389905929565,
2572
+ "learning_rate": 1.1410256410256411e-05,
2573
+ "loss": 0.0492,
2574
+ "step": 8875
2575
+ },
2576
+ {
2577
+ "epoch": 9.801762114537445,
2578
+ "grad_norm": 0.5245941877365112,
2579
+ "learning_rate": 1.1384615384615385e-05,
2580
+ "loss": 0.0403,
2581
+ "step": 8900
2582
+ },
2583
+ {
2584
+ "epoch": 9.829295154185022,
2585
+ "grad_norm": 0.4713122546672821,
2586
+ "learning_rate": 1.135897435897436e-05,
2587
+ "loss": 0.0539,
2588
+ "step": 8925
2589
+ },
2590
+ {
2591
+ "epoch": 9.8568281938326,
2592
+ "grad_norm": 0.3693457841873169,
2593
+ "learning_rate": 1.1333333333333334e-05,
2594
+ "loss": 0.0419,
2595
+ "step": 8950
2596
+ },
2597
+ {
2598
+ "epoch": 9.884361233480176,
2599
+ "grad_norm": 0.4538707733154297,
2600
+ "learning_rate": 1.1307692307692309e-05,
2601
+ "loss": 0.0345,
2602
+ "step": 8975
2603
+ },
2604
+ {
2605
+ "epoch": 9.911894273127754,
2606
+ "grad_norm": 0.25317588448524475,
2607
+ "learning_rate": 1.1282051282051283e-05,
2608
+ "loss": 0.0329,
2609
+ "step": 9000
2610
+ },
2611
+ {
2612
+ "epoch": 9.911894273127754,
2613
+ "eval_cer": 24.903992755035365,
2614
+ "eval_loss": 0.8292160034179688,
2615
+ "eval_runtime": 1301.6266,
2616
+ "eval_samples_per_second": 8.129,
2617
+ "eval_steps_per_second": 2.033,
2618
+ "eval_wer": 84.72418670438473,
2619
+ "step": 9000
2620
  }
2621
  ],
2622
  "logging_steps": 25,
 
2636
  "attributes": {}
2637
  }
2638
  },
2639
+ "total_flos": 1.546328867841088e+20,
2640
  "train_batch_size": 4,
2641
  "trial_name": null,
2642
  "trial_params": null