Azrail commited on
Commit
261cd2b
·
verified ·
1 Parent(s): 18e074b

Training in progress, step 3000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:09e9299c4a411196fea3d5279894585e8d1a7575c08eb1779c5008bb7e4a49b7
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f81f1606cbb4066658322a9b01b024ebe1fe01d7f9c79d6a2b4af556fe6aa975
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c444730bd04d34d146261d2e799975f1275e5903aace9e152e7e5c01154c912
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2bdc54e623a858f4b04c457346b0f903dc827e2ac006197959be017f0bd1f45
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3c7e8305d45d254f0365c29c304654706064d85b369eee2a35f47f258c35c43
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11ff07d587c5a9307740887f980afedff8f43c8da2bd4cbf45f5f3cf546cf38d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e7450dae308a1f566442c67e6e8e15b97c271edd460f95249b85ad7cccbd395
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a3d374142fb5a9a375b1a828a38137498daacdc810ac93109a9de1e8639e3a1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.04393208648580376,
6
  "eval_steps": 500,
7
- "global_step": 2000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -364,11 +364,189 @@
364
  "eval_steps_per_second": 19.877,
365
  "num_input_tokens_seen": 2097152000,
366
  "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
  }
368
  ],
369
  "logging_steps": 50,
370
  "max_steps": 200000,
371
- "num_input_tokens_seen": 2097152000,
372
  "num_train_epochs": 5,
373
  "save_steps": 1000,
374
  "stateful_callbacks": {
@@ -383,7 +561,7 @@
383
  "attributes": {}
384
  }
385
  },
386
- "total_flos": 1.194343431929856e+18,
387
  "train_batch_size": 64,
388
  "trial_name": null,
389
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.06589812972870564,
6
  "eval_steps": 500,
7
+ "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
364
  "eval_steps_per_second": 19.877,
365
  "num_input_tokens_seen": 2097152000,
366
  "step": 2000
367
+ },
368
+ {
369
+ "epoch": 0.04503038864794885,
370
+ "grad_norm": 0.33002936840057373,
371
+ "learning_rate": 0.00041,
372
+ "loss": 3.5684,
373
+ "num_input_tokens_seen": 2149580800,
374
+ "step": 2050
375
+ },
376
+ {
377
+ "epoch": 0.04612869081009394,
378
+ "grad_norm": 0.43806758522987366,
379
+ "learning_rate": 0.00042,
380
+ "loss": 3.5436,
381
+ "num_input_tokens_seen": 2202009600,
382
+ "step": 2100
383
+ },
384
+ {
385
+ "epoch": 0.04722699297223904,
386
+ "grad_norm": 0.32842758297920227,
387
+ "learning_rate": 0.00043,
388
+ "loss": 3.5191,
389
+ "num_input_tokens_seen": 2254438400,
390
+ "step": 2150
391
+ },
392
+ {
393
+ "epoch": 0.04832529513438413,
394
+ "grad_norm": 0.3068505525588989,
395
+ "learning_rate": 0.00044,
396
+ "loss": 3.5009,
397
+ "num_input_tokens_seen": 2306867200,
398
+ "step": 2200
399
+ },
400
+ {
401
+ "epoch": 0.049423597296529224,
402
+ "grad_norm": 0.2950410544872284,
403
+ "learning_rate": 0.00045000000000000004,
404
+ "loss": 3.4796,
405
+ "num_input_tokens_seen": 2359296000,
406
+ "step": 2250
407
+ },
408
+ {
409
+ "epoch": 0.05052189945867432,
410
+ "grad_norm": 0.29731425642967224,
411
+ "learning_rate": 0.00046,
412
+ "loss": 3.4583,
413
+ "num_input_tokens_seen": 2411724800,
414
+ "step": 2300
415
+ },
416
+ {
417
+ "epoch": 0.051620201620819414,
418
+ "grad_norm": 0.2702693045139313,
419
+ "learning_rate": 0.00047,
420
+ "loss": 3.4385,
421
+ "num_input_tokens_seen": 2464153600,
422
+ "step": 2350
423
+ },
424
+ {
425
+ "epoch": 0.05271850378296451,
426
+ "grad_norm": 0.2418452948331833,
427
+ "learning_rate": 0.00048,
428
+ "loss": 3.4244,
429
+ "num_input_tokens_seen": 2516582400,
430
+ "step": 2400
431
+ },
432
+ {
433
+ "epoch": 0.053816805945109604,
434
+ "grad_norm": 0.28668686747550964,
435
+ "learning_rate": 0.00049,
436
+ "loss": 3.3977,
437
+ "num_input_tokens_seen": 2569011200,
438
+ "step": 2450
439
+ },
440
+ {
441
+ "epoch": 0.054915108107254695,
442
+ "grad_norm": 0.3115544319152832,
443
+ "learning_rate": 0.0005,
444
+ "loss": 3.3881,
445
+ "num_input_tokens_seen": 2621440000,
446
+ "step": 2500
447
+ },
448
+ {
449
+ "epoch": 0.054915108107254695,
450
+ "eval_loss": 3.2789928913116455,
451
+ "eval_runtime": 62.6749,
452
+ "eval_samples_per_second": 79.777,
453
+ "eval_steps_per_second": 19.944,
454
+ "num_input_tokens_seen": 2621440000,
455
+ "step": 2500
456
+ },
457
+ {
458
+ "epoch": 0.056013410269399794,
459
+ "grad_norm": 0.32340022921562195,
460
+ "learning_rate": 0.00051,
461
+ "loss": 3.3667,
462
+ "num_input_tokens_seen": 2673868800,
463
+ "step": 2550
464
+ },
465
+ {
466
+ "epoch": 0.057111712431544885,
467
+ "grad_norm": 0.2612442970275879,
468
+ "learning_rate": 0.0005200000000000001,
469
+ "loss": 3.3612,
470
+ "num_input_tokens_seen": 2726297600,
471
+ "step": 2600
472
+ },
473
+ {
474
+ "epoch": 0.05821001459368998,
475
+ "grad_norm": 0.29934820532798767,
476
+ "learning_rate": 0.0005300000000000001,
477
+ "loss": 3.3386,
478
+ "num_input_tokens_seen": 2778726400,
479
+ "step": 2650
480
+ },
481
+ {
482
+ "epoch": 0.059308316755835075,
483
+ "grad_norm": 0.2737022042274475,
484
+ "learning_rate": 0.00054,
485
+ "loss": 3.3274,
486
+ "num_input_tokens_seen": 2831155200,
487
+ "step": 2700
488
+ },
489
+ {
490
+ "epoch": 0.060406618917980166,
491
+ "grad_norm": 0.2101408988237381,
492
+ "learning_rate": 0.00055,
493
+ "loss": 3.3153,
494
+ "num_input_tokens_seen": 2883584000,
495
+ "step": 2750
496
+ },
497
+ {
498
+ "epoch": 0.061504921080125265,
499
+ "grad_norm": 0.3240911066532135,
500
+ "learning_rate": 0.0005600000000000001,
501
+ "loss": 3.2978,
502
+ "num_input_tokens_seen": 2936012800,
503
+ "step": 2800
504
+ },
505
+ {
506
+ "epoch": 0.06260322324227036,
507
+ "grad_norm": 0.20592735707759857,
508
+ "learning_rate": 0.00057,
509
+ "loss": 3.2984,
510
+ "num_input_tokens_seen": 2988441600,
511
+ "step": 2850
512
+ },
513
+ {
514
+ "epoch": 0.06370152540441545,
515
+ "grad_norm": 0.263443261384964,
516
+ "learning_rate": 0.00058,
517
+ "loss": 3.2706,
518
+ "num_input_tokens_seen": 3040870400,
519
+ "step": 2900
520
+ },
521
+ {
522
+ "epoch": 0.06479982756656054,
523
+ "grad_norm": 0.24249990284442902,
524
+ "learning_rate": 0.00059,
525
+ "loss": 3.2673,
526
+ "num_input_tokens_seen": 3093299200,
527
+ "step": 2950
528
+ },
529
+ {
530
+ "epoch": 0.06589812972870564,
531
+ "grad_norm": 0.25961214303970337,
532
+ "learning_rate": 0.0006,
533
+ "loss": 3.2512,
534
+ "num_input_tokens_seen": 3145728000,
535
+ "step": 3000
536
+ },
537
+ {
538
+ "epoch": 0.06589812972870564,
539
+ "eval_loss": 3.150442600250244,
540
+ "eval_runtime": 65.9549,
541
+ "eval_samples_per_second": 75.809,
542
+ "eval_steps_per_second": 18.952,
543
+ "num_input_tokens_seen": 3145728000,
544
+ "step": 3000
545
  }
546
  ],
547
  "logging_steps": 50,
548
  "max_steps": 200000,
549
+ "num_input_tokens_seen": 3145728000,
550
  "num_train_epochs": 5,
551
  "save_steps": 1000,
552
  "stateful_callbacks": {
 
561
  "attributes": {}
562
  }
563
  },
564
+ "total_flos": 1.791515147894784e+18,
565
  "train_batch_size": 64,
566
  "trial_name": null,
567
  "trial_params": null