ranjan56cse commited on
Commit
1255eae
·
verified ·
1 Parent(s): f2285c9

Latest checkpoint (step 13000)

Browse files
checkpoint-latest/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c75a7b6500f668420328e5064892a57588ede668a752b8e65610afad5d2a844
3
  size 7098016
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d11ec0a17acf750db437799bc80d1a2b9d6c8e4c600859e3c5ed67537cb503e7
3
  size 7098016
checkpoint-latest/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:297257d1b5066c7efe02366005386e610020a8122a08f3cb7c43bb60e6037e07
3
  size 14277259
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9e4992c32ab2fe8569b7714e9dc6297aa9204b9453065fabb7cc100bb1a6af3
3
  size 14277259
checkpoint-latest/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7f41050d1acaa3ed4a9e37bedc427799f650ec5640cc96982536cba4dbcd081
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afb5dee99676843adef33a180fbff383b90ecfe64b37d99b77a2c819b85ba663
3
  size 14645
checkpoint-latest/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2316a7428e94930457fe075f6934df08a523a4ed19225b8d13b30505b5beee7
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:241a1effee64c67d4a8f871310906ab4404335a122c053b27e3a2c4c5480249a
3
  size 1465
checkpoint-latest/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.5003042221069336,
3
  "best_model_checkpoint": "./t5_checkpoints_full/checkpoint-1000",
4
- "epoch": 1.8819101387908728,
5
  "eval_steps": 1000,
6
- "global_step": 12000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1543,6 +1543,134 @@
1543
  "eval_samples_per_second": 119.615,
1544
  "eval_steps_per_second": 7.484,
1545
  "step": 12000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1546
  }
1547
  ],
1548
  "logging_steps": 50,
@@ -1550,7 +1678,7 @@
1550
  "num_input_tokens_seen": 0,
1551
  "num_train_epochs": 3,
1552
  "save_steps": 1000,
1553
- "total_flos": 2.3592534089977037e+17,
1554
  "train_batch_size": 16,
1555
  "trial_name": null,
1556
  "trial_params": null
 
1
  {
2
  "best_metric": 0.5003042221069336,
3
  "best_model_checkpoint": "./t5_checkpoints_full/checkpoint-1000",
4
+ "epoch": 2.038735983690112,
5
  "eval_steps": 1000,
6
+ "global_step": 13000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1543
  "eval_samples_per_second": 119.615,
1544
  "eval_steps_per_second": 7.484,
1545
  "step": 12000
1546
+ },
1547
+ {
1548
+ "epoch": 1.89,
1549
+ "learning_rate": 0.00011627657290100923,
1550
+ "loss": 1.3686,
1551
+ "step": 12050
1552
+ },
1553
+ {
1554
+ "epoch": 1.9,
1555
+ "learning_rate": 0.00011547133347648699,
1556
+ "loss": 1.3721,
1557
+ "step": 12100
1558
+ },
1559
+ {
1560
+ "epoch": 1.91,
1561
+ "learning_rate": 0.00011466609405196477,
1562
+ "loss": 1.3638,
1563
+ "step": 12150
1564
+ },
1565
+ {
1566
+ "epoch": 1.91,
1567
+ "learning_rate": 0.00011386085462744256,
1568
+ "loss": 1.375,
1569
+ "step": 12200
1570
+ },
1571
+ {
1572
+ "epoch": 1.92,
1573
+ "learning_rate": 0.00011305561520292033,
1574
+ "loss": 1.3774,
1575
+ "step": 12250
1576
+ },
1577
+ {
1578
+ "epoch": 1.93,
1579
+ "learning_rate": 0.00011226648056688854,
1580
+ "loss": 1.3897,
1581
+ "step": 12300
1582
+ },
1583
+ {
1584
+ "epoch": 1.94,
1585
+ "learning_rate": 0.00011146124114236632,
1586
+ "loss": 1.369,
1587
+ "step": 12350
1588
+ },
1589
+ {
1590
+ "epoch": 1.94,
1591
+ "learning_rate": 0.00011089757354520076,
1592
+ "loss": 1.8621,
1593
+ "step": 12400
1594
+ },
1595
+ {
1596
+ "epoch": 1.95,
1597
+ "learning_rate": 0.00011089757354520076,
1598
+ "loss": 0.0,
1599
+ "step": 12450
1600
+ },
1601
+ {
1602
+ "epoch": 1.96,
1603
+ "learning_rate": 0.00011089757354520076,
1604
+ "loss": 0.0,
1605
+ "step": 12500
1606
+ },
1607
+ {
1608
+ "epoch": 1.97,
1609
+ "learning_rate": 0.00011089757354520076,
1610
+ "loss": 0.0,
1611
+ "step": 12550
1612
+ },
1613
+ {
1614
+ "epoch": 1.98,
1615
+ "learning_rate": 0.00011089757354520076,
1616
+ "loss": 0.0,
1617
+ "step": 12600
1618
+ },
1619
+ {
1620
+ "epoch": 1.98,
1621
+ "learning_rate": 0.00011089757354520076,
1622
+ "loss": 0.0,
1623
+ "step": 12650
1624
+ },
1625
+ {
1626
+ "epoch": 1.99,
1627
+ "learning_rate": 0.00011089757354520076,
1628
+ "loss": 0.0,
1629
+ "step": 12700
1630
+ },
1631
+ {
1632
+ "epoch": 2.0,
1633
+ "learning_rate": 0.00011089757354520076,
1634
+ "loss": 0.0,
1635
+ "step": 12750
1636
+ },
1637
+ {
1638
+ "epoch": 2.01,
1639
+ "learning_rate": 0.00011089757354520076,
1640
+ "loss": 0.0,
1641
+ "step": 12800
1642
+ },
1643
+ {
1644
+ "epoch": 2.02,
1645
+ "learning_rate": 0.00011089757354520076,
1646
+ "loss": 0.0,
1647
+ "step": 12850
1648
+ },
1649
+ {
1650
+ "epoch": 2.02,
1651
+ "learning_rate": 0.00011089757354520076,
1652
+ "loss": 0.0,
1653
+ "step": 12900
1654
+ },
1655
+ {
1656
+ "epoch": 2.03,
1657
+ "learning_rate": 0.00011089757354520076,
1658
+ "loss": 0.0,
1659
+ "step": 12950
1660
+ },
1661
+ {
1662
+ "epoch": 2.04,
1663
+ "learning_rate": 0.00011089757354520076,
1664
+ "loss": 0.0,
1665
+ "step": 13000
1666
+ },
1667
+ {
1668
+ "epoch": 2.04,
1669
+ "eval_loss": NaN,
1670
+ "eval_runtime": 93.4731,
1671
+ "eval_samples_per_second": 121.233,
1672
+ "eval_steps_per_second": 7.585,
1673
+ "step": 13000
1674
  }
1675
  ],
1676
  "logging_steps": 50,
 
1678
  "num_input_tokens_seen": 0,
1679
  "num_train_epochs": 3,
1680
  "save_steps": 1000,
1681
+ "total_flos": 2.5558409639205274e+17,
1682
  "train_batch_size": 16,
1683
  "trial_name": null,
1684
  "trial_params": null
checkpoint-latest/training_metrics.json CHANGED
@@ -1,90 +1,90 @@
1
  {
2
- "step": 12000,
3
- "epoch": 1.8819101387908728,
4
  "best_eval_loss": 0.5003042221069336,
5
- "checkpoint_number": 12,
6
  "recent_metrics": [
7
  {
8
- "step": 11600,
9
- "epoch": 1.82,
10
- "loss": 1.3655,
11
- "learning_rate": 0.00012344320377925702,
12
  "gpu_memory_gb": 0.8661794662475586,
13
  "system_memory_percent": 6.9
14
  },
15
  {
16
- "step": 11650,
17
- "epoch": 1.83,
18
- "loss": 1.3702,
19
- "learning_rate": 0.0001226379643547348,
20
  "gpu_memory_gb": 0.8661794662475586,
21
  "system_memory_percent": 6.9
22
  },
23
  {
24
- "step": 11700,
25
- "epoch": 1.83,
26
- "loss": 1.3929,
27
- "learning_rate": 0.00012184882971870302,
28
  "gpu_memory_gb": 0.8661794662475586,
29
  "system_memory_percent": 6.9
30
  },
31
  {
32
- "step": 11750,
33
- "epoch": 1.84,
34
- "loss": 1.3611,
35
- "learning_rate": 0.00012104359029418079,
36
  "gpu_memory_gb": 0.8661794662475586,
37
  "system_memory_percent": 6.9
38
  },
39
  {
40
- "step": 11800,
41
- "epoch": 1.85,
42
- "loss": 1.37,
43
- "learning_rate": 0.00012023835086965857,
44
  "gpu_memory_gb": 0.8661794662475586,
45
- "system_memory_percent": 6.9
46
  },
47
  {
48
- "step": 11850,
49
- "epoch": 1.86,
50
- "loss": 1.4018,
51
- "learning_rate": 0.00011946532102211722,
52
  "gpu_memory_gb": 0.8661794662475586,
53
- "system_memory_percent": 6.9
54
  },
55
  {
56
- "step": 11900,
57
- "epoch": 1.87,
58
- "loss": 1.3757,
59
- "learning_rate": 0.000118660081597595,
60
  "gpu_memory_gb": 0.8661794662475586,
61
  "system_memory_percent": 6.9
62
  },
63
  {
64
- "step": 11950,
65
- "epoch": 1.87,
66
- "loss": 1.3949,
67
- "learning_rate": 0.00011788705175005367,
68
  "gpu_memory_gb": 0.8661794662475586,
69
- "system_memory_percent": 6.9
70
  },
71
  {
72
- "step": 12000,
73
- "epoch": 1.88,
74
- "loss": 1.3671,
75
- "learning_rate": 0.00011708181232553145,
76
  "gpu_memory_gb": 0.8661794662475586,
77
- "system_memory_percent": 6.9
78
  },
79
  {
80
- "step": 12000,
81
- "epoch": 1.88,
82
- "eval_loss": 1.1848528385162354,
83
- "eval_runtime": 94.7376,
84
- "eval_samples_per_second": 119.615,
85
- "eval_steps_per_second": 7.484,
86
  "gpu_memory_gb": 0.8661794662475586,
87
- "system_memory_percent": 6.9
88
  }
89
  ]
90
  }
 
1
  {
2
+ "step": 13000,
3
+ "epoch": 2.038735983690112,
4
  "best_eval_loss": 0.5003042221069336,
5
+ "checkpoint_number": 13,
6
  "recent_metrics": [
7
  {
8
+ "step": 12600,
9
+ "epoch": 1.98,
10
+ "loss": 0.0,
11
+ "learning_rate": 0.00011089757354520076,
12
  "gpu_memory_gb": 0.8661794662475586,
13
  "system_memory_percent": 6.9
14
  },
15
  {
16
+ "step": 12650,
17
+ "epoch": 1.98,
18
+ "loss": 0.0,
19
+ "learning_rate": 0.00011089757354520076,
20
  "gpu_memory_gb": 0.8661794662475586,
21
  "system_memory_percent": 6.9
22
  },
23
  {
24
+ "step": 12700,
25
+ "epoch": 1.99,
26
+ "loss": 0.0,
27
+ "learning_rate": 0.00011089757354520076,
28
  "gpu_memory_gb": 0.8661794662475586,
29
  "system_memory_percent": 6.9
30
  },
31
  {
32
+ "step": 12750,
33
+ "epoch": 2.0,
34
+ "loss": 0.0,
35
+ "learning_rate": 0.00011089757354520076,
36
  "gpu_memory_gb": 0.8661794662475586,
37
  "system_memory_percent": 6.9
38
  },
39
  {
40
+ "step": 12800,
41
+ "epoch": 2.01,
42
+ "loss": 0.0,
43
+ "learning_rate": 0.00011089757354520076,
44
  "gpu_memory_gb": 0.8661794662475586,
45
+ "system_memory_percent": 7.1
46
  },
47
  {
48
+ "step": 12850,
49
+ "epoch": 2.02,
50
+ "loss": 0.0,
51
+ "learning_rate": 0.00011089757354520076,
52
  "gpu_memory_gb": 0.8661794662475586,
53
+ "system_memory_percent": 7.0
54
  },
55
  {
56
+ "step": 12900,
57
+ "epoch": 2.02,
58
+ "loss": 0.0,
59
+ "learning_rate": 0.00011089757354520076,
60
  "gpu_memory_gb": 0.8661794662475586,
61
  "system_memory_percent": 6.9
62
  },
63
  {
64
+ "step": 12950,
65
+ "epoch": 2.03,
66
+ "loss": 0.0,
67
+ "learning_rate": 0.00011089757354520076,
68
  "gpu_memory_gb": 0.8661794662475586,
69
+ "system_memory_percent": 7.0
70
  },
71
  {
72
+ "step": 13000,
73
+ "epoch": 2.04,
74
+ "loss": 0.0,
75
+ "learning_rate": 0.00011089757354520076,
76
  "gpu_memory_gb": 0.8661794662475586,
77
+ "system_memory_percent": 7.0
78
  },
79
  {
80
+ "step": 13000,
81
+ "epoch": 2.04,
82
+ "eval_loss": NaN,
83
+ "eval_runtime": 93.4731,
84
+ "eval_samples_per_second": 121.233,
85
+ "eval_steps_per_second": 7.585,
86
  "gpu_memory_gb": 0.8661794662475586,
87
+ "system_memory_percent": 7.0
88
  }
89
  ]
90
  }