{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998745294855709,
"eval_steps": 100,
"global_step": 398,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.1427878886461258,
"learning_rate": 2e-05,
"loss": 1.3359,
"step": 1
},
{
"epoch": 0.0,
"eval_loss": 1.324710488319397,
"eval_runtime": 82.0652,
"eval_samples_per_second": 31.67,
"eval_steps_per_second": 31.67,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 0.13537771999835968,
"learning_rate": 4e-05,
"loss": 1.2865,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 0.14623422920703888,
"learning_rate": 6e-05,
"loss": 1.3192,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 0.15388095378875732,
"learning_rate": 8e-05,
"loss": 1.3244,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.1628686636686325,
"learning_rate": 0.0001,
"loss": 1.3,
"step": 5
},
{
"epoch": 0.02,
"grad_norm": 0.20623335242271423,
"learning_rate": 0.00012,
"loss": 1.244,
"step": 6
},
{
"epoch": 0.02,
"grad_norm": 0.1510678231716156,
"learning_rate": 0.00014,
"loss": 1.2799,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 0.15237094461917877,
"learning_rate": 0.00016,
"loss": 1.2979,
"step": 8
},
{
"epoch": 0.02,
"grad_norm": 0.15166334807872772,
"learning_rate": 0.00018,
"loss": 1.28,
"step": 9
},
{
"epoch": 0.03,
"grad_norm": 0.17794868350028992,
"learning_rate": 0.0002,
"loss": 1.2298,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 0.25811436772346497,
"learning_rate": 0.0001999998028228211,
"loss": 1.1909,
"step": 11
},
{
"epoch": 0.03,
"grad_norm": 0.19142530858516693,
"learning_rate": 0.000199999211292062,
"loss": 1.178,
"step": 12
},
{
"epoch": 0.03,
"grad_norm": 0.1891462802886963,
"learning_rate": 0.00019999822541005537,
"loss": 1.1173,
"step": 13
},
{
"epoch": 0.04,
"grad_norm": 0.17077742516994476,
"learning_rate": 0.00019999684518068916,
"loss": 1.2092,
"step": 14
},
{
"epoch": 0.04,
"grad_norm": 0.15135815739631653,
"learning_rate": 0.00019999507060940625,
"loss": 1.1439,
"step": 15
},
{
"epoch": 0.04,
"grad_norm": 0.1767009049654007,
"learning_rate": 0.00019999290170320485,
"loss": 1.1408,
"step": 16
},
{
"epoch": 0.04,
"grad_norm": 0.1310850977897644,
"learning_rate": 0.00019999033847063811,
"loss": 1.2369,
"step": 17
},
{
"epoch": 0.05,
"grad_norm": 0.12432192265987396,
"learning_rate": 0.00019998738092181421,
"loss": 1.152,
"step": 18
},
{
"epoch": 0.05,
"grad_norm": 0.12430022656917572,
"learning_rate": 0.00019998402906839643,
"loss": 1.2111,
"step": 19
},
{
"epoch": 0.05,
"grad_norm": 0.12175025045871735,
"learning_rate": 0.00019998028292360286,
"loss": 1.1686,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 0.11878372728824615,
"learning_rate": 0.0001999761425022067,
"loss": 1.2452,
"step": 21
},
{
"epoch": 0.06,
"grad_norm": 0.11329779773950577,
"learning_rate": 0.00019997160782053578,
"loss": 1.0964,
"step": 22
},
{
"epoch": 0.06,
"grad_norm": 0.11987729370594025,
"learning_rate": 0.00019996667889647288,
"loss": 1.1809,
"step": 23
},
{
"epoch": 0.06,
"grad_norm": 0.12245086580514908,
"learning_rate": 0.00019996135574945544,
"loss": 1.1138,
"step": 24
},
{
"epoch": 0.06,
"grad_norm": 0.1399640142917633,
"learning_rate": 0.00019995563840047542,
"loss": 1.184,
"step": 25
},
{
"epoch": 0.07,
"grad_norm": 0.13597123324871063,
"learning_rate": 0.00019994952687207954,
"loss": 1.1872,
"step": 26
},
{
"epoch": 0.07,
"grad_norm": 0.13976556062698364,
"learning_rate": 0.00019994302118836883,
"loss": 1.1685,
"step": 27
},
{
"epoch": 0.07,
"grad_norm": 0.13106240332126617,
"learning_rate": 0.00019993612137499876,
"loss": 1.1872,
"step": 28
},
{
"epoch": 0.07,
"grad_norm": 0.12896399199962616,
"learning_rate": 0.00019992882745917902,
"loss": 1.1462,
"step": 29
},
{
"epoch": 0.08,
"grad_norm": 0.13873620331287384,
"learning_rate": 0.00019992113946967353,
"loss": 1.1742,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 0.14103546738624573,
"learning_rate": 0.00019991305743680013,
"loss": 1.1245,
"step": 31
},
{
"epoch": 0.08,
"grad_norm": 0.1377720981836319,
"learning_rate": 0.00019990458139243077,
"loss": 1.2045,
"step": 32
},
{
"epoch": 0.08,
"grad_norm": 0.13191157579421997,
"learning_rate": 0.000199895711369991,
"loss": 1.1716,
"step": 33
},
{
"epoch": 0.09,
"grad_norm": 0.13426551222801208,
"learning_rate": 0.00019988644740446022,
"loss": 1.1382,
"step": 34
},
{
"epoch": 0.09,
"grad_norm": 0.13733097910881042,
"learning_rate": 0.00019987678953237127,
"loss": 1.1677,
"step": 35
},
{
"epoch": 0.09,
"grad_norm": 0.12618272006511688,
"learning_rate": 0.00019986673779181033,
"loss": 1.2195,
"step": 36
},
{
"epoch": 0.09,
"grad_norm": 0.13636991381645203,
"learning_rate": 0.00019985629222241694,
"loss": 1.1577,
"step": 37
},
{
"epoch": 0.1,
"grad_norm": 0.13234035670757294,
"learning_rate": 0.0001998454528653836,
"loss": 1.1089,
"step": 38
},
{
"epoch": 0.1,
"grad_norm": 0.1395445317029953,
"learning_rate": 0.00019983421976345586,
"loss": 1.139,
"step": 39
},
{
"epoch": 0.1,
"grad_norm": 0.1284484714269638,
"learning_rate": 0.0001998225929609319,
"loss": 1.117,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 0.13304275274276733,
"learning_rate": 0.00019981057250366253,
"loss": 1.161,
"step": 41
},
{
"epoch": 0.11,
"grad_norm": 0.13184913992881775,
"learning_rate": 0.00019979815843905097,
"loss": 1.1826,
"step": 42
},
{
"epoch": 0.11,
"grad_norm": 0.12830235064029694,
"learning_rate": 0.0001997853508160526,
"loss": 1.0739,
"step": 43
},
{
"epoch": 0.11,
"grad_norm": 0.1346379965543747,
"learning_rate": 0.0001997721496851748,
"loss": 1.191,
"step": 44
},
{
"epoch": 0.11,
"grad_norm": 0.13036642968654633,
"learning_rate": 0.00019975855509847686,
"loss": 1.1361,
"step": 45
},
{
"epoch": 0.12,
"grad_norm": 0.12707848846912384,
"learning_rate": 0.00019974456710956964,
"loss": 1.101,
"step": 46
},
{
"epoch": 0.12,
"grad_norm": 0.12984970211982727,
"learning_rate": 0.00019973018577361536,
"loss": 1.1085,
"step": 47
},
{
"epoch": 0.12,
"grad_norm": 0.12627972662448883,
"learning_rate": 0.00019971541114732741,
"loss": 1.1607,
"step": 48
},
{
"epoch": 0.12,
"grad_norm": 0.13074152171611786,
"learning_rate": 0.00019970024328897022,
"loss": 1.1004,
"step": 49
},
{
"epoch": 0.13,
"grad_norm": 0.1309152990579605,
"learning_rate": 0.0001996846822583589,
"loss": 1.1378,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 0.1303664743900299,
"learning_rate": 0.000199668728116859,
"loss": 1.0956,
"step": 51
},
{
"epoch": 0.13,
"grad_norm": 0.13290388882160187,
"learning_rate": 0.00019965238092738643,
"loss": 1.1264,
"step": 52
},
{
"epoch": 0.13,
"grad_norm": 0.12805409729480743,
"learning_rate": 0.00019963564075440703,
"loss": 1.183,
"step": 53
},
{
"epoch": 0.14,
"grad_norm": 0.1399564892053604,
"learning_rate": 0.0001996185076639364,
"loss": 1.1102,
"step": 54
},
{
"epoch": 0.14,
"grad_norm": 0.12978173792362213,
"learning_rate": 0.00019960098172353962,
"loss": 1.1634,
"step": 55
},
{
"epoch": 0.14,
"grad_norm": 0.13925811648368835,
"learning_rate": 0.00019958306300233098,
"loss": 1.0636,
"step": 56
},
{
"epoch": 0.14,
"grad_norm": 0.13258852064609528,
"learning_rate": 0.00019956475157097378,
"loss": 1.1428,
"step": 57
},
{
"epoch": 0.15,
"grad_norm": 0.1285356879234314,
"learning_rate": 0.00019954604750167993,
"loss": 1.1664,
"step": 58
},
{
"epoch": 0.15,
"grad_norm": 0.1321210116147995,
"learning_rate": 0.00019952695086820975,
"loss": 1.1419,
"step": 59
},
{
"epoch": 0.15,
"grad_norm": 0.14086973667144775,
"learning_rate": 0.00019950746174587163,
"loss": 1.1827,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 0.1311366856098175,
"learning_rate": 0.0001994875802115218,
"loss": 1.1971,
"step": 61
},
{
"epoch": 0.16,
"grad_norm": 0.14063993096351624,
"learning_rate": 0.0001994673063435639,
"loss": 1.1945,
"step": 62
},
{
"epoch": 0.16,
"grad_norm": 0.12695981562137604,
"learning_rate": 0.00019944664022194885,
"loss": 1.0385,
"step": 63
},
{
"epoch": 0.16,
"grad_norm": 0.14170674979686737,
"learning_rate": 0.0001994255819281744,
"loss": 1.0883,
"step": 64
},
{
"epoch": 0.16,
"grad_norm": 0.13162197172641754,
"learning_rate": 0.0001994041315452849,
"loss": 1.153,
"step": 65
},
{
"epoch": 0.17,
"grad_norm": 0.1326906979084015,
"learning_rate": 0.0001993822891578708,
"loss": 1.1186,
"step": 66
},
{
"epoch": 0.17,
"grad_norm": 0.13306689262390137,
"learning_rate": 0.00019936005485206851,
"loss": 1.1587,
"step": 67
},
{
"epoch": 0.17,
"grad_norm": 0.13625258207321167,
"learning_rate": 0.00019933742871556,
"loss": 1.1339,
"step": 68
},
{
"epoch": 0.17,
"grad_norm": 0.13773800432682037,
"learning_rate": 0.00019931441083757245,
"loss": 1.1944,
"step": 69
},
{
"epoch": 0.18,
"grad_norm": 0.15291447937488556,
"learning_rate": 0.00019929100130887782,
"loss": 1.1028,
"step": 70
},
{
"epoch": 0.18,
"grad_norm": 0.15140767395496368,
"learning_rate": 0.0001992672002217926,
"loss": 1.1896,
"step": 71
},
{
"epoch": 0.18,
"grad_norm": 0.1344233751296997,
"learning_rate": 0.0001992430076701775,
"loss": 1.0561,
"step": 72
},
{
"epoch": 0.18,
"grad_norm": 0.13877920806407928,
"learning_rate": 0.0001992184237494368,
"loss": 1.1108,
"step": 73
},
{
"epoch": 0.19,
"grad_norm": 0.1359027922153473,
"learning_rate": 0.00019919344855651833,
"loss": 1.1563,
"step": 74
},
{
"epoch": 0.19,
"grad_norm": 0.14610135555267334,
"learning_rate": 0.0001991680821899128,
"loss": 1.1299,
"step": 75
},
{
"epoch": 0.19,
"grad_norm": 0.14259958267211914,
"learning_rate": 0.00019914232474965365,
"loss": 1.1021,
"step": 76
},
{
"epoch": 0.19,
"grad_norm": 0.14158602058887482,
"learning_rate": 0.00019911617633731638,
"loss": 1.0787,
"step": 77
},
{
"epoch": 0.2,
"grad_norm": 0.1418074518442154,
"learning_rate": 0.00019908963705601846,
"loss": 1.1359,
"step": 78
},
{
"epoch": 0.2,
"grad_norm": 0.12850767374038696,
"learning_rate": 0.0001990627070104187,
"loss": 1.1373,
"step": 79
},
{
"epoch": 0.2,
"grad_norm": 0.1312914341688156,
"learning_rate": 0.0001990353863067169,
"loss": 1.0832,
"step": 80
},
{
"epoch": 0.2,
"grad_norm": 0.13280583918094635,
"learning_rate": 0.0001990076750526534,
"loss": 1.0462,
"step": 81
},
{
"epoch": 0.21,
"grad_norm": 0.13617292046546936,
"learning_rate": 0.00019897957335750878,
"loss": 1.1059,
"step": 82
},
{
"epoch": 0.21,
"grad_norm": 0.15030132234096527,
"learning_rate": 0.00019895108133210335,
"loss": 1.0761,
"step": 83
},
{
"epoch": 0.21,
"grad_norm": 0.14291270077228546,
"learning_rate": 0.00019892219908879653,
"loss": 1.1217,
"step": 84
},
{
"epoch": 0.21,
"grad_norm": 0.1685461699962616,
"learning_rate": 0.00019889292674148682,
"loss": 1.1607,
"step": 85
},
{
"epoch": 0.22,
"grad_norm": 0.13756121695041656,
"learning_rate": 0.00019886326440561093,
"loss": 1.0914,
"step": 86
},
{
"epoch": 0.22,
"grad_norm": 0.13901358842849731,
"learning_rate": 0.0001988332121981436,
"loss": 1.1234,
"step": 87
},
{
"epoch": 0.22,
"grad_norm": 0.13816247880458832,
"learning_rate": 0.00019880277023759702,
"loss": 1.1583,
"step": 88
},
{
"epoch": 0.22,
"grad_norm": 0.13309679925441742,
"learning_rate": 0.00019877193864402038,
"loss": 1.163,
"step": 89
},
{
"epoch": 0.23,
"grad_norm": 0.13356180489063263,
"learning_rate": 0.0001987407175389994,
"loss": 1.1301,
"step": 90
},
{
"epoch": 0.23,
"grad_norm": 0.1388397067785263,
"learning_rate": 0.00019870910704565588,
"loss": 1.1326,
"step": 91
},
{
"epoch": 0.23,
"grad_norm": 0.13303454220294952,
"learning_rate": 0.0001986771072886472,
"loss": 1.0779,
"step": 92
},
{
"epoch": 0.23,
"grad_norm": 0.1316283941268921,
"learning_rate": 0.00019864471839416576,
"loss": 1.0935,
"step": 93
},
{
"epoch": 0.24,
"grad_norm": 0.1348309963941574,
"learning_rate": 0.00019861194048993863,
"loss": 1.1816,
"step": 94
},
{
"epoch": 0.24,
"grad_norm": 0.1341564655303955,
"learning_rate": 0.00019857877370522685,
"loss": 1.1187,
"step": 95
},
{
"epoch": 0.24,
"grad_norm": 0.13689687848091125,
"learning_rate": 0.0001985452181708251,
"loss": 1.1637,
"step": 96
},
{
"epoch": 0.24,
"grad_norm": 0.13348707556724548,
"learning_rate": 0.0001985112740190611,
"loss": 1.1026,
"step": 97
},
{
"epoch": 0.25,
"grad_norm": 0.13700643181800842,
"learning_rate": 0.00019847694138379506,
"loss": 1.1508,
"step": 98
},
{
"epoch": 0.25,
"grad_norm": 0.13654476404190063,
"learning_rate": 0.00019844222040041928,
"loss": 1.1668,
"step": 99
},
{
"epoch": 0.25,
"grad_norm": 0.15331624448299408,
"learning_rate": 0.0001984071112058574,
"loss": 1.1121,
"step": 100
},
{
"epoch": 0.25,
"eval_loss": 1.1294280290603638,
"eval_runtime": 81.6595,
"eval_samples_per_second": 31.827,
"eval_steps_per_second": 31.827,
"step": 100
},
{
"epoch": 0.25,
"grad_norm": 0.14425526559352875,
"learning_rate": 0.0001983716139385641,
"loss": 1.1447,
"step": 101
},
{
"epoch": 0.26,
"grad_norm": 0.13741208612918854,
"learning_rate": 0.00019833572873852444,
"loss": 1.1001,
"step": 102
},
{
"epoch": 0.26,
"grad_norm": 0.1282232254743576,
"learning_rate": 0.0001982994557472532,
"loss": 1.1199,
"step": 103
},
{
"epoch": 0.26,
"grad_norm": 0.13605354726314545,
"learning_rate": 0.00019826279510779454,
"loss": 1.154,
"step": 104
},
{
"epoch": 0.26,
"grad_norm": 0.13503985106945038,
"learning_rate": 0.00019822574696472126,
"loss": 1.0565,
"step": 105
},
{
"epoch": 0.27,
"grad_norm": 0.13878273963928223,
"learning_rate": 0.00019818831146413434,
"loss": 1.106,
"step": 106
},
{
"epoch": 0.27,
"grad_norm": 0.141740083694458,
"learning_rate": 0.00019815048875366234,
"loss": 1.0848,
"step": 107
},
{
"epoch": 0.27,
"grad_norm": 0.13799507915973663,
"learning_rate": 0.0001981122789824607,
"loss": 1.1582,
"step": 108
},
{
"epoch": 0.27,
"grad_norm": 0.1441466212272644,
"learning_rate": 0.0001980736823012114,
"loss": 1.0787,
"step": 109
},
{
"epoch": 0.28,
"grad_norm": 0.1377534121274948,
"learning_rate": 0.0001980346988621221,
"loss": 1.1092,
"step": 110
},
{
"epoch": 0.28,
"grad_norm": 0.1400901973247528,
"learning_rate": 0.00019799532881892564,
"loss": 1.0549,
"step": 111
},
{
"epoch": 0.28,
"grad_norm": 0.13621239364147186,
"learning_rate": 0.00019795557232687956,
"loss": 1.0991,
"step": 112
},
{
"epoch": 0.28,
"grad_norm": 0.1324262171983719,
"learning_rate": 0.0001979154295427653,
"loss": 1.0583,
"step": 113
},
{
"epoch": 0.29,
"grad_norm": 0.13273654878139496,
"learning_rate": 0.0001978749006248877,
"loss": 1.1504,
"step": 114
},
{
"epoch": 0.29,
"grad_norm": 0.14279481768608093,
"learning_rate": 0.00019783398573307428,
"loss": 1.0941,
"step": 115
},
{
"epoch": 0.29,
"grad_norm": 0.1432316154241562,
"learning_rate": 0.00019779268502867473,
"loss": 1.1111,
"step": 116
},
{
"epoch": 0.29,
"grad_norm": 0.14505276083946228,
"learning_rate": 0.00019775099867456013,
"loss": 1.0941,
"step": 117
},
{
"epoch": 0.3,
"grad_norm": 0.13935014605522156,
"learning_rate": 0.0001977089268351225,
"loss": 1.0597,
"step": 118
},
{
"epoch": 0.3,
"grad_norm": 0.14532430469989777,
"learning_rate": 0.0001976664696762739,
"loss": 1.1116,
"step": 119
},
{
"epoch": 0.3,
"grad_norm": 0.14096760749816895,
"learning_rate": 0.00019762362736544607,
"loss": 1.1381,
"step": 120
},
{
"epoch": 0.3,
"grad_norm": 0.1470746099948883,
"learning_rate": 0.00019758040007158948,
"loss": 1.1215,
"step": 121
},
{
"epoch": 0.31,
"grad_norm": 0.13610850274562836,
"learning_rate": 0.00019753678796517282,
"loss": 1.136,
"step": 122
},
{
"epoch": 0.31,
"grad_norm": 0.1399529129266739,
"learning_rate": 0.00019749279121818235,
"loss": 1.1035,
"step": 123
},
{
"epoch": 0.31,
"grad_norm": 0.13626012206077576,
"learning_rate": 0.00019744841000412123,
"loss": 1.1248,
"step": 124
},
{
"epoch": 0.31,
"grad_norm": 0.13053762912750244,
"learning_rate": 0.0001974036444980086,
"loss": 1.1286,
"step": 125
},
{
"epoch": 0.32,
"grad_norm": 0.14427675306797028,
"learning_rate": 0.00019735849487637929,
"loss": 1.2792,
"step": 126
},
{
"epoch": 0.32,
"grad_norm": 0.14464688301086426,
"learning_rate": 0.0001973129613172827,
"loss": 1.1091,
"step": 127
},
{
"epoch": 0.32,
"grad_norm": 0.12712322175502777,
"learning_rate": 0.0001972670440002825,
"loss": 1.1219,
"step": 128
},
{
"epoch": 0.32,
"grad_norm": 0.13343971967697144,
"learning_rate": 0.00019722074310645553,
"loss": 1.1296,
"step": 129
},
{
"epoch": 0.33,
"grad_norm": 0.15525247156620026,
"learning_rate": 0.00019717405881839145,
"loss": 1.159,
"step": 130
},
{
"epoch": 0.33,
"grad_norm": 0.12908953428268433,
"learning_rate": 0.0001971269913201918,
"loss": 1.0821,
"step": 131
},
{
"epoch": 0.33,
"grad_norm": 0.24165280163288116,
"learning_rate": 0.00019707954079746927,
"loss": 1.1388,
"step": 132
},
{
"epoch": 0.33,
"grad_norm": 0.1432817280292511,
"learning_rate": 0.00019703170743734706,
"loss": 1.1184,
"step": 133
},
{
"epoch": 0.34,
"grad_norm": 0.14007362723350525,
"learning_rate": 0.00019698349142845814,
"loss": 1.1576,
"step": 134
},
{
"epoch": 0.34,
"grad_norm": 0.14235983788967133,
"learning_rate": 0.00019693489296094443,
"loss": 1.0847,
"step": 135
},
{
"epoch": 0.34,
"grad_norm": 0.1430092453956604,
"learning_rate": 0.00019688591222645607,
"loss": 1.1562,
"step": 136
},
{
"epoch": 0.34,
"grad_norm": 0.13986627757549286,
"learning_rate": 0.00019683654941815077,
"loss": 1.124,
"step": 137
},
{
"epoch": 0.35,
"grad_norm": 0.13933469355106354,
"learning_rate": 0.00019678680473069293,
"loss": 1.1001,
"step": 138
},
{
"epoch": 0.35,
"grad_norm": 0.13476844131946564,
"learning_rate": 0.00019673667836025283,
"loss": 1.1186,
"step": 139
},
{
"epoch": 0.35,
"grad_norm": 0.13418316841125488,
"learning_rate": 0.00019668617050450603,
"loss": 1.1309,
"step": 140
},
{
"epoch": 0.35,
"grad_norm": 0.12794847786426544,
"learning_rate": 0.00019663528136263246,
"loss": 1.1142,
"step": 141
},
{
"epoch": 0.36,
"grad_norm": 0.1326293647289276,
"learning_rate": 0.00019658401113531565,
"loss": 1.0503,
"step": 142
},
{
"epoch": 0.36,
"grad_norm": 0.14793147146701813,
"learning_rate": 0.000196532360024742,
"loss": 1.2104,
"step": 143
},
{
"epoch": 0.36,
"grad_norm": 0.13718444108963013,
"learning_rate": 0.00019648032823459994,
"loss": 1.1685,
"step": 144
},
{
"epoch": 0.36,
"grad_norm": 0.14404018223285675,
"learning_rate": 0.00019642791597007902,
"loss": 1.09,
"step": 145
},
{
"epoch": 0.37,
"grad_norm": 0.14241506159305573,
"learning_rate": 0.00019637512343786937,
"loss": 1.1355,
"step": 146
},
{
"epoch": 0.37,
"grad_norm": 0.14581352472305298,
"learning_rate": 0.00019632195084616063,
"loss": 1.1005,
"step": 147
},
{
"epoch": 0.37,
"grad_norm": 0.14792676270008087,
"learning_rate": 0.00019626839840464119,
"loss": 1.1168,
"step": 148
},
{
"epoch": 0.37,
"grad_norm": 0.1484677940607071,
"learning_rate": 0.00019621446632449744,
"loss": 1.1138,
"step": 149
},
{
"epoch": 0.38,
"grad_norm": 0.15315671265125275,
"learning_rate": 0.0001961601548184129,
"loss": 1.1636,
"step": 150
},
{
"epoch": 0.38,
"grad_norm": 0.14746810495853424,
"learning_rate": 0.0001961054641005674,
"loss": 1.0881,
"step": 151
},
{
"epoch": 0.38,
"grad_norm": 0.1407732516527176,
"learning_rate": 0.00019605039438663614,
"loss": 1.0347,
"step": 152
},
{
"epoch": 0.38,
"grad_norm": 0.14150719344615936,
"learning_rate": 0.0001959949458937889,
"loss": 1.1112,
"step": 153
},
{
"epoch": 0.39,
"grad_norm": 0.16782569885253906,
"learning_rate": 0.0001959391188406893,
"loss": 1.0496,
"step": 154
},
{
"epoch": 0.39,
"grad_norm": 0.1452791690826416,
"learning_rate": 0.0001958829134474937,
"loss": 1.1185,
"step": 155
},
{
"epoch": 0.39,
"grad_norm": 0.145284965634346,
"learning_rate": 0.00019582632993585052,
"loss": 1.1431,
"step": 156
},
{
"epoch": 0.39,
"grad_norm": 0.15500612556934357,
"learning_rate": 0.00019576936852889936,
"loss": 1.1679,
"step": 157
},
{
"epoch": 0.4,
"grad_norm": 0.1416521966457367,
"learning_rate": 0.00019571202945126994,
"loss": 1.1322,
"step": 158
},
{
"epoch": 0.4,
"grad_norm": 0.1465340405702591,
"learning_rate": 0.00019565431292908146,
"loss": 1.0693,
"step": 159
},
{
"epoch": 0.4,
"grad_norm": 0.13601765036582947,
"learning_rate": 0.0001955962191899415,
"loss": 1.0676,
"step": 160
},
{
"epoch": 0.4,
"grad_norm": 0.14759162068367004,
"learning_rate": 0.0001955377484629453,
"loss": 1.0506,
"step": 161
},
{
"epoch": 0.41,
"grad_norm": 0.14839032292366028,
"learning_rate": 0.00019547890097867468,
"loss": 1.1245,
"step": 162
},
{
"epoch": 0.41,
"grad_norm": 0.1440214365720749,
"learning_rate": 0.0001954196769691973,
"loss": 1.1672,
"step": 163
},
{
"epoch": 0.41,
"grad_norm": 0.1372719258069992,
"learning_rate": 0.00019536007666806556,
"loss": 1.1084,
"step": 164
},
{
"epoch": 0.41,
"grad_norm": 0.14372558891773224,
"learning_rate": 0.00019530010031031586,
"loss": 1.1679,
"step": 165
},
{
"epoch": 0.42,
"grad_norm": 0.13789264857769012,
"learning_rate": 0.00019523974813246767,
"loss": 1.1253,
"step": 166
},
{
"epoch": 0.42,
"grad_norm": 0.14368915557861328,
"learning_rate": 0.0001951790203725223,
"loss": 1.085,
"step": 167
},
{
"epoch": 0.42,
"grad_norm": 0.1380469799041748,
"learning_rate": 0.00019511791726996243,
"loss": 1.1379,
"step": 168
},
{
"epoch": 0.42,
"grad_norm": 0.13288158178329468,
"learning_rate": 0.00019505643906575073,
"loss": 1.113,
"step": 169
},
{
"epoch": 0.43,
"grad_norm": 0.1390606164932251,
"learning_rate": 0.0001949945860023292,
"loss": 1.095,
"step": 170
},
{
"epoch": 0.43,
"grad_norm": 0.14271940290927887,
"learning_rate": 0.0001949323583236181,
"loss": 1.1063,
"step": 171
},
{
"epoch": 0.43,
"grad_norm": 0.13795693218708038,
"learning_rate": 0.00019486975627501502,
"loss": 1.0628,
"step": 172
},
{
"epoch": 0.43,
"grad_norm": 0.14073535799980164,
"learning_rate": 0.0001948067801033938,
"loss": 1.1192,
"step": 173
},
{
"epoch": 0.44,
"grad_norm": 0.138822540640831,
"learning_rate": 0.0001947434300571038,
"loss": 1.1299,
"step": 174
},
{
"epoch": 0.44,
"grad_norm": 0.13592712581157684,
"learning_rate": 0.0001946797063859686,
"loss": 1.0868,
"step": 175
},
{
"epoch": 0.44,
"grad_norm": 0.1379610300064087,
"learning_rate": 0.00019461560934128533,
"loss": 1.069,
"step": 176
},
{
"epoch": 0.44,
"grad_norm": 0.14286787807941437,
"learning_rate": 0.00019455113917582346,
"loss": 1.139,
"step": 177
},
{
"epoch": 0.45,
"grad_norm": 0.14168201386928558,
"learning_rate": 0.0001944862961438239,
"loss": 1.1405,
"step": 178
},
{
"epoch": 0.45,
"grad_norm": 0.1345077008008957,
"learning_rate": 0.000194421080500998,
"loss": 1.1039,
"step": 179
},
{
"epoch": 0.45,
"grad_norm": 0.1363426297903061,
"learning_rate": 0.00019435549250452645,
"loss": 1.1056,
"step": 180
},
{
"epoch": 0.45,
"grad_norm": 0.14109478890895844,
"learning_rate": 0.00019428953241305838,
"loss": 1.0927,
"step": 181
},
{
"epoch": 0.46,
"grad_norm": 0.14332321286201477,
"learning_rate": 0.0001942232004867103,
"loss": 1.0305,
"step": 182
},
{
"epoch": 0.46,
"grad_norm": 0.15956294536590576,
"learning_rate": 0.00019415649698706507,
"loss": 1.1245,
"step": 183
},
{
"epoch": 0.46,
"grad_norm": 0.14164718985557556,
"learning_rate": 0.0001940894221771708,
"loss": 1.0963,
"step": 184
},
{
"epoch": 0.46,
"grad_norm": 0.14296875894069672,
"learning_rate": 0.00019402197632153992,
"loss": 1.0853,
"step": 185
},
{
"epoch": 0.47,
"grad_norm": 0.12994709610939026,
"learning_rate": 0.00019395415968614813,
"loss": 1.0503,
"step": 186
},
{
"epoch": 0.47,
"grad_norm": 0.1399766504764557,
"learning_rate": 0.00019388597253843334,
"loss": 1.0623,
"step": 187
},
{
"epoch": 0.47,
"grad_norm": 0.14874404668807983,
"learning_rate": 0.00019381741514729443,
"loss": 1.0885,
"step": 188
},
{
"epoch": 0.47,
"grad_norm": 0.1453857719898224,
"learning_rate": 0.00019374848778309055,
"loss": 1.1702,
"step": 189
},
{
"epoch": 0.48,
"grad_norm": 0.14976643025875092,
"learning_rate": 0.0001936791907176397,
"loss": 1.0834,
"step": 190
},
{
"epoch": 0.48,
"grad_norm": 0.1418897956609726,
"learning_rate": 0.00019360952422421793,
"loss": 1.0918,
"step": 191
},
{
"epoch": 0.48,
"grad_norm": 0.14602817595005035,
"learning_rate": 0.00019353948857755803,
"loss": 1.0825,
"step": 192
},
{
"epoch": 0.48,
"grad_norm": 0.14669157564640045,
"learning_rate": 0.00019346908405384867,
"loss": 1.0973,
"step": 193
},
{
"epoch": 0.49,
"grad_norm": 0.14327263832092285,
"learning_rate": 0.00019339831093073318,
"loss": 1.1191,
"step": 194
},
{
"epoch": 0.49,
"grad_norm": 0.13806897401809692,
"learning_rate": 0.0001933271694873084,
"loss": 1.1504,
"step": 195
},
{
"epoch": 0.49,
"grad_norm": 0.13992969691753387,
"learning_rate": 0.00019325566000412376,
"loss": 1.0865,
"step": 196
},
{
"epoch": 0.49,
"grad_norm": 0.14395759999752045,
"learning_rate": 0.00019318378276318,
"loss": 1.1204,
"step": 197
},
{
"epoch": 0.5,
"grad_norm": 0.1409691572189331,
"learning_rate": 0.0001931115380479281,
"loss": 1.0766,
"step": 198
},
{
"epoch": 0.5,
"grad_norm": 0.1448824405670166,
"learning_rate": 0.00019303892614326836,
"loss": 1.1741,
"step": 199
},
{
"epoch": 0.5,
"grad_norm": 0.142364963889122,
"learning_rate": 0.00019296594733554892,
"loss": 1.1716,
"step": 200
},
{
"epoch": 0.5,
"eval_loss": 1.109603762626648,
"eval_runtime": 81.7249,
"eval_samples_per_second": 31.802,
"eval_steps_per_second": 31.802,
"step": 200
},
{
"epoch": 0.5,
"grad_norm": 0.1372615098953247,
"learning_rate": 0.00019289260191256483,
"loss": 1.1084,
"step": 201
},
{
"epoch": 0.51,
"grad_norm": 0.13863563537597656,
"learning_rate": 0.0001928188901635571,
"loss": 1.0546,
"step": 202
},
{
"epoch": 0.51,
"grad_norm": 0.13055531680583954,
"learning_rate": 0.00019274481237921114,
"loss": 1.018,
"step": 203
},
{
"epoch": 0.51,
"grad_norm": 0.14135099947452545,
"learning_rate": 0.00019267036885165588,
"loss": 1.1131,
"step": 204
},
{
"epoch": 0.51,
"grad_norm": 0.14308464527130127,
"learning_rate": 0.0001925955598744627,
"loss": 1.0723,
"step": 205
},
{
"epoch": 0.52,
"grad_norm": 0.13907764852046967,
"learning_rate": 0.00019252038574264405,
"loss": 1.1607,
"step": 206
},
{
"epoch": 0.52,
"grad_norm": 0.13771073520183563,
"learning_rate": 0.00019244484675265232,
"loss": 1.172,
"step": 207
},
{
"epoch": 0.52,
"grad_norm": 0.13774815201759338,
"learning_rate": 0.00019236894320237894,
"loss": 1.0622,
"step": 208
},
{
"epoch": 0.52,
"grad_norm": 0.1426474153995514,
"learning_rate": 0.0001922926753911527,
"loss": 1.0368,
"step": 209
},
{
"epoch": 0.53,
"grad_norm": 0.1380661278963089,
"learning_rate": 0.00019221604361973919,
"loss": 1.0873,
"step": 210
},
{
"epoch": 0.53,
"grad_norm": 0.14044702053070068,
"learning_rate": 0.00019213904819033903,
"loss": 1.0901,
"step": 211
},
{
"epoch": 0.53,
"grad_norm": 0.1415887176990509,
"learning_rate": 0.00019206168940658712,
"loss": 1.1061,
"step": 212
},
{
"epoch": 0.53,
"grad_norm": 0.1580592840909958,
"learning_rate": 0.00019198396757355118,
"loss": 1.1073,
"step": 213
},
{
"epoch": 0.54,
"grad_norm": 0.14094668626785278,
"learning_rate": 0.00019190588299773062,
"loss": 1.1781,
"step": 214
},
{
"epoch": 0.54,
"grad_norm": 0.14229640364646912,
"learning_rate": 0.00019182743598705542,
"loss": 1.1095,
"step": 215
},
{
"epoch": 0.54,
"grad_norm": 0.140314981341362,
"learning_rate": 0.00019174862685088472,
"loss": 1.1534,
"step": 216
},
{
"epoch": 0.54,
"grad_norm": 0.160028338432312,
"learning_rate": 0.00019166945590000584,
"loss": 1.087,
"step": 217
},
{
"epoch": 0.55,
"grad_norm": 0.14278572797775269,
"learning_rate": 0.0001915899234466328,
"loss": 1.1583,
"step": 218
},
{
"epoch": 0.55,
"grad_norm": 0.13695856928825378,
"learning_rate": 0.0001915100298044054,
"loss": 1.1151,
"step": 219
},
{
"epoch": 0.55,
"grad_norm": 0.14235751330852509,
"learning_rate": 0.00019142977528838762,
"loss": 1.1111,
"step": 220
},
{
"epoch": 0.55,
"grad_norm": 0.15174664556980133,
"learning_rate": 0.00019134916021506666,
"loss": 1.1438,
"step": 221
},
{
"epoch": 0.56,
"grad_norm": 0.15249325335025787,
"learning_rate": 0.0001912681849023516,
"loss": 1.1575,
"step": 222
},
{
"epoch": 0.56,
"grad_norm": 0.14303787052631378,
"learning_rate": 0.00019118684966957207,
"loss": 1.1302,
"step": 223
},
{
"epoch": 0.56,
"grad_norm": 0.1405183970928192,
"learning_rate": 0.00019110515483747716,
"loss": 1.1157,
"step": 224
},
{
"epoch": 0.56,
"grad_norm": 0.1475205421447754,
"learning_rate": 0.00019102310072823393,
"loss": 1.1175,
"step": 225
},
{
"epoch": 0.57,
"grad_norm": 0.14406634867191315,
"learning_rate": 0.0001909406876654264,
"loss": 1.0578,
"step": 226
},
{
"epoch": 0.57,
"grad_norm": 0.13999773561954498,
"learning_rate": 0.00019085791597405404,
"loss": 1.0865,
"step": 227
},
{
"epoch": 0.57,
"grad_norm": 0.1409848928451538,
"learning_rate": 0.00019077478598053063,
"loss": 1.1297,
"step": 228
},
{
"epoch": 0.57,
"grad_norm": 0.14548417925834656,
"learning_rate": 0.00019069129801268294,
"loss": 1.1524,
"step": 229
},
{
"epoch": 0.58,
"grad_norm": 0.13622736930847168,
"learning_rate": 0.00019060745239974936,
"loss": 1.0744,
"step": 230
},
{
"epoch": 0.58,
"grad_norm": 0.14302954077720642,
"learning_rate": 0.0001905232494723788,
"loss": 1.1469,
"step": 231
},
{
"epoch": 0.58,
"grad_norm": 0.15202221274375916,
"learning_rate": 0.0001904386895626291,
"loss": 1.0693,
"step": 232
},
{
"epoch": 0.58,
"grad_norm": 0.14072120189666748,
"learning_rate": 0.00019035377300396597,
"loss": 1.0584,
"step": 233
},
{
"epoch": 0.59,
"grad_norm": 0.13941141963005066,
"learning_rate": 0.00019026850013126157,
"loss": 1.1257,
"step": 234
},
{
"epoch": 0.59,
"grad_norm": 0.1389845460653305,
"learning_rate": 0.0001901828712807932,
"loss": 1.0003,
"step": 235
},
{
"epoch": 0.59,
"grad_norm": 0.1431329846382141,
"learning_rate": 0.0001900968867902419,
"loss": 1.0795,
"step": 236
},
{
"epoch": 0.59,
"grad_norm": 0.15022633969783783,
"learning_rate": 0.00019001054699869133,
"loss": 1.1427,
"step": 237
},
{
"epoch": 0.6,
"grad_norm": 0.1578160673379898,
"learning_rate": 0.00018992385224662623,
"loss": 1.13,
"step": 238
},
{
"epoch": 0.6,
"grad_norm": 0.13778769969940186,
"learning_rate": 0.00018983680287593105,
"loss": 1.0739,
"step": 239
},
{
"epoch": 0.6,
"grad_norm": 0.1454969048500061,
"learning_rate": 0.00018974939922988883,
"loss": 1.0864,
"step": 240
},
{
"epoch": 0.6,
"grad_norm": 0.13545964658260345,
"learning_rate": 0.00018966164165317966,
"loss": 1.0169,
"step": 241
},
{
"epoch": 0.61,
"grad_norm": 0.13648608326911926,
"learning_rate": 0.00018957353049187936,
"loss": 1.0732,
"step": 242
},
{
"epoch": 0.61,
"grad_norm": 0.14080677926540375,
"learning_rate": 0.00018948506609345813,
"loss": 1.0579,
"step": 243
},
{
"epoch": 0.61,
"grad_norm": 0.14503297209739685,
"learning_rate": 0.00018939624880677918,
"loss": 1.0755,
"step": 244
},
{
"epoch": 0.61,
"grad_norm": 0.15316741168498993,
"learning_rate": 0.00018930707898209733,
"loss": 1.0885,
"step": 245
},
{
"epoch": 0.62,
"grad_norm": 0.14839263260364532,
"learning_rate": 0.0001892175569710577,
"loss": 1.121,
"step": 246
},
{
"epoch": 0.62,
"grad_norm": 0.13919925689697266,
"learning_rate": 0.00018912768312669424,
"loss": 1.1039,
"step": 247
},
{
"epoch": 0.62,
"grad_norm": 0.13975974917411804,
"learning_rate": 0.00018903745780342839,
"loss": 1.1454,
"step": 248
},
{
"epoch": 0.62,
"grad_norm": 0.13851100206375122,
"learning_rate": 0.0001889468813570676,
"loss": 1.0905,
"step": 249
},
{
"epoch": 0.63,
"grad_norm": 0.14839564263820648,
"learning_rate": 0.00018885595414480405,
"loss": 1.1002,
"step": 250
},
{
"epoch": 0.63,
"grad_norm": 0.1421942263841629,
"learning_rate": 0.00018876467652521317,
"loss": 1.093,
"step": 251
},
{
"epoch": 0.63,
"grad_norm": 0.14453786611557007,
"learning_rate": 0.0001886730488582522,
"loss": 1.0278,
"step": 252
},
{
"epoch": 0.63,
"grad_norm": 0.13856688141822815,
"learning_rate": 0.0001885810715052589,
"loss": 1.079,
"step": 253
},
{
"epoch": 0.64,
"grad_norm": 0.14092479646205902,
"learning_rate": 0.00018848874482894993,
"loss": 1.0608,
"step": 254
},
{
"epoch": 0.64,
"grad_norm": 0.14616413414478302,
"learning_rate": 0.0001883960691934196,
"loss": 1.1097,
"step": 255
},
{
"epoch": 0.64,
"grad_norm": 0.1410474181175232,
"learning_rate": 0.00018830304496413822,
"loss": 1.0577,
"step": 256
},
{
"epoch": 0.64,
"grad_norm": 0.15473878383636475,
"learning_rate": 0.000188209672507951,
"loss": 1.1453,
"step": 257
},
{
"epoch": 0.65,
"grad_norm": 0.14370983839035034,
"learning_rate": 0.00018811595219307622,
"loss": 1.1732,
"step": 258
},
{
"epoch": 0.65,
"grad_norm": 0.14861780405044556,
"learning_rate": 0.00018802188438910405,
"loss": 1.1471,
"step": 259
},
{
"epoch": 0.65,
"grad_norm": 0.1523188352584839,
"learning_rate": 0.000187927469466995,
"loss": 1.129,
"step": 260
},
{
"epoch": 0.65,
"grad_norm": 0.14366289973258972,
"learning_rate": 0.00018783270779907838,
"loss": 1.0792,
"step": 261
},
{
"epoch": 0.66,
"grad_norm": 0.1363295018672943,
"learning_rate": 0.00018773759975905098,
"loss": 0.9848,
"step": 262
},
{
"epoch": 0.66,
"grad_norm": 0.1438857764005661,
"learning_rate": 0.00018764214572197552,
"loss": 1.1371,
"step": 263
},
{
"epoch": 0.66,
"grad_norm": 0.13751162588596344,
"learning_rate": 0.00018754634606427914,
"loss": 1.0557,
"step": 264
},
{
"epoch": 0.66,
"grad_norm": 0.1384708732366562,
"learning_rate": 0.00018745020116375197,
"loss": 1.0664,
"step": 265
},
{
"epoch": 0.67,
"grad_norm": 0.14196960628032684,
"learning_rate": 0.00018735371139954558,
"loss": 1.0828,
"step": 266
},
{
"epoch": 0.67,
"grad_norm": 0.15374121069908142,
"learning_rate": 0.00018725687715217163,
"loss": 1.073,
"step": 267
},
{
"epoch": 0.67,
"grad_norm": 0.14955537021160126,
"learning_rate": 0.0001871596988035001,
"loss": 1.1444,
"step": 268
},
{
"epoch": 0.68,
"grad_norm": 0.13760650157928467,
"learning_rate": 0.00018706217673675811,
"loss": 1.088,
"step": 269
},
{
"epoch": 0.68,
"grad_norm": 0.17072008550167084,
"learning_rate": 0.00018696431133652817,
"loss": 1.07,
"step": 270
},
{
"epoch": 0.68,
"grad_norm": 0.14745061099529266,
"learning_rate": 0.00018686610298874676,
"loss": 1.1105,
"step": 271
},
{
"epoch": 0.68,
"grad_norm": 0.14695587754249573,
"learning_rate": 0.00018676755208070275,
"loss": 1.0612,
"step": 272
},
{
"epoch": 0.69,
"grad_norm": 0.15686020255088806,
"learning_rate": 0.00018666865900103597,
"loss": 1.0933,
"step": 273
},
{
"epoch": 0.69,
"grad_norm": 0.14162233471870422,
"learning_rate": 0.00018656942413973555,
"loss": 1.0832,
"step": 274
},
{
"epoch": 0.69,
"grad_norm": 0.14662939310073853,
"learning_rate": 0.00018646984788813856,
"loss": 1.1175,
"step": 275
},
{
"epoch": 0.69,
"grad_norm": 0.13886839151382446,
"learning_rate": 0.0001863699306389282,
"loss": 1.1221,
"step": 276
},
{
"epoch": 0.7,
"grad_norm": 0.13897326588630676,
"learning_rate": 0.00018626967278613253,
"loss": 1.0767,
"step": 277
},
{
"epoch": 0.7,
"grad_norm": 0.13283655047416687,
"learning_rate": 0.0001861690747251228,
"loss": 1.1397,
"step": 278
},
{
"epoch": 0.7,
"grad_norm": 0.14036604762077332,
"learning_rate": 0.0001860681368526118,
"loss": 1.0965,
"step": 279
},
{
"epoch": 0.7,
"grad_norm": 0.1449379026889801,
"learning_rate": 0.00018596685956665245,
"loss": 1.1262,
"step": 280
},
{
"epoch": 0.71,
"grad_norm": 0.14264287054538727,
"learning_rate": 0.00018586524326663615,
"loss": 1.1317,
"step": 281
},
{
"epoch": 0.71,
"grad_norm": 0.14677459001541138,
"learning_rate": 0.00018576328835329117,
"loss": 1.0785,
"step": 282
},
{
"epoch": 0.71,
"grad_norm": 0.14834077656269073,
"learning_rate": 0.00018566099522868119,
"loss": 1.0892,
"step": 283
},
{
"epoch": 0.71,
"grad_norm": 0.15325355529785156,
"learning_rate": 0.00018555836429620358,
"loss": 1.0843,
"step": 284
},
{
"epoch": 0.72,
"grad_norm": 0.14825651049613953,
"learning_rate": 0.00018545539596058795,
"loss": 1.1288,
"step": 285
},
{
"epoch": 0.72,
"grad_norm": 0.14722499251365662,
"learning_rate": 0.00018535209062789433,
"loss": 1.1391,
"step": 286
},
{
"epoch": 0.72,
"grad_norm": 0.14388781785964966,
"learning_rate": 0.00018524844870551185,
"loss": 1.1013,
"step": 287
},
{
"epoch": 0.72,
"grad_norm": 0.1455835998058319,
"learning_rate": 0.00018514447060215698,
"loss": 1.0811,
"step": 288
},
{
"epoch": 0.73,
"grad_norm": 0.14625433087348938,
"learning_rate": 0.00018504015672787184,
"loss": 1.0854,
"step": 289
},
{
"epoch": 0.73,
"grad_norm": 0.13978470861911774,
"learning_rate": 0.00018493550749402278,
"loss": 1.1398,
"step": 290
},
{
"epoch": 0.73,
"grad_norm": 0.1447162628173828,
"learning_rate": 0.00018483052331329857,
"loss": 1.0553,
"step": 291
},
{
"epoch": 0.73,
"grad_norm": 0.13894303143024445,
"learning_rate": 0.00018472520459970898,
"loss": 1.0305,
"step": 292
},
{
"epoch": 0.74,
"grad_norm": 0.1372181624174118,
"learning_rate": 0.00018461955176858285,
"loss": 1.021,
"step": 293
},
{
"epoch": 0.74,
"grad_norm": 0.14599645137786865,
"learning_rate": 0.0001845135652365668,
"loss": 1.0808,
"step": 294
},
{
"epoch": 0.74,
"grad_norm": 0.1599220335483551,
"learning_rate": 0.00018440724542162328,
"loss": 1.1143,
"step": 295
},
{
"epoch": 0.74,
"grad_norm": 0.1450476050376892,
"learning_rate": 0.00018430059274302917,
"loss": 1.0508,
"step": 296
},
{
"epoch": 0.75,
"grad_norm": 0.1439283937215805,
"learning_rate": 0.00018419360762137395,
"loss": 1.0592,
"step": 297
},
{
"epoch": 0.75,
"grad_norm": 0.1410531848669052,
"learning_rate": 0.00018408629047855804,
"loss": 1.0632,
"step": 298
},
{
"epoch": 0.75,
"grad_norm": 0.1468774974346161,
"learning_rate": 0.00018397864173779133,
"loss": 1.056,
"step": 299
},
{
"epoch": 0.75,
"grad_norm": 0.1467033177614212,
"learning_rate": 0.00018387066182359133,
"loss": 1.1122,
"step": 300
},
{
"epoch": 0.75,
"eval_loss": 1.0955116748809814,
"eval_runtime": 81.7775,
"eval_samples_per_second": 31.781,
"eval_steps_per_second": 31.781,
"step": 300
},
{
"epoch": 0.76,
"grad_norm": 0.14950688183307648,
"learning_rate": 0.00018376235116178148,
"loss": 1.0698,
"step": 301
},
{
"epoch": 0.76,
"grad_norm": 0.142381951212883,
"learning_rate": 0.00018365371017948964,
"loss": 1.0528,
"step": 302
},
{
"epoch": 0.76,
"grad_norm": 0.1410701423883438,
"learning_rate": 0.0001835447393051463,
"loss": 1.0785,
"step": 303
},
{
"epoch": 0.76,
"grad_norm": 0.14708860218524933,
"learning_rate": 0.00018343543896848273,
"loss": 1.0142,
"step": 304
},
{
"epoch": 0.77,
"grad_norm": 0.1467617303133011,
"learning_rate": 0.00018332580960052965,
"loss": 1.0973,
"step": 305
},
{
"epoch": 0.77,
"grad_norm": 0.15761792659759521,
"learning_rate": 0.00018321585163361527,
"loss": 1.1745,
"step": 306
},
{
"epoch": 0.77,
"grad_norm": 0.13972119987010956,
"learning_rate": 0.00018310556550136357,
"loss": 1.0832,
"step": 307
},
{
"epoch": 0.77,
"grad_norm": 0.1481141895055771,
"learning_rate": 0.00018299495163869275,
"loss": 1.1573,
"step": 308
},
{
"epoch": 0.78,
"grad_norm": 0.14397870004177094,
"learning_rate": 0.0001828840104818134,
"loss": 1.171,
"step": 309
},
{
"epoch": 0.78,
"grad_norm": 0.14765049517154694,
"learning_rate": 0.0001827727424682268,
"loss": 1.0544,
"step": 310
},
{
"epoch": 0.78,
"grad_norm": 0.14956365525722504,
"learning_rate": 0.00018266114803672318,
"loss": 1.1755,
"step": 311
},
{
"epoch": 0.78,
"grad_norm": 0.15122386813163757,
"learning_rate": 0.00018254922762738008,
"loss": 1.1547,
"step": 312
},
{
"epoch": 0.79,
"grad_norm": 0.14254115521907806,
"learning_rate": 0.00018243698168156054,
"loss": 1.1075,
"step": 313
},
{
"epoch": 0.79,
"grad_norm": 0.14294452965259552,
"learning_rate": 0.00018232441064191125,
"loss": 1.1419,
"step": 314
},
{
"epoch": 0.79,
"grad_norm": 0.14777772128582,
"learning_rate": 0.0001822115149523611,
"loss": 1.1662,
"step": 315
},
{
"epoch": 0.79,
"grad_norm": 0.14944781363010406,
"learning_rate": 0.0001820982950581191,
"loss": 1.1497,
"step": 316
},
{
"epoch": 0.8,
"grad_norm": 0.1466801017522812,
"learning_rate": 0.00018198475140567287,
"loss": 1.1374,
"step": 317
},
{
"epoch": 0.8,
"grad_norm": 0.15346656739711761,
"learning_rate": 0.00018187088444278674,
"loss": 1.1356,
"step": 318
},
{
"epoch": 0.8,
"grad_norm": 0.15271005034446716,
"learning_rate": 0.00018175669461850005,
"loss": 1.0845,
"step": 319
},
{
"epoch": 0.8,
"grad_norm": 0.14452996850013733,
"learning_rate": 0.00018164218238312535,
"loss": 1.1162,
"step": 320
},
{
"epoch": 0.81,
"grad_norm": 0.14632536470890045,
"learning_rate": 0.00018152734818824658,
"loss": 1.0187,
"step": 321
},
{
"epoch": 0.81,
"grad_norm": 0.14935997128486633,
"learning_rate": 0.00018141219248671745,
"loss": 1.1167,
"step": 322
},
{
"epoch": 0.81,
"grad_norm": 0.14043933153152466,
"learning_rate": 0.0001812967157326595,
"loss": 1.0044,
"step": 323
},
{
"epoch": 0.81,
"grad_norm": 0.14850106835365295,
"learning_rate": 0.00018118091838146029,
"loss": 1.1226,
"step": 324
},
{
"epoch": 0.82,
"grad_norm": 0.14655061066150665,
"learning_rate": 0.00018106480088977172,
"loss": 1.0508,
"step": 325
},
{
"epoch": 0.82,
"grad_norm": 0.14721763134002686,
"learning_rate": 0.00018094836371550824,
"loss": 1.0659,
"step": 326
},
{
"epoch": 0.82,
"grad_norm": 0.1433349996805191,
"learning_rate": 0.00018083160731784486,
"loss": 1.147,
"step": 327
},
{
"epoch": 0.82,
"grad_norm": 0.13528144359588623,
"learning_rate": 0.00018071453215721554,
"loss": 1.0388,
"step": 328
},
{
"epoch": 0.83,
"grad_norm": 0.15466062724590302,
"learning_rate": 0.0001805971386953113,
"loss": 1.0649,
"step": 329
},
{
"epoch": 0.83,
"grad_norm": 0.15163114666938782,
"learning_rate": 0.00018047942739507836,
"loss": 1.1454,
"step": 330
},
{
"epoch": 0.83,
"grad_norm": 0.14693276584148407,
"learning_rate": 0.0001803613987207163,
"loss": 1.1137,
"step": 331
},
{
"epoch": 0.83,
"grad_norm": 0.14229321479797363,
"learning_rate": 0.00018024305313767646,
"loss": 1.0153,
"step": 332
},
{
"epoch": 0.84,
"grad_norm": 0.13863018155097961,
"learning_rate": 0.00018012439111265974,
"loss": 1.0491,
"step": 333
},
{
"epoch": 0.84,
"grad_norm": 0.1422068327665329,
"learning_rate": 0.000180005413113615,
"loss": 1.0952,
"step": 334
},
{
"epoch": 0.84,
"grad_norm": 0.1419857293367386,
"learning_rate": 0.00017988611960973713,
"loss": 1.0532,
"step": 335
},
{
"epoch": 0.84,
"grad_norm": 0.1446901261806488,
"learning_rate": 0.00017976651107146533,
"loss": 1.0477,
"step": 336
},
{
"epoch": 0.85,
"grad_norm": 0.14558811485767365,
"learning_rate": 0.00017964658797048108,
"loss": 1.1481,
"step": 337
},
{
"epoch": 0.85,
"grad_norm": 0.15488363802433014,
"learning_rate": 0.0001795263507797063,
"loss": 1.1302,
"step": 338
},
{
"epoch": 0.85,
"grad_norm": 0.14942613244056702,
"learning_rate": 0.00017940579997330165,
"loss": 1.0698,
"step": 339
},
{
"epoch": 0.85,
"grad_norm": 0.14417564868927002,
"learning_rate": 0.00017928493602666445,
"loss": 1.0867,
"step": 340
},
{
"epoch": 0.86,
"grad_norm": 0.14839497208595276,
"learning_rate": 0.0001791637594164269,
"loss": 1.0124,
"step": 341
},
{
"epoch": 0.86,
"grad_norm": 0.1415972113609314,
"learning_rate": 0.00017904227062045437,
"loss": 1.0958,
"step": 342
},
{
"epoch": 0.86,
"grad_norm": 0.143202543258667,
"learning_rate": 0.00017892047011784312,
"loss": 1.0808,
"step": 343
},
{
"epoch": 0.86,
"grad_norm": 0.14291773736476898,
"learning_rate": 0.00017879835838891875,
"loss": 1.1386,
"step": 344
},
{
"epoch": 0.87,
"grad_norm": 0.1504325121641159,
"learning_rate": 0.00017867593591523422,
"loss": 1.0804,
"step": 345
},
{
"epoch": 0.87,
"grad_norm": 0.1444767862558365,
"learning_rate": 0.00017855320317956784,
"loss": 1.1207,
"step": 346
},
{
"epoch": 0.87,
"grad_norm": 0.14493699371814728,
"learning_rate": 0.00017843016066592158,
"loss": 1.0954,
"step": 347
},
{
"epoch": 0.87,
"grad_norm": 0.14571166038513184,
"learning_rate": 0.00017830680885951887,
"loss": 1.0676,
"step": 348
},
{
"epoch": 0.88,
"grad_norm": 0.14583171904087067,
"learning_rate": 0.000178183148246803,
"loss": 1.0674,
"step": 349
},
{
"epoch": 0.88,
"grad_norm": 0.15080390870571136,
"learning_rate": 0.00017805917931543492,
"loss": 1.0757,
"step": 350
},
{
"epoch": 0.88,
"grad_norm": 0.14790864288806915,
"learning_rate": 0.00017793490255429157,
"loss": 1.1005,
"step": 351
},
{
"epoch": 0.88,
"grad_norm": 0.14861677587032318,
"learning_rate": 0.00017781031845346375,
"loss": 1.0645,
"step": 352
},
{
"epoch": 0.89,
"grad_norm": 0.15099036693572998,
"learning_rate": 0.00017768542750425426,
"loss": 1.1306,
"step": 353
},
{
"epoch": 0.89,
"grad_norm": 0.14353971183300018,
"learning_rate": 0.00017756023019917607,
"loss": 1.0834,
"step": 354
},
{
"epoch": 0.89,
"grad_norm": 0.14582550525665283,
"learning_rate": 0.00017743472703195015,
"loss": 1.0722,
"step": 355
},
{
"epoch": 0.89,
"grad_norm": 0.14268234372138977,
"learning_rate": 0.00017730891849750377,
"loss": 1.092,
"step": 356
},
{
"epoch": 0.9,
"grad_norm": 0.1424105316400528,
"learning_rate": 0.00017718280509196828,
"loss": 1.1355,
"step": 357
},
{
"epoch": 0.9,
"grad_norm": 0.13972117006778717,
"learning_rate": 0.0001770563873126775,
"loss": 1.0318,
"step": 358
},
{
"epoch": 0.9,
"grad_norm": 0.14622163772583008,
"learning_rate": 0.00017692966565816532,
"loss": 1.0985,
"step": 359
},
{
"epoch": 0.9,
"grad_norm": 0.13956372439861298,
"learning_rate": 0.0001768026406281642,
"loss": 1.102,
"step": 360
},
{
"epoch": 0.91,
"grad_norm": 0.14042189717292786,
"learning_rate": 0.0001766753127236029,
"loss": 1.0284,
"step": 361
},
{
"epoch": 0.91,
"grad_norm": 0.14376944303512573,
"learning_rate": 0.00017654768244660448,
"loss": 1.1452,
"step": 362
},
{
"epoch": 0.91,
"grad_norm": 0.14055544137954712,
"learning_rate": 0.00017641975030048454,
"loss": 1.0306,
"step": 363
},
{
"epoch": 0.91,
"grad_norm": 0.14599303901195526,
"learning_rate": 0.00017629151678974907,
"loss": 1.0838,
"step": 364
},
{
"epoch": 0.92,
"grad_norm": 0.1528831571340561,
"learning_rate": 0.00017616298242009251,
"loss": 1.1293,
"step": 365
},
{
"epoch": 0.92,
"grad_norm": 0.1404455453157425,
"learning_rate": 0.00017603414769839577,
"loss": 1.0425,
"step": 366
},
{
"epoch": 0.92,
"grad_norm": 0.14992842078208923,
"learning_rate": 0.00017590501313272415,
"loss": 1.0928,
"step": 367
},
{
"epoch": 0.92,
"grad_norm": 0.14540541172027588,
"learning_rate": 0.00017577557923232546,
"loss": 1.0366,
"step": 368
},
{
"epoch": 0.93,
"grad_norm": 0.1451583057641983,
"learning_rate": 0.00017564584650762793,
"loss": 1.1108,
"step": 369
},
{
"epoch": 0.93,
"grad_norm": 0.155447855591774,
"learning_rate": 0.00017551581547023819,
"loss": 1.1394,
"step": 370
},
{
"epoch": 0.93,
"grad_norm": 0.1441376656293869,
"learning_rate": 0.0001753854866329393,
"loss": 1.0264,
"step": 371
},
{
"epoch": 0.93,
"grad_norm": 0.13875485956668854,
"learning_rate": 0.00017525486050968875,
"loss": 1.0672,
"step": 372
},
{
"epoch": 0.94,
"grad_norm": 0.14158080518245697,
"learning_rate": 0.00017512393761561632,
"loss": 1.053,
"step": 373
},
{
"epoch": 0.94,
"grad_norm": 0.15505361557006836,
"learning_rate": 0.00017499271846702213,
"loss": 1.0713,
"step": 374
},
{
"epoch": 0.94,
"grad_norm": 0.14172373712062836,
"learning_rate": 0.0001748612035813747,
"loss": 1.0544,
"step": 375
},
{
"epoch": 0.94,
"grad_norm": 0.14016349613666534,
"learning_rate": 0.00017472939347730856,
"loss": 1.0382,
"step": 376
},
{
"epoch": 0.95,
"grad_norm": 0.15148378908634186,
"learning_rate": 0.00017459728867462275,
"loss": 1.1218,
"step": 377
},
{
"epoch": 0.95,
"grad_norm": 0.1416306346654892,
"learning_rate": 0.0001744648896942782,
"loss": 1.0895,
"step": 378
},
{
"epoch": 0.95,
"grad_norm": 0.14276988804340363,
"learning_rate": 0.00017433219705839616,
"loss": 1.0991,
"step": 379
},
{
"epoch": 0.95,
"grad_norm": 0.13922327756881714,
"learning_rate": 0.00017419921129025576,
"loss": 1.0883,
"step": 380
},
{
"epoch": 0.96,
"grad_norm": 0.1479676216840744,
"learning_rate": 0.00017406593291429217,
"loss": 1.1083,
"step": 381
},
{
"epoch": 0.96,
"grad_norm": 0.14659778773784637,
"learning_rate": 0.0001739323624560945,
"loss": 1.0863,
"step": 382
},
{
"epoch": 0.96,
"grad_norm": 0.14685633778572083,
"learning_rate": 0.00017379850044240368,
"loss": 1.1075,
"step": 383
},
{
"epoch": 0.96,
"grad_norm": 0.14316044747829437,
"learning_rate": 0.00017366434740111037,
"loss": 1.0584,
"step": 384
},
{
"epoch": 0.97,
"grad_norm": 0.14292864501476288,
"learning_rate": 0.00017352990386125292,
"loss": 1.1002,
"step": 385
},
{
"epoch": 0.97,
"grad_norm": 0.14412067830562592,
"learning_rate": 0.00017339517035301532,
"loss": 1.0671,
"step": 386
},
{
"epoch": 0.97,
"grad_norm": 0.14292089641094208,
"learning_rate": 0.000173260147407725,
"loss": 1.0958,
"step": 387
},
{
"epoch": 0.97,
"grad_norm": 0.1490335911512375,
"learning_rate": 0.00017312483555785086,
"loss": 1.1074,
"step": 388
},
{
"epoch": 0.98,
"grad_norm": 0.14249826967716217,
"learning_rate": 0.00017298923533700107,
"loss": 1.1546,
"step": 389
},
{
"epoch": 0.98,
"grad_norm": 0.14555396139621735,
"learning_rate": 0.000172853347279921,
"loss": 1.076,
"step": 390
},
{
"epoch": 0.98,
"grad_norm": 0.14374902844429016,
"learning_rate": 0.00017271717192249116,
"loss": 1.0767,
"step": 391
},
{
"epoch": 0.98,
"grad_norm": 0.14903804659843445,
"learning_rate": 0.00017258070980172494,
"loss": 1.0969,
"step": 392
},
{
"epoch": 0.99,
"grad_norm": 0.1533229798078537,
"learning_rate": 0.00017244396145576672,
"loss": 1.1206,
"step": 393
},
{
"epoch": 0.99,
"grad_norm": 0.14720167219638824,
"learning_rate": 0.0001723069274238895,
"loss": 1.0655,
"step": 394
},
{
"epoch": 0.99,
"grad_norm": 0.14380764961242676,
"learning_rate": 0.00017216960824649303,
"loss": 1.0123,
"step": 395
},
{
"epoch": 0.99,
"grad_norm": 0.14513961970806122,
"learning_rate": 0.0001720320044651014,
"loss": 1.0196,
"step": 396
},
{
"epoch": 1.0,
"grad_norm": 0.14310909807682037,
"learning_rate": 0.0001718941166223612,
"loss": 1.0278,
"step": 397
},
{
"epoch": 1.0,
"grad_norm": 0.14312389492988586,
"learning_rate": 0.00017175594526203905,
"loss": 1.0649,
"step": 398
}
],
"logging_steps": 1,
"max_steps": 1592,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 398,
"total_flos": 2.96912159961514e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}